Rust implementation of the CVM algorithm for counting distinct elements in a stream
0

Configure Feed

Select the types of activity you want to include in your feed.

Don't use positional arguments

+6 -6
+6 -6
src/main.rs
@@ -1,9 +1,9 @@
 use clap::{arg, crate_version, value_parser, Command};
-use std::io::BufRead;
-use std::path::PathBuf;
 use std::fs::File;
+use std::io::BufRead;
 use std::io::BufReader;
 use std::path::Path;
+use std::path::PathBuf;
 
 use cvmcount::CVM;
 
@@ -21,10 +21,10 @@
         .version(crate_version!())
         .author("Stephan Hügel <urschrei@gmail.com>")
         .about("Use the CVM algorithm to estimate the number of unique tokens in a stream")
-        .arg(arg!(-t --tokens <FILE> "A text file containing words").index(1).required(true).value_parser(value_parser!(PathBuf)))
-        .arg(arg!(-e --epsilon <EPSILON> "How close you want your estimate to be to the true number of distinct tokens. A smaller ε means you require a more precise estimate. For example, ε = 0.05 means you want your estimate to be within 5 % of the actual value. An epsilon of 0.8 is a good starting point for most applications").index(2).required(true).value_parser(value_parser!(f64)))
-        .arg(arg!(-d --delta <DELTA> "The level of certainty that the algorithm's estimate will fall within the desired accuracy range. A higher confidence (e.g., 99.9 %) means you're very sure the estimate will be accurate, while a lower confidence (e.g., 90 %) means there's a higher chance the estimate might be outside the desired range. A delta of 0.1 is a good starting point for most applications").index(3).required(true).value_parser(value_parser!(f64)))
-        .arg(arg!(-s --streamsize <STREAM_SIZE> "This is used to determine buffer size and can be a loose approximation. The closer it is to the stream size, the more accurate the results").index(4).required(true).value_parser(value_parser!(usize)))
+        .arg(arg!(-t --tokens <FILE> "A text file containing words").required(true).value_parser(value_parser!(PathBuf)))
+        .arg(arg!(-e --epsilon <EPSILON> "How close you want your estimate to be to the true number of distinct tokens. A smaller ε means you require a more precise estimate. For example, ε = 0.05 means you want your estimate to be within 5 % of the actual value. An epsilon of 0.8 is a good starting point for most applications").required(true).value_parser(value_parser!(f64)))
+        .arg(arg!(-d --delta <DELTA> "The level of certainty that the algorithm's estimate will fall within the desired accuracy range. A higher confidence (e.g., 99.9 %) means you're very sure the estimate will be accurate, while a lower confidence (e.g., 90 %) means there's a higher chance the estimate might be outside the desired range. A delta of 0.1 is a good starting point for most applications").required(true).value_parser(value_parser!(f64)))
+        .arg(arg!(-s --streamsize <STREAM_SIZE> "This is used to determine buffer size and can be a loose approximation. The closer it is to the stream size, the more accurate the results").required(true).value_parser(value_parser!(usize)))
         .get_matches();
     let input_file = params.get_one::<PathBuf>("tokens").unwrap();
     let epsilon = params.get_one::<f64>("epsilon").unwrap();