Rust implementation of the CVM algorithm for counting distinct elements in a stream
0

Configure Feed

Select the types of activity you want to include in your feed.

at main 2.8 kB View raw
1#[macro_use] 2extern crate criterion; 3use std::{ 4 fs::File, 5 io::{BufRead, BufReader}, 6 path::Path, 7}; 8 9use criterion::Criterion; 10use cvmcount::CVM; 11use rand::{Rng, rng}; 12use regex::Regex; 13 14use std::collections::HashSet; 15 16// generate 1 million 7-digit random positive integers 17fn generate_random_numbers() -> Vec<i32> { 18 let mut rng = rng(); 19 20 (0..1_000_000) 21 .map(|_| rng.random_range(1_000_000..10_000_000)) 22 .collect() 23} 24 25fn open_file<P>(filename: P) -> BufReader<File> 26where 27 P: AsRef<Path>, 28{ 29 let f = File::open(filename).expect("Couldn't read from file"); 30 BufReader::new(f) 31} 32 33fn line_to_word(re: &Regex, cvm: &mut CVM<String>, line: &str) { 34 let words = line.split(' '); 35 words.for_each(|word| { 36 let clean_word = re.replace_all(word, "").to_lowercase(); 37 cvm.process_element(clean_word) 38 }) 39} 40 41#[allow(unused_must_use)] 42fn bench_count_strings_integers(c: &mut Criterion) { 43 c.bench_function( 44 "Count unique strings in The King in Yellow with regex regularization: e = 0.8, d = 0.1, s = 1000", 45 |b| { 46 let input_file = "benches/kiy.txt"; 47 let epsilon = 0.8; 48 let delta = 0.1; 49 let stream_size = 1000; 50 let re = Regex::new(r"[^\w\s]").unwrap(); 51 b.iter(|| { 52 let mut string_counter: CVM<String> = CVM::new(epsilon, delta, stream_size); 53 let br = open_file(input_file); 54 br.lines() 55 .for_each(|line| line_to_word(&re, &mut string_counter, &line.unwrap())); 56 string_counter.calculate_final_result() 57 }) 58 }, 59 ); 60 c.bench_function( 61 "Count uniques in ten million 7-digit random positive integers: e = 0.8, d = 0.1, s = 1000", 62 |b| { 63 let epsilon = 0.8; 64 let delta = 0.1; 65 let stream_size = 1000; 66 let digits = generate_random_numbers(); 67 b.iter(|| { 68 let mut int_counter: CVM<i32> = CVM::new(epsilon, delta, stream_size); 69 digits 70 .iter() 71 .for_each(|integer| int_counter.process_element(*integer)); 72 int_counter.calculate_final_result() 73 }) 74 }, 75 ); 76 c.bench_function( 77 "Count uniques in ten million 7-digit random positive integers: HashSet", 78 |b| { 79 let digits = generate_random_numbers(); 80 b.iter(|| { 81 let mut hs = HashSet::new(); 82 digits.iter().for_each(|digit| { 83 hs.insert(digit); 84 }); 85 digits.len() 86 }) 87 }, 88 ); 89} 90 91criterion_group!(benches, bench_count_strings_integers,); 92criterion_main!(benches);