Rust implementation of the CVM algorithm for counting distinct elements in a stream
1#[macro_use]
2extern crate criterion;
3use std::{
4 fs::File,
5 io::{BufRead, BufReader},
6 path::Path,
7};
8
9use criterion::Criterion;
10use cvmcount::CVM;
11use rand::{thread_rng, Rng};
12use regex::Regex;
13
14// generate 1 million 7-digit random positive integers
15fn generate_random_numbers() -> Vec<i32> {
16 let mut rng = thread_rng();
17
18 (0..1_000_000)
19 .map(|_| rng.gen_range(1_000_000..10_000_000))
20 .collect()
21}
22
/// Opens `filename` for reading and wraps the handle in a [`BufReader`].
///
/// # Panics
///
/// Panics if the file cannot be opened. The panic message names the path that
/// was requested as well as the underlying I/O error, which makes a wrong
/// working directory (the usual cause with relative paths) easy to diagnose.
fn open_file<P>(filename: P) -> BufReader<File>
where
    P: AsRef<Path>,
{
    let path = filename.as_ref();
    let file = File::open(path).unwrap_or_else(|err| {
        panic!("Couldn't read from file {}: {}", path.display(), err)
    });
    BufReader::new(file)
}
30
31fn line_to_word(re: &Regex, cvm: &mut CVM<String>, line: &str) {
32 let words = line.split(' ');
33 words.for_each(|word| {
34 let clean_word = re.replace_all(word, "").to_lowercase();
35 cvm.process_element(clean_word)
36 })
37}
38
39#[allow(unused_must_use)]
40fn bench_count_strings_integers(c: &mut Criterion) {
41 c.bench_function(
42 "Count unique strings in The King in Yellow with regex regularization: e = 0.8, d = 0.1, s = 1000",
43 |b| {
44 let input_file = "benches/kiy.txt";
45 let epsilon = 0.8;
46 let delta = 0.1;
47 let stream_size = 1000;
48 let re = Regex::new(r"[^\w\s]").unwrap();
49 b.iter(|| {
50 let mut string_counter: CVM<String> = CVM::new(epsilon, delta, stream_size);
51 let br = open_file(input_file);
52 br.lines()
53 .for_each(|line| line_to_word(&re, &mut string_counter, &line.unwrap()));
54 string_counter.calculate_final_result()
55 })
56 },
57 );
58 c.bench_function(
59 "Count uniques in ten million 7-digit random positive integers: e = 0.8, d = 0.1, s = 1000",
60 |b| {
61 let epsilon = 0.8;
62 let delta = 0.1;
63 let stream_size = 1000;
64 let digits = generate_random_numbers();
65 b.iter(|| {
66 let mut int_counter: CVM<i32> = CVM::new(epsilon, delta, stream_size);
67 digits.iter().for_each(|integer| int_counter.process_element(*integer));
68 int_counter.calculate_final_result()
69 })
70 }
71 );
72}
73
74criterion_group!(benches, bench_count_strings_integers,);
75criterion_main!(benches);