Rust implementation of the CVM algorithm for counting distinct elements in a stream
1#[macro_use]
2extern crate criterion;
3use std::{
4 fs::File,
5 io::{BufRead, BufReader},
6 path::Path,
7};
8
9use criterion::Criterion;
10use cvmcount::CVM;
11use rand::{Rng, rng};
12use regex::Regex;
13
14use std::collections::HashSet;
15
16// generate 1 million 7-digit random positive integers
17fn generate_random_numbers() -> Vec<i32> {
18 let mut rng = rng();
19
20 (0..1_000_000)
21 .map(|_| rng.random_range(1_000_000..10_000_000))
22 .collect()
23}
24
25fn open_file<P>(filename: P) -> BufReader<File>
26where
27 P: AsRef<Path>,
28{
29 let f = File::open(filename).expect("Couldn't read from file");
30 BufReader::new(f)
31}
32
33fn line_to_word(re: &Regex, cvm: &mut CVM<String>, line: &str) {
34 let words = line.split(' ');
35 words.for_each(|word| {
36 let clean_word = re.replace_all(word, "").to_lowercase();
37 cvm.process_element(clean_word)
38 })
39}
40
41#[allow(unused_must_use)]
42fn bench_count_strings_integers(c: &mut Criterion) {
43 c.bench_function(
44 "Count unique strings in The King in Yellow with regex regularization: e = 0.8, d = 0.1, s = 1000",
45 |b| {
46 let input_file = "benches/kiy.txt";
47 let epsilon = 0.8;
48 let delta = 0.1;
49 let stream_size = 1000;
50 let re = Regex::new(r"[^\w\s]").unwrap();
51 b.iter(|| {
52 let mut string_counter: CVM<String> = CVM::new(epsilon, delta, stream_size);
53 let br = open_file(input_file);
54 br.lines()
55 .for_each(|line| line_to_word(&re, &mut string_counter, &line.unwrap()));
56 string_counter.calculate_final_result()
57 })
58 },
59 );
60 c.bench_function(
61 "Count uniques in ten million 7-digit random positive integers: e = 0.8, d = 0.1, s = 1000",
62 |b| {
63 let epsilon = 0.8;
64 let delta = 0.1;
65 let stream_size = 1000;
66 let digits = generate_random_numbers();
67 b.iter(|| {
68 let mut int_counter: CVM<i32> = CVM::new(epsilon, delta, stream_size);
69 digits
70 .iter()
71 .for_each(|integer| int_counter.process_element(*integer));
72 int_counter.calculate_final_result()
73 })
74 },
75 );
76 c.bench_function(
77 "Count uniques in ten million 7-digit random positive integers: HashSet",
78 |b| {
79 let digits = generate_random_numbers();
80 b.iter(|| {
81 let mut hs = HashSet::new();
82 digits.iter().for_each(|digit| {
83 hs.insert(digit);
84 });
85 digits.len()
86 })
87 },
88 );
89}
90
91criterion_group!(benches, bench_count_strings_integers,);
92criterion_main!(benches);