Rust implementation of the CVM algorithm for counting distinct elements in a stream
0

Configure Feed

Select the types of activity you want to include in your feed.

Add test

It's not a real test; it just allows quick calculation of the actual count.

+38
+38
src/lib.rs
/// Evaluates `ceil((12 / epsilon^2) * log2((8 * stream_size) / delta))` and
/// returns the result as a `usize`.
///
/// `stream_size` is converted to `f64` before the arithmetic. The final
/// `as usize` cast saturates, so a NaN result becomes 0 and values outside
/// the `usize` range are clamped.
// NOTE(review): presumably the sample-buffer capacity for the CVM estimator,
// with `epsilon` as the accuracy and `delta` as the failure probability —
// confirm against the caller.
fn buffer_size(epsilon: f64, delta: f64, stream_size: usize) -> usize {
    let accuracy_factor = 12.0 / epsilon.powf(2.0);
    let log_term = ((8.0 * stream_size as f64) / delta).log2();
    (accuracy_factor * log_term).ceil() as usize
}

#[cfg(test)]
mod tests {
    use std::{
        fs::File,
        io::{BufRead, BufReader},
        path::Path,
    };

    use super::*;
    use regex::Regex;

    /// Opens `filename` and wraps it in a `BufReader`.
    ///
    /// Panics with "Couldn't read from file" if the file cannot be opened.
    fn open_file<P: AsRef<Path>>(filename: P) -> BufReader<File> {
        BufReader::new(File::open(filename).expect("Couldn't read from file"))
    }

    /// Splits `line` on single spaces, removes every match of `re` from each
    /// piece, lowercases what is left and inserts it into `hs`.
    ///
    /// Pieces that end up empty (for example between two consecutive spaces)
    /// are inserted as the empty string as well.
    fn line_to_word(re: &Regex, hs: &mut FxHashSet<String>, line: &str) {
        for token in line.split(' ') {
            let normalized = re.replace_all(token, "").to_lowercase();
            hs.insert(normalized);
        }
    }

    /// Counts the distinct normalized words in `benches/kiy.txt` exactly and
    /// compares the count with the recorded value.
    #[test]
    fn actual() {
        // Matches any character that is neither a word character nor whitespace.
        let punctuation = Regex::new(r"[^\w\s]").unwrap();
        let mut distinct = FxHashSet::with_hasher(Default::default());
        for line in open_file("benches/kiy.txt").lines() {
            line_to_word(&punctuation, &mut distinct, &line.unwrap());
        }
        assert_eq!(distinct.len(), 9016)
    }
}