···11//! An implementation of the CVM fast element counting algorithm presented in
22//! Chakraborty, S., Vinodchandran, N. V., & Meel, K. S. (2022). *Distinct Elements in Streams: An Algorithm for the (Text) Book*. 6 pages, 727571 bytes. <https://doi.org/10.4230/LIPIcs.ESA.2022.34>
33+//!
44+//! This implementation uses a treap data structure as the buffer, following Knuth's original design.
55+66+mod treap;
3788+use crate::treap::Treap;
49use rand::rngs::StdRng;
510use rand::{Rng, SeedableRng};
61177-use rustc_hash::FxHashSet;
88-use std::hash::Hash;
99-1012/// A counter implementing the CVM algorithm
1113///
1414+/// This implementation uses a treap (randomized binary search tree) as the buffer,
1515+/// which provides expected `O(log n)` operations while maintaining the probabilistic properties
1616+/// needed for the algorithm.
1717+///
1218/// Note that the CVM struct's buffer takes ownership of its elements.
1313-pub struct CVM<T: PartialEq + Eq + Hash> {
1919+pub struct CVM<T: Ord> {
1420 buf_size: usize,
1515- buf: FxHashSet<T>,
2121+ buf: Treap<T>,
1622 probability: f64,
1723 rng: StdRng,
1824}
19252020-impl<T: PartialEq + Eq + Hash> CVM<T> {
2626+impl<T: Ord> CVM<T> {
2127 /// Initialise the algorithm
2228 ///
2323- /// epsilon: how close you want your estimate to be to the true number of distinct elements.
2424- /// A smaller ε means you require a more precise estimate.
2525- /// For example, ε = 0.05 means you want your estimate to be within 5% of the actual value.
2626- /// An epsilon of 0.8 is a good starting point for most applications.
2929+ /// `epsilon`: how close you want your estimate to be to the true number of distinct elements.
3030+ /// A smaller `ε` means you require a more precise estimate.
3131+ /// For example, `ε = 0.05` means you want your estimate to be within 5 % of the actual value.
3232+ /// An epsilon of `0.8` is a good starting point for most applications.
2733 ///
2828- /// delta: The level of certainty that the algorithm's estimate will fall within the desired accuracy range. A higher confidence
3434+ /// `delta`: the probability that the algorithm's estimate falls outside the desired accuracy range, i.e. one minus the confidence level. A smaller `delta`
2935 /// (e.g. `0.001`, 99.9 % confidence) means you're very sure the estimate will be accurate, while a larger `delta` (e.g. `0.1`, 90 % confidence) means there's a
3036 /// higher chance the estimate might be outside the desired range.
3131- /// A delta of 0.1 is a good starting point for most applications.
3737+ /// A `delta` of `0.1` is a good starting point for most applications.
3238 ///
3333- /// stream_size: this is used to determine buffer size and can be a loose approximation. The closer it is to the stream size,
3939+ /// `stream_size`: this is used to determine buffer size and can be a loose approximation. The closer it is to the stream size,
3440 /// the more accurate the result will be.
3541 pub fn new(epsilon: f64, delta: f64, stream_size: usize) -> Self {
3642 let bufsize = buffer_size(epsilon, delta, stream_size);
3743 Self {
3844 buf_size: bufsize,
3939- buf: FxHashSet::with_capacity_and_hasher(bufsize, Default::default()),
4545+ buf: Treap::new(),
4046 probability: 1.0,
4147 rng: StdRng::from_entropy(),
4248 }
4349 }
4450 /// Add an element, potentially updating the unique element count
4551 pub fn process_element(&mut self, elem: T) {
4646- // We should switch to a treap (as per Knuth) to avoid the hash overhead, but FxHash
4747- // is still a lot faster than linear searching a Vec, even at small (1000) buffer sizes
4848- // Round 0: if an element exists, remove it. Element is added back due to probability 1
4949- // When buffer is full, remove half the elements
5050- // Round 1: if an element exists, remove it. Element MAY be added back due to probability 0.5
5252+ // The algorithm works as follows:
5353+ // 1. If the element already exists in the buffer, remove it, so that only its latest occurrence decides whether it is kept
5454+ // 2. Insert the element (whether or not it was present before) with the current probability
5555+ // 3. If buffer is full, remove ~half the elements and halve the probability
5656+ // This creates a geometric sampling scheme that provides an unbiased estimate
5157 if self.buf.contains(&elem) {
5258 self.buf.remove(&elem);
5359 }
5460 if self.rng.gen_bool(self.probability) {
5555- self.buf.insert(elem);
6161+ self.buf.insert(elem, &mut self.rng);
5662 }
5763 while self.buf.len() == self.buf_size {
5864 self.clear_about_half();
···6167 }
6268 // remove around half of the elements at random
6369 fn clear_about_half(&mut self) {
6464- self.buf.retain(|_| self.rng.gen_bool(0.5));
7070+ // Need to capture rng reference to use in closure
7171+ let rng = &mut self.rng;
7272+ self.buf.retain(|_| rng.gen_bool(0.5));
6573 }
6674 /// Calculate the current unique element count. You can continue to add elements after calling this method.
6775 pub fn calculate_final_result(&self) -> f64 {
···8290 path::Path,
8391 };
84928585- use super::*;
8693 use regex::Regex;
9494+ use std::collections::HashSet;
87958896 fn open_file<P>(filename: P) -> BufReader<File>
8997 where
···93101 BufReader::new(f)
94102 }
951039696- fn line_to_word(re: &Regex, hs: &mut FxHashSet<String>, line: &str) {
104104+ fn line_to_word(re: &Regex, hs: &mut HashSet<String>, line: &str) {
97105 let words = line.split(' ');
98106 words.for_each(|word| {
99107 let clean_word = re.replace_all(word, "").to_lowercase();
···105113 let input_file = "benches/kiy.txt";
106114 let re = Regex::new(r"[^\w\s]").unwrap();
107115 let br = open_file(input_file);
108108- let mut hs = FxHashSet::with_hasher(Default::default());
116116+ let mut hs = HashSet::new();
109117 br.lines()
110118 .for_each(|line| line_to_word(&re, &mut hs, &line.unwrap()));
111119 assert_eq!(hs.len(), 9016)