Rust implementation of the CVM algorithm for counting distinct elements in a stream
0

Configure Feed

Select the types of activity you want to include in your feed.

Builder function

author
Stephan Hügel
date (Jun 30, 2025, 12:54 AM +0100) commit 3b2a250a parent b4b35086 change-id spslktpu
+343 -3
+53
README.md
··· 46 46 47 47 The `--help` option is available. 48 48 49 + # Library Usage 50 + 51 + The library provides both a simple constructor and a builder pattern for more ergonomic usage: 52 + 53 + ## Simple Constructor 54 + 55 + ```rust 56 + use cvmcount::CVM; 57 + 58 + let mut cvm = CVM::new(0.05, 0.01, 10_000); 59 + for item in data_stream { 60 + cvm.process_element(item); 61 + } 62 + let estimate = cvm.calculate_final_result(); 63 + ``` 64 + 65 + ## Builder Pattern (Recommended) 66 + 67 + The builder pattern provides better readability and validation: 68 + 69 + ```rust 70 + use cvmcount::CVM; 71 + 72 + // Using defaults (epsilon=0.8, confidence=0.9, size=1000) 73 + let mut cvm: CVM<String> = CVM::builder().build().unwrap(); 74 + 75 + // Custom configuration with confidence level 76 + let mut cvm: CVM<i32> = CVM::builder() 77 + .epsilon(0.05) // 5 % accuracy 78 + .confidence(0.99) // 99 % confidence 79 + .estimated_size(50_000) 80 + .build() 81 + .unwrap(); 82 + 83 + // Using delta (failure probability) instead of confidence 84 + let mut cvm: CVM<String> = CVM::builder() 85 + .epsilon(0.1) // 10 % accuracy 86 + .delta(0.01) // 1 % chance of failure 87 + .estimated_size(1_000) 88 + .build() 89 + .unwrap(); 90 + 91 + // Process your data 92 + for word in text.split_whitespace() { 93 + cvm.process_element(word.to_string()); 94 + } 95 + 96 + let estimate = cvm.calculate_final_result(); 97 + println!("Estimated unique words: {}", estimate as usize); 98 + ``` 99 + 100 + The builder validates parameters and provides clear error messages for invalid inputs. 101 + 49 102 ## Analysis 50 103 51 104 ![](cvmcount.png)
+1 -1
benches/benchmarks.rs
··· 8 8 9 9 use criterion::Criterion; 10 10 use cvmcount::CVM; 11 - use rand::{thread_rng, Rng}; 11 + use rand::{Rng, thread_rng}; 12 12 use regex::Regex; 13 13 14 14 use std::collections::HashSet;
+287
src/lib.rs
··· 9 9 use rand::rngs::StdRng; 10 10 use rand::{Rng, SeedableRng}; 11 11 12 + /// Specification for confidence level in the CVM algorithm 13 + #[derive(Debug, Clone, Copy)] 14 + pub enum ConfidenceSpec { 15 + /// Specify delta directly (probability of failure) 16 + Delta(f64), 17 + /// Specify confidence level (probability of success) 18 + Confidence(f64), 19 + } 20 + 21 + impl ConfidenceSpec { 22 + /// Convert to delta value for internal use 23 + fn to_delta(self) -> f64 { 24 + match self { 25 + ConfidenceSpec::Delta(delta) => delta, 26 + ConfidenceSpec::Confidence(confidence) => 1.0 - confidence, 27 + } 28 + } 29 + 30 + /// Validate the confidence specification 31 + fn validate(self) -> Result<Self, String> { 32 + match self { 33 + ConfidenceSpec::Delta(delta) => { 34 + if delta <= 0.0 || delta >= 1.0 { 35 + Err("Delta must be between 0.0 and 1.0 (exclusive)".to_string()) 36 + } else { 37 + Ok(self) 38 + } 39 + } 40 + ConfidenceSpec::Confidence(confidence) => { 41 + if confidence <= 0.0 || confidence >= 1.0 { 42 + Err("Confidence must be between 0.0 and 1.0 (exclusive)".to_string()) 43 + } else { 44 + Ok(self) 45 + } 46 + } 47 + } 48 + } 49 + } 50 + 51 + /// Builder for constructing CVM instances with validation and defaults 52 + /// 53 + /// # Examples 54 + /// 55 + /// ``` 56 + /// use cvmcount::CVM; 57 + /// 58 + /// // Using defaults (`epsilon=0.8`, `confidence=0.9`, `size=1000`) 59 + /// let cvm: CVM<String> = CVM::<String>::builder().build().unwrap(); 60 + /// 61 + /// // Custom parameters 62 + /// let cvm: CVM<i32> = CVM::<i32>::builder() 63 + /// .epsilon(0.05) // 5 % accuracy 64 + /// .confidence(0.99) // 99 % confidence 65 + /// .estimated_size(10_000) 66 + /// .build() 67 + /// .unwrap(); 68 + /// 69 + /// // Using delta instead of confidence 70 + /// let cvm: CVM<String> = CVM::<String>::builder() 71 + /// .epsilon(0.1) 72 + /// .delta(0.01) // 1 % failure probability 73 + /// .build() 74 + /// .unwrap(); 75 + /// ``` 76 + #[derive(Debug, Clone, Default)] 77 + pub struct CVMBuilder { 78 + epsilon: Option<f64>, 79 + confidence_spec: Option<ConfidenceSpec>, 80 + stream_size: Option<usize>, 81 + } 82 + 83 + impl CVMBuilder { 84 + /// Create a new builder with default values 85 + pub fn new() -> Self { 86 + Self::default() 87 + } 88 + 89 + /// Set the epsilon parameter (accuracy requirement) 90 + /// 91 + /// `Epsilon` determines how close you want your estimate to be to the true number 92 + /// of distinct elements. A smaller `ε` means you require a more precise estimate. 93 + /// For example, `ε = 0.05` means you want your estimate to be within 5 % of the 94 + /// actual value. 95 + /// 96 + /// Must be between 0.0 and 1.0 (exclusive). 97 + pub fn epsilon(mut self, epsilon: f64) -> Self { 98 + self.epsilon = Some(epsilon); 99 + self 100 + } 101 + 102 + /// Set the confidence level (probability that the estimate will be accurate) 103 + /// 104 + /// Confidence represents how certain you want to be that the algorithm's 105 + /// estimate will fall within the desired accuracy range. For example, 106 + /// `confidence = 0.99` means you're 99 % sure the estimate will be accurate. 107 + /// 108 + /// Must be between 0.0 and 1.0 (exclusive). 109 + /// Cannot be used together with [`Self::delta`] – the last one called will be used. 110 + pub fn confidence(mut self, confidence: f64) -> Self { 111 + self.confidence_spec = Some(ConfidenceSpec::Confidence(confidence)); 112 + self 113 + } 114 + 115 + /// Set the delta parameter (probability of failure) 116 + /// 117 + /// Delta represents the probability that the algorithm's estimate will fall 118 + /// outside the desired accuracy range. For example, `delta = 0.01` means there's 119 + /// a 1 % chance the estimate will be inaccurate. 120 + /// 121 + /// Must be between 0.0 and 1.0 (exclusive). 122 + /// Cannot be used together with [`Self::confidence()`] – the last one called will be used. 123 + pub fn delta(mut self, delta: f64) -> Self { 124 + self.confidence_spec = Some(ConfidenceSpec::Delta(delta)); 125 + self 126 + } 127 + 128 + /// Set the estimated stream size 129 + /// 130 + /// This is used to determine buffer size and can be a loose approximation. 131 + /// The closer it is to the actual stream size, the more accurate the results 132 + /// will be. 133 + pub fn estimated_size(mut self, size: usize) -> Self { 134 + self.stream_size = Some(size); 135 + self 136 + } 137 + 138 + /// Build the CVM instance with validation 139 + /// 140 + /// Uses the following defaults if not specified: 141 + /// - `epsilon: 0.8` (good starting point for most applications) 142 + /// - `confidence: 0.9` (90 % confidence, equivalent to delta = 0.1) 143 + /// - `estimated_size: 1000` 144 + /// 145 + /// Returns an error if any parameters are invalid. 146 + pub fn build<T: Ord>(self) -> Result<CVM<T>, String> { 147 + // Validate and get epsilon 148 + let epsilon = self.epsilon.unwrap_or(0.8); 149 + if epsilon <= 0.0 || epsilon >= 1.0 { 150 + return Err("Epsilon must be between 0.0 and 1.0 (exclusive)".to_string()); 151 + } 152 + 153 + // Validate and get delta 154 + let confidence_spec = self 155 + .confidence_spec 156 + .unwrap_or(ConfidenceSpec::Confidence(0.9)); 157 + let validated_spec = confidence_spec.validate()?; 158 + let delta = validated_spec.to_delta(); 159 + 160 + // Validate and get stream size 161 + let stream_size = self.stream_size.unwrap_or(1000); 162 + if stream_size == 0 { 163 + return Err("Stream size must be greater than 0".to_string()); 164 + } 165 + 166 + Ok(CVM::new(epsilon, delta, stream_size)) 167 + } 168 + } 169 + 12 170 /// A counter implementing the CVM algorithm 13 171 /// 14 172 /// This implementation uses a treap (randomized binary search tree) as the buffer, ··· 24 182 } 25 183 26 184 impl<T: Ord> CVM<T> { 185 + /// Create a new builder for constructing CVM instances 186 + /// 187 + /// The builder provides a more ergonomic way to construct CVM instances with 188 + /// validation and sensible defaults. 189 + /// 190 + /// # Examples 191 + /// 192 + /// ``` 193 + /// use cvmcount::CVM; 194 + /// 195 + /// // Using defaults 196 + /// let cvm: CVM<String> = CVM::<String>::builder().build().unwrap(); 197 + /// 198 + /// // Custom configuration 199 + /// let cvm: CVM<i32> = CVM::<i32>::builder() 200 + /// .epsilon(0.05) 201 + /// .confidence(0.99) 202 + /// .estimated_size(10_000) 203 + /// .build() 204 + /// .unwrap(); 205 + /// ``` 206 + pub fn builder() -> CVMBuilder { 207 + CVMBuilder::new() 208 + } 209 + 27 210 /// Initialise the algorithm 28 211 /// 29 212 /// `epsilon`: how close you want your estimate to be to the true number of distinct elements. ··· 90 273 path::Path, 91 274 }; 92 275 276 + use super::{CVM, ConfidenceSpec}; 93 277 use regex::Regex; 94 278 use std::collections::HashSet; 95 279 ··· 117 301 br.lines() 118 302 .for_each(|line| line_to_word(&re, &mut hs, &line.unwrap())); 119 303 assert_eq!(hs.len(), 9016) 304 + } 305 + 306 + #[test] 307 + fn test_builder_defaults() { 308 + let cvm: CVM<String> = CVM::<String>::builder().build().unwrap(); 309 + // Verify that it's properly constructed with defaults 310 + assert_eq!(cvm.calculate_final_result(), 0.0); // Empty buffer 311 + } 312 + 313 + #[test] 314 + fn test_builder_custom_params() { 315 + let cvm: CVM<i32> = CVM::<i32>::builder() 316 + .epsilon(0.05) 317 + .confidence(0.99) 318 + .estimated_size(5000) 319 + .build() 320 + .unwrap(); 321 + 322 + // Test that it works by processing some elements 323 + let mut cvm = cvm; 324 + for i in 0..100 { 325 + cvm.process_element(i); 326 + } 327 + let result = cvm.calculate_final_result(); 328 + assert!(result > 0.0); 329 + } 330 + 331 + #[test] 332 + fn test_builder_delta_vs_confidence() { 333 + // Test that confidence and delta give equivalent results 334 + let cvm1: CVM<i32> = CVM::<i32>::builder().confidence(0.9).build().unwrap(); 335 + 336 + let cvm2: CVM<i32> = CVM::<i32>::builder().delta(0.1).build().unwrap(); 337 + 338 + // They should have the same internal configuration 339 + // (we can't directly test this without exposing internals, 340 + // but we can test they both work) 341 + assert_eq!(cvm1.calculate_final_result(), 0.0); 342 + assert_eq!(cvm2.calculate_final_result(), 0.0); 343 + } 344 + 345 + #[test] 346 + fn test_builder_last_wins() { 347 + // Test that the last confidence/delta setting wins 348 + let cvm: CVM<i32> = CVM::<i32>::builder() 349 + .confidence(0.9) 350 + .delta(0.05) // This should override confidence 351 + .build() 352 + .unwrap(); 353 + 354 + assert_eq!(cvm.calculate_final_result(), 0.0); 355 + } 356 + 357 + #[test] 358 + fn test_builder_validation() { 359 + // Test epsilon validation 360 + let result = CVM::<i32>::builder().epsilon(0.0).build::<i32>(); 361 + assert!(result.is_err()); 362 + 363 + let result = CVM::<i32>::builder().epsilon(1.0).build::<i32>(); 364 + assert!(result.is_err()); 365 + 366 + let result = CVM::<i32>::builder().epsilon(-0.5).build::<i32>(); 367 + assert!(result.is_err()); 368 + 369 + // Test confidence validation 370 + let result = CVM::<i32>::builder().confidence(0.0).build::<i32>(); 371 + assert!(result.is_err()); 372 + 373 + let result = CVM::<i32>::builder().confidence(1.0).build::<i32>(); 374 + assert!(result.is_err()); 375 + 376 + // Test delta validation 377 + let result = CVM::<i32>::builder().delta(0.0).build::<i32>(); 378 + assert!(result.is_err()); 379 + 380 + let result = CVM::<i32>::builder().delta(1.0).build::<i32>(); 381 + assert!(result.is_err()); 382 + 383 + // Test stream size validation 384 + let result = CVM::<i32>::builder().estimated_size(0).build::<i32>(); 385 + assert!(result.is_err()); 386 + } 387 + 388 + #[test] 389 + fn test_builder_method_chaining() { 390 + let result = CVM::<String>::builder() 391 + .epsilon(0.1) 392 + .confidence(0.95) 393 + .estimated_size(2000) 394 + .build::<String>(); 395 + 396 + assert!(result.is_ok()); 397 + } 398 + 399 + #[test] 400 + fn test_confidence_spec_conversion() { 401 + // Test ConfidenceSpec::to_delta conversion 402 + let confidence_spec = ConfidenceSpec::Confidence(0.9); 403 + assert!((confidence_spec.to_delta() - 0.1).abs() < f64::EPSILON); 404 + 405 + let delta_spec = ConfidenceSpec::Delta(0.05); 406 + assert!((delta_spec.to_delta() - 0.05).abs() < f64::EPSILON); 120 407 } 121 408 }
+1 -1
src/main.rs
··· 1 - use clap::{arg, crate_version, value_parser, Command}; 1 + use clap::{Command, arg, crate_version, value_parser}; 2 2 use regex::Regex; 3 3 use std::fs::File; 4 4 use std::io::BufRead;
+1 -1
src/treap.rs
··· 255 255 #[cfg(test)] 256 256 mod tests { 257 257 use super::*; 258 - use rand::rngs::StdRng; 259 258 use rand::SeedableRng; 259 + use rand::rngs::StdRng; 260 260 261 261 #[test] 262 262 fn test_insert_and_contains() {