···99use rand::rngs::StdRng;
1010use rand::{Rng, SeedableRng};
/// Specification for the confidence level of the CVM algorithm.
///
/// The same quantity can be stated two ways: as the probability that the
/// estimate misses the requested accuracy (`Delta`), or as the probability
/// that it meets it (`Confidence`). The two are related by
/// `delta = 1.0 - confidence`.
#[derive(Debug, Clone, Copy)]
pub enum ConfidenceSpec {
    /// Specify delta directly (probability of failure).
    Delta(f64),
    /// Specify the confidence level (probability of success).
    Confidence(f64),
}

impl ConfidenceSpec {
    /// Convert to the delta (failure probability) value used internally.
    ///
    /// No range checking happens here; call [`Self::validate`] first.
    fn to_delta(self) -> f64 {
        match self {
            ConfidenceSpec::Delta(delta) => delta,
            ConfidenceSpec::Confidence(confidence) => 1.0 - confidence,
        }
    }

    /// Validate the confidence specification.
    ///
    /// Returns `Ok(self)` when the wrapped value lies strictly between 0.0 and
    /// 1.0, and a descriptive error message otherwise. NaN is rejected as well.
    fn validate(self) -> Result<Self, String> {
        let (value, message) = match self {
            ConfidenceSpec::Delta(delta) => {
                (delta, "Delta must be between 0.0 and 1.0 (exclusive)")
            }
            ConfidenceSpec::Confidence(confidence) => (
                confidence,
                "Confidence must be between 0.0 and 1.0 (exclusive)",
            ),
        };

        // Phrased as "is inside the range" rather than "is outside the range":
        // every comparison involving NaN is false, so the negative form would
        // silently accept NaN.
        if value > 0.0 && value < 1.0 {
            Ok(self)
        } else {
            Err(message.to_string())
        }
    }
}
5050+5151+/// Builder for constructing CVM instances with validation and defaults
5252+///
5353+/// # Examples
5454+///
5555+/// ```
5656+/// use cvmcount::CVM;
5757+///
5858+/// // Using defaults (`epsilon=0.8`, `confidence=0.9`, `size=1000`)
5959+/// let cvm: CVM<String> = CVM::<String>::builder().build().unwrap();
6060+///
6161+/// // Custom parameters
6262+/// let cvm: CVM<i32> = CVM::<i32>::builder()
6363+/// .epsilon(0.05) // 5 % accuracy
6464+/// .confidence(0.99) // 99 % confidence
6565+/// .estimated_size(10_000)
6666+/// .build()
6767+/// .unwrap();
6868+///
6969+/// // Using delta instead of confidence
7070+/// let cvm: CVM<String> = CVM::<String>::builder()
7171+/// .epsilon(0.1)
7272+/// .delta(0.01) // 1 % failure probability
7373+/// .build()
7474+/// .unwrap();
7575+/// ```
7676+#[derive(Debug, Clone, Default)]
7777+pub struct CVMBuilder {
7878+ epsilon: Option<f64>,
7979+ confidence_spec: Option<ConfidenceSpec>,
8080+ stream_size: Option<usize>,
8181+}
8282+8383+impl CVMBuilder {
8484+ /// Create a new builder with default values
8585+ pub fn new() -> Self {
8686+ Self::default()
8787+ }
8888+8989+ /// Set the epsilon parameter (accuracy requirement)
9090+ ///
9191+ /// `Epsilon` determines how close you want your estimate to be to the true number
9292+ /// of distinct elements. A smaller `ε` means you require a more precise estimate.
9393+ /// For example, `ε = 0.05` means you want your estimate to be within 5 % of the
9494+ /// actual value.
9595+ ///
9696+ /// Must be between 0.0 and 1.0 (exclusive).
9797+ pub fn epsilon(mut self, epsilon: f64) -> Self {
9898+ self.epsilon = Some(epsilon);
9999+ self
100100+ }
101101+102102+ /// Set the confidence level (probability that the estimate will be accurate)
103103+ ///
104104+ /// Confidence represents how certain you want to be that the algorithm's
105105+ /// estimate will fall within the desired accuracy range. For example,
106106+ /// `confidence = 0.99` means you're 99 % sure the estimate will be accurate.
107107+ ///
108108+ /// Must be between 0.0 and 1.0 (exclusive).
109109+ /// Cannot be used together with [`Self::delta`] – the last one called will be used.
110110+ pub fn confidence(mut self, confidence: f64) -> Self {
111111+ self.confidence_spec = Some(ConfidenceSpec::Confidence(confidence));
112112+ self
113113+ }
114114+115115+ /// Set the delta parameter (probability of failure)
116116+ ///
117117+ /// Delta represents the probability that the algorithm's estimate will fall
118118+ /// outside the desired accuracy range. For example, `delta = 0.01` means there's
119119+ /// a 1 % chance the estimate will be inaccurate.
120120+ ///
121121+ /// Must be between 0.0 and 1.0 (exclusive).
122122+ /// Cannot be used together with [`Self::confidence()`] – the last one called will be used.
123123+ pub fn delta(mut self, delta: f64) -> Self {
124124+ self.confidence_spec = Some(ConfidenceSpec::Delta(delta));
125125+ self
126126+ }
127127+128128+ /// Set the estimated stream size
129129+ ///
130130+ /// This is used to determine buffer size and can be a loose approximation.
131131+ /// The closer it is to the actual stream size, the more accurate the results
132132+ /// will be.
133133+ pub fn estimated_size(mut self, size: usize) -> Self {
134134+ self.stream_size = Some(size);
135135+ self
136136+ }
137137+138138+ /// Build the CVM instance with validation
139139+ ///
140140+ /// Uses the following defaults if not specified:
141141+ /// - `epsilon: 0.8` (good starting point for most applications)
142142+ /// - `confidence: 0.9` (90 % confidence, equivalent to delta = 0.1)
143143+ /// - `estimated_size: 1000`
144144+ ///
145145+ /// Returns an error if any parameters are invalid.
146146+ pub fn build<T: Ord>(self) -> Result<CVM<T>, String> {
147147+ // Validate and get epsilon
148148+ let epsilon = self.epsilon.unwrap_or(0.8);
149149+ if epsilon <= 0.0 || epsilon >= 1.0 {
150150+ return Err("Epsilon must be between 0.0 and 1.0 (exclusive)".to_string());
151151+ }
152152+153153+ // Validate and get delta
154154+ let confidence_spec = self
155155+ .confidence_spec
156156+ .unwrap_or(ConfidenceSpec::Confidence(0.9));
157157+ let validated_spec = confidence_spec.validate()?;
158158+ let delta = validated_spec.to_delta();
159159+160160+ // Validate and get stream size
161161+ let stream_size = self.stream_size.unwrap_or(1000);
162162+ if stream_size == 0 {
163163+ return Err("Stream size must be greater than 0".to_string());
164164+ }
165165+166166+ Ok(CVM::new(epsilon, delta, stream_size))
167167+ }
168168+}
169169+12170/// A counter implementing the CVM algorithm
13171///
14172/// This implementation uses a treap (randomized binary search tree) as the buffer,
···24182}
2518326184impl<T: Ord> CVM<T> {
185185+ /// Create a new builder for constructing CVM instances
186186+ ///
187187+ /// The builder provides a more ergonomic way to construct CVM instances with
188188+ /// validation and sensible defaults.
189189+ ///
190190+ /// # Examples
191191+ ///
192192+ /// ```
193193+ /// use cvmcount::CVM;
194194+ ///
195195+ /// // Using defaults
196196+ /// let cvm: CVM<String> = CVM::<String>::builder().build().unwrap();
197197+ ///
198198+ /// // Custom configuration
199199+ /// let cvm: CVM<i32> = CVM::<i32>::builder()
200200+ /// .epsilon(0.05)
201201+ /// .confidence(0.99)
202202+ /// .estimated_size(10_000)
203203+ /// .build()
204204+ /// .unwrap();
205205+ /// ```
206206+ pub fn builder() -> CVMBuilder {
207207+ CVMBuilder::new()
208208+ }
209209+27210 /// Initialise the algorithm
28211 ///
29212 /// `epsilon`: how close you want your estimate to be to the true number of distinct elements.
···90273 path::Path,
91274 };
92275276276+ use super::{CVM, ConfidenceSpec};
93277 use regex::Regex;
94278 use std::collections::HashSet;
95279···117301 br.lines()
118302 .for_each(|line| line_to_word(&re, &mut hs, &line.unwrap()));
119303 assert_eq!(hs.len(), 9016)
304304+ }
305305+306306+ #[test]
307307+ fn test_builder_defaults() {
308308+ let cvm: CVM<String> = CVM::<String>::builder().build().unwrap();
309309+ // Verify that it's properly constructed with defaults
310310+ assert_eq!(cvm.calculate_final_result(), 0.0); // Empty buffer
311311+ }
312312+313313+ #[test]
314314+ fn test_builder_custom_params() {
315315+ let cvm: CVM<i32> = CVM::<i32>::builder()
316316+ .epsilon(0.05)
317317+ .confidence(0.99)
318318+ .estimated_size(5000)
319319+ .build()
320320+ .unwrap();
321321+322322+ // Test that it works by processing some elements
323323+ let mut cvm = cvm;
324324+ for i in 0..100 {
325325+ cvm.process_element(i);
326326+ }
327327+ let result = cvm.calculate_final_result();
328328+ assert!(result > 0.0);
329329+ }
330330+331331+ #[test]
332332+ fn test_builder_delta_vs_confidence() {
333333+ // Test that confidence and delta give equivalent results
334334+ let cvm1: CVM<i32> = CVM::<i32>::builder().confidence(0.9).build().unwrap();
335335+336336+ let cvm2: CVM<i32> = CVM::<i32>::builder().delta(0.1).build().unwrap();
337337+338338+ // They should have the same internal configuration
339339+ // (we can't directly test this without exposing internals,
340340+ // but we can test they both work)
341341+ assert_eq!(cvm1.calculate_final_result(), 0.0);
342342+ assert_eq!(cvm2.calculate_final_result(), 0.0);
343343+ }
344344+345345+ #[test]
346346+ fn test_builder_last_wins() {
347347+ // Test that the last confidence/delta setting wins
348348+ let cvm: CVM<i32> = CVM::<i32>::builder()
349349+ .confidence(0.9)
350350+ .delta(0.05) // This should override confidence
351351+ .build()
352352+ .unwrap();
353353+354354+ assert_eq!(cvm.calculate_final_result(), 0.0);
355355+ }
356356+357357+ #[test]
358358+ fn test_builder_validation() {
359359+ // Test epsilon validation
360360+ let result = CVM::<i32>::builder().epsilon(0.0).build::<i32>();
361361+ assert!(result.is_err());
362362+363363+ let result = CVM::<i32>::builder().epsilon(1.0).build::<i32>();
364364+ assert!(result.is_err());
365365+366366+ let result = CVM::<i32>::builder().epsilon(-0.5).build::<i32>();
367367+ assert!(result.is_err());
368368+369369+ // Test confidence validation
370370+ let result = CVM::<i32>::builder().confidence(0.0).build::<i32>();
371371+ assert!(result.is_err());
372372+373373+ let result = CVM::<i32>::builder().confidence(1.0).build::<i32>();
374374+ assert!(result.is_err());
375375+376376+ // Test delta validation
377377+ let result = CVM::<i32>::builder().delta(0.0).build::<i32>();
378378+ assert!(result.is_err());
379379+380380+ let result = CVM::<i32>::builder().delta(1.0).build::<i32>();
381381+ assert!(result.is_err());
382382+383383+ // Test stream size validation
384384+ let result = CVM::<i32>::builder().estimated_size(0).build::<i32>();
385385+ assert!(result.is_err());
386386+ }
387387+388388+ #[test]
389389+ fn test_builder_method_chaining() {
390390+ let result = CVM::<String>::builder()
391391+ .epsilon(0.1)
392392+ .confidence(0.95)
393393+ .estimated_size(2000)
394394+ .build::<String>();
395395+396396+ assert!(result.is_ok());
397397+ }
398398+399399+ #[test]
400400+ fn test_confidence_spec_conversion() {
401401+ // Test ConfidenceSpec::to_delta conversion
402402+ let confidence_spec = ConfidenceSpec::Confidence(0.9);
403403+ assert!((confidence_spec.to_delta() - 0.1).abs() < f64::EPSILON);
404404+405405+ let delta_spec = ConfidenceSpec::Delta(0.05);
406406+ assert!((delta_spec.to_delta() - 0.05).abs() < f64::EPSILON);
120407 }
121408}