A better Rust ATProto crate
0

Configure Feed

Select the types of activity you want to include in your feed.

at main 14 kB View raw
1// License and Copyright Notice: 2// 3// Some of the code and doc comments in this module were ported or copied from 4// a Java class `com.github.benmanes.caffeine.cache.FrequencySketch` of Caffeine. 5// https://github.com/ben-manes/caffeine/blob/master/caffeine/src/main/java/com/github/benmanes/caffeine/cache/FrequencySketch.java 6// 7// The original code/comments from Caffeine are licensed under the Apache License, 8// Version 2.0 <https://github.com/ben-manes/caffeine/blob/master/LICENSE> 9// 10// Copyrights of the original code/comments are retained by their contributors. 11// For full authorship information, see the version control history of 12// https://github.com/ben-manes/caffeine/ 13 14/// A probabilistic multi-set for estimating the popularity of an element within 15/// a time window. The maximum frequency of an element is limited to 15 (4-bits) 16/// and an aging process periodically halves the popularity of all elements. 17#[derive(Default)] 18pub(crate) struct FrequencySketch { 19 sample_size: u32, 20 table_mask: u32, 21 table: Box<[u64]>, 22 size: u32, 23} 24 25// A mixture of seeds from FNV-1a, CityHash, and Murmur3. (Taken from Caffeine) 26static SEED: [u64; 4] = [ 27 0xc3a5_c85c_97cb_3127, 28 0xb492_b66f_be98_f273, 29 0x9ae1_6a3b_2f90_404f, 30 0xcbf2_9ce4_8422_2325, 31]; 32 33static RESET_MASK: u64 = 0x7777_7777_7777_7777; 34 35static ONE_MASK: u64 = 0x1111_1111_1111_1111; 36 37// ------------------------------------------------------------------------------- 38// Some of the code and doc comments in this module were ported or copied from 39// a Java class `com.github.benmanes.caffeine.cache.FrequencySketch` of Caffeine. 40// https://github.com/ben-manes/caffeine/blob/master/caffeine/src/main/java/com/github/benmanes/caffeine/cache/FrequencySketch.java 41// ------------------------------------------------------------------------------- 42// 43// FrequencySketch maintains a 4-bit CountMinSketch [1] with periodic aging to 44// provide the popularity history for the TinyLfu admission policy [2]. 45// The time and space efficiency of the sketch allows it to cheaply estimate the 46// frequency of an entry in a stream of cache access events. 47// 48// The counter matrix is represented as a single dimensional array holding 16 49// counters per slot. A fixed depth of four balances the accuracy and cost, 50// resulting in a width of four times the length of the array. To retain an 51// accurate estimation the array's length equals the maximum number of entries 52// in the cache, increased to the closest power-of-two to exploit more efficient 53// bit masking. This configuration results in a confidence of 93.75% and error 54// bound of e / width. 55// 56// The frequency of all entries is aged periodically using a sampling window 57// based on the maximum number of entries in the cache. This is referred to as 58// the reset operation by TinyLfu and keeps the sketch fresh by dividing all 59// counters by two and subtracting based on the number of odd counters 60// found. The O(n) cost of aging is amortized, ideal for hardware pre-fetching, 61// and uses inexpensive bit manipulations per array location. 62// 63// [1] An Improved Data Stream Summary: The Count-Min Sketch and its Applications 64// http://dimacs.rutgers.edu/~graham/pubs/papers/cm-full.pdf 65// [2] TinyLFU: A Highly Efficient Cache Admission Policy 66// https://dl.acm.org/citation.cfm?id=3149371 67// 68// ------------------------------------------------------------------------------- 69 70impl FrequencySketch { 71 /// Initializes and increases the capacity of this `FrequencySketch` instance, 72 /// if necessary, to ensure that it can accurately estimate the popularity of 73 /// elements given the maximum size of the cache. This operation forgets all 74 /// previous counts when resizing. 75 pub(crate) fn ensure_capacity(&mut self, cap: u32) { 76 // The max byte size of the table, Box<[u64; table_size]> 77 // 78 // | Pointer width | Max size | 79 // |:-----------------|---------:| 80 // | 16 bit | 8 KiB | 81 // | 32 bit | 128 MiB | 82 // | 64 bit or bigger | 8 GiB | 83 84 let maximum = if cfg!(target_pointer_width = "16") { 85 cap.min(1024) 86 } else if cfg!(target_pointer_width = "32") { 87 cap.min(2u32.pow(24)) // about 16 millions 88 } else { 89 // Same to Caffeine's limit: 90 // `Integer.MAX_VALUE >>> 1` with `ceilingPowerOfTwo()` applied. 91 cap.min(2u32.pow(30)) // about 1 billion 92 }; 93 let table_size = if maximum == 0 { 94 1 95 } else { 96 maximum.next_power_of_two() 97 }; 98 99 if self.table.len() as u32 >= table_size { 100 return; 101 } 102 103 self.table = vec![0; table_size as usize].into_boxed_slice(); 104 self.table_mask = table_size - 1; 105 self.sample_size = if cap == 0 { 106 10 107 } else { 108 maximum.saturating_mul(10).min(i32::MAX as u32) 109 }; 110 } 111 112 /// Takes the hash value of an element, and returns the estimated number of 113 /// occurrences of the element, up to the maximum (15). 114 pub(crate) fn frequency(&self, hash: u64) -> u8 { 115 if self.table.is_empty() { 116 return 0; 117 } 118 119 let start = ((hash & 3) << 2) as u8; 120 let mut frequency = u8::MAX; 121 for i in 0..4 { 122 let index = self.index_of(hash, i); 123 let shift = (start + i) << 2; 124 let count = ((self.table[index] >> shift) & 0xF) as u8; 125 frequency = frequency.min(count); 126 } 127 frequency 128 } 129 130 /// Take a hash value of an element and increments the popularity of the 131 /// element if it does not exceed the maximum (15). The popularity of all 132 /// elements will be periodically down sampled when the observed events 133 /// exceeds a threshold. This process provides a frequency aging to allow 134 /// expired long term entries to fade away. 135 pub(crate) fn increment(&mut self, hash: u64) { 136 if self.table.is_empty() { 137 return; 138 } 139 140 let start = ((hash & 3) << 2) as u8; 141 let mut added = false; 142 for i in 0..4 { 143 let index = self.index_of(hash, i); 144 added |= self.increment_at(index, start + i); 145 } 146 147 if added { 148 self.size += 1; 149 if self.size >= self.sample_size { 150 self.reset(); 151 } 152 } 153 } 154 155 /// Takes a table index (each entry has 16 counters) and counter index, and 156 /// increments the counter by 1 if it is not already at the maximum value 157 /// (15). Returns `true` if incremented. 158 fn increment_at(&mut self, table_index: usize, counter_index: u8) -> bool { 159 let offset = (counter_index as usize) << 2; 160 let mask = 0xF_u64 << offset; 161 if self.table[table_index] & mask != mask { 162 self.table[table_index] += 1u64 << offset; 163 true 164 } else { 165 false 166 } 167 } 168 169 /// Reduces every counter by half of its original value. 170 fn reset(&mut self) { 171 let mut count = 0u32; 172 for entry in self.table.iter_mut() { 173 // Count number of odd numbers. 174 count += (*entry & ONE_MASK).count_ones(); 175 *entry = (*entry >> 1) & RESET_MASK; 176 } 177 self.size = (self.size >> 1) - (count >> 2); 178 } 179 180 /// Returns the table index for the counter at the specified depth. 181 fn index_of(&self, hash: u64, depth: u8) -> usize { 182 let i = depth as usize; 183 let mut hash = hash.wrapping_add(SEED[i]).wrapping_mul(SEED[i]); 184 hash = hash.wrapping_add(hash >> 32); 185 (hash & (self.table_mask as u64)) as usize 186 } 187} 188 189// Methods only available for testing. 190#[cfg(test)] 191impl FrequencySketch { 192 pub(crate) fn table_len(&self) -> usize { 193 self.table.len() 194 } 195} 196 197// Some test cases were ported from Caffeine at: 198// https://github.com/ben-manes/caffeine/blob/master/caffeine/src/test/java/com/github/benmanes/caffeine/cache/FrequencySketchTest.java 199// 200// To see the debug prints, run test as `cargo test -- --nocapture` 201#[cfg(test)] 202mod tests { 203 use super::FrequencySketch; 204 use once_cell::sync::Lazy; 205 use std::hash::{BuildHasher, Hash}; 206 207 static ITEM: Lazy<u32> = Lazy::new(|| { 208 let mut buf = [0; 4]; 209 getrandom::getrandom(&mut buf).unwrap(); 210 #[allow(unnecessary_transmutes)] 211 unsafe { 212 std::mem::transmute::<[u8; 4], u32>(buf) 213 } 214 }); 215 216 // This test was ported from Caffeine. 217 #[test] 218 fn increment_once() { 219 let mut sketch = FrequencySketch::default(); 220 sketch.ensure_capacity(512); 221 let hasher = hasher(); 222 let item_hash = hasher(*ITEM); 223 sketch.increment(item_hash); 224 assert_eq!(sketch.frequency(item_hash), 1); 225 } 226 227 // This test was ported from Caffeine. 228 #[test] 229 fn increment_max() { 230 let mut sketch = FrequencySketch::default(); 231 sketch.ensure_capacity(512); 232 let hasher = hasher(); 233 let item_hash = hasher(*ITEM); 234 for _ in 0..20 { 235 sketch.increment(item_hash); 236 } 237 assert_eq!(sketch.frequency(item_hash), 15); 238 } 239 240 // This test was ported from Caffeine. 241 #[test] 242 fn increment_distinct() { 243 let mut sketch = FrequencySketch::default(); 244 sketch.ensure_capacity(512); 245 let hasher = hasher(); 246 sketch.increment(hasher(*ITEM)); 247 sketch.increment(hasher(ITEM.wrapping_add(1))); 248 assert_eq!(sketch.frequency(hasher(*ITEM)), 1); 249 assert_eq!(sketch.frequency(hasher(ITEM.wrapping_add(1))), 1); 250 assert_eq!(sketch.frequency(hasher(ITEM.wrapping_add(2))), 0); 251 } 252 253 // This test was ported from Caffeine. 254 #[test] 255 fn index_of_around_zero() { 256 let mut sketch = FrequencySketch::default(); 257 sketch.ensure_capacity(512); 258 let mut indexes = std::collections::HashSet::new(); 259 let hashes = [u64::MAX, 0, 1]; 260 for hash in hashes.iter() { 261 for depth in 0..4 { 262 indexes.insert(sketch.index_of(*hash, depth)); 263 } 264 } 265 assert_eq!(indexes.len(), 4 * hashes.len()) 266 } 267 268 // This test was ported from Caffeine. 269 #[test] 270 fn reset() { 271 let mut reset = false; 272 let mut sketch = FrequencySketch::default(); 273 sketch.ensure_capacity(64); 274 let hasher = hasher(); 275 276 for i in 1..(20 * sketch.table.len() as u32) { 277 sketch.increment(hasher(i)); 278 if sketch.size != i { 279 reset = true; 280 break; 281 } 282 } 283 284 assert!(reset); 285 assert!(sketch.size <= sketch.sample_size / 2); 286 } 287 288 // This test was ported from Caffeine. 289 #[test] 290 fn heavy_hitters() { 291 let mut sketch = FrequencySketch::default(); 292 sketch.ensure_capacity(65_536); 293 let hasher = hasher(); 294 295 for i in 100..100_000 { 296 sketch.increment(hasher(i)); 297 } 298 299 for i in (0..10).step_by(2) { 300 for _ in 0..i { 301 sketch.increment(hasher(i)); 302 } 303 } 304 305 // A perfect popularity count yields an array [0, 0, 2, 0, 4, 0, 6, 0, 8, 0] 306 let popularity = (0..10) 307 .map(|i| sketch.frequency(hasher(i))) 308 .collect::<Vec<_>>(); 309 310 for (i, freq) in popularity.iter().enumerate() { 311 match i { 312 2 => assert!(freq <= &popularity[4]), 313 4 => assert!(freq <= &popularity[6]), 314 6 => assert!(freq <= &popularity[8]), 315 8 => (), 316 _ => assert!(freq <= &popularity[2]), 317 } 318 } 319 } 320 321 fn hasher<K: Hash>() -> impl Fn(K) -> u64 { 322 let build_hasher = std::collections::hash_map::RandomState::default(); 323 move |key| build_hasher.hash_one(&key) 324 } 325} 326 327// Verify that some properties hold such as no panic occurs on any possible inputs. 328#[cfg(kani)] 329mod kani { 330 use super::FrequencySketch; 331 332 const CAPACITIES: &[u32] = &[ 333 0, 334 1, 335 1024, 336 1025, 337 2u32.pow(24), 338 2u32.pow(24) + 1, 339 2u32.pow(30), 340 2u32.pow(30) + 1, 341 u32::MAX, 342 ]; 343 344 #[kani::proof] 345 fn verify_ensure_capacity() { 346 // Check for arbitrary capacities. 347 let capacity = kani::any(); 348 let mut sketch = FrequencySketch::default(); 349 sketch.ensure_capacity(capacity); 350 } 351 352 #[kani::proof] 353 fn verify_frequency() { 354 // Check for some selected capacities. 355 for capacity in CAPACITIES { 356 let mut sketch = FrequencySketch::default(); 357 sketch.ensure_capacity(*capacity); 358 359 // Check for arbitrary hashes. 360 let hash = kani::any(); 361 let frequency = sketch.frequency(hash); 362 assert!(frequency <= 15); 363 } 364 } 365 366 #[kani::proof] 367 fn verify_increment() { 368 // Only check for small capacities. Because Kani Rust Verifier is a model 369 // checking tool, it will take much longer time (exponential) to check larger 370 // capacities here. 371 for capacity in &[0, 1, 128] { 372 let mut sketch = FrequencySketch::default(); 373 sketch.ensure_capacity(*capacity); 374 375 // Check for arbitrary hashes. 376 let hash = kani::any(); 377 sketch.increment(hash); 378 } 379 } 380 381 #[kani::proof] 382 fn verify_index_of() { 383 // Check for arbitrary capacities. 384 let capacity = kani::any(); 385 let mut sketch = FrequencySketch::default(); 386 sketch.ensure_capacity(capacity); 387 388 // Check for arbitrary hashes. 389 let hash = kani::any(); 390 for i in 0..4 { 391 let index = sketch.index_of(hash, i); 392 assert!(index < sketch.table.len()); 393 } 394 } 395}