A better Rust ATProto crate
1// License and Copyright Notice:
2//
3// Some of the code and doc comments in this module were ported or copied from
4// a Java class `com.github.benmanes.caffeine.cache.FrequencySketch` of Caffeine.
5// https://github.com/ben-manes/caffeine/blob/master/caffeine/src/main/java/com/github/benmanes/caffeine/cache/FrequencySketch.java
6//
7// The original code/comments from Caffeine are licensed under the Apache License,
8// Version 2.0 <https://github.com/ben-manes/caffeine/blob/master/LICENSE>
9//
10// Copyrights of the original code/comments are retained by their contributors.
11// For full authorship information, see the version control history of
12// https://github.com/ben-manes/caffeine/
13
14/// A probabilistic multi-set for estimating the popularity of an element within
15/// a time window. The maximum frequency of an element is limited to 15 (4-bits)
16/// and an aging process periodically halves the popularity of all elements.
17#[derive(Default)]
18pub(crate) struct FrequencySketch {
19 sample_size: u32,
20 table_mask: u32,
21 table: Box<[u64]>,
22 size: u32,
23}
24
25// A mixture of seeds from FNV-1a, CityHash, and Murmur3. (Taken from Caffeine)
26static SEED: [u64; 4] = [
27 0xc3a5_c85c_97cb_3127,
28 0xb492_b66f_be98_f273,
29 0x9ae1_6a3b_2f90_404f,
30 0xcbf2_9ce4_8422_2325,
31];
32
33static RESET_MASK: u64 = 0x7777_7777_7777_7777;
34
35static ONE_MASK: u64 = 0x1111_1111_1111_1111;
36
37// -------------------------------------------------------------------------------
38// Some of the code and doc comments in this module were ported or copied from
39// a Java class `com.github.benmanes.caffeine.cache.FrequencySketch` of Caffeine.
40// https://github.com/ben-manes/caffeine/blob/master/caffeine/src/main/java/com/github/benmanes/caffeine/cache/FrequencySketch.java
41// -------------------------------------------------------------------------------
42//
43// FrequencySketch maintains a 4-bit CountMinSketch [1] with periodic aging to
44// provide the popularity history for the TinyLfu admission policy [2].
45// The time and space efficiency of the sketch allows it to cheaply estimate the
46// frequency of an entry in a stream of cache access events.
47//
48// The counter matrix is represented as a single dimensional array holding 16
49// counters per slot. A fixed depth of four balances the accuracy and cost,
50// resulting in a width of four times the length of the array. To retain an
51// accurate estimation the array's length equals the maximum number of entries
52// in the cache, increased to the closest power-of-two to exploit more efficient
53// bit masking. This configuration results in a confidence of 93.75% and error
54// bound of e / width.
55//
56// The frequency of all entries is aged periodically using a sampling window
57// based on the maximum number of entries in the cache. This is referred to as
58// the reset operation by TinyLfu and keeps the sketch fresh by dividing all
59// counters by two and subtracting based on the number of odd counters
60// found. The O(n) cost of aging is amortized, ideal for hardware pre-fetching,
61// and uses inexpensive bit manipulations per array location.
62//
63// [1] An Improved Data Stream Summary: The Count-Min Sketch and its Applications
64// http://dimacs.rutgers.edu/~graham/pubs/papers/cm-full.pdf
65// [2] TinyLFU: A Highly Efficient Cache Admission Policy
66// https://dl.acm.org/citation.cfm?id=3149371
67//
68// -------------------------------------------------------------------------------
69
70impl FrequencySketch {
71 /// Initializes and increases the capacity of this `FrequencySketch` instance,
72 /// if necessary, to ensure that it can accurately estimate the popularity of
73 /// elements given the maximum size of the cache. This operation forgets all
74 /// previous counts when resizing.
75 pub(crate) fn ensure_capacity(&mut self, cap: u32) {
76 // The max byte size of the table, Box<[u64; table_size]>
77 //
78 // | Pointer width | Max size |
79 // |:-----------------|---------:|
80 // | 16 bit | 8 KiB |
81 // | 32 bit | 128 MiB |
82 // | 64 bit or bigger | 8 GiB |
83
84 let maximum = if cfg!(target_pointer_width = "16") {
85 cap.min(1024)
86 } else if cfg!(target_pointer_width = "32") {
87 cap.min(2u32.pow(24)) // about 16 millions
88 } else {
89 // Same to Caffeine's limit:
90 // `Integer.MAX_VALUE >>> 1` with `ceilingPowerOfTwo()` applied.
91 cap.min(2u32.pow(30)) // about 1 billion
92 };
93 let table_size = if maximum == 0 {
94 1
95 } else {
96 maximum.next_power_of_two()
97 };
98
99 if self.table.len() as u32 >= table_size {
100 return;
101 }
102
103 self.table = vec![0; table_size as usize].into_boxed_slice();
104 self.table_mask = table_size - 1;
105 self.sample_size = if cap == 0 {
106 10
107 } else {
108 maximum.saturating_mul(10).min(i32::MAX as u32)
109 };
110 }
111
112 /// Takes the hash value of an element, and returns the estimated number of
113 /// occurrences of the element, up to the maximum (15).
114 pub(crate) fn frequency(&self, hash: u64) -> u8 {
115 if self.table.is_empty() {
116 return 0;
117 }
118
119 let start = ((hash & 3) << 2) as u8;
120 let mut frequency = u8::MAX;
121 for i in 0..4 {
122 let index = self.index_of(hash, i);
123 let shift = (start + i) << 2;
124 let count = ((self.table[index] >> shift) & 0xF) as u8;
125 frequency = frequency.min(count);
126 }
127 frequency
128 }
129
130 /// Take a hash value of an element and increments the popularity of the
131 /// element if it does not exceed the maximum (15). The popularity of all
132 /// elements will be periodically down sampled when the observed events
133 /// exceeds a threshold. This process provides a frequency aging to allow
134 /// expired long term entries to fade away.
135 pub(crate) fn increment(&mut self, hash: u64) {
136 if self.table.is_empty() {
137 return;
138 }
139
140 let start = ((hash & 3) << 2) as u8;
141 let mut added = false;
142 for i in 0..4 {
143 let index = self.index_of(hash, i);
144 added |= self.increment_at(index, start + i);
145 }
146
147 if added {
148 self.size += 1;
149 if self.size >= self.sample_size {
150 self.reset();
151 }
152 }
153 }
154
155 /// Takes a table index (each entry has 16 counters) and counter index, and
156 /// increments the counter by 1 if it is not already at the maximum value
157 /// (15). Returns `true` if incremented.
158 fn increment_at(&mut self, table_index: usize, counter_index: u8) -> bool {
159 let offset = (counter_index as usize) << 2;
160 let mask = 0xF_u64 << offset;
161 if self.table[table_index] & mask != mask {
162 self.table[table_index] += 1u64 << offset;
163 true
164 } else {
165 false
166 }
167 }
168
169 /// Reduces every counter by half of its original value.
170 fn reset(&mut self) {
171 let mut count = 0u32;
172 for entry in self.table.iter_mut() {
173 // Count number of odd numbers.
174 count += (*entry & ONE_MASK).count_ones();
175 *entry = (*entry >> 1) & RESET_MASK;
176 }
177 self.size = (self.size >> 1) - (count >> 2);
178 }
179
180 /// Returns the table index for the counter at the specified depth.
181 fn index_of(&self, hash: u64, depth: u8) -> usize {
182 let i = depth as usize;
183 let mut hash = hash.wrapping_add(SEED[i]).wrapping_mul(SEED[i]);
184 hash = hash.wrapping_add(hash >> 32);
185 (hash & (self.table_mask as u64)) as usize
186 }
187}
188
189// Methods only available for testing.
190#[cfg(test)]
191impl FrequencySketch {
192 pub(crate) fn table_len(&self) -> usize {
193 self.table.len()
194 }
195}
196
197// Some test cases were ported from Caffeine at:
198// https://github.com/ben-manes/caffeine/blob/master/caffeine/src/test/java/com/github/benmanes/caffeine/cache/FrequencySketchTest.java
199//
200// To see the debug prints, run test as `cargo test -- --nocapture`
201#[cfg(test)]
202mod tests {
203 use super::FrequencySketch;
204 use once_cell::sync::Lazy;
205 use std::hash::{BuildHasher, Hash};
206
207 static ITEM: Lazy<u32> = Lazy::new(|| {
208 let mut buf = [0; 4];
209 getrandom::getrandom(&mut buf).unwrap();
210 #[allow(unnecessary_transmutes)]
211 unsafe {
212 std::mem::transmute::<[u8; 4], u32>(buf)
213 }
214 });
215
216 // This test was ported from Caffeine.
217 #[test]
218 fn increment_once() {
219 let mut sketch = FrequencySketch::default();
220 sketch.ensure_capacity(512);
221 let hasher = hasher();
222 let item_hash = hasher(*ITEM);
223 sketch.increment(item_hash);
224 assert_eq!(sketch.frequency(item_hash), 1);
225 }
226
227 // This test was ported from Caffeine.
228 #[test]
229 fn increment_max() {
230 let mut sketch = FrequencySketch::default();
231 sketch.ensure_capacity(512);
232 let hasher = hasher();
233 let item_hash = hasher(*ITEM);
234 for _ in 0..20 {
235 sketch.increment(item_hash);
236 }
237 assert_eq!(sketch.frequency(item_hash), 15);
238 }
239
240 // This test was ported from Caffeine.
241 #[test]
242 fn increment_distinct() {
243 let mut sketch = FrequencySketch::default();
244 sketch.ensure_capacity(512);
245 let hasher = hasher();
246 sketch.increment(hasher(*ITEM));
247 sketch.increment(hasher(ITEM.wrapping_add(1)));
248 assert_eq!(sketch.frequency(hasher(*ITEM)), 1);
249 assert_eq!(sketch.frequency(hasher(ITEM.wrapping_add(1))), 1);
250 assert_eq!(sketch.frequency(hasher(ITEM.wrapping_add(2))), 0);
251 }
252
253 // This test was ported from Caffeine.
254 #[test]
255 fn index_of_around_zero() {
256 let mut sketch = FrequencySketch::default();
257 sketch.ensure_capacity(512);
258 let mut indexes = std::collections::HashSet::new();
259 let hashes = [u64::MAX, 0, 1];
260 for hash in hashes.iter() {
261 for depth in 0..4 {
262 indexes.insert(sketch.index_of(*hash, depth));
263 }
264 }
265 assert_eq!(indexes.len(), 4 * hashes.len())
266 }
267
268 // This test was ported from Caffeine.
269 #[test]
270 fn reset() {
271 let mut reset = false;
272 let mut sketch = FrequencySketch::default();
273 sketch.ensure_capacity(64);
274 let hasher = hasher();
275
276 for i in 1..(20 * sketch.table.len() as u32) {
277 sketch.increment(hasher(i));
278 if sketch.size != i {
279 reset = true;
280 break;
281 }
282 }
283
284 assert!(reset);
285 assert!(sketch.size <= sketch.sample_size / 2);
286 }
287
288 // This test was ported from Caffeine.
289 #[test]
290 fn heavy_hitters() {
291 let mut sketch = FrequencySketch::default();
292 sketch.ensure_capacity(65_536);
293 let hasher = hasher();
294
295 for i in 100..100_000 {
296 sketch.increment(hasher(i));
297 }
298
299 for i in (0..10).step_by(2) {
300 for _ in 0..i {
301 sketch.increment(hasher(i));
302 }
303 }
304
305 // A perfect popularity count yields an array [0, 0, 2, 0, 4, 0, 6, 0, 8, 0]
306 let popularity = (0..10)
307 .map(|i| sketch.frequency(hasher(i)))
308 .collect::<Vec<_>>();
309
310 for (i, freq) in popularity.iter().enumerate() {
311 match i {
312 2 => assert!(freq <= &popularity[4]),
313 4 => assert!(freq <= &popularity[6]),
314 6 => assert!(freq <= &popularity[8]),
315 8 => (),
316 _ => assert!(freq <= &popularity[2]),
317 }
318 }
319 }
320
321 fn hasher<K: Hash>() -> impl Fn(K) -> u64 {
322 let build_hasher = std::collections::hash_map::RandomState::default();
323 move |key| build_hasher.hash_one(&key)
324 }
325}
326
327// Verify that some properties hold such as no panic occurs on any possible inputs.
328#[cfg(kani)]
329mod kani {
330 use super::FrequencySketch;
331
332 const CAPACITIES: &[u32] = &[
333 0,
334 1,
335 1024,
336 1025,
337 2u32.pow(24),
338 2u32.pow(24) + 1,
339 2u32.pow(30),
340 2u32.pow(30) + 1,
341 u32::MAX,
342 ];
343
344 #[kani::proof]
345 fn verify_ensure_capacity() {
346 // Check for arbitrary capacities.
347 let capacity = kani::any();
348 let mut sketch = FrequencySketch::default();
349 sketch.ensure_capacity(capacity);
350 }
351
352 #[kani::proof]
353 fn verify_frequency() {
354 // Check for some selected capacities.
355 for capacity in CAPACITIES {
356 let mut sketch = FrequencySketch::default();
357 sketch.ensure_capacity(*capacity);
358
359 // Check for arbitrary hashes.
360 let hash = kani::any();
361 let frequency = sketch.frequency(hash);
362 assert!(frequency <= 15);
363 }
364 }
365
366 #[kani::proof]
367 fn verify_increment() {
368 // Only check for small capacities. Because Kani Rust Verifier is a model
369 // checking tool, it will take much longer time (exponential) to check larger
370 // capacities here.
371 for capacity in &[0, 1, 128] {
372 let mut sketch = FrequencySketch::default();
373 sketch.ensure_capacity(*capacity);
374
375 // Check for arbitrary hashes.
376 let hash = kani::any();
377 sketch.increment(hash);
378 }
379 }
380
381 #[kani::proof]
382 fn verify_index_of() {
383 // Check for arbitrary capacities.
384 let capacity = kani::any();
385 let mut sketch = FrequencySketch::default();
386 sketch.ensure_capacity(capacity);
387
388 // Check for arbitrary hashes.
389 let hash = kani::any();
390 for i in 0..4 {
391 let index = sketch.index_of(hash, i);
392 assert!(index < sketch.table.len());
393 }
394 }
395}