bloom-rs
bloom-rs copied to clipboard
False-positive rate much higher due to broken HashIter implementation
Hi,
I implemented a simple bloomfilter on my own and wondered, why your implementation was by a factor of ~2.5 faster than my implementation.
So I started to inspect your code and figured out that there is a serious issue:
Your different hashfunctions, implemented as HashIter
, are not independent from each other.
Due to cyclic groups algebra, some values may be occur much more often than others. A fix for this problem would be to use independent RandomState
s for each hashfunction.
This results in vastly bigger false-positive rates than intended.
See the following code snippet to understand what I mean:
for _ in 0..30 {
let rnd_st1 = RandomState::new();
let rnd_st2 = RandomState::new();
let hash_iter = HashIter::from(42u32, 1000000u32, &rnd_st1, &rnd_st2);
let mut arr = vec![0; 1<<16];
for val in hash_iter {
arr[val as u16 as usize] +=1;
}
let mut hashmap : HashMap<usize, usize> = HashMap::new();
for val in arr {
let x : usize;
if let Some(_x) = hashmap.get_mut(&val) {
x = *_x + 1;
} else {
x = 1;
}
hashmap.insert(val, x);
}
println!("HashMap {{(value, occurences)}} || {:?}", hashmap);
}
prints (not always the same due to some randomness):
HashMap {(value, occurences)} || {0: 32768, 30: 15808, 31: 16960}
HashMap {(value, occurences)} || {0: 32768, 31: 16958, 32: 1, 30: 15809}
HashMap {(value, occurences)} || {0: 32768, 31: 16960, 30: 15808}
HashMap {(value, occurences)} || {0: 57343, 122: 7617, 1: 1, 123: 575}
HashMap {(value, occurences)} || {0: 61439, 1: 1, 245: 575, 244: 3521}
HashMap {(value, occurences)} || {1: 1, 977: 575, 976: 449, 0: 64511}
HashMap {(value, occurences)} || {15: 48576, 16: 16960}
HashMap {(value, occurences)} || {15: 48576, 16: 16960}
HashMap {(value, occurences)} || {15: 48576, 16: 16960}
HashMap {(value, occurences)} || {15: 48576, 16: 16960}
HashMap {(value, occurences)} || {15: 48576, 16: 16960}
HashMap {(value, occurences)} || {15: 48576, 16: 16960}
HashMap {(value, occurences)} || {15: 48577, 17: 1, 16: 16958}
HashMap {(value, occurences)} || {16: 16958, 15: 48577, 17: 1}
HashMap {(value, occurences)} || {16: 16958, 17: 1, 15: 48577}
HashMap {(value, occurences)} || {16: 16960, 15: 48576}
HashMap {(value, occurences)} || {16: 16960, 15: 48576}
HashMap {(value, occurences)} || {17: 1, 15: 48577, 16: 16958}
HashMap {(value, occurences)} || {17: 1, 15: 48577, 16: 16958}
HashMap {(value, occurences)} || {17: 1, 15: 48577, 16: 16958}
HashMap {(value, occurences)} || {17: 1, 16: 16958, 15: 48577}
HashMap {(value, occurences)} || {245: 575, 0: 61439, 1: 1, 244: 3521}
HashMap {(value, occurences)} || {30: 15808, 0: 32768, 31: 16960}
HashMap {(value, occurences)} || {30: 15808, 31: 16960, 0: 32768}
HashMap {(value, occurences)} || {30: 15809, 0: 32768, 31: 16958, 32: 1}
HashMap {(value, occurences)} || {30: 15809, 32: 1, 0: 32768, 31: 16958}
HashMap {(value, occurences)} || {31: 16958, 32: 1, 30: 15809, 0: 32768}
HashMap {(value, occurences)} || {32: 1, 0: 32768, 31: 16958, 30: 15809}
HashMap {(value, occurences)} || {32: 1, 31: 16958, 0: 32768, 30: 15809}
HashMap {(value, occurences)} || {978: 1, 977: 573, 976: 450, 0: 64511, 1: 1}