Skip to content

Commit

Permalink
fix: support add_if_not_contains #6
Browse files Browse the repository at this point in the history
  • Loading branch information
yankun1992 committed Jul 10, 2023
1 parent 60ffbe3 commit d8a5656
Show file tree
Hide file tree
Showing 8 changed files with 76 additions and 5 deletions.
2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "fastbloom_rs"
version = "0.5.3"
version = "0.5.4"
edition = "2021"
authors = ["Yan Kun <[email protected]>"]
description = "Some fast bloom filter implemented by Rust for Python and Rust! 10x faster than pybloom!"
Expand Down
6 changes: 3 additions & 3 deletions benches/fastbloom.rs
Original file line number Diff line number Diff line change
Expand Up @@ -58,12 +58,12 @@ fn bloom_add_random_test(filter: &mut BloomFilter, random: &mut ThreadRng, range
}

/// Benchmark body for bounds-checked access: draws a random index and stores that index
/// into `vec` at the same position through ordinary `Vec` indexing.
fn bound_check_test(vec: &mut Vec<usize>, random: &mut ThreadRng) {
// NOTE(review): `value` is bound twice in a row; the second binding shadows the first,
// so only the `20 * 1024 * 1024 - 1` range determines the index that is written.
let value = random.gen_range(0..4096 * 1024 - 1);
let value = random.gen_range(0..20 * 1024 * 1024 - 1);
vec[value] = value;
}

fn unsafe_array_test(array: &mut [usize], random: &mut ThreadRng) {
let value = random.gen_range(0..4096 * 1024 - 1) as usize;
let value = random.gen_range(0..20 * 1024 * 1024 - 1) as usize;
unsafe {
let ptr = array.as_ptr() as *mut usize;
*ptr.add(value) = value;
Expand Down Expand Up @@ -129,7 +129,7 @@ fn hash_bench(c: &mut Criterion) {

/// Criterion benchmark that runs `bound_check_test` (checked indexing) and
/// `unsafe_array_test` (raw-pointer write) against the same zero-initialised buffer,
/// sharing one thread-local RNG between them.
fn bound_check_bench(c: &mut Criterion) {
let mut random = rand::thread_rng();
// NOTE(review): `vec` is bound twice in a row; the second, larger buffer shadows the
// first and is the one both benchmarks operate on.
let mut vec = vec![0; 4096 * 1024];
let mut vec = vec![0; 20 * 1024 * 1024];
c.bench_function("bound_check_test", |b| b.iter(|| bound_check_test(&mut vec, &mut random)));
c.bench_function("unsafe_array_test", |b| b.iter(|| unsafe_array_test(&mut vec, &mut random)));
}
Expand Down
2 changes: 1 addition & 1 deletion fastbloom-rs/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "fastbloom-rs"
version = "0.5.3"
version = "0.5.4"
edition = "2021"
authors = ["Yan Kun <[email protected]>"]
description = "Some fast bloom filter implemented by Rust for Python and Rust!"
Expand Down
27 changes: 27 additions & 0 deletions fastbloom-rs/src/bloom.rs
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,23 @@ fn bit_check(bit_set: &BloomBitVec, value: &[u8], m: u64, k: u64) -> bool {
res
}

/// Tests and sets the `k` filter bits of `value` in a single pass.
///
/// Two seeded xxh3 hashes (seeds `0` and `32`), each reduced modulo `m`, produce the bit
/// positions: the first position is `hash1` itself and position `i` (for `1 <= i < k`) is
/// `(hash1 + i * hash2) % m`. Each position is read and then set, in order.
///
/// Returns `true` only if every position was already set when it was read, i.e. `value`
/// tested as present before this call. All positions are set regardless of the result.
#[inline]
fn bit_check_and_set(bit_set: &mut BloomBitVec, value: &[u8], m: u64, k: u64) -> bool {
    let base = xxh3_64_with_seed(value, 0) % m;
    let step = xxh3_64_with_seed(value, 32) % m;

    // The first position is handled outside the loop so it is checked and set even
    // when `k` leaves the loop range empty.
    let mut already_present = bit_set.get(base as usize);
    bit_set.set(base as usize);

    for round in 1..k {
        let index = ((base + round * step) % m) as usize;
        // Only consult the bit while no miss has been seen yet; once a miss is
        // recorded the answer is fixed and the remaining bits just need setting.
        if already_present && !bit_set.get(index) {
            already_present = false;
        }
        bit_set.set(index);
    }

    already_present
}

#[inline]
fn get_bit_indices(bit_set: &BloomBitVec, value: &[u8], m: u64, k: u64) -> Vec<u64> {
let mut res = Vec::<u64>::with_capacity(k as usize);
Expand Down Expand Up @@ -135,6 +152,14 @@ impl BloomFilter {
BloomFilter { config, bit_set }
}

/// Checks whether `element` already tests as present in the filter and inserts it in
/// the same pass.
///
/// Returns `true` if all of the element's bits were set before the call (the element was
/// possibly present, subject to the filter's false positive rate) and `false` if at least
/// one bit was missing. After the call the element is always contained in the filter.
#[inline]
pub fn add_if_not_contains(&mut self, element: &[u8]) -> bool {
    let bit_count = self.config.size;
    let hash_count = self.config.hashes as u64;
    bit_check_and_set(&mut self.bit_set, element, bit_count, hash_count)
}

/// Build a Bloom filter from `&[u8]`.
///
/// # Examples
Expand Down Expand Up @@ -667,6 +692,8 @@ fn bloom_test() {
println!("{:?}", &bloom.bit_set.storage[0..300]);
assert_eq!(bloom.contains(b"hello"), true);
assert_eq!(bloom.contains(b"world"), false);
assert_eq!(bloom.add_if_not_contains(b"hello2"), false);
assert_eq!(bloom.contains(b"hello2"), true);

let storage = &bloom.bit_set.storage[0..300];
println!("{:?}", storage);
Expand Down
9 changes: 9 additions & 0 deletions fastbloom_rs/fastbloom_rs.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -36,18 +36,27 @@ class PyBloomFilter(object):
# Add a single `int` element to the filter.
def add_int(self, element: int):
...

# Check-and-insert for an `int`: returns True when the value already tested as present,
# False when it did not; the value is in the filter afterwards either way.
# The native binding receives the value as a signed 64-bit integer.
def add_int_if_not_contains(self, element: int) -> bool:
...

# Add every integer in `array` to the filter, one element at a time.
def add_int_batch(self, array: Sequence[int]):
...

# Add a single `str` element to the filter.
def add_str(self, element: str):
...

# Check-and-insert for a `str`: returns True when the string already tested as present,
# False when it did not; the string is in the filter afterwards either way.
def add_str_if_not_contains(self, element: str) -> bool:
...

# Add every string in `array` to the filter, one element at a time.
def add_str_batch(self, array: Sequence[str]):
...

# Add a single `bytes` element to the filter.
def add_bytes(self, element: bytes):
...

# Check-and-insert for a `bytes` object: returns True when the bytes already tested as
# present, False when they did not; they are in the filter afterwards either way.
def add_bytes_if_not_contains(self, element: bytes) -> bool:
...

# Add the `bytes` objects in `elements` to the filter.
# NOTE(review): presumably one insert per item — native implementation not shown here.
def add_bytes_batch(self, elements: Sequence[bytes]):
...

Expand Down
18 changes: 18 additions & 0 deletions fastbloom_rs/filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,24 @@ def add(self, element: Union[str, int, bytes]):
else:
self._py_bloom.add_str(str(element))

def add_if_not_contains(self, element: Union[str, int, bytes]) -> bool:
    """
    Test for membership and insert in a single call.

    The element is routed to the native check-and-insert method that matches its type;
    values that are neither ``int``, ``str`` nor ``bytes`` are converted with ``str()``
    first.

    :param element: value to test and, if absent, insert
    :return: ``False`` if the element was not in the filter before the call (it has now
        been inserted); ``True`` if it already tested as present (subject to the
        filter's false positive rate).
    """
    backend = self._py_bloom
    if isinstance(element, int):
        return backend.add_int_if_not_contains(element)
    if isinstance(element, str):
        return backend.add_str_if_not_contains(element)
    if isinstance(element, bytes):
        return backend.add_bytes_if_not_contains(element)
    # Fallback: any other type is handled through its string representation.
    return backend.add_str_if_not_contains(str(element))

def add_int(self, element: int):
"""
Add element to the filter.
Expand Down
5 changes: 5 additions & 0 deletions py_tests/test_bloom.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,11 @@ def test_bloom_builder():
assert b'hello' in bloom
assert not bloom.contains_bytes(b'hello world')

assert not bloom.add_if_not_contains('hello2')
assert bloom.contains('hello2')
assert not bloom.add_if_not_contains(88)
assert bloom.contains(88)

bloom2 = BloomFilter.from_int_array(bloom.get_int_array(), bloom.hashes())

assert bloom2.contains_bytes(b'hello')
Expand Down
12 changes: 12 additions & 0 deletions src/pybloom.rs
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,10 @@ impl PyBloomFilter {
self.bloomfilter.add(&i64::to_le_bytes(element));
}

/// Check-and-insert for an integer, keyed by its 8 little-endian bytes.
///
/// Returns the result of the underlying filter's `add_if_not_contains` for those bytes.
pub fn add_int_if_not_contains(&mut self, element: i64) -> bool {
    let le_bytes = element.to_le_bytes();
    self.bloomfilter.add_if_not_contains(&le_bytes)
}

pub fn add_int_batch(&mut self, array: Vec<i64>) {
for x in array {
self.add_int(x)
Expand All @@ -72,6 +76,10 @@ impl PyBloomFilter {
self.bloomfilter.add(element.as_bytes());
}

/// Check-and-insert for a string, keyed by its UTF-8 bytes.
///
/// Returns the result of the underlying filter's `add_if_not_contains` for those bytes.
pub fn add_str_if_not_contains(&mut self, element: &str) -> bool {
    let utf8 = element.as_bytes();
    self.bloomfilter.add_if_not_contains(utf8)
}

pub fn add_str_batch(&mut self, array: Vec<&str>) {
for x in array {
self.bloomfilter.add(x.as_bytes())
Expand All @@ -88,6 +96,10 @@ impl PyBloomFilter {
}
}

/// Check-and-insert for a Python `bytes` object, keyed by its raw contents.
///
/// Returns the result of the underlying filter's `add_if_not_contains` for those bytes.
pub fn add_bytes_if_not_contains(&mut self, bts: &PyBytes) -> bool {
    let raw = bts.as_bytes();
    self.bloomfilter.add_if_not_contains(raw)
}

/// Membership test for an integer, looked up by its 8 little-endian bytes.
///
/// Returns the result of the underlying filter's `contains` for those bytes.
pub fn contains_int(&mut self, element: i64) -> bool {
    let le_bytes = element.to_le_bytes();
    self.bloomfilter.contains(&le_bytes)
}
Expand Down

0 comments on commit d8a5656

Please sign in to comment.