Skip to content

Commit

Permalink
Quotient Filter: Get Hashes and Resize / Auto Expand (#115)
Browse files Browse the repository at this point in the history
* quotient-filter: get hashes
* add quotient filter to project tags
* quotient filter: expand
  • Loading branch information
barrust authored Jan 13, 2024
1 parent 84dbffc commit 28a58b0
Show file tree
Hide file tree
Showing 4 changed files with 253 additions and 32 deletions.
16 changes: 2 additions & 14 deletions probables/blooms/bloom.py
Original file line number Diff line number Diff line change
Expand Up @@ -315,21 +315,9 @@ def export_c_header(self, filename: Union[str, Path]) -> None:
with open(filename, "w", encoding="utf-8") as file:
print(f"/* BloomFilter Export of a {bloom_type} */", file=file)
print("#include <inttypes.h>", file=file)
print(
"const uint64_t estimated_elements = ",
self.estimated_elements,
";",
sep="",
file=file,
)
print("const uint64_t estimated_elements = ", self.estimated_elements, ";", sep="", file=file)
print("const uint64_t elements_added = ", self.elements_added, ";", sep="", file=file)
print(
"const float false_positive_rate = ",
self.false_positive_rate,
";",
sep="",
file=file,
)
print("const float false_positive_rate = ", self.false_positive_rate, ";", sep="", file=file)
print("const uint64_t number_bits = ", self.number_bits, ";", sep="", file=file)
print("const unsigned int number_hashes = ", self.number_hashes, ";", sep="", file=file)
print("const unsigned char bloom[] = {", *data, "};", sep="\n", file=file)
Expand Down
164 changes: 149 additions & 15 deletions probables/quotientfilter/quotientfilter.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
"""

from array import array
from typing import Optional
from typing import Iterator, List, Optional

from probables.hashes import KeyT, SimpleHashT, fnv_1a_32
from probables.utilities import Bitarray
Expand All @@ -15,6 +15,7 @@ class QuotientFilter:
Args:
quotient (int): The size of the quotient to use
auto_expand (bool): Automatically expand or not
hash_function (function): Hashing strategy function to use `hf(key, number)`
Returns:
QuotientFilter: The initialized filter
Expand All @@ -35,18 +36,27 @@ class QuotientFilter:
"_is_continuation",
"_is_shifted",
"_filter",
"_max_load_factor",
"_auto_resize",
)

def __init__(self, quotient: int = 20, hash_function: Optional[SimpleHashT] = None): # needs to be parameterized
def __init__(
self, quotient: int = 20, auto_expand: bool = True, hash_function: Optional[SimpleHashT] = None
): # needs to be parameterized
if quotient < 3 or quotient > 31:
raise ValueError(
f"Quotient filter: Invalid quotient setting; quotient must be between 3 and 31; {quotient} was provided"
)
self._q = quotient
self._r = 32 - quotient
self._size = 1 << self._q # same as 2**q
self._elements_added = 0
self.__set_params(quotient, auto_expand, hash_function)

def __set_params(self, quotient: int, auto_expand: bool, hash_function: Optional[SimpleHashT]):
self._q: int = quotient
self._r: int = 32 - quotient
self._size: int = 1 << self._q # same as 2**q
self._elements_added: int = 0
self._auto_resize: bool = auto_expand
self._hash_func: SimpleHashT = fnv_1a_32 if hash_function is None else hash_function # type: ignore
self._max_load_factor: float = 0.85

# ensure we use the smallest type possible to reduce memory wastage
if self._r <= 8:
Expand Down Expand Up @@ -89,21 +99,61 @@ def elements_added(self) -> int:
return self._elements_added

@property
def bits_per_elm(self) -> int:
    """int: The number of bits used per element"""
    # Read-only view of the value stored in `_bits_per_elm`.
    return self._bits_per_elm

@property
def size(self) -> int:
    """int: How many bins the filter currently has available

    Note:
        same as `num_elements`"""
    bins = self._size
    return bins

@property
def load_factor(self) -> float:
    """float: Ratio of elements added to the number of bins in the filter"""
    stored, capacity = self._elements_added, self._size
    return stored / capacity

@property
def auto_expand(self) -> bool:
    """bool: Whether the quotient filter expands automatically"""
    return self._auto_resize

@auto_expand.setter
def auto_expand(self, val: bool):
    """Enable or disable automatic expansion; `val` is coerced with `bool()`"""
    enabled = bool(val)
    self._auto_resize = enabled

@property
def max_load_factor(self) -> float:
    """float: Load factor threshold at or above which auto expanding should occur"""
    return self._max_load_factor

@max_load_factor.setter
def max_load_factor(self, val: float):
    """Store a new maximum load factor; `val` is coerced with `float()`"""
    threshold = float(val)
    self._max_load_factor = threshold

def add(self, key: KeyT) -> None:
    """Hash the key and add it to the quotient filter

    Args:
        key (str|bytes): The element to add"""
    # The configured hash function is always invoked with depth 0; the
    # resulting value is inserted through the pre-hashed entry point.
    self.add_alt(self._hash_func(key, 0))

def add_alt(self, _hash: int) -> None:
    """Add the pre-hashed value to the quotient filter

    Args:
        _hash (int): The pre-computed hash of the element to add
    Note:
        A hash that is already stored is ignored. When auto expansion is enabled and the load
        factor has reached the maximum load factor, the filter is resized before the insert"""
    key_quotient = _hash >> self._r
    key_remainder = _hash & ((1 << self._r) - 1)

    if self._contained_at_loc(key_quotient, key_remainder) != -1:
        return  # already present; nothing to insert

    if self._auto_resize and self.load_factor >= self._max_load_factor:
        self.resize()
        # resize() re-derives `_r` for the new quotient, so the split computed
        # above is stale; recompute it before inserting.
        key_quotient = _hash >> self._r
        key_remainder = _hash & ((1 << self._r) - 1)

    self._add(key_quotient, key_remainder)

def check(self, key: KeyT) -> bool:
Expand All @@ -114,9 +164,92 @@ def check(self, key: KeyT) -> bool:
Return:
bool: True if likely encountered, False if definitely not"""
_hash = self._hash_func(key, 0)
return self.check_alt(_hash)

def check_alt(self, _hash: int) -> bool:
    """Check to see if the pre-calculated hash is likely in the quotient filter

    Args:
        _hash (int): The pre-computed hash to look up
    Return:
        bool: True if likely encountered, False if definitely not"""
    key_quotient = _hash >> self._r
    key_remainder = _hash & ((1 << self._r) - 1)
    # `_contained_at_loc` reports -1 when absent; any other value (0 included)
    # is the index at which the remainder was found.
    return self._contained_at_loc(key_quotient, key_remainder) != -1

def iter_hashes(self) -> Iterator[int]:
    """A generator over the hashes in the quotient filter

    Each slot with at least one metadata bit set is decoded back into
    `(quotient << r) + remainder`; slots holding a remainder of 0 are included.

    Yields:
        int: The next hash stored in the quotient filter
    Raises:
        RuntimeError: If no slot can be used as a starting point for decoding"""
    size = self._size
    occupied = self._is_occupied.check_bit
    continuation = self._is_continuation.check_bit
    shifted = self._is_shifted.check_bit

    # Decoding must begin where no run is pending. Prefer the first empty slot;
    # if every slot is in use, fall back to the first slot that starts a
    # cluster (occupied, not a continuation, not shifted). The scan is bounded
    # by the number of slots so a completely full filter cannot run past the end.
    start = -1
    cluster_head = -1
    for idx in range(size):
        meta = (occupied(idx), continuation(idx), shifted(idx))
        if meta == (0, 0, 0):
            start = idx
            break
        if cluster_head == -1 and meta == (1, 0, 0):
            cluster_head = idx
    if start == -1:
        start = cluster_head
    if start == -1:
        raise RuntimeError("Quotient filter: unable to locate a starting position to read the stored hashes")

    queue: List[int] = []  # canonical slots whose runs have not started yet
    cur_quot = 0
    for offset in range(size):
        idx = (start + offset) % size  # modulo allows the scan to wrap around
        is_occupied = occupied(idx)
        is_continuation = continuation(idx)
        is_shifted = shifted(idx)
        # Nothing here, keep going
        if is_occupied + is_continuation + is_shifted == 0:
            assert len(queue) == 0  # internal invariant: a cluster ended with no run pending
            continue

        if is_occupied == 1:  # keep track of the indices that match a hashed quotient
            queue.append(idx)

        # run start: the oldest pending quotient owns this run
        if not is_continuation and (is_occupied or is_shifted):
            cur_quot = queue.pop(0)

        # Empty slots were skipped above, so this slot stores a remainder even
        # when that remainder is 0.
        # NOTE(review): assumes inserts never leave metadata bits set on an
        # unused slot - confirm against `_add` / `_shift_insert`.
        yield (cur_quot << self._r) + self._filter[idx]

def get_hashes(self) -> List[int]:
    """Collect every hash stored in the quotient filter into a list

    Returns:
        list(int): The hash values stored in the quotient filter"""
    collected: List[int] = []
    collected.extend(self.iter_hashes())  # drain the generator in its own order
    return collected

def resize(self, quotient: Optional[int] = None) -> None:
    """Resize the quotient filter to use the new quotient size

    Args:
        quotient (int): The new quotient to use
    Note:
        If `None` is provided, the quotient filter will double in size (quotient + 1)
    Raises:
        ValueError: When the new quotient is not between 3 and 31
        ValueError: When the new quotient will not accommodate the elements already added"""
    if quotient is None:
        quotient = self._q + 1

    # Validate the range before shifting by `quotient`: a negative value would
    # otherwise fail inside `1 << quotient` with an unrelated error message.
    if quotient < 3 or quotient > 31:
        raise ValueError(
            f"Quotient filter: Invalid quotient setting; quotient must be between 3 and 31; {quotient} was provided"
        )
    if self.elements_added >= (1 << quotient):
        raise ValueError("Unable to shrink since there will be too many elements in the quotient filter")

    # Snapshot the stored hashes while the current layout is still intact.
    hashes = self.get_hashes()

    # Clear the stored remainders before the parameters are re-derived.
    for i in range(self._size):
        self._filter[i] = 0

    self.__set_params(quotient, self._auto_resize, self._hash_func)

    # Re-insert under the new quotient/remainder split.
    for _h in hashes:
        self.add_alt(_h)

def _shift_insert(self, k, v, start, j, flag):
if self._is_occupied[j] == 0 and self._is_continuation[j] == 0 and self._is_shifted[j] == 0:
Expand Down Expand Up @@ -215,9 +348,10 @@ def _add(self, q: int, r: int):
self._shift_insert(q, r, orig_start_idx, start_idx, 1)
self._elements_added += 1

def _contains(self, q: int, r: int) -> bool:
def _contained_at_loc(self, q: int, r: int) -> int:
"""returns the index location of the element, or -1 if not present"""
if self._is_occupied[q] == 0:
return False
return -1

start_idx = self._get_start_index(q)

Expand All @@ -236,7 +370,7 @@ def _contains(self, q: int, r: int) -> bool:
break

if self._filter[start_idx] == r:
return True
return start_idx

start_idx = (start_idx + 1) & (self._size - 1)
meta_bits = (
Expand All @@ -245,4 +379,4 @@ def _contains(self, q: int, r: int) -> bool:
+ self._is_shifted.check_bit(start_idx)
)

return False
return -1
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ keywords = [
"bloom-filter",
"count-min-sketch",
"cuckoo-filter",
"quotient-filter",
]
readme = "README.rst"
classifiers = [
Expand Down
Loading

0 comments on commit 28a58b0

Please sign in to comment.