Quotient Filter: Get Hashes and Resize / Auto Expand (#115)

* quotient-filter: get hashes * add quotient filter to project tags * quotient filter: expand
barrust · Jan 13, 2024 · 28a58b0 · 28a58b0
1 parent 84dbffc
commit 28a58b0
Show file tree

Hide file tree

Showing 4 changed files with 253 additions and 32 deletions.
diff --git a/probables/blooms/bloom.py b/probables/blooms/bloom.py
@@ -315,21 +315,9 @@ def export_c_header(self, filename: Union[str, Path]) -> None:
         with open(filename, "w", encoding="utf-8") as file:
             print(f"/* BloomFilter Export of a {bloom_type} */", file=file)
             print("#include <inttypes.h>", file=file)
-            print(
-                "const uint64_t estimated_elements = ",
-                self.estimated_elements,
-                ";",
-                sep="",
-                file=file,
-            )
+            print("const uint64_t estimated_elements = ", self.estimated_elements, ";", sep="", file=file)
             print("const uint64_t elements_added = ", self.elements_added, ";", sep="", file=file)
-            print(
-                "const float false_positive_rate = ",
-                self.false_positive_rate,
-                ";",
-                sep="",
-                file=file,
-            )
+            print("const float false_positive_rate = ", self.false_positive_rate, ";", sep="", file=file)
             print("const uint64_t number_bits = ", self.number_bits, ";", sep="", file=file)
             print("const unsigned int number_hashes = ", self.number_hashes, ";", sep="", file=file)
             print("const unsigned char bloom[] = {", *data, "};", sep="\n", file=file)

diff --git a/probables/quotientfilter/quotientfilter.py b/probables/quotientfilter/quotientfilter.py
@@ -4,7 +4,7 @@
 """
 
 from array import array
-from typing import Optional
+from typing import Iterator, List, Optional
 
 from probables.hashes import KeyT, SimpleHashT, fnv_1a_32
 from probables.utilities import Bitarray
@@ -15,6 +15,7 @@ class QuotientFilter:
 
     Args:
         quotient (int): The size of the quotient to use
+        auto_expand (bool): Automatically expand or not
         hash_function (function): Hashing strategy function to use `hf(key, number)`
     Returns:
         QuotientFilter: The initialized filter
@@ -35,18 +36,27 @@ class QuotientFilter:
         "_is_continuation",
         "_is_shifted",
         "_filter",
+        "_max_load_factor",
+        "_auto_resize",
     )
 
-    def __init__(self, quotient: int = 20, hash_function: Optional[SimpleHashT] = None):  # needs to be parameterized
+    def __init__(
+        self, quotient: int = 20, auto_expand: bool = True, hash_function: Optional[SimpleHashT] = None
+    ):  # needs to be parameterized
         if quotient < 3 or quotient > 31:
             raise ValueError(
                 f"Quotient filter: Invalid quotient setting; quotient must be between 3 and 31; {quotient} was provided"
             )
-        self._q = quotient
-        self._r = 32 - quotient
-        self._size = 1 << self._q  # same as 2**q
-        self._elements_added = 0
+        self.__set_params(quotient, auto_expand, hash_function)
+
+    def __set_params(self, quotient: int, auto_expand: bool, hash_function: Optional[SimpleHashT]):
+        self._q: int = quotient
+        self._r: int = 32 - quotient
+        self._size: int = 1 << self._q  # same as 2**q
+        self._elements_added: int = 0
+        self._auto_resize: bool = auto_expand
         self._hash_func: SimpleHashT = fnv_1a_32 if hash_function is None else hash_function  # type: ignore
+        self._max_load_factor: float = 0.85
 
         # ensure we use the smallest type possible to reduce memory wastage
         if self._r <= 8:
@@ -89,21 +99,61 @@ def elements_added(self) -> int:
         return self._elements_added
 
     @property
-    def bits_per_elm(self):
+    def bits_per_elm(self) -> int:
         """int: The number of bits used per element"""
         return self._bits_per_elm
 
+    @property
+    def size(self) -> int:
+        """int: The number of bins available in the filter
+
+        Note:
+            same as `num_elements`"""
+        return self._size
+
+    @property
+    def load_factor(self) -> float:
+        """float: The load factor of the filter"""
+        return self._elements_added / self._size
+
+    @property
+    def auto_expand(self) -> bool:
+        """bool: Will the quotient filter automatically expand"""
+        return self._auto_resize
+
+    @auto_expand.setter
+    def auto_expand(self, val: bool):
+        """change the auto expand property"""
+        self._auto_resize = bool(val)
+
+    @property
+    def max_load_factor(self) -> float:
+        """float: The maximum allowed load factor after which auto expanding should occur"""
+        return self._max_load_factor
+
+    @max_load_factor.setter
+    def max_load_factor(self, val: float):
+        """set the maximum load factor"""
+        self._max_load_factor = float(val)
+
     def add(self, key: KeyT) -> None:
         """Add key to the quotient filter
 
         Args:
             key (str|bytes): The element to add"""
         _hash = self._hash_func(key, 0)
+        self.add_alt(_hash)
+
+    def add_alt(self, _hash: int) -> None:
+        """Add the pre-hashed value to the quotient filter
+
+        Args:
+            _hash (int): The element to add"""
         key_quotient = _hash >> self._r
         key_remainder = _hash & ((1 << self._r) - 1)
-
-        if not self._contains(key_quotient, key_remainder):
-            # TODO, add it here
+        if self._contained_at_loc(key_quotient, key_remainder) == -1:
+            if self._auto_resize and self.load_factor >= self._max_load_factor:
+                self.resize()
             self._add(key_quotient, key_remainder)
 
     def check(self, key: KeyT) -> bool:
@@ -114,9 +164,92 @@ def check(self, key: KeyT) -> bool:
         Return:
             bool: True if likely encountered, False if definately not"""
         _hash = self._hash_func(key, 0)
+        return self.check_alt(_hash)
+
+    def check_alt(self, _hash: int) -> bool:
+        """Check to see if the pre-calculated hash is likely in the quotient filter
+
+        Args:
+            _hash (int): The element to add
+        Return:
+            bool: True if likely encountered, False if definately not"""
         key_quotient = _hash >> self._r
         key_remainder = _hash & ((1 << self._r) - 1)
-        return self._contains(key_quotient, key_remainder)
+        return not self._contained_at_loc(key_quotient, key_remainder) == -1
+
+    def iter_hashes(self) -> Iterator[int]:
+        """A generator over the hashes in the quotient filter
+
+        Yields:
+            int: The next hash stored in the quotient filter"""
+        queue: List[int] = []
+
+        # find first empty location
+        start = 0
+        while True:
+            is_occupied = self._is_occupied.check_bit(start)
+            is_continuation = self._is_continuation.check_bit(start)
+            is_shifted = self._is_shifted.check_bit(start)
+            if is_occupied + is_continuation + is_shifted == 0:
+                break
+            start += 1
+
+        cur_quot = 0
+        for i in range(start, self._size + start):  # this will allow for wrap-arounds
+            idx = i % self._size
+            is_occupied = self._is_occupied.check_bit(idx)
+            is_continuation = self._is_continuation.check_bit(idx)
+            is_shifted = self._is_shifted.check_bit(idx)
+            # Nothing here, keep going
+            if is_occupied + is_continuation + is_shifted == 0:
+                assert len(queue) == 0
+                continue
+
+            if is_occupied == 1:  # keep track of the indicies that match a hashed quotient
+                queue.append(idx)
+
+            #  run start
+            if not is_continuation and (is_occupied or is_shifted):
+                cur_quot = queue.pop(0)
+
+            if self._filter[idx] != 0:
+                yield (cur_quot << self._r) + self._filter[idx]
+
+    def get_hashes(self) -> List[int]:
+        """Get the hashes from the quotient filter as a list
+
+        Returns:
+            list(int): The hash values stored in the quotient filter"""
+        return list(self.iter_hashes())
+
+    def resize(self, quotient: Optional[int] = None) -> None:
+        """Resize the quotient filter to use the new quotient size
+
+        Args:
+            int: The new quotient to use
+        Note:
+            If `None` is provided, the quotient filter will double in size (quotient + 1)
+        Raises:
+            ValueError: When the new quotient will not accommodate the elements already added"""
+        if quotient is None:
+            quotient = self._q + 1
+
+        if self.elements_added >= (1 << quotient):
+            raise ValueError("Unable to shrink since there will be too many elements in the quotient filter")
+        if quotient < 3 or quotient > 31:
+            raise ValueError(
+                f"Quotient filter: Invalid quotient setting; quotient must be between 3 and 31; {quotient} was provided"
+            )
+
+        hashes = self.get_hashes()
+
+        for i in range(self._size):
+            self._filter[i] = 0
+
+        self.__set_params(quotient, self._auto_resize, self._hash_func)
+
+        for _h in hashes:
+            self.add_alt(_h)
 
     def _shift_insert(self, k, v, start, j, flag):
         if self._is_occupied[j] == 0 and self._is_continuation[j] == 0 and self._is_shifted[j] == 0:
@@ -215,9 +348,10 @@ def _add(self, q: int, r: int):
                     self._shift_insert(q, r, orig_start_idx, start_idx, 1)
         self._elements_added += 1
 
-    def _contains(self, q: int, r: int) -> bool:
+    def _contained_at_loc(self, q: int, r: int) -> int:
+        """returns the index location of the element, or -1 if not present"""
         if self._is_occupied[q] == 0:
-            return False
+            return -1
 
         start_idx = self._get_start_index(q)
 
@@ -236,7 +370,7 @@ def _contains(self, q: int, r: int) -> bool:
                 break
 
             if self._filter[start_idx] == r:
-                return True
+                return start_idx
 
             start_idx = (start_idx + 1) & (self._size - 1)
             meta_bits = (
@@ -245,4 +379,4 @@ def _contains(self, q: int, r: int) -> bool:
                 + self._is_shifted.check_bit(start_idx)
             )
 
-        return False
+        return -1
diff --git a/pyproject.toml b/pyproject.toml
@@ -15,6 +15,7 @@ keywords = [
     "bloom-filter",
     "count-min-sketch",
     "cuckoo-filter",
+    "quotient-filter",
 ]
 readme = "README.rst"
 classifiers = [