Revert "False start"

This reverts commit c589811.
softwaredoug · Jun 19, 2024 · 4b0ab88
1 parent c589811
commit 4b0ab88
Showing 1 changed file with 35 additions and 62 deletions.
diff --git a/searcharray/roaringish/spans.pyx b/searcharray/roaringish/spans.pyx
@@ -37,23 +37,6 @@ cdef _get_adj_spans(DTYPE_t[:, :] posns_arr,
     pass
 
 
-cdef _bits_set(DTYPE_t value,
-               DTYPE_t[:] buffer64,
-               DTYPE_t* buffer_write_len):
-    """Get the bits set in a 64 bit value."""
-    cdef DTYPE_t lsb = 0
-    cdef DTYPE_t bit_posn = 0
-    buffer_write_len[0] = 0
-    while value > 0:
-        lsb = value & -value
-        bit_posn = __builtin_ctzll(lsb)
-        print(f"LSB: {lsb:064b} | bt_posn: {bit_posn}")
-        buffer64[buffer_write_len[0]] = bit_posn
-        # Clear LSB
-        value &= value - 1
-        buffer_write_len[0] += 1
-
-
 cdef _span_freqs(DTYPE_t[:, :] posns_arr,
                  double[:] phrase_freqs,
                  DTYPE_t slop,
@@ -129,90 +112,80 @@ cdef _span_freqs(DTYPE_t[:, :] posns_arr,
     # Now we score the span to see if its < slop
     #
     # curr_posns are current bits analyzed for slop
-    cdef np.uint64_t[:] bits_set = np.empty(64, dtype=np.uint64)
-    cdef np.uint64_t bits_set_len = 0
-
+    cdef np.uint64_t[:] curr_posns = np.empty(posns_arr.shape[0], dtype=np.uint64)
     cdef np.uint64_t[:] active_spans_queue = np.empty(64, dtype=np.uint64)
-    cdef np.int64_t[:] span_score_queue = np.empty(64, dtype=np.int64)
+    cdef np.uint64_t[:] span_score_queue = np.empty(64, dtype=np.uint64)
     cdef np.uint64_t next_active_beg = 0
     cdef np.uint64_t curr_term_mask = 0
     cdef np.uint64_t num_terms = posns_arr.shape[0]
     cdef np.uint64_t all_terms_mask = (1 << num_terms) - 1
     cdef np.uint64_t term_ord = 0
     cdef np.uint64_t curr_key = 0
-    cdef np.uint64_t last_key = 0
     cdef np.uint64_t payload_base = 0
     last_set_idx = 0
     for i in range(posns_arr.shape[1]):
         curr_key = posns_arr[0, i] & key_mask
-
+        
         if curr_key != last_key:
-            print("-----------")
-            print(f"Collecting spans for {curr_key} - {next_active_beg} active spans")
-
+            print(f"Collecting spans for {curr_key}")
+            next_active_beg = 0
+        
             # Make new active span queue
             new_active_span_queue = np.empty(64, dtype=np.uint64)
-            new_span_score_queue = np.empty(64, dtype=np.int64)
+            new_span_score_queue = np.empty(64, dtype=np.uint64)
 
             # Copy existing
             for span_idx in range(next_active_beg):
-                span_size = __builtin_popcountll(active_spans_queue[span_idx])
-                print(f"Span {span_idx} size: {span_size}")
-                if span_size != num_terms:
+                if __builtin_popcountll(active_spans_queue[span_idx]) != num_terms:
                     continue
                 print("Keeping span")
                 print("Span score: ", span_score_queue[span_idx])
+                new_active_span_queue[span_idx] = active_spans_queue[span_idx]
+                new_span_score_queue[span_idx] = span_score_queue[span_idx]
                 phrase_freqs[curr_key] += 1
-
-            next_active_beg = 0
+
             active_spans_queue = new_active_span_queue
             span_score_queue = new_span_score_queue
 
-        # We may not be processing each term in order
         # Each term is potentially a new span
         for term_ord in range(num_terms):
             # Each msb
             term = posns_arr[term_ord, i] & payload_mask
-            _bits_set(term, bits_set, &bits_set_len)
-            last_set_idx = 0
-            print("---")
-            print(f"Term ord: {term_ord}, bits_set_len: {bits_set_len}")
-            for idx in range(bits_set_len):
-                set_idx = bits_set[idx]
-                print(f"set_idx: {set_idx}, last_set_idx: {last_set_idx}, payload_base: {payload_base}")
-                curr_term_mask = 0x1 << term_ord
-                active_spans_queue[next_active_beg] = curr_term_mask
-                span_score_queue[next_active_beg] = term_ord   # The term index as start score, because 0 is in order
-                for span_idx in range(next_active_beg):
-                    if __builtin_popcountll(active_spans_queue[span_idx]) == num_terms:
-                        print(f" Not updating completed span {span_idx} score: {span_score_queue[span_idx]}")
-                        continue
-                    active_spans_queue[span_idx] |= curr_term_mask
-                    span_score_queue[span_idx] += set_idx - last_set_idx - 1
-                    if span_score_queue[span_idx] > slop:
-                        print(f"Removing span {span_idx} w/ score {span_score_queue[span_idx]}")
-                        active_spans_queue[span_idx] = 0
-                        span_score_queue[span_idx] = 0x7FFFFFFFFFFFFFFF
-                    else:
-                        print(f" Keeping Span {span_idx} score: {span_score_queue[span_idx]}")
-                next_active_beg += 1
-                last_set_idx = set_idx
+            set_idx = __builtin_ctzll(posns_arr[term_ord, i] & payload_mask)
+            # Start a span
+            curr_term_mask = 0x1 << term_ord
+            active_spans_queue[next_active_beg] = curr_term_mask
+            span_score_queue[next_active_beg] = term_ord   # The term index as start score, because 0 is in order
+            for span_idx in range(next_active_beg):
+                active_spans_queue[span_idx] |= curr_term_mask
+                span_score_queue[span_idx] += payload_base + (set_idx - last_set_idx - 1)      # distance of 1, score 0
+                if span_score_queue[span_idx] > slop:
+                    print(f"Removing span {span_idx}")
+                    active_spans_queue[span_idx] = 0
+                    span_score_queue[span_idx] = 0x7FFFFFFFFFFFFFFF
+                print(f"Score of {span_idx} is {span_score_queue[span_idx]}")
+            next_active_beg += 1
+            last_set_idx = set_idx
 
         payload_base += lsb_bits
         last_key = curr_key
+    # Make new active span queue
+    new_active_span_queue = np.empty(64, dtype=np.uint64)
+    new_span_score_queue = np.empty(64, dtype=np.uint64)
 
     # Copy existing
-    print("-----------")
-    print(f"Collecting spans for {curr_key} - {next_active_beg} active spans")
     for span_idx in range(next_active_beg):
-        span_size = __builtin_popcountll(active_spans_queue[span_idx])
-        print(f"Span {span_idx} size: {span_size}")
-        if span_size != num_terms:
+        if __builtin_popcountll(active_spans_queue[span_idx]) != num_terms:
             continue
         print("Keeping span")
         print("Span score: ", span_score_queue[span_idx])
+        new_active_span_queue[span_idx] = active_spans_queue[span_idx]
+        new_span_score_queue[span_idx] = span_score_queue[span_idx]
         phrase_freqs[curr_key] += 1
 
+    active_spans_queue = new_active_span_queue
+    span_score_queue = new_span_score_queue
+
 
 def span_search(np.ndarray[DTYPE_t, ndim=2] posns_arr,
                 np.ndarray[double, ndim=1] phrase_freqs,