All tests pass except the same-term case

softwaredoug · Jun 21, 2024 · 985ba43 · 985ba43
1 parent 3760158
commit 985ba43
Show file tree

Hide file tree

Showing 2 changed files with 27 additions and 10 deletions.
diff --git a/searcharray/roaringish/spans.pyx b/searcharray/roaringish/spans.pyx
@@ -135,17 +135,21 @@ cdef _span_freqs(DTYPE_t[:] posns,      # Flattened all terms in one array
 
     for i in range(num_terms):
         curr_idx[i] = lengths[i]
+        print(f"Term {i} -- {curr_idx[i]}")
 
     while curr_idx[0] < lengths[1]:
         # Read each term up to the next doc
         last_key = -1
+        print("****")
+        print("Restarting")
         for term_ord in range(num_terms):
             curr_key = ((posns[curr_idx[term_ord]] & key_mask) >> (64 - key_bits))
+            print(f"Term {term_ord} -- {curr_key} | {curr_idx[term_ord]} -- lengths {lengths[term_ord+1]}")
             while curr_idx[term_ord] < lengths[term_ord+1]:
                 last_key = curr_key
                 term = posns[curr_idx[term_ord]] & payload_mask
 
-                print(f"Term {term_ord} -- {term:0b} | {curr_key} | {curr_idx[term_ord]}")
+                print(f"Term {term_ord} -- {term:0b} | {curr_key} | {curr_idx[term_ord]} -- num_terms {num_terms}")
 
                 while term != 0:
                     # Consume into span
@@ -156,22 +160,33 @@ cdef _span_freqs(DTYPE_t[:] posns,      # Flattened all terms in one array
                     curr_term_mask = 0x1 << term_ord
                     active_spans_queue[next_active_beg] = curr_term_mask
                     active_spans_posns[next_active_beg] = 1 << set_idx
-                    print(f"{next_active_beg} -- added {set_idx} | {active_spans_posns[next_active_beg]:0b}")
+                    print(" New span")
+                    print(f" {next_active_beg} -- added {set_idx} | {active_spans_posns[next_active_beg]:0b}")
                     if term_ord == 0:
-                        span_beg[next_active_beg] = payload_base + set_idx
+                        span_beg[next_active_beg] = set_idx
+                    print(f" beg {span_beg[next_active_beg]} end {span_end[next_active_beg]}")
 
                     # Remove spans that are too long
+                    print(" Scanning spans")
                     for span_idx in range(next_active_beg):
                         # Continue active spans
                         num_terms_visited = __builtin_popcountll(active_spans_queue[span_idx])
                         num_posns_visited = __builtin_popcountll(active_spans_posns[span_idx])
-                        if num_terms_visited == num_terms and num_posns_visited == num_terms:
+                        print(f" {span_idx} -- num_terms_visited {num_terms_visited} | num_posns {num_posns_visited}")
+                        print(f" {span_idx} -- {active_spans_queue[span_idx]:0b} | {active_spans_posns[span_idx]:0b}")
+                        if num_terms_visited < num_terms and num_posns_visited == num_terms:
                             continue
                         active_spans_queue[span_idx] |= curr_term_mask
-                        print(f"{span_idx} -- set_idx {set_idx} | {active_spans_posns[span_idx]:0b}")
-                        active_spans_posns[span_idx] |= (1 << set_idx)
-                        span_end[span_idx] = payload_base + set_idx
-                        if abs(span_end[span_idx] - span_beg[span_idx]) > num_terms + slop:
+                        num_terms_visited_now = __builtin_popcountll(active_spans_queue[span_idx])
+                        if num_terms_visited_now > num_terms_visited:
+                            # Add position for new unique term
+                            active_spans_posns[span_idx] |= (1 << set_idx)
+                            span_end[span_idx] = set_idx
+                            print(f" {span_idx} -- set_idx {set_idx} | {active_spans_posns[span_idx]:0b}")
+                            print(f" {span_idx} -- beg {span_beg[span_idx]} | end {span_end[span_idx]}")
+                        if num_terms_visited_now == num_terms and abs(span_end[span_idx] - span_beg[span_idx]) > num_terms + slop:
+                            print(f" {span_idx} -- removing {span_beg[span_idx]}-{span_end[span_idx]}")
+                            print(f" {span_idx} -- slop: {slop} num_terms: {num_terms}")
                             span_beg[span_idx] = 0
                             span_end[span_idx] = 0
                             active_spans_queue[span_idx] = 0
@@ -182,12 +197,14 @@ cdef _span_freqs(DTYPE_t[:] posns,      # Flattened all terms in one array
                     next_active_beg += 1
                     last_set_idx = set_idx
                 curr_idx[term_ord] += 1
-                curr_key = posns[curr_idx[term_ord]] & key_mask
+                curr_key = (posns[curr_idx[term_ord]] & key_mask) >> (64 - key_bits)
                 if curr_key != last_key or next_active_beg > 64:
+                    print(f"Term {term_ord} -- Key change {curr_key} != {last_key} | {next_active_beg}")
                     break
 
         # All terms consumed for doc
 
+        print("***")
         print("***")
         print(f"Collect for {last_key}", flush=True)
 

diff --git a/test/test_slop_matches.py b/test/test_slop_matches.py
@@ -74,7 +74,7 @@
 
 @w_scenarios(scenarios)
 def test_phrase_slop(phrase, doc, slop, match):
-    sa = SearchArray.index([doc, " empty ", doc + " " + doc, " empty"] * 100)
+    sa = SearchArray.index([doc, " empty ", doc + " " + doc, " empty"] * 4)
     phrase_toks = sa.tokenizer(phrase)
     start = perf_counter()
     scores = sa.score(phrase_toks, slop=slop)