Remove prints etc

softwaredoug · Jun 20, 2024 · d0117ca · d0117ca
1 parent ca0223b
commit d0117ca
Show file tree

Hide file tree

Showing 2 changed files with 39 additions and 28 deletions.
diff --git a/searcharray/roaringish/spans.pyx b/searcharray/roaringish/spans.pyx
@@ -120,32 +120,27 @@ cdef _span_freqs(DTYPE_t[:] posns,      # Flattened all terms in one array
     cdef np.int64_t[:] span_end = np.empty(64, dtype=np.int64)
     cdef np.uint64_t next_active_beg = 0
     cdef np.uint64_t curr_term_mask = 0
-    cdef np.uint64_t num_terms = len(lengths)
+    cdef np.uint64_t num_terms = len(lengths) - 1
     cdef np.uint64_t all_terms_mask = (1 << num_terms) - 1
     cdef np.uint64_t term_ord = 0
     cdef np.uint64_t curr_key = 0
     cdef np.uint64_t last_key = 0
     cdef np.uint64_t payload_base = 0
     last_set_idx = 0
     while curr_idx[0] < lengths[1]:
-        print(f"curr_key: {curr_key}")
-
         # Read each term up to the next  doc
         last_key = -1
         for term_ord in range(num_terms):
-            curr_key = (posns[curr_idx[term_ord]] & key_mask >> (64 - key_bits))
+            curr_key = ((posns[curr_idx[term_ord]] & key_mask) >> (64 - key_bits))
             while curr_idx[term_ord] < lengths[term_ord+1]:
                 last_key = curr_key
                 term = posns[curr_idx[term_ord]] & payload_mask
 
-                print("Starting loop")
                 while term != 0:
                     # Consume into span
                     set_idx = __builtin_ctzll(term)
-                    print(term_ord, term, set_idx)
                     # Clear LSB
                     term = (term & (term - 1))
-                    print("Cleared", term_ord, term, set_idx)
                     # Start a span
                     curr_term_mask = 0x1 << term_ord
                     active_spans_queue[next_active_beg] = curr_term_mask
@@ -161,25 +156,20 @@ cdef _span_freqs(DTYPE_t[:] posns,      # Flattened all terms in one array
                         active_spans_queue[span_idx] |= curr_term_mask
                         span_end[span_idx] = payload_base + set_idx
                         if abs(span_end[span_idx] - span_beg[span_idx]) > num_terms + slop:
-                            print(f"Removing span {span_idx} | popcount {popcount} -- begin: {span_beg[span_idx]}, end: {span_end[span_idx]}, slop: {slop}")
                             span_beg[span_idx] = 0
                             span_end[span_idx] = 0
                             active_spans_queue[span_idx] = 0
-                        else:
-                            print(f" Keeping span {span_idx} | popcount {popcount} -- begin: {span_beg[span_idx]}, end: {span_end[span_idx]}, slop: {slop}")
 
-                    if next_active_beg > 64: 
+                    if next_active_beg > 64:
                         break
                     next_active_beg += 1
                     last_set_idx = set_idx
-                print("Next posn in term")
                 curr_idx[term_ord] += 1
                 curr_key = posns[curr_idx[term_ord]] & key_mask
                 if curr_key != last_key or next_active_beg > 64:
                     break
 
         # All terms consumed for doc
-        print(f"Collecting spans for {last_key}")
 
         # Make new active span queue
         new_active_span_queue = np.empty(64, dtype=np.uint64)
@@ -191,9 +181,7 @@ cdef _span_freqs(DTYPE_t[:] posns,      # Flattened all terms in one array
             popcount = __builtin_popcountll(active_spans_queue[span_idx])
             if popcount != num_terms:
                 continue
-            print(f"Collecting span {span_idx} | popcount {popcount} -- begin: {span_beg[span_idx]}, end: {span_end[span_idx]}")
             phrase_freqs[last_key] += 1
-        print(f"Phrase freqs: {phrase_freqs[last_key]}")
 
         # Reset
         next_active_beg = 0
@@ -202,7 +190,6 @@ cdef _span_freqs(DTYPE_t[:] posns,      # Flattened all terms in one array
         span_end = new_span_end
 
 
-
 def span_search(np.ndarray[DTYPE_t, ndim=1] posns,
                 np.ndarray[DTYPE_t, ndim=1] lengths,
                 np.ndarray[double, ndim=1] phrase_freqs,

diff --git a/test/test_slop_matches.py b/test/test_slop_matches.py
@@ -1,25 +1,30 @@
 from searcharray.postings import SearchArray
+import numpy as np
 from test_utils import w_scenarios
+from time import perf_counter
 
 
 scenarios = {
     "direct_phrase": {
         "phrase": "intergalactic bounty hunters",
         "doc": """A massive ball of furry creatures from another world eat their way through a small mid-western town followed by intergalactic bounty hunters opposed only by militant townspeople.""",
-        "slop": 0
+        "slop": 0,
+        "match": True
     },
     "slop 1": {
         "phrase": "massive ball furry",
         "doc": """A massive ball of furry creatures from another world eat their way through a small mid-western town followed by intergalactic bounty hunters opposed only by militant townspeople.""",
-        "slop": 1
+        "slop": 1,
+        "match": True
     },
     "two_after_the": {
         "phrase": "the to be",
         "doc": """
             Broke and alone on New Year's Eve, Wilson just wants to spend the rest of a very bad year in bed.
             But, when his best friend convinces him to post a personal ad,
             he meets a woman bent on finding the right guy to be with at midnight.""",
-        "slop": 2
+        "slop": 2,
+        "match": True
     },
     "slop_3_order": {
         "phrase": "the to be",
@@ -29,24 +34,43 @@
             The murders are found to be the work of an out-of-control experiment in genetic engineering.
             The two men must descend into the city's sewer systems to destroy the horrific miscreation.
             It won't be hard to find, as it's already looking for its next victims...""",
-        "slop": 3
+        "slop": 3,
+        "match": True
     },
     "slop_5": {
         "phrase": "spice found substance",
         "doc": """
 In the year 10,191, the world is at war for control of the desert planet Dune—the only place where the time-travel substance spice can be found But when one leader gives up control, it's only so he can stage a coup with some unsavory characters.""",
-        "slop": 5
+        "slop": 5,
+        "match": True
+    },
+    "slop_5_len_5": {
+        "phrase": "spice found substance can be",
+        "doc": """
+In the year 10,191, the world is at war for control of the desert planet Dune—the only place where the time-travel substance spice can be found But when one leader gives up control, it's only so he can stage a coup with some unsavory characters.""",
+        "slop": 5,
+        "match": True
+    },
+    "slop_5_len_5_no_match": {
+        "phrase": "there is no match for this",
+        "doc": """
+In the year 10,191, the world is at war for control of the desert planet Dune—the only place where the time-travel substance spice can be found But when one leader gives up control, it's only so he can stage a coup with some unsavory characters.""",
+        "slop": 5,
+        "match": False
     },
 }
 
 
 @w_scenarios(scenarios)
-def test_phrase_slop(phrase, doc, slop):
-    sa = SearchArray.index([doc, " empty ", doc + " " + doc, " empty"])
+def test_phrase_slop(phrase, doc, slop, match):
+    sa = SearchArray.index([doc, " empty ", doc + " " + doc, " empty"] * 100)
     phrase_toks = sa.tokenizer(phrase)
-    # assert sa.score(phrase_toks, slop=slop)
+    start = perf_counter()
+    scores = sa.score(phrase_toks, slop=slop)
+    print("Elapsed time:", perf_counter() - start)
     for match_slop in range(slop, max(slop, 10)):
-        assert sa.score(phrase_toks, slop=match_slop)[0] > 0
-        assert sa.score(phrase_toks, slop=match_slop)[1] == 0
-        assert sa.score(phrase_toks, slop=match_slop)[2] > 0
-        assert sa.score(phrase_toks, slop=match_slop)[3] == 0
+        if match:
+            assert np.all(scores[::2] > 0)
+        else:
+            assert np.all(scores[::2] == 0)
+        assert np.all(scores[1::2] == 0)