From d0117caddcb0af5d67e64d32b7767406fa0c00e7 Mon Sep 17 00:00:00 2001 From: Doug Turnbull Date: Thu, 20 Jun 2024 15:54:45 -0400 Subject: [PATCH] Remove prints etc --- searcharray/roaringish/spans.pyx | 19 ++----------- test/test_slop_matches.py | 48 ++++++++++++++++++++++++-------- 2 files changed, 39 insertions(+), 28 deletions(-) diff --git a/searcharray/roaringish/spans.pyx b/searcharray/roaringish/spans.pyx index 4b1add8..78eca47 100644 --- a/searcharray/roaringish/spans.pyx +++ b/searcharray/roaringish/spans.pyx @@ -120,7 +120,7 @@ cdef _span_freqs(DTYPE_t[:] posns, # Flattened all terms in one array cdef np.int64_t[:] span_end = np.empty(64, dtype=np.int64) cdef np.uint64_t next_active_beg = 0 cdef np.uint64_t curr_term_mask = 0 - cdef np.uint64_t num_terms = len(lengths) + cdef np.uint64_t num_terms = len(lengths) - 1 cdef np.uint64_t all_terms_mask = (1 << num_terms) - 1 cdef np.uint64_t term_ord = 0 cdef np.uint64_t curr_key = 0 @@ -128,24 +128,19 @@ cdef _span_freqs(DTYPE_t[:] posns, # Flattened all terms in one array cdef np.uint64_t payload_base = 0 last_set_idx = 0 while curr_idx[0] < lengths[1]: - print(f"curr_key: {curr_key}") - # Read each term up to the next doc last_key = -1 for term_ord in range(num_terms): - curr_key = (posns[curr_idx[term_ord]] & key_mask >> (64 - key_bits)) + curr_key = ((posns[curr_idx[term_ord]] & key_mask) >> (64 - key_bits)) while curr_idx[term_ord] < lengths[term_ord+1]: last_key = curr_key term = posns[curr_idx[term_ord]] & payload_mask - print("Starting loop") while term != 0: # Consume into span set_idx = __builtin_ctzll(term) - print(term_ord, term, set_idx) # Clear LSB term = (term & (term - 1)) - print("Cleared", term_ord, term, set_idx) # Start a span curr_term_mask = 0x1 << term_ord active_spans_queue[next_active_beg] = curr_term_mask @@ -161,25 +156,20 @@ cdef _span_freqs(DTYPE_t[:] posns, # Flattened all terms in one array active_spans_queue[span_idx] |= curr_term_mask span_end[span_idx] = payload_base + set_idx if abs(span_end[span_idx] - span_beg[span_idx]) > num_terms + slop: - print(f"Removing span {span_idx} | popcount {popcount} -- begin: {span_beg[span_idx]}, end: {span_end[span_idx]}, slop: {slop}") span_beg[span_idx] = 0 span_end[span_idx] = 0 active_spans_queue[span_idx] = 0 - else: - print(f" Keeping span {span_idx} | popcount {popcount} -- begin: {span_beg[span_idx]}, end: {span_end[span_idx]}, slop: {slop}") - if next_active_beg > 64: + if next_active_beg > 64: break next_active_beg += 1 last_set_idx = set_idx - print("Next posn in term") curr_idx[term_ord] += 1 curr_key = posns[curr_idx[term_ord]] & key_mask if curr_key != last_key or next_active_beg > 64: break # All terms consumed for doc - print(f"Collecting spans for {last_key}") # Make new active span queue new_active_span_queue = np.empty(64, dtype=np.uint64) @@ -191,9 +181,7 @@ cdef _span_freqs(DTYPE_t[:] posns, # Flattened all terms in one array popcount = __builtin_popcountll(active_spans_queue[span_idx]) if popcount != num_terms: continue - print(f"Collecting span {span_idx} | popcount {popcount} -- begin: {span_beg[span_idx]}, end: {span_end[span_idx]}") phrase_freqs[last_key] += 1 - print(f"Phrase freqs: {phrase_freqs[last_key]}") # Reset next_active_beg = 0 @@ -202,7 +190,6 @@ cdef _span_freqs(DTYPE_t[:] posns, # Flattened all terms in one array span_end = new_span_end - def span_search(np.ndarray[DTYPE_t, ndim=1] posns, np.ndarray[DTYPE_t, ndim=1] lengths, np.ndarray[double, ndim=1] phrase_freqs, diff --git a/test/test_slop_matches.py b/test/test_slop_matches.py index 1340905..1586322 100644 --- a/test/test_slop_matches.py +++ b/test/test_slop_matches.py @@ -1,17 +1,21 @@ from searcharray.postings import SearchArray +import numpy as np from test_utils import w_scenarios +from time import perf_counter scenarios = { "direct_phrase": { "phrase": "intergalactic bounty hunters", "doc": """A massive ball of furry creatures from another world eat their way through a small mid-western town followed by intergalactic bounty hunters opposed only by militant townspeople.""", - "slop": 0 + "slop": 0, + "match": True }, "slop 1": { "phrase": "massive ball furry", "doc": """A massive ball of furry creatures from another world eat their way through a small mid-western town followed by intergalactic bounty hunters opposed only by militant townspeople.""", - "slop": 1 + "slop": 1, + "match": True }, "two_after_the": { "phrase": "the to be", @@ -19,7 +23,8 @@ Broke and alone on New Year's Eve, Wilson just wants to spend the rest of a very bad year in bed. But, when his best friend convinces him to post a personal ad, he meets a woman bent on finding the right guy to be with at midnight.""", - "slop": 2 + "slop": 2, + "match": True }, "slop_3_order": { "phrase": "the to be", @@ -29,24 +34,43 @@ The murders are found to be the work of an out-of-control experiment in genetic engineering. The two men must descend into the city's sewer systems to destroy the horrific miscreation. It won't be hard to find, as it's already looking for its next victims...""", - "slop": 3 + "slop": 3, + "match": True }, "slop_5": { "phrase": "spice found substance", "doc": """ In the year 10,191, the world is at war for control of the desert planet Dune—the only place where the time-travel substance spice can be found But when one leader gives up control, it's only so he can stage a coup with some unsavory characters.""", - "slop": 5 + "slop": 5, + "match": True + }, + "slop_5_len_5": { + "phrase": "spice found substance can be", + "doc": """ +In the year 10,191, the world is at war for control of the desert planet Dune—the only place where the time-travel substance spice can be found But when one leader gives up control, it's only so he can stage a coup with some unsavory characters.""", + "slop": 5, + "match": True + }, + "slop_5_len_5_no_match": { + "phrase": "there is no match for this", + "doc": """ +In the year 10,191, the world is at war for control of the desert planet Dune—the only place where the time-travel substance spice can be found But when one leader gives up control, it's only so he can stage a coup with some unsavory characters.""", + "slop": 5, + "match": False }, } @w_scenarios(scenarios) -def test_phrase_slop(phrase, doc, slop): - sa = SearchArray.index([doc, " empty ", doc + " " + doc, " empty"]) +def test_phrase_slop(phrase, doc, slop, match): + sa = SearchArray.index([doc, " empty ", doc + " " + doc, " empty"] * 100) phrase_toks = sa.tokenizer(phrase) - # assert sa.score(phrase_toks, slop=slop) + start = perf_counter() + scores = sa.score(phrase_toks, slop=slop) + print("Elapsed time:", perf_counter() - start) for match_slop in range(slop, max(slop, 10)): - assert sa.score(phrase_toks, slop=match_slop)[0] > 0 - assert sa.score(phrase_toks, slop=match_slop)[1] == 0 - assert sa.score(phrase_toks, slop=match_slop)[2] > 0 - assert sa.score(phrase_toks, slop=match_slop)[3] == 0 + if match: + assert np.all(scores[::2] > 0) + else: + assert np.all(scores[::2] == 0) + assert np.all(scores[1::2] == 0)