Skip to content

Commit

Permalink
Remove prints etc
Browse files (browse the repository at this point in the history)
  • Loading branch information
softwaredoug committed Jun 20, 2024
1 parent ca0223b commit d0117ca
Show file tree
Hide file tree
Showing 2 changed files with 39 additions and 28 deletions.
19 changes: 3 additions & 16 deletions searcharray/roaringish/spans.pyx
Original file line number | Diff line number | Diff line change
Expand Up @@ -120,32 +120,27 @@ cdef _span_freqs(DTYPE_t[:] posns, # Flattened all terms in one array
cdef np.int64_t[:] span_end = np.empty(64, dtype=np.int64)
cdef np.uint64_t next_active_beg = 0
cdef np.uint64_t curr_term_mask = 0
cdef np.uint64_t num_terms = len(lengths)
cdef np.uint64_t num_terms = len(lengths) - 1
cdef np.uint64_t all_terms_mask = (1 << num_terms) - 1
cdef np.uint64_t term_ord = 0
cdef np.uint64_t curr_key = 0
cdef np.uint64_t last_key = 0
cdef np.uint64_t payload_base = 0
last_set_idx = 0
while curr_idx[0] < lengths[1]:
print(f"curr_key: {curr_key}")

# Read each term up to the next doc
last_key = -1
for term_ord in range(num_terms):
curr_key = (posns[curr_idx[term_ord]] & key_mask >> (64 - key_bits))
curr_key = ((posns[curr_idx[term_ord]] & key_mask) >> (64 - key_bits))
while curr_idx[term_ord] < lengths[term_ord+1]:
last_key = curr_key
term = posns[curr_idx[term_ord]] & payload_mask

print("Starting loop")
while term != 0:
# Consume into span
set_idx = __builtin_ctzll(term)
print(term_ord, term, set_idx)
# Clear LSB
term = (term & (term - 1))
print("Cleared", term_ord, term, set_idx)
# Start a span
curr_term_mask = 0x1 << term_ord
active_spans_queue[next_active_beg] = curr_term_mask
Expand All @@ -161,25 +156,20 @@ cdef _span_freqs(DTYPE_t[:] posns, # Flattened all terms in one array
active_spans_queue[span_idx] |= curr_term_mask
span_end[span_idx] = payload_base + set_idx
if abs(span_end[span_idx] - span_beg[span_idx]) > num_terms + slop:
print(f"Removing span {span_idx} | popcount {popcount} -- begin: {span_beg[span_idx]}, end: {span_end[span_idx]}, slop: {slop}")
span_beg[span_idx] = 0
span_end[span_idx] = 0
active_spans_queue[span_idx] = 0
else:
print(f" Keeping span {span_idx} | popcount {popcount} -- begin: {span_beg[span_idx]}, end: {span_end[span_idx]}, slop: {slop}")

if next_active_beg > 64:
if next_active_beg > 64:
break
next_active_beg += 1
last_set_idx = set_idx
print("Next posn in term")
curr_idx[term_ord] += 1
curr_key = posns[curr_idx[term_ord]] & key_mask
if curr_key != last_key or next_active_beg > 64:
break

# All terms consumed for doc
print(f"Collecting spans for {last_key}")

# Make new active span queue
new_active_span_queue = np.empty(64, dtype=np.uint64)
Expand All @@ -191,9 +181,7 @@ cdef _span_freqs(DTYPE_t[:] posns, # Flattened all terms in one array
popcount = __builtin_popcountll(active_spans_queue[span_idx])
if popcount != num_terms:
continue
print(f"Collecting span {span_idx} | popcount {popcount} -- begin: {span_beg[span_idx]}, end: {span_end[span_idx]}")
phrase_freqs[last_key] += 1
print(f"Phrase freqs: {phrase_freqs[last_key]}")

# Reset
next_active_beg = 0
Expand All @@ -202,7 +190,6 @@ cdef _span_freqs(DTYPE_t[:] posns, # Flattened all terms in one array
span_end = new_span_end



def span_search(np.ndarray[DTYPE_t, ndim=1] posns,
np.ndarray[DTYPE_t, ndim=1] lengths,
np.ndarray[double, ndim=1] phrase_freqs,
Expand Down
48 changes: 36 additions & 12 deletions test/test_slop_matches.py
Original file line number | Diff line number | Diff line change
@@ -1,25 +1,30 @@
from searcharray.postings import SearchArray
import numpy as np
from test_utils import w_scenarios
from time import perf_counter


scenarios = {
"direct_phrase": {
"phrase": "intergalactic bounty hunters",
"doc": """A massive ball of furry creatures from another world eat their way through a small mid-western town followed by intergalactic bounty hunters opposed only by militant townspeople.""",
"slop": 0
"slop": 0,
"match": True
},
"slop 1": {
"phrase": "massive ball furry",
"doc": """A massive ball of furry creatures from another world eat their way through a small mid-western town followed by intergalactic bounty hunters opposed only by militant townspeople.""",
"slop": 1
"slop": 1,
"match": True
},
"two_after_the": {
"phrase": "the to be",
"doc": """
Broke and alone on New Year's Eve, Wilson just wants to spend the rest of a very bad year in bed.
But, when his best friend convinces him to post a personal ad,
he meets a woman bent on finding the right guy to be with at midnight.""",
"slop": 2
"slop": 2,
"match": True
},
"slop_3_order": {
"phrase": "the to be",
Expand All @@ -29,24 +34,43 @@
The murders are found to be the work of an out-of-control experiment in genetic engineering.
The two men must descend into the city's sewer systems to destroy the horrific miscreation.
It won't be hard to find, as it's already looking for its next victims...""",
"slop": 3
"slop": 3,
"match": True
},
"slop_5": {
"phrase": "spice found substance",
"doc": """
In the year 10,191, the world is at war for control of the desert planet Dune—the only place where the time-travel substance spice can be found But when one leader gives up control, it's only so he can stage a coup with some unsavory characters.""",
"slop": 5
"slop": 5,
"match": True
},
"slop_5_len_5": {
"phrase": "spice found substance can be",
"doc": """
In the year 10,191, the world is at war for control of the desert planet Dune—the only place where the time-travel substance spice can be found But when one leader gives up control, it's only so he can stage a coup with some unsavory characters.""",
"slop": 5,
"match": True
},
"slop_5_len_5_no_match": {
"phrase": "there is no match for this",
"doc": """
In the year 10,191, the world is at war for control of the desert planet Dune—the only place where the time-travel substance spice can be found But when one leader gives up control, it's only so he can stage a coup with some unsavory characters.""",
"slop": 5,
"match": False
},
}


@w_scenarios(scenarios)
def test_phrase_slop(phrase, doc, slop):
sa = SearchArray.index([doc, " empty ", doc + " " + doc, " empty"])
def test_phrase_slop(phrase, doc, slop, match):
sa = SearchArray.index([doc, " empty ", doc + " " + doc, " empty"] * 100)
phrase_toks = sa.tokenizer(phrase)
# assert sa.score(phrase_toks, slop=slop)
start = perf_counter()
scores = sa.score(phrase_toks, slop=slop)
print("Elapsed time:", perf_counter() - start)
for match_slop in range(slop, max(slop, 10)):
assert sa.score(phrase_toks, slop=match_slop)[0] > 0
assert sa.score(phrase_toks, slop=match_slop)[1] == 0
assert sa.score(phrase_toks, slop=match_slop)[2] > 0
assert sa.score(phrase_toks, slop=match_slop)[3] == 0
if match:
assert np.all(scores[::2] > 0)
else:
assert np.all(scores[::2] == 0)
assert np.all(scores[1::2] == 0)

0 comments on commit d0117ca

Please sign in to comment.