Skip to content

Commit

Permalink
All pass except same term
Browse files (browse the repository at this point in the history)
  • Loading branch information
softwaredoug committed Jun 21, 2024
1 parent 3760158 commit 985ba43
Show file tree
Hide file tree
Showing 2 changed files with 27 additions and 10 deletions.
35 changes: 26 additions & 9 deletions searcharray/roaringish/spans.pyx
Original file line number | Diff line number | Diff line change
Expand Up @@ -135,17 +135,21 @@ cdef _span_freqs(DTYPE_t[:] posns, # Flattened all terms in one array

for i in range(num_terms):
curr_idx[i] = lengths[i]
print(f"Term {i} -- {curr_idx[i]}")

while curr_idx[0] < lengths[1]:
# Read each term up to the next doc
last_key = -1
print("****")
print("Restarting")
for term_ord in range(num_terms):
curr_key = ((posns[curr_idx[term_ord]] & key_mask) >> (64 - key_bits))
print(f"Term {term_ord} -- {curr_key} | {curr_idx[term_ord]} -- lengths {lengths[term_ord+1]}")
while curr_idx[term_ord] < lengths[term_ord+1]:
last_key = curr_key
term = posns[curr_idx[term_ord]] & payload_mask

print(f"Term {term_ord} -- {term:0b} | {curr_key} | {curr_idx[term_ord]}")
print(f"Term {term_ord} -- {term:0b} | {curr_key} | {curr_idx[term_ord]} -- num_terms {num_terms}")

while term != 0:
# Consume into span
Expand All @@ -156,22 +160,33 @@ cdef _span_freqs(DTYPE_t[:] posns, # Flattened all terms in one array
curr_term_mask = 0x1 << term_ord
active_spans_queue[next_active_beg] = curr_term_mask
active_spans_posns[next_active_beg] = 1 << set_idx
print(f"{next_active_beg} -- added {set_idx} | {active_spans_posns[next_active_beg]:0b}")
print(" New span")
print(f" {next_active_beg} -- added {set_idx} | {active_spans_posns[next_active_beg]:0b}")
if term_ord == 0:
span_beg[next_active_beg] = payload_base + set_idx
span_beg[next_active_beg] = set_idx
print(f" beg {span_beg[next_active_beg]} end {span_end[next_active_beg]}")

# Remove spans that are too long
print(" Scanning spans")
for span_idx in range(next_active_beg):
# Continue active spans
num_terms_visited = __builtin_popcountll(active_spans_queue[span_idx])
num_posns_visited = __builtin_popcountll(active_spans_posns[span_idx])
if num_terms_visited == num_terms and num_posns_visited == num_terms:
print(f" {span_idx} -- num_terms_visited {num_terms_visited} | num_posns {num_posns_visited}")
print(f" {span_idx} -- {active_spans_queue[span_idx]:0b} | {active_spans_posns[span_idx]:0b}")
if num_terms_visited < num_terms and num_posns_visited == num_terms:
continue
active_spans_queue[span_idx] |= curr_term_mask
print(f"{span_idx} -- set_idx {set_idx} | {active_spans_posns[span_idx]:0b}")
active_spans_posns[span_idx] |= (1 << set_idx)
span_end[span_idx] = payload_base + set_idx
if abs(span_end[span_idx] - span_beg[span_idx]) > num_terms + slop:
num_terms_visited_now = __builtin_popcountll(active_spans_queue[span_idx])
if num_terms_visited_now > num_terms_visited:
# Add position for new unique term
active_spans_posns[span_idx] |= (1 << set_idx)
span_end[span_idx] = set_idx
print(f" {span_idx} -- set_idx {set_idx} | {active_spans_posns[span_idx]:0b}")
print(f" {span_idx} -- beg {span_beg[span_idx]} | end {span_end[span_idx]}")
if num_terms_visited_now == num_terms and abs(span_end[span_idx] - span_beg[span_idx]) > num_terms + slop:
print(f" {span_idx} -- removing {span_beg[span_idx]}-{span_end[span_idx]}")
print(f" {span_idx} -- slop: {slop} num_terms: {num_terms}")
span_beg[span_idx] = 0
span_end[span_idx] = 0
active_spans_queue[span_idx] = 0
Expand All @@ -182,12 +197,14 @@ cdef _span_freqs(DTYPE_t[:] posns, # Flattened all terms in one array
next_active_beg += 1
last_set_idx = set_idx
curr_idx[term_ord] += 1
curr_key = posns[curr_idx[term_ord]] & key_mask
curr_key = (posns[curr_idx[term_ord]] & key_mask) >> (64 - key_bits)
if curr_key != last_key or next_active_beg > 64:
print(f"Term {term_ord} -- Key change {curr_key} != {last_key} | {next_active_beg}")
break

# All terms consumed for doc

print("***")
print("***")
print(f"Collect for {last_key}", flush=True)

Expand Down
2 changes: 1 addition & 1 deletion test/test_slop_matches.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -74,7 +74,7 @@

@w_scenarios(scenarios)
def test_phrase_slop(phrase, doc, slop, match):
sa = SearchArray.index([doc, " empty ", doc + " " + doc, " empty"] * 100)
sa = SearchArray.index([doc, " empty ", doc + " " + doc, " empty"] * 4)
phrase_toks = sa.tokenizer(phrase)
start = perf_counter()
scores = sa.score(phrase_toks, slop=slop)
Expand Down

0 comments on commit 985ba43

Please sign in to comment.