Skip to content

Commit

Permalink
Flesh out naive span search algorithm
Browse files Browse the repository at this point in the history
  • Loading branch information
softwaredoug committed Jun 19, 2024
1 parent 3019828 commit 33430c8
Showing 1 changed file with 66 additions and 18 deletions.
84 changes: 66 additions & 18 deletions searcharray/roaringish/spans.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -113,26 +113,74 @@ cdef _span_freqs(DTYPE_t[:, :] posns_arr,
#
# curr_posns are current bits analyzed for slop
cdef np.uint64_t[:] curr_posns = np.empty(posns_arr.shape[0], dtype=np.uint64)
cdef np.uint64_t[:] active_spans_queue = np.empty(64, dtype=np.uint64)
cdef np.uint64_t[:] active_spans_keys = np.empty(64, dtype=np.uint64)
cdef np.uint64_t[:] span_score_queue = np.empty(64, dtype=np.uint64)
cdef np.uint64_t next_active_beg = 0
cdef np.uint64_t curr_term_mask = 0
cdef np.uint64_t num_terms = posns_arr.shape[0]
cdef np.uint64_t all_terms_mask = (1 << num_terms) - 1
cdef np.uint64_t term_ord = 0
cdef np.uint64_t curr_key = 0
last_set_idx = 0
for i in range(posns_arr.shape[1]):

# Each term
for j in range(posns_arr.shape[0]):
curr_key = posns_arr[0, i] & key_mask

if curr_key != last_key:
next_active_beg = 0

# Make new active span queue
new_active_span_queue = np.empty(64, dtype=np.uint64)
new_span_score_queue = np.empty(64, dtype=np.uint64)
new_span_keys_queue = np.empty(64, dtype=np.uint64)

# Copy existing
for span_idx in range(next_active_beg):
if __builtin_popcountll(active_spans_queue[span_idx]) != num_terms:
continue
print("Keeping span")
print("Span score: ", span_score_queue[span_idx])
new_active_span_queue[span_idx] = active_spans_queue[span_idx]
new_span_score_queue[span_idx] = span_score_queue[span_idx]
new_span_keys_queue[span_idx] = active_spans_keys[span_idx]

# Each term is potentially a new span
for term_ord in range(num_terms):
# Each msb
# Later optimization - could we do this without storing which_terms?
term = posns_arr[j, i] & payload_mask
set_idx = __builtin_ctzll(term)
posns_arr[j, i] &= ~(1 << set_idx)
which_terms[set_idx] = j

# Gather and score min spans
for posn in which_terms:
print(posn)
if posn == 0xFF:
continue

# Shift the which_terms up by num_payload_bits
for j in range(64 - lsb_bits):
which_terms[j + lsb_bits] = which_terms[j]
term = posns_arr[term_ord, i] & payload_mask
set_idx = __builtin_ctzll(posns_arr[term_ord, i] & payload_mask)
# Start a span
curr_term_mask = 0x1 << term_ord
active_spans_queue[next_active_beg] = curr_term_mask
active_spans_keys[next_active_beg] = curr_key
span_score_queue[next_active_beg] = term_ord # The term index as start score, because 0 is in order
for span_idx in range(next_active_beg):
active_spans_queue[span_idx] |= curr_term_mask
span_score_queue[span_idx] += set_idx - last_set_idx - 1 # distance of 1, score 0
if span_score_queue[span_idx] > slop:
print(f"Removing span {span_idx}")
active_spans_queue[span_idx] = 0
active_spans_keys[span_idx] = 0
span_score_queue[span_idx] = 0x7FFFFFFFFFFFFFFF
print(f"Score of {span_idx} is {span_score_queue[span_idx]}")
next_active_beg += 1
last_set_idx = set_idx

last_key = curr_key

# Collect last
# Copy existing
for span_idx in range(next_active_beg):
if __builtin_popcountll(active_spans_queue[span_idx]) != num_terms:
continue
print("For key ", curr_key)
print("Keeping span")
print("Span score: ", span_score_queue[span_idx])
new_active_span_queue[span_idx] = active_spans_queue[span_idx]
new_span_score_queue[span_idx] = span_score_queue[span_idx]
new_span_keys_queue[span_idx] = active_spans_keys[span_idx]



# The min popcount is the upper bound of phrase freq
# popcount_xored_min = 128
Expand Down

0 comments on commit 33430c8

Please sign in to comment.