Skip to content

Commit

Permalink
Revert "False start"
Browse files Browse the repository at this point in the history
This reverts commit c589811.
  • Loading branch information
softwaredoug committed Jun 19, 2024
1 parent c589811 commit 4b0ab88
Showing 1 changed file with 35 additions and 62 deletions.
97 changes: 35 additions & 62 deletions searcharray/roaringish/spans.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -37,23 +37,6 @@ cdef _get_adj_spans(DTYPE_t[:, :] posns_arr,
pass


cdef _bits_set(DTYPE_t value,
               DTYPE_t[:] buffer64,
               DTYPE_t* buffer_write_len):
    """Get the bits set in a 64 bit value.

    The index of every set bit of ``value`` is written to ``buffer64`` in
    ascending order (least-significant bit first), starting at
    ``buffer64[0]``.

    Parameters
    ----------
    value
        Word whose set bits are enumerated.
    buffer64
        Output buffer. It needs one slot per set bit of ``value``; this
        function does not check the buffer length itself.
    buffer_write_len
        Out-parameter. Reset to 0 on entry and left holding the number of
        bit indices written to ``buffer64``.
    """
    cdef DTYPE_t bit_posn = 0
    buffer_write_len[0] = 0
    while value > 0:
        # The trailing-zero count of a non-zero word is the index of its
        # lowest set bit, so there is no need to isolate that bit first.
        bit_posn = __builtin_ctzll(value)
        buffer64[buffer_write_len[0]] = bit_posn
        # Clear the lowest set bit
        value &= value - 1
        buffer_write_len[0] += 1


cdef _span_freqs(DTYPE_t[:, :] posns_arr,
double[:] phrase_freqs,
DTYPE_t slop,
Expand Down Expand Up @@ -129,90 +112,80 @@ cdef _span_freqs(DTYPE_t[:, :] posns_arr,
# Now we score the span to see if it is within slop (score <= slop)
#
# curr_posns are current bits analyzed for slop
cdef np.uint64_t[:] bits_set = np.empty(64, dtype=np.uint64)
cdef np.uint64_t bits_set_len = 0

cdef np.uint64_t[:] curr_posns = np.empty(posns_arr.shape[0], dtype=np.uint64)
cdef np.uint64_t[:] active_spans_queue = np.empty(64, dtype=np.uint64)
cdef np.int64_t[:] span_score_queue = np.empty(64, dtype=np.int64)
cdef np.uint64_t[:] span_score_queue = np.empty(64, dtype=np.uint64)
cdef np.uint64_t next_active_beg = 0
cdef np.uint64_t curr_term_mask = 0
cdef np.uint64_t num_terms = posns_arr.shape[0]
cdef np.uint64_t all_terms_mask = (1 << num_terms) - 1
cdef np.uint64_t term_ord = 0
cdef np.uint64_t curr_key = 0
cdef np.uint64_t last_key = 0
cdef np.uint64_t payload_base = 0
last_set_idx = 0
for i in range(posns_arr.shape[1]):
curr_key = posns_arr[0, i] & key_mask

if curr_key != last_key:
print("-----------")
print(f"Collecting spans for {curr_key} - {next_active_beg} active spans")

print(f"Collecting spans for {curr_key}")
next_active_beg = 0
# Make new active span queue
new_active_span_queue = np.empty(64, dtype=np.uint64)
new_span_score_queue = np.empty(64, dtype=np.int64)
new_span_score_queue = np.empty(64, dtype=np.uint64)

# Copy existing
for span_idx in range(next_active_beg):
span_size = __builtin_popcountll(active_spans_queue[span_idx])
print(f"Span {span_idx} size: {span_size}")
if span_size != num_terms:
if __builtin_popcountll(active_spans_queue[span_idx]) != num_terms:
continue
print("Keeping span")
print("Span score: ", span_score_queue[span_idx])
new_active_span_queue[span_idx] = active_spans_queue[span_idx]
new_span_score_queue[span_idx] = span_score_queue[span_idx]
phrase_freqs[curr_key] += 1

next_active_beg = 0

active_spans_queue = new_active_span_queue
span_score_queue = new_span_score_queue

# We may not be processing each term in order
# Each term is potentially a new span
for term_ord in range(num_terms):
# Each msb
term = posns_arr[term_ord, i] & payload_mask
_bits_set(term, bits_set, &bits_set_len)
last_set_idx = 0
print("---")
print(f"Term ord: {term_ord}, bits_set_len: {bits_set_len}")
for idx in range(bits_set_len):
set_idx = bits_set[idx]
print(f"set_idx: {set_idx}, last_set_idx: {last_set_idx}, payload_base: {payload_base}")
curr_term_mask = 0x1 << term_ord
active_spans_queue[next_active_beg] = curr_term_mask
span_score_queue[next_active_beg] = term_ord # The term index as start score, because 0 is in order
for span_idx in range(next_active_beg):
if __builtin_popcountll(active_spans_queue[span_idx]) == num_terms:
print(f" Not updating completed span {span_idx} score: {span_score_queue[span_idx]}")
continue
active_spans_queue[span_idx] |= curr_term_mask
span_score_queue[span_idx] += set_idx - last_set_idx - 1
if span_score_queue[span_idx] > slop:
print(f"Removing span {span_idx} w/ score {span_score_queue[span_idx]}")
active_spans_queue[span_idx] = 0
span_score_queue[span_idx] = 0x7FFFFFFFFFFFFFFF
else:
print(f" Keeping Span {span_idx} score: {span_score_queue[span_idx]}")
next_active_beg += 1
last_set_idx = set_idx
set_idx = __builtin_ctzll(posns_arr[term_ord, i] & payload_mask)
# Start a span
curr_term_mask = 0x1 << term_ord
active_spans_queue[next_active_beg] = curr_term_mask
span_score_queue[next_active_beg] = term_ord # The term index as start score, because 0 is in order
for span_idx in range(next_active_beg):
active_spans_queue[span_idx] |= curr_term_mask
span_score_queue[span_idx] += payload_base + (set_idx - last_set_idx - 1) # distance of 1, score 0
if span_score_queue[span_idx] > slop:
print(f"Removing span {span_idx}")
active_spans_queue[span_idx] = 0
span_score_queue[span_idx] = 0x7FFFFFFFFFFFFFFF
print(f"Score of {span_idx} is {span_score_queue[span_idx]}")
next_active_beg += 1
last_set_idx = set_idx

payload_base += lsb_bits
last_key = curr_key
# Make new active span queue
new_active_span_queue = np.empty(64, dtype=np.uint64)
new_span_score_queue = np.empty(64, dtype=np.uint64)

# Copy existing
print("-----------")
print(f"Collecting spans for {curr_key} - {next_active_beg} active spans")
for span_idx in range(next_active_beg):
span_size = __builtin_popcountll(active_spans_queue[span_idx])
print(f"Span {span_idx} size: {span_size}")
if span_size != num_terms:
if __builtin_popcountll(active_spans_queue[span_idx]) != num_terms:
continue
print("Keeping span")
print("Span score: ", span_score_queue[span_idx])
new_active_span_queue[span_idx] = active_spans_queue[span_idx]
new_span_score_queue[span_idx] = span_score_queue[span_idx]
phrase_freqs[curr_key] += 1

active_spans_queue = new_active_span_queue
span_score_queue = new_span_score_queue


def span_search(np.ndarray[DTYPE_t, ndim=2] posns_arr,
np.ndarray[double, ndim=1] phrase_freqs,
Expand Down

0 comments on commit 4b0ab88

Please sign in to comment.