-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathPositionalMerge.py
115 lines (107 loc) · 4.79 KB
/
PositionalMerge.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
# [(docID, tf, []), ...]
# [(docID, tf, []), ...]
import math
DOC_ID_INDEX = 0
TF_INDEX = 1
POSTINGS_INDEX = 2
def get_docID(list, index):
return list[index][DOC_ID_INDEX]
def has_skip(list, index, skip_dist):
return index + skip_dist < len(list) and index % skip_dist == 0
def get_skip_docID(list, index, skip_dist):
return list[index + skip_dist][DOC_ID_INDEX]
def get_skip_position(list, index, skip_dist):
return list[index + skip_dist]
def get_postings_from_phrase(phrase, postings_lists):
'''
Returns a postings list where each posting is a (docID, tf) tuple for the given phrase.
The algorithm goes through the phrase and performs two-way merge for each contiguous pair in phrase, in the
case where the phrase has three words. Only one merge is done for biwords.
:param phrase: a list of either 2 or 3 terms (biword or triword).
:param postings_lists: the postings lists for each term in the phrase in order, where each posting is in the form
[docID, tf, position list].
'''
if len(phrase) == 2:
postings_list_A = postings_lists[0]
postings_list_B = postings_lists[1]
df_A = len(postings_list_A)
df_B = len(postings_list_B)
docs = intersect_lists(postings_list_A, postings_list_B, df_A, df_B)
result = list(map(lambda x: (x[0], x[1]), docs))
else: # len(phrase) == 3
postings_list_A = postings_lists[0]
postings_list_B = postings_lists[1]
postings_list_C = postings_lists[2]
df_A = len(postings_list_A)
df_B = len(postings_list_B)
df_C = len(postings_list_C)
first = intersect_lists(postings_list_A, postings_list_B, df_A, df_B)
second = intersect_lists(postings_list_B, postings_list_C, df_B, df_C)
docs = intersect_lists(first, second, len(first), len(second))
result = list(map(lambda x: (x[0], x[1]), docs))
return result
def intersect_lists(listA, listB, A_length, B_length):
'''
Merges two lists of the form [docID, tf, position list] by first merging by docID.
When the same docID is encountered in both lists, the intersect_position_lists function is called
to merge the position lists and find positions which start the phrase.
Skip pointers are dynamically generated based on the length of the lists.
Returns a list of (docID, position list) tuples.
'''
result = []
i, j = 0, 0
skip_dist_A = int(math.sqrt(A_length))
skip_dist_B = int(math.sqrt(B_length))
while i < A_length and j < B_length:
docID_A = get_docID(listA, i)
docID_B = get_docID(listB, j)
if docID_A == docID_B:
positions = intersect_postion_lists(listA[i][POSTINGS_INDEX], listB[j][POSTINGS_INDEX],
listA[i][TF_INDEX], listB[j][TF_INDEX])
if positions:
result.append((docID_A, len(positions), positions))
i += 1
j += 1
elif docID_A < docID_B:
if has_skip(listA, i, skip_dist_A) and get_skip_docID(listA, i, skip_dist_A) <= docID_B:
while has_skip(listA, i, skip_dist_A) and get_skip_docID(listA, i, skip_dist_A) <= docID_B:
i += skip_dist_A
else:
i += 1
elif docID_A > docID_B:
if has_skip(listB, j, skip_dist_B) and get_skip_docID(listB, j, skip_dist_B) <= docID_A:
while has_skip(listB, j, skip_dist_B) and get_skip_docID(listB, j, skip_dist_B) <= docID_A:
j += skip_dist_B
else:
j += 1
return result
def intersect_postion_lists(listA, listB, A_length, B_length):
'''
Returns the intersection of listA and listB, both of which are positional lists.
For a positional index x to be appended to the final result, the position x must be in listA
while x+1 must be found in listB.
'''
result = []
i, j = 0, 0
skip_dist_A = int(math.sqrt(A_length))
skip_dist_B = int(math.sqrt(B_length))
while i < A_length and j < B_length:
pos_A = listA[i]
pos_B = listB[j]
if pos_A == pos_B - 1:
result.append(pos_A)
i += 1
j += 1
elif pos_A < pos_B - 1:
if has_skip(listA, i, skip_dist_A) and get_skip_position(listA, i, skip_dist_A) <= pos_B:
while has_skip(listA, i, skip_dist_A) and get_skip_position(listA, i, skip_dist_A) <= pos_B:
i += skip_dist_A
else:
i += 1
elif pos_A > pos_B - 1:
if has_skip(listB, j, skip_dist_B) and get_skip_position(listB, j, skip_dist_B) <= pos_A:
while has_skip(listB, j, skip_dist_B) and get_skip_position(listB, j, skip_dist_B) <= pos_A:
j += skip_dist_B
else:
j += 1
return result