-
Notifications
You must be signed in to change notification settings - Fork 186
/
Copy pathlex_features.py
71 lines (57 loc) · 1.99 KB
/
lex_features.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
from featureforge.feature import output_schema
@output_schema({str})
def bag_of_left_entity_IOB_chain(datapoint):
    """Placeholder feature: currently always yields an empty bag.

    The declared output schema is a set of strings. No IOB chain is computed
    here yet, so the function ignores ``datapoint`` and returns an empty set.

    Args:
        datapoint: the object the feature is evaluated on (unused for now).

    Returns:
        set: always empty.
    """
    # TODO: compute the IOB chains for the left entity occurrence of
    # ``datapoint`` once the chain-building helper is finished. The leftover
    # debugging print and the unused local lookup were removed so that
    # evaluating the feature no longer writes to stdout.
    return set()
def _bag_of_eo_IOB_chain(datapoint, eo):
tokens = datapoint.segment.tokens
eo_tokens = tokens[eo.segment_offset: eo.segment_offset_end]
result = set()
lex_trees = datapoint.segment.let_trees
if not lex_trees:
return set() # was not parsed on preprocess
sentences = datapoint.segment.sentences
for idx, eo_tk in enumerate(eo_tokens, eo.segment_offset):
sentence = max(filter(lambda x: x<=idx, sentences))
sentence_idx = sentences.index(sentence)
tree = lex_trees[sentence_idx]
tk_actual_idx = idx - sentence
assert tk_actual_idx >= 0
path = tree.leaf_treeposition(tk_actual_idx)
#chain =
#######
def walk_tree(tree, path):
    """Follow ``path`` down from ``tree`` and return the node reached.

    Args:
        tree: any object supporting ``obj[key]`` lookups at every level that
            ``path`` descends through.
        path: iterable of keys/indices, applied one after another.

    Returns:
        The object found after applying every step of ``path``; ``tree``
        itself when ``path`` is empty.
    """
    node = tree
    for child_key in path:
        # Descend one level per step.
        node = node[child_key]
    return node
def chunk_tag(evidence):
    """Collect the IOB-style chunk tags of the leaves of the first tree.

    Only ``evidence.segment.lex_trees[0]`` is inspected. For every leaf, the
    node two levels above it (the parent of the leaf's immediate parent) is
    looked up with ``walk_tree``:

    * if that node is labelled ``"S"`` the tag is ``"O"``;
    * otherwise the tag is ``"B-<label>"`` when the leaf's immediate parent
      is the first child of that node, and ``"I-<label>"`` otherwise.

    Args:
        evidence: object exposing ``segment.lex_trees``, a sequence of trees
            providing ``leaves()``, ``leaf_treeposition(i)`` and ``label()``.

    Returns:
        set of str: the distinct tags found; empty when there are no trees.
    """
    lex_trees = evidence.segment.lex_trees
    if not lex_trees:
        # No tree available: return an empty bag instead of failing on [0].
        return set()
    tree = lex_trees[0]
    tags = set()
    for leaf_index, _ in enumerate(tree.leaves()):
        path = tree.leaf_treeposition(leaf_index)
        # path[-1] indexes the leaf inside its immediate parent, so dropping
        # the last two steps lands on the grandparent node.
        chunk_label = walk_tree(tree, path[:-2]).label()
        if chunk_label == "S":
            tags.add("O")
            continue
        # path[-2] is the position of the leaf's parent inside the chunk.
        modifier = "B" if path[-2] == 0 else "I"
        tags.add("{}-{}".format(modifier, chunk_label))
    return tags
def iob_chain(evidence):
    """Collect the IOB chains of the leaves of the first tree.

    Only ``evidence.segment.lex_trees[0]`` is inspected. For every leaf, the
    tree is descended along the leaf's position (without its final step).
    Each node visited on the way, except the last one on that path,
    contributes ``"B-<label>"`` if the next step goes to its first child and
    ``"I-<label>"`` otherwise. The contributions of one leaf are joined with
    ``"/"``; a leaf whose shortened path has fewer than two steps yields the
    empty string.

    Args:
        evidence: object exposing ``segment.lex_trees``, a sequence of trees
            providing ``leaves()``, ``leaf_treeposition(i)``, ``label()`` and
            child lookup by index.

    Returns:
        set of str: the distinct chains found; empty when there are no trees.
    """
    lex_trees = evidence.segment.lex_trees
    if not lex_trees:
        # No tree available: return an empty bag instead of failing on [0].
        return set()
    tree = lex_trees[0]
    chains = set()
    for leaf_index, _ in enumerate(tree.leaves()):
        # Drop the last step, which indexes the leaf inside its parent.
        path = tree.leaf_treeposition(leaf_index)[:-1]
        chain = []
        node = tree
        # Pair every step with the following one: the following step tells
        # whether the descent continues through the node's first child.
        for step, next_step in zip(path, path[1:]):
            node = node[step]
            modifier = "B" if next_step == 0 else "I"
            chain.append("{}-{}".format(modifier, node.label()))
        chains.add("/".join(chain))
    return chains