-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathCN.py
174 lines (153 loc) · 5.71 KB
/
CN.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
# host = 'cn'
from from_spacy import token_lemmatize
import pickle
import os
#host = 'aws' or 'cn' or 'csv'
# Directory holding the pre-built knowledge pickles.
# The absolute path is the original deployment location; when it does not
# exist (e.g. on another machine) fall back to a 'CN' folder next to this file
# instead of failing on a path that only exists on one host.
cn_dir = '/data/ghazaleh/datasets/knowledge_pickles/'
if not os.path.isdir(cn_dir):
    cn_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'CN')

# NOTE(security): pickle.load executes arbitrary code embedded in the file.
# Only point cn_dir at pickles produced by a trusted source.

# isadict is indexed by term in findParentsByLabel (term -> iterable of parents).
with open(os.path.join(cn_dir, 'isa.pickle'), 'rb') as rf:
    isadict = pickle.load(rf)

# syndict is indexed by term in findSynonyms (term -> iterable of synonyms).
with open(os.path.join(cn_dir, 'synonym.pickle'), 'rb') as rf:
    syndict = pickle.load(rf)
def findserver(hostname):
    """Map a backend name to the location of its ConceptNet data.

    'cn' and 'aws' resolve to HTTP URL prefixes, 'csv' resolves to a local
    file path. Any other name yields None.
    """
    known_backends = (
        ('cn', 'http://api.conceptnet.io/c/en/'),
        ('aws', 'http://172.31.2.195/c/en/'),
        ('csv', '/data/ghazaleh/neuralsymbolic/cn/ConceptNet.csv'),
    )
    for backend, location in known_backends:
        if hostname == backend:
            return location
    return None
def getCNobject(term):
    """Fetch the ConceptNet records that mention *term*.

    The backend is currently fixed to 'csv' and resolved through findserver.

    Returns:
        For an HTTP backend, the decoded JSON response of the server.
        For the CSV backend, a list of ``(col0, col1, col2)`` tuples taken
        from every tab-separated row whose second or third column equals
        *term* (presumably relation, start, end - verify against the file).
    """
    hostname = 'csv'
    prefix = findserver(hostname)

    if prefix.startswith('http'):
        # Imported lazily so the CSV path works without `requests` installed.
        import requests
        suffix = '?offset=0&limit=2000'
        return requests.get(prefix + term + suffix).json()

    keep = []
    # Stream the file line by line; the context manager closes it even if a
    # row raises.
    with open(prefix, 'r', encoding="utf8") as fin:
        for line in fin:
            # Drop the line terminator first, otherwise the last column keeps
            # a trailing '\n' and can never compare equal to term.
            ls = line.rstrip('\n').split('\t')
            if len(ls) < 3:
                # Blank or malformed row: nothing to match against.
                continue
            if ls[1] == term or ls[2] == term:
                keep.append((ls[0], ls[1], ls[2]))
    return keep
# def findParentsByLabel(term):
# import re
# if ' ' in term:
# term2 = re.sub(' ', '_', term)
# else:
# term2 = term
# obj = getCNobject(term2)
# if type(obj) == list:
# return [triple[2] for triple in obj if triple[0] == 'isa' and triple[1] == term]
# if '@id' not in obj.keys() or not obj['@id']:
# return []
# return [edge['end']['label'] for edge in obj['edges'] if edge['rel']['label'] == 'IsA' and edge['start']['label'] == term]
def findParentsByLabel(term):
    """Return the set of direct IsA parents recorded for *term* in isadict.

    A term with no entry yields an empty set rather than a KeyError, so that
    callers walking the hierarchy can pass through terms without parents.
    """
    return set(isadict.get(term, ()))
def findParentsRecursive(phrase, stopterms=None, verbose=False, *, parents_of=None):
    """Collect the transitive parents (ancestors) of *phrase*.

    The hierarchy is walked breadth-first. Each term is expanded at most
    once, so cyclic parent relations terminate instead of looping forever.

    Args:
        phrase: The term whose ancestors are wanted.
        stopterms: Optional iterable of terms. When given and non-empty, the
            walk stops as soon as any of them has been collected, so the
            result may be only a partial ancestor set.
        verbose: When true, print progress: the collected parents after each
            expansion if stopterms is given, otherwise the pending queue
            before each expansion.
        parents_of: Optional callable mapping a term to an iterable of its
            direct parents. Defaults to findParentsByLabel.

    Returns:
        A set with every parent discovered during the walk.
    """
    from collections import deque

    if parents_of is None:
        parents_of = findParentsByLabel

    all_parents = set()
    seen = {phrase}            # every term that has ever been queued
    queue = deque([phrase])    # terms still waiting to be expanded

    while queue:
        if verbose and not stopterms:
            print(list(queue))
        term = queue.popleft()
        parents = set(parents_of(term))

        # Queue only the parents not met before; this is what breaks cycles.
        fresh = [p for p in parents if p not in seen]
        seen.update(fresh)
        queue.extend(fresh)
        all_parents.update(parents)

        if stopterms:
            if verbose:
                print(all_parents)
            # Early exit once one of the requested ancestors has been reached.
            if any(k in all_parents for k in stopterms):
                break

    return all_parents
def isLocative(phrase):
    """Tell whether *phrase* has 'spatial_thing' or 'location' among its ancestors.

    The ancestor walk is delegated to findParentsRecursive, using the two
    labels as stop terms.
    """
    locative_roots = ['spatial_thing', 'location']
    ancestors = findParentsRecursive(phrase, locative_roots)
    return any(root in ancestors for root in locative_roots)
def isTangible(phrase):
    """Tell whether *phrase* descends from 'tangible thing' or 'finite spatial thing'.

    The phrase is passed through token_lemmatize before its ancestors are
    looked up with findParentsRecursive, using the two labels as stop terms.
    """
    tangible_roots = ['tangible thing', 'finite spatial thing']
    lemma = token_lemmatize(phrase)
    ancestors = findParentsRecursive(lemma, tangible_roots)
    return any(root in ancestors for root in tangible_roots)
def findRelatedTerms(term):
    """Return the set of terms linked to *term* by a RelatedTo relation.

    Spaces in *term* are replaced by underscores before the lookup. Both
    result shapes of getCNobject are supported:

    * a list of 3-tuples (CSV backend) - previously this raised
      AttributeError because a list has no ``keys()``;
    * a dict as returned by the ConceptNet web API.
    """
    import re

    term2 = term.replace(' ', '_')
    obj = getCNobject(term2)
    rels = set()

    if isinstance(obj, list):
        # NOTE(review): assumes each tuple is (relation, start, end) and that
        # the relation is spelled like 'RelatedTo' up to case/underscores -
        # confirm against the CSV file.
        for rel, start, end in obj:
            if rel.replace('_', '').lower() != 'relatedto':
                continue
            for other in (start.strip(), end.strip()):
                if other != term2:
                    rels.add(other)
        return rels

    # A response without an '@id' carries no usable edges.
    if '@id' not in obj or not obj['@id']:
        return rels

    for edge in obj['edges']:
        if edge['rel']['label'] != 'RelatedTo':
            continue
        # Everything after each '/c/en/' in the edge id is one endpoint.
        id_list = edge['@id'].split('/c/en/')[1:]
        if len(id_list) > 1:
            # Strip trailing punctuation/underscores left over from the id syntax.
            keep = [re.sub(r'[\W_]+$', '', s) for s in id_list]
            rels.update([x.split('/')[0] for x in keep if x != term2])
    return rels
# def ablative_allative(term):
# ablative_terms = ['out', 'away', 'exit', 'leave']
# allative_terms = ['go', 'come', 'join']
# related = findRelatedTerms(term)
# if [x for x in allative_terms if x in related]:
# return 'to'
# elif [x for x in ablative_terms if x in related]:
# return 'from'
# else:
# return None
# def findSynonyms(term):
# import re # requests
# # prefix = findserver('aws')
# # # prefix = 'http://api.conceptnet.io/c/en/'
# # suffix = '?offset=0&limit=2000'
# if ' ' in term:
# term2 = re.sub(' ', '_', term)
# else:
# term2 = term
# # address = prefix + term2 + suffix
# # obj = requests.get(address).json()
# obj = getCNobject(term2)
# if '@id' not in obj.keys() or not obj['@id']:
# # this prevents crashing if the server is down. you can avoid also crashing by running a local copy (on AWS)
# return set()
# # related = set([edge['end']['label'] for edge in obj['edges'] if edge['rel']['label'] == 'RelatedTo' and edge['start']['label'] == term]).union(set([edge['start']['label'] for edge in obj['edges'] if edge['rel']['label'] == 'RelatedTo' and edge['end']['label'] == term]))
# syns = set()
# for edge in obj['edges']:
# if edge['rel']['label'] == 'Synonym':
# id_list = edge['@id'].split('/c/en/')[1:]
# if len(id_list) > 1:
# keep = [re.sub(r'[\W_]+$', '', s) for s in id_list]
# syns.update([x.split('/')[0] for x in keep if x != term2])
# return syns
def findSynonyms(term):
    """Return the set of synonyms recorded for *term* in syndict.

    A term with no entry yields an empty set rather than a KeyError.
    """
    return set(syndict.get(term, ()))
if __name__ == '__main__':
    # Manual smoke check; the commented calls are alternative probes.
    # print(findSynonyms('electrical energy'))
    # print(findParentsByLabel('plant'))
    underground_is_locative = isLocative('underground')
    print(underground_is_locative)
    # print(findParentsByLabel('underground'))
    # print(findParentsRecursive('underground', ['spatial_thing', 'location']))
    # print(getCNobject('plant'))