script.py
from GraphDatabase import GraphDatabase
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.tokenize import sent_tokenize
from sklearn.datasets import fetch_20newsgroups
import pandas as pd
import numpy as np
from numpy import transpose, identity
from preprocessing import lemmatizeAll, tokenizeSentence
import os.path
from helper import generateVocabulary
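# Builds a document/word graph from a small 20-newsgroups subset and exports
# matrices for later experiments. GraphDatabase, preprocessing and helper are
# local project modules; GraphDatabase appears to wrap a graph store with a
# py2neo-style graph.find and a Cypher call (presumably Neo4j). The script is
# written for Python 2 (print statements).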
def script():
    database = GraphDatabase()
    name = 'NG_guns_motorcycles_10'
    filename = 'processedDocuments/' + name + '.pkl'
    minFrequency = 2
    if not os.path.exists(filename):
        print 'Load Documents'
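        # Load two newsgroup categories without headers/footers/quotes and
        # keep only the first 10 posts (matching the '_10' suffix in name).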
        #data = fetch_20newsgroups(categories=['rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey'], remove=('headers', 'footers', 'quotes'))
        data = fetch_20newsgroups(categories=['talk.politics.guns', 'rec.motorcycles'], remove=('headers', 'footers', 'quotes'))
        categories = data.target_names
        data = pd.DataFrame({'text': data['data'], 'category': data['target']})
        data = data[0:10]
        for index, category in enumerate(categories):
            print 'Category: ' + category + ' N: ' + str(len(data[data.category == index]))
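        # Build an initial vocabulary over the raw posts: alphabetic tokens
        # only, English stop words removed, and only words occurring in at
        # least minFrequency documents are kept.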
        print 'Preprocessing'
        docs = data.text.tolist()
        vectorizer = CountVectorizer(min_df=minFrequency, stop_words='english', token_pattern='[a-zA-Z]+')
        wordCounts = vectorizer.fit_transform(docs)
        vocabulary = vectorizer.get_feature_names()
        print('Number of Unique words: %d' % len(vocabulary))
        print('Minimal Frequency: %d' % minFrequency)
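        # Split each post into sentences, tokenize and lemmatize them, and
        # keep only words that are in the vocabulary and longer than one
        # character; sentences left empty by the filtering are dropped.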
        docsSplitInSentences = [sent_tokenize(doc) for doc in docs]
        tokenizedCollection = [[tokenizeSentence(sentence) for sentence in sentences] for sentences in docsSplitInSentences]
        cleanedTokens = [[[lemmatizeAll(word.lower()) for word in sentence if word.lower() in vocabulary and len(word) > 1] for sentence in doc] for doc in tokenizedCollection]
        cleanedTokens = [filter(None, doc) for doc in cleanedTokens]
        data['sentences'] = cleanedTokens
        vocabulary = generateVocabulary(data.sentences.tolist())
        fullCleanText = [' '.join(sum(post, [])) for post in data.sentences.tolist()]
        data['cleanText'] = fullCleanText
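        # Recompute tf-idf and raw term-frequency vectors on the cleaned
        # text, restricted to the lemmatized vocabulary, and store them as
        # plain lists in the DataFrame.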
        tfIdf = TfidfVectorizer(vocabulary=vocabulary)
        docs = data.cleanText.tolist()
        tfidf_vec = tfIdf.fit_transform(docs)
        data['tfIdf'] = [list(elem) for elem in tfidf_vec.toarray()]
        tf = CountVectorizer(vocabulary=vocabulary)
        tf_vec = tf.fit_transform(docs)
        data['tf'] = [list(elem) for elem in tf_vec.toarray()]
        # Remove posts with no features
        for index in range(len(data)):
            tfIdfSum = np.sum(data.loc[index, 'tfIdf'])
            if tfIdfSum == 0:
                print index
                data.drop(index, inplace=True)
        data.index = range(len(data))
        data.to_pickle(filename)
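    # From here on the cached pickle is used; rebuild the vocabulary
    # (apparently a word-to-index mapping, given vocabulary[word] below).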
    data = pd.read_pickle(filename)
    vocabulary = generateVocabulary(data.sentences.tolist())
    #toydata = [[0, [['This','is','it','.'],['it','.']]], [1,[['it','is','here','is','.']]]]
    #data = pd.DataFrame(toydata, columns=['category', 'sentences'])
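    # Graph construction: one Document node per post, one Feature node per
    # vocabulary word, plus artificial $Start$ and $End$ markers. 'is_in'
    # edges link words (and the markers) to the documents they occur in,
    # 'followed_by' edges record word order within each sentence; repeated
    # occurrences presumably increase the relation weight
    # (createWeightedRelation).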
    print 'Graph Construction'
    startNode = database.createFeatureNode(-1, '$Start$')
    endNode = database.createFeatureNode(len(vocabulary), '$End$')
    for index, text in enumerate(data.sentences):
        print 'Document' + str(index)
        label = data.category.loc[index]
        docNode = database.createDocumentNode(index, label)
        for sentence in text:
            preceedingWord = startNode
            database.createWeightedRelation(startNode, docNode, 'is_in')
            for ind, word in enumerate(sentence):
                exists = len(list(database.graph.find('Feature', property_key='word', property_value=word))) > 0
                if not exists:
                    wordID = vocabulary[word]
                    wordNode = database.createFeatureNode(wordID, word)
                else:
                    wordNode = database.getFeatureNode(word)
                database.createWeightedRelation(wordNode, docNode, 'is_in')
                database.createWeightedRelation(preceedingWord, wordNode, 'followed_by')
                preceedingWord = wordNode
                if ind == len(sentence) - 1:
                    database.createWeightedRelation(wordNode, endNode, 'followed_by')
            database.createWeightedRelation(endNode, docNode, 'is_in')
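    # Normalize the accumulated edge weights (presumably per node and per
    # relation type, turning raw counts into relative weights).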
    print 'Normalize relationships'
    docNodes = database.getNodes('Document')
    database.normalizeRelationships(docNodes, 'is_in')
    featureNodes = database.getNodes('Feature')
    database.normalizeRelationships(featureNodes, 'followed_by')
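    # Compute a context similarity between features (via a Cypher query,
    # judging from the method name), read it back as a feature-by-feature
    # matrix and save it.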
    print 'Set Context Similarity'
    database.cypherContextSim()
    contextSim = database.getMatrix(featureNodes, relation='related_to', propertyType='contextSim')
    np.save('matrices/' + name + '_contextSim', contextSim)
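    # Assemble a combined block matrix over documents and features. With D
    # documents and F features (and featureDocMatrix presumably F x D, given
    # the transpose below), the layout is:
    #
    #     [ I (D x D)                 featureDocMatrix^T (D x F) ]
    #     [ featureDocMatrix (F x D)  featureMatrix (F x F)      ]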
    print 'Create Matrix'
    docMatrix = identity(len(docNodes))
    featureMatrix = database.getMatrix(featureNodes)
    featureDocMatrix = database.getMatrix(featureNodes, docNodes, 'is_in')
    docAll = np.concatenate((docMatrix, np.transpose(featureDocMatrix)), axis=1)
    featureAll = np.concatenate((featureDocMatrix, featureMatrix), axis=1)
    combinedMatrix = np.concatenate((docAll, featureAll))
    print combinedMatrix.shape
    np.save('matrices/' + name, combinedMatrix)
if __name__ == '__main__':
    script()