UnigramTfFeatureGeneration.py
import random
import codecs
from collections import Counter

import numpy as np
from googletrans import Translator
from nltk.tokenize import word_tokenize

from dbn_outside.dbn.tensorflow import SupervisedDBNClassification

hm_lines = 5000000  # cap on the number of lines to read (currently unused)
translator = Translator()

# Hindi stop-word list, one word per line.
stopwords = codecs.open("hindi_stopwords.txt", "r", encoding='utf-8', errors='ignore').read().split('\n')

# Build a lexicon (unigram vocabulary) from the positive and negative corpora.
def create_lexicon(pos, neg):
    lexicon = []
    for file_name in [pos, neg]:
        with codecs.open(file_name, 'r', encoding='utf-8', errors='ignore') as f:
            contents = f.read()
            # Documents are separated by '$' in the corpus files.
            for line in contents.split('$'):
                data = line.strip('\n')
                if data:
                    all_words = word_tokenize(data)
                    lexicon += list(all_words)
    # Drop Hindi stop words.
    lexicons = []
    for word in lexicon:
        if word not in stopwords:
            lexicons.append(word)
    word_counts = Counter(lexicons)  # maps each word to its frequency
    # Keep only words that occur fewer than 60 times, filtering out very frequent terms.
    l2 = []
    for word in word_counts:
        if word_counts[word] < 60:
            l2.append(word)
    return l2
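
# Example usage (a sketch; assumes 'pos_hindi.txt' and 'neg_hindi.txt' exist and hold
# '$'-separated Hindi sentences, as the rest of this file expects):
#   lexicon = create_lexicon('pos_hindi.txt', 'neg_hindi.txt')
#   print(len(lexicon))    # size of the unigram vocabulary
#   print(lexicon[:10])    # first few lexicon entries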
def sample_handling(sample, lexicon, classification):
    featureset = []
    with codecs.open(sample, 'r', encoding="utf8", errors='ignore') as f:
        contents = f.read()
        for line in contents.split('$'):
            data = line.strip('\n')
            if data:
                all_words = word_tokenize(data)
                all_words_new = []
                for word in all_words:
                    if word not in stopwords:
                        all_words_new.append(word)
                # Binary unigram presence vector over the lexicon.
                features = np.zeros(len(lexicon))
                for word in all_words_new:
                    if word in lexicon:
                        idx = lexicon.index(word)
                        features[idx] = 1
                features = list(features)
                featureset.append([features, classification])
    return featureset
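
# Example usage (a sketch; the classification argument may be a scalar or a one-hot list,
# matching how the two create_feature_set_* helpers below call this function):
#   lexicon = create_lexicon('pos_hindi.txt', 'neg_hindi.txt')
#   pos_samples = sample_handling('pos_hindi.txt', lexicon, 1)
#   print(pos_samples[0][0][:10], pos_samples[0][1])   # feature-vector prefix and its label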
def create_feature_set_and_labels(pos, neg, test_size=0.2):
    lexicon = create_lexicon(pos, neg)
    features = []
    features += sample_handling(pos, lexicon, 1)   # positive samples labelled 1
    features += sample_handling(neg, lexicon, 0)   # negative samples labelled 0
    random.shuffle(features)
    features = np.array(features)

    testing_size = int((1 - test_size) * len(features))
    x_train = list(features[:, 0][:testing_size])  # feature vectors up to testing_size
    y_train = list(features[:, 1][:testing_size])  # labels up to testing_size
    x_test = list(features[:, 0][testing_size:])
    y_test = list(features[:, 1][testing_size:])
    return x_train, y_train, x_test, y_test
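
# Example of training a DBN on this split (a sketch only; the constructor keyword
# arguments follow the 'deep-belief-network' package that SupervisedDBNClassification
# is imported from, and the hyperparameter values here are purely illustrative):
#   x_train, y_train, x_test, y_test = create_feature_set_and_labels('pos_hindi.txt', 'neg_hindi.txt')
#   clf = SupervisedDBNClassification(hidden_layers_structure=[256, 256],
#                                     learning_rate_rbm=0.05,
#                                     learning_rate=0.1,
#                                     n_epochs_rbm=10,
#                                     n_iter_backprop=100,
#                                     batch_size=32,
#                                     activation_function='relu')
#   clf.fit(np.array(x_train, dtype=np.float32), np.array(y_train))
#   clf.save('dbn.pkl')   # check_class() below expects the trained model under this name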
def check_class(text, lexicon):
    # Translate the input text to Hindi so it matches the training corpus.
    line = translator.translate(text, dest='hi').text
    classifier = SupervisedDBNClassification.load('dbn.pkl')
    predict_set = []
    all_words = word_tokenize(line)
    # Unigram term-frequency vector over the lexicon.
    features = np.zeros(len(lexicon))
    for word in all_words:
        if word in lexicon:
            idx = lexicon.index(word)
            features[idx] += 1
    features = list(features)
    predict_set.append(features)
    predict_set = np.array(predict_set, dtype=np.float32)
    predict_set = classifier.predict(predict_set)
    return predict_set
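
# Example usage (a sketch; assumes a trained model has already been saved as 'dbn.pkl'
# and that the same lexicon used at training time is passed in):
#   lexicon = create_lexicon('pos_hindi.txt', 'neg_hindi.txt')
#   prediction = check_class('The movie was engaging and well acted.', lexicon)
#   print(prediction)   # predicted class label(s) from the DBN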
def create_feature_set_and_labels_simple(pos, neg, test_size=0.2):
    lexicon = create_lexicon(pos, neg)
    features = []
    # One-hot labels: [1, 0] = positive, [0, 1] = negative.
    features += sample_handling(pos, lexicon, [1, 0])
    features += sample_handling(neg, lexicon, [0, 1])
    random.shuffle(features)
    features = np.array(features)

    testing_size = int((1 - test_size) * len(features))
    x_train = list(features[:, 0][:testing_size])  # feature vectors up to testing_size
    y_train = list(features[:, 1][:testing_size])  # labels up to testing_size
    x_test = list(features[:, 0][testing_size:])
    y_test = list(features[:, 1][testing_size:])
    return x_train, y_train, x_test, y_test
if __name__ == '__main__':
    create_lexicon('pos_hindi.txt', 'neg_hindi.txt')
    # x_train, y_train, x_test, y_test = create_feature_set_and_labels('pos_hindi.txt', 'neg_hindi.txt')
    # print(x_train[0])
    # with open('sentiment_data.pickle', 'wb') as f:
    #     pickle.dump([x_train, y_train, x_test, y_test], f)
    # lexicon = create_lexicon('pos_hindi.txt', 'neg_hindi.txt')
    # check_class('while the performances are often engaging , this loose collection of largely improvised numbers '
    #             'would probably have worked better as a one-hour tv documentary . interesting , but not compelling . ',
    #             lexicon)