# sentiment_tfidf.py
import sentiment_buildwd
import numpy as np
import tweetprocess
import random
from sklearn import linear_model
# Path of the training CSV that is passed to tfidf_logreg() when this module
# is executed as a script.
TRAIN_FILE = 'data/sentiment/training.1600000.processed.noemoticon.csv'
def tfidf(mat=None, rownames=None):
    """Return the TF-IDF reweighting of a 2-D matrix.

    Every row of ``mat`` is reweighted by ``_tfidf_row_func`` using the
    column sums of the whole matrix and the number of columns
    (``mat.shape[1]``) as the document count.

    Parameters
    ----------
    mat : 2-D array-like
        Matrix to reweight.  It is converted to a float ndarray, so nested
        lists and integer count matrices are accepted.  Despite the ``None``
        default, a matrix is required.
    rownames : object, optional
        Unused; it is an argument only for consistency with other methods
        used here, and is returned unchanged.

    Returns
    -------
    tuple
        ``(w, rownames)`` where ``w`` is the array of reweighted rows.

    Raises
    ------
    ValueError
        If ``mat`` is None.
    """
    if mat is None:
        raise ValueError("tfidf() requires a matrix; got mat=None")
    # Float coercion keeps the per-row division a true division even for
    # integer matrices under Python 2, and gives plain lists a .shape.
    mat = np.asarray(mat, dtype=float)
    colsums = np.sum(mat, axis=0)
    doccount = mat.shape[1]
    w = np.array([_tfidf_row_func(row, colsums, doccount) for row in mat])
    return (w, rownames)
def _tfidf_row_func(row, colsums, doccount):
df = float(len([x for x in row if x > 0]))
idf = 0.0
# This ensures a defined IDF value >= 0.0:
if df > 0.0 and df != doccount:
idf = np.log(doccount / df)
tfs = row/colsums
return tfs * idf
def tfidf_logreg(train_file):
    """Train a logistic regression on averaged TF-IDF rows and score it.

    Steps:

    1. ``sentiment_buildwd.buildWD(train_file)`` supplies a matrix
       (``wd[0]``), column names (``wd[1]``), row names (``wd[2]``) and
       subjects (``wd[3]``).
    2. The matrix is reweighted with ``tfidf``.
    3. ``train_file`` is re-read line by line.  For every line whose second
       comma-separated field is in the column names, the TF-IDF rows of the
       tweet's known words are averaged into one feature row.
    4. Examples are shuffled with a fixed seed (17), split 70/30, and a
       ``LogisticRegression`` is fitted on the first part.

    Parameters
    ----------
    train_file : str
        Path of the training CSV.

    Returns
    -------
    The value of ``LogisticRegression.score`` on the held-out 30%
    (mean accuracy according to the scikit-learn documentation).

    Notes
    -----
    Re-seeds the global ``random`` module generator as a side effect.
    """
    wd = sentiment_buildwd.buildWD(train_file)
    mat, colnames, rownames, subjects = wd[0], wd[1], wd[2], wd[3]
    weights = tfidf(mat, rownames)[0]
    num_features = mat.shape[1]

    # Constant-time lookups instead of scanning the name lists for every
    # line and token.  setdefault keeps the FIRST position of a duplicated
    # row name, which is what list.index() returns.
    known_columns = set(colnames)
    row_position = {}
    for position, name in enumerate(rownames):
        row_position.setdefault(name, position)

    trainMat = np.zeros((len(colnames), num_features))
    next_row = 0
    # The context manager closes the file even if parsing raises.
    with open(train_file) as f:
        for line in f:
            words = line.strip('\"').split(',')
            if words[1] not in known_columns:
                continue
            trainRow = np.zeros(num_features)
            numWords = 0
            tweet = sentiment_buildwd.buildTweet(words[5:])
            for word in tweetprocess.tokenize(tweet):
                pword = sentiment_buildwd.processWord(word)
                position = row_position.get(pword)
                if position is not None:
                    numWords += 1
                    trainRow += weights[position]
            # A tweet with no known words keeps an all-zero row; dividing
            # by zero here would fill the row with NaN.
            if numWords:
                trainRow = (trainRow * 1.0) / numWords
            trainMat[next_row, :] = trainRow
            next_row += 1

    trainVals = sentiment_buildwd.trainValsFromSubjects(subjects)

    # RANDOMIZE (fixed seed so the split is reproducible).  list(...) is
    # required because a Python 3 range object cannot be shuffled in place.
    random.seed(17)
    order = list(range(len(subjects)))
    random.shuffle(order)
    train = [trainMat[i] for i in order]
    labels = [trainVals[i] for i in order]
    cutoff = int(len(order) * 0.7)

    logreg = linear_model.LogisticRegression()
    logreg.fit(train[0:cutoff], labels[0:cutoff])
    return logreg.score(train[cutoff:], labels[cutoff:])
if __name__ == "__main__":
    # Script entry point: train and evaluate on the default training file.
    score_logreg = tfidf_logreg(TRAIN_FILE)
    # A single parenthesised argument is valid on both Python 2 and 3; the
    # two spaces reproduce the output of the old
    # "print 'LogReg: ', score_logreg" statement exactly.
    print('LogReg:  %s' % score_logreg)