# NaiveBayesClassifier.py
class NaiveBayesClassifier:
    """Naive Bayes Classifier class"""

    def __init__(self):
        self.keys = []              # keywords read from the training file header
        self.vectors = []           # 0/1 training vectors (last entry is the class label)
        self.key_appearances = []   # per keyword: [[neg & absent, neg & present], [pos & absent, pos & present]]
        self.positives = 0          # number of positive training vectors
    def train(self, trainingVectorsPath):
        """Method that trains the algorithm from the training vector file provided.

        trainingVectorsPath -- (str) path to the training vector file.
        """
        # Flush data structures for (re)training:
        self.keys = []
        self.vectors = []
        self.key_appearances = []
        # Open the training vector file:
        with open(trainingVectorsPath, "r", encoding='utf-8') as trainingfile:
            lines = trainingfile.readlines()
        # Read the keyword line and extract the keywords
        # (the trailing field of the header line is discarded):
        self.keys.extend(lines[0].split(","))
        self.keys.pop(-1)
        lines.pop(0)
        # Read the rest of the training vectors and parse them
        # into lists of integers:
        for line in lines:
            vector = line.strip("\n").split(",")
            vector = [int(item) for item in vector]
            self.vectors.append(vector)
        # Prepare the appearance counters for each keyword:
        self.key_appearances = [[[0, 0], [0, 0]] for _ in self.keys]
        # Count the positive training vectors (class label == 1):
        self.positives = sum([1 for vec in self.vectors if vec[-1] == 1])
        # Fill the appearance counters:
        # key_appearances[i][c][x] == number of training vectors of class c
        # (0 = negative, 1 = positive) in which keyword i is absent (x == 0)
        # or present (x == 1).
        # ex. key_appearances[index('BAD')][0][0]
        #     == number of NEGATIVE training vectors WITHOUT the keyword 'BAD'
        # ex. key_appearances[index('BAD')][0][1]
        #     == number of NEGATIVE training vectors WITH the keyword 'BAD'
        # ex. key_appearances[index('BAD')][1][0]
        #     == number of POSITIVE training vectors WITHOUT the keyword 'BAD'
        # ex. key_appearances[index('BAD')][1][1]
        #     == number of POSITIVE training vectors WITH the keyword 'BAD'
        for index in range(len(self.keys)):
            for vector in self.vectors:
                # If the training vector is negative...
                if vector[-1] == 0:
                    # ...and does not contain the keyword:
                    if vector[index] == 0:
                        self.key_appearances[index][0][0] += 1
                    else:
                        self.key_appearances[index][0][1] += 1
                else:
                    # If the (positive) vector does not contain the keyword:
                    if vector[index] == 0:
                        self.key_appearances[index][1][0] += 1
                    else:
                        self.key_appearances[index][1][1] += 1
    def classify(self, revpath):
        """Main classification method that classifies a review as positive (True) or negative (False).

        revpath -- (str) path to the review file
        """
        # Initialize the review vector with 0s (one slot per keyword, plus one
        # extra slot so it has the same length as a training vector).
        rev_vector = [0 for _ in range(len(self.keys) + 1)]
        # Open the review file:
        with open(revpath, "r", encoding='utf-8') as revfile:
            rev_text = revfile.read()
        # Vector-ify the review text:
        # ex. rev_vector[indexOf('BAD')] = 1 if the review contains 'BAD'.
        words = rev_text.split()
        for word in words:
            cleanWord = word.strip(".,!").upper()
            if cleanWord in self.keys:
                rev_vector[self.keys.index(cleanWord)] = 1
        # Decision rule: classify as positive (return True) if
        #   P(C=1 | X) > P(C=0 | X).
        # Both sides are computed up to the common factor 1/P(X), i.e. as
        #   P(C=c) * product over all keywords i of P(X=xi | C=c),
        # where xi is the value of keyword i in the to-be-classified vector.
        #
        # P(C=1 | X):
        # Begin with the prior: the fraction of positive vectors among the
        # training vectors.
        pc1x = self.positives / len(self.vectors)
        # For every keyword, multiply by the likelihood P(X=xi | C=1):
        #   P(X=xi | C=1) = #positive vectors with Xi = rev_vector[i] / #positive vectors,
        # with add-one (Laplace) smoothing in case the appearance count is 0.
        for i in range(len(self.keys)):
            pc1x *= (1 + self.key_appearances[i][1][rev_vector[i]]) / (self.positives + 2)
        # The same for the negative outcome, P(C=0 | X):
        pc0x = (len(self.vectors) - self.positives) / len(self.vectors)
        for i in range(len(self.keys)):
            pc0x *= (1 + self.key_appearances[i][0][rev_vector[i]]) / ((len(self.vectors) - self.positives) + 2)
        # If the probability of the to-be-classified review being positive is
        # greater than that of being negative, classify as positive:
        if pc1x > pc0x:
            return True
        else:
            # Otherwise as negative:
            return False
    def __str__(self):
        """String representation, used as an identifier."""
        return "nbc"
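
# A minimal usage sketch, assuming hypothetical file paths; the training file
# follows the layout sketched in train() and the review file is plain text.
if __name__ == "__main__":
    nbc = NaiveBayesClassifier()
    nbc.train("training_vectors.csv")           # hypothetical path
    is_positive = nbc.classify("review.txt")    # hypothetical path
    print("positive" if is_positive else "negative")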