-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtfv_logres.py
46 lines (30 loc) · 1.41 KB
/
tfv_logres.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
import pandas as pd
from nltk import ngrams
from nltk.tokenize import word_tokenize
from sklearn import linear_model, metrics, model_selection,naive_bayes
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer,TfidfVectorizer
if __name__ == "__main__":
    # Script entry point: stratified k-fold cross-validation of a TF-IDF
    # (1- to 3-gram, NLTK-tokenized) + multinomial naive Bayes classifier on
    # the review/sentiment columns of ../input/imdb.csv. Prints the accuracy
    # obtained on each held-out fold.

    # Single source of truth for the fold count, shared by the splitter and
    # the evaluation loop so the two cannot drift apart.
    n_folds = 5

    df = pd.read_csv("../input/imdb.csv")

    # Encode the target as integers: "positive" -> 1, anything else -> 0.
    df["sentiment"] = (df["sentiment"] == "positive").astype(int)

    # -1 marks "not yet assigned to a fold".
    df["kfold"] = -1

    # Shuffle the rows and renumber the index 0..n-1. The shuffle is not
    # seeded, so fold membership (and the printed accuracies) vary per run.
    df = df.sample(frac=1).reset_index(drop=True)

    y = df["sentiment"].values
    kf = model_selection.StratifiedKFold(n_splits=n_folds)
    # split() yields positional indices; because the index was just reset,
    # positions and labels coincide, so .loc addresses the intended rows.
    for fold_id, (_, valid_idx) in enumerate(kf.split(X=df, y=y)):
        df.loc[valid_idx, "kfold"] = fold_id

    for fold in range(n_folds):
        # Hold out the current fold for testing; train on all the others.
        train_df = df[df["kfold"] != fold].reset_index(drop=True)
        test_df = df[df["kfold"] == fold].reset_index(drop=True)

        # token_pattern=None because a custom tokenizer is supplied.
        tfidf_vec = TfidfVectorizer(
            tokenizer=word_tokenize,
            token_pattern=None,
            ngram_range=(1, 3),
        )
        # fit_transform learns the vocabulary/IDF weights and vectorizes the
        # training reviews in one pass, instead of tokenizing them twice
        # (once in fit, once in transform). The vectorizer is fitted on the
        # training split only and then applied to the test split.
        xtrain = tfidf_vec.fit_transform(train_df["review"])
        xtest = tfidf_vec.transform(test_df["review"])

        # Multinomial naive Bayes is the model under evaluation here; an
        # earlier revision used linear_model.LogisticRegression() instead.
        model = naive_bayes.MultinomialNB()
        model.fit(xtrain, train_df["sentiment"])

        preds = model.predict(xtest)
        accuracy = metrics.accuracy_score(test_df["sentiment"], preds)

        print(f"Fold: {fold}")
        print(f"Accuracy: {accuracy}")
        print("")