category_predictor.py
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding, Activation, Dropout
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split

from reduction import num_metacategories


def generate_sequences(train_texts, test_texts):
    # Fit the tokenizer on the training split only, then apply the same
    # vocabulary to both splits so the word indices stay consistent.
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(train_texts)
    vocab_size = len(tokenizer.word_index) + 1  # +1 for the reserved padding index 0
    print("vocab_size:", vocab_size)
    return (tokenizer.texts_to_sequences(train_texts),
            tokenizer.texts_to_sequences(test_texts),
            vocab_size)


def preprocess_data(texts, labels):
    X_train, X_test, y_train, y_test = train_test_split(
        texts, labels, test_size=0.25, random_state=1000)
    X_train, X_test, vocab_size = generate_sequences(X_train, X_test)
    X_train = pad_sequences(X_train, padding='post', maxlen=max_sequence_length)
    X_test = pad_sequences(X_test, padding='post', maxlen=max_sequence_length)
    return X_train, X_test, y_train, y_test, vocab_size


def create_model(vocab_size):
    # Embedding(input_dim=vocab_size, output_dim=vector_len) maps each word
    # index to a vector_len-dimensional dense vector.
    model = Sequential()
    model.add(Embedding(vocab_size, vector_len, input_length=max_sequence_length))
    model.add(LSTM(hidden_size, return_sequences=True))
    model.add(Activation('relu'))
    model.add(Dropout(0.25))
    model.add(LSTM(hidden_size))
    model.add(Activation('relu'))
    model.add(Dropout(0.25))
    model.add(Dense(100, activation='relu'))
    model.add(Dropout(0.50))
    model.add(Dense(num_categories, activation='softmax'))
    return model
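

# plot_history is called in __main__ below but not defined anywhere in this
# file; the helper here is a minimal sketch of what it could look like (an
# assumption, not the author's implementation), built only on the History
# object that model.fit() returns and on matplotlib.pyplot.
def plot_history(history):
    # history.history maps each metric name to a list with one value per epoch.
    acc = history.history['categorical_accuracy']
    val_acc = history.history['val_categorical_accuracy']
    loss = history.history['loss']
    val_loss = history.history['val_loss']
    epochs_range = range(1, len(acc) + 1)
    plt.figure(figsize=(12, 5))
    plt.subplot(1, 2, 1)
    plt.plot(epochs_range, acc, label='train')
    plt.plot(epochs_range, val_acc, label='validation')
    plt.title('Categorical accuracy')
    plt.xlabel('epoch')
    plt.legend()
    plt.subplot(1, 2, 2)
    plt.plot(epochs_range, loss, label='train')
    plt.plot(epochs_range, val_loss, label='validation')
    plt.title('Loss')
    plt.xlabel('epoch')
    plt.legend()
    plt.show()
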
if __name__ == "__main__":
    df = pd.read_pickle("combined_data.pkl")
    print(df.text)

    # Unpack column by column into a num_reviews-by-num_metacategories matrix again
    target_vecs = np.vstack([
        df["cat_{}".format(i)] for i in range(num_metacategories)
    ]).T
    print(target_vecs)

    epochs = 10
    batch_size = 16
    hidden_size = 100          # number of hidden units in each LSTM layer
    vector_len = 50            # dimensionality of each word embedding
    max_sequence_length = 500  # sequences are padded/truncated to this length
    num_categories = num_metacategories  # output width must match target_vecs

    x_train, x_test, y_train, y_test, vocab_size = preprocess_data(df.text, target_vecs)
    model = create_model(vocab_size)
    model.compile(loss='categorical_crossentropy', optimizer='adam',
                  metrics=['categorical_accuracy'])
    history = model.fit(x_train, y_train,
                        epochs=epochs,
                        verbose=False,
                        validation_data=(x_test, y_test),
                        batch_size=batch_size)

    loss, accuracy = model.evaluate(x_train, y_train, verbose=False)
    print("Training Accuracy: {:.4f}".format(accuracy))
    loss, accuracy = model.evaluate(x_test, y_test, verbose=False)
    print("Testing Accuracy: {:.4f}".format(accuracy))

    plot_history(history)
    print(model.summary())
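
    # A hypothetical spot check, not part of the original script: the softmax
    # head emits one probability per metacategory, so argmax over the last
    # axis gives the predicted category index for each padded test sequence.
    sample_probs = model.predict(x_test[:5])
    print("predicted categories:", np.argmax(sample_probs, axis=1))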