# model.py
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn import metrics
from joblib import dump


# helper functions
def drop_columns():
    """Return a list of the columns that are not needed for the model."""
    # remove the columns that are not needed for the model
    drop_columns_1 = ['lastName', 'penaltyMinutes', 'faceoffWinPct', 'playerId', 'seasonId',
                      'shootsCatches', 'skaterFullName', 'teamAbbrevs']
    # remove the columns related to points - prevents data leakage and overfitting
    drop_columns_2 = ['goals', 'assists', 'evGoals', 'evPoints', 'pointsPerGame', 'ppGoals', 'ppPoints']
    # merge the two lists
    drop_columns_1.extend(drop_columns_2)
    return drop_columns_1

def load_and_prepare_data():
    """Load the data from the CSV file and prepare it for the model."""
    # Read the 5 year stats CSV file into a DataFrame
    stats_5_year_df = pd.read_csv("csv/Top-100-NHL-5-year-Stats.csv")
    # print(stats_5_year_df.head())
    # drop the columns that are not needed for the model
    drop_columns_1 = drop_columns()
    stats_5_year_df_1 = stats_5_year_df.drop(drop_columns_1, axis=1)
    # print(stats_5_year_df.head().to_string())
    # check for missing values
    # print(stats_5_year_df_1.isnull().sum())
    # one-hot encode positionCode
    stats_5_year_df_1 = pd.get_dummies(stats_5_year_df_1, columns=['positionCode'])
    # set features and target
    features = stats_5_year_df_1.drop(['points'], axis=1)
    target = stats_5_year_df_1['points']
    return features, target, drop_columns_1

def create_test_train_split(features, target):
    """Create a train/test split of the data and min-max normalize the numeric columns."""
    # split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)
    # Get a list of all columns that were created by one-hot encoding
    encoded_cols = [col for col in X_train.columns if 'positionCode_' in col]
    # Calculate min and max values from the training set only, to prevent data leakage
    min_values = X_train.min()
    max_values = X_train.max()
    # Normalize both the training and testing data using the training-set min and max values,
    # leaving the one-hot encoded columns untouched
    X_train = X_train.apply(lambda x: x if x.name in encoded_cols
                            else (x - min_values[x.name]) / (max_values[x.name] - min_values[x.name]))
    X_test = X_test.apply(lambda x: x if x.name in encoded_cols
                          else (x - min_values[x.name]) / (max_values[x.name] - min_values[x.name]))
    return X_train, X_test, y_train, y_test, min_values, max_values
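
# Illustrative sketch (not part of the original pipeline): new, unseen data would need
# the same min-max scaling before being passed to a saved model. The helper name and
# signature below are assumptions for illustration only.
def normalize_new_data(new_features, min_values, max_values, encoded_cols):
    """Apply the saved training-set min/max values to a new feature DataFrame,
    leaving the one-hot encoded positionCode_* columns untouched (hypothetical helper)."""
    return new_features.apply(lambda x: x if x.name in encoded_cols
                              else (x - min_values[x.name]) / (max_values[x.name] - min_values[x.name]))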

def test_output_points_model(model, X_test, y_test, name, folder):
    """Test the model and output the metrics to the console and to a txt file."""
    # make predictions using the testing set
    predictions = model.predict(X_test)
    # get the metrics for the model
    MAE = metrics.mean_absolute_error(y_test, predictions)
    MSE = metrics.mean_squared_error(y_test, predictions)
    RMSE = np.sqrt(MSE)
    R2 = metrics.r2_score(y_test, predictions)
    # print the metrics for the model
    print(name)
    print("Mean Absolute Error (MAE):", MAE)
    print("Mean Squared Error (MSE):", MSE)
    print("Root Mean Squared Error (RMSE):", RMSE)
    print("R2 Score:", R2)
    print("\n")
    # the linear models expose coefficients; the tree ensembles expose feature_importances_
    if name in ["Linear_Regression_Model", "Linear_Regression_Model_Tuned"]:
        feature_importances = pd.DataFrame(model.coef_, index=X_test.columns,
                                           columns=['importance']).sort_values('importance', ascending=False)
    else:
        feature_importances = pd.DataFrame(model.feature_importances_, index=X_test.columns,
                                           columns=['importance']).sort_values('importance', ascending=False)
    print("Feature Importances: ")
    print(feature_importances)
    # overwrite and save to a txt file
    with open(f"{folder}/console/{name}_output_results.txt", "w") as f:
        f.write(name + "\n")
        f.write("Mean Absolute Error (MAE): " + str(MAE) + "\n")
        f.write("Mean Squared Error (MSE): " + str(MSE) + "\n")
        f.write("Root Mean Squared Error (RMSE): " + str(RMSE) + "\n")
        f.write("R2 Score: " + str(R2) + "\n")
        f.write("\n")
        f.write("Feature Importances: " + "\n")
        f.write(feature_importances.to_string())
        f.write("\n")
        f.write("\n" + "Output Date and Time" + "\n")
        f.write(str(pd.Timestamp.now()) + "\n")

def output_tuned_model(linear_model_grid, random_forest_model_grid, gradient_boost_model_grid):
    """Output the best parameters for each model. Outputs to the console and to a txt file."""
    # print the best parameters for each model
    print("Linear Regression Model")
    print(linear_model_grid.best_params_)
    print("\n")
    print("Random Forest Model")
    print(random_forest_model_grid.best_params_)
    print("\n")
    print("Gradient Boost Model")
    print(gradient_boost_model_grid.best_params_)
    print("\n")
    # write the best parameters to a single txt file
    with open("output_tuned/console/best_model_parameters.txt", "w") as f:
        f.write("Linear Regression Model" + "\n")
        f.write(str(linear_model_grid.best_params_) + "\n")
        f.write("\n")
        f.write("Random Forest Model" + "\n")
        f.write(str(random_forest_model_grid.best_params_) + "\n")
        f.write("\n")
        f.write("Gradient Boost Model" + "\n")
        f.write(str(gradient_boost_model_grid.best_params_) + "\n")
        f.write("\n")
        f.write("\n" + "Output Date and Time" + "\n")
        f.write(str(pd.Timestamp.now()) + "\n")

def save_model(model, min_values, max_values, name, folder):
    """Save the model and its normalization values to joblib files so the model can be
    reused later without retraining."""
    # save the model to a joblib file
    dump(model, f"{folder}/joblib/{name}_model.joblib")
    # save the min and max values using joblib
    dump(min_values, f"{folder}/joblib/min_values.joblib")
    dump(max_values, f"{folder}/joblib/max_values.joblib")
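
# Illustrative counterpart to save_model (hypothetical, not in the original file):
# a saved model and its normalization values can be reloaded with joblib.load.
def load_saved_model(name, folder):
    """Reload a saved model and its min/max values so it can predict without retraining
    (assumed helper; the file paths mirror those used in save_model)."""
    from joblib import load
    model = load(f"{folder}/joblib/{name}_model.joblib")
    min_values = load(f"{folder}/joblib/min_values.joblib")
    max_values = load(f"{folder}/joblib/max_values.joblib")
    return model, min_values, max_values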

# default functions
def create_default_points_model():
    """
    Create default points models using linear regression, random forest, and gradient boost,
    without tuning the model hyperparameters.
    """
    # load and prepare the data
    features, target, drop_columns_1 = load_and_prepare_data()
    # split the data into training and testing sets
    X_train, X_test, y_train, y_test, min_values, max_values = create_test_train_split(features, target)
    # create the models
    linear_model = LinearRegression()
    random_forest_model = RandomForestRegressor(n_estimators=100, max_depth=10)
    gradient_boost_model = GradientBoostingRegressor(n_estimators=100, max_depth=10)
    # fit each model to the training data
    linear_model.fit(X_train, y_train)
    random_forest_model.fit(X_train, y_train)
    gradient_boost_model.fit(X_train, y_train)
    # test each model - outputs to console and txt files
    test_output_points_model(linear_model, X_test, y_test, "Linear_Regression_Model", "output")
    test_output_points_model(random_forest_model, X_test, y_test, "Random_Forest_Model", "output")
    test_output_points_model(gradient_boost_model, X_test, y_test, "Gradient_Boost_Model", "output")
    # save the models
    save_model(linear_model, min_values, max_values, "linear_regression_default", "output")
    save_model(random_forest_model, min_values, max_values, "random_forest_default", "output")
    save_model(gradient_boost_model, min_values, max_values, "gradient_boost_default", "output")
    return drop_columns_1, linear_model, random_forest_model, gradient_boost_model, min_values, max_values

def create_tuned_points_model():
    """
    Create points models using linear regression, random forest, and gradient boost,
    tuned with the best parameters found by GridSearchCV.
    """
    # load and prepare the data
    features, target, drop_columns_1 = load_and_prepare_data()
    # split the data into training and testing sets
    X_train, X_test, y_train, y_test, min_values, max_values = create_test_train_split(features, target)
    # linear regression hyperparameter options
    linear_model_parameters = {
        'fit_intercept': [True, False],
    }
    # random forest hyperparameter options
    random_forest_model_parameters = {
        'n_estimators': [100, 200, 300],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'max_features': ['sqrt', 'log2', None]
    }
    # gradient boost hyperparameter options
    gradient_boost_model_parameters = {
        'n_estimators': [100, 200, 300],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 4, 5],
        'min_samples_split': [2, 4, 6],
        'min_samples_leaf': [1, 2, 3],
        'subsample': [0.8, 0.9, 1.0],
        'max_features': ['sqrt', 'log2', None]
    }
    # initialize the GridSearchCV objects
    linear_model_grid = GridSearchCV(LinearRegression(), linear_model_parameters,
                                     cv=5, scoring='neg_mean_squared_error', verbose=0, n_jobs=-1)
    random_forest_model_grid = GridSearchCV(RandomForestRegressor(), random_forest_model_parameters,
                                            cv=5, scoring='neg_mean_squared_error', verbose=0, n_jobs=-1)
    gradient_boost_model_grid = GridSearchCV(GradientBoostingRegressor(), gradient_boost_model_parameters,
                                             cv=5, scoring='neg_mean_squared_error', verbose=0, n_jobs=-1)
    print("fitting models...")
    # fit each grid search to the training data
    linear_model_grid.fit(X_train, y_train)
    print("Linear Regression Model Complete!")
    random_forest_model_grid.fit(X_train, y_train)
    print("Random Forest Model Complete!")
    gradient_boost_model_grid.fit(X_train, y_train)
    print("Gradient Boost Model Complete!")
    # output the best parameters for each model
    output_tuned_model(linear_model_grid, random_forest_model_grid, gradient_boost_model_grid)
    # get the best models
    linear_model = linear_model_grid.best_estimator_
    random_forest_model = random_forest_model_grid.best_estimator_
    gradient_boost_model = gradient_boost_model_grid.best_estimator_
    # test each model - outputs to console and txt files
    test_output_points_model(linear_model, X_test, y_test, "Linear_Regression_Model_Tuned", "output_tuned")
    test_output_points_model(random_forest_model, X_test, y_test, "Random_Forest_Model_Tuned", "output_tuned")
    test_output_points_model(gradient_boost_model, X_test, y_test, "Gradient_Boost_Model_Tuned", "output_tuned")
    # save the models
    save_model(linear_model, min_values, max_values, "linear_regression_tuned", "output_tuned")
    save_model(random_forest_model, min_values, max_values, "random_forest_tuned", "output_tuned")
    save_model(gradient_boost_model, min_values, max_values, "gradient_boost_tuned", "output_tuned")
    return drop_columns_1, linear_model, random_forest_model, gradient_boost_model, min_values, max_values

if __name__ == '__main__':
    # uncomment the function you want to run
    # create_default_points_model()
    create_tuned_points_model()
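
# Illustrative usage after training (hypothetical helper names; assumes the tuned run
# above has been executed so the joblib files exist):
#   model, min_values, max_values = load_saved_model("gradient_boost_tuned", "output_tuned")
#   encoded_cols = [col for col in new_features.columns if 'positionCode_' in col]
#   predictions = model.predict(normalize_new_data(new_features, min_values, max_values, encoded_cols))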