-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathperformance_tests2.py
131 lines (104 loc) · 4.48 KB
/
performance_tests2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
from imodels.util.data_util import get_clean_dataset
import numpy as np
from beta import ShrinkageClassifier
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier
import sys
# Catalogue of the classification benchmarks this script knows about, as
# (display name, dataset id, data source) triples. The display name is only
# printed; the dataset id and data source are handed to ``get_clean_dataset``.
ALL_CLF_DATASETS = [
    ("heart", "heart", "imodels"),
    ("breast-cancer", "breast_cancer", "imodels"),
    ("haberman", "haberman", "imodels"),
    ("ionosphere", "ionosphere", "pmlb"),
    ("diabetes-clf", "diabetes", "pmlb"),
    ("german", "german", "pmlb"),
    ("juvenile", "juvenile_clean", "imodels"),
    ("recidivism", "compas_two_year_clean", "imodels"),
]

# Display names of the datasets to benchmark in this run. Edit this tuple to
# switch datasets instead of re-assigning ``clf_datasets`` several times in a
# row (only the last assignment ever took effect).
SELECTED_DATASETS = ("juvenile",)

# Datasets actually used below, in catalogue order.
clf_datasets = [
    entry for entry in ALL_CLF_DATASETS if entry[0] in SELECTED_DATASETS
]

# NOTE: ionosphere --> bad performance for beta

# Scoring metric forwarded as ``scoring=`` to scikit-learn's cross-validation
# utilities; "roc_auc" is the alternative that was tried previously.
sc = "balanced_accuracy"
# Forest sizes (``n_estimators``) to benchmark; results are written per size.
FOREST_SIZES = [1, 2, 5, 10, 50, 100]

# How many times the vanilla / hs / beta comparison is repeated for every
# dataset and forest size. No ``random_state`` is set anywhere in this script,
# so the repetitions are not seeded.
N_REPETITIONS = 20

# Number of folds used both for the grid searches and for the reported scores.
CV_FOLDS = 5

# Hyper-parameter grids searched for the two shrinkage variants.
HS_PARAM_GRID = {
    "lmb": [0.001, 0.01, 0.1, 1, 10, 25, 50, 100, 200],
    "shrink_mode": ["hs"],
}
BETA_PARAM_GRID = {
    "alpha": [5000, 4000, 3000, 2000, 1500, 1000, 800, 500, 100, 50, 30, 10, 1],
    "beta": [5000, 4000, 3000, 2000, 1500, 1000, 800, 500, 100, 50, 30, 10, 1],
    "shrink_mode": ["beta"],
}


def _mean_cv_score(clf, X, y):
    """Return the mean ``sc`` score of *clf* over ``CV_FOLDS``-fold CV on (X, y)."""
    return cross_val_score(
        clf, X, y, cv=CV_FOLDS, n_jobs=-1, scoring=sc
    ).mean()


def _tuned_cv_score(X, y, ntrees, param_grid):
    """Grid-search a shrunk random forest, then cross-validate the best setting.

    A ``ShrinkageClassifier`` wrapping a ``RandomForestClassifier`` with
    *ntrees* trees is tuned over *param_grid* with ``GridSearchCV``. The
    selected parameters are printed, a fresh classifier is built from them and
    its mean cross-validated score is returned.

    NOTE(review): the hyper-parameters are selected on the same X, y that the
    returned score is computed on, so the shrinkage scores may be
    optimistically biased compared with the untuned baseline. Nested
    cross-validation would avoid that; the procedure is kept as it was so that
    results stay comparable with earlier runs.
    """
    grid_search = GridSearchCV(
        ShrinkageClassifier(RandomForestClassifier(n_estimators=ntrees)),
        param_grid,
        cv=CV_FOLDS,
        n_jobs=-1,
        scoring=sc,
    )
    grid_search.fit(X, y)
    best_params = grid_search.best_params_
    print(best_params)
    # best_params holds exactly the grid's keys (shrink_mode plus the tuned
    # values), i.e. the keyword arguments the classifier is rebuilt with.
    tuned_clf = ShrinkageClassifier(
        RandomForestClassifier(n_estimators=ntrees), **best_params
    )
    return _mean_cv_score(tuned_clf, X, y)


def _result_path(ntrees, ds_name):
    """Return the output file name for one (forest size, dataset) result.

    With a single selected dataset the name is just ``str(ntrees)``, as before.
    With several datasets the dataset name is prepended, because otherwise
    every dataset would overwrite the file written for the previous one.
    """
    if len(clf_datasets) == 1:
        return str(ntrees)
    return "{}_{}".format(ds_name, ntrees)


for ntrees in FOREST_SIZES:
    for ds_name, dataset_id, source in clf_datasets:
        X, y, _feature_names = get_clean_dataset(dataset_id, data_source=source)
        print(ds_name)

        # One list of mean CV scores per method, one entry per repetition.
        scores = {"vanilla": [], "hs": [], "beta": []}

        for _ in range(N_REPETITIONS):
            # Baseline: plain random forest without any shrinkage.
            print("Vanilla Mode")
            scores["vanilla"].append(
                _mean_cv_score(RandomForestClassifier(n_estimators=ntrees), X, y)
            )

            # "hs" shrink mode with a tuned lmb.
            print("HS Mode")
            scores["hs"].append(_tuned_cv_score(X, y, ntrees, HS_PARAM_GRID))

            # "beta" shrink mode with tuned alpha / beta.
            print("Beta Shrinkage")
            scores["beta"].append(_tuned_cv_score(X, y, ntrees, BETA_PARAM_GRID))

            # NOTE(review): the original indentation was lost; this running
            # dump is assumed to be per-repetition progress output - confirm.
            print(scores)

        # Rows: vanilla, hs, beta; columns: repetitions.
        RES = np.vstack([scores["vanilla"], scores["hs"], scores["beta"]])
        print(RES)
        np.savetxt(_result_path(ntrees, ds_name), RES, delimiter="\t")