-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathanalytics.py
103 lines (92 loc) · 3.92 KB
/
analytics.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
import pandas as pd
import numpy as np
from scipy import stats
from sklearn import model_selection, tree
from imblearn.over_sampling import SMOTE
from collections import Counter
import warnings
### Metrics analysed in the paper ###
# 'combined': the full set of 22 metrics, i.e. every generic and every
# provenance-specific network metric listed in this module.
feature_names_combined = (
    # PROV types (node counts per type)
    'entities', 'agents', 'activities',
    # standard network metrics
    'nodes', 'edges', 'diameter', 'assortativity',
    # average clustering coefficients (overall and per PROV type)
    'acc', 'acc_e', 'acc_a', 'acc_ag',
    # MFDs between each ordered pair of PROV types
    'mfd_e_e', 'mfd_e_a', 'mfd_e_ag',
    'mfd_a_e', 'mfd_a_a', 'mfd_a_ag',
    'mfd_ag_e', 'mfd_ag_a', 'mfd_ag_ag',
    # MFD of derivations
    'mfd_der',
    # power-law exponent
    'powerlaw_alpha',
)
# 'generic': the 6 network metrics that ignore provenance (PROV type)
# information altogether.
feature_names_generic = (
    # standard network metrics
    'nodes', 'edges', 'diameter', 'assortativity',
    # overall average clustering coefficient
    'acc',
    # power-law exponent
    'powerlaw_alpha',
)
# 'provenance': the 16 network metrics that depend on PROV type information.
feature_names_provenance = (
    # PROV types (node counts per type)
    'entities', 'agents', 'activities',
    # average clustering coefficients per PROV type
    'acc_e', 'acc_a', 'acc_ag',
    # MFDs between each ordered pair of PROV types
    'mfd_e_e', 'mfd_e_a', 'mfd_e_ag',
    'mfd_a_e', 'mfd_a_a', 'mfd_a_ag',
    'mfd_ag_e', 'mfd_ag_a', 'mfd_ag_ag',
    # MFD of derivations
    'mfd_der',
)
# The utility of the three sets of metrics above is assessed in our
# experiments to understand whether provenance type information helps
# improve data classification performance.
# Each entry pairs a short display name with the tuple of column names.
feature_name_lists = (
    ('combined', feature_names_combined),
    ('generic', feature_names_generic),
    ('provenance', feature_names_provenance),
)
def balance_smote(df):
    """Return a new DataFrame in which all classes are balanced via SMOTE.

    The ``label`` column of ``df`` holds the class of each row; every other
    column is treated as a feature. Minority classes are over-sampled with
    SMOTE until every class has as many rows as the largest one.

    Args:
        df: DataFrame with the feature columns and a ``label`` column.

    Returns:
        A DataFrame with the same feature columns as ``df`` plus ``label``,
        containing the resampled rows.

    Side effects:
        Prints the data shapes before and after balancing.
    """
    X = df.drop('label', axis=1)
    Y = df.label
    print('Original data shapes:', X.shape, Y.shape)

    smoX, smoY = X, Y
    c = Counter(smoY)

    # SMOTE's documented defaults are the regular algorithm with the 'auto'
    # strategy, so the no-argument constructor matches the former explicit
    # ``ratio="auto", kind='regular'`` while also working on imbalanced-learn
    # releases that no longer accept those keywords.
    smote = SMOTE()
    # ``fit_resample`` superseded ``fit_sample``; use whichever is available.
    resample = getattr(smote, 'fit_resample', None) or smote.fit_sample

    # Keep resampling while any class is still smaller than the largest one.
    while min(c.values()) < max(c.values()):
        with warnings.catch_warnings():
            # Warnings raised while resampling are deliberately silenced.
            warnings.simplefilter("ignore")
            smoX, smoY = resample(smoX, smoY)
        c = Counter(smoY)

    print('Balanced data shapes:', smoX.shape, smoY.shape)
    df_balanced = pd.DataFrame(smoX, columns=X.columns)
    df_balanced['label'] = smoY
    return df_balanced
def t_confidence_interval(an_array, alpha=0.95):
    """Return the Student-t confidence interval for the mean of a sample.

    The interval is computed around zero (no location is passed), so the
    result is ``(-h, +h)`` where ``h`` is the half-width to report as
    ``mean ± h``.

    Args:
        an_array: Sequence of observations.
        alpha: Confidence level of the interval (0.95 means 95%).

    Returns:
        A ``(lower, upper)`` tuple of floats, symmetric around zero.
    """
    # NOTE(review): np.std defaults to ddof=0 (population standard deviation);
    # kept as is so previously reported figures do not change.
    s = np.std(an_array)
    n = len(an_array)
    # The confidence level is passed positionally: SciPy renamed this keyword
    # from ``alpha`` to ``confidence``, and the positional form is accepted by
    # both the old and the new signature.
    return stats.t.interval(alpha, df=(n - 1), scale=(s / np.sqrt(n)))
def cv_test(X, Y, n_iterations=1000, test_id=""):
    """Score a decision tree with repeated, shuffled stratified 10-fold CV.

    Rounds of 10-fold cross-validation are run until at least
    ``n_iterations`` fold scores have been collected. Each round contributes
    one score per fold, so the number of results is ``n_iterations`` rounded
    up to a whole number of rounds.

    Args:
        X: DataFrame of feature columns (indexed positionally via ``iloc``).
        Y: Series of class labels aligned with ``X``.
        n_iterations: Minimum number of fold scores to collect.
        test_id: Tag appended to the printed summary line.

    Returns:
        A ``(accuracies, importances)`` pair of lists with one entry per
        fold: the test accuracy and the fitted tree's feature importances.

    Side effects:
        Prints the mean accuracy and its confidence half-width.
    """
    fold_scores = []
    fold_importances = []

    while len(fold_scores) < n_iterations:
        # A fresh splitter per round so each round reshuffles the data.
        splitter = model_selection.StratifiedKFold(n_splits=10, shuffle=True)
        for train_idx, test_idx in splitter.split(X, Y):
            X_train, Y_train = X.iloc[train_idx], Y.iloc[train_idx]
            X_test, Y_test = X.iloc[test_idx], Y.iloc[test_idx]

            classifier = tree.DecisionTreeClassifier()
            classifier.fit(X_train, Y_train)

            fold_scores.append(classifier.score(X_test, Y_test))
            fold_importances.append(classifier.feature_importances_)

    mean_pct = np.mean(fold_scores) * 100
    # Upper bound of the zero-centred interval, i.e. the ± half-width.
    half_width_pct = t_confidence_interval(fold_scores)[1] * 100
    print("Accuracy: %.2f%% ±%.4f <-- %s" % (mean_pct, half_width_pct, test_id))

    return fold_scores, fold_importances
def test_classification(df, n_iterations=1000, test_id=''):
    """Cross-validate a classifier on each metric set and collect the results.

    For every ``(name, columns)`` pair in ``feature_name_lists`` the matching
    columns of ``df`` are evaluated with ``cv_test`` against ``df.label``.

    Args:
        df: DataFrame holding all metric columns and a ``label`` column.
        n_iterations: Passed through to ``cv_test``.
        test_id: Optional prefix for the printed summary; when given, each
            run is tagged ``"<test_id>-<metric set name>"``.

    Returns:
        A ``(results, imps)`` pair of DataFrames. ``results`` has one row per
        fold score with columns ``Metrics`` (metric set name) and
        ``Accuracy``. ``imps`` holds the per-fold feature importances of the
        'combined' metric set, one column per feature; it is empty if no
        metric set is named 'combined'.
    """
    result_frames = []
    imps = pd.DataFrame()
    Y = df.label

    for feature_list_name, feature_names in feature_name_lists:
        X = df[list(feature_names)]
        run_id = '-'.join((test_id, feature_list_name)) if test_id else feature_list_name
        accuracies, importances = cv_test(X, Y, n_iterations, run_id)

        result_frames.append(
            pd.DataFrame(
                {
                    'Metrics': feature_list_name,
                    'Accuracy': accuracies
                }
            )
        )
        if feature_list_name == "combined":
            # Only the relevance of all features together is of interest.
            imps = pd.DataFrame(importances, columns=list(feature_names))

    # One concat at the end replaces repeated DataFrame.append calls, which
    # are unavailable in pandas 2.x and re-copy the whole frame on each pass.
    if result_frames:
        results = pd.concat(result_frames, ignore_index=True)
    else:
        results = pd.DataFrame()
    return results, imps