-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmeanShiftTitanic.py
102 lines (78 loc) · 3.05 KB
/
meanShiftTitanic.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
import numpy as np
from sklearn.cluster import MeanShift, KMeans
from sklearn import preprocessing, cross_validation
import pandas as pd
import matplotlib.pyplot as plt
'''
Pclass Passenger Class (1 = 1st; 2 = 2nd; 3 = 3rd)
survived Survival (0 = No; 1 = Yes)
name Name
sex Sex
age Age
sibsp Number of Siblings/Spouses Aboard
parch Number of Parents/Children Aboard
ticket Ticket Number
fare Passenger Fare (British pound)
cabin Cabin
embarked Port of Embarkation (C = Cherbourg; Q = Queenstown; S = Southampton)
boat Lifeboat
body Body Identification Number
home.dest Home/Destination
'''
# Data source:
# https://pythonprogramming.net/static/downloads/machine-learning-data/titanic.xls
df = pd.read_excel('titanic.xls')
# Keep an unmodified copy: `df` is converted to numbers below, while
# `original_df` keeps the original values for inspecting the clusters.
original_df = df.copy()
# `axis` is passed by keyword; pandas >= 2.0 rejects it as a positional
# argument to DataFrame.drop.
df.drop(['body', 'name'], axis=1, inplace=True)
# Replace every missing value with 0 so the later float conversion succeeds.
df.fillna(0, inplace=True)
def handle_non_numerical_data(df):
    """Replace the values of every non-numeric column with integer ids.

    Each distinct value in a non-numeric column is given an integer id,
    starting at 0, in order of first appearance, so the encoding is the
    same on every run. Columns that already have a numeric dtype are
    left untouched.

    The frame is modified in place and also returned.
    """
    for column in list(df.columns):
        # Any numeric dtype (not only int64/float64) already holds usable
        # numbers; re-encoding it would destroy the real values.
        if pd.api.types.is_numeric_dtype(df[column].dtype):
            continue

        column_contents = df[column].tolist()

        # Map each distinct value to a new id, numbered by first appearance.
        text_digit_vals = {}
        for value in column_contents:
            if value not in text_digit_vals:
                text_digit_vals[value] = len(text_digit_vals)

        # Replace every original value with its id.
        df[column] = [text_digit_vals[value] for value in column_contents]

    return df
# Turn every non-numeric column into integer ids.
df = handle_non_numerical_data(df)
# `axis` is passed by keyword; pandas >= 2.0 rejects it as a positional
# argument to DataFrame.drop.
df.drop(['ticket', 'home.dest'], axis=1, inplace=True)

# Feature matrix: every remaining column except the survival flag,
# cast to float and then passed through preprocessing.scale.
X = np.array(df.drop(['survived'], axis=1).astype(float))
X = preprocessing.scale(X)
# The survival flag is kept separately; it is not passed to the clusterer.
y = np.array(df['survived'])

# Fit MeanShift with its default settings, then read the per-row labels
# and the cluster centers off the fitted estimator.
clf = MeanShift()
clf.fit(X)
labels = clf.labels_
cluster_centers = clf.cluster_centers_
# Add each row's cluster label to the original dataframe as a new column.
# The labels are in the same row order as X, and so as original_df, so the
# whole column is assigned at once. This avoids the chained
# `original_df['cluster_group'].iloc[i] = ...` form, which writes to a
# temporary under pandas Copy-on-Write and leaves the column unchanged.
# The column is stored as float.
original_df['cluster_group'] = np.asarray(labels, dtype=float)
# Compute the survival rate of every cluster that was found.
n_clusters_ = len(np.unique(labels))
survival_rates = {}
for cluster_id in range(n_clusters_):
    # Rows assigned to this cluster.
    members = original_df[original_df['cluster_group'] == float(cluster_id)]
    # Rows in this cluster whose `survived` value is 1.
    survivors = members[members['survived'] == 1]
    survival_rates[cluster_id] = len(survivors) / len(members)

# Summary statistics (.describe()) for the rows in cluster 1.
print(original_df[original_df['cluster_group'] == 1].describe())
print(survival_rates)