-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathEDA_JerrickGerald.py
106 lines (51 loc) · 1.46 KB
/
EDA_JerrickGerald.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
#!/usr/bin/env python
# coding: utf-8
# ## Mushroom Edibility prediction
#
#
# ### Import Library
# In[1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# In[2]:
# Load the mushroom dataset and discard the 'Unnamed: 0' column
# (presumably a saved index -- confirm against the CSV).
df = pd.read_csv(r'final.csv').drop(columns=['Unnamed: 0'])
# Preview the first rows.
df.head()
# In[14]:
# Summary statistics of the loaded frame.
df.describe()
# ### CLASS DISTRIBUTION
#
# #### P - Poisonous, E - Edible
# In[3]:
# Number of rows per class label (labels are still un-encoded at this point).
df['class'].value_counts()
# In[4]:
from sklearn.preprocessing import LabelEncoder

# Replace every column, the target 'class' included, with integer codes.
# fit_transform refits the encoder on each call, so one instance can be reused
# across columns; codes follow the sorted order of each column's values.
# NOTE(review): columns that are already numeric are re-coded too -- confirm
# this is intended before interpreting the correlations and box plot below.
labelencoder = LabelEncoder()
for column in df.columns:
    df[column] = labelencoder.fit_transform(df[column])
# In[5]:
# Bar chart of the number of rows per encoded class value.
# Series.value_counts is used because the top-level pd.value_counts is
# deprecated in recent pandas releases.
count_classes = df['class'].value_counts(sort=True)
count_classes.plot(kind='bar', rot=0)
# ### From the distribution we can see that the poisonous class has more samples than the edible class. No balancing technique is required for this dataset.
# ### Distribution of Stem - Color
# In[6]:
# Preview the frame after label encoding.
df.head()
# In[7]:
# Bar chart of the number of rows per encoded stem-color value.
# Series.value_counts is used because the top-level pd.value_counts is
# deprecated in recent pandas releases.
count_classes = df['stem-color'].value_counts(sort=True)
count_classes.plot(kind='bar', rot=0, color='tan')
# In[ ]:
#
# In[9]:
# Correlation heatmap of the columns most correlated with 'class'.
plt.figure(figsize=(16, 10))
corr_matrix = df.corr()
top_k = 15
# nlargest ranks by signed correlation, so strongly negative correlations
# are not selected; 'class' itself is normally among the selected columns.
top_cols = corr_matrix.nlargest(top_k, 'class')['class'].index
# rowvar=False treats each column as a variable (same as transposing first).
top_corr = np.corrcoef(df[top_cols].to_numpy(), rowvar=False)
sns.set(font_scale=1.25)
hm = sns.heatmap(
    top_corr,
    annot=True,
    cbar=True,
    square=True,
    fmt='.2f',
    annot_kws={'size': 10},
    yticklabels=top_cols.values,
    xticklabels=top_cols.values,
)
plt.show()
# In[12]:
# Box plot of the encoded stem-color values for each encoded class value.
sns.boxplot(data=df, x='class', y='stem-color')
# In[ ]: