-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathcaroline_eda.py
42 lines (33 loc) · 1.33 KB
/
caroline_eda.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
#%%
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Load the dataset that the cells below explore.
df = pd.read_csv("final.csv")
# %%
# Preview the first five rows as a quick sanity check of the loaded columns.
df.head(n=5)
# %%
# Stem height: overall histogram, then a histogram stacked by class.
plt.hist(df['stem-height'], stacked=True)
# One column of stem-height values per class, so each class is its own series.
stem_height_by_class = df.pivot(columns='class', values='stem-height')
stem_height_by_class.plot(kind="hist", stacked=True)
# Observation: stem-height is approximately normally distributed, with a slight right skew.
# Observation: no strong evidence of a difference between groups.
# %%
# has-ring: frequency of each value, overall and broken down by class.
counts = df['has-ring'].value_counts()

# Overall count per has-ring value. Series.plot draws on the current axes when
# one exists, so open a fresh figure to avoid drawing over an earlier chart.
plt.figure()
df.groupby(['has-ring']).size().plot(kind="bar")

# Count per has-ring value, stacked by class. The cross-tabulation puts each
# class in its own column, which is what `stacked=True` needs in order to
# stack; being a DataFrame plot, it is also drawn on a new figure rather than
# on top of the chart above.
pd.crosstab(df['has-ring'], df['class']).plot(kind='bar', stacked=True)
print(counts)
# Observation: there are 3 times as many mushrooms without a ring (f), so our
# data is unbalanced. However, they look fairly evenly split between the
# poisonous and edible mushrooms.
# %%
# Ring type: frequency of each value, overall and broken down by class.

# Overall count per ring-type value. Series.plot draws on the current axes when
# one exists, so open a fresh figure to avoid drawing over an earlier chart.
plt.figure()
df.groupby(['ring-type']).size().plot(kind="bar")
counts = df['ring-type'].value_counts()
print(counts)
# The column is named 'ring-type' (hyphen), matching the lookups above; the
# underscore spelling 'ring_type' raises KeyError.
pd.crosstab(df['ring-type'], df['class']).plot(kind='bar', stacked=True)
# Observation: the ring-type f is more common than all the other ring types
# combined. The next most common ring-types are e and z.
# %%
# Cap diameter: overall histogram, then a histogram stacked by class.
plt.hist(df['cap-diameter'])
# One column of cap-diameter values per class, so each class is its own series.
cap_diameter_by_class = df.pivot(columns='class', values='cap-diameter')
cap_diameter_by_class.plot(kind="hist", stacked=True)
# Observation: the distribution of cap diameter is right skewed; no evidence
# of a difference between groups.
# %%