Skip to content

Commit

Permalink
With precommit checks
Browse files Browse the repository at this point in the history
  • Loading branch information
shreeya-cy committed Jul 26, 2024
1 parent 96cbad7 commit a116e08
Show file tree
Hide file tree
Showing 3 changed files with 93 additions and 44 deletions.
87 changes: 61 additions & 26 deletions stream_viz/data_missingness/missingness.py
Original file line number Diff line number Diff line change
Expand Up @@ -241,18 +241,22 @@ def plot(self, start, end, features):
def _add_interactive_plot(self):
    # Delegates entirely to the parent class's interactive-plot wiring;
    # this override adds no behavior of its own.
    # NOTE(review): enclosing class is outside this diff view — presumably
    # kept as an explicit extension point; confirm against the full file.
    super()._add_interactive_plot()

class StackedBarGraph(Plotter):
    """Stacked bar chart of per-chunk value/missing counts for one feature.

    NOTE(review): this span of the scraped diff contained both the pre- and
    post-format (black) versions of these lines interleaved; only the
    canonical post-commit version is kept here.
    """

    def __init__(
        self,
        missing_encoder_obj: MissingDataEncoder,
    ):
        # Encoder that supplies the encoded data frame containing NaNs
        # where values are missing.
        self._missing_encoder = missing_encoder_obj
        self._data_df = self._missing_encoder.X_encoded_data
def plot(self,feature,chunk_size):

def plot(self, feature, chunk_size):
df = self._data_df
num_chunks = len(df) // chunk_size
chunk_ranges = [(i * chunk_size, (i + 1) * chunk_size) for i in range(num_chunks)]
chunk_ranges = [
(i * chunk_size, (i + 1) * chunk_size) for i in range(num_chunks)
]

counts = []
for start, end in chunk_ranges:
Expand All @@ -262,28 +266,33 @@ def plot(self,feature,chunk_size):
count_nan = chunk[feature].isna().sum()
counts.append([count_0, count_1, count_nan])

counts_df = pd.DataFrame(counts, columns=['A', 'B', 'Missing'], index=range(num_chunks))
counts_df = pd.DataFrame(
counts, columns=["A", "B", "Missing"], index=range(num_chunks)
)

counts_df.plot(kind='bar', stacked=True, figsize=(12, 8))
plt.xlabel('Time period')
plt.ylabel('Count')
plt.title(f'Stacked Bar Graph of {feature} for each time period of {chunk_size} Instances')
plt.legend(title=feature, loc='upper right')
counts_df.plot(kind="bar", stacked=True, figsize=(12, 8))
plt.xlabel("Time period")
plt.ylabel("Count")
plt.title(
f"Stacked Bar Graph of {feature} for each time period of {chunk_size} Instances"
)
plt.legend(title=feature, loc="upper right")
plt.show()


class ScatterPlotter(Plotter):
    """Scatter plots contrasting observed entries with missing entries.

    Holds two encoders: one over the complete ("normal") data and one over
    the same data with missingness, so missing positions can be located via
    NaNs and plotted against the true values.

    NOTE(review): the scraped diff duplicated the ``__init__`` parameter
    lines (pre- and post-format versions); deduplicated to the canonical
    post-commit form here.
    """

    def __init__(
        self,
        normal_encoder_obj: NormalDataEncoder,
        missing_encoder_obj: MissingDataEncoder,
    ):
        self._normal_encoder = normal_encoder_obj
        self._missing_encoder = missing_encoder_obj

    def plot(self):
        """Dummy implementation of the abstract method 'plot'."""
        pass

def plot_numerical(self, feature):
normal_df = self._normal_encoder.X_encoded_data
missing_df = self._missing_encoder.X_encoded_data
Expand All @@ -292,9 +301,23 @@ def plot_numerical(self, feature):
missing_attr = missing_df[feature]
missing_mask = np.isnan(missing_attr)
plt.figure(figsize=(10, 6))
plt.scatter(np.arange(len(attr)), attr, color='blue', label='Not missing', alpha=0.5, s=20)
plt.scatter(np.where(missing_mask)[0], attr[missing_mask], color='red', label='Missing', alpha=0.5, s=20)
plt.xlabel('Time Points')
plt.scatter(
np.arange(len(attr)),
attr,
color="blue",
label="Not missing",
alpha=0.5,
s=20,
)
plt.scatter(
np.where(missing_mask)[0],
attr[missing_mask],
color="red",
label="Missing",
alpha=0.5,
s=20,
)
plt.xlabel("Time Points")
plt.ylabel(feature)
plt.xticks(np.arange(0, 14000, 1000))
plt.legend()
Expand All @@ -315,11 +338,23 @@ def plot_categorical(self, feature):

missing_mask = np.isnan(selected_missing_df)
plt.figure(figsize=(10, 6))
plt.scatter(np.arange(len(selected_normal_df)), selected_normal_df, color='blue', label='Not missing',
alpha=0.5, s=20)
plt.scatter(np.where(missing_mask)[0], selected_normal_df[missing_mask], color='red', label='Missing',
alpha=0.5, s=20)
plt.xlabel('Time Points')
plt.scatter(
np.arange(len(selected_normal_df)),
selected_normal_df,
color="blue",
label="Not missing",
alpha=0.5,
s=20,
)
plt.scatter(
np.where(missing_mask)[0],
selected_normal_df[missing_mask],
color="red",
label="Missing",
alpha=0.5,
s=20,
)
plt.xlabel("Time Points")
plt.ylabel(feature)
plt.xticks(np.arange(0, 110, 10))
plt.legend()
Expand Down Expand Up @@ -354,10 +389,10 @@ def plot_categorical(self, feature):
# mar_hm.plot(start_tpt=200, end_tpt=500, significance_level=0.05)

# ------------ Test Run : For StackedBarGraph -----------------
# bargraph = StackedBarGraph(missing_encoder_obj=missing)
# bargraph.plot('c5_b', 1000)

# ------------ Test Run : For ScatterPlotter -----------------
# Smoke-test plots against the pre-built `normal` / `missing` encoders
# defined earlier in this script section.
scatter = ScatterPlotter(normal_encoder_obj=normal, missing_encoder_obj=missing)
scatter.plot_numerical("n0")
scatter.plot_categorical("c5_b")
14 changes: 8 additions & 6 deletions stream_viz/tutorial/UserGuide.ipynb

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

36 changes: 24 additions & 12 deletions stream_viz/velocity/velocity_charts.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
import itertools
from typing import Iterable, List, Union

import matplotlib.ticker as mticker
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import matplotlib.ticker as mticker

from stream_viz.base import Velocity
from stream_viz.data_encoders.cfpdss_data_encoder import CfpdssDataEncoder

Expand Down Expand Up @@ -293,6 +294,8 @@ def plot(self, features: Union[str, Iterable[str]], *args, **kwargs) -> None:
"Features parameter should be either a string for a categorical feature or "
"an iterable for numerical features."
)


class StreamGraph(Velocity):

def __init__(self, data_obj: CfpdssDataEncoder) -> None:
Expand All @@ -306,16 +309,18 @@ def __init__(self, data_obj: CfpdssDataEncoder) -> None:
"""
self._data_obj: CfpdssDataEncoder = data_obj

def get_timepoint(self, window_size):
    """Return chunk start times 0, window_size, 2*window_size, ... below 13000.

    Parameters
    ----------
    window_size : int
        Spacing between consecutive time points (chunk size).

    Returns
    -------
    list[int]
        Start index of each window over the stream.

    NOTE(review): the 13000 upper bound is hard-coded — presumably the
    fixed length of the cfpdss stream; confirm against the encoder's data.
    """
    return list(range(0, 13000, window_size))

def count_categories_in_chunks(self,column, chunk_size=50):
def count_categories_in_chunks(self, column, chunk_size=50):
# Initialize an empty dictionary to hold the counts for each category
category_counts = {}

# Calculate the number of chunks
num_chunks = len(column) // chunk_size + (1 if len(column) % chunk_size != 0 else 0)
num_chunks = len(column) // chunk_size + (
1 if len(column) % chunk_size != 0 else 0
)

for i in range(num_chunks):
# Get the start and end indices for the current chunk
Expand All @@ -337,7 +342,9 @@ def count_categories_in_chunks(self,column, chunk_size=50):
# Ensure all categories have a list of the correct length
for category in category_counts:
if len(category_counts[category]) < num_chunks:
category_counts[category].extend([0] * (num_chunks - len(category_counts[category])))
category_counts[category].extend(
[0] * (num_chunks - len(category_counts[category]))
)

return category_counts

Expand All @@ -347,12 +354,17 @@ def plot(self, feature):
result = self.count_categories_in_chunks(self._data_obj.X_encoded_data[feature])
fig, ax = plt.subplots()
fig.set_size_inches(10, 6)
ax.stackplot(time_points, result.values(),
labels=result.keys(), alpha=0.8, baseline='wiggle')
ax.legend(loc='upper right', reverse=True)
ax.set_title(f'Velocity of {feature}')
ax.set_xlabel('Time')
ax.set_ylabel('Height')
ax.stackplot(
time_points,
result.values(),
labels=result.keys(),
alpha=0.8,
baseline="wiggle",
)
ax.legend(loc="upper right", reverse=True)
ax.set_title(f"Velocity of {feature}")
ax.set_xlabel("Time")
ax.set_ylabel("Height")
ax.xaxis.set_major_locator(mticker.MultipleLocator(1000))
plt.show()

Expand Down Expand Up @@ -386,4 +398,4 @@ def plot(self, feature):
# numerical_features = ["n0", "n1"]
# roll_mean_obj.plot_velocity(missing.X_encoded_data, numerical_features, window_size=10)
# Smoke-test the stream graph against the pre-built `normal` encoder
# defined earlier in this script section.
stream_graph = StreamGraph(normal)
stream_graph.plot("c5_b")

0 comments on commit a116e08

Please sign in to comment.