Merge pull request #45 from QSD-Group/algae
Merge algae branch to main
yalinli2 authored Nov 14, 2023
2 parents 742b96e + 4cfc9c1 commit df6b041
Showing 4 changed files with 357 additions and 1 deletion.
2 changes: 1 addition & 1 deletion exposan/pm2_ecorecover/README.rst
@@ -4,7 +4,7 @@ pm2_ecorecover: Calibration & Validation of Phototrophic-Mixotrophic Process Mod

Summary
-------
This module is used to calibrate and validate the Phototrophic-Mixotrophic Process Model (PM2; ``qsdsan.processes.PM2``) using the continuous online monitoring data from the EcoRecover plant. Note that the package `optuna <https://optuna.org/>`_ is needed to run the calibration script.
This module is used to calibrate and validate the Phototrophic-Mixotrophic Process Model (PM2; ``qsdsan.processes.PM2``) using the continuous online monitoring data from the EcoRecover plant. Note that the package `optuna <https://optuna.org/>`_ is needed to run the calibration script and the package `scikit-fda <https://pypi.org/project/scikit-fda/>`_ is needed to run the data cleaning script.

.. figure:: ./readme_figures/pm2_ecorecover.svg

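The note above names two optional dependencies. A minimal check (not part of this commit; it assumes the standard import names 'optuna' and 'skfda' for the PyPI packages optuna and scikit-fda) before running the calibration or data-cleaning scripts could look like:

import importlib.util

for name, pkg in (('optuna', 'optuna'), ('skfda', 'scikit-fda')):
    if importlib.util.find_spec(name) is None:
        print(f'Missing optional dependency: {pkg} (e.g., pip install {pkg})')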
Binary file not shown.
355 changes: 355 additions & 0 deletions exposan/pm2_ecorecover/data_cleaning.py
@@ -0,0 +1,355 @@
# -*- coding: utf-8 -*-
'''
Created on Fri Nov 4 19:45:06 2022
This module is developed by:
Ga-Yeong Kim <[email protected]>
Joy Zhang <[email protected]>
This module is under the University of Illinois/NCSA Open Source License.
Please refer to https://github.com/QSD-Group/QSDsan/blob/main/LICENSE.txt
for license details.
'''
import matplotlib.pyplot as plt, matplotlib.ticker as ticker, \
    pandas as pd, numpy as np
from matplotlib import dates
from qsdsan.utils import load_data, ospath

from skfda.misc.hat_matrix import (KNeighborsHatMatrix,
                                   LocalLinearRegressionHatMatrix,
                                   NadarayaWatsonHatMatrix,)

from skfda.preprocessing.smoothing import KernelSmoother
from skfda.representation.grid import FDataGrid

#%% Load raw data
folder = ospath.dirname(__file__)
file = ospath.join(folder, 'data/data_cleaning_rawdata.xlsx')
raw_data = load_data(file, sheet=None)
scada = raw_data['SCADA']

# Join AIMS data columns
aims = raw_data['AIMS_Inf_NH4'].join(raw_data['AIMS_Eff_NH4'], how='outer', sort=True) # initial
for k, df in raw_data.items():
    if k.startswith('AIMS_') and not k.endswith('NH4'):
        aims = aims.join(df, how='outer', sort=True) # append
del raw_data

# Remove the '_aims' suffix from column names
# (explicit suffix check; str.rstrip('_aims') would strip a character set, not the suffix)
aims.columns = [c[:-5] if c.endswith('_aims') else c for c in aims.columns]

# Set starting point
from datetime import datetime
day0 = datetime.strptime('03-12-2022 12:00 AM', '%m-%d-%Y %I:%M %p')

#%% Difference between SCADA and AIMS
all_data = scada.join(aims, how='right', rsuffix='_aims', sort=True)
sheets = ['nh4_inf', 'nh4_eff', 'no3_eff', 'po4_inf', 'po4_eff', 'tss']
dct_compare = {}
for sheet in sheets:
    cols = [col for col in all_data.columns if col.startswith(sheet)]
    df = all_data.loc[:, cols]
    if sheet == 'tss':
        for i in ('_harv', '_postmix'):
            df[sheet+i+'_diff'] = df[sheet+i] - df[sheet+'_ret']
            dct_compare[sheet+i] = df.loc[:, [sheet+'_ret', sheet+i, sheet+i+'_diff']].dropna(axis=0)
    else:
        df[sheet+'_diff'] = df[sheet+'_aims'] - df[sheet]
        dct_compare[sheet] = df.dropna(axis=0)

with pd.ExcelWriter(ospath.join(folder, 'results/data_cleaning/excel/scada_vs_aims.xlsx')) as writer:
    for sheet, df in dct_compare.items():
        df.to_excel(writer, sheet_name=sheet)

#%% Plotting functions

plot_path = ospath.join(folder, 'results/data_cleaning/plot/')
date0 = dates.date2num(day0)

@ticker.FuncFormatter
def fmt_day(t_stamp, pos):
    day = t_stamp - date0
    return f'{day:.0f}'

def plot_single(x, y, x_by_date=True, file_name=None):
    fig, ax = plt.subplots(figsize=(15, 5))
    ax.plot(x, y, label=y.name)
    ax.xaxis.set_major_locator(dates.DayLocator(interval=5))
    fig.legend(loc='upper right', bbox_to_anchor=(1, 0.9))
    if x_by_date:
        ax.xaxis.set_major_formatter(dates.DateFormatter('%Y-%m-%d'))
        ax.set_xlabel('Date')
        subfolder = 'raw/date'
    else:
        ax.xaxis.set_major_formatter(fmt_day)
        ax.set_xlabel('Time [day]')
        subfolder = 'raw/day'
    if file_name:
        fig.savefig(ospath.join(plot_path, f'{subfolder}/{file_name}.png'),
                    dpi=300, bbox_inches='tight', facecolor='white')
    else: return fig, ax

def plot_SCADA(x_by_date=True):
    for col, y in scada.items():
        plot_single(scada.index, y, x_by_date=x_by_date, file_name=col)

# plot_SCADA(x_by_date=True) # date
# plot_SCADA(x_by_date=False) # day

def plot_multiple(df1, df2, labels=None, smooth=False, x_by_date=True, file_name=None):
    fig, ax = plt.subplots(figsize=(15, 5))
    ax.plot(df1)
    ax.plot(df2, 'o', ms=4, linestyle='-')
    labels = labels or [*df1.columns, *[col+'_aims' for col in df2.columns]]
    ax.xaxis.set_major_locator(dates.DayLocator(interval=5))
    if x_by_date:
        ax.xaxis.set_major_formatter(dates.DateFormatter('%Y-%m-%d'))
        ax.set_xlabel('Date')
        file = f'raw/date/{file_name}_vs.png'
    else:
        ax.xaxis.set_major_formatter(fmt_day)
        ax.set_xlabel('Time [day]')
        file = f'raw/day/{file_name}_vs.png'
    fig.legend(labels=labels, loc='upper right', bbox_to_anchor=(1, 0.9))
    if file_name:
        fig.savefig(ospath.join(plot_path, file),
                    dpi=300, bbox_inches='tight', facecolor='white')
    else: return fig, ax

def plot_SCADA_vs_AIMS(x_by_date=True):
    for var in set(scada.columns) & set(aims.columns):
        df1 = scada.loc[:, [var]]
        df2 = aims.loc[:, [var]].dropna(axis=0)
        plot_multiple(df1, df2, x_by_date=x_by_date, file_name=var)
    df1 = scada.loc[:, ['tss_ret']]
    df2 = aims.loc[:, ['tss_harv', 'tss_postmix']] # this should be separated? or masked?
    labels = [*df1.columns, *df2.columns]
    plot_multiple(df1, df2, labels, x_by_date=x_by_date, file_name='tss')

#%% To export plots
for by in (True, False):
    plot_SCADA(by)
    plot_SCADA_vs_AIMS(by)

#%% Kernel smoothing function

# Calculate t_stamp as day
calc_day = lambda t_stamp: (t_stamp-day0).total_seconds()/60/60/24

def plot_raw_vs_smooth(x1, y1, x2, y2, save_as=''):
    fig, ax = plt.subplots(figsize=(15, 5))
    l1, = ax.plot(x1, y1, label='Raw data')
    l2, = ax.plot(x2, y2, label='Smoothed')
    ax.xaxis.set_major_locator(ticker.MultipleLocator(5))
    ax.set_xlabel('Time [day]')
    ax.legend(handles=[l1, l2], title=save_as)
    path = ospath.join(plot_path, f'smoothed/{save_as}.png')
    fig.savefig(path, dpi=300, facecolor='white')

def kernel_smoothing(data, method, kernel, bw, file_name=None, **kwargs):
    '''
    Smooth the input data with a kernel smoother.

    Parameters
    ----------
    data : pandas.DataFrame or pandas.Series
        Raw data.
    method : str
        One of ('knn', 'llr', 'nw').
    kernel : Callable
        Kernel function.
    bw : float or int
        Bandwidth ('llr', 'nw') or the number of neighbors to consider ('knn').
    file_name : str, optional
        Base name of the output Excel file; only used when `data` is a DataFrame.
    **kwargs : optional
        Other parameters passed to `KernelSmoother`.
    '''
    # To use the skfda package: 1) pandas series -> array,
    # 2) float64 -> float32,
    # 3) use 'day' data, not 'date' data

    x = pd.Series(data.index).apply(calc_day).to_numpy()

    if method == 'knn': kernel_estimator = KNeighborsHatMatrix(n_neighbors=bw, kernel=kernel)
    elif method == 'llr': kernel_estimator = LocalLinearRegressionHatMatrix(bandwidth=bw, kernel=kernel)
    else: kernel_estimator = NadarayaWatsonHatMatrix(bandwidth=bw, kernel=kernel)

    smoother = KernelSmoother(kernel_estimator=kernel_estimator, **kwargs)
    # Output points of the smoothed data do not have to coincide with the raw time
    # points; if `output_points` is not specified, the raw time points are used.
    xhat = kwargs.pop('output_points', None)
    if xhat is None: xhat = x
    out = pd.DataFrame({'day': xhat})
    if len(data.shape) == 1: # single series
        y = data.to_numpy()
        fd = FDataGrid(y, x)                                 # creation of FDataGrid
        fd_smoothed = smoother.fit_transform(fd)             # smooth
        yhat = pd.Series(fd_smoothed.data_matrix.flatten())  # smoothed output
        if xhat is x:
            out['raw'] = y
        out['smoothed'] = yhat
        plot_raw_vs_smooth(x, y, xhat, yhat, f'{data.name}_{method}')
        out.to_excel(ospath.join(folder, f'results/data_cleaning/excel/smoothed/{data.name}_{method}.xlsx'))
    else: # multiple y
        for col, y in data.items():
            fd = FDataGrid(y.to_numpy(), x)
            fd_smoothed = smoother.fit_transform(fd)
            yhat = pd.Series(fd_smoothed.data_matrix.flatten())
            out[col+'_smoothed'] = yhat
            plot_raw_vs_smooth(x, y, xhat, yhat, f'{col}_{method}')
        out.to_excel(ospath.join(folder, f'results/data_cleaning/excel/smoothed/{file_name}_{method}.xlsx'))

#%% Kernel smoothing commands
from skfda.misc.kernels import (
    normal,
    # cosine,
    # epanechnikov,
    # tri_weight,
    # quartic,
    # uniform,
    )
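
#%% Example: minimal smoothing sketch (synthetic data, illustrative only)

# A self-contained sketch of the skfda pipeline that `kernel_smoothing` wraps,
# run on synthetic data rather than plant data; the local-linear estimator,
# bandwidth, and kernel below are arbitrary illustrative choices, not calibrated values.
_x_demo = np.linspace(0, 30, 301) # time points [day]
_y_demo = np.sin(_x_demo/3) + np.random.default_rng(0).normal(scale=0.1, size=_x_demo.size)
_fd_demo = FDataGrid(_y_demo, _x_demo) # wrap raw values as a functional data object
_smoother_demo = KernelSmoother(
    kernel_estimator=LocalLinearRegressionHatMatrix(bandwidth=0.5, kernel=normal))
_yhat_demo = _smoother_demo.fit_transform(_fd_demo).data_matrix.flatten() # smoothed values
# Output points other than the raw time points can also be requested, e.g.
# KernelSmoother(kernel_estimator=..., output_points=np.arange(0, 30, 0.5)).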

#%% SCADA data smoothing (need to check one by one manually)

# Flow & volume
kernel_smoothing(data=scada.flow_mix, method='llr', kernel=normal, bw=0.1)
kernel_smoothing(data=scada.flow_pbr, method='llr', kernel=normal, bw=0.5)
kernel_smoothing(data=scada.flow_mem, method='llr', kernel=normal, bw=0.1)
kernel_smoothing(data=scada.flow_ret, method='llr', kernel=normal, bw=0.1)
kernel_smoothing(data=scada.flow_eff, method='llr', kernel=normal, bw=0.1)
# kernel_smoothing(data=scada.flow_cen, method='llr', kernel=normal, bw=0.1) # use as it is

kernel_smoothing(data=scada.vol_mix, method='llr', kernel=normal, bw=0.2)
kernel_smoothing(data=scada.vol_mem, method='llr', kernel=normal, bw=0.5)
kernel_smoothing(data=scada.vol_ret, method='llr', kernel=normal, bw=0.2)

# Temperature & light
kernel_smoothing(data=scada.temp, method='llr', kernel=normal, bw=0.1)
kernel_smoothing(data=scada.light, method='llr', kernel=normal, bw=0.001)

# Aqueous species
kernel_smoothing(data=scada.do, method='llr', kernel=normal, bw=0.001)
kernel_smoothing(data=scada.nh4_inf, method='llr', kernel=normal, bw=0.5)
kernel_smoothing(data=scada.nh4_eff, method='llr', kernel=normal, bw=0.5)
kernel_smoothing(data=scada.no3_eff, method='llr', kernel=normal, bw=0.8)
kernel_smoothing(data=scada.po4_inf, method='llr', kernel=normal, bw=0.5)
kernel_smoothing(data=scada.po4_eff, method='llr', kernel=normal, bw=0.5)
kernel_smoothing(data=scada.tss_ret, method='llr', kernel=normal, bw=0.7)

#%% SCADA vs. AIMS differences smoothing (need to check one by one manually)

var = ['nh4_inf', 'nh4_eff', 'no3_eff', 'po4_inf', 'po4_eff',
       'tss_harv', 'tss_postmix']

kernel_smoothing(data=dct_compare[var[0]][var[0]+'_diff'], method='llr', kernel=normal, bw=0.8)
kernel_smoothing(data=dct_compare[var[1]][var[1]+'_diff'], method='llr', kernel=normal, bw=0.6)
kernel_smoothing(data=dct_compare[var[2]][var[2]+'_diff'], method='llr', kernel=normal, bw=0.8)
kernel_smoothing(data=dct_compare[var[3]][var[3]+'_diff'], method='llr', kernel=normal, bw=1)
kernel_smoothing(data=dct_compare[var[4]][var[4]+'_diff'], method='llr', kernel=normal, bw=1)
kernel_smoothing(data=dct_compare[var[5]][var[5]+'_diff'], method='llr', kernel=normal, bw=1)
kernel_smoothing(data=dct_compare[var[6]][var[6]+'_diff'], method='llr', kernel=normal, bw=2)

#%% Interpolate diff & correct smoothed SCADA

t_stamp = scada.index
var = ['nh4_inf', 'nh4_eff', 'no3_eff', 'po4_inf', 'po4_eff', 'tss_harv', 'tss_postmix']

def plot_raw_vs_corrected(df1, df2, df3, save_as=''):
    fig, ax = plt.subplots(figsize=(15, 5))
    l1, = ax.plot(df1, label='Raw SCADA')
    l2, = ax.plot(df2, label='Raw AIMS')
    l3, = ax.plot(df3, label='Corrected')
    ax.xaxis.set_major_locator(dates.DayLocator(interval=5))
    ax.xaxis.set_major_formatter(fmt_day)
    ax.set_xlabel('Time [day]')
    ax.legend(handles=[l1, l2, l3], title=save_as)
    path = ospath.join(plot_path, f'corrected/{save_as}.png')
    fig.savefig(path, dpi=300, facecolor='white')

def correct_scada(t_stamp, var, smoothing_method='llr'):
    t_intp = calc_day(t_stamp).to_numpy()
    diff_intp = pd.DataFrame()
    diff_intp.index = t_stamp
    diff_intp['t_interp'] = t_intp
    with pd.ExcelWriter(ospath.join(folder, 'results/data_cleaning/excel/scada_corrected.xlsx')) as writer:
        for i in var:
            if i == 'tss_harv':
                pass # tss_harv is not corrected
            else:
                # Interpolate the smoothed (AIMS - SCADA) difference onto the SCADA
                # time stamps, then add it to the smoothed SCADA data (clipped at 0).
                diff_smoothed = load_data(ospath.join(folder, f'results/data_cleaning/excel/smoothed/{i}_diff_{smoothing_method}.xlsx'))
                diff_smoothed_t = diff_smoothed.day
                diff_smoothed_df = diff_smoothed.smoothed
                diff_intp[i+'diff_smo_interp'] = diff = np.interp(t_intp, diff_smoothed_t, diff_smoothed_df)
                if i == 'tss_postmix':
                    scada_smoothed = load_data(ospath.join(folder, f'results/data_cleaning/excel/smoothed/tss_ret_{smoothing_method}.xlsx'))
                    scada_smoothed.index = t_stamp
                    corrected = scada_smoothed[i+'_corr'] = scada_smoothed.smoothed + diff
                    corrected[corrected < 0] = 0
                    scada_smoothed[i+'_corr'] = corrected
                    scada_smoothed.to_excel(writer, sheet_name='tss_ret')
                    raw_scada = scada.loc[:, 'tss_ret']
                else:
                    scada_smoothed = load_data(ospath.join(folder, f'results/data_cleaning/excel/smoothed/{i}_{smoothing_method}.xlsx'))
                    scada_smoothed.index = t_stamp
                    corrected = scada_smoothed[i+'_corr'] = scada_smoothed.smoothed + diff
                    corrected[corrected < 0] = 0
                    scada_smoothed[i+'_corr'] = corrected
                    scada_smoothed.to_excel(writer, sheet_name=i)
                    raw_scada = scada.loc[:, i]
                raw_aims = aims.loc[:, i].dropna(axis=0)
                plot_raw_vs_corrected(raw_scada, raw_aims, corrected, i)
    diff_intp.to_excel(ospath.join(folder, 'results/data_cleaning/excel/interpolated_smoothed_diff.xlsx'))

correct_scada(t_stamp, var, smoothing_method='llr')
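
# Illustrative-only sketch of the correction step above on toy arrays (not plant
# data): the smoothed SCADA signal is shifted by the smoothed (AIMS - SCADA)
# difference interpolated onto the SCADA time stamps, then clipped at zero.
_t_scada = np.array([0., 1., 2., 3.])   # SCADA time points [day]
_scada_smo = np.array([5., 6., 7., 8.]) # smoothed SCADA values
_t_diff = np.array([0., 2.])            # time points of the smoothed difference
_diff_smo = np.array([-6., 1.])         # smoothed (AIMS - SCADA) difference
_corr = _scada_smo + np.interp(_t_scada, _t_diff, _diff_smo)
_corr[_corr < 0] = 0                    # -> [0., 3.5, 8., 9.]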

#%% HRT of smoothed SCADA data

flow = ['flow_mix', 'flow_pbr', 'flow_mem', 'flow_ret', 'flow_cen']
vol = ['vol_mix', 'vol_pbr', 'vol_mem', 'vol_ret']

def hrt(smoothing_method='llr'):
    with pd.ExcelWriter(ospath.join(folder, 'results/data_cleaning/excel/hrt.xlsx')) as writer:
        for i in range(4):
            flow_smoothed = load_data(ospath.join(folder, f'results/data_cleaning/excel/smoothed/{flow[i]}_{smoothing_method}.xlsx'))
            if i == 1:
                v = 77.49 # fixed
            else:
                v = load_data(ospath.join(folder, f'results/data_cleaning/excel/smoothed/{vol[i]}_{smoothing_method}.xlsx')).smoothed
            flow_smoothed[vol[i]] = v
            flow_smoothed['hrt'] = v/flow_smoothed.smoothed
            flow_smoothed.to_excel(writer, sheet_name=flow[i])

            plt.hist(flow_smoothed['hrt'], bins=15)
            plt.show()

hrt(smoothing_method='llr')

#%% Split ratio

def ratio(smoothing_method='llr'):
    flow_pbr_smoothed = load_data(ospath.join(folder, f'results/data_cleaning/excel/smoothed/flow_pbr_{smoothing_method}.xlsx')).smoothed
    flow_mem_smoothed = load_data(ospath.join(folder, f'results/data_cleaning/excel/smoothed/flow_mem_{smoothing_method}.xlsx')).smoothed
    flow_eff_smoothed = load_data(ospath.join(folder, f'results/data_cleaning/excel/smoothed/flow_eff_{smoothing_method}.xlsx')).smoothed
    flow_cen = scada['flow_cen']
    flow_cen.index = flow_mem_smoothed.index

    ratio_pbr_to_mem = flow_mem_smoothed/flow_pbr_smoothed # ME = 0.51
    ratio_pbr_to_ret = 1 - ratio_pbr_to_mem # INT = 0.49

    ratio_mem_to_eff = flow_eff_smoothed/flow_mem_smoothed # TE = 0.39
    ratio_mem_to_postmem = 1 - ratio_mem_to_eff # RETEN = 0.61

    ratio_postmem_to_cen = flow_cen/(flow_mem_smoothed - flow_eff_smoothed) # CE = 0.03
    ratio_postmem_to_ret = 1 - ratio_postmem_to_cen # RE = 0.97

    ratios = pd.DataFrame({'t_stamp_day': calc_day(t_stamp).to_numpy(),
                           'ratio_pbr_to_mem (ME)': ratio_pbr_to_mem,
                           'ratio_pbr_to_ret (INT)': ratio_pbr_to_ret,
                           'ratio_mem_to_eff (TE)': ratio_mem_to_eff,
                           'ratio_mem_to_postmem (RETEN)': ratio_mem_to_postmem,
                           'ratio_postmem_to_cen (CE)': ratio_postmem_to_cen,
                           'ratio_postmem_to_ret (RE)': ratio_postmem_to_ret})

    ratios.to_excel(ospath.join(folder, 'results/data_cleaning/excel/ratio.xlsx'))

    for i in range(6):
        plt.hist(ratios.iloc[:, i+1], bins=15)
        plt.show()

ratio(smoothing_method='llr')
1 change: 1 addition & 0 deletions pytest.ini
@@ -17,6 +17,7 @@ addopts =
--ignore-glob='exposan/new_generator/**'
--ignore-glob='exposan/pm2_batch/calibration.py'
--ignore-glob='exposan/pm2_ecorecover/calibration.py'
--ignore-glob='exposan/pm2_ecorecover/data_cleaning.py'
--ignore-glob='exposan/pou_disinfection/analyses/**'
norecursedirs =
build
