diff --git a/exposan/pm2_ecorecover/README.rst b/exposan/pm2_ecorecover/README.rst
index c2a7ddb0..9796068d 100644
--- a/exposan/pm2_ecorecover/README.rst
+++ b/exposan/pm2_ecorecover/README.rst
@@ -4,7 +4,7 @@ pm2_ecorecover: Calibration & Validation of Phototrophic-Mixotrophic Process Mod
Summary
-------
-This module is used to calibrate and validate the Phototrophic-Mixotrophic Process Model (PM2; ``qsdsan.processes.PM2``) using the continuous online monitoring data from the EcoRecover plant. Note that the package `optuna `_ is needed to run the calibration script.
+This module is used to calibrate and validate the Phototrophic-Mixotrophic Process Model (PM2; ``qsdsan.processes.PM2``) using the continuous online monitoring data from the EcoRecover plant. Note that the package `optuna <https://optuna.org>`_ is needed to run the calibration script and the package `scikit-fda <https://fda.readthedocs.io>`_ is needed to run the data cleaning script.
.. figure:: ./readme_figures/pm2_ecorecover.svg
diff --git a/exposan/pm2_ecorecover/data/data_cleaning_rawdata.xlsx b/exposan/pm2_ecorecover/data/data_cleaning_rawdata.xlsx
new file mode 100644
index 00000000..6395ee71
Binary files /dev/null and b/exposan/pm2_ecorecover/data/data_cleaning_rawdata.xlsx differ
diff --git a/exposan/pm2_ecorecover/data_cleaning.py b/exposan/pm2_ecorecover/data_cleaning.py
new file mode 100644
index 00000000..e462a90d
--- /dev/null
+++ b/exposan/pm2_ecorecover/data_cleaning.py
@@ -0,0 +1,355 @@
+# -*- coding: utf-8 -*-
+'''
+Created on Fri Nov 4 19:45:06 2022
+
+This module is developed by:
+ Ga-Yeong Kim
+ Joy Zhang
+
+This module is under the University of Illinois/NCSA Open Source License.
+Please refer to https://github.com/QSD-Group/QSDsan/blob/main/LICENSE.txt
+for license details.
+'''
+import matplotlib.pyplot as plt, matplotlib.ticker as ticker, \
+ pandas as pd, numpy as np
+from matplotlib import dates
+from qsdsan.utils import load_data, ospath
+
+from skfda.misc.hat_matrix import (KNeighborsHatMatrix,
+ LocalLinearRegressionHatMatrix,
+ NadarayaWatsonHatMatrix,)
+
+from skfda.preprocessing.smoothing import KernelSmoother
+from skfda.representation.grid import FDataGrid
+
+#%% Load raw data
+folder = ospath.dirname(__file__)
+file = ospath.join(folder, 'data/data_cleaning_rawdata.xlsx')
+raw_data = load_data(file, sheet=None)
+scada = raw_data['SCADA']
+
+# Join AIMS data columns
+aims = raw_data['AIMS_Inf_NH4'].join(raw_data['AIMS_Eff_NH4'], how='outer', sort=True) # initial
+for k, df in raw_data.items():
+ if k.startswith('AIMS_') and not k.endswith('NH4'):
+ aims = aims.join(df, how='outer', sort=True) # append
+del raw_data
+
+# Delete '_aims' in column names
+aims.columns = [c.rstrip('_aims') for c in aims.columns]
+
+# Set starting point
+from datetime import datetime
+day0 = datetime.strptime('03-12-2022 12:00 AM', '%m-%d-%Y %I:%M %p')
+
+#%% Difference between SCADA and AIMS
+# Right-join onto the AIMS index so differences are computed only at lab-sample
+# timestamps; overlapping column names from SCADA keep their name, AIMS copies
+# get an '_aims' suffix.
+all_data = scada.join(aims, how='right', rsuffix='_aims', sort=True)
+sheets = ['nh4_inf', 'nh4_eff', 'no3_eff', 'po4_inf', 'po4_eff', 'tss']
+dct_compare = {}  # variable name -> DataFrame of (SCADA, AIMS, AIMS-SCADA diff)
+for sheet in sheets:
+    cols = [col for col in all_data.columns if col.startswith(sheet)]
+    df = all_data.loc[:,cols]
+    if sheet == 'tss':
+        # TSS has no direct SCADA/AIMS pair: compare the two AIMS streams
+        # (harvest, post-mix) against the SCADA retentate reading.
+        for i in ('_harv', '_postmix'):
+            df[sheet+i+'_diff'] = df[sheet+i] - df[sheet+'_ret']
+            dct_compare[sheet+i] = df.loc[:, [sheet+'_ret', sheet+i, sheet+i+'_diff']].dropna(axis=0)
+    else:
+        df[sheet+'_diff'] = df[sheet+'_aims'] - df[sheet]
+        dct_compare[sheet] = df.dropna(axis=0)
+
+# One sheet per compared variable
+with pd.ExcelWriter(ospath.join(folder, 'results/data_cleaning/excel/scada_vs_aims.xlsx')) as writer:
+    for sheet, df in dct_compare.items():
+        df.to_excel(writer, sheet_name=sheet)
+
+#%% Plotting functions
+
+plot_path = ospath.join(folder, 'results/data_cleaning/plot/')
+date0 = dates.date2num(day0)
+
+@ticker.FuncFormatter
+def fmt_day(t_stamp, pos):
+ day = t_stamp - date0
+ return f'{day:.0f}'
+
+def plot_single(x, y, x_by_date=True, file_name=None):
+ fig, ax = plt.subplots(figsize=(15, 5))
+ ax.plot(x, y, label=y.name)
+ ax.xaxis.set_major_locator(dates.DayLocator(interval=5))
+ fig.legend(loc='upper right', bbox_to_anchor=(1,0.9))
+ if x_by_date:
+ ax.xaxis.set_major_formatter(dates.DateFormatter('%Y-%m-%d'))
+ ax.set_xlabel('Date')
+ subfolder = 'raw/date'
+ else:
+ ax.xaxis.set_major_formatter(fmt_day)
+ ax.set_xlabel('Time [day]')
+ subfolder = 'raw/day'
+ if file_name:
+ fig.savefig(ospath.join(plot_path, f'{subfolder}/{file_name}.png'),
+ dpi=300, bbox_inches='tight', facecolor='white')
+ else: return fig, ax
+
+def plot_SCADA(x_by_date=True):
+ for col, y in scada.items():
+ plot_single(scada.index, y, x_by_date=x_by_date, file_name=col)
+
+# plot_SCADA(x_by_date=True) # date
+# plot_SCADA(x_by_date=False) # day
+
+def plot_multiple(df1, df2, labels=None, smooth=False, x_by_date=True, file_name=None):
+    '''
+    Overlay two DataFrames on one axis: `df1` as plain lines (SCADA),
+    `df2` as line-connected markers (AIMS lab samples). When `file_name`
+    is given, save a PNG under raw/date or raw/day; otherwise return
+    (fig, ax).
+
+    NOTE(review): `smooth` is accepted but never read in this body —
+    confirm whether smoothing was intended here or the parameter can be
+    retired.
+    '''
+    fig, ax = plt.subplots(figsize=(15, 5))
+    ax.plot(df1)
+    ax.plot(df2, 'o', ms=4, linestyle='-')
+    # Default legend labels: df1 columns as-is, df2 columns tagged '_aims'
+    labels = labels or [*df1.columns, *[col+'_aims' for col in df2.columns]]
+    ax.xaxis.set_major_locator(dates.DayLocator(interval=5))
+    if x_by_date:
+        ax.xaxis.set_major_formatter(dates.DateFormatter('%Y-%m-%d'))
+        ax.set_xlabel('Date')
+        file = f'raw/date/{file_name}_vs.png'
+    else:
+        ax.xaxis.set_major_formatter(fmt_day)
+        ax.set_xlabel('Time [day]')
+        file = f'raw/day/{file_name}_vs.png'
+    fig.legend(labels=labels, loc='upper right', bbox_to_anchor=(1,0.9))
+    if file_name:
+        fig.savefig(ospath.join(plot_path, file),
+                    dpi=300, bbox_inches='tight', facecolor='white')
+    else: return fig, ax
+
+def plot_SCADA_vs_AIMS(x_by_date=True):
+ for var in set(scada.columns) & set(aims.columns):
+ df1 = scada.loc[:, [var]]
+ df2 = aims.loc[:, [var]].dropna(axis=0)
+ plot_multiple(df1, df2, x_by_date=x_by_date, file_name=var)
+ df1 = scada.loc[:,['tss_ret']]
+ df2 = aims.loc[:,['tss_harv', 'tss_postmix']] # this should be separated? or masked?
+ labels = [*df1.columns, *df2.columns]
+ plot_multiple(df1, df2, labels, x_by_date, file_name='tss')
+
+#%% To export plots
+# Render every raw and comparison plot twice: once with a date-labeled x-axis
+# and once with a day-number x-axis.
+for by in (True, False):
+    plot_SCADA(by)
+    plot_SCADA_vs_AIMS(by)
+
+#%% Kernel smoothing function
+
+# Calculate t_stamp as day
+calc_day = lambda t_stamp: (t_stamp-day0).total_seconds()/60/60/24
+
+def plot_raw_vs_smooth(x1, y1, x2, y2, save_as=''):
+ fig, ax = plt.subplots(figsize=(15, 5))
+ l1, = ax.plot(x1, y1, label='Raw data')
+ l2, = ax.plot(x2, y2, label='Smoothed')
+ ax.xaxis.set_major_locator(ticker.MultipleLocator(5))
+ ax.set_xlabel('Time [day]')
+ ax.legend(handles=[l1,l2], title=save_as)
+ path = ospath.join(plot_path, f'smoothed/{save_as}.png')
+ fig.savefig(path, dpi=300, facecolor='white')
+
+def kernel_smoothing(data, method, kernel, bw, file_name=None, **kwargs):
+ '''
+ Smooths input data with kernel smoother.
+
+ Parameters
+ ----------
+ data : pandas.DataFrame or pandas.Series
+ Raw data.
+ method : str
+ One of ('knn', 'llr', 'nw').
+ kernel : Callable
+ Kernel function.
+ bw : float or int
+ Bandwidth or the number of neighbor to consider
+ **kwargs : optional
+ Other parameters passed to `KernelSmoother`.
+ '''
+ # To use skfda package: 1) pandas series -> array,
+ # 2) float64 -> float32,
+ # 3) use 'day' data, not 'date' data
+
+ x = pd.Series(data.index).apply(calc_day).to_numpy()
+
+ if method == 'knn': kernel_estimator = KNeighborsHatMatrix(n_neighbors=bw, kernel=kernel)
+ elif method == 'llr': kernel_estimator = LocalLinearRegressionHatMatrix(bandwidth=bw, kernel=kernel)
+ else: kernel_estimator = NadarayaWatsonHatMatrix(bandwidth=bw, kernel=kernel)
+
+ smoother = KernelSmoother(kernel_estimator=kernel_estimator, **kwargs)
+ xhat = kwargs.pop('output_points', None) or x # in case times points other than the raw data are specified as output_points (output time point of the smoothed data doesn't have to be exactly the same as the raw data; If not specified, then it'll be the same.)
+ out = pd.DataFrame({'day':xhat})
+ if len(data.shape) == 1:
+ y = data.to_numpy()
+ fd = FDataGrid(y, x) # creation of FDataGrid
+ fd_smoothed = smoother.fit_transform(fd) # smooth
+ yhat = pd.Series(fd_smoothed.data_matrix.flatten()) # smoothed output
+ if xhat is x:
+ out['raw'] = y
+ out['smoothed'] = yhat
+ plot_raw_vs_smooth(x, y, xhat, yhat, f'{data.name}_{method}')
+ out.to_excel(ospath.join(folder, f'results/data_cleaning/excel/smoothed/{data.name}_{method}.xlsx'))
+ else: # multiple y
+ for col, y in data.items():
+ fd = FDataGrid(y.to_numpy(), x)
+ fd_smoothed = smoother.fit_transform(fd)
+ yhat = pd.Series(fd_smoothed.data_matrix.flatten())
+ out[col+'_smoothed'] = yhat
+ plot_raw_vs_smooth(x, y, xhat, yhat, f'{col}_{method}')
+ out.to_excel(ospath.join(folder, f'results/data_cleaning/excel/smoothed/{file_name}_{method}.xlsx'))
+
+#%% Kernel smoothing commands
+from skfda.misc.kernels import (
+ normal,
+ # cosine,
+ # epanechnikov,
+ # tri_weight,
+ # quartic,
+ # uniform,
+ )
+
+#%% SCADA data smoothing (need to check one by one manually)
+# Bandwidths below were tuned per series by visual inspection of the
+# raw-vs-smoothed plots; re-check them if the raw data changes.
+
+# Flow & volume
+kernel_smoothing(data=scada.flow_mix, method='llr', kernel=normal, bw=0.1)
+kernel_smoothing(data=scada.flow_pbr, method='llr', kernel=normal, bw=0.5)
+kernel_smoothing(data=scada.flow_mem, method='llr', kernel=normal, bw=0.1)
+kernel_smoothing(data=scada.flow_ret, method='llr', kernel=normal, bw=0.1)
+kernel_smoothing(data=scada.flow_eff, method='llr', kernel=normal, bw=0.1)
+# kernel_smoothing(data=scada.flow_cen, method='llr', kernel=normal, bw=0.1) # use as it is
+
+kernel_smoothing(data=scada.vol_mix, method='llr', kernel=normal, bw=0.2)
+kernel_smoothing(data=scada.vol_mem, method='llr', kernel=normal, bw=0.5)
+kernel_smoothing(data=scada.vol_ret, method='llr', kernel=normal, bw=0.2)
+
+# Temperature & light
+kernel_smoothing(data=scada.temp, method='llr', kernel=normal, bw=0.1)
+kernel_smoothing(data=scada.light, method='llr', kernel=normal, bw=0.001)
+
+# Aqueous species
+kernel_smoothing(data=scada.do, method='llr', kernel=normal, bw=0.001)
+kernel_smoothing(data=scada.nh4_inf, method='llr', kernel=normal, bw=0.5)
+kernel_smoothing(data=scada.nh4_eff, method='llr', kernel=normal, bw=0.5)
+kernel_smoothing(data=scada.no3_eff, method='llr', kernel=normal, bw=0.8)
+kernel_smoothing(data=scada.po4_inf, method='llr', kernel=normal, bw=0.5)
+kernel_smoothing(data=scada.po4_eff, method='llr', kernel=normal, bw=0.5)
+kernel_smoothing(data=scada.tss_ret, method='llr', kernel=normal, bw=0.7)
+
+#%% SCADA vs. AIMS differences smoothing (need to check one by one manually)
+# Smooth the sparse AIMS-minus-SCADA differences so they can later be
+# interpolated onto the SCADA timestamps in `correct_scada`.
+
+var = ['nh4_inf', 'nh4_eff', 'no3_eff', 'po4_inf', 'po4_eff',
+       'tss_harv', 'tss_postmix']
+
+kernel_smoothing(data=dct_compare[var[0]][var[0]+'_diff'], method='llr', kernel=normal, bw=0.8)
+kernel_smoothing(data=dct_compare[var[1]][var[1]+'_diff'], method='llr', kernel=normal, bw=0.6)
+kernel_smoothing(data=dct_compare[var[2]][var[2]+'_diff'], method='llr', kernel=normal, bw=0.8)
+kernel_smoothing(data=dct_compare[var[3]][var[3]+'_diff'], method='llr', kernel=normal, bw=1)
+kernel_smoothing(data=dct_compare[var[4]][var[4]+'_diff'], method='llr', kernel=normal, bw=1)
+kernel_smoothing(data=dct_compare[var[5]][var[5]+'_diff'], method='llr', kernel=normal, bw=1)
+kernel_smoothing(data=dct_compare[var[6]][var[6]+'_diff'], method='llr', kernel=normal, bw=2)
+
+#%% Interpolate diff & correct smoothed SCADA
+
+t_stamp = scada.index
+var = ['nh4_inf', 'nh4_eff', 'no3_eff', 'po4_inf', 'po4_eff', 'tss_harv', 'tss_postmix']
+
+def plot_raw_vs_corrected(df1, df2, df3, save_as=''):
+ fig, ax = plt.subplots(figsize=(15, 5))
+ l1, = ax.plot(df1, label='Raw SCADA')
+ l2, = ax.plot(df2, label='Raw AIMS')
+ l3, = ax.plot(df3, label='Corrected')
+ ax.xaxis.set_major_locator(dates.DayLocator(interval=5))
+ ax.xaxis.set_major_formatter(fmt_day)
+ ax.set_xlabel('Time [day]')
+ ax.legend(handles=[l1,l2,l3], title=save_as)
+ path = ospath.join(plot_path, f'corrected/{save_as}.png')
+ fig.savefig(path, dpi=300, facecolor='white')
+
+def correct_scada(t_stamp, var, smoothing_method='llr'):
+    '''
+    Correct the smoothed SCADA series toward the AIMS lab values: for each
+    variable, interpolate its smoothed AIMS-minus-SCADA difference onto the
+    SCADA timestamps, add it to the smoothed SCADA series, and clip negative
+    results to zero. Writes one sheet per variable to scada_corrected.xlsx,
+    the interpolated differences to interpolated_smoothed_diff.xlsx, and a
+    raw-vs-corrected plot per variable.
+    '''
+    t_intp = calc_day(t_stamp).to_numpy()  # SCADA timestamps as day numbers
+    diff_intp = pd.DataFrame()
+    diff_intp.index = t_stamp
+    diff_intp['t_interp'] = t_intp
+    with pd.ExcelWriter(ospath.join(folder, 'results/data_cleaning/excel/scada_corrected.xlsx')) as writer:
+        for i in var:
+            if i == 'tss_harv':
+                # Intentionally skipped: its smoothed diff exists on disk but
+                # no correction is applied — only the post-mix diff is used to
+                # correct tss_ret below.
+                pass
+            else:
+                diff_smoothed = load_data(ospath.join(folder, f'results/data_cleaning/excel/smoothed/{i}_diff_{smoothing_method}.xlsx'))
+                diff_smoothed_t = diff_smoothed.day
+                diff_smoothed_df = diff_smoothed.smoothed
+                # Linear interpolation of the smoothed diff onto SCADA times
+                diff_intp[i+'diff_smo_interp'] = diff = np.interp(t_intp, diff_smoothed_t, diff_smoothed_df)
+                if i == 'tss_postmix':
+                    # tss_postmix has no SCADA counterpart; the correction is
+                    # applied to the smoothed tss_ret series instead.
+                    scada_smoothed = load_data(ospath.join(folder, f'results/data_cleaning/excel/smoothed/tss_ret_{smoothing_method}.xlsx'))
+                    scada_smoothed.index = t_stamp
+                    corrected = scada_smoothed[i+'_corr'] = scada_smoothed.smoothed + diff
+                    corrected[corrected < 0] = 0  # concentrations cannot be negative
+                    scada_smoothed[i+'_corr'] = corrected
+                    scada_smoothed.to_excel(writer, sheet_name='tss_ret')
+                    raw_scada = scada.loc[:,'tss_ret']
+                else:
+                    scada_smoothed = load_data(ospath.join(folder, f'results/data_cleaning/excel/smoothed/{i}_{smoothing_method}.xlsx'))
+                    scada_smoothed.index = t_stamp
+                    corrected = scada_smoothed[i+'_corr'] = scada_smoothed.smoothed + diff
+                    corrected[corrected < 0] = 0  # concentrations cannot be negative
+                    scada_smoothed[i+'_corr'] = corrected
+                    scada_smoothed.to_excel(writer, sheet_name=i)
+                    raw_scada = scada.loc[:,i]
+                raw_aims = aims.loc[:,i].dropna(axis=0)
+                plot_raw_vs_corrected(raw_scada, raw_aims, corrected, i)
+    diff_intp.to_excel(ospath.join(folder, 'results/data_cleaning/excel/interpolated_smoothed_diff.xlsx'))
+
+correct_scada(t_stamp, var, smoothing_method='llr')
+
+#%% HRT of smoothed SCADA data
+
+flow = ['flow_mix', 'flow_pbr', 'flow_mem', 'flow_ret', 'flow_cen']
+vol = ['vol_mix', 'vol_pbr', 'vol_mem', 'vol_ret']
+
+def hrt(smoothing_method='llr'):
+    '''
+    Compute hydraulic retention time (volume / smoothed flow) for the first
+    four flows and write one sheet per flow to hrt.xlsx. flow_cen is excluded
+    (used as-is; see the smoothing commands above).
+    '''
+    with pd.ExcelWriter(ospath.join(folder, 'results/data_cleaning/excel/hrt.xlsx')) as writer:
+        for i in range(4):
+            flow_smoothed = load_data(ospath.join(folder, f'results/data_cleaning/excel/smoothed/{flow[i]}_{smoothing_method}.xlsx'))
+            if i == 1:
+                v = 77.49 # fixed: vol_pbr was not smoothed, treated as constant
+            else:
+                v = load_data(ospath.join(folder, f'results/data_cleaning/excel/smoothed/{vol[i]}_{smoothing_method}.xlsx')).smoothed
+            flow_smoothed[vol[i]] = v
+            flow_smoothed['hrt'] = v/flow_smoothed.smoothed
+            flow_smoothed.to_excel(writer, sheet_name=flow[i])
+
+    # NOTE(review): this sits outside the loop, so only the LAST flow's HRT
+    # histogram is shown — confirm whether one histogram per flow was intended.
+    plt.hist(flow_smoothed['hrt'], bins=15)
+    plt.show()
+
+hrt(smoothing_method='llr')
+
+#%% Split ratio
+
+def ratio(smoothing_method='llr'):
+    '''
+    Compute flow split ratios between process units from the smoothed flows
+    and write them to ratio.xlsx; show a histogram per ratio. The values in
+    the trailing comments are the approximate observed means.
+    '''
+    flow_pbr_smoothed = load_data(ospath.join(folder, f'results/data_cleaning/excel/smoothed/flow_pbr_{smoothing_method}.xlsx')).smoothed
+    flow_mem_smoothed = load_data(ospath.join(folder, f'results/data_cleaning/excel/smoothed/flow_mem_{smoothing_method}.xlsx')).smoothed
+    flow_eff_smoothed = load_data(ospath.join(folder, f'results/data_cleaning/excel/smoothed/flow_eff_{smoothing_method}.xlsx')).smoothed
+    # flow_cen is used unsmoothed; re-index so it divides element-wise against
+    # the smoothed series (NOTE(review): this assumes both carry the same
+    # number of rows in the same order — confirm against the Excel output).
+    flow_cen = scada['flow_cen']
+    flow_cen.index = flow_mem_smoothed.index
+
+    ratio_pbr_to_mem = flow_mem_smoothed/flow_pbr_smoothed # ME = 0.51
+    ratio_pbr_to_ret = 1 - ratio_pbr_to_mem # INT = 0.49
+
+    ratio_mem_to_eff = flow_eff_smoothed/flow_mem_smoothed # TE = 0.39
+    ratio_mem_to_postmem = 1 - ratio_mem_to_eff # RETEN = 0.61
+
+    ratio_postmem_to_cen = flow_cen/(flow_mem_smoothed - flow_eff_smoothed) # CE = 0.03
+    ratio_postmem_to_ret = 1 - ratio_postmem_to_cen # RE = 0.97
+
+    ratios = pd.DataFrame({'t_stamp_day': calc_day(t_stamp).to_numpy(),
+                           'ratio_pbr_to_mem (ME)':ratio_pbr_to_mem,
+                           'ratio_pbr_to_ret (INT)':ratio_pbr_to_ret,
+                           'ratio_mem_to_eff (TE)':ratio_mem_to_eff,
+                           'ratio_mem_to_postmem (RETEN)':ratio_mem_to_postmem,
+                           'ratio_postmem_to_cen (CE)':ratio_postmem_to_cen,
+                           'ratio_postmem_to_ret (RE)':ratio_postmem_to_ret})
+
+    ratios.to_excel(ospath.join(folder, 'results/data_cleaning/excel/ratio.xlsx'))
+
+    # One histogram per ratio column (skip the time column at position 0)
+    for i in range(6):
+        plt.hist(ratios.iloc[:,i+1], bins=15)
+        plt.show()
+
+ratio(smoothing_method='llr')
\ No newline at end of file
diff --git a/pytest.ini b/pytest.ini
index cbc3d078..9303d042 100644
--- a/pytest.ini
+++ b/pytest.ini
@@ -17,6 +17,7 @@ addopts =
--ignore-glob='exposan/new_generator/**'
--ignore-glob='exposan/pm2_batch/calibration.py'
--ignore-glob='exposan/pm2_ecorecover/calibration.py'
+ --ignore-glob='exposan/pm2_ecorecover/data_cleaning.py'
--ignore-glob='exposan/pou_disinfection/analyses/**'
norecursedirs =
build