From c0292578894826484a489a8c08d0ccf1be8cf2c1 Mon Sep 17 00:00:00 2001 From: grantbuster Date: Wed, 19 Aug 2020 09:49:26 -0600 Subject: [PATCH 1/6] added init kwargs and attrs for feature and output names --- phygnn/phygnn.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/phygnn/phygnn.py b/phygnn/phygnn.py index ecbd19d..114af5d 100644 --- a/phygnn/phygnn.py +++ b/phygnn/phygnn.py @@ -24,7 +24,8 @@ class PhysicsGuidedNeuralNetwork: def __init__(self, p_fun, hidden_layers, loss_weights=(0.5, 0.5), input_dims=1, output_dims=1, metric='mae', initializer=None, optimizer=None, - learning_rate=0.01, history=None): + learning_rate=0.01, history=None, + feature_names=None, output_names=None): """ Parameters ---------- @@ -62,6 +63,14 @@ def __init__(self, p_fun, hidden_layers, loss_weights=(0.5, 0.5), Optimizer learning rate. history : None | pd.dataframe Learning history if continuing a training session. + feature_names : list | tuple | None + Training feature names (strings). Mostly a convenience so that a + loaded-from-disk model will have declared feature names, making it + easier to feed in features for prediction. + output_names : list | tuple | None + Prediction output names (strings). Mostly a convenience so that a + loaded-from-disk model will have declared output names, making it + easier to understand prediction output. """ self._p_fun = p_fun @@ -73,6 +82,8 @@ def __init__(self, p_fun, hidden_layers, loss_weights=(0.5, 0.5), self._optimizer = None self._history = history self._learning_rate = learning_rate + self.feature_names = feature_names + self.output_names = output_names self.set_loss_weights(loss_weights) @@ -554,6 +565,8 @@ def save(self, fpath): 'learning_rate': self._learning_rate, 'weight_dict': weight_dict, 'history': self._history, + 'feature_names': self.feature_names, + 'output_names': self.output_names, } with open(fpath, 'wb') as f: From caf00a1a6c5d74dd0f59020dfc72291b4ec0e018 Mon Sep 17 00:00:00 2001 From: grantbuster Date: Wed, 19 Aug 2020 11:02:17 -0600 Subject: [PATCH 2/6] added better one-hot category encoding with column names that are the specified categories --- phygnn/pre_processing.py | 199 +++++++++++++++++++++++++++++------ tests/test_pre_processing.py | 45 +++++--- 2 files changed, 194 insertions(+), 50 deletions(-) diff --git a/phygnn/pre_processing.py b/phygnn/pre_processing.py index 1fd2301..a837b4f 100644 --- a/phygnn/pre_processing.py +++ b/phygnn/pre_processing.py @@ -12,22 +12,14 @@ class PreProcess: """Class to handle the pre-processing of feature data.""" - def __init__(self, features, categories={}): + def __init__(self, features): """ Parameters ---------- features : np.ndarray | pd.DataFrame Feature data in a 2D array or DataFrame. - categories : dict - Categories to use for one hot encoding. Empty dict results in - categories being determined automatically. Format: - { - 'col_name1' : ['cat1', 'cat2', 'cat3'], - 'col_name2' : ['other_cat1', 'other_cat2'] - } """ - self._categories = categories self._features = features self._pd = False if isinstance(self._features, pd.DataFrame): @@ -37,20 +29,75 @@ def __init__(self, features, categories={}): if not features.index.is_unique: raise AttributeError('DataFrame indices must be unique') - def process_one_hot(self, convert_int=False): - """Process str and int columns in the feature data to one-hot vectors. 
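+    # The one-hot logic that previously lived entirely in process_one_hot()
+    # is split into the helper methods below (_is_one_hot, _get_one_hot_data,
+    # _make_df_one_hot_cols_labels) so that each step can be reused and
+    # tested on its own.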
+ @staticmethod + def _is_one_hot(arr, convert_int=False): + """Check if an array of data is to be transformed into a one-hot vector + by sampling the first datum and checking the type. Parameters ---------- + arr : np.ndarray + Array (column) of data to be checked. convert_int : bool Flag to convert integer data to one-hot vectors. Returns ------- - processed : np.ndarray | pd.DataFrame - Feature data with str and int columns removed and one-hot boolean - vectors appended as new columns. + one_hot : bool + True if arr is to be transformed into a one-hot vector. """ + if len(arr.shape) == 1: + sample = arr[0] + elif len(arr.shape) == 2: + sample = arr[0, 0] + else: + e = 'Cannot process 3D column into one hot' + logger.error(e) + raise ValueError(e) + + one_hot = False + + if isinstance(sample, str): + one_hot = True + elif np.issubdtype(type(sample), np.integer) and convert_int: + one_hot = True + + return one_hot + + def _get_one_hot_data(self, convert_int=False, categories=None): + """Get one hot data and column indexes. + + Parameters + ---------- + convert_int : bool + Flag to convert integer data to one-hot vectors. + categories : dict | None + Categories to use for one hot encoding where a key is the original + column name in the feature dataframe and value is a list of the + possible unique values of the feature column. The value list must + have as many or more entries as unique values in the feature + column. This will name the feature column headers for the new + one-hot-encoding if features is a dataframe. Empty dict or None + results in category names being determined automatically. Format: + {'col_name1' : ['cat1', 'cat2', 'cat3'], + 'col_name2' : ['other_cat1', 'other_cat2']} + + Returns + ------- + one_hot_ind : list + List of numeric column indices in the native data that are + to-be-transformed into one-hot vectors. + one_hot_data : list + List of arrays of one hot data columns that are transformations of + the one_hot_ind columns. + numerical_ind : list + List of numeric column indices in the native data that are + continuous numerical columns that are not to-be-transformed into + one-hot vectors. + """ + + if categories is None: + categories = {} one_hot_ind = [] one_hot_data = [] @@ -68,46 +115,128 @@ def process_one_hot(self, convert_int=False): else: col = self._features[:, i].reshape((n, 1)) - sample = col[0, 0] - one_hot = False - - if isinstance(sample, str): - one_hot = True - elif np.issubdtype(type(sample), np.integer) and convert_int: - one_hot = True - - if one_hot: + if not self._is_one_hot(col, convert_int=convert_int): + numerical_ind.append(i) + else: logger.debug('One hot encoding {}'.format(col_name)) one_hot_ind.append(i) - if col_name in self._categories: - categories = [self._categories[col_name]] + + if col_name in categories: + cats = [categories[col_name]] logger.debug('Using categories {} for column {}' - ''.format(categories, col_name)) - oh_obj = OneHotEncoder(sparse=False, categories=categories) + ''.format(cats, col_name)) + oh_obj = OneHotEncoder(sparse=False, categories=cats) else: oh_obj = OneHotEncoder(sparse=False) + oh_obj.fit(col) one_hot_data.append(oh_obj.transform(col)) + + return one_hot_ind, one_hot_data, numerical_ind + + def _make_df_one_hot_cols_labels(self, one_hot_ind, one_hot_data, + categories=None): + """ + Parameters + ---------- + one_hot_ind : list + List of numeric column indices in the native data that are + to-be-transformed into one-hot vectors. 
+ one_hot_data : list + List of arrays of one hot data columns that are transformations of + the one_hot_ind columns. + categories : dict | None + Categories to use for one hot encoding where a key is the original + column name in the feature dataframe and value is a list of the + possible unique values of the feature column. The value list must + have as many or more entries as unique values in the feature + column. This will name the feature column headers for the new + one-hot-encoding if features is a dataframe. Empty dict or None + results in category names being determined automatically. Format: + {'col_name1' : ['cat1', 'cat2', 'cat3'], + 'col_name2' : ['other_cat1', 'other_cat2']} + """ + + if categories is None: + categories = {} + + col_labels = [] + for i, oh_ind in enumerate(one_hot_ind): + orig_col_label = self._features.columns.values[oh_ind] + if orig_col_label in categories: + cat_labels = categories[orig_col_label] + + msg = ('Values in the categories input dict must be a ' + 'list or tuple!') + assert isinstance(cat_labels, (list, tuple)), msg + + unique_vals = pd.unique(self._features[orig_col_label]) + msg = ('Categories for "{a}" one-hot column had fewer unique ' + 'entries than one-hot encodings! You input these ' + 'categories: {b} but "{a}" has these values: {c}' + .format(a=orig_col_label, b=cat_labels, c=unique_vals)) + assert len(cat_labels) >= len(unique_vals), msg + + if isinstance(cat_labels, tuple): + cat_labels = list(cat_labels) + + col_labels += cat_labels else: - numerical_ind.append(i) + def_labels = [orig_col_label + '_' + str(k) + for k in range(one_hot_data[i].shape[1])] + col_labels += def_labels + + return col_labels + + def process_one_hot(self, convert_int=False, categories=None): + """Process str and int columns in the feature data to one-hot vectors. + + Parameters + ---------- + convert_int : bool + Flag to convert integer data to one-hot vectors. + categories : dict | None + Categories to use for one hot encoding where a key is the original + column name in the feature dataframe and value is a list of the + possible unique values of the feature column. The value list must + have as many or more entries as unique values in the feature + column. This will name the feature column headers for the new + one-hot-encoding if features is a dataframe. Empty dict or None + results in category names being determined automatically. Format: + {'col_name1' : ['cat1', 'cat2', 'cat3'], + 'col_name2' : ['other_cat1', 'other_cat2']} + + Returns + ------- + processed : np.ndarray | pd.DataFrame + Feature data with str and int columns removed and one-hot boolean + vectors appended as new columns. If features is a dataframe and + categories is input, the new one-hot columns will be named + according to categories. 
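+
+        Examples
+        --------
+        A minimal sketch with hypothetical data, showing how the
+        categories input names the new one-hot columns:
+
+        >>> df = pd.DataFrame({'f1': ['a', 'b', 'a'],
+        ...                    'f2': [0.1, 0.2, 0.3]})
+        >>> PreProcess(df).process_one_hot(
+        ...     categories={'f1': ['a', 'b', 'c']}).columns.tolist()
+        ['f2', 'a', 'b', 'c']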
+ """ + + if categories is None: + categories = {} + + one_hot_ind, one_hot_data, numerical_ind = self._get_one_hot_data( + convert_int=convert_int, categories=categories) if not one_hot_ind: return self._features - if one_hot_ind: + else: if self._pd: num_df = self._features.iloc[:, numerical_ind] - cols = [[self._features.columns[j] + '_' + str(k) - for k in range(one_hot_data[i].shape[1])] - for i, j in enumerate(one_hot_ind)] - cols = [a for sublist in cols for a in sublist] + col_labels = self._make_df_one_hot_cols_labels(one_hot_ind, + one_hot_data, + categories) one_hot_df = pd.DataFrame(np.hstack(one_hot_data), - columns=cols, + columns=col_labels, index=self._features.index) processed = num_df.join(one_hot_df) - assert processed.shape[0] == num_df.shape[0] == \ one_hot_df.shape[0] + else: processed = np.hstack((self._features[:, numerical_ind], np.hstack(one_hot_data))) diff --git a/tests/test_pre_processing.py b/tests/test_pre_processing.py index 6ce2188..2667554 100644 --- a/tests/test_pre_processing.py +++ b/tests/test_pre_processing.py @@ -9,7 +9,7 @@ index = pd.date_range('20180101', '20190101', freq='5min') A = pd.DataFrame({'f1': ['a', 'b', 'c', 'd', 'a', 'c', 'a'], 'f2': np.arange(7) * 0.333, - 'f3': np.arange(7)}, index=index[:7]) + 'f3': np.arange(7, dtype=int)}, index=index[:7]) def test_one_hot_encoding(): @@ -44,20 +44,35 @@ def test_one_hot_encoding(): def test_categories(): """ Verify predefined categories handle missing data """ proc = PreProcess(A) - np_out = proc.process_one_hot(convert_int=False) - assert (np_out.columns == ['f2', 'f3', 'f1_0', 'f1_1', - 'f1_2', 'f1_3']).all() + out = proc.process_one_hot(convert_int=False) + assert (out.columns == ['f2', 'f3', 'f1_0', 'f1_1', + 'f1_2', 'f1_3']).all() # Verify columns are created for missing categories - proc = PreProcess(A, categories={'f1': ['a', 'b', 'c', 'd', 'missing']}) - np_out = proc.process_one_hot(convert_int=False) - assert (np_out.columns == ['f2', 'f3', 'f1_0', 'f1_1', - 'f1_2', 'f1_3', 'f1_4']).all() - assert (np_out['f1_4'] == np.zeros(7)).all() + # and that the new one-hot columns have names corresponding to their values + proc = PreProcess(A) + out0 = proc.process_one_hot( + convert_int=False, categories={'f1': ['a', 'b', 'c', 'd', 'missing']}) + assert (out0.columns == ['f2', 'f3', 'a', 'b', 'c', 'd', 'missing']).all() + assert (out0['missing'] == np.zeros(7)).all() - # Verify columns are created for missing categories in correct order - proc = PreProcess(A, categories={'f1': ['missing', 'a', 'b', 'c', 'd']}) - np_out = proc.process_one_hot(convert_int=False) - assert (np_out.columns == ['f2', 'f3', 'f1_0', 'f1_1', - 'f1_2', 'f1_3', 'f1_4']).all() - assert (np_out['f1_0'] == np.zeros(7)).all() + # verify ordering works. + out1 = proc.process_one_hot( + convert_int=False, categories={'f1': ['missing', 'd', 'c', 'a', 'b']}) + assert (out1.columns == ['f2', 'f3', 'missing', 'd', 'c', 'a', 'b']).all() + assert all(out0.a == out1.a) + assert all(out0.b == out1.b) + assert all(out0.c == out1.c) + assert all(out0.d == out1.d) + assert (out1['missing'] == np.zeros(7)).all() + assert out1.a.values[0] == 1 + assert out1.a.values[1] == 0 + assert out1.a.values[2] == 0 + assert out1.a.values[3] == 0 + assert out1.a.values[4] == 1 + + # Verify good error with bad categories input. 
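+    # (sklearn's OneHotEncoder raises "ValueError: Found unknown categories"
+    # during fit when the data contains a value that is missing from the
+    # declared category list)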
+ try: + proc.process_one_hot(categories={'f1': ['a', 'b', 'c']}) + except ValueError as e: + assert 'Found unknown categories' in str(e) From 2be1c5cec1cecb7930b71d9e4d709d7652227d8d Mon Sep 17 00:00:00 2001 From: grantbuster Date: Wed, 19 Aug 2020 11:39:20 -0600 Subject: [PATCH 3/6] added feature to accept dataframes as input to phygnn with column label checking --- phygnn/phygnn.py | 133 ++++++++++++++++++++++++++++++++++++------- tests/test_phygnn.py | 40 ++++++++++++- 2 files changed, 150 insertions(+), 23 deletions(-) diff --git a/phygnn/phygnn.py b/phygnn/phygnn.py index 114af5d..d5a2454 100644 --- a/phygnn/phygnn.py +++ b/phygnn/phygnn.py @@ -61,16 +61,18 @@ def __init__(self, p_fun, hidden_layers, loss_weights=(0.5, 0.5), None defaults to Adam. learning_rate : float Optimizer learning rate. - history : None | pd.dataframe + history : None | pd.DataFrame Learning history if continuing a training session. feature_names : list | tuple | None Training feature names (strings). Mostly a convenience so that a loaded-from-disk model will have declared feature names, making it - easier to feed in features for prediction. + easier to feed in features for prediction. This will also get set + if phygnn is trained on a DataFrame. output_names : list | tuple | None Prediction output names (strings). Mostly a convenience so that a loaded-from-disk model will have declared output names, making it - easier to understand prediction output. + easier to understand prediction output. This will also get set + if phygnn is trained on a DataFrame. """ self._p_fun = p_fun @@ -118,7 +120,7 @@ def _check_shapes(x, y): assert len(x) == len(y), 'Number of input observations dont match!' return True - def _preflight_p_fun(self, x, y_true, p, p_kwargs): + def preflight_p_fun(self, x, y_true, p, p_kwargs): """Run a pre-flight check making sure the p_fun is differentiable.""" if p_kwargs is None: @@ -153,8 +155,37 @@ def _preflight_p_fun(self, x, y_true, p, p_kwargs): logger.debug('p_fun passed preflight check.') - def _preflight_data(self, x, y, p): - """Run simple preflight checks on data shapes.""" + def preflight_data(self, x, y, p): + """Run simple preflight checks on data shapes. + + Parameters + ---------- + x : np.ndarray | pd.DataFrame + Feature data in a 2D array or DataFrame. If this is a DataFrame, + the index is ignored, the columns are used with self.feature_names, + and the df is converted into a numpy array for batching and passing + to the training algorithm. + y : np.ndarray | pd.DataFrame + Known output data in a 2D array or DataFrame. If this is a + DataFrame, the index is ignored, the columns are used with + self.output_names, and the df is converted into a numpy array for + batching and passing to the training algorithm. + p : np.ndarray | pd.DataFrame + Supplemental feature data for the physics loss function in 2D array + or DataFrame. If this is a DataFrame, the index and column labels + are ignored and the df is converted into a numpy array for batching + and passing to the training algorithm and physical loss function. 
+ + Returns + ---------- + x : np.ndarray + Feature data in a 2D array + y : np.ndarray + Known output data in a 2D array + p : np.ndarray + Supplemental feature data for the physics loss function in 2D array + """ + self._check_shapes(x, y) self._check_shapes(x, p) x_msg = ('x data has {} features but expected {}' @@ -164,13 +195,59 @@ def _preflight_data(self, x, y, p): assert x.shape[1] == self._input_dims, x_msg assert y.shape[1] == self._output_dims, y_msg - def _preflight_predict(self, x): - """Run simple preflight checks on feature data shape for prediction.""" - assert len(x.shape) == 2, 'PhyGNN can only predict on 2D data!' + x = self.preflight_features(x) + + if isinstance(y, pd.DataFrame): + y_cols = y.columns.values.tolist() + if self.output_names is None: + self.output_names = y_cols + else: + msg = ('Cannot work with input y columns: {}, previously set ' + 'output names are: {}' + .format(y_cols, self.output_names)) + assert self.output_names == y_cols, msg + y = y.values + + if isinstance(p, pd.DataFrame): + p = p.values + + return x, y, p + + def preflight_features(self, x): + """Run preflight checks and data conversions on feature data. + + Parameters + ---------- + x : np.ndarray | pd.DataFrame + Feature data in a 2D array or DataFrame. If this is a DataFrame, + the index is ignored, the columns are used with self.feature_names, + and the df is converted into a numpy array for batching and passing + to the training algorithm. + + Returns + ---------- + x : np.ndarray + Feature data in a 2D array + """ + + assert len(x.shape) == 2, 'PhyGNN can only use 2D data as input!' x_msg = ('x data has {} features but expected {}' .format(x.shape[1], self._input_dims)) assert x.shape[1] == self._input_dims, x_msg + if isinstance(x, pd.DataFrame): + x_cols = x.columns.values.tolist() + if self.feature_names is None: + self.feature_names = x_cols + else: + msg = ('Cannot work with input x columns: {}, previously set ' + 'feature names are: {}' + .format(x_cols, self.feature_names)) + assert self.feature_names == x_cols, msg + x = x.values + + return x + @staticmethod def seed(s=0): """Set the random seed for reproducable results.""" @@ -179,7 +256,7 @@ def seed(s=0): @property def history(self): - """Get the training history dataframe (None if not yet trained).""" + """Get the training history DataFrame (None if not yet trained).""" return self._history @property @@ -234,6 +311,10 @@ def loss(self, y_predicted, y_true, p, p_kwargs): Sum of the NN loss function comparing the y_predicted against y_true and the physical loss function (self._p_fun) with respective weights applied. + nn_loss : tf.tensor + Standard NN training loss comparing y to y_predicted. + p_loss : tf.tensor + Physics loss from p_fun. """ if p_kwargs is None: @@ -299,8 +380,8 @@ def _get_grad(self, x, y_true, p, p_kwargs): return grad, loss - def _run_sgd(self, x, y_true, p, p_kwargs): - """Run stochastic gradient descent for one mini-batch of (x, y_true) + def _run_gradient_descent(self, x, y_true, p, p_kwargs): + """Run gradient descent for one mini-batch of (x, y_true) and adjust NN weights.""" grad, loss = self._get_grad(x, y_true, p, p_kwargs) self._optimizer.apply_gradients(zip(grad, self.weights)) @@ -431,12 +512,21 @@ def fit(self, x, y, p, n_batch=16, n_epoch=10, shuffle=True, Parameters ---------- - x : np.ndarray - Feature data in a 2D array - y : np.ndarray - Known output data in a 2D array. - p : np.ndarray + x : np.ndarray | pd.DataFrame + Feature data in a 2D array or DataFrame. 
If this is a DataFrame, + the index is ignored, the columns are used with self.feature_names, + and the df is converted into a numpy array for batching and passing + to the training algorithm. + y : np.ndarray | pd.DataFrame + Known output data in a 2D array or DataFrame. If this is a + DataFrame, the index is ignored, the columns are used with + self.output_names, and the df is converted into a numpy array for + batching and passing to the training algorithm. + p : np.ndarray | pd.DataFrame Supplemental feature data for the physics loss function in 2D array + or DataFrame. If this is a DataFrame, the index and column labels + are ignored and the df is converted into a numpy array for batching + and passing to the training algorithm and physical loss function. n_batch : int Number of times to update the NN weights per epoch (number of mini-batches). The training data will be split into this many @@ -462,7 +552,7 @@ def fit(self, x, y, p, n_batch=16, n_epoch=10, shuffle=True, Namespace of training parameters that can be used for diagnostics. """ - self._preflight_data(x, y, p) + x, y, p = self.preflight_data(x, y, p) epochs = list(range(n_epoch)) @@ -477,7 +567,7 @@ def fit(self, x, y, p, n_batch=16, n_epoch=10, shuffle=True, x, y, p, shuffle=shuffle, validation_split=validation_split) if self._loss_weights[1] > 0 and run_preflight: - self._preflight_p_fun(x_val, y_val, p_val, p_kwargs) + self.preflight_p_fun(x_val, y_val, p_val, p_kwargs) t0 = time.time() for epoch in epochs: @@ -487,7 +577,8 @@ def fit(self, x, y, p, n_batch=16, n_epoch=10, shuffle=True, batch_iter = zip(x_batches, y_batches, p_batches) for x_batch, y_batch, p_batch in batch_iter: - tr_loss = self._run_sgd(x_batch, y_batch, p_batch, p_kwargs)[1] + tr_loss = self._run_gradient_descent( + x_batch, y_batch, p_batch, p_kwargs)[1] y_val_pred = self.predict(x_val, to_numpy=False) val_loss = self.loss(y_val_pred, y_val, p_val, p_kwargs)[0] @@ -523,7 +614,7 @@ def predict(self, x, to_numpy=True): Predicted output data in a 2D array. 
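+
+        Examples
+        --------
+        A minimal sketch (hypothetical model with two declared feature
+        names) of predicting on a labeled DataFrame; the column labels
+        are checked against self.feature_names before the values are run
+        through the network:
+
+        >>> x_df = pd.DataFrame(np.random.rand(8, 2), columns=['a', 'b'])
+        >>> y_pred = model.predict(x_df)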
""" - self._preflight_predict(x) + x = self.preflight_features(x) y = self._layers[0](x) for layer in self._layers[1:]: y = layer(y) diff --git a/tests/test_phygnn.py b/tests/test_phygnn.py index d525a5e..338b825 100644 --- a/tests/test_phygnn.py +++ b/tests/test_phygnn.py @@ -5,6 +5,7 @@ import os import pytest import numpy as np +import pandas as pd import tensorflow as tf from phygnn import PhysicsGuidedNeuralNetwork, TESTDATADIR @@ -77,7 +78,9 @@ def test_nn(): model = PhysicsGuidedNeuralNetwork(p_fun=p_fun_pythag, hidden_layers=HIDDEN_LAYERS, loss_weights=(1.0, 0.0), - input_dims=2, output_dims=1) + input_dims=2, output_dims=1, + feature_names=['a', 'b'], + output_names=['c']) model.fit(X, Y_NOISE, P, n_batch=4, n_epoch=20) test_mae = np.mean(np.abs(model.predict(X) - Y)) @@ -107,13 +110,44 @@ def test_phygnn(): assert test_mae < 0.015 +def test_df_input(): + """Test the operation of the PGNN with labeled input dataframes.""" + PhysicsGuidedNeuralNetwork.seed(0) + model = PhysicsGuidedNeuralNetwork(p_fun=p_fun_pythag, + hidden_layers=HIDDEN_LAYERS, + loss_weights=(0.0, 1.0), + input_dims=2, output_dims=1) + x_df = pd.DataFrame(X, columns=('a', 'b')) + y_df = pd.DataFrame(Y_NOISE, columns=('c',)) + p_df = pd.DataFrame(P, columns=('a', 'b')) + model.fit(x_df, y_df, p_df, n_batch=1, n_epoch=2) + + assert model.feature_names == ['a', 'b'] + assert model.output_names == ['c'] + + x_df_bad = pd.DataFrame(X, columns=('x1', 'x2')) + y_df_bad = pd.DataFrame(Y_NOISE, columns=('y',)) + + try: + model.fit(x_df_bad, y_df_bad, p_df, n_batch=1, n_epoch=2) + except AssertionError as e: + assert "Cannot work with input x columns: ['x1', 'x2']" in str(e) + + try: + model.fit(x_df, y_df_bad, p_df, n_batch=1, n_epoch=2) + except AssertionError as e: + assert "Cannot work with input y columns: ['y']" in str(e) + + def test_save_load(): """Test the save/load operations of PGNN""" PhysicsGuidedNeuralNetwork.seed(0) model = PhysicsGuidedNeuralNetwork(p_fun=p_fun_pythag, hidden_layers=HIDDEN_LAYERS, loss_weights=(0.0, 1.0), - input_dims=2, output_dims=1) + input_dims=2, output_dims=1, + feature_names=['a', 'b'], + output_names=['c']) model.fit(X, Y_NOISE, P, n_batch=4, n_epoch=20) y_pred = model.predict(X) @@ -122,6 +156,8 @@ def test_save_load(): loaded = PhysicsGuidedNeuralNetwork.load(FPATH) y_pred_loaded = loaded.predict(X) assert np.allclose(y_pred, y_pred_loaded) + assert loaded.feature_names == ['a', 'b'] + assert loaded.output_names == ['c'] os.remove(FPATH) From 61c9081aa04f6e7d40c5fe7b14bb9bf98b86394b Mon Sep 17 00:00:00 2001 From: grantbuster Date: Wed, 19 Aug 2020 13:14:06 -0600 Subject: [PATCH 4/6] added kernel and bias regularization kwargs and logic --- phygnn/phygnn.py | 115 ++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 109 insertions(+), 6 deletions(-) diff --git a/phygnn/phygnn.py b/phygnn/phygnn.py index d5a2454..5b56b24 100644 --- a/phygnn/phygnn.py +++ b/phygnn/phygnn.py @@ -10,7 +10,8 @@ import pandas as pd import logging import tensorflow as tf -from tensorflow.keras import layers, optimizers, initializers +from tensorflow.keras import optimizers, initializers +from tensorflow.keras.layers import InputLayer, Dense, Dropout from phygnn.loss_metrics import METRICS @@ -25,6 +26,8 @@ def __init__(self, p_fun, hidden_layers, loss_weights=(0.5, 0.5), input_dims=1, output_dims=1, metric='mae', initializer=None, optimizer=None, learning_rate=0.01, history=None, + kernel_reg_rate=0.0, kernel_reg_power=1, + bias_reg_rate=0.0, bias_reg_power=1, feature_names=None, 
output_names=None): """ Parameters @@ -63,6 +66,26 @@ def __init__(self, p_fun, hidden_layers, loss_weights=(0.5, 0.5), Optimizer learning rate. history : None | pd.DataFrame Learning history if continuing a training session. + kernel_reg_rate : float + Kernel regularization rate. Increasing this value above zero will + add a structural loss term to the loss function that + disincentivizes large hidden layer weights and should reduce + model complexity. Setting this to 0.0 will disable kernel + regularization. + kernel_reg_power : int + Kernel regularization power. kernel_reg_power=1 is L1 + regularization (lasso regression), and kernel_reg_power=2 is L2 + regularization (ridge regression). + bias_reg_rate : float + Bias regularization rate. Increasing this value above zero will + add a structural loss term to the loss function that + disincentivizes large hidden layer biases and should reduce + model complexity. Setting this to 0.0 will disable bias + regularization. + bias_reg_power : int + Bias regularization power. bias_reg_power=1 is L1 + regularization (lasso regression), and bias_reg_power=2 is L2 + regularization (ridge regression). feature_names : list | tuple | None Training feature names (strings). Mostly a convenience so that a loaded-from-disk model will have declared feature names, making it @@ -84,6 +107,10 @@ def __init__(self, p_fun, hidden_layers, loss_weights=(0.5, 0.5), self._optimizer = None self._history = history self._learning_rate = learning_rate + self.kernel_reg_rate = kernel_reg_rate + self.kernel_reg_power = kernel_reg_power + self.bias_reg_rate = bias_reg_rate + self.bias_reg_power = bias_reg_power self.feature_names = feature_names self.output_names = output_names @@ -106,10 +133,10 @@ def __init__(self, p_fun, hidden_layers, loss_weights=(0.5, 0.5), if optimizer is None: self._optimizer = optimizers.Adam(learning_rate=learning_rate) - self._layers.append(layers.InputLayer(input_shape=[input_dims])) + self._layers.append(InputLayer(input_shape=[input_dims])) for hidden_layer in hidden_layers: self.add_layer(hidden_layer) - self._layers.append(layers.Dense( + self._layers.append(Dense( output_dims, kernel_initializer=self._initializer)) @staticmethod @@ -272,6 +299,38 @@ def weights(self): weights += layer.variables return weights + @property + def kernel_weights(self): + """Get a list of the NN kernel weights + + (can be used for kernel regularization). + + Does not include input layer or dropout layers. + Does include the output layer. + """ + weights = [] + for layer in self.layers: + if isinstance(layer, Dense): + weights.append(layer.get_weights()[0]) + + return weights + + @property + def bias_weights(self): + """Get a list of the NN bias weights + + (can be used for bias regularization). + + Does not include input layer or dropout layers. + Does include the output layer. + """ + weights = [] + for layer in self.layers: + if isinstance(layer, Dense): + weights.append(layer.get_weights()[1]) + + return weights + def reset_history(self): """Erase previous training history without resetting trained weights""" self._history = None @@ -291,6 +350,30 @@ def set_loss_weights(self, loss_weights): assert len(loss_weights) == 2, 'loss_weights can only have two values!' 
self._loss_weights = loss_weights + @property + def kernel_regularization_term(self): + """Get the regularization term for the kernel regularization without + the regularization rate applied.""" + loss_k_reg = [tf.math.abs(x) for x in self.kernel_weights] + loss_k_reg = [tf.math.pow(x, self.kernel_reg_power) + for x in loss_k_reg] + loss_k_reg = tf.math.reduce_sum( + [tf.math.reduce_sum(x) for x in loss_k_reg]) + + return loss_k_reg + + @property + def bias_regularization_term(self): + """Get the regularization term for the bias regularization without + the regularization rate applied.""" + loss_b_reg = [tf.math.abs(x) for x in self.bias_weights] + loss_b_reg = [tf.math.pow(x, self.bias_reg_power) + for x in loss_b_reg] + loss_b_reg = tf.math.reduce_sum( + [tf.math.reduce_sum(x) for x in loss_b_reg]) + + return loss_b_reg + def loss(self, y_predicted, y_true, p, p_kwargs): """Calculate the loss function by comparing model-predicted y to y_true @@ -335,8 +418,24 @@ def loss(self, y_predicted, y_true, p, p_kwargs): logger.debug('NN Loss: {:.2e}, P Loss: {:.2e}, Total Loss: {:.2e}' .format(nn_loss, p_loss, loss)) + if self.kernel_reg_rate > 0: + loss_kernel_reg = (self.kernel_regularization_term + * self.kernel_reg_rate) + loss += loss_kernel_reg + logger.debug('Kernel regularization loss: {:.2e}, ' + 'Total Loss: {:.2e}'.format(loss_kernel_reg, loss)) + + if self.bias_reg_rate > 0: + loss_bias_reg = (self.bias_regularization_term + * self.bias_reg_rate) + loss += loss_bias_reg + logger.debug('Bias regularization loss: {:.2e}, ' + 'Total Loss: {:.2e}'.format(loss_bias_reg, loss)) + if tf.math.is_nan(loss): - raise ArithmeticError('Loss is nan.') + msg = 'phygnn calculated a NaN loss value!' + logger.error(msg) + raise ArithmeticError(msg) return loss, nn_loss, p_loss @@ -355,14 +454,14 @@ def add_layer(self, layer_kwargs, insert_index=None): """ dropout = layer_kwargs.pop('dropout', None) - layer = layers.Dense(**layer_kwargs) + layer = Dense(**layer_kwargs) if insert_index: self._layers.insert(insert_index, layer) else: self._layers.append(layer) if dropout is not None: - d_layer = layers.Dropout(dropout) + d_layer = Dropout(dropout) if insert_index: self._layers.insert(insert_index + 1, d_layer) else: @@ -656,6 +755,10 @@ def save(self, fpath): 'learning_rate': self._learning_rate, 'weight_dict': weight_dict, 'history': self._history, + 'kernel_reg_rate': self.kernel_reg_rate, + 'kernel_reg_power': self.kernel_reg_power, + 'bias_reg_rate': self.bias_reg_rate, + 'bias_reg_power': self.bias_reg_power, 'feature_names': self.feature_names, 'output_names': self.output_names, } From 0f400aada4cbefa8c3e5dfb4cebe7a24ef069934 Mon Sep 17 00:00:00 2001 From: grantbuster Date: Wed, 19 Aug 2020 16:46:30 -0600 Subject: [PATCH 5/6] added kernel and bias regularization terms and kwargs to phygnn --- phygnn/phygnn.py | 61 +++++++++++++++++++------------------- tests/test_phygnn.py | 70 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 100 insertions(+), 31 deletions(-) diff --git a/phygnn/phygnn.py b/phygnn/phygnn.py index 5b56b24..87e0205 100644 --- a/phygnn/phygnn.py +++ b/phygnn/phygnn.py @@ -297,11 +297,12 @@ def weights(self): weights = [] for layer in self._layers: weights += layer.variables + return weights @property def kernel_weights(self): - """Get a list of the NN kernel weights + """Get a list of the NN kernel weights (tensors) (can be used for kernel regularization). 
@@ -311,13 +312,13 @@ def kernel_weights(self): weights = [] for layer in self.layers: if isinstance(layer, Dense): - weights.append(layer.get_weights()[0]) + weights.append(layer.variables[0]) return weights @property def bias_weights(self): - """Get a list of the NN bias weights + """Get a list of the NN bias weights (tensors) (can be used for bias regularization). @@ -327,31 +328,12 @@ def bias_weights(self): weights = [] for layer in self.layers: if isinstance(layer, Dense): - weights.append(layer.get_weights()[1]) + weights.append(layer.variables[1]) return weights - def reset_history(self): - """Erase previous training history without resetting trained weights""" - self._history = None - - def set_loss_weights(self, loss_weights): - """Set new loss weights - - Parameters - ---------- - loss_weights : tuple - Loss weights for the neural network y_predicted vs. y_true - and for the p_fun loss, respectively. For example, - loss_weights=(0.0, 1.0) would simplify the PGNN loss function - to just the p_fun output. - """ - assert np.sum(loss_weights) > 0, 'Sum of loss_weights must be > 0!' - assert len(loss_weights) == 2, 'loss_weights can only have two values!' - self._loss_weights = loss_weights - @property - def kernel_regularization_term(self): + def kernel_reg_term(self): """Get the regularization term for the kernel regularization without the regularization rate applied.""" loss_k_reg = [tf.math.abs(x) for x in self.kernel_weights] @@ -363,7 +345,7 @@ def kernel_regularization_term(self): return loss_k_reg @property - def bias_regularization_term(self): + def bias_reg_term(self): """Get the regularization term for the bias regularization without the regularization rate applied.""" loss_b_reg = [tf.math.abs(x) for x in self.bias_weights] @@ -374,6 +356,25 @@ def bias_regularization_term(self): return loss_b_reg + def reset_history(self): + """Erase previous training history without resetting trained weights""" + self._history = None + + def set_loss_weights(self, loss_weights): + """Set new loss weights + + Parameters + ---------- + loss_weights : tuple + Loss weights for the neural network y_predicted vs. y_true + and for the p_fun loss, respectively. For example, + loss_weights=(0.0, 1.0) would simplify the PGNN loss function + to just the p_fun output. + """ + assert np.sum(loss_weights) > 0, 'Sum of loss_weights must be > 0!' + assert len(loss_weights) == 2, 'loss_weights can only have two values!' 
+ self._loss_weights = loss_weights + def loss(self, y_predicted, y_true, p, p_kwargs): """Calculate the loss function by comparing model-predicted y to y_true @@ -418,16 +419,14 @@ def loss(self, y_predicted, y_true, p, p_kwargs): logger.debug('NN Loss: {:.2e}, P Loss: {:.2e}, Total Loss: {:.2e}' .format(nn_loss, p_loss, loss)) - if self.kernel_reg_rate > 0: - loss_kernel_reg = (self.kernel_regularization_term - * self.kernel_reg_rate) + if self.kernel_reg_rate != 0: + loss_kernel_reg = self.kernel_reg_term * self.kernel_reg_rate loss += loss_kernel_reg logger.debug('Kernel regularization loss: {:.2e}, ' 'Total Loss: {:.2e}'.format(loss_kernel_reg, loss)) - if self.bias_reg_rate > 0: - loss_bias_reg = (self.bias_regularization_term - * self.bias_reg_rate) + if self.bias_reg_rate != 0: + loss_bias_reg = self.bias_reg_term * self.bias_reg_rate loss += loss_bias_reg logger.debug('Bias regularization loss: {:.2e}, ' 'Total Loss: {:.2e}'.format(loss_bias_reg, loss)) diff --git a/tests/test_phygnn.py b/tests/test_phygnn.py index 338b825..73445ae 100644 --- a/tests/test_phygnn.py +++ b/tests/test_phygnn.py @@ -139,6 +139,76 @@ def test_df_input(): assert "Cannot work with input y columns: ['y']" in str(e) +def test_kernel_regularization(): + """Test the kernel regularization of phygnn.""" + base = PhysicsGuidedNeuralNetwork(p_fun=p_fun_pythag, + hidden_layers=HIDDEN_LAYERS, + loss_weights=(1.0, 0.0), + input_dims=2, output_dims=1) + + model_l1 = PhysicsGuidedNeuralNetwork(p_fun=p_fun_pythag, + hidden_layers=HIDDEN_LAYERS, + loss_weights=(1.0, 0.0), + input_dims=2, output_dims=1, + kernel_reg_rate=0.01, + kernel_reg_power=1) + + model_l2 = PhysicsGuidedNeuralNetwork(p_fun=p_fun_pythag, + hidden_layers=HIDDEN_LAYERS, + loss_weights=(1.0, 0.0), + input_dims=2, output_dims=1, + kernel_reg_rate=0.01, + kernel_reg_power=2) + + base.seed(0) + base.fit(X, Y_NOISE, P, n_batch=1, n_epoch=20) + model_l1.seed(0) + model_l1.fit(X, Y_NOISE, P, n_batch=1, n_epoch=20) + model_l2.seed(0) + model_l2.fit(X, Y_NOISE, P, n_batch=1, n_epoch=20) + + assert base.kernel_reg_term > model_l1.kernel_reg_term + assert model_l1.kernel_reg_term > model_l2.kernel_reg_term + assert np.abs(base.kernel_reg_term - 497.95) < 1 + assert np.abs(model_l1.kernel_reg_term - 84.55) < 1 + assert np.abs(model_l2.kernel_reg_term - 17.29) < 1 + + +def test_bias_regularization(): + """Test the bias regularization of phygnn.""" + base = PhysicsGuidedNeuralNetwork(p_fun=p_fun_pythag, + hidden_layers=HIDDEN_LAYERS, + loss_weights=(1.0, 0.0), + input_dims=2, output_dims=1) + + model_l1 = PhysicsGuidedNeuralNetwork(p_fun=p_fun_pythag, + hidden_layers=HIDDEN_LAYERS, + loss_weights=(1.0, 0.0), + input_dims=2, output_dims=1, + bias_reg_rate=0.01, + bias_reg_power=1) + + model_l2 = PhysicsGuidedNeuralNetwork(p_fun=p_fun_pythag, + hidden_layers=HIDDEN_LAYERS, + loss_weights=(1.0, 0.0), + input_dims=2, output_dims=1, + bias_reg_rate=0.01, + bias_reg_power=2) + + base.seed(0) + base.fit(X, Y_NOISE, P, n_batch=1, n_epoch=20) + model_l1.seed(0) + model_l1.fit(X, Y_NOISE, P, n_batch=1, n_epoch=20) + model_l2.seed(0) + model_l2.fit(X, Y_NOISE, P, n_batch=1, n_epoch=20) + + assert base.bias_reg_term > model_l1.bias_reg_term + assert model_l1.bias_reg_term > model_l2.bias_reg_term + assert np.abs(base.bias_reg_term - 5.77) < 1 + assert np.abs(model_l1.bias_reg_term - 2.37) < 1 + assert np.abs(model_l2.bias_reg_term - 0.30) < 1 + + def test_save_load(): """Test the save/load operations of PGNN""" PhysicsGuidedNeuralNetwork.seed(0) From 
94e76adfc46422e1b6cd714d35bff5a0edf5ab9b Mon Sep 17 00:00:00 2001 From: grantbuster Date: Thu, 20 Aug 2020 10:45:22 -0600 Subject: [PATCH 6/6] updated doc string for preprocess one hot column labeler --- phygnn/pre_processing.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/phygnn/pre_processing.py b/phygnn/pre_processing.py index a837b4f..a024ec0 100644 --- a/phygnn/pre_processing.py +++ b/phygnn/pre_processing.py @@ -136,7 +136,9 @@ def _get_one_hot_data(self, convert_int=False, categories=None): def _make_df_one_hot_cols_labels(self, one_hot_ind, one_hot_data, categories=None): - """ + """Make unique column labels for the new one-hot data. This will use + column labels from categories if available. + Parameters ---------- one_hot_ind : list @@ -155,6 +157,11 @@ def _make_df_one_hot_cols_labels(self, one_hot_ind, one_hot_data, results in category names being determined automatically. Format: {'col_name1' : ['cat1', 'cat2', 'cat3'], 'col_name2' : ['other_cat1', 'other_cat2']} + + Returns + ------- + col_labels : list + List of string labels corresponding to np.hstack(one_hot_data). """ if categories is None:
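Taken together, this series supports a workflow like the sketch below
(hypothetical data, layer spec, and file path; p_fun_pythag is the example
physics loss function defined in tests/test_phygnn.py and is assumed to be
in scope here):

    import numpy as np
    import pandas as pd
    from phygnn import PhysicsGuidedNeuralNetwork

    x_df = pd.DataFrame(np.random.rand(100, 2), columns=('a', 'b'))
    y_df = pd.DataFrame(np.random.rand(100, 1), columns=('c',))
    p_df = x_df.copy()  # supplemental data for the physics loss function

    model = PhysicsGuidedNeuralNetwork(
        p_fun=p_fun_pythag,            # assumed in scope (see note above)
        hidden_layers=[{'units': 64, 'activation': 'relu', 'dropout': 0.1},
                       {'units': 64, 'activation': 'relu', 'dropout': 0.1}],
        loss_weights=(0.5, 0.5),
        input_dims=2, output_dims=1,
        feature_names=['a', 'b'],      # patch 1: declared input/output names
        output_names=['c'],
        kernel_reg_rate=0.01,          # patches 4-5: L1 kernel regularization
        kernel_reg_power=1)

    # patch 3: DataFrames are accepted and their column labels are checked
    # against feature_names / output_names
    model.fit(x_df, y_df, p_df, n_batch=4, n_epoch=20)

    # patch 1: declared names round-trip through save/load
    model.save('phygnn_model.pkl')
    loaded = PhysicsGuidedNeuralNetwork.load('phygnn_model.pkl')
    y_pred = loaded.predict(x_df)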