From c0292578894826484a489a8c08d0ccf1be8cf2c1 Mon Sep 17 00:00:00 2001 From: grantbuster Date: Wed, 19 Aug 2020 09:49:26 -0600 Subject: [PATCH 1/6] added init kwargs and attrs for feature and output names --- phygnn/phygnn.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/phygnn/phygnn.py b/phygnn/phygnn.py index ecbd19d..114af5d 100644 --- a/phygnn/phygnn.py +++ b/phygnn/phygnn.py @@ -24,7 +24,8 @@ class PhysicsGuidedNeuralNetwork: def __init__(self, p_fun, hidden_layers, loss_weights=(0.5, 0.5), input_dims=1, output_dims=1, metric='mae', initializer=None, optimizer=None, - learning_rate=0.01, history=None): + learning_rate=0.01, history=None, + feature_names=None, output_names=None): """ Parameters ---------- @@ -62,6 +63,14 @@ def __init__(self, p_fun, hidden_layers, loss_weights=(0.5, 0.5), Optimizer learning rate. history : None | pd.dataframe Learning history if continuing a training session. + feature_names : list | tuple | None + Training feature names (strings). Mostly a convenience so that a + loaded-from-disk model will have declared feature names, making it + easier to feed in features for prediction. + output_names : list | tuple | None + Prediction output names (strings). Mostly a convenience so that a + loaded-from-disk model will have declared output names, making it + easier to understand prediction output. """ self._p_fun = p_fun @@ -73,6 +82,8 @@ def __init__(self, p_fun, hidden_layers, loss_weights=(0.5, 0.5), self._optimizer = None self._history = history self._learning_rate = learning_rate + self.feature_names = feature_names + self.output_names = output_names self.set_loss_weights(loss_weights) @@ -554,6 +565,8 @@ def save(self, fpath): 'learning_rate': self._learning_rate, 'weight_dict': weight_dict, 'history': self._history, + 'feature_names': self.feature_names, + 'output_names': self.output_names, } with open(fpath, 'wb') as f: From caf00a1a6c5d74dd0f59020dfc72291b4ec0e018 Mon Sep 17 00:00:00 2001 From: grantbuster Date: Wed, 19 Aug 2020 11:02:17 -0600 Subject: [PATCH 2/6] added better one-hot category encoding with column names that are the specified categories --- phygnn/pre_processing.py | 199 +++++++++++++++++++++++++++++------ tests/test_pre_processing.py | 45 +++++--- 2 files changed, 194 insertions(+), 50 deletions(-) diff --git a/phygnn/pre_processing.py b/phygnn/pre_processing.py index 1fd2301..a837b4f 100644 --- a/phygnn/pre_processing.py +++ b/phygnn/pre_processing.py @@ -12,22 +12,14 @@ class PreProcess: """Class to handle the pre-processing of feature data.""" - def __init__(self, features, categories={}): + def __init__(self, features): """ Parameters ---------- features : np.ndarray | pd.DataFrame Feature data in a 2D array or DataFrame. - categories : dict - Categories to use for one hot encoding. Empty dict results in - categories being determined automatically. Format: - { - 'col_name1' : ['cat1', 'cat2', 'cat3'], - 'col_name2' : ['other_cat1', 'other_cat2'] - } """ - self._categories = categories self._features = features self._pd = False if isinstance(self._features, pd.DataFrame): @@ -37,20 +29,75 @@ def __init__(self, features, categories={}): if not features.index.is_unique: raise AttributeError('DataFrame indices must be unique') - def process_one_hot(self, convert_int=False): - """Process str and int columns in the feature data to one-hot vectors. 
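+    # The one-hot logic that previously lived entirely in process_one_hot()
+    # is split into the helper methods below (_is_one_hot, _get_one_hot_data,
+    # _make_df_one_hot_cols_labels) so that each step can be reused and
+    # tested on its own.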
+ @staticmethod + def _is_one_hot(arr, convert_int=False): + """Check if an array of data is to be transformed into a one-hot vector + by sampling the first datum and checking the type. Parameters ---------- + arr : np.ndarray + Array (column) of data to be checked. convert_int : bool Flag to convert integer data to one-hot vectors. Returns ------- - processed : np.ndarray | pd.DataFrame - Feature data with str and int columns removed and one-hot boolean - vectors appended as new columns. + one_hot : bool + True if arr is to be transformed into a one-hot vector. """ + if len(arr.shape) == 1: + sample = arr[0] + elif len(arr.shape) == 2: + sample = arr[0, 0] + else: + e = 'Cannot process 3D column into one hot' + logger.error(e) + raise ValueError(e) + + one_hot = False + + if isinstance(sample, str): + one_hot = True + elif np.issubdtype(type(sample), np.integer) and convert_int: + one_hot = True + + return one_hot + + def _get_one_hot_data(self, convert_int=False, categories=None): + """Get one hot data and column indexes. + + Parameters + ---------- + convert_int : bool + Flag to convert integer data to one-hot vectors. + categories : dict | None + Categories to use for one hot encoding where a key is the original + column name in the feature dataframe and value is a list of the + possible unique values of the feature column. The value list must + have as many or more entries as unique values in the feature + column. This will name the feature column headers for the new + one-hot-encoding if features is a dataframe. Empty dict or None + results in category names being determined automatically. Format: + {'col_name1' : ['cat1', 'cat2', 'cat3'], + 'col_name2' : ['other_cat1', 'other_cat2']} + + Returns + ------- + one_hot_ind : list + List of numeric column indices in the native data that are + to-be-transformed into one-hot vectors. + one_hot_data : list + List of arrays of one hot data columns that are transformations of + the one_hot_ind columns. + numerical_ind : list + List of numeric column indices in the native data that are + continuous numerical columns that are not to-be-transformed into + one-hot vectors. + """ + + if categories is None: + categories = {} one_hot_ind = [] one_hot_data = [] @@ -68,46 +115,128 @@ def process_one_hot(self, convert_int=False): else: col = self._features[:, i].reshape((n, 1)) - sample = col[0, 0] - one_hot = False - - if isinstance(sample, str): - one_hot = True - elif np.issubdtype(type(sample), np.integer) and convert_int: - one_hot = True - - if one_hot: + if not self._is_one_hot(col, convert_int=convert_int): + numerical_ind.append(i) + else: logger.debug('One hot encoding {}'.format(col_name)) one_hot_ind.append(i) - if col_name in self._categories: - categories = [self._categories[col_name]] + + if col_name in categories: + cats = [categories[col_name]] logger.debug('Using categories {} for column {}' - ''.format(categories, col_name)) - oh_obj = OneHotEncoder(sparse=False, categories=categories) + ''.format(cats, col_name)) + oh_obj = OneHotEncoder(sparse=False, categories=cats) else: oh_obj = OneHotEncoder(sparse=False) + oh_obj.fit(col) one_hot_data.append(oh_obj.transform(col)) + + return one_hot_ind, one_hot_data, numerical_ind + + def _make_df_one_hot_cols_labels(self, one_hot_ind, one_hot_data, + categories=None): + """ + Parameters + ---------- + one_hot_ind : list + List of numeric column indices in the native data that are + to-be-transformed into one-hot vectors. 
+ one_hot_data : list + List of arrays of one hot data columns that are transformations of + the one_hot_ind columns. + categories : dict | None + Categories to use for one hot encoding where a key is the original + column name in the feature dataframe and value is a list of the + possible unique values of the feature column. The value list must + have as many or more entries as unique values in the feature + column. This will name the feature column headers for the new + one-hot-encoding if features is a dataframe. Empty dict or None + results in category names being determined automatically. Format: + {'col_name1' : ['cat1', 'cat2', 'cat3'], + 'col_name2' : ['other_cat1', 'other_cat2']} + """ + + if categories is None: + categories = {} + + col_labels = [] + for i, oh_ind in enumerate(one_hot_ind): + orig_col_label = self._features.columns.values[oh_ind] + if orig_col_label in categories: + cat_labels = categories[orig_col_label] + + msg = ('Values in the categories input dict must be a ' + 'list or tuple!') + assert isinstance(cat_labels, (list, tuple)), msg + + unique_vals = pd.unique(self._features[orig_col_label]) + msg = ('Categories for "{a}" one-hot column had fewer unique ' + 'entries than one-hot encodings! You input these ' + 'categories: {b} but "{a}" has these values: {c}' + .format(a=orig_col_label, b=cat_labels, c=unique_vals)) + assert len(cat_labels) >= len(unique_vals), msg + + if isinstance(cat_labels, tuple): + cat_labels = list(cat_labels) + + col_labels += cat_labels else: - numerical_ind.append(i) + def_labels = [orig_col_label + '_' + str(k) + for k in range(one_hot_data[i].shape[1])] + col_labels += def_labels + + return col_labels + + def process_one_hot(self, convert_int=False, categories=None): + """Process str and int columns in the feature data to one-hot vectors. + + Parameters + ---------- + convert_int : bool + Flag to convert integer data to one-hot vectors. + categories : dict | None + Categories to use for one hot encoding where a key is the original + column name in the feature dataframe and value is a list of the + possible unique values of the feature column. The value list must + have as many or more entries as unique values in the feature + column. This will name the feature column headers for the new + one-hot-encoding if features is a dataframe. Empty dict or None + results in category names being determined automatically. Format: + {'col_name1' : ['cat1', 'cat2', 'cat3'], + 'col_name2' : ['other_cat1', 'other_cat2']} + + Returns + ------- + processed : np.ndarray | pd.DataFrame + Feature data with str and int columns removed and one-hot boolean + vectors appended as new columns. If features is a dataframe and + categories is input, the new one-hot columns will be named + according to categories. 
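+
+        Examples
+        --------
+        A minimal sketch with hypothetical data, showing how the
+        categories input names the new one-hot columns:
+
+        >>> df = pd.DataFrame({'f1': ['a', 'b', 'a'],
+        ...                    'f2': [0.1, 0.2, 0.3]})
+        >>> PreProcess(df).process_one_hot(
+        ...     categories={'f1': ['a', 'b', 'c']}).columns.tolist()
+        ['f2', 'a', 'b', 'c']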
+ """ + + if categories is None: + categories = {} + + one_hot_ind, one_hot_data, numerical_ind = self._get_one_hot_data( + convert_int=convert_int, categories=categories) if not one_hot_ind: return self._features - if one_hot_ind: + else: if self._pd: num_df = self._features.iloc[:, numerical_ind] - cols = [[self._features.columns[j] + '_' + str(k) - for k in range(one_hot_data[i].shape[1])] - for i, j in enumerate(one_hot_ind)] - cols = [a for sublist in cols for a in sublist] + col_labels = self._make_df_one_hot_cols_labels(one_hot_ind, + one_hot_data, + categories) one_hot_df = pd.DataFrame(np.hstack(one_hot_data), - columns=cols, + columns=col_labels, index=self._features.index) processed = num_df.join(one_hot_df) - assert processed.shape[0] == num_df.shape[0] == \ one_hot_df.shape[0] + else: processed = np.hstack((self._features[:, numerical_ind], np.hstack(one_hot_data))) diff --git a/tests/test_pre_processing.py b/tests/test_pre_processing.py index 6ce2188..2667554 100644 --- a/tests/test_pre_processing.py +++ b/tests/test_pre_processing.py @@ -9,7 +9,7 @@ index = pd.date_range('20180101', '20190101', freq='5min') A = pd.DataFrame({'f1': ['a', 'b', 'c', 'd', 'a', 'c', 'a'], 'f2': np.arange(7) * 0.333, - 'f3': np.arange(7)}, index=index[:7]) + 'f3': np.arange(7, dtype=int)}, index=index[:7]) def test_one_hot_encoding(): @@ -44,20 +44,35 @@ def test_one_hot_encoding(): def test_categories(): """ Verify predefined categories handle missing data """ proc = PreProcess(A) - np_out = proc.process_one_hot(convert_int=False) - assert (np_out.columns == ['f2', 'f3', 'f1_0', 'f1_1', - 'f1_2', 'f1_3']).all() + out = proc.process_one_hot(convert_int=False) + assert (out.columns == ['f2', 'f3', 'f1_0', 'f1_1', + 'f1_2', 'f1_3']).all() # Verify columns are created for missing categories - proc = PreProcess(A, categories={'f1': ['a', 'b', 'c', 'd', 'missing']}) - np_out = proc.process_one_hot(convert_int=False) - assert (np_out.columns == ['f2', 'f3', 'f1_0', 'f1_1', - 'f1_2', 'f1_3', 'f1_4']).all() - assert (np_out['f1_4'] == np.zeros(7)).all() + # and that the new one-hot columns have names corresponding to their values + proc = PreProcess(A) + out0 = proc.process_one_hot( + convert_int=False, categories={'f1': ['a', 'b', 'c', 'd', 'missing']}) + assert (out0.columns == ['f2', 'f3', 'a', 'b', 'c', 'd', 'missing']).all() + assert (out0['missing'] == np.zeros(7)).all() - # Verify columns are created for missing categories in correct order - proc = PreProcess(A, categories={'f1': ['missing', 'a', 'b', 'c', 'd']}) - np_out = proc.process_one_hot(convert_int=False) - assert (np_out.columns == ['f2', 'f3', 'f1_0', 'f1_1', - 'f1_2', 'f1_3', 'f1_4']).all() - assert (np_out['f1_0'] == np.zeros(7)).all() + # verify ordering works. + out1 = proc.process_one_hot( + convert_int=False, categories={'f1': ['missing', 'd', 'c', 'a', 'b']}) + assert (out1.columns == ['f2', 'f3', 'missing', 'd', 'c', 'a', 'b']).all() + assert all(out0.a == out1.a) + assert all(out0.b == out1.b) + assert all(out0.c == out1.c) + assert all(out0.d == out1.d) + assert (out1['missing'] == np.zeros(7)).all() + assert out1.a.values[0] == 1 + assert out1.a.values[1] == 0 + assert out1.a.values[2] == 0 + assert out1.a.values[3] == 0 + assert out1.a.values[4] == 1 + + # Verify good error with bad categories input. 
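+    # (sklearn's OneHotEncoder raises "ValueError: Found unknown categories"
+    # during fit when the data contains a value that is missing from the
+    # declared category list)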
+ try: + proc.process_one_hot(categories={'f1': ['a', 'b', 'c']}) + except ValueError as e: + assert 'Found unknown categories' in str(e) From 2be1c5cec1cecb7930b71d9e4d709d7652227d8d Mon Sep 17 00:00:00 2001 From: grantbuster Date: Wed, 19 Aug 2020 11:39:20 -0600 Subject: [PATCH 3/6] added feature to accept dataframes as input to phygnn with column label checking --- phygnn/phygnn.py | 133 ++++++++++++++++++++++++++++++++++++------- tests/test_phygnn.py | 40 ++++++++++++- 2 files changed, 150 insertions(+), 23 deletions(-) diff --git a/phygnn/phygnn.py b/phygnn/phygnn.py index 114af5d..d5a2454 100644 --- a/phygnn/phygnn.py +++ b/phygnn/phygnn.py @@ -61,16 +61,18 @@ def __init__(self, p_fun, hidden_layers, loss_weights=(0.5, 0.5), None defaults to Adam. learning_rate : float Optimizer learning rate. - history : None | pd.dataframe + history : None | pd.DataFrame Learning history if continuing a training session. feature_names : list | tuple | None Training feature names (strings). Mostly a convenience so that a loaded-from-disk model will have declared feature names, making it - easier to feed in features for prediction. + easier to feed in features for prediction. This will also get set + if phygnn is trained on a DataFrame. output_names : list | tuple | None Prediction output names (strings). Mostly a convenience so that a loaded-from-disk model will have declared output names, making it - easier to understand prediction output. + easier to understand prediction output. This will also get set + if phygnn is trained on a DataFrame. """ self._p_fun = p_fun @@ -118,7 +120,7 @@ def _check_shapes(x, y): assert len(x) == len(y), 'Number of input observations dont match!' return True - def _preflight_p_fun(self, x, y_true, p, p_kwargs): + def preflight_p_fun(self, x, y_true, p, p_kwargs): """Run a pre-flight check making sure the p_fun is differentiable.""" if p_kwargs is None: @@ -153,8 +155,37 @@ def _preflight_p_fun(self, x, y_true, p, p_kwargs): logger.debug('p_fun passed preflight check.') - def _preflight_data(self, x, y, p): - """Run simple preflight checks on data shapes.""" + def preflight_data(self, x, y, p): + """Run simple preflight checks on data shapes. + + Parameters + ---------- + x : np.ndarray | pd.DataFrame + Feature data in a 2D array or DataFrame. If this is a DataFrame, + the index is ignored, the columns are used with self.feature_names, + and the df is converted into a numpy array for batching and passing + to the training algorithm. + y : np.ndarray | pd.DataFrame + Known output data in a 2D array or DataFrame. If this is a + DataFrame, the index is ignored, the columns are used with + self.output_names, and the df is converted into a numpy array for + batching and passing to the training algorithm. + p : np.ndarray | pd.DataFrame + Supplemental feature data for the physics loss function in 2D array + or DataFrame. If this is a DataFrame, the index and column labels + are ignored and the df is converted into a numpy array for batching + and passing to the training algorithm and physical loss function. 
+ + Returns + ---------- + x : np.ndarray + Feature data in a 2D array + y : np.ndarray + Known output data in a 2D array + p : np.ndarray + Supplemental feature data for the physics loss function in 2D array + """ + self._check_shapes(x, y) self._check_shapes(x, p) x_msg = ('x data has {} features but expected {}' @@ -164,13 +195,59 @@ def _preflight_data(self, x, y, p): assert x.shape[1] == self._input_dims, x_msg assert y.shape[1] == self._output_dims, y_msg - def _preflight_predict(self, x): - """Run simple preflight checks on feature data shape for prediction.""" - assert len(x.shape) == 2, 'PhyGNN can only predict on 2D data!' + x = self.preflight_features(x) + + if isinstance(y, pd.DataFrame): + y_cols = y.columns.values.tolist() + if self.output_names is None: + self.output_names = y_cols + else: + msg = ('Cannot work with input y columns: {}, previously set ' + 'output names are: {}' + .format(y_cols, self.output_names)) + assert self.output_names == y_cols, msg + y = y.values + + if isinstance(p, pd.DataFrame): + p = p.values + + return x, y, p + + def preflight_features(self, x): + """Run preflight checks and data conversions on feature data. + + Parameters + ---------- + x : np.ndarray | pd.DataFrame + Feature data in a 2D array or DataFrame. If this is a DataFrame, + the index is ignored, the columns are used with self.feature_names, + and the df is converted into a numpy array for batching and passing + to the training algorithm. + + Returns + ---------- + x : np.ndarray + Feature data in a 2D array + """ + + assert len(x.shape) == 2, 'PhyGNN can only use 2D data as input!' x_msg = ('x data has {} features but expected {}' .format(x.shape[1], self._input_dims)) assert x.shape[1] == self._input_dims, x_msg + if isinstance(x, pd.DataFrame): + x_cols = x.columns.values.tolist() + if self.feature_names is None: + self.feature_names = x_cols + else: + msg = ('Cannot work with input x columns: {}, previously set ' + 'feature names are: {}' + .format(x_cols, self.feature_names)) + assert self.feature_names == x_cols, msg + x = x.values + + return x + @staticmethod def seed(s=0): """Set the random seed for reproducable results.""" @@ -179,7 +256,7 @@ def seed(s=0): @property def history(self): - """Get the training history dataframe (None if not yet trained).""" + """Get the training history DataFrame (None if not yet trained).""" return self._history @property @@ -234,6 +311,10 @@ def loss(self, y_predicted, y_true, p, p_kwargs): Sum of the NN loss function comparing the y_predicted against y_true and the physical loss function (self._p_fun) with respective weights applied. + nn_loss : tf.tensor + Standard NN training loss comparing y to y_predicted. + p_loss : tf.tensor + Physics loss from p_fun. """ if p_kwargs is None: @@ -299,8 +380,8 @@ def _get_grad(self, x, y_true, p, p_kwargs): return grad, loss - def _run_sgd(self, x, y_true, p, p_kwargs): - """Run stochastic gradient descent for one mini-batch of (x, y_true) + def _run_gradient_descent(self, x, y_true, p, p_kwargs): + """Run gradient descent for one mini-batch of (x, y_true) and adjust NN weights.""" grad, loss = self._get_grad(x, y_true, p, p_kwargs) self._optimizer.apply_gradients(zip(grad, self.weights)) @@ -431,12 +512,21 @@ def fit(self, x, y, p, n_batch=16, n_epoch=10, shuffle=True, Parameters ---------- - x : np.ndarray - Feature data in a 2D array - y : np.ndarray - Known output data in a 2D array. - p : np.ndarray + x : np.ndarray | pd.DataFrame + Feature data in a 2D array or DataFrame. 
If this is a DataFrame, + the index is ignored, the columns are used with self.feature_names, + and the df is converted into a numpy array for batching and passing + to the training algorithm. + y : np.ndarray | pd.DataFrame + Known output data in a 2D array or DataFrame. If this is a + DataFrame, the index is ignored, the columns are used with + self.output_names, and the df is converted into a numpy array for + batching and passing to the training algorithm. + p : np.ndarray | pd.DataFrame Supplemental feature data for the physics loss function in 2D array + or DataFrame. If this is a DataFrame, the index and column labels + are ignored and the df is converted into a numpy array for batching + and passing to the training algorithm and physical loss function. n_batch : int Number of times to update the NN weights per epoch (number of mini-batches). The training data will be split into this many @@ -462,7 +552,7 @@ def fit(self, x, y, p, n_batch=16, n_epoch=10, shuffle=True, Namespace of training parameters that can be used for diagnostics. """ - self._preflight_data(x, y, p) + x, y, p = self.preflight_data(x, y, p) epochs = list(range(n_epoch)) @@ -477,7 +567,7 @@ def fit(self, x, y, p, n_batch=16, n_epoch=10, shuffle=True, x, y, p, shuffle=shuffle, validation_split=validation_split) if self._loss_weights[1] > 0 and run_preflight: - self._preflight_p_fun(x_val, y_val, p_val, p_kwargs) + self.preflight_p_fun(x_val, y_val, p_val, p_kwargs) t0 = time.time() for epoch in epochs: @@ -487,7 +577,8 @@ def fit(self, x, y, p, n_batch=16, n_epoch=10, shuffle=True, batch_iter = zip(x_batches, y_batches, p_batches) for x_batch, y_batch, p_batch in batch_iter: - tr_loss = self._run_sgd(x_batch, y_batch, p_batch, p_kwargs)[1] + tr_loss = self._run_gradient_descent( + x_batch, y_batch, p_batch, p_kwargs)[1] y_val_pred = self.predict(x_val, to_numpy=False) val_loss = self.loss(y_val_pred, y_val, p_val, p_kwargs)[0] @@ -523,7 +614,7 @@ def predict(self, x, to_numpy=True): Predicted output data in a 2D array. 
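+
+        Examples
+        --------
+        A minimal sketch (hypothetical model with two declared feature
+        names) of predicting on a labeled DataFrame; the column labels
+        are checked against self.feature_names before the values are run
+        through the network:
+
+        >>> x_df = pd.DataFrame(np.random.rand(8, 2), columns=['a', 'b'])
+        >>> y_pred = model.predict(x_df)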
""" - self._preflight_predict(x) + x = self.preflight_features(x) y = self._layers[0](x) for layer in self._layers[1:]: y = layer(y) diff --git a/tests/test_phygnn.py b/tests/test_phygnn.py index d525a5e..338b825 100644 --- a/tests/test_phygnn.py +++ b/tests/test_phygnn.py @@ -5,6 +5,7 @@ import os import pytest import numpy as np +import pandas as pd import tensorflow as tf from phygnn import PhysicsGuidedNeuralNetwork, TESTDATADIR @@ -77,7 +78,9 @@ def test_nn(): model = PhysicsGuidedNeuralNetwork(p_fun=p_fun_pythag, hidden_layers=HIDDEN_LAYERS, loss_weights=(1.0, 0.0), - input_dims=2, output_dims=1) + input_dims=2, output_dims=1, + feature_names=['a', 'b'], + output_names=['c']) model.fit(X, Y_NOISE, P, n_batch=4, n_epoch=20) test_mae = np.mean(np.abs(model.predict(X) - Y)) @@ -107,13 +110,44 @@ def test_phygnn(): assert test_mae < 0.015 +def test_df_input(): + """Test the operation of the PGNN with labeled input dataframes.""" + PhysicsGuidedNeuralNetwork.seed(0) + model = PhysicsGuidedNeuralNetwork(p_fun=p_fun_pythag, + hidden_layers=HIDDEN_LAYERS, + loss_weights=(0.0, 1.0), + input_dims=2, output_dims=1) + x_df = pd.DataFrame(X, columns=('a', 'b')) + y_df = pd.DataFrame(Y_NOISE, columns=('c',)) + p_df = pd.DataFrame(P, columns=('a', 'b')) + model.fit(x_df, y_df, p_df, n_batch=1, n_epoch=2) + + assert model.feature_names == ['a', 'b'] + assert model.output_names == ['c'] + + x_df_bad = pd.DataFrame(X, columns=('x1', 'x2')) + y_df_bad = pd.DataFrame(Y_NOISE, columns=('y',)) + + try: + model.fit(x_df_bad, y_df_bad, p_df, n_batch=1, n_epoch=2) + except AssertionError as e: + assert "Cannot work with input x columns: ['x1', 'x2']" in str(e) + + try: + model.fit(x_df, y_df_bad, p_df, n_batch=1, n_epoch=2) + except AssertionError as e: + assert "Cannot work with input y columns: ['y']" in str(e) + + def test_save_load(): """Test the save/load operations of PGNN""" PhysicsGuidedNeuralNetwork.seed(0) model = PhysicsGuidedNeuralNetwork(p_fun=p_fun_pythag, hidden_layers=HIDDEN_LAYERS, loss_weights=(0.0, 1.0), - input_dims=2, output_dims=1) + input_dims=2, output_dims=1, + feature_names=['a', 'b'], + output_names=['c']) model.fit(X, Y_NOISE, P, n_batch=4, n_epoch=20) y_pred = model.predict(X) @@ -122,6 +156,8 @@ def test_save_load(): loaded = PhysicsGuidedNeuralNetwork.load(FPATH) y_pred_loaded = loaded.predict(X) assert np.allclose(y_pred, y_pred_loaded) + assert loaded.feature_names == ['a', 'b'] + assert loaded.output_names == ['c'] os.remove(FPATH) From 61c9081aa04f6e7d40c5fe7b14bb9bf98b86394b Mon Sep 17 00:00:00 2001 From: grantbuster Date: Wed, 19 Aug 2020 13:14:06 -0600 Subject: [PATCH 4/6] added kernel and bias regularization kwargs and logic --- phygnn/phygnn.py | 115 ++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 109 insertions(+), 6 deletions(-) diff --git a/phygnn/phygnn.py b/phygnn/phygnn.py index d5a2454..5b56b24 100644 --- a/phygnn/phygnn.py +++ b/phygnn/phygnn.py @@ -10,7 +10,8 @@ import pandas as pd import logging import tensorflow as tf -from tensorflow.keras import layers, optimizers, initializers +from tensorflow.keras import optimizers, initializers +from tensorflow.keras.layers import InputLayer, Dense, Dropout from phygnn.loss_metrics import METRICS @@ -25,6 +26,8 @@ def __init__(self, p_fun, hidden_layers, loss_weights=(0.5, 0.5), input_dims=1, output_dims=1, metric='mae', initializer=None, optimizer=None, learning_rate=0.01, history=None, + kernel_reg_rate=0.0, kernel_reg_power=1, + bias_reg_rate=0.0, bias_reg_power=1, feature_names=None, 
output_names=None): """ Parameters @@ -63,6 +66,26 @@ def __init__(self, p_fun, hidden_layers, loss_weights=(0.5, 0.5), Optimizer learning rate. history : None | pd.DataFrame Learning history if continuing a training session. + kernel_reg_rate : float + Kernel regularization rate. Increasing this value above zero will + add a structural loss term to the loss function that + disincentivizes large hidden layer weights and should reduce + model complexity. Setting this to 0.0 will disable kernel + regularization. + kernel_reg_power : int + Kernel regularization power. kernel_reg_power=1 is L1 + regularization (lasso regression), and kernel_reg_power=2 is L2 + regularization (ridge regression). + bias_reg_rate : float + Bias regularization rate. Increasing this value above zero will + add a structural loss term to the loss function that + disincentivizes large hidden layer biases and should reduce + model complexity. Setting this to 0.0 will disable bias + regularization. + bias_reg_power : int + Bias regularization power. bias_reg_power=1 is L1 + regularization (lasso regression), and bias_reg_power=2 is L2 + regularization (ridge regression). feature_names : list | tuple | None Training feature names (strings). Mostly a convenience so that a loaded-from-disk model will have declared feature names, making it @@ -84,6 +107,10 @@ def __init__(self, p_fun, hidden_layers, loss_weights=(0.5, 0.5), self._optimizer = None self._history = history self._learning_rate = learning_rate + self.kernel_reg_rate = kernel_reg_rate + self.kernel_reg_power = kernel_reg_power + self.bias_reg_rate = bias_reg_rate + self.bias_reg_power = bias_reg_power self.feature_names = feature_names self.output_names = output_names @@ -106,10 +133,10 @@ def __init__(self, p_fun, hidden_layers, loss_weights=(0.5, 0.5), if optimizer is None: self._optimizer = optimizers.Adam(learning_rate=learning_rate) - self._layers.append(layers.InputLayer(input_shape=[input_dims])) + self._layers.append(InputLayer(input_shape=[input_dims])) for hidden_layer in hidden_layers: self.add_layer(hidden_layer) - self._layers.append(layers.Dense( + self._layers.append(Dense( output_dims, kernel_initializer=self._initializer)) @staticmethod @@ -272,6 +299,38 @@ def weights(self): weights += layer.variables return weights + @property + def kernel_weights(self): + """Get a list of the NN kernel weights + + (can be used for kernel regularization). + + Does not include input layer or dropout layers. + Does include the output layer. + """ + weights = [] + for layer in self.layers: + if isinstance(layer, Dense): + weights.append(layer.get_weights()[0]) + + return weights + + @property + def bias_weights(self): + """Get a list of the NN bias weights + + (can be used for bias regularization). + + Does not include input layer or dropout layers. + Does include the output layer. + """ + weights = [] + for layer in self.layers: + if isinstance(layer, Dense): + weights.append(layer.get_weights()[1]) + + return weights + def reset_history(self): """Erase previous training history without resetting trained weights""" self._history = None @@ -291,6 +350,30 @@ def set_loss_weights(self, loss_weights): assert len(loss_weights) == 2, 'loss_weights can only have two values!' 
self._loss_weights = loss_weights + @property + def kernel_regularization_term(self): + """Get the regularization term for the kernel regularization without + the regularization rate applied.""" + loss_k_reg = [tf.math.abs(x) for x in self.kernel_weights] + loss_k_reg = [tf.math.pow(x, self.kernel_reg_power) + for x in loss_k_reg] + loss_k_reg = tf.math.reduce_sum( + [tf.math.reduce_sum(x) for x in loss_k_reg]) + + return loss_k_reg + + @property + def bias_regularization_term(self): + """Get the regularization term for the bias regularization without + the regularization rate applied.""" + loss_b_reg = [tf.math.abs(x) for x in self.bias_weights] + loss_b_reg = [tf.math.pow(x, self.bias_reg_power) + for x in loss_b_reg] + loss_b_reg = tf.math.reduce_sum( + [tf.math.reduce_sum(x) for x in loss_b_reg]) + + return loss_b_reg + def loss(self, y_predicted, y_true, p, p_kwargs): """Calculate the loss function by comparing model-predicted y to y_true @@ -335,8 +418,24 @@ def loss(self, y_predicted, y_true, p, p_kwargs): logger.debug('NN Loss: {:.2e}, P Loss: {:.2e}, Total Loss: {:.2e}' .format(nn_loss, p_loss, loss)) + if self.kernel_reg_rate > 0: + loss_kernel_reg = (self.kernel_regularization_term + * self.kernel_reg_rate) + loss += loss_kernel_reg + logger.debug('Kernel regularization loss: {:.2e}, ' + 'Total Loss: {:.2e}'.format(loss_kernel_reg, loss)) + + if self.bias_reg_rate > 0: + loss_bias_reg = (self.bias_regularization_term + * self.bias_reg_rate) + loss += loss_bias_reg + logger.debug('Bias regularization loss: {:.2e}, ' + 'Total Loss: {:.2e}'.format(loss_bias_reg, loss)) + if tf.math.is_nan(loss): - raise ArithmeticError('Loss is nan.') + msg = 'phygnn calculated a NaN loss value!' + logger.error(msg) + raise ArithmeticError(msg) return loss, nn_loss, p_loss @@ -355,14 +454,14 @@ def add_layer(self, layer_kwargs, insert_index=None): """ dropout = layer_kwargs.pop('dropout', None) - layer = layers.Dense(**layer_kwargs) + layer = Dense(**layer_kwargs) if insert_index: self._layers.insert(insert_index, layer) else: self._layers.append(layer) if dropout is not None: - d_layer = layers.Dropout(dropout) + d_layer = Dropout(dropout) if insert_index: self._layers.insert(insert_index + 1, d_layer) else: @@ -656,6 +755,10 @@ def save(self, fpath): 'learning_rate': self._learning_rate, 'weight_dict': weight_dict, 'history': self._history, + 'kernel_reg_rate': self.kernel_reg_rate, + 'kernel_reg_power': self.kernel_reg_power, + 'bias_reg_rate': self.bias_reg_rate, + 'bias_reg_power': self.bias_reg_power, 'feature_names': self.feature_names, 'output_names': self.output_names, } From 0f400aada4cbefa8c3e5dfb4cebe7a24ef069934 Mon Sep 17 00:00:00 2001 From: grantbuster Date: Wed, 19 Aug 2020 16:46:30 -0600 Subject: [PATCH 5/6] added kernel and bias regularization terms and kwargs to phygnn --- phygnn/phygnn.py | 61 +++++++++++++++++++------------------- tests/test_phygnn.py | 70 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 100 insertions(+), 31 deletions(-) diff --git a/phygnn/phygnn.py b/phygnn/phygnn.py index 5b56b24..87e0205 100644 --- a/phygnn/phygnn.py +++ b/phygnn/phygnn.py @@ -297,11 +297,12 @@ def weights(self): weights = [] for layer in self._layers: weights += layer.variables + return weights @property def kernel_weights(self): - """Get a list of the NN kernel weights + """Get a list of the NN kernel weights (tensors) (can be used for kernel regularization). 
@@ -311,13 +312,13 @@ def kernel_weights(self): weights = [] for layer in self.layers: if isinstance(layer, Dense): - weights.append(layer.get_weights()[0]) + weights.append(layer.variables[0]) return weights @property def bias_weights(self): - """Get a list of the NN bias weights + """Get a list of the NN bias weights (tensors) (can be used for bias regularization). @@ -327,31 +328,12 @@ def bias_weights(self): weights = [] for layer in self.layers: if isinstance(layer, Dense): - weights.append(layer.get_weights()[1]) + weights.append(layer.variables[1]) return weights - def reset_history(self): - """Erase previous training history without resetting trained weights""" - self._history = None - - def set_loss_weights(self, loss_weights): - """Set new loss weights - - Parameters - ---------- - loss_weights : tuple - Loss weights for the neural network y_predicted vs. y_true - and for the p_fun loss, respectively. For example, - loss_weights=(0.0, 1.0) would simplify the PGNN loss function - to just the p_fun output. - """ - assert np.sum(loss_weights) > 0, 'Sum of loss_weights must be > 0!' - assert len(loss_weights) == 2, 'loss_weights can only have two values!' - self._loss_weights = loss_weights - @property - def kernel_regularization_term(self): + def kernel_reg_term(self): """Get the regularization term for the kernel regularization without the regularization rate applied.""" loss_k_reg = [tf.math.abs(x) for x in self.kernel_weights] @@ -363,7 +345,7 @@ def kernel_regularization_term(self): return loss_k_reg @property - def bias_regularization_term(self): + def bias_reg_term(self): """Get the regularization term for the bias regularization without the regularization rate applied.""" loss_b_reg = [tf.math.abs(x) for x in self.bias_weights] @@ -374,6 +356,25 @@ def bias_regularization_term(self): return loss_b_reg + def reset_history(self): + """Erase previous training history without resetting trained weights""" + self._history = None + + def set_loss_weights(self, loss_weights): + """Set new loss weights + + Parameters + ---------- + loss_weights : tuple + Loss weights for the neural network y_predicted vs. y_true + and for the p_fun loss, respectively. For example, + loss_weights=(0.0, 1.0) would simplify the PGNN loss function + to just the p_fun output. + """ + assert np.sum(loss_weights) > 0, 'Sum of loss_weights must be > 0!' + assert len(loss_weights) == 2, 'loss_weights can only have two values!' 
+ self._loss_weights = loss_weights + def loss(self, y_predicted, y_true, p, p_kwargs): """Calculate the loss function by comparing model-predicted y to y_true @@ -418,16 +419,14 @@ def loss(self, y_predicted, y_true, p, p_kwargs): logger.debug('NN Loss: {:.2e}, P Loss: {:.2e}, Total Loss: {:.2e}' .format(nn_loss, p_loss, loss)) - if self.kernel_reg_rate > 0: - loss_kernel_reg = (self.kernel_regularization_term - * self.kernel_reg_rate) + if self.kernel_reg_rate != 0: + loss_kernel_reg = self.kernel_reg_term * self.kernel_reg_rate loss += loss_kernel_reg logger.debug('Kernel regularization loss: {:.2e}, ' 'Total Loss: {:.2e}'.format(loss_kernel_reg, loss)) - if self.bias_reg_rate > 0: - loss_bias_reg = (self.bias_regularization_term - * self.bias_reg_rate) + if self.bias_reg_rate != 0: + loss_bias_reg = self.bias_reg_term * self.bias_reg_rate loss += loss_bias_reg logger.debug('Bias regularization loss: {:.2e}, ' 'Total Loss: {:.2e}'.format(loss_bias_reg, loss)) diff --git a/tests/test_phygnn.py b/tests/test_phygnn.py index 338b825..73445ae 100644 --- a/tests/test_phygnn.py +++ b/tests/test_phygnn.py @@ -139,6 +139,76 @@ def test_df_input(): assert "Cannot work with input y columns: ['y']" in str(e) +def test_kernel_regularization(): + """Test the kernel regularization of phygnn.""" + base = PhysicsGuidedNeuralNetwork(p_fun=p_fun_pythag, + hidden_layers=HIDDEN_LAYERS, + loss_weights=(1.0, 0.0), + input_dims=2, output_dims=1) + + model_l1 = PhysicsGuidedNeuralNetwork(p_fun=p_fun_pythag, + hidden_layers=HIDDEN_LAYERS, + loss_weights=(1.0, 0.0), + input_dims=2, output_dims=1, + kernel_reg_rate=0.01, + kernel_reg_power=1) + + model_l2 = PhysicsGuidedNeuralNetwork(p_fun=p_fun_pythag, + hidden_layers=HIDDEN_LAYERS, + loss_weights=(1.0, 0.0), + input_dims=2, output_dims=1, + kernel_reg_rate=0.01, + kernel_reg_power=2) + + base.seed(0) + base.fit(X, Y_NOISE, P, n_batch=1, n_epoch=20) + model_l1.seed(0) + model_l1.fit(X, Y_NOISE, P, n_batch=1, n_epoch=20) + model_l2.seed(0) + model_l2.fit(X, Y_NOISE, P, n_batch=1, n_epoch=20) + + assert base.kernel_reg_term > model_l1.kernel_reg_term + assert model_l1.kernel_reg_term > model_l2.kernel_reg_term + assert np.abs(base.kernel_reg_term - 497.95) < 1 + assert np.abs(model_l1.kernel_reg_term - 84.55) < 1 + assert np.abs(model_l2.kernel_reg_term - 17.29) < 1 + + +def test_bias_regularization(): + """Test the bias regularization of phygnn.""" + base = PhysicsGuidedNeuralNetwork(p_fun=p_fun_pythag, + hidden_layers=HIDDEN_LAYERS, + loss_weights=(1.0, 0.0), + input_dims=2, output_dims=1) + + model_l1 = PhysicsGuidedNeuralNetwork(p_fun=p_fun_pythag, + hidden_layers=HIDDEN_LAYERS, + loss_weights=(1.0, 0.0), + input_dims=2, output_dims=1, + bias_reg_rate=0.01, + bias_reg_power=1) + + model_l2 = PhysicsGuidedNeuralNetwork(p_fun=p_fun_pythag, + hidden_layers=HIDDEN_LAYERS, + loss_weights=(1.0, 0.0), + input_dims=2, output_dims=1, + bias_reg_rate=0.01, + bias_reg_power=2) + + base.seed(0) + base.fit(X, Y_NOISE, P, n_batch=1, n_epoch=20) + model_l1.seed(0) + model_l1.fit(X, Y_NOISE, P, n_batch=1, n_epoch=20) + model_l2.seed(0) + model_l2.fit(X, Y_NOISE, P, n_batch=1, n_epoch=20) + + assert base.bias_reg_term > model_l1.bias_reg_term + assert model_l1.bias_reg_term > model_l2.bias_reg_term + assert np.abs(base.bias_reg_term - 5.77) < 1 + assert np.abs(model_l1.bias_reg_term - 2.37) < 1 + assert np.abs(model_l2.bias_reg_term - 0.30) < 1 + + def test_save_load(): """Test the save/load operations of PGNN""" PhysicsGuidedNeuralNetwork.seed(0) From 
94e76adfc46422e1b6cd714d35bff5a0edf5ab9b Mon Sep 17 00:00:00 2001 From: grantbuster Date: Thu, 20 Aug 2020 10:45:22 -0600 Subject: [PATCH 6/6] updated doc string for preprocess one hot column labeler --- phygnn/pre_processing.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/phygnn/pre_processing.py b/phygnn/pre_processing.py index a837b4f..a024ec0 100644 --- a/phygnn/pre_processing.py +++ b/phygnn/pre_processing.py @@ -136,7 +136,9 @@ def _get_one_hot_data(self, convert_int=False, categories=None): def _make_df_one_hot_cols_labels(self, one_hot_ind, one_hot_data, categories=None): - """ + """Make unique column labels for the new one-hot data. This will use + column labels from categories if available. + Parameters ---------- one_hot_ind : list @@ -155,6 +157,11 @@ def _make_df_one_hot_cols_labels(self, one_hot_ind, one_hot_data, results in category names being determined automatically. Format: {'col_name1' : ['cat1', 'cat2', 'cat3'], 'col_name2' : ['other_cat1', 'other_cat2']} + + Returns + ------- + col_labels : list + List of string labels corresponding to np.hstack(one_hot_data). """ if categories is None:
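Taken together, this series supports a workflow like the sketch below
(hypothetical data, layer spec, and file path; p_fun_pythag is the example
physics loss function defined in tests/test_phygnn.py and is assumed to be
in scope here):

    import numpy as np
    import pandas as pd
    from phygnn import PhysicsGuidedNeuralNetwork

    x_df = pd.DataFrame(np.random.rand(100, 2), columns=('a', 'b'))
    y_df = pd.DataFrame(np.random.rand(100, 1), columns=('c',))
    p_df = x_df.copy()  # supplemental data for the physics loss function

    model = PhysicsGuidedNeuralNetwork(
        p_fun=p_fun_pythag,            # assumed in scope (see note above)
        hidden_layers=[{'units': 64, 'activation': 'relu', 'dropout': 0.1},
                       {'units': 64, 'activation': 'relu', 'dropout': 0.1}],
        loss_weights=(0.5, 0.5),
        input_dims=2, output_dims=1,
        feature_names=['a', 'b'],      # patch 1: declared input/output names
        output_names=['c'],
        kernel_reg_rate=0.01,          # patches 4-5: L1 kernel regularization
        kernel_reg_power=1)

    # patch 3: DataFrames are accepted and their column labels are checked
    # against feature_names / output_names
    model.fit(x_df, y_df, p_df, n_batch=4, n_epoch=20)

    # patch 1: declared names round-trip through save/load
    model.save('phygnn_model.pkl')
    loaded = PhysicsGuidedNeuralNetwork.load('phygnn_model.pkl')
    y_pred = loaded.predict(x_df)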