# -*- coding: utf-8 -*-
"""
Machine learning model interfaces: shared base class for the TensorFlow and
scikit-learn model wrappers.
"""
import json
import logging
import os
from warnings import warn

import numpy as np
import pandas as pd

logger = logging.getLogger(__name__)


class MLModelBase:
    """
    Machine Learning Model Base

    Wraps a model object exposing ``predict`` and keeps the bookkeeping
    needed to (un)normalize features and labels: ordered feature / label
    names and a ``{name: {'mean': ..., 'stdev': ...}}`` mapping.
    """

    def __init__(self, model, feature_names=None, label_names=None,
                 norm_params=None):
        """
        Parameters
        ----------
        model : OBJ
            Sci-kit learn or tensorflow model
        feature_names : list | str, optional
            Ordered list of feature names. A single string is wrapped in a
            list.
        label_names : list | str, optional
            Ordered list of label (output) names. A single string is wrapped
            in a list.
        norm_params : dict, optional
            Dictionary mapping feature and label names (keys) to normalization
            parameters (mean, stdev), by default None
        """
        self._model = model

        if isinstance(feature_names, str):
            feature_names = [feature_names]

        self._feature_names = feature_names

        if isinstance(label_names, str):
            label_names = [label_names]

        self._label_names = label_names

        # Shallow copy: normalize() adds entries to this mapping and must not
        # modify the dictionary owned by the caller.
        self._norm_params = {} if norm_params is None else dict(norm_params)

    def __repr__(self):
        msg = "{}:\n{}".format(self.__class__.__name__, self.model_summary)

        return msg

    def __getitem__(self, features):
        """
        Use model to predict label from given features

        Parameters
        ----------
        features : pandas.DataFrame
            features to predict from

        Returns
        -------
        pandas.DataFrame
            label prediction
        """
        return self.predict(features)

    @property
    def model_summary(self):
        """
        Text summary of the wrapped model

        Returns
        -------
        str | None
            Lines emitted by ``model.summary`` joined with newlines. None if
            the model has no ``summary`` method or ``summary`` raises
            ValueError.
        """
        summary_fn = getattr(self._model, 'summary', None)
        if summary_fn is None:
            return None

        lines = []

        def _collect(text, *_, **__):
            # Extra positional / keyword arguments are accepted and ignored
            # so the collector works whatever the summary passes to print_fn
            lines.append(str(text))

        try:
            summary_fn(print_fn=_collect)
        except ValueError:
            return None

        return '\n'.join(lines)

    @property
    def feature_names(self):
        """
        List of the feature variable names.

        Returns
        -------
        list
        """
        return self._feature_names

    @property
    def label_names(self):
        """
        label variable names

        Returns
        -------
        list
        """
        return self._label_names

    @property
    def normalization_parameters(self):
        """
        Features and label (un)normalization parameters

        Returns
        -------
        dict
        """
        return self._norm_params

    @property
    def means(self):
        """
        Mapping feature/label names to the mean values for
        (un)normalization

        Returns
        -------
        dict
        """
        return {k: v['mean'] for k, v in self._norm_params.items()}

    @property
    def stdevs(self):
        """
        Mapping feature/label names to the stdev values for
        (un)normalization

        Returns
        -------
        dict
        """
        return {k: v['stdev'] for k, v in self._norm_params.items()}

    @property
    def model(self):
        """
        Trained model

        Returns
        -------
        tensorflow.keras.models
        """
        return self._model

    def _select_norm_values(self, names, key):
        """
        Collect one normalization value for each of the given names

        Parameters
        ----------
        names : list | None
            Feature or label names to look up
        key : str
            'mean' or 'stdev'

        Returns
        -------
        dict | None
            Mapping of name to value for the names that have normalization
            parameters, None if names is None
        """
        if names is None:
            return None

        values = {}
        for name in names:
            params = self._norm_params.get(name, None)
            if params is not None:
                values[name] = params[key]

        return values

    @property
    def feature_means(self):
        """
        Feature means, used for (un)normalization

        Returns
        -------
        dict
        """
        return self._select_norm_values(self._feature_names, 'mean')

    @property
    def feature_stdevs(self):
        """
        Feature stdevs, used for (un)normalization

        Returns
        -------
        dict
        """
        return self._select_norm_values(self._feature_names, 'stdev')

    @property
    def label_means(self):
        """
        label means, used for (un)normalization

        Returns
        -------
        dict
        """
        return self._select_norm_values(self.label_names, 'mean')

    @property
    def label_stdevs(self):
        """
        label stdevs, used for (un)normalization

        Returns
        -------
        dict
        """
        return self._select_norm_values(self.label_names, 'stdev')

    @staticmethod
    def _normalize(native_arr, mean=None, stdev=None):
        """
        Normalize features with mean at 0 and stdev of 1.

        Parameters
        ----------
        native_arr : ndarray
            native data
        mean : float | None
            mean to use for normalization, computed with np.nanmean along
            axis 0 if None
        stdev : float | None
            stdev to use for normalization, computed with np.nanstd along
            axis 0 if None

        Returns
        -------
        norm_arr : ndarray
            normalized data
        mean : float
            mean used for normalization
        stdev : float
            stdev used for normalization (returned as computed / supplied,
            including zeros)
        """
        if mean is None:
            mean = np.nanmean(native_arr, axis=0)

        if stdev is None:
            stdev = np.nanstd(native_arr, axis=0)

        # A constant column has stdev == 0; divide by 1 instead so that it
        # normalizes to 0 rather than NaN / inf.
        divisor = np.where(np.asarray(stdev) == 0, 1.0, stdev)
        norm_arr = (native_arr - mean) / divisor

        return norm_arr, mean, stdev

    @staticmethod
    def _unnormalize(norm_arr, mean, stdev):
        """
        Unnormalize data with mean at 0 and stdev of 1.

        Parameters
        ----------
        norm_arr : ndarray
            normalized data
        mean : float
            mean used for normalization
        stdev : float
            stdev used for normalization

        Returns
        -------
        native_arr : ndarray
            native un-normalized data
        """
        native_arr = norm_arr * stdev + mean

        return native_arr

    @staticmethod
    def dict_json_convert(inp):
        """Recursively convert numeric values in dict to work with json dump

        Parameters
        ----------
        inp : dict
            Dictionary to convert.

        Returns
        -------
        out : dict
            Copy of dict input with all nested numeric values converted to
            base python int, float or bool and all arrays / tuples converted
            to lists.
        """
        if isinstance(inp, dict):
            out = {k: MLModelBase.dict_json_convert(v) for k, v in inp.items()}
        elif isinstance(inp, (list, tuple)):
            out = [MLModelBase.dict_json_convert(i) for i in inp]
        elif isinstance(inp, np.ndarray):
            out = inp.tolist()
        elif isinstance(inp, (bool, np.bool_)):
            # bool is checked before int because bool is an int subclass
            out = bool(inp)
        elif isinstance(inp, (int, np.integer)):
            out = int(inp)
        elif isinstance(inp, (float, np.floating)):
            out = float(inp)
        else:
            out = inp

        return out

    @staticmethod
    def _parse_data(data):
        """
        Parse features or labels. The base implementation returns the data
        unchanged.

        Parameters
        ----------
        data : dict | pandas.DataFrame
            Features or label to use with model

        Returns
        -------
        data : dict | pandas.DataFrame
            The input, unchanged
        """
        return data

    def get_norm_params(self, name):
        """
        Feature or label normalization parameters

        Parameters
        ----------
        name : str
            feature | label name

        Returns
        -------
        dict | None
            mean and stdev values for given feature | label, None if the
            name has no parameters
        """
        return self._norm_params.get(name, None)

    def get_mean(self, name):
        """
        Get feature | label mean

        Parameters
        ----------
        name : str
            feature | label name

        Returns
        -------
        mean : float | None
            Mean value used for normalization
        """
        params = self._norm_params.get(name, None)

        return None if params is None else params.get('mean', None)

    def get_stdev(self, name):
        """
        Get feature | label stdev

        Parameters
        ----------
        name : str
            feature | label name

        Returns
        -------
        stdev : float | None
            Stdev value used for normalization
        """
        params = self._norm_params.get(name, None)

        return None if params is None else params.get('stdev', None)

    def normalize(self, items):
        """
        Normalize given items (features | labels). Means and stdevs that are
        not already known are computed from the data and stored.

        Parameters
        ----------
        items : dict
            mapping of names to vectors

        Returns
        -------
        norm_items : dict
            mapping of names to normalized-feature vectors. Items that could
            not be normalized are passed through unchanged with a warning.
        """
        norm_items = {}
        for key, value in items.items():
            mean = self.get_mean(key)
            stdev = self.get_stdev(key)
            try:
                value, mean, stdev = self._normalize(value, mean=mean,
                                                     stdev=stdev)
                self._norm_params[key] = {'mean': mean, 'stdev': stdev}
            except Exception as ex:
                # Best effort: warn and keep the native values
                msg = "Could not normalize {}:\n{}".format(key, ex)
                logger.warning(msg)
                warn(msg)

            norm_items[key] = value

        return norm_items

    def unnormalize(self, items):
        """
        Unnormalize given items (features | labels)

        Parameters
        ----------
        items : dict
            mapping of names to vectors

        Returns
        -------
        native_items : dict
            mapping of names to native vectors. Items without normalization
            parameters are passed through unchanged with a warning, which
            mirrors how normalize() treats items it cannot normalize.
        """
        native_items = {}
        for key, value in items.items():
            norm_params = self.get_norm_params(key)
            if norm_params is not None:
                value = self._unnormalize(value, norm_params['mean'],
                                          norm_params['stdev'])
            else:
                msg = ("Normalization Parameters unavailable for {}"
                       .format(key))
                logger.warning(msg)
                warn(msg)

            native_items[key] = value

        return native_items

    def unnormalize_prediction(self, prediction):
        """
        Unnormalize prediction if needed

        Parameters
        ----------
        prediction : ndarray
            Model prediction with one column per label in label_names. A 1D
            prediction is treated as a single column. Array-likes (e.g. a
            pandas.DataFrame) are converted with np.asarray.

        Returns
        -------
        prediction : ndarray
            Unnormalized prediction with the same shape as the input.
            Columns whose label has no normalization parameters are returned
            as-is with a warning.
        """
        arr = np.asarray(prediction)
        if not self.label_names:
            # Columns cannot be mapped to labels
            return arr

        is_1d = arr.ndim == 1
        if is_1d:
            arr = arr.reshape(-1, 1)

        # astype copies; integer input is promoted to float so that the
        # un-normalized values are not truncated.
        out = arr.astype(np.result_type(arr.dtype, np.float32))
        for ax, label in enumerate(self.label_names):
            norm_params = self.get_norm_params(label)
            if norm_params is None:
                msg = ("Normalization Parameters unavailable for {}"
                       .format(label))
                logger.warning(msg)
                warn(msg)
                continue

            out[:, ax] = self._unnormalize(out[:, ax], norm_params['mean'],
                                           norm_params['stdev'])

        return out.ravel() if is_1d else out

    def predict(self, features, **kwargs):
        """
        Use model to predict label from given features

        Parameters
        ----------
        features : dict | pandas.DataFrame
            features to predict from
        kwargs : dict
            kwargs for the wrapped model's predict method

        Returns
        -------
        prediction : pandas.DataFrame
            label prediction in native units, one column per label
        """
        features = self._parse_data(features)

        # Un-normalize the raw array first and build the DataFrame last:
        # unnormalize_prediction works on array columns.
        prediction = self._model.predict(features, **kwargs)
        prediction = self.unnormalize_prediction(prediction)

        return pd.DataFrame(prediction, columns=self.label_names)
import tensorflow as tf
from tensorflow import feature_column
from tensorflow.keras import layers


class TfModel(MLModelBase):
    """
    TensorFlow Keras Model
    """
    def __init__(self, model, feature_names=None, label_names=None,
                 norm_params=None):
        """
        Parameters
        ----------
        model : tensorflow.keras.models.Sequential
            Tensorflow Keras Model
        feature_names : list
            Ordered list of feature names.
        label_names : list
            Ordered list of label (output) names.
        norm_params : dict, optional
            Dictionary mapping feature and label names (keys) to normalization
            parameters (mean, stdev), by default None
        """
        super().__init__(model, feature_names=feature_names,
                         label_names=label_names, norm_params=norm_params)

        self._history = None

    @property
    def history(self):
        """
        Model training history

        Returns
        -------
        pandas.DataFrame | None
            Per-epoch training history with an added 'epoch' column, None
            (with a warning) if the model has not been trained.
        """
        if self._history is None:
            msg = 'Model has not been trained yet!'
            logger.warning(msg)
            warn(msg)
            return None

        history = pd.DataFrame(self._history.history)
        history['epoch'] = self._history.epoch

        return history

    @staticmethod
    def _clean_name(name):
        """
        Make feature | label name compatible with TensorFlow

        Parameters
        ----------
        name : str
            Feature | label name

        Returns
        -------
        name : str
            Feature | label name compatible with TensorFlow
        """
        replacements = ((' ', '_'),
                        # '**' must be handled before '*', otherwise every
                        # asterisk is already gone when '**' is searched for
                        ('**', '-exp-'),
                        ('*', '-x-'),
                        ('+', '-plus-'),
                        (')', ''),
                        ('log(', 'log-'))
        for old, new in replacements:
            name = name.replace(old, new)

        return name

    @staticmethod
    def _model_dir(path):
        """
        Convert a model path (directory or its parallel .json file) to the
        model directory path with a trailing '/'

        Parameters
        ----------
        path : str
            Model directory, with or without trailing '/', or the path of
            the parallel .json file

        Returns
        -------
        path : str
            Model directory path ending in '/'
        """
        if path.endswith('.json'):
            # Only strip the extension; str.replace would also rewrite any
            # '.json' appearing earlier in the path
            path = path[:-len('.json')]

        if not path.endswith('/'):
            path += '/'

        return path

    @staticmethod
    def _generate_feature_columns(features):
        """
        Generate feature layer from features table

        Parameters
        ----------
        features : dict
            model features

        Returns
        -------
        feature_columns : list
            List of tensorFlow.feature_column objects
        """
        feature_columns = []
        for name, data in features.items():
            name = TfModel._clean_name(name)
            if np.issubdtype(np.asarray(data).dtype, np.number):
                f_col = feature_column.numeric_column(name)
            else:
                f_col = TfModel._generate_cat_column(name, data)

            feature_columns.append(f_col)

        return feature_columns

    @staticmethod
    def _generate_cat_column(name, data, vocab_threshold=50, bucket_size=100):
        """Generate a feature column from a categorical string data set

        Parameters
        ----------
        name : str
            Name of categorical columns
        data : np.ndarray | list
            String data array
        vocab_threshold : int
            Number of unique entries in the data array below which this
            will use a vocabulary list, above which a hash bucket will be used.
        bucket_size : int
            Hash bucket size.

        Returns
        -------
        f_col : IndicatorColumn
            Categorical feature column.
        """
        # Order-preserving de-duplication: the vocabulary order is the order
        # of first appearance instead of the arbitrary order of a set
        vocab = list(dict.fromkeys(data))

        if len(vocab) < vocab_threshold:
            f_col = feature_column.categorical_column_with_vocabulary_list(
                name, vocab)
        else:
            f_col = feature_column.categorical_column_with_hash_bucket(
                name, bucket_size)

        f_col = feature_column.indicator_column(f_col)

        return f_col

    @staticmethod
    def _build_feature_columns(feature_columns):
        """
        Build the feature layer from given feature column descriptions

        Parameters
        ----------
        feature_columns : list
            list of feature column descriptions (dictionaries) with keys
            'name', 'type' and optionally 'kwargs', plus 'cross_columns'
            for crossed columns and 'embedded_col' for embedding columns

        Returns
        -------
        tf_columns : list
            List of tensorFlow.feature_column objects, in the order given
        """
        col_map = {}  # TODO: build map to tf.feature_column functions
        # TODO: what feature_columns need to be wrapped
        indicators = [feature_column.categorical_column_with_hash_bucket,
                      feature_column.categorical_column_with_identity,
                      feature_column.categorical_column_with_vocabulary_file,
                      feature_column.categorical_column_with_vocabulary_list,
                      feature_column.crossed_column]

        tf_columns = []
        # Columns as created, before any indicator wrapping. Crossed columns
        # are built from these rather than from the wrapped columns.
        raw_columns = {}
        for col in feature_columns:
            name = col['name']
            f_type = col_map.get(col['type'], col['type'])
            kwargs = col.get('kwargs', {})

            if f_type == feature_column.crossed_column:
                # Fall back to the plain name for keys that were not built
                # earlier in this list
                cross_cols = [raw_columns.get(key, key)
                              for key in col['cross_columns']]
                f_col = f_type(cross_cols, **kwargs)
            elif f_type == feature_column.embedding_column:
                embedded_type = col_map.get(col['embedded_col'],
                                            col['embedded_col'])
                # NOTE(review): the same kwargs go to both the embedded
                # column and the embedding column -- confirm this is intended
                f_col = f_type(embedded_type(name, **kwargs), **kwargs)
            else:
                f_col = f_type(name, **kwargs)

            raw_columns[name] = f_col
            if f_type in indicators:
                f_col = feature_column.indicator_column(f_col)

            tf_columns.append(f_col)

        return tf_columns

    @staticmethod
    def _compile_model(feature_columns, model_layers=None, learning_rate=0.001,
                       loss="mean_squared_error", metrics=('mae', 'mse'),
                       optimizer_class=None, **kwargs):
        """
        Build tensorflow sequential model from given layers and kwargs

        Parameters
        ----------
        feature_columns : list
            List of tensorFlow.feature_column objects
        model_layers : list, optional
            List of tensorflow layers.Dense kwargs (dictionaries). An optional
            'dropout' entry adds a Dropout layer after the Dense layer.
            If None use a single linear layer, by default None
        learning_rate : float, optional
            tensorflow optimizer learning rate, by default 0.001
        loss : str, optional
            name of objective function, by default "mean_squared_error"
        metrics : list, optional
            List of metrics to be evaluated by the model during training and
            testing, by default ('mae', 'mse')
        optimizer_class : None | tf.keras.optimizers
            Optional explicit request of optimizer. This should be a class
            that will be instantiated in the TfModel._compile_model() method
            The default is the RMSprop optimizer.
        kwargs : dict
            kwargs for tensorflow.keras.models.compile

        Returns
        -------
        tensorflow.keras.models.Sequential
            Compiled tensorflow Sequential model
        """
        model = tf.keras.models.Sequential()
        model.add(layers.DenseFeatures(feature_columns))
        if model_layers is None:
            # Add a single linear layer
            model.add(layers.Dense(units=1, input_shape=(1,)))
        else:
            for layer in model_layers:
                # Work on a copy so the caller's layer description keeps its
                # 'dropout' entry and can be reused
                layer = dict(layer)
                dropout = layer.pop('dropout', None)
                model.add(layers.Dense(**layer))

                if dropout is not None:
                    model.add(layers.Dropout(dropout))

        if isinstance(metrics, tuple):
            metrics = list(metrics)
        elif not isinstance(metrics, list):
            metrics = [metrics]

        if optimizer_class is None:
            optimizer_class = tf.keras.optimizers.RMSprop

        optimizer = optimizer_class(learning_rate=learning_rate)

        model.compile(optimizer=optimizer, loss=loss, metrics=metrics,
                      **kwargs)

        return model

    @staticmethod
    def build_model(features, feature_columns=None, model_layers=None,
                    learning_rate=0.001, loss="mean_squared_error",
                    metrics=('mae', 'mse'), optimizer_class=None, **kwargs):
        """
        Build tensorflow sequential model from given layers and kwargs

        Parameters
        ----------
        features : dict | pandas.DataFrame
            Model features
        feature_columns : list, optional
            list of feature column descriptions (dictionaries)
            if None use numeric columns with the feature_attr name,
            by default None
        model_layers : list, optional
            List of tensorflow layers.Dense kwargs (dictionaries)
            if None use a single linear layer, by default None
        learning_rate : float, optional
            tensorflow optimizer learning rate, by default 0.001
        loss : str, optional
            name of objective function, by default "mean_squared_error"
        metrics : list, optional
            List of metrics to be evaluated by the model during training and
            testing, by default ('mae', 'mse')
        optimizer_class : None | tf.keras.optimizers
            Optional explicit request of optimizer. This should be a class
            that will be instantiated in the TfModel._compile_model() method.
            The default is the RMSprop optimizer.
        kwargs : dict
            kwargs for tensorflow.keras.models.compile

        Returns
        -------
        tensorflow.keras.models.Sequential
            Compiled tensorflow Sequential model

        Raises
        ------
        ValueError
            If fewer feature column descriptions than features are supplied
        """
        if feature_columns is None:
            feature_columns = TfModel._generate_feature_columns(features)
        else:
            if len(feature_columns) < len(features.keys()):
                msg = ("There must be at least one feature column per feature!"
                       " {} columns were supplied but there are {} features!"
                       .format(len(feature_columns),
                               len(features.keys())))
                logger.error(msg)
                raise ValueError(msg)

            feature_columns = TfModel._build_feature_columns(feature_columns)

        model = TfModel._compile_model(feature_columns,
                                       model_layers=model_layers,
                                       learning_rate=learning_rate,
                                       loss=loss, metrics=metrics,
                                       optimizer_class=optimizer_class,
                                       **kwargs)

        return model

    def _parse_data(self, data, normalize=True, clean_names=True):
        """
        Parse features or labels, normalize, and clean names if requested

        Parameters
        ----------
        data : dict | pandas.DataFrame
            Features or label to use with model
        normalize : bool, optional
            Flag to normalize features or labels, by default True
        clean_names : bool, optional
            Flag to clean feature or label names, by default True

        Returns
        -------
        data : dict
            Dictionary of normalized (if desired) features or label

        Raises
        ------
        ValueError
            If data is neither a pandas.DataFrame nor a dictionary
        """
        if isinstance(data, pd.DataFrame):
            data = {name: np.array(value) for name, value in data.items()}
        elif not isinstance(data, dict):
            msg = ("Features and label must be supplied as a pandas.DataFrame"
                   " or python dictionary, but recieved: {}"
                   .format(type(data)))
            logger.error(msg)
            raise ValueError(msg)

        if normalize:
            # Normalization parameters are stored under the original names
            data = self.normalize(data)

        if clean_names:
            data = {self._clean_name(key): value
                    for key, value in data.items()}

        return data

    def train_model(self, features, labels, norm_label=True, epochs=100,
                    validation_split=0.2, early_stop=True, **kwargs):
        """
        Train the model with the provided features and label

        Parameters
        ----------
        features : dict | pandas.DataFrame
            Input features to train on
        labels : dict | pandas.DataFrame
            label to train on
        norm_label : bool
            Flag to normalize label
        epochs : int, optional
            Number of epochs to train the model, by default 100
        validation_split : float, optional
            Fraction of the training data to be used as validation data,
            by default 0.2. The validation data is taken from the end of the
            arrays. No validation data is used if the fraction rounds down
            to zero samples.
        early_stop : bool
            Flag to stop training when it stops improving
        kwargs : dict
            kwargs for tensorflow.keras.models.fit
        """
        features = self._parse_data(features)
        self._feature_names = list(features.keys())

        raw_labels = labels
        labels = self._parse_data(labels, normalize=norm_label)
        self._label_names = list(labels.keys())

        # Predictions are un-normalized by looking up the (cleaned) label
        # names, while normalize() stored the parameters under the original
        # names. Alias the parameters wherever cleaning changed the name.
        for raw_name in raw_labels.keys():
            clean_name = self._clean_name(raw_name)
            params = self.get_norm_params(raw_name)
            if clean_name != raw_name and params is not None:
                self._norm_params.setdefault(clean_name, params)

        if self._history is not None:
            msg = 'Model has already been trained and will be re-fit!'
            logger.warning(msg)
            warn(msg)

        # One column per label, in label_names order, matching how
        # unnormalize_prediction reads the model output
        targets = np.column_stack(list(labels.values()))

        n_val = 0
        if validation_split > 0:
            n_obs = len(next(iter(features.values())))
            n_val = int(n_obs * validation_split)

        validation_data = None
        # n_val == 0 must not be used for slicing: arr[-0:] is the whole
        # array and arr[:-0] is empty
        if n_val > 0:
            validate_features = {name: arr[-n_val:]
                                 for name, arr in features.items()}
            validation_data = (validate_features, targets[-n_val:])

            features = {name: arr[:-n_val]
                        for name, arr in features.items()}
            targets = targets[:-n_val]

        if early_stop:
            # 'val_loss' only exists when validation data is supplied
            monitor = 'loss' if validation_data is None else 'val_loss'
            stopper = tf.keras.callbacks.EarlyStopping(monitor=monitor,
                                                       patience=10)
            # Copy so the caller's callbacks list is not modified
            callbacks = list(kwargs.pop('callbacks', None) or [])
            callbacks.append(stopper)
            kwargs['callbacks'] = callbacks

        self._history = self._model.fit(x=features, y=targets, epochs=epochs,
                                        validation_data=validation_data,
                                        **kwargs)

    def save_model(self, path):
        """
        Save TfModel to path.

        Parameters
        ----------
        path : str
            Directory path to save model to. The tensorflow model will be
            saved to the directory while the framework parameters will be
            saved in json.
        """
        path = self._model_dir(path)
        os.makedirs(path, exist_ok=True)

        tf.saved_model.save(self.model, path)

        model_params = {'feature_names': self.feature_names,
                        'label_names': self.label_names,
                        'norm_params': self.normalization_parameters}

        json_path = path.rstrip('/') + '.json'
        model_params = self.dict_json_convert(model_params)
        with open(json_path, 'w') as f:
            json.dump(model_params, f, indent=2, sort_keys=True)

    @classmethod
    def build(cls, features, feature_columns=None, model_layers=None,
              learning_rate=0.001, loss="mean_squared_error",
              metrics=('mae', 'mse'), optimizer_class=None, **kwargs):
        """
        Build tensorflow sequential model from given features, layers and
        kwargs

        Parameters
        ----------
        features : dict | pandas.DataFrame
            Model features
        feature_columns : list, optional
            list of feature column descriptions (dictionaries)
            if None use numeric columns with the feature_attr name,
            by default None
        model_layers : list, optional
            List of tensorflow layers.Dense kwargs (dictionaries)
            if None use a single linear layer, by default None
        learning_rate : float, optional
            tensorflow optimizer learning rate, by default 0.001
        loss : str, optional
            name of objective function, by default "mean_squared_error"
        metrics : list, optional
            List of metrics to be evaluated by the model during training and
            testing, by default ('mae', 'mse')
        optimizer_class : None | tf.keras.optimizers
            Optional explicit request of optimizer. This should be a class
            that will be instantiated in the TfModel._compile_model() method
            The default is the RMSprop optimizer.
        kwargs : dict
            kwargs for tensorflow.keras.models.compile

        Returns
        -------
        model : TfModel
            Initialized TfModel obj
        """
        model = cls.build_model(features, feature_columns=feature_columns,
                                model_layers=model_layers,
                                learning_rate=learning_rate, loss=loss,
                                metrics=metrics,
                                optimizer_class=optimizer_class, **kwargs)

        return cls(model)

    @classmethod
    def train(cls, features, labels, feature_columns=None, model_layers=None,
              learning_rate=0.001, loss="mean_squared_error",
              metrics=('mae', 'mse'), optimizer_class=None, norm_label=True,
              epochs=100, validation_split=0.2, early_stop=True,
              save_path=None, build_kwargs=None, train_kwargs=None):
        """
        Build tensorflow sequential model from given features, layers and
        kwargs and then train with given label and kwargs

        Parameters
        ----------
        features : dict | pandas.DataFrame
            Model features
        labels : dict | pandas.DataFrame
            label to train on
        feature_columns : list, optional
            list of feature column descriptions (dictionaries)
            if None use numeric columns with the feature_attr name,
            by default None
        model_layers : list, optional
            List of tensorflow layers.Dense kwargs (dictionaries)
            if None use a single linear layer, by default None
        learning_rate : float, optional
            tensorflow optimizer learning rate, by default 0.001
        loss : str, optional
            name of objective function, by default "mean_squared_error"
        metrics : list, optional
            List of metrics to be evaluated by the model during training and
            testing, by default ('mae', 'mse')
        optimizer_class : None | tf.keras.optimizers
            Optional explicit request of optimizer. This should be a class
            that will be instantiated in the TfModel._compile_model() method
            The default is the RMSprop optimizer.
        norm_label : bool
            Flag to normalize label
        epochs : int, optional
            Number of epochs to train the model, by default 100
        validation_split : float, optional
            Fraction of the training data to be used as validation data,
            by default 0.2
        early_stop : bool
            Flag to stop training when it stops improving
        save_path : str
            Directory path to save model to. The tensorflow model will be
            saved to the directory while the framework parameters will be
            saved in json.
        build_kwargs : dict
            kwargs for tensorflow.keras.models.compile
        train_kwargs : dict
            kwargs for tensorflow.keras.models.fit

        Returns
        -------
        model : TfModel
            Initialized and trained TfModel obj
        """
        if build_kwargs is None:
            build_kwargs = {}

        model = cls.build(features, feature_columns=feature_columns,
                          model_layers=model_layers,
                          learning_rate=learning_rate, loss=loss,
                          metrics=metrics, optimizer_class=optimizer_class,
                          **build_kwargs)

        if train_kwargs is None:
            train_kwargs = {}

        model.train_model(features, labels, norm_label=norm_label,
                          epochs=epochs, validation_split=validation_split,
                          early_stop=early_stop, **train_kwargs)

        if save_path is not None:
            model.save_model(save_path)

        return model

    @classmethod
    def load(cls, path):
        """
        Load model from model path.

        Parameters
        ----------
        path : str
            Directory path to TfModel to load model from. There should be a
            tensorflow saved model directory with a parallel json file for
            the TfModel framework parameters.

        Returns
        -------
        model : TfModel
            Loaded TfModel from disk.

        Raises
        ------
        IOError
            If the model directory does not exist
        """
        path = cls._model_dir(path)

        if not os.path.isdir(path):
            e = ('Can only load directory path but target is not '
                 'directory: {}'.format(path))
            logger.error(e)
            raise IOError(e)

        loaded = tf.keras.models.load_model(path)

        json_path = path.rstrip('/') + '.json'
        with open(json_path, 'r') as f:
            model_params = json.load(f)

        model = cls(loaded, **model_params)

        return model
from sklearn.ensemble import RandomForestRegressor


class RandomForestModel(MLModelBase):
    """
    scikit learn Random Forest Regression
    """

    def __init__(self, model, feature_names=None, label_name=None,
                 norm_params=None):
        """
        Parameters
        ----------
        model : sklearn.ensemble.RandomForestRegressor
            Sklearn Random Forest Model
        feature_names : list
            Ordered list of feature names.
        label_name : str
            label (output) variable name.
        norm_params : dict, optional
            Dictionary mapping feature and label names (keys) to normalization
            parameters (mean, stdev), by default None

        Raises
        ------
        ValueError
            If more than one label name is supplied
        """
        # The base class keyword is 'label_names'
        super().__init__(model, feature_names=feature_names,
                         label_names=label_name, norm_params=norm_params)

        self._check_single_label()

    def _check_single_label(self):
        """
        Raise a ValueError if more than one label name is set
        """
        if self.label_names is not None and len(self.label_names) > 1:
            msg = ("Only a single label can be supplied to {}, but {} were"
                   .format(self.__class__.__name__, len(self.label_names)))
            logger.error(msg)
            raise ValueError(msg)

    @staticmethod
    def _one_hot(df):
        """
        One-hot encode the non-numeric columns of a DataFrame and cast every
        column to float

        Parameters
        ----------
        df : pandas.DataFrame
            Features or label

        Returns
        -------
        pandas.DataFrame
            All-float DataFrame with the dummy columns added by
            pandas.get_dummies
        """
        # Casting keeps the mean / stdev arithmetic independent of the dtype
        # pandas.get_dummies uses for the dummy columns
        return pd.get_dummies(df).astype(float)

    @staticmethod
    def _json_path(path):
        """
        Path of the json file for a given model path

        Parameters
        ----------
        path : str
            Path to the .json file, or to a directory, in which case the
            file is <directory>/<directory name>.json

        Returns
        -------
        str
            Path to the .json file
        """
        if path.endswith('.json'):
            return path

        # normpath drops a trailing separator so that basename is not empty
        dir_path = os.path.normpath(path)

        return os.path.join(dir_path, os.path.basename(dir_path) + '.json')

    @staticmethod
    def build_model(**kwargs):
        """
        Build sklearn random forest model

        Parameters
        ----------
        kwargs : dict
            kwargs for sklearn.ensemble.RandomForestRegressor

        Returns
        -------
        sklearn.ensemble.RandomForestRegressor
            sklearn random forest model
        """
        model = RandomForestRegressor(**kwargs)

        return model

    def _get_norm_params(self, names):
        """
        Get means and stdevs for given feature/label names

        Parameters
        ----------
        names : list
            list of feature/label names to get normalization params for

        Returns
        -------
        means : list | None
            List of means to use for (un)normalization, None if any name
            has no normalization parameters
        stdevs : list | None
            List of stdevs to use for (un)normalization, None if any name
            has no normalization parameters
        """
        means = []
        stdevs = []
        for name in names:
            v = self._norm_params.get(name, None)
            if v is None:
                # All or nothing: the caller recomputes from the data
                return None, None

            means.append(v['mean'])
            stdevs.append(v['stdev'])

        return means, stdevs

    def normalize(self, df):
        """
        Normalize DataFrame

        Parameters
        ----------
        df : pandas.DataFrame
            DataFrame of features/label to normalize

        Returns
        -------
        norm_df : pandas.DataFrame
            Normalized features/label
        """
        df = self._one_hot(df)
        means, stdevs = self._get_norm_params(df.columns)

        norm_df, means, stdevs = self._normalize(df, mean=means,
                                                 stdev=stdevs)

        # Coerce to arrays so the positional lookups below do not depend on
        # the container type returned by _normalize
        means = np.asarray(means)
        stdevs = np.asarray(stdevs)
        for i, c in enumerate(df.columns):
            self._norm_params[c] = {'mean': means[i], 'stdev': stdevs[i]}

        return norm_df

    def unnormalize_prediction(self, prediction):
        """
        Unnormalize prediction if needed

        Parameters
        ----------
        prediction : ndarray
            Model prediction

        Returns
        -------
        prediction : ndarray
            Native prediction. Returned unchanged if the label name or its
            normalization parameters are unknown.
        """
        if not self.label_names:
            return prediction

        # Look the single label up by name and test against None: a mean of
        # 0.0 is a valid normalization parameter
        norm_params = self.get_norm_params(self.label_names[0])
        if norm_params is not None:
            prediction = self._unnormalize(prediction, norm_params['mean'],
                                           norm_params['stdev'])

        return prediction

    def _parse_data(self, features, normalize=True, names=False):
        """
        Parse features or labels and normalize if requested

        Parameters
        ----------
        features : pandas.DataFrame
            Features or label to use with model
        normalize : bool, optional
            Flag to normalize features or labels, by default True
        names : bool, optional
            Flag to retain DataFrame, by default False

        Returns
        -------
        features : ndarray | pandas.DataFrame
            Normalized (if desired) features or label

        Raises
        ------
        ValueError
            If features is not a pandas.DataFrame
        """
        if not isinstance(features, pd.DataFrame):
            msg = ("Features must be a pandas.DataFrame, but {} was supplied"
                   .format(type(features)))
            logger.error(msg)
            raise ValueError(msg)

        if normalize:
            features = self.normalize(features)
        else:
            features = self._one_hot(features)

        if not names:
            features = features.values

        return features

    def train_model(self, features, label, norm_label=True, **kwargs):
        """
        Train the model with the provided features and label

        Parameters
        ----------
        features : pandas.DataFrame
            Input features to train on
        label : pandas.DataFrame
            label to train on
        norm_label : bool
            Flag to normalize label
        kwargs : dict
            kwargs for sklearn.ensemble.RandomForestRegressor.fit

        Raises
        ------
        ValueError
            If the label has more than one column after parsing
        """
        features = self._parse_data(features, names=True)
        self._feature_names = list(features.columns)
        features = features.values

        label = self._parse_data(label, normalize=norm_label,
                                 names=True)
        self._label_names = list(label.columns)
        label = label.values

        self._check_single_label()

        self._model.fit(features, label.ravel(), **kwargs)

    def save_model(self, path):
        """
        Save Random Forest Model parameters to path.

        Only the feature / label names, the normalization parameters and the
        estimator parameters returned by ``get_params`` are written to the
        json file.

        Parameters
        ----------
        path : str
            Path to save model to: a .json file, or a directory in which
            <directory name>.json is written
        """
        path = self._json_path(path)
        dir_path = os.path.dirname(path)
        # dirname is '' for a bare file name, which os.makedirs rejects
        if dir_path:
            os.makedirs(dir_path, exist_ok=True)

        model_params = {'feature_names': self.feature_names,
                        'label_names': self.label_names,
                        'norm_params': self.normalization_parameters,
                        'model_params': self.model.get_params()}

        model_params = self.dict_json_convert(model_params)
        with open(path, 'w') as f:
            json.dump(model_params, f, indent=2, sort_keys=True)

    @classmethod
    def train(cls, features, labels, norm_label=True, save_path=None,
              build_kwargs=None, train_kwargs=None):
        """
        Build Random Forest Model with given kwargs and then train with
        given features, labels, and kwargs

        Parameters
        ----------
        features : pandas.DataFrame
            Model features
        labels : pandas.DataFrame
            label to train on
        norm_label : bool
            Flag to normalize label
        save_path : str
            Currently not used: the model is not saved by this method and a
            warning is issued if a path is given. Call save_model()
            explicitly to write the model parameters.
        build_kwargs : dict
            kwargs for sklearn.ensemble.RandomForestRegressor
        train_kwargs : dict
            kwargs for sklearn.ensemble.RandomForestRegressor.fit

        Returns
        -------
        model : RandomForestModel
            Initialized and trained RandomForestModel obj
        """
        if build_kwargs is None:
            build_kwargs = {}

        model = cls(cls.build_model(**build_kwargs))

        if train_kwargs is None:
            train_kwargs = {}

        model.train_model(features, labels, norm_label=norm_label,
                          **train_kwargs)

        if save_path is not None:
            # Saving from train() is disabled; say so instead of silently
            # ignoring the argument
            msg = ('{}.train does not save the model, save_path is ignored: '
                   '{}'.format(cls.__name__, save_path))
            logger.warning(msg)
            warn(msg)

        return model

    @classmethod
    def load(cls, path):
        """
        Load model from model path.

        Parameters
        ----------
        path : str
            Path to the .json file written by save_model, or to the
            directory containing <directory name>.json

        Returns
        -------
        model : RandomForestModel
            RandomForestModel wrapping a new RandomForestRegressor that has
            the saved estimator parameters set on it.

        Raises
        ------
        IOError
            If the json file does not exist
        """
        path = cls._json_path(path)

        if not os.path.exists(path):
            e = ('{} does not exist'.format(path))
            logger.error(e)
            raise IOError(e)

        with open(path, 'r') as f:
            model_params = json.load(f)

        loaded = RandomForestRegressor()
        loaded = loaded.set_params(**model_params.pop('model_params'))

        # save_model writes 'label_names' while the constructor keyword is
        # 'label_name'
        label_name = model_params.pop('label_name', None)
        label_name = model_params.pop('label_names', label_name)

        model = cls(loaded, label_name=label_name, **model_params)

        return model
Guided Neural Network python library.""" import os +from .model_interfaces import TfModel from .phygnn import PhysicsGuidedNeuralNetwork -from .tf_utilities import tf_isin, tf_log10 +from .utilities import tf_isin, tf_log10 PHYGNNDIR = os.path.dirname(os.path.realpath(__file__)) TESTDATADIR = os.path.join(os.path.dirname(PHYGNNDIR), 'tests', 'data') diff --git a/phygnn/ml_model.py b/phygnn/ml_model.py deleted file mode 100644 index 9b8c904..0000000 --- a/phygnn/ml_model.py +++ /dev/null @@ -1,1409 +0,0 @@ -# -*- coding: utf-8 -*- -""" -TensorFlow Model -""" -import json -import logging -import numpy as np -import os -import pandas as pd -from sklearn.ensemble import RandomForestRegressor -import tensorflow as tf -from tensorflow import feature_column -from tensorflow.keras import layers -from warnings import warn - -logger = logging.getLogger(__name__) - - -class MLModelBase: - """ - Machine Learning Model Base - """ - - def __init__(self, model, feature_names=None, label_names=None, - norm_params=None): - """ - Parameters - ---------- - model : OBJ - Sci-kit learn or tensorflow model - feature_names : list - Ordered list of feature names. - label_names : list - Ordered list of label (output) names. 
- norm_params : dict, optional - Dictionary mapping feature and label names (keys) to normalization - parameters (mean, stdev), by default None - """ - self._model = model - - if isinstance(feature_names, str): - feature_names = [feature_names] - - self._feature_names = feature_names - - if isinstance(label_names, str): - label_names = [label_names] - - self._label_names = label_names - if norm_params is None: - norm_params = {} - - self._norm_params = norm_params - - def __repr__(self): - msg = "{}:\n{}".format(self.__class__.__name__, self.model_summary) - - return msg - - def __getitem__(self, features): - """ - Use model to predict label from given features - - Parameters - ---------- - features : pandas.DataFrame - features to predict from - - Returns - ------- - pandas.DataFrame - label prediction - """ - return self.predict(features) - - @property - def model_summary(self): - """ - Tensorflow model summary - - Returns - ------- - str - """ - try: - summary = self._model.summary() - except ValueError: - summary = None - - return summary - - @property - def feature_names(self): - """ - List of the feature variable names. 
- - Returns - ------- - list - """ - return self._feature_names - - @property - def label_names(self): - """ - label variable names - - Returns - ------- - list - """ - return self._label_names - - @property - def normalization_parameters(self): - """ - Features and label (un)normalization parameters - - Returns - ------- - dict - """ - return self._norm_params - - @property - def means(self): - """ - Mapping feature/label names to the mean values for - (un)normalization - - Returns - ------- - dict - """ - means = {k: v['mean'] for k, v in self._norm_params.items()} - - return means - - @property - def stdevs(self): - """ - Mapping feature/label names to the stdev values for - (un)normalization - - Returns - ------- - dict - """ - stdevs = {k: v['stdev'] for k, v in self._norm_params.items()} - - return stdevs - - @property - def model(self): - """ - Trained model - - Returns - ------- - tensorflow.keras.models - """ - return self._model - - @property - def feature_means(self): - """ - Feature means, used for (un)normalization - - Returns - ------- - dict - """ - means = None - if self._feature_names is not None: - means = {} - for f in self._feature_names: - v = self._norm_params.get(f, None) - if v is not None: - means[f] = v['mean'] - - return means - - @property - def feature_stdevs(self): - """ - Feature stdevs, used for (un)normalization - - Returns - ------- - dict - """ - stdevs = None - if self._feature_names is not None: - stdevs = {} - for f in self._feature_names: - v = self._norm_params.get(f, None) - if v is not None: - stdevs[f] = v['stdev'] - - return stdevs - - @property - def label_means(self): - """ - label means, used for (un)normalization - - Returns - ------- - dict - """ - means = None - if self.label_names is not None: - means = {} - for l_n in self.label_names: - v = self._norm_params.get(l_n, None) - if v is not None: - means[l_n] = v['mean'] - - return means - - @property - def label_stdevs(self): - """ - label stdevs, used for 
(un)normalization - - Returns - ------- - dict - """ - stdevs = None - if self.label_names is not None: - stdevs = {} - for l_n in self.label_names: - v = self._norm_params.get(l_n, None) - if v is not None: - stdevs[l_n] = v['stdev'] - - return stdevs - - @staticmethod - def _normalize(native_arr, mean=None, stdev=None): - """ - Normalize features with mean at 0 and stdev of 1. - - Parameters - ---------- - native_arr : ndarray - native data - mean : float | None - mean to use for normalization - stdev : float | None - stdev to use for normalization - - Returns - ------- - norm_arr : ndarray - normalized data - mean : float - mean used for normalization - stdev : float - stdev used for normalization - """ - - if mean is None: - mean = np.nanmean(native_arr, axis=0) - - if stdev is None: - stdev = np.nanstd(native_arr, axis=0) - - norm_arr = native_arr - mean - norm_arr /= stdev - - return norm_arr, mean, stdev - - @staticmethod - def _unnormalize(norm_arr, mean, stdev): - """ - Unnormalize data with mean at 0 and stdev of 1. - - Parameters - ---------- - norm_arr : ndarray - normalized data - mean : float - mean used for normalization - stdev : float - stdev used for normalization - - Returns - ------- - native_arr : ndarray - native un-normalized data - """ - native_arr = norm_arr * stdev - native_arr += mean - - return native_arr - - @staticmethod - def dict_json_convert(inp): - """Recursively convert numeric values in dict to work with json dump - - Parameters - ---------- - inp : dict - Dictionary to convert. - - Returns - ------- - out : dict - Copy of dict input with all nested numeric values converted to - base python int or float and all arrays converted to lists. 
- """ - - if isinstance(inp, dict): - out = {k: MLModelBase.dict_json_convert(v) for k, v in inp.items()} - elif isinstance(inp, (list, tuple)): - out = [MLModelBase.dict_json_convert(i) for i in inp] - elif np.issubdtype(type(inp), np.floating): - out = float(inp) - elif np.issubdtype(type(inp), np.integer): - out = int(inp) - elif isinstance(inp, np.ndarray): - out = inp.tolist() - else: - out = inp - - return out - - @staticmethod - def _parse_data(data): - """ - Parse features or labels - - Parameters - ---------- - data : dict | pandas.DataFrame - Features or label to use with model - - Returns - ------- - data : dict - Dictionary of normalized (if desired) features or label - """ - - return data - - def get_norm_params(self, name): - """ - Feature or label normalization parameters - - Parameters - ---------- - name : str - feature | label name - - Returns - ------- - dict - mean and stdev values for given feature | label - """ - return self._norm_params.get(name, None) - - def get_mean(self, name): - """ - Get feature | label mean - - Parameters - ---------- - name : str - feature | label name - - Returns - ------- - mean : float - Mean value used for normalization - """ - mean = self._norm_params.get(name, None) - if mean is not None: - mean = mean.get('mean', None) - - return mean - - def get_stdev(self, name): - """ - Get feature | label stdev - - Parameters - ---------- - name : str - feature | label name - - Returns - ------- - stdev : float - Stdev value used for normalization - """ - stdev = self._norm_params.get(name, None) - if stdev is not None: - stdev = stdev.get('stdev', None) - - return stdev - - def normalize(self, items): - """ - Normalize given items (features | labels) - - Parameters - ---------- - items : dict - mapping of names to vectors - - Returns - ------- - norm_items : dict - mapping of names to normalized-feature vectors - """ - norm_items = {} - for key, value in items.items(): - mean = self.get_mean(key) - stdev = 
self.get_stdev(key) - try: - value, mean, stdev = self._normalize(value, mean=mean, - stdev=stdev) - norm_params = {key: {'mean': mean, 'stdev': stdev}} - self._norm_params.update(norm_params) - except Exception as ex: - msg = "Could not normalize {}:\n{}".format(key, ex) - logger.warning(msg) - warn(msg) - - norm_items[key] = value - - return norm_items - - def unnormalize(self, items): - """ - Unnormalize given items (features | labels) - - Parameters - ---------- - items : dict - mapping of names to vectors - - Returns - ------- - native_items : dict - mapping of names to native vectors - """ - native_items = {} - for key, value in items.items(): - norm_params = self.get_norm_params(key) - if norm_params is not None: - value = self._unnormalize(value, norm_params['mean'], - norm_params['stdev']) - native_items[key] = value - else: - msg = ("Normalization Parameters unavailable for {}" - .format(key)) - logger.warning(msg) - warn(msg) - - return native_items - - def unnormalize_prediction(self, prediction): - """ - Unnormalize prediction if needed - - Parameters - ---------- - prediction : ndarray - TfModel prediction - - Returns - ------- - prediction : ndarray - Unnormalized prediction - """ - out = np.empty(prediction.shape, prediction.dtype) - for ax, label in enumerate(self.label_names): - value = prediction[:, ax] - norm_params = self.get_norm_params(label) - if norm_params is not None: - value = self._unnormalize(value, norm_params['mean'], - norm_params['stdev']) - else: - msg = ("Normalization Parameters unavailable for {}" - .format(label)) - logger.warning(msg) - warn(msg) - - out[:, ax] = value - - return out - - def predict(self, features, **kwargs): - """ - Use model to predict label from given features - - Parameters - ---------- - features : dict | pandas.DataFrame - features to predict from - kwargs : dict - kwargs for tensorflow.*.predict - - Returns - ------- - prediction : dict - label prediction - """ - features = self._parse_data(features) - 
- prediction = pd.DataFrame(self._model.predict(features, **kwargs), - columns=self.label_names) - - prediction = self.unnormalize_prediction(prediction) - - return prediction - - -class TfModel(MLModelBase): - """ - TensorFlow Keras Model - """ - def __init__(self, model, feature_names=None, label_names=None, - norm_params=None): - """ - Parameters - ---------- - model : tensorflow.keras.models.Sequential - Tensorflow Keras Model - feature_names : list - Ordered list of feature names. - label_names : list - Ordered list of label (output) names. - norm_params : dict, optional - Dictionary mapping feature and label names (keys) to normalization - parameters (mean, stdev), by default None - """ - super().__init__(model, feature_names=feature_names, - label_names=label_names, norm_params=norm_params) - - self._history = None - - @property - def history(self): - """ - Model training history - - Returns - ------- - pandas.DataFrame | None - """ - if self._history is None: - msg = 'Model has not been trained yet!' 
- logger.warning(msg) - warn(msg) - history = None - else: - history = pd.DataFrame(self._history.history) - history['epoch'] = self._history.epoch - - return history - - @staticmethod - def _clean_name(name): - """ - Make feature | label name compatible with TensorFlow - - Parameters - ---------- - name : str - Feature |label name from GOOML - - Returns - ------- - name : str - Feature | label name compatible with TensorFlow - """ - name = name.replace(' ', '_') - name = name.replace('*', '-x-') - name = name.replace('+', '-plus-') - name = name.replace('**', '-exp-') - name = name.replace(')', '') - name = name.replace('log(', 'log-') - - return name - - @staticmethod - def _generate_feature_columns(features): - """ - Generate feature layer from features table - - Parameters - ---------- - features : dict - model features - - Returns - ------- - feature_columns : list - List of tensorFlow.feature_column objects - """ - feature_columns = [] - for name, data in features.items(): - name = TfModel._clean_name(name) - if np.issubdtype(data.dtype.name, np.number): - f_col = feature_column.numeric_column(name) - else: - f_col = TfModel._generate_cat_column(name, data) - - feature_columns.append(f_col) - - return feature_columns - - @staticmethod - def _generate_cat_column(name, data, vocab_threshold=50, bucket_size=100): - """Generate a feature column from a categorical string data set - - Parameters - ---------- - name : str - Name of categorical columns - data : np.ndarray | list - String data array - vocab_threshold : int - Number of unique entries in the data array below which this - will use a vocabulary list, above which a hash bucket will be used. - bucket_size : int - Hash bucket size. - - Returns - ------- - f_col : IndicatorColumn - Categorical feature column. 
- """ - - n_unique = len(set(data)) - - if n_unique < vocab_threshold: - f_col = feature_column.categorical_column_with_vocabulary_list( - name, list(set(data))) - else: - f_col = feature_column.categorical_column_with_hash_bucket( - name, bucket_size) - - f_col = feature_column.indicator_column(f_col) - - return f_col - - @staticmethod - def _build_feature_columns(feature_columns): - """ - Build the feature layer from given feature column descriptions - - Parameters - ---------- - feature_columns : list - list of feature column descriptions (dictionaries) - - Returns - ------- - tf_columns : list - List of tensorFlow.feature_column objects - """ - tf_columns = {} - col_map = {} # TODO: build map to tf.feature_column functions - # TODO: what feature_columns need to be wrapped - indicators = [feature_column.categorical_column_with_hash_bucket, - feature_column.categorical_column_with_identity, - feature_column.categorical_column_with_vocabulary_file, - feature_column.categorical_column_with_vocabulary_list, - feature_column.crossed_column] - for col in feature_columns: - name = col['name'] - f_type = col_map.get(col['type'], col['type']) - kwargs = col.get('kwargs', {}) - - if f_type == feature_column.crossed_column: - cross_cols = [tf_columns[name] - for name in col['cross_columns']] - f_col = f_type(cross_cols, **kwargs) - elif f_type == feature_column.embedding_column: - embedded_type = col_map[col['embedded_col']] - f_col = embedded_type(name, **kwargs) - f_col = f_type(f_col, **kwargs) - else: - f_col = f_type(name, **kwargs) - - if f_type in indicators: - f_col = feature_column.indicator_column(f_col) - - tf_columns[name] = f_col - - return tf_columns - - @staticmethod - def _compile_model(feature_columns, model_layers=None, learning_rate=0.001, - loss="mean_squared_error", metrics=('mae', 'mse'), - optimizer_class=None, **kwargs): - """ - Build tensorflow sequential model from given layers and kwargs - - Parameters - ---------- - feature_columns : list - List 
of tensorFlow.feature_column objects - model_layers : list, optional - List of tensorflow layers.Dense kwargs (dictionaries) - if None use a single linear layer, by default None - learning_rate : float, optional - tensorflow optimizer learning rate, by default 0.001 - loss : str, optional - name of objective function, by default "mean_squared_error" - metrics : list, optional - List of metrics to be evaluated by the model during training and - testing, by default ('mae', 'mse') - optimizer_class : None | tf.keras.optimizers - Optional explicit request of optimizer. This should be a class - that will be instantated in the TfModel._compile_model() method - The default is the RMSprop optimizer. - kwargs : dict - kwargs for tensorflow.keras.models.compile - - Returns - ------- - tensorflow.keras.models.Sequential - Compiled tensorflow Sequential model - """ - model = tf.keras.models.Sequential() - model.add(layers.DenseFeatures(feature_columns)) - if model_layers is None: - # Add a single linear layer - model.add(layers.Dense(units=1, input_shape=(1,))) - else: - for layer in model_layers: - dropout = layer.pop('dropout', None) - model.add(layers.Dense(**layer)) - - if dropout is not None: - model.add(layers.Dropout(dropout)) - - if isinstance(metrics, tuple): - metrics = list(metrics) - elif not isinstance(metrics, list): - metrics = [metrics] - - if optimizer_class is None: - optimizer = tf.keras.optimizers.RMSprop( - learning_rate=learning_rate) - else: - optimizer = optimizer_class(learning_rate=learning_rate) - - model.compile(optimizer=optimizer, loss=loss, metrics=metrics, - **kwargs) - - return model - - @staticmethod - def build_model(features, feature_columns=None, model_layers=None, - learning_rate=0.001, loss="mean_squared_error", - metrics=('mae', 'mse'), optimizer_class=None, **kwargs): - """ - Build tensorflow sequential model from given layers and kwargs - - Parameters - ---------- - features : dict | pandas.DataFrame - Model features - feature_columns 
: list, optional - list of feature column descriptions (dictionaries) - if None use numeric columns with the feature_attr name, - by default None - model_layers : list, optional - List of tensorflow layers.Dense kwargs (dictionaries) - if None use a single linear layer, by default None - learning_rate : float, optional - tensorflow optimizer learning rate, by default 0.001 - loss : str, optional - name of objective function, by default "mean_squared_error" - metrics : list, optional - List of metrics to be evaluated by the model during training and - testing, by default ('mae', 'mse') - optimizer_class : None | tf.keras.optimizers - Optional explicit request of optimizer. This should be a class - that will be instantated in the TfModel._compile_model() method. - The default is the RMSprop optimizer. - kwargs : dict - kwargs for tensorflow.keras.models.compile - - Returns - ------- - tensorflow.keras.models.Sequential - Compiled tensorflow Sequential model - """ - if feature_columns is None: - feature_columns = TfModel._generate_feature_columns(features) - else: - if len(feature_columns) < len(features.keys()): - msg = ("There must be at least one feature column per feature!" - " {} columns were supplied but there are {} features!" 
- .format(len(feature_columns), - len(features.keys()))) - logger.error(msg) - raise ValueError - - feature_columns = TfModel._build_feature_columns(feature_columns) - - model = TfModel._compile_model(feature_columns, - model_layers=model_layers, - learning_rate=learning_rate, - loss=loss, metrics=metrics, - optimizer_class=optimizer_class, - **kwargs) - - return model - - def _parse_data(self, data, normalize=True, clean_names=True): - """ - Parse features or labels, normalize, and clean names if requested - - Parameters - ---------- - data : dict | pandas.DataFrame - Features or label to use with model - normalize : bool, optional - Flag to normalize features or labels, by default True - clean_names : bool, optional - Flag to clean feature or label names, by default True - - Returns - ------- - data : dict - Dictionary of normalized (if desired) features or label - """ - if isinstance(data, pd.DataFrame): - data = {name: np.array(value) for name, value in data.items()} - elif not isinstance(data, dict): - msg = ("Features and label must be supplied as a pandas.DataFrame" - " or python dictionary, but recieved: {}" - .format(type(data))) - logger.error(msg) - raise ValueError(msg) - - if normalize: - data = self.normalize(data) - - if clean_names: - data = {self._clean_name(key): value - for key, value in data.items()} - - return data - - def train_model(self, features, labels, norm_label=True, epochs=100, - validation_split=0.2, early_stop=True, **kwargs): - """ - Train the model with the provided features and label - - Parameters - ---------- - features : dict | pandas.DataFrame - Input features to train on - labels : dict | pandas.DataFrame - label to train on - norm_label : bool - Flag to normalize label - epochs : int, optional - Number of epochs to train the model, by default 100 - validation_split : float, optional - Fraction of the training data to be used as validation data, - by default 0.2 - early_stop : bool - Flag to stop training when it stops 
improving - kwargs : dict - kwargs for tensorflow.keras.models.fit - """ - features = self._parse_data(features) - self._feature_names = list(features.keys()) - - labels = self._parse_data(labels, normalize=norm_label) - - self._label_names = list(labels.keys()) - - if self._history is not None: - msg = 'Model has already been trained and will be re-fit!' - logger.warning(msg) - warn(msg) - - if early_stop: - early_stop = tf.keras.callbacks.EarlyStopping(monitor='val_loss', - patience=10) - callbacks = kwargs.pop('callbacks', None) - if callbacks is None: - callbacks = [early_stop] - else: - callbacks.append(early_stop) - - kwargs['callbacks'] = callbacks - - if validation_split > 0: - split = int(len(list(features.values())[0]) * validation_split) - validate_features = {name: arr[-split:] - for name, arr in features.items()} - validate_labels = [arr[-split:] for arr in labels.values()] - validation_data = (validate_features, validate_labels) - - features = {name: arr[:-split] - for name, arr in features.items()} - labels = [arr[:-split] for arr in labels.values()] - else: - validation_data = None - - self._history = self._model.fit(x=features, y=labels, epochs=epochs, - validation_data=validation_data, - **kwargs) - - def save_model(self, path): - """ - Save TfModel to path. - - Parameters - ---------- - path : str - Directory path to save model to. The tensorflow model will be - saved to the directory while the framework parameters will be - saved in json. 
- """ - if path.endswith('.json'): - path = path.replace('.json', '/') - - if not path.endswith('/'): - path += '/' - - if not os.path.exists(path): - os.makedirs(path) - - tf.saved_model.save(self.model, path) - - model_params = {'feature_names': self.feature_names, - 'label_names': self.label_names, - 'norm_params': self.normalization_parameters} - - json_path = path.rstrip('/') + '.json' - model_params = self.dict_json_convert(model_params) - with open(json_path, 'w') as f: - json.dump(model_params, f, indent=2, sort_keys=True) - - @classmethod - def build(cls, features, feature_columns=None, model_layers=None, - learning_rate=0.001, loss="mean_squared_error", - metrics=('mae', 'mse'), optimizer_class=None, **kwargs): - """ - Build tensorflow sequential model from given features, layers and - kwargs - - Parameters - ---------- - features : dict | pandas.DataFrame - Model features - feature_columns : list, optional - list of feature column descriptions (dictionaries) - if None use numeric columns with the feature_attr name, - by default None - model_layers : list, optional - List of tensorflow layers.Dense kwargs (dictionaries) - if None use a single linear layer, by default None - learning_rate : float, optional - tensorflow optimizer learning rate, by default 0.001 - loss : str, optional - name of objective function, by default "mean_squared_error" - metrics : list, optional - List of metrics to be evaluated by the model during training and - testing, by default ('mae', 'mse') - optimizer_class : None | tf.keras.optimizers - Optional explicit request of optimizer. This should be a class - that will be instantated in the TfModel._compile_model() method - The default is the RMSprop optimizer. 
- kwargs : dict - kwargs for tensorflow.keras.models.compile - - Returns - ------- - model : TfModel - Initialized TfKeraModel obj - """ - model = TfModel.build_model(features, feature_columns=feature_columns, - model_layers=model_layers, - learning_rate=learning_rate, loss=loss, - metrics=metrics, - optimizer_class=optimizer_class, **kwargs) - - return cls(model) - - @classmethod - def train(cls, features, labels, feature_columns=None, model_layers=None, - learning_rate=0.001, loss="mean_squared_error", - metrics=('mae', 'mse'), optimizer_class=None, norm_label=True, - epochs=100, validation_split=0.2, early_stop=True, - save_path=None, build_kwargs=None, train_kwargs=None): - """ - Build tensorflow sequential model from given features, layers and - kwargs and then train with given label and kwargs - - Parameters - ---------- - features : dict | pandas.DataFrame - Model features - labels : dict | pandas.DataFrame - label to train on - feature_columns : list, optional - list of feature column descriptions (dictionaries) - if None use numeric columns with the feature_attr name, - by default None - model_layers : list, optional - List of tensorflow layers.Dense kwargs (dictionaries) - if None use a single linear layer, by default None - learning_rate : float, optional - tensorflow optimizer learning rate, by default 0.001 - loss : str, optional - name of objective function, by default "mean_squared_error" - metrics : list, optional - List of metrics to be evaluated by the model during training and - testing, by default ('mae', 'mse') - optimizer_class : None | tf.keras.optimizers - Optional explicit request of optimizer. This should be a class - that will be instantated in the TfModel._compile_model() method - The default is the RMSprop optimizer. 
- norm_label : bool - Flag to normalize label - epochs : int, optional - Number of epochs to train the model, by default 100 - validation_split : float, optional - Fraction of the training data to be used as validation data, - by default 0.2 - early_stop : bool - Flag to stop training when it stops improving - save_path : str - Directory path to save model to. The tensorflow model will be - saved to the directory while the framework parameters will be - saved in json. - build_kwargs : dict - kwargs for tensorflow.keras.models.compile - train_kwargs : dict - kwargs for tensorflow.keras.models.fit - - Returns - ------- - model : TfModel - Initialized and trained TfModel obj - """ - if build_kwargs is None: - build_kwargs = {} - - model = cls.build(features, feature_columns=feature_columns, - model_layers=model_layers, - learning_rate=learning_rate, loss=loss, - metrics=metrics, optimizer_class=optimizer_class, - **build_kwargs) - - if train_kwargs is None: - train_kwargs = {} - - model.train_model(features, labels, norm_label=norm_label, - epochs=epochs, validation_split=validation_split, - early_stop=early_stop, **train_kwargs) - - if save_path is not None: - model.save_model(save_path) - - return model - - @classmethod - def load(cls, path): - """ - Load model from model path. - - Parameters - ---------- - path : str - Directory path to TfModel to load model from. There should be a - tensorflow saved model directory with a parallel pickle file for - the TfModel framework. - - Returns - ------- - model : TfModel - Loaded TfModel from disk. 
- """ - if path.endswith('.json'): - path = path.replace('.json', '/') - - if not path.endswith('/'): - path += '/' - - if not os.path.isdir(path): - e = ('Can only load directory path but target is not ' - 'directory: {}'.format(path)) - logger.error(e) - raise IOError(e) - - loaded = tf.keras.models.load_model(path) - - json_path = path.rstrip('/') + '.json' - with open(json_path, 'r') as f: - model_params = json.load(f) - - model = cls(loaded, **model_params) - - return model - - -class RandomForestModel(MLModelBase): - """ - scikit learn Random Forest Regression - """ - - def __init__(self, model, feature_names=None, label_name=None, - norm_params=None): - """ - Parameters - ---------- - model : sklearn.ensemble.RandomForestRegressor - Sklearn Random Forest Model - feature_names : list - Ordered list of feature names. - label_name : str - label (output) variable name. - norm_params : dict, optional - Dictionary mapping feature and label names (keys) to normalization - parameters (mean, stdev), by default None - """ - super().__init__(model, feature_names=feature_names, - label_name=label_name, norm_params=norm_params) - - if len(self.label_names) > 1: - msg = ("Only a single label can be supplied to {}, but {} were" - .format(self.__class__.__name__, len(self.label_names))) - logger.error(msg) - raise ValueError(msg) - - @staticmethod - def build_model(**kwargs): - """ - Build sklearn random forest model - - Parameters - ---------- - kwargs : dict - kwargs for sklearn.ensemble.RandomForestRegressor - - Returns - ------- - sklearn.ensemble.RandomForestRegressor - sklearn random forest model - """ - model = RandomForestRegressor(**kwargs) - - return model - - def _get_norm_params(self, names): - """ - Get means and stdevs for given feature/label names - - Parameters - ---------- - names : list - list of feature/label names to get normalization params for - - Returns - ------- - means : list | None - List of means to use for (un)normalization - stdevs : list | 
None - List of stdevs to use for (un)normalization - """ - means = [] - stdevs = [] - for name in names: - v = self._norm_params.get(name, None) - if v is None: - means = None - stdevs = None - break - - means.append(v['mean']) - stdevs.append(v['stdev']) - - return means, stdevs - - def normalize(self, df): - """ - Normalize DataFrame - - Parameters - ---------- - df : pandas.DataFrame - DataFrame of features/label to normalize - - Returns - ------- - norm_df : pandas.DataFrame - Normalized features/label - """ - df = pd.get_dummies(df) - means, stdevs = self._get_norm_params(df.columns) - - norm_df, means, stdevs = self._normalize(df, mean=means, - stdev=stdevs) - for i, c in enumerate(df.columns): - norm_params = {c: {'mean': means[i], 'stdev': stdevs[i]}} - self._norm_params.update(norm_params) - - return norm_df - - def unnormalize_prediction(self, prediction): - """ - Unnormalize prediction if needed - - Parameters - ---------- - prediction : ndarray - Model prediction - - Returns - ------- - prediction : ndarray - Native prediction - """ - means = self.label_means[0] - if means: - stdevs = self.label_stdevs[0] - prediction = self._unnormalize(prediction, means, stdevs) - - return prediction - - def _parse_data(self, features, normalize=True, names=False): - """ - Parse features or labels, normalize, and clean names if requested - - Parameters - ---------- - features : panda.DataFrame - Features or label to use with model - normalize : bool, optional - Flag to normalize features or labels, by default True - names : bool, optional - Flag to retain DataFrame, by default False - - Returns - ------- - features : ndarray | panda.DataFrame - Normalized (if desired) features or label - """ - if not isinstance(features, pd.DataFrame): - msg = ("Features must be a pandas.DataFrame, but {} was supplied" - .format(type(features))) - logger.error(msg) - raise ValueError(msg) - - if normalize: - features = self.normalize(features) - else: - features = 
pd.get_dummies(features) - - if not names: - features = features.values - - return features - - def train_model(self, features, label, norm_label=True, **kwargs): - """ - Train the model with the provided features and label - - Parameters - ---------- - features : dict | pandas.DataFrame - Input features to train on - labels : dict | pandas.DataFrame - label to train on - norm_label : bool - Flag to normalize label - kwargs : dict - kwargs for sklearn.ensemble.RandomForestRegressor.fit - """ - features = self._parse_data(features, names=True) - self._feature_names = list(features.columns) - features = features.values - - label = self._parse_data(label, normalize=norm_label, - names=True) - self._label_names = list(label.columns) - label = label.values - - if len(self.label_names) > 1: - msg = ("Only a single label can be supplied to {}, but {} were" - .format(self.__class__.__name__, len(self.label_names))) - logger.error(msg) - raise ValueError(msg) - - self._model.fit(features, label.ravel(), **kwargs) - - def save_model(self, path): - """ - Save Random Forest Model to path. 
- - Parameters - ---------- - path : str - Path to save model to - """ - if path.endswith('.json'): - dir_path = os.path.dirname(path) - else: - dir_path = path - path = os.path.join(dir_path, os.path.basename(path) + '.json') - - if not os.path.exists(dir_path): - os.makedirs(dir_path) - - model_params = {'feature_names': self.feature_names, - 'label_names': self.label_names, - 'norm_params': self.normalization_parameters, - 'model_params': self.model.get_params()} - - model_params = self.dict_json_convert(model_params) - with open(path, 'w') as f: - json.dump(model_params, f, indent=2, sort_keys=True) - - @classmethod - def train(cls, features, labels, norm_label=True, save_path=None, - build_kwargs=None, train_kwargs=None): - """ - Build Random Forest Model with given kwargs and then train with - given features, labels, and kwargs - - Parameters - ---------- - features : pandas.DataFrame - Model features - labels : pandas.DataFrame - label to train on - norm_label : bool - Flag to normalize label - save_path : str - Directory path to save model to. The RandomForest Model will be - saved to the directory while the framework parameters will be - saved in json. - build_kwargs : dict - kwargs for tensorflow.keras.models.compile - train_kwargs : dict - kwargs for tensorflow.keras.models.fit - - Returns - ------- - model : RandomForestModel - Initialized and trained RandomForestModel obj - """ - if build_kwargs is None: - build_kwargs = {} - - model = cls(cls.build_model(**build_kwargs)) - - if train_kwargs is None: - train_kwargs = {} - - model.train_model(features, labels, norm_label=norm_label, - **train_kwargs) - - if save_path is not None: - pass - # model.save_model(save_path) - - return model - - @classmethod - def load(cls, path): - """ - Load model from model path. - - Parameters - ---------- - path : str - Directory path to TfModel to load model from. 
There should be a - tensorflow saved model directory with a parallel pickle file for - the TfModel framework. - - Returns - ------- - model : TfModel - Loaded TfModel from disk. - """ - if not path.endswith('.json'): - path = os.path.join(path, os.path.basename(path) + '.json') - - if not os.path.exists(path): - e = ('{} does not exist'.format(path)) - logger.error(e) - raise IOError(e) - - with open(path, 'r') as f: - model_params = json.load(f) - - loaded = RandomForestRegressor() - loaded = loaded.set_params(**model_params.pop('model_params')) - - model = cls(loaded, **model_params) - - return model diff --git a/phygnn/model_interfaces/__init__.py b/phygnn/model_interfaces/__init__.py new file mode 100644 index 0000000..700b557 --- /dev/null +++ b/phygnn/model_interfaces/__init__.py @@ -0,0 +1,3 @@ +# -*- coding: utf-8 -*- +"""Model Interfaces""" +from .tf_model import TfModel diff --git a/phygnn/model_interfaces/base_model.py b/phygnn/model_interfaces/base_model.py new file mode 100644 index 0000000..f112ca0 --- /dev/null +++ b/phygnn/model_interfaces/base_model.py @@ -0,0 +1,852 @@ +# -*- coding: utf-8 -*- +""" +Base Model Interface +""" +import logging +import numpy as np +import pandas as pd +from warnings import warn + +from phygnn.utilities.pre_processing import PreProcess + +logger = logging.getLogger(__name__) + + +class ModelBase: + """ + Base Model Interface + """ + + def __init__(self, model, feature_names=None, label_names=None, + norm_params=None): + """ + Parameters + ---------- + model : OBJ + Sci-kit learn or tensorflow model + feature_names : list + Ordered list of feature names. + label_names : list + Ordered list of label (output) names. 
@property
def model_summary(self):
    """
    Text summary of the wrapped model

    The wrapped model's ``summary`` method is called with a ``print_fn``
    callback so that the reported lines are collected and returned rather
    than written to stdout. This makes the summary available to
    ``__repr__``.

    Returns
    -------
    str | None
        Model summary, or None if the wrapped model has no callable
        ``summary`` method or if ``summary`` raises a ValueError.
    """
    summary_fn = getattr(self._model, 'summary', None)
    if not callable(summary_fn):
        # Models without a summary() method (e.g. scikit-learn
        # estimators) simply have no summary to report
        return None

    lines = []

    def _collect(line, *_args, **_kwargs):
        # Tolerate extra positional/keyword arguments from the caller
        lines.append(str(line))

    try:
        summary = summary_fn(print_fn=_collect)
    except ValueError:
        # Same contract as before: a ValueError means "no summary yet"
        return None

    if summary is None and lines:
        summary = '\n'.join(lines)

    return summary
@property
def feature_means(self):
    """
    Mean of every feature, in feature-name order, for (un)normalization

    Returns
    -------
    list | None
        One entry per feature name as returned by ``get_mean`` (None for a
        feature without a stored mean). None if no feature names are set.
    """
    names = self._feature_names
    if names is None:
        return None

    return [self.get_mean(name) for name in names]
(un)normalization + + Returns + ------- + list + """ + means = None + if self.label_names is not None: + means = [] + for l_n in self.label_names: + means.append(self.get_mean(l_n)) + + return means + + @property + def label_stdevs(self): + """ + label stdevs, used for (un)normalization + + Returns + ------- + list + """ + stdevs = None + if self.label_names is not None: + stdevs = [] + for l_n in self.label_names: + stdevs.append(self.get_stdev(l_n)) + + return stdevs + + @staticmethod + def _normalize(native_arr, mean=None, stdev=None): + """ + Normalize features with mean at 0 and stdev of 1. + + Parameters + ---------- + native_arr : ndarray + native data + mean : float | None + mean to use for normalization + stdev : float | None + stdev to use for normalization + + Returns + ------- + norm_arr : ndarray + normalized data + mean : float + mean used for normalization + stdev : float + stdev used for normalization + """ + + if mean is None: + mean = np.nanmean(native_arr, axis=0) + + if stdev is None: + stdev = np.nanstd(native_arr, axis=0) + + norm_arr = native_arr - mean + norm_arr /= stdev + + return norm_arr, mean, stdev + + @staticmethod + def _unnormalize(norm_arr, mean, stdev): + """ + Unnormalize data with mean at 0 and stdev of 1. + + Parameters + ---------- + norm_arr : ndarray + normalized data + mean : float + mean used for normalization + stdev : float + stdev used for normalization + + Returns + ------- + native_arr : ndarray + native un-normalized data + """ + native_arr = norm_arr * stdev + native_arr += mean + + return native_arr + + @staticmethod + def dict_json_convert(inp): + """Recursively convert numeric values in dict to work with json dump + + Parameters + ---------- + inp : dict + Dictionary to convert. + + Returns + ------- + out : dict + Copy of dict input with all nested numeric values converted to + base python int or float and all arrays converted to lists. 
def get_norm_params(self, names):
    """
    Get means and stdevs for given feature/label names

    Parameters
    ----------
    names : list
        list of feature/label names to get normalization params for

    Returns
    -------
    means : list | None
        List of means to use for (un)normalization, None if the mean of
        any of the requested names is not available
    stdevs : list | None
        List of stdevs to use for (un)normalization, None if the stdev of
        any of the requested names is not available
    """
    means = [self.get_mean(name) for name in names]
    stdevs = [self.get_stdev(name) for name in names]

    # Test for "missing" explicitly: a truthiness test would also reject
    # a legitimate stored value of 0 (e.g. the mean of a one-hot column)
    if any(mean is None for mean in means):
        means = None

    if any(stdev is None for stdev in stdevs):
        stdevs = None

    return means, stdevs
def get_stdev(self, name):
    """
    Look up the stdev stored for a feature | label

    Parameters
    ----------
    name : str
        feature | label name

    Returns
    -------
    stdev : float | None
        Stdev value used for normalization, None if no normalization
        parameters or no 'stdev' entry are stored for name
    """
    params = self._norm_params.get(name, None)
    if params is None:
        return None

    return params.get('stdev', None)
def normalize(self, data, names=None):
    """
    Normalize given data

    Parameters
    ----------
    data : dict | pandas.DataFrame | ndarray | list
        Data to normalize
    names : list, optional
        List of data item names, needed to normalize ndarrays and lists,
        by default None

    Returns
    -------
    data : dict | pandas.DataFrame | ndarray
        Normalized data in same format as input, except that a list is
        returned as an ndarray

    Raises
    ------
    RuntimeError
        If an ndarray or list is supplied without names, or if data is of
        an unsupported type
    """
    if isinstance(data, dict):
        return self._normalize_dict(data)

    if isinstance(data, pd.DataFrame):
        return self._normalize_df(data)

    if isinstance(data, (list, np.ndarray)):
        if names is None:
            msg = ('Names of items must be supplied to nomralize data '
                   'arrays')
            logger.error(msg)
            raise RuntimeError(msg)

        # _normalize_arr reads arr.shape, which a plain list does not have
        return self._normalize_arr(np.asarray(data), names)

    msg = "Cannot normalize data of type: {}".format(type(data))
    logger.error(msg)
    raise RuntimeError(msg)
norm_params['stdev']) + else: + msg = ("Normalization Parameters unavailable for {}" + .format(key)) + logger.warning(msg) + warn(msg) + + native_items[key] = value + + return native_items + + def _unnormalize_df(self, df): + """ + Un-normalize DataFrame + + Parameters + ---------- + df : pandas.DataFrame + DataFrame of features/label to un-normalize + + Returns + ------- + native_df : pandas.DataFrame + Native features/label array + """ + means, stdevs = self.get_norm_params(df.columns) + + native_df = self._unnormalize(df, means, stdevs) + + return native_df + + def _unnormalize_arr(self, arr, names): + """ + Un-normalize array using given names + + Parameters + ---------- + arr : ndarray + Array of features/label to un-normalize + names : list + List of feature/label names + + Returns + ------- + native_arr : ndarray + Native features/label array + """ + if len(names) != arr.shape[1]: + msg = ("Number of item names ({}) does not match number of items " + "({})".format(len(names), arr.shape[1])) + logger.error(msg) + raise RuntimeError(msg) + + means, stdevs = self.get_norm_params(names) + + native_arr = self._unnormalize(arr, means, stdevs) + + return native_arr + + def unnormalize(self, data, names=None): + """ + Un-normalize given data + + Parameters + ---------- + data : dict | pandas.DataFrame | ndarray + Data to un-normalize + names : list, optional + List of data item names, needed to un-normalized ndarrays, + by default None + + Returns + ------- + data : dict | pandas.DataFrame | ndarray + Native data in same format as input + """ + if isinstance(data, dict): + data = self._unnormalize_dict(data) + elif isinstance(data, pd.DataFrame): + data = self._unnormalize_df(data) + elif isinstance(data, (list, np.ndarray)): + if names is None: + msg = ('Names of items must be supplied to un-nomralize data ' + 'arrays') + logger.error(msg) + raise RuntimeError(msg) + else: + data = self._unnormalize_arr(data, names) + else: + msg = "Cannot un-normalize data of 
type: {}".format(type(data)) + logger.error(msg) + raise RuntimeError(msg) + + return data + + def _check_one_hot_norm_params(self, one_hot_features): + """ + Check one hot feature normalization parameters to ensure they are + {mean: 0, stdev: 1} to prevent normalization + + Parameters + ---------- + one_hot_features : list + list of one hot features + """ + for feature in one_hot_features: + mean = self.get_mean(feature) + stdev = self.get_stdev(feature) + if mean != 0 and stdev != 1: + norm_params = {feature: {'mean': 0, 'stdev': 1}} + self._norm_params.update(norm_params) + + def _parse_features(self, features, names=None, process_one_hot=True, + **kwargs): + """ + Parse features + + Parameters + ---------- + features : pandas.DataFrame | dict | ndarray + Features to train on or predict from + names : list, optional + List of feature names, by default None + process_one_hot : bool, optional + Check for and process one-hot variables, by default True + kwargs : dict, optional + kwargs for PreProcess.one_hot + + Returns + ------- + features : ndarray + Parsed features array normalized and with str columns converted + to one hot vectors if desired + """ + if len(features.shape) != 2: + msg = ('{} can only use 2D data as input!' 
+ .format(self.__class__.__name__)) + logger.error(msg) + raise RuntimeError(msg) + + if self.feature_names is not None: + if features.shape[1] != len(self.feature_names): + msg = ('data has {} features but expected {}' + .format(features.shape[1], self.feature_dims)) + logger.error(msg) + raise RuntimeError(msg) + + features, feature_names = self._parse_data(features, names=names) + + if self._feature_names is None: + self._feature_names = feature_names + elif self.feature_names != feature_names: + msg = ('Expecting features with names: {}, but was provided with: ' + '{}!'.format(feature_names, self.feature_names)) + logger.error(msg) + raise RuntimeError(msg) + + if process_one_hot: + kwargs.update({'return_ind': True}) + features, one_hot_ind = PreProcess.one_hot(features, **kwargs) + self._check_one_hot_norm_params(self.feature_names[one_hot_ind]) + + features = self.normalize(features, names=feature_names) + + return features + + def _parse_labels(self, labels, names=None, normalize=True): + """ + Parse labels and normalize if desired + + Parameters + ---------- + labels : pandas.DataFrame | dict | ndarray + Features to train on or predict from + names : list, optional + List of label names, by default None + normalize : bool, optional + Normalize label array, by default True + + Returns + ------- + labels : ndarray + Parsed labels array, normalized if desired + """ + if self.label_names is not None: + if len(labels.shape) == 1: + n_labels = len(labels) + else: + n_labels = labels.shape[1] + + if n_labels != len(self.label_names): + msg = ('data has {} labels but expected {}' + .format(labels.shape[1], self.label_dims)) + logger.error(msg) + raise RuntimeError(msg) + + labels, label_names = self._parse_data(labels, names=names) + + if self._label_names is None: + self._label_names = label_names + elif self.label_names != label_names: + msg = ('Expecting labels with names: {}, but was provided with: ' + '{}!'.format(label_names, self.label_names)) + 
class RandomForestModel(ModelBase):
    """
    scikit learn Random Forest Regression
    """

    def __init__(self, model, feature_names=None, label_name=None,
                 norm_params=None, label_names=None):
        """
        Parameters
        ----------
        model : sklearn.ensemble.RandomForestRegressor
            Sklearn Random Forest Model
        feature_names : list
            Ordered list of feature names.
        label_name : str
            label (output) variable name.
        norm_params : dict, optional
            Dictionary mapping feature and label names (keys) to
            normalization parameters (mean, stdev), by default None
        label_names : str | list, optional
            Alias of label_name using the keyword of the base class. This
            is the keyword used by `train` and `load` (the saved json
            stores 'label_names'). Takes precedence over label_name,
            by default None
        """
        if label_names is None:
            label_names = label_name

        # The base class only knows the keyword 'label_names'
        super().__init__(model, feature_names=feature_names,
                         label_names=label_names, norm_params=norm_params)

        self._check_single_label()

    def _check_single_label(self):
        """
        Ensure that at most one label is set

        Raises
        ------
        ValueError
            If more than one label name is set
        """
        if self.label_names is not None and len(self.label_names) > 1:
            msg = ("Only a single label can be supplied to {}, but {} were"
                   .format(self.__class__.__name__, len(self.label_names)))
            logger.error(msg)
            raise ValueError(msg)

    @staticmethod
    def compile_model(**kwargs):
        """
        Build sklearn random forest model

        Parameters
        ----------
        kwargs : dict
            kwargs for sklearn.ensemble.RandomForestRegressor

        Returns
        -------
        sklearn.ensemble.RandomForestRegressor
            sklearn random forest model
        """
        model = RandomForestRegressor(**kwargs)

        return model

    def unnormalize_prediction(self, prediction):
        """
        Unnormalize prediction if needed

        Parameters
        ----------
        prediction : ndarray
            Model prediction

        Returns
        -------
        prediction : ndarray
            Native prediction, returned unchanged if no label name or no
            label normalization parameters are available
        """
        means = self.label_means
        stdevs = self.label_stdevs
        if not means or not stdevs:
            # No label name set yet
            return prediction

        mean, stdev = means[0], stdevs[0]
        # Compare against None: a mean of 0.0 is a valid parameter
        if mean is not None and stdev is not None:
            prediction = self._unnormalize(prediction, mean, stdev)

        return prediction

    def _parse_labels(self, label, name=None, normalize=True):
        """
        Parse labels and normalize if desired

        Parameters
        ----------
        label : pandas.DataFrame | dict | ndarray
            Label to train on
        name : list, optional
            List of label names, by default None
        normalize : bool, optional
            Normalize label array, by default True

        Returns
        -------
        label : ndarray
            Parsed labels array, normalized if desired

        Raises
        ------
        ValueError
            If more than one label was supplied
        """
        label = super()._parse_labels(label, names=name,
                                      normalize=normalize)

        self._check_single_label()

        return label

    def train_model(self, features, label, norm_label=True, parse_kwargs=None,
                    fit_kwargs=None):
        """
        Train the model with the provided features and label

        Parameters
        ----------
        features : dict | pandas.DataFrame
            Input features to train on
        label : dict | pandas.DataFrame
            label to train on
        norm_label : bool
            Flag to normalize label
        parse_kwargs : dict
            kwargs for cls._parse_features
        fit_kwargs : dict
            kwargs for sklearn.ensemble.RandomForestRegressor.fit
        """
        if parse_kwargs is None:
            parse_kwargs = {}

        features = self._parse_features(features, **parse_kwargs)

        # _parse_labels handles normalization. _parse_data takes neither a
        # 'normalize' flag nor a boolean 'names'. The stored label names
        # are only used when the label is a bare array; a DataFrame or
        # dict supplies its own names.
        label = self._parse_labels(label, name=self.label_names,
                                   normalize=norm_label)

        if fit_kwargs is None:
            fit_kwargs = {}

        # pylint: disable=no-member
        self._model.fit(features, label.ravel(), **fit_kwargs)

    def save_model(self, path):
        """
        Save Random Forest Model parameters to a json file.

        Only the framework parameters (feature names, label names,
        normalization parameters) and the estimator hyper-parameters from
        `get_params` are written; the fitted trees are not saved.

        Parameters
        ----------
        path : str
            Path to save model to. Either the path of a .json file, or a
            directory in which a json file named after the directory is
            created.
        """
        if path.endswith('.json'):
            dir_path = os.path.dirname(path)
        else:
            dir_path = path
            path = os.path.join(dir_path, os.path.basename(path) + '.json')

        # dir_path is '' for a bare file name, os.makedirs('') would raise
        if dir_path and not os.path.exists(dir_path):
            os.makedirs(dir_path)

        model_params = {'feature_names': self.feature_names,
                        'label_names': self.label_names,
                        'norm_params': self.normalization_parameters,
                        'model_params': self.model.get_params()}

        model_params = self.dict_json_convert(model_params)
        with open(path, 'w') as f:
            json.dump(model_params, f, indent=2, sort_keys=True)

    @classmethod
    def train(cls, features, label, norm_label=True, save_path=None,
              compile_kwargs=None, parse_kwargs=None, fit_kwargs=None):
        """
        Build Random Forest Model with given kwargs and then train with
        given features, labels, and kwargs

        Parameters
        ----------
        features : pandas.DataFrame
            Model features
        label : pandas.DataFrame
            label to train on
        norm_label : bool
            Flag to normalize label
        save_path : str
            Path to save model parameters to (see `save_model`), if None
            the model is not saved, by default None
        compile_kwargs : dict
            kwargs for sklearn.ensemble.RandomForestRegressor
        parse_kwargs : dict
            kwargs for cls._parse_features
        fit_kwargs : dict
            kwargs for sklearn.ensemble.RandomForestRegressor.fit

        Returns
        -------
        model : RandomForestModel
            Initialized and trained RandomForestModel obj
        """
        if compile_kwargs is None:
            compile_kwargs = {}

        _, feature_names = cls._parse_data(features)
        _, label_names = cls._parse_data(label)

        model = cls(cls.compile_model(**compile_kwargs),
                    feature_names=feature_names, label_names=label_names)

        model.train_model(features, label, norm_label=norm_label,
                          parse_kwargs=parse_kwargs, fit_kwargs=fit_kwargs)

        if save_path is not None:
            model.save_model(save_path)

        return model

    @classmethod
    def load(cls, path):
        """
        Load model from the json file written by `save_model`.

        NOTE: the json file only holds the estimator hyper-parameters, so
        the returned estimator is configured but NOT fitted and has to be
        trained again before it can predict.

        Parameters
        ----------
        path : str
            Path of the .json file, or of the directory holding a json
            file named after the directory.

        Returns
        -------
        model : RandomForestModel
            RandomForestModel built from the saved parameters.

        Raises
        ------
        IOError
            If the json file does not exist
        """
        if not path.endswith('.json'):
            path = os.path.join(path, os.path.basename(path) + '.json')

        if not os.path.exists(path):
            e = ('{} does not exist'.format(path))
            logger.error(e)
            raise IOError(e)

        with open(path, 'r') as f:
            model_params = json.load(f)

        loaded = RandomForestRegressor()
        loaded = loaded.set_params(**model_params.pop('model_params'))

        # Remaining keys: feature_names, label_names, norm_params
        model = cls(loaded, **model_params)

        return model
+ logger.warning(msg) + warn(msg) + history = None + else: + history = pd.DataFrame(self._history.history) + history['epoch'] = self._history.epoch + + return history + + @staticmethod + def _clean_name(name): + """ + Make feature | label name compatible with TensorFlow + + Parameters + ---------- + name : str + Feature |label name from GOOML + + Returns + ------- + name : str + Feature | label name compatible with TensorFlow + """ + name = name.replace(' ', '_') + name = name.replace('*', '-x-') + name = name.replace('+', '-plus-') + name = name.replace('**', '-exp-') + name = name.replace(')', '') + name = name.replace('log(', 'log-') + + return name + + @staticmethod + def _generate_feature_columns(features): + """ + Generate feature layer from features table + + Parameters + ---------- + features : dict + model features + + Returns + ------- + feature_columns : list + List of tensorFlow.feature_column objects + """ + feature_columns = [] + for name, data in features.items(): + name = TfModel._clean_name(name) + if np.issubdtype(data.dtype.name, np.number): + f_col = feature_column.numeric_column(name) + else: + f_col = TfModel._generate_cat_column(name, data) + + feature_columns.append(f_col) + + return feature_columns + + @staticmethod + def _generate_cat_column(name, data, vocab_threshold=50, bucket_size=100): + """Generate a feature column from a categorical string data set + + Parameters + ---------- + name : str + Name of categorical columns + data : np.ndarray | list + String data array + vocab_threshold : int + Number of unique entries in the data array below which this + will use a vocabulary list, above which a hash bucket will be used. + bucket_size : int + Hash bucket size. + + Returns + ------- + f_col : IndicatorColumn + Categorical feature column. 
@staticmethod
def compile_model(n_features, n_labels=1, hidden_layers=None,
                  learning_rate=0.001, loss="mean_squared_error",
                  metrics=('mae', 'mse'), optimizer_class=Adam, **kwargs):
    """
    Build tensorflow sequential model from given layers and kwargs

    Parameters
    ----------
    n_features : int
        Number of features (inputs) to train the model on
    n_labels : int, optional
        Number of labels (outputs) to the model, by default 1
    hidden_layers : list, optional
        List of tensorflow layers.Dense kwargs (dictionaries), each may
        hold an extra 'dropout' entry that adds a Dropout layer with that
        rate after the Dense layer. The dictionaries are not modified.
        If None use a single linear layer, by default None
    learning_rate : float, optional
        tensorflow optimizer learning rate, by default 0.001
    loss : str, optional
        name of objective function, by default "mean_squared_error"
    metrics : list, optional
        List of metrics to be evaluated by the model during training and
        testing, by default ('mae', 'mse')
    optimizer_class : tf.keras.optimizers, optional
        Optimizer class, it is instantiated in this method with the given
        learning_rate. The default is the Adam optimizer
    kwargs : dict
        kwargs for tensorflow.keras.models.compile

    Returns
    -------
    tensorflow.keras.models.Sequential
        Compiled tensorflow Sequential model
    """
    model = tf.keras.models.Sequential()
    model.add(layers.InputLayer(input_shape=[n_features]))

    # No hidden layers -> only the output layer below (single linear layer)
    for layer in (hidden_layers or []):
        # Work on a copy: popping 'dropout' from the caller's dict would
        # silently drop the Dropout layers if the config is re-used
        layer_kwargs = dict(layer)
        dropout = layer_kwargs.pop('dropout', None)
        model.add(layers.Dense(**layer_kwargs))

        if dropout is not None:
            model.add(layers.Dropout(dropout))

    # Output layer
    model.add(layers.Dense(n_labels))

    if isinstance(metrics, tuple):
        metrics = list(metrics)
    elif not isinstance(metrics, list):
        metrics = [metrics]

    optimizer = optimizer_class(learning_rate=learning_rate)

    model.compile(optimizer=optimizer, loss=loss, metrics=metrics,
                  **kwargs)

    return model
train on + norm_labels : bool, optional + Flag to normalize label, by default True + epochs : int, optional + Number of epochs to train the model, by default 100 + validation_split : float, optional + Fraction of the training data to be used as validation data, + by default 0.2 + early_stop : bool + Flag to stop training when it stops improving + parse_kwargs : dict + kwargs for cls._parse_features + fit_kwargs : dict + kwargs for tensorflow.keras.models.fit + """ + if parse_kwargs is None: + parse_kwargs = {} + + features = self._parse_features(features, **parse_kwargs) + labels = self._parse_labels(labels, normalize=norm_labels) + + if self._history is not None: + msg = 'Model has already been trained and will be re-fit!' + logger.warning(msg) + warn(msg) + + if fit_kwargs is None: + fit_kwargs = {} + + if early_stop: + early_stop = tf.keras.callbacks.EarlyStopping(monitor='val_loss', + patience=10) + callbacks = fit_kwargs.pop('callbacks', None) + if callbacks is None: + callbacks = [early_stop] + else: + callbacks.append(early_stop) + + fit_kwargs['callbacks'] = callbacks + + if validation_split > 0: + split = int(len(features) * validation_split) + validate_features = features[-split:] + validate_labels = labels[-split:] + validation_data = (validate_features, validate_labels) + + features = features[:-split] + labels = labels[:-split] + else: + validation_data = None + + self._history = self._model.fit(x=features, y=labels, epochs=epochs, + validation_data=validation_data, + **fit_kwargs) + + def save_model(self, path): + """ + Save TfModel to path. + + Parameters + ---------- + path : str + Directory path to save model to. The tensorflow model will be + saved to the directory while the framework parameters will be + saved in json. 
+ """ + if path.endswith('.json'): + path = path.replace('.json', '/') + + if not path.endswith('/'): + path += '/' + + if not os.path.exists(path): + os.makedirs(path) + + tf.saved_model.save(self.model, path) + + model_params = {'feature_names': self.feature_names, + 'label_names': self.label_names, + 'norm_params': self.normalization_parameters} + + json_path = path.rstrip('/') + '.json' + model_params = self.dict_json_convert(model_params) + with open(json_path, 'w') as f: + json.dump(model_params, f, indent=2, sort_keys=True) + + @classmethod + def build(cls, feature_names, label_names, hidden_layers=None, + learning_rate=0.001, loss="mean_squared_error", + metrics=('mae', 'mse'), optimizer_class=Adam, **kwargs): + """ + Build tensorflow sequential model from given features, layers and + kwargs + + Parameters + ---------- + feature_names : list + Ordered list of feature names. + label_names : list + Ordered list of label (output) names. + hidden_layers : list, optional + List of tensorflow layers.Dense kwargs (dictionaries) + if None use a single linear layer, by default None + learning_rate : float, optional + tensorflow optimizer learning rate, by default 0.001 + loss : str, optional + name of objective function, by default "mean_squared_error" + metrics : list, optional + List of metrics to be evaluated by the model during training and + testing, by default ('mae', 'mse') + optimizer_class : tf.keras.optimizers, optional + Optional explicit request of optimizer. 
This should be a class + that will be instantated in the TfModel._compile_model() method + The default is the Adam optimizer + kwargs : dict + kwargs for tensorflow.keras.models.compile + + Returns + ------- + model : TfModel + Initialized TfKeraModel obj + """ + model = TfModel.compile_model(len(feature_names), + n_labels=len(label_names), + hidden_layers=hidden_layers, + learning_rate=learning_rate, loss=loss, + metrics=metrics, + optimizer_class=optimizer_class, + **kwargs) + + return cls(model, feature_names=feature_names, label_names=label_names) + + @classmethod + def train(cls, features, labels, hidden_layers=None, + learning_rate=0.001, loss="mean_squared_error", + metrics=('mae', 'mse'), optimizer_class=Adam, norm_labels=True, + epochs=100, validation_split=0.2, early_stop=True, + save_path=None, compile_kwargs=None, parse_kwargs=None, + fit_kwargs=None): + """ + Build tensorflow sequential model from given features, layers and + kwargs and then train with given label and kwargs + + Parameters + ---------- + features : dict | pandas.DataFrame + Model features + labels : dict | pandas.DataFrame + label to train on + hidden_layers : list, optional + List of tensorflow layers.Dense kwargs (dictionaries) + if None use a single linear layer, by default None + learning_rate : float, optional + tensorflow optimizer learning rate, by default 0.001 + loss : str, optional + name of objective function, by default "mean_squared_error" + metrics : list, optional + List of metrics to be evaluated by the model during training and + testing, by default ('mae', 'mse') + optimizer_class : tf.keras.optimizers, optional + Optional explicit request of optimizer. 
This should be a class + that will be instantated in the TfModel._compile_model() method + The default is the Adam optimizer + norm_label : bool + Flag to normalize label + epochs : int, optional + Number of epochs to train the model, by default 100 + validation_split : float, optional + Fraction of the training data to be used as validation data, + by default 0.2 + early_stop : bool + Flag to stop training when it stops improving + save_path : str + Directory path to save model to. The tensorflow model will be + saved to the directory while the framework parameters will be + saved in json. + compile_kwargs : dict + kwargs for tensorflow.keras.models.compile + parse_kwargs : dict + kwargs for cls._parse_features + fit_kwargs : dict + kwargs for tensorflow.keras.models.fit + + Returns + ------- + model : TfModel + Initialized and trained TfModel obj + """ + if compile_kwargs is None: + compile_kwargs = {} + + _, feature_names = cls._parse_data(features) + _, label_names = cls._parse_data(labels) + + model = cls.build(feature_names, label_names, + hidden_layers=hidden_layers, + learning_rate=learning_rate, loss=loss, + metrics=metrics, optimizer_class=optimizer_class, + **compile_kwargs) + + model.train_model(features, labels, norm_labels=norm_labels, + epochs=epochs, validation_split=validation_split, + early_stop=early_stop, parse_kwargs=parse_kwargs, + fit_kwargs=fit_kwargs) + + if save_path is not None: + model.save_model(save_path) + + return model + + @classmethod + def load(cls, path): + """ + Load model from model path. + + Parameters + ---------- + path : str + Directory path to TfModel to load model from. There should be a + tensorflow saved model directory with a parallel pickle file for + the TfModel framework. + + Returns + ------- + model : TfModel + Loaded TfModel from disk. 
+ """ + if path.endswith('.json'): + path = path.replace('.json', '/') + + if not path.endswith('/'): + path += '/' + + if not os.path.isdir(path): + e = ('Can only load directory path but target is not ' + 'directory: {}'.format(path)) + logger.error(e) + raise IOError(e) + + loaded = tf.keras.models.load_model(path) + + json_path = path.rstrip('/') + '.json' + with open(json_path, 'r') as f: + model_params = json.load(f) + + model = cls(loaded, **model_params) + + return model diff --git a/phygnn/phygnn.py b/phygnn/phygnn.py index 387e3c5..e9ff2b5 100644 --- a/phygnn/phygnn.py +++ b/phygnn/phygnn.py @@ -13,7 +13,7 @@ from tensorflow.keras import optimizers, initializers from tensorflow.keras.layers import InputLayer, Dense, Dropout -from phygnn.loss_metrics import METRICS +from phygnn.utilities.loss_metrics import METRICS logger = logging.getLogger(__name__) diff --git a/phygnn/utilities/__init__.py b/phygnn/utilities/__init__.py new file mode 100644 index 0000000..f5aab5c --- /dev/null +++ b/phygnn/utilities/__init__.py @@ -0,0 +1,4 @@ +# -*- coding: utf-8 -*- +"""Utilities""" +from .pre_processing import PreProcess +from .tf_utilities import tf_isin, tf_log10 diff --git a/phygnn/loss_metrics.py b/phygnn/utilities/loss_metrics.py similarity index 100% rename from phygnn/loss_metrics.py rename to phygnn/utilities/loss_metrics.py diff --git a/phygnn/pre_processing.py b/phygnn/utilities/pre_processing.py similarity index 77% rename from phygnn/pre_processing.py rename to phygnn/utilities/pre_processing.py index a024ec0..ab83a3e 100644 --- a/phygnn/pre_processing.py +++ b/phygnn/utilities/pre_processing.py @@ -1,3 +1,4 @@ +# -*- coding: utf-8 -*- """ Data pre-processing module. 
""" @@ -195,14 +196,15 @@ def _make_df_one_hot_cols_labels(self, one_hot_ind, one_hot_data, return col_labels - def process_one_hot(self, convert_int=False, categories=None): + def process_one_hot(self, convert_int=False, categories=None, + return_ind=False): """Process str and int columns in the feature data to one-hot vectors. Parameters ---------- - convert_int : bool - Flag to convert integer data to one-hot vectors. - categories : dict | None + convert_int : bool, optional + Flag to convert integer data to one-hot vectors, by default False + categories : dict | None, optional Categories to use for one hot encoding where a key is the original column name in the feature dataframe and value is a list of the possible unique values of the feature column. The value list must @@ -212,6 +214,9 @@ def process_one_hot(self, convert_int=False, categories=None): results in category names being determined automatically. Format: {'col_name1' : ['cat1', 'cat2', 'cat3'], 'col_name2' : ['other_cat1', 'other_cat2']} + by default None + return_ind : bool, optional + Return one hot column indices, by default False Returns ------- @@ -220,6 +225,9 @@ def process_one_hot(self, convert_int=False, categories=None): vectors appended as new columns. If features is a dataframe and categories is input, the new one-hot columns will be named according to categories. + one_hot_ind : list, optional + List of numeric column indices in the native data that are + to-be-transformed into one-hot vectors. 
""" if categories is None: @@ -229,7 +237,7 @@ def process_one_hot(self, convert_int=False, categories=None): convert_int=convert_int, categories=categories) if not one_hot_ind: - return self._features + processed = self._features else: if self._pd: @@ -250,4 +258,54 @@ def process_one_hot(self, convert_int=False, categories=None): assert processed.shape[0] == self._features.shape[0] processed = processed.astype(np.float32) + + if return_ind: + return processed, one_hot_ind + else: return processed + + @classmethod + def one_hot(cls, features, convert_int=False, categories=None, + return_ind=False): + """ + Process str and int columns in the feature data to one-hot vectors. + + Parameters + ---------- + features : np.ndarray | pd.DataFrame + Feature data in a 2D array or DataFrame. + convert_int : bool, optional + Flag to convert integer data to one-hot vectors, by default False + categories : dict | None, optional + Categories to use for one hot encoding where a key is the original + column name in the feature dataframe and value is a list of the + possible unique values of the feature column. The value list must + have as many or more entries as unique values in the feature + column. This will name the feature column headers for the new + one-hot-encoding if features is a dataframe. Empty dict or None + results in category names being determined automatically. Format: + {'col_name1' : ['cat1', 'cat2', 'cat3'], + 'col_name2' : ['other_cat1', 'other_cat2']} + by default None + return_ind : bool, optional + Return one hot column indices, by default False + + Returns + ------- + processed : np.ndarray | pd.DataFrame + Feature data with str and int columns removed and one-hot boolean + vectors appended as new columns. If features is a dataframe and + categories is input, the new one-hot columns will be named + according to categories. + one_hot_ind : list, optional + List of numeric column indices in the native data that are + to-be-transformed into one-hot vectors. 
+ """ + logger.debug('Checking for one-hot items and converting them ' + 'to binary values') + pp = cls(features) + out = pp.process_one_hot(convert_int=convert_int, + categories=categories, + return_ind=return_ind) + + return out diff --git a/phygnn/tf_utilities.py b/phygnn/utilities/tf_utilities.py similarity index 95% rename from phygnn/tf_utilities.py rename to phygnn/utilities/tf_utilities.py index 594056c..8b5abde 100644 --- a/phygnn/tf_utilities.py +++ b/phygnn/utilities/tf_utilities.py @@ -1,3 +1,4 @@ +# -*- coding: utf-8 -*- """ Tensorflow utilities """ From 2330f1d19bca459c2dbf18b684b6acf27e52c055 Mon Sep 17 00:00:00 2001 From: Michael Rossol Date: Mon, 24 Aug 2020 13:13:03 -0600 Subject: [PATCH 3/6] add tf_model tests --- phygnn/model_interfaces/base_model.py | 4 +- phygnn/model_interfaces/tf_model.py | 81 +++++++++++++++++++++++++-- requirements.txt | 2 +- tests/test_pre_processing.py | 2 +- tests/test_tf_model.py | 56 ++++++++++++++++++ tests/test_tf_utilities.py | 2 +- 6 files changed, 137 insertions(+), 10 deletions(-) create mode 100644 tests/test_tf_model.py diff --git a/phygnn/model_interfaces/base_model.py b/phygnn/model_interfaces/base_model.py index f112ca0..2e6d4db 100644 --- a/phygnn/model_interfaces/base_model.py +++ b/phygnn/model_interfaces/base_model.py @@ -760,7 +760,9 @@ def _parse_features(self, features, names=None, process_one_hot=True, if process_one_hot: kwargs.update({'return_ind': True}) features, one_hot_ind = PreProcess.one_hot(features, **kwargs) - self._check_one_hot_norm_params(self.feature_names[one_hot_ind]) + if one_hot_ind: + one_hot_features = [self.feature_names[i] for i in one_hot_ind] + self._check_one_hot_norm_params(one_hot_features) features = self.normalize(features, names=feature_names) diff --git a/phygnn/model_interfaces/tf_model.py b/phygnn/model_interfaces/tf_model.py index b8ed0e5..157f17e 100644 --- a/phygnn/model_interfaces/tf_model.py +++ b/phygnn/model_interfaces/tf_model.py @@ -9,7 +9,7 @@ import 
pandas as pd import tensorflow as tf from tensorflow import feature_column -from tensorflow.keras import layers +from tensorflow.keras.layers import InputLayer, Dense, Dropout from tensorflow.keras.optimizers import Adam from warnings import warn @@ -42,6 +42,72 @@ def __init__(self, model, feature_names=None, label_names=None, self._history = None + @property + def layers(self): + """ + Model layers + + Returns + ------- + list + """ + return self.model.layers + + @property + def weights(self): + """ + Get a list of layer weights for gradient calculations. + + Returns + ------- + list + """ + weights = [] + for layer in self.layers: + weights += layer.get_weights() + + return weights + + @property + def kernel_weights(self): + """ + Get a list of the NN kernel weights (tensors) + + (can be used for kernel regularization). + + Does not include input layer or dropout layers. + Does include the output layer. + + Returns + ------- + list + """ + weights = [] + for layer in self.layers: + weights.append(layer.get_weights()[0]) + + return weights + + @property + def bias_weights(self): + """ + Get a list of the NN bias weights (tensors) + + (can be used for bias regularization). + + Does not include input layer or dropout layers. + Does include the output layer. 
+ + Returns + ------- + list + """ + weights = [] + for layer in self.layers: + weights.append(layer.get_weights()[1]) + + return weights + @property def history(self): """ @@ -230,19 +296,19 @@ def compile_model(n_features, n_labels=1, hidden_layers=None, Compiled tensorflow Sequential model """ model = tf.keras.models.Sequential() - model.add(layers.InputLayer(input_shape=[n_features])) + model.add(InputLayer(input_shape=[n_features])) if hidden_layers is None: # Add a single linear layer - model.add(layers.Dense(n_labels)) + model.add(Dense(n_labels)) else: for layer in hidden_layers: dropout = layer.pop('dropout', None) - model.add(layers.Dense(**layer)) + model.add(Dense(**layer)) if dropout is not None: - model.add(layers.Dropout(dropout)) + model.add(Dropout(dropout)) - model.add(layers.Dense(n_labels)) + model.add(Dense(n_labels)) if isinstance(metrics, tuple): metrics = list(metrics) @@ -389,6 +455,9 @@ def build(cls, feature_names, label_names, hidden_layers=None, model : TfModel Initialized TfKeraModel obj """ + if isinstance(label_names, str): + label_names = [label_names] + model = TfModel.compile_model(len(feature_names), n_labels=len(label_names), hidden_layers=hidden_layers, diff --git a/requirements.txt b/requirements.txt index 9389413..c1618ef 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,6 @@ +matplotlib>=3.1 numpy>=1.16 pandas>=0.25 -matplotlib>=3.1 pytest>=5.2 scikit-learn>=0.22 tensorflow \ No newline at end of file diff --git a/tests/test_pre_processing.py b/tests/test_pre_processing.py index 2667554..a5cc6cc 100644 --- a/tests/test_pre_processing.py +++ b/tests/test_pre_processing.py @@ -3,7 +3,7 @@ """ import numpy as np import pandas as pd -from phygnn.pre_processing import PreProcess +from phygnn.utilities.pre_processing import PreProcess index = pd.date_range('20180101', '20190101', freq='5min') diff --git a/tests/test_tf_model.py b/tests/test_tf_model.py new file mode 100644 index 0000000..48436c8 --- /dev/null +++ 
b/tests/test_tf_model.py @@ -0,0 +1,56 @@ +""" +Tests for basic phygnn functionality and execution. +""" +# pylint: disable=W0613 +import numpy as np +import pandas as pd +import pytest + +from phygnn.model_interfaces.tf_model import TfModel + + +N = 100 +A = np.linspace(-1, 1, N) +B = np.linspace(-1, 1, N) +A, B = np.meshgrid(A, B) +A = np.expand_dims(A.flatten(), axis=1) +B = np.expand_dims(B.flatten(), axis=1) + +Y = np.sqrt(A ** 2 + B ** 2) +X = np.hstack((A, B)) +features = pd.DataFrame(X, columns=['a', 'b']) + +Y_NOISE = Y * (1 + (np.random.random(Y.shape) - 0.5) * 0.5) + 0.1 +labels = pd.DataFrame(Y_NOISE, columns=['c']) + + +@pytest.mark.parametrize( + 'hidden_layers', + [None, + [{'units': 64, 'activation': 'relu', 'name': 'relu1'}, + {'units': 64, 'activation': 'relu', 'name': 'relu2'}]]) +def test_nn(hidden_layers): + """Test the TfModel """ + model = TfModel.train(features, labels, hidden_layers=hidden_layers, + epochs=10, fit_kwargs={"batch_size": 4}, + early_stop=False) + + test_mae = np.mean(np.abs(model[X].values.ravel() - Y)) + + n_layers = len(hidden_layers) + 1 if hidden_layers is not None else 1 + loss = 0.4 if hidden_layers is not None else 4 + assert len(model.layers) == n_layers + assert len(model.weights) == n_layers * 2 + assert len(model.history) == 10 + assert model.history['val_loss'].values[-1] < loss + assert test_mae < loss + + +def test_dropouts(): + """Test the dropout rate kwargs for adding dropout layers.""" + hidden_layers = [ + {'units': 64, 'activation': 'relu', 'name': 'relu1', 'dropout': 0.1}, + {'units': 64, 'activation': 'relu', 'name': 'relu2', 'dropout': 0.1}] + model = TfModel.build(['a', 'b'], 'c', hidden_layers=hidden_layers) + + assert len(model.layers) == 5 diff --git a/tests/test_tf_utilities.py b/tests/test_tf_utilities.py index 4d5277f..83b2314 100644 --- a/tests/test_tf_utilities.py +++ b/tests/test_tf_utilities.py @@ -3,7 +3,7 @@ """ import numpy as np import tensorflow as tf -from phygnn.tf_utilities import 
tf_isin, tf_log10 +from phygnn.utilities.tf_utilities import tf_isin, tf_log10 def test_tf_isin(): From 7a73b998386ed6a699f3cb33d55d3b3b01871b03 Mon Sep 17 00:00:00 2001 From: Michael Rossol Date: Mon, 24 Aug 2020 13:22:04 -0600 Subject: [PATCH 4/6] fix requirements.txt path --- README.rst | 37 +++++++++++++++++++++---------------- setup.py | 7 ++----- 2 files changed, 23 insertions(+), 21 deletions(-) diff --git a/README.rst b/README.rst index 49c391f..35d6b39 100644 --- a/README.rst +++ b/README.rst @@ -4,23 +4,27 @@ phygnn phygnn stands for **physics-guided neural networks**. -This implementation of physics-guided neural networks augments a traditional -neural network loss function with a generic loss term that can be used to -guide the neural network to learn physical or theoretical constraints. -phygnn enables scientific software developers and data scientists to easily -integrate machine learning models into physics and engineering applications. -This framework should help alleviate some challenges that are often encountered -when applying purely data-driven machine learning models to scientific applications, -such as when machine learning models produce physically inconsistent results or have trouble -generalizing to out-of-sample scenarios. +This implementation of physics-guided neural networks augments a traditional +neural network loss function with a generic loss term that can be used to +guide the neural network to learn physical or theoretical constraints. +phygnn enables scientific software developers and data scientists to easily +integrate machine learning models into physics and engineering applications. +This framework should help alleviate some challenges that are often encountered +when applying purely data-driven machine learning models to scientific +applications, such as when machine learning models produce physically +inconsistent results or have trouble generalizing to out-of-sample scenarios. 
For details on the phygnn class framework see `the phygnn module documentation here. `_ -At the National Renewable Energy Lab (NREL), we are using the phygnn framework to supplement traditional satellite-based cloud property prediction models. -We use phygnn to predict cloud optical properties when the traditional mechanistic models fail and use a full tensor-based radiative transfer model as -the physical loss function to transform the predicted cloud properties into phygnn-predicted irradiance data. We then calculate a loss value comparing -the phygnn-predicted irradiance to high quality ground measurements. We have seen excellent improvements in the predicted irradiance data in rigorous -out-of-sample-validation experiments. +At the National Renewable Energy Lab (NREL), we are using the phygnn framework +to supplement traditional satellite-based cloud property prediction models. We +use phygnn to predict cloud optical properties when the traditional mechanistic +models fail and use a full tensor-based radiative transfer model as the +physical loss function to transform the predicted cloud properties into +phygnn-predicted irradiance data. We then calculate a loss value comparing the +phygnn-predicted irradiance to high quality ground measurements. We have seen +excellent improvements in the predicted irradiance data in rigorous +out-of-sample-validation experiments. Engineers and researchers can use the phygnn framework to: @@ -28,7 +32,8 @@ Engineers and researchers can use the phygnn framework to: * Use the physics loss function to extend training data, e.g. 
train against "known" outputs but also train using the downstream application of the predicted variables * Use the physics loss function to adjust theoretical models based on empirical observation using respective loss weights -Here are additional examples of similar architectures from the literature which helped inspire this work: +Here are additional examples of similar architectures from the literature which +helped inspire this work: * Jared Willard, Xiaowei Jia, Shaoming Xu, Michael Steinbach, and Vipin Kumar, “Integrating Physics-Based Modeling with Machine Learning: A Survey.” ArXiv abs/2003.04919 (2020). * Forssell, U. and P. Lindskog. “Combining Semi-Physical and Neural Network Modeling: An Example ofIts Usefulness.” IFAC Proceedings Volumes 30 (1997): 767-770. @@ -36,7 +41,7 @@ Here are additional examples of similar architectures from the literature which * Anuj Karpatne, William Watkins, Jordan Read, and Vipin Kumar, "Physics-guided Neural Networks (PGNN): An Application in Lake Temperature Modeling". arXiv:1710.11431v2 (2018). * Anuj Karpatne, Gowtham Atluri, James H Faghmous, Michael Steinbach, Arindam Banerjee, Auroop Ganguly, Shashi Shekhar, Nagiza Samatova, and Vipin Kumar. 2017. Theory-guided data science: A new paradigm for scientific discovery from data. IEEE Transactions on knowledge and data engineering 29, 10 (2017), 2318–2331. 
- + Installation ============ diff --git a/setup.py b/setup.py index 2b3252c..7e4e52f 100644 --- a/setup.py +++ b/setup.py @@ -15,20 +15,17 @@ if py_version.major < 3: raise RuntimeError("phygnn is only compatible with python 3!") - here = os.path.abspath(os.path.dirname(__file__)) - with open(os.path.join(here, "phygnn", "version.py"), encoding="utf-8") as f: version = f.read() -version = version.split('=')[-1].strip().strip('"').strip("'") +version = version.split('=')[-1].strip().strip('"').strip("'") with open(os.path.join(here, "README.rst"), encoding="utf-8") as f: readme = f.read() - -with open("requirements.txt") as f: +with open(os.path.join(here, "requirements.txt")) as f: install_requires = f.readlines() From 7ec111934e8d1021c4be6adf5cc7cfced76d9458 Mon Sep 17 00:00:00 2001 From: Michael Rossol Date: Mon, 24 Aug 2020 14:06:00 -0600 Subject: [PATCH 5/6] fix order of operations in parse_feature and parse_label --- phygnn/model_interfaces/base_model.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/phygnn/model_interfaces/base_model.py b/phygnn/model_interfaces/base_model.py index 2e6d4db..6990e87 100644 --- a/phygnn/model_interfaces/base_model.py +++ b/phygnn/model_interfaces/base_model.py @@ -734,6 +734,8 @@ def _parse_features(self, features, names=None, process_one_hot=True, Parsed features array normalized and with str columns converted to one hot vectors if desired """ + features, feature_names = self._parse_data(features, names=names) + if len(features.shape) != 2: msg = ('{} can only use 2D data as input!' 
.format(self.__class__.__name__)) @@ -747,8 +749,6 @@ def _parse_features(self, features, names=None, process_one_hot=True, logger.error(msg) raise RuntimeError(msg) - features, feature_names = self._parse_data(features, names=names) - if self._feature_names is None: self._feature_names = feature_names elif self.feature_names != feature_names: @@ -786,6 +786,8 @@ def _parse_labels(self, labels, names=None, normalize=True): labels : ndarray Parsed labels array, normalized if desired """ + labels, label_names = self._parse_data(labels, names=names) + if self.label_names is not None: if len(labels.shape) == 1: n_labels = len(labels) @@ -798,8 +800,6 @@ def _parse_labels(self, labels, names=None, normalize=True): logger.error(msg) raise RuntimeError(msg) - labels, label_names = self._parse_data(labels, names=names) - if self._label_names is None: self._label_names = label_names elif self.label_names != label_names: From e1e2cf270d2bfcadebce8c0e6a4ba675df3893cd Mon Sep 17 00:00:00 2001 From: Michael Rossol Date: Tue, 25 Aug 2020 12:37:08 -0600 Subject: [PATCH 6/6] add RandomForestModel tests patch bugs update version --- phygnn/model_interfaces/base_model.py | 37 +++++++++++++++---- .../model_interfaces/random_forest_model.py | 9 ++--- phygnn/version.py | 2 +- tests/test_random_forest_model.py | 31 ++++++++++++++++ 4 files changed, 65 insertions(+), 14 deletions(-) create mode 100644 tests/test_random_forest_model.py diff --git a/phygnn/model_interfaces/base_model.py b/phygnn/model_interfaces/base_model.py index 6990e87..2317c0f 100644 --- a/phygnn/model_interfaces/base_model.py +++ b/phygnn/model_interfaces/base_model.py @@ -2,6 +2,7 @@ """ Base Model Interface """ +from abc import ABC import logging import numpy as np import pandas as pd @@ -12,7 +13,7 @@ logger = logging.getLogger(__name__) -class ModelBase: +class ModelBase(ABC): """ Base Model Interface """ @@ -381,6 +382,28 @@ def _parse_data(data, names=None): return data, names + @staticmethod + def 
_get_item_number(arr): + """ + Get number of items in array (labels or features) + + Parameters + ---------- + arr : ndarray + 1 or 2D array + + Returns + ------- + n : int + Number of items + """ + if len(arr.shape) == 1: + n = 1 + else: + n = arr.shape[1] + + return n + def get_norm_params(self, names): """ Get means and stdevs for given feature/label names @@ -527,7 +550,8 @@ def _normalize_arr(self, arr, names): norm_arr : ndarray Normalized features/label """ - if len(names) != arr.shape[1]: + n_names = self._get_item_number(arr) + if len(names) != n_names: msg = ("Number of item names ({}) does not match number of items " "({})".format(len(names), arr.shape[1])) logger.error(msg) @@ -647,7 +671,8 @@ def _unnormalize_arr(self, arr, names): native_arr : ndarray Native features/label array """ - if len(names) != arr.shape[1]: + n_names = self._get_item_number(arr) + if len(names) != n_names: msg = ("Number of item names ({}) does not match number of items " "({})".format(len(names), arr.shape[1])) logger.error(msg) @@ -789,11 +814,7 @@ def _parse_labels(self, labels, names=None, normalize=True): labels, label_names = self._parse_data(labels, names=names) if self.label_names is not None: - if len(labels.shape) == 1: - n_labels = len(labels) - else: - n_labels = labels.shape[1] - + n_labels = self._get_item_number(labels) if n_labels != len(self.label_names): msg = ('data has {} labels but expected {}' .format(labels.shape[1], self.label_dims)) diff --git a/phygnn/model_interfaces/random_forest_model.py b/phygnn/model_interfaces/random_forest_model.py index 5a948e8..d9ff7a2 100644 --- a/phygnn/model_interfaces/random_forest_model.py +++ b/phygnn/model_interfaces/random_forest_model.py @@ -33,7 +33,7 @@ def __init__(self, model, feature_names=None, label_name=None, parameters (mean, stdev), by default None """ super().__init__(model, feature_names=feature_names, - label_name=label_name, norm_params=norm_params) + label_names=label_name, norm_params=norm_params) if 
len(self.label_names) > 1: msg = ("Only a single label can be supplied to {}, but {} were" @@ -133,8 +133,7 @@ def train_model(self, features, label, norm_label=True, parse_kwargs=None, features = self._parse_features(features, **parse_kwargs) - label = self._parse_data(label, normalize=norm_label, - names=True) + label = self._parse_labels(label, normalize=norm_label) if fit_kwargs is None: fit_kwargs = {} @@ -204,10 +203,10 @@ def train(cls, features, label, norm_label=True, save_path=None, compile_kwargs = {} _, feature_names = cls._parse_data(features) - _, label_names = cls._parse_data(label) + _, label_name = cls._parse_data(label) model = cls(cls.compile_model(**compile_kwargs), - feature_names=feature_names, label_names=label_names) + feature_names=feature_names, label_name=label_name) model.train_model(features, label, norm_label=norm_label, parse_kwargs=parse_kwargs, fit_kwargs=fit_kwargs) diff --git a/phygnn/version.py b/phygnn/version.py index db781ae..eebca35 100644 --- a/phygnn/version.py +++ b/phygnn/version.py @@ -1,4 +1,4 @@ # -*- coding: utf-8 -*- """Physics Guided Neural Network version.""" -__version__ = '0.0.0' +__version__ = '0.0.1' diff --git a/tests/test_random_forest_model.py b/tests/test_random_forest_model.py new file mode 100644 index 0000000..85256fb --- /dev/null +++ b/tests/test_random_forest_model.py @@ -0,0 +1,31 @@ +""" +Tests for basic phygnn functionality and execution. 
+""" +# pylint: disable=W0613 +import numpy as np +import pandas as pd + +from phygnn.model_interfaces.random_forest_model import RandomForestModel + + +N = 100 +A = np.linspace(-1, 1, N) +B = np.linspace(-1, 1, N) +A, B = np.meshgrid(A, B) +A = np.expand_dims(A.flatten(), axis=1) +B = np.expand_dims(B.flatten(), axis=1) + +Y = np.sqrt(A ** 2 + B ** 2) +X = np.hstack((A, B)) +features = pd.DataFrame(X, columns=['a', 'b']) + +Y_NOISE = Y * (1 + (np.random.random(Y.shape) - 0.5) * 0.5) + 0.1 +labels = pd.DataFrame(Y_NOISE, columns=['c']) + + +def test_random_forest(): + """Test the RandomForestModel """ + model = RandomForestModel.train(features, labels) + + test_mae = np.mean(np.abs(model[X].values.ravel() - Y)) + assert test_mae < 0.4