From feb81af34383a3cdf7afd4e37a9808862655fb09 Mon Sep 17 00:00:00 2001 From: Magdalena Lederbauer <98785759+mlederbauer@users.noreply.github.com> Date: Tue, 28 May 2024 17:38:44 +0200 Subject: [PATCH] feat: make training scripts callable (#80) Co-authored-by: Magdalena Lederbauer --- .gitignore | 1 + scripts/training/baselines.py | 214 +++++++++++++++++++-------- scripts/training/multiple_tragets.py | 36 +++-- scripts/training/one_target.py | 73 ++++----- 4 files changed, 197 insertions(+), 127 deletions(-) diff --git a/.gitignore b/.gitignore index ced95ee..3a7a433 100644 --- a/.gitignore +++ b/.gitignore @@ -3,6 +3,7 @@ scratch/ dataset/ plots/ data/ +metrics/ docs/source diff --git a/scripts/training/baselines.py b/scripts/training/baselines.py index a1dab66..6019c26 100644 --- a/scripts/training/baselines.py +++ b/scripts/training/baselines.py @@ -1,80 +1,164 @@ import argparse import logging as log +import os -import numpy as np +import mlflow import pandas as pd +from sklearn.dummy import DummyClassifier +from nmrcraft.analysis import plotting from nmrcraft.data.dataloader import DataLoader -from nmrcraft.evaluation.evaluation import evaluate_model +from nmrcraft.evaluation import evaluation +from nmrcraft.utils.general import add_rows_metrics +# Setup MLflow +mlflow.set_experiment("Final_results") + +# Setup parser +parser = argparse.ArgumentParser( + description="Train a model with MLflow tracking." +) + +parser.add_argument( + "--max_evals", + type=int, + default=3, + help="The max evaluations for the hyperparameter tuning with hyperopt", +) +parser.add_argument( + "--target", + type=str, + default=["metal", "X3_ligand", "E_ligand"], + help="The Target for the predictions. 
Choose from: 'metal', 'X1_ligand', 'X2_ligand', 'X3_ligand', 'X4_ligand', 'L_ligand', 'E_ligand'", +) +parser.add_argument( + "--include_structural", + type=lambda value: value.lower() in ("1", "true", "yes"), + default=False, + help="Handles if structural features will be included or only nmr tensors are used.", +) +parser.add_argument( + "--plot_folder", + type=str, + default="plots/", + help="The Folder where the plots are saved", +) -def main(): - parser = argparse.ArgumentParser( - description="Simplified model training script." - ) - parser.add_argument( - "--targets", - type=str, - default=["metal", "E_ligand"], - help="The Target for the predictions.", - ) - parser.add_argument( - "--dataset_size", - type=float, - default=1.0, - help="Size of the dataset to load.", - ) - parser.add_argument( - "--random_baseline", - type=bool, - default=False, - help="Use a random baseline model.", - ) - args = parser.parse_args() - # Set up logging +def main(args) -> pd.DataFrame: + + # Check if folder path exists, if not create it + if not os.path.exists(args.plot_folder): + os.makedirs(args.plot_folder) + + # Setup logging log.basicConfig( - level=log.INFO, format="%(asctime)s - %(levelname)s - %(message)s" + format="%(asctime)s %(message)s", + level=log.INFO, + force=True, + datefmt="%Y-%m-%d %H:%M:%S", ) + log.getLogger().setLevel(log.INFO) - # Load data - dataloader = DataLoader( - target_columns=args.targets, - dataset_size=args.dataset_size, - feature_columns=[ - "M_sigma11_ppm", - "M_sigma22_ppm", - "M_sigma33_ppm", - "E_sigma11_ppm", - "E_sigma22_ppm", - "E_sigma33_ppm", - ], - complex_geometry="oct", - test_size=0.3, - random_state=42, - include_structural_features=False, - testing=False, - ) - X_train, X_test, y_train, y_test, y_labels = dataloader.load_data() - - predictions = np.zeros_like(y_test) - - for i in range(len(args.targets)): # Loop through each target column - if args.random_baseline: - unique_vals = np.unique(y_train[:, i]) - predictions[:, i] = 
np.random.choice(unique_vals, size=len(y_test)) - 
else: - most_common = pd.Series(y_train[:, i]).mode()[0] - predictions[:, i] = np.full( - shape=y_test[:, i].shape, fill_value=most_common - ) - - # Evaluate the model - metrics, confusion_matrices = evaluate_model( - y_test, predictions, args.targets - ) - log.info("Evaluation Metrics: %s", metrics) + dataset_sizes = [ + 0.01, + 0.1, + 0.5, + 1.0, + ] + models = [ + "baseline_random_ligand", + "baseline_most_often", + ] + + # Initialize df to store all the info for later plotting + unified_metrics_columns = [ + "target", + "model_targets", + "model", + "nmr_only", + "dataset_fraction", + "max_evals", + "accuracy_mean", + "accuracy_lb", + "accuracy_hb", + "f1_mean", + "f1_lb", + "f1_hb", + ] + unified_metrics = pd.DataFrame(columns=unified_metrics_columns) + + with mlflow.start_run(): + + for model_name in models: + + for dataset_size in dataset_sizes: + data_loader = DataLoader( + target_columns=args.target, + dataset_size=dataset_size, + ) + ( + X_train, + X_test, + y_train, + y_test, + y_labels, + ) = data_loader.load_data() + + if model_name == "baseline_random_ligand": + multioutput_model = DummyClassifier(strategy="uniform") + elif model_name == "baseline_most_often": + multioutput_model = DummyClassifier( + strategy="most_frequent" + ) + + multioutput_model.fit(X_train, y_train) + y_pred = multioutput_model.predict(X_test) + + metrics, cm_list = evaluation.evaluate_model( + y_test, y_pred, args.target + ) + + plotting.plot_confusion_matrix( + cm_list, + y_labels, + model_name, + dataset_size, + args.plot_folder, + ) + + bootstrap_metrics = evaluation.evaluate_bootstrap( + X_test, y_test, multioutput_model, args.target + ) + + statistical_metrics = evaluation.metrics_statistics( + bootstrap_metrics + ) + + unified_metrics = add_rows_metrics( + unified_metrics, + statistical_metrics, + dataset_size, + args.include_structural, + model_name, + args.max_evals, + ) + return unified_metrics if __name__ == "__main__": - main() + + # Add arguments + args = 
parser.parse_args() + + unified_metrics = main(args) + + # save all the results + if not os.path.isdir("metrics"): + os.mkdir("metrics") + + results_path = "metrics/results_baselines.csv" + if os.path.exists(results_path): + existing_data = pd.read_csv(results_path) + unified_metrics = pd.concat([existing_data, unified_metrics]) + unified_metrics.to_csv(results_path, index=False) diff --git a/scripts/training/multiple_tragets.py b/scripts/training/multiple_tragets.py index 5b97878..01e11b8 100644 --- a/scripts/training/multiple_tragets.py +++ b/scripts/training/multiple_tragets.py @@ -15,7 +15,7 @@ from nmrcraft.utils.general import add_rows_metrics # Setup MLflow -mlflow.set_experiment("Test_final_results") +mlflow.set_experiment("Final_results") # Setup parser parser = argparse.ArgumentParser( @@ -31,7 +31,7 @@ parser.add_argument( "--target", type=str, - default=["metal", "X3_ligand", "X4_ligand"], + default=["metal", "E_ligand", "X4_ligand"], help="The Target for the predictions. Choose from: 'metal', 'X1_ligand', 'X2_ligand', 'X3_ligand', 'X4_ligand', 'L_ligand', 'E_ligand'", ) parser.add_argument( @@ -48,9 +48,7 @@ ) -if __name__ == "__main__": - # Add arguments - args = parser.parse_args() +def main(args) -> pd.DataFrame: # Check if folder path exists, if not create it if not os.path.exists(args.plot_folder): @@ -66,11 +64,10 @@ log.getLogger().setLevel(log.INFO) dataset_sizes = [ - # 0.01, + 0.01, 0.1, - 0.15, - # 0.5, - # 1.0, + 0.5, + 1.0, ] models = [ "random_forest", @@ -95,10 +92,8 @@ unified_metrics = pd.DataFrame(columns=unified_metrics_columns) with mlflow.start_run(): - model_metrics = [] for model_name in models: - data = pd.DataFrame() config = model_configs[model_name] tuner = HyperparameterTuner( model_name, config, max_evals=args.max_evals @@ -156,9 +151,22 @@ model_name, args.max_evals, ) - # Add all the newly generated metrics to the unified dataframe + return unified_metrics + + +if __name__ == "__main__": + + # Add arguments + args = 
parser.parse_args() + + unified_metrics = main(args) + # save all the results if not os.path.isdir("metrics"): os.mkdir("metrics") - unified_metrics.to_csv(f"metrics/metrics_{args.target}.csv") - # mlflow.log_input(unified_metrics, context="unified metrics") + + results_path = "metrics/results_multi_target.csv" + if os.path.exists(results_path): + existing_data = pd.read_csv(results_path) + unified_metrics = pd.concat([existing_data, unified_metrics]) + unified_metrics.to_csv(results_path, index=False) diff --git a/scripts/training/one_target.py b/scripts/training/one_target.py index d70d2ad..36e3860 100644 --- a/scripts/training/one_target.py +++ b/scripts/training/one_target.py @@ -15,7 +15,7 @@ from nmrcraft.utils.general import add_rows_metrics # Setup MLflow -mlflow.set_experiment("Test_final_results") +mlflow.set_experiment("Final_results") # Setup parser parser = argparse.ArgumentParser( @@ -25,7 +25,7 @@ parser.add_argument( "--max_evals", type=int, - default=1, + default=2, help="The max evaluations for the hyperparameter tuning with hyperopt", ) parser.add_argument( @@ -48,9 +48,7 @@ ) -if __name__ == "__main__": - # Add arguments - args = parser.parse_args() +def main(args) -> pd.DataFrame: # Check if folder path exists, if not create it if not os.path.exists(args.plot_folder): @@ -66,17 +64,16 @@ log.getLogger().setLevel(log.INFO) dataset_sizes = [ - # 0.01, + 0.01, 0.1, - 0.15, - # 0.5, - # 1.0, + 0.5, + 1.0, ] models = [ "random_forest", - # "logistic_regression", - # "gradient_boosting", - # "svc", + "logistic_regression", + "gradient_boosting", + "svc", "extra_trees", ] @@ -99,7 +96,6 @@ with mlflow.start_run(): for model_name in models: - data = pd.DataFrame() config = model_configs[model_name] tuner = HyperparameterTuner( model_name, config, max_evals=args.max_evals @@ -157,41 +153,22 @@ args.max_evals, ) + return unified_metrics + + +if __name__ == "__main__": + + # Add arguments + args = parser.parse_args() + + unified_metrics = main(args) + # 
save all the results if not os.path.isdir("metrics"): os.mkdir("metrics") - unified_metrics.to_csv(f"metrics/metrics_{args.target}.csv") - # mlflow.log_input(unified_metrics, context="unified metrics") - - # TODO: Adapt this code to the new structure - # visualizer = Visualizer( - # model_name=model_name, - # cm=cm, - # rates=rates_df, - # metrics=metrics, - # folder_path=args.plot_folder, - # classes=C.y_labels, - # dataset_size=str(dataset_size), - # ) - # path_CM = visualizer.plot_confusion_matrix() - - # data.index = dataset_sizes - # model_metrics.append(data) - # data.index = dataset_sizes - - # path_AC = visualizer.plot_metric( - # data=model_data, - # metric="accuracy", - # title="Accuracy", - # filename="accuracy.png", - # ) - # path_F1 = visualizer.plot_metric( - # data=model_data, - # metric="f1_score", - # title="F1 Score", - # filename="f1_score.png", - # ) - - # for df, model in zip(model_metrics, models): - # print(model) - # print(df) + + results_path = "metrics/results_one_target.csv" + if os.path.exists(results_path): + existing_data = pd.read_csv(results_path) + unified_metrics = pd.concat([existing_data, unified_metrics]) + unified_metrics.to_csv(results_path, index=False)