feat: make training scripts callable (#80)
Co-authored-by: Magdalena Lederbauer <[email protected]>
mlederbauer and Magdalena Lederbauer authored May 28, 2024
1 parent 2a52229 · commit feb81af
Showing 4 changed files with 197 additions and 127 deletions.
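In outline, the commit moves argument parsing to module scope and has each script's `main(args)` return a `pandas.DataFrame` of metrics, so the training scripts can be imported and called from other Python code as well as run from the shell. A minimal sketch of the pattern, with abbreviated flag names rather than the scripts' full parsers:

import argparse

import pandas as pd

# Parser at module scope: CLI users get the usual flags, while importers
# can build their own args object and skip parsing entirely.
parser = argparse.ArgumentParser(description="Train a model.")
parser.add_argument("--max_evals", type=int, default=3)


def main(args) -> pd.DataFrame:
    # ... training and evaluation loop ...
    return pd.DataFrame()  # the unified metrics table


if __name__ == "__main__":
    args = parser.parse_args()  # CLI entry point
    metrics = main(args)        # importers call main(args) directly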
1 change: 1 addition & 0 deletions .gitignore
@@ -3,6 +3,7 @@ scratch/
 dataset/
 plots/
 data/
+metrics/
 
 docs/source
 
214 changes: 149 additions & 65 deletions scripts/training/baselines.py
@@ -1,80 +1,164 @@
 import argparse
 import logging as log
+import os
 
-import numpy as np
+import mlflow
 import pandas as pd
+from sklearn.dummy import DummyClassifier
 
+from nmrcraft.analysis import plotting
 from nmrcraft.data.dataloader import DataLoader
-from nmrcraft.evaluation.evaluation import evaluate_model
+from nmrcraft.evaluation import evaluation
+from nmrcraft.utils.general import add_rows_metrics
 
+# Setup MLflow
+mlflow.set_experiment("Final_results")
+
+# Setup parser
+parser = argparse.ArgumentParser(
+    description="Train a model with MLflow tracking."
+)
+
+parser.add_argument(
+    "--max_evals",
+    type=int,
+    default=3,
+    help="The max evaluations for the hyperparameter tuning with hyperopt",
+)
+parser.add_argument(
+    "--target",
+    type=str,
+    default=["metal", "X3_ligand", "E_ligand"],
+    help="The Target for the predictions. Choose from: 'metal', 'X1_ligand', 'X2_ligand', 'X3_ligand', 'X4_ligand', 'L_ligand', 'E_ligand'",
+)
+parser.add_argument(
+    "--include_structural",
+    type=bool,
+    default=False,
+    help="Handles if structural features will be included or only nmr tensors are used.",
+)
+parser.add_argument(
+    "--plot_folder",
+    type=str,
+    default="plots/",
+    help="The Folder where the plots are saved",
+)
 
-def main():
-    parser = argparse.ArgumentParser(
-        description="Simplified model training script."
-    )
-    parser.add_argument(
-        "--targets",
-        type=str,
-        default=["metal", "E_ligand"],
-        help="The Target for the predictions.",
-    )
-    parser.add_argument(
-        "--dataset_size",
-        type=float,
-        default=1.0,
-        help="Size of the dataset to load.",
-    )
-    parser.add_argument(
-        "--random_baseline",
-        type=bool,
-        default=False,
-        help="Use a random baseline model.",
-    )
-    args = parser.parse_args()
+
+def main(args) -> pd.DataFrame:
 
-    # Set up logging
+    # Check if folder path exists, if not create it
+    if not os.path.exists(args.plot_folder):
+        os.makedirs(args.plot_folder)
+
+    # Setup logging
     log.basicConfig(
-        level=log.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
+        format="%(asctime)s %(message)s",
+        level=log.INFO,
+        force=True,
+        datefmt="%Y-%m-%d %H:%M:%S",
     )
+    log.getLogger().setLevel(log.INFO)
 
-    # Load data
-    dataloader = DataLoader(
-        target_columns=args.targets,
-        dataset_size=args.dataset_size,
-        feature_columns=[
-            "M_sigma11_ppm",
-            "M_sigma22_ppm",
-            "M_sigma33_ppm",
-            "E_sigma11_ppm",
-            "E_sigma22_ppm",
-            "E_sigma33_ppm",
-        ],
-        complex_geometry="oct",
-        test_size=0.3,
-        random_state=42,
-        include_structural_features=False,
-        testing=False,
-    )
-    X_train, X_test, y_train, y_test, y_labels = dataloader.load_data()
-
-    predictions = np.zeros_like(y_test)
-
-    for i in range(len(args.targets)):  # Loop through each target column
-        if args.random_baseline:
-            unique_vals = np.unique(y_train[:, i])
-            predictions[:, i] = np.random.choice(unique_vals, size=len(y_test))
-        else:
-            most_common = pd.Series(y_train[:, i]).mode()[0]
-            predictions[:, i] = np.full(
-                shape=y_test[:, i].shape, fill_value=most_common
-            )
-
-    # Evaluate the model
-    metrics, confusion_matrices = evaluate_model(
-        y_test, predictions, args.targets
-    )
-    log.info("Evaluation Metrics: %s", metrics)
+    dataset_sizes = [
+        0.01,
+        0.1,
+        0.5,
+        1.0,
+    ]
+    models = [
+        "baseline_random_ligand",
+        "baseline_most_often",
+    ]
+
+    # Initialize df to store all the info for later plotting
+    unified_metrics_columns = [
+        "target",
+        "model_targets",
+        "model",
+        "nmr_only",
+        "dataset_fraction",
+        "max_evals",
+        "accuracy_mean",
+        "accuracy_lb",
+        "accuracy_hb",
+        "f1_mean",
+        "f1_lb",
+        "f1_hb",
+    ]
+    unified_metrics = pd.DataFrame(columns=unified_metrics_columns)
+
+    with mlflow.start_run():
+
+        for model_name in models:
+
+            for dataset_size in dataset_sizes:
+                data_loader = DataLoader(
+                    target_columns=args.target,
+                    dataset_size=dataset_size,
+                )
+                (
+                    X_train,
+                    X_test,
+                    y_train,
+                    y_test,
+                    y_labels,
+                ) = data_loader.load_data()
+
+                if model_name == "baseline_random_ligand":
+                    multioutput_model = DummyClassifier(strategy="uniform")
+                elif model_name == "baseline_most_often":
+                    multioutput_model = DummyClassifier(
+                        strategy="most_frequent"
+                    )
+
+                multioutput_model.fit(X_train, y_train)
+                y_pred = multioutput_model.predict(X_test)
+
+                metrics, cm_list = evaluation.evaluate_model(
+                    y_test, y_pred, args.target
+                )
+
+                plotting.plot_confusion_matrix(
+                    cm_list,
+                    y_labels,
+                    model_name,
+                    dataset_size,
+                    args.plot_folder,
+                )
+
+                bootstrap_metrics = evaluation.evaluate_bootstrap(
+                    X_test, y_test, multioutput_model, args.target
+                )
+
+                statistical_metrics = evaluation.metrics_statistics(
+                    bootstrap_metrics
+                )
+
+                unified_metrics = add_rows_metrics(
+                    unified_metrics,
+                    statistical_metrics,
+                    dataset_size,
+                    args.include_structural,
+                    model_name,
+                    args.max_evals,
+                )
+    return unified_metrics
 
 
 if __name__ == "__main__":
-    main()
+
+    # Add arguments
+    args = parser.parse_args()
+
+    unified_metrics = main(args)
+
+    # save all the results
+    if not os.path.isdir("metrics"):
+        os.mkdir("metrics")
+
+    results_path = "metrics/results_baselines.csv"
+    if os.path.exists(results_path):
+        existing_data = pd.read_csv(results_path)
+        unified_metrics = pd.concat([existing_data, unified_metrics])
+    unified_metrics.to_csv(results_path, index=False)
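With the parser at module scope and `main(args)` returning the metrics DataFrame, `baselines.py` can now be driven from other Python code, not only the shell. A hypothetical invocation — it assumes the repository root is importable and MLflow is configured; the argument names mirror the parser above:

import argparse

from scripts.training import baselines  # assumes scripts/ is importable

args = argparse.Namespace(
    max_evals=3,
    target=["metal", "E_ligand"],
    include_structural=False,
    plot_folder="plots/",
)
metrics_df = baselines.main(args)  # returns the unified metrics DataFrame
print(metrics_df.head())

Note also that the hand-rolled random and most-common predictions from the old `main()` are replaced by scikit-learn's `DummyClassifier` with `strategy="uniform"` and `strategy="most_frequent"`, which implement the same two baselines with less code.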
36 changes: 22 additions & 14 deletions scripts/training/multiple_tragets.py
@@ -15,7 +15,7 @@
 from nmrcraft.utils.general import add_rows_metrics
 
 # Setup MLflow
-mlflow.set_experiment("Test_final_results")
+mlflow.set_experiment("Final_results")
 
 # Setup parser
 parser = argparse.ArgumentParser(
@@ -31,7 +31,7 @@
 parser.add_argument(
     "--target",
     type=str,
-    default=["metal", "X3_ligand", "X4_ligand"],
+    default=["metal", "E_ligand", "X4_ligand"],
     help="The Target for the predictions. Choose from: 'metal', 'X1_ligand', 'X2_ligand', 'X3_ligand', 'X4_ligand', 'L_ligand', 'E_ligand'",
 )
 parser.add_argument(
@@ -48,9 +48,7 @@
 )
 
 
-if __name__ == "__main__":
-    # Add arguments
-    args = parser.parse_args()
+def main(args) -> pd.DataFrame:
 
     # Check if folder path exists, if not create it
     if not os.path.exists(args.plot_folder):
@@ -66,11 +64,10 @@
     log.getLogger().setLevel(log.INFO)
 
     dataset_sizes = [
-        # 0.01,
+        0.01,
         0.1,
-        0.15,
-        # 0.5,
-        # 1.0,
+        0.5,
+        1.0,
     ]
     models = [
         "random_forest",
@@ -95,10 +92,8 @@
     unified_metrics = pd.DataFrame(columns=unified_metrics_columns)
 
     with mlflow.start_run():
-        model_metrics = []
 
         for model_name in models:
-            data = pd.DataFrame()
             config = model_configs[model_name]
             tuner = HyperparameterTuner(
                 model_name, config, max_evals=args.max_evals
@@ -156,9 +151,22 @@
                 model_name,
                 args.max_evals,
             )
-            # Add all the newly generated metrics to the unified dataframe
+    return unified_metrics
+
+
+if __name__ == "__main__":
+
+    # Add arguments
+    args = parser.parse_args()
+
+    unified_metrics = main(args)
+
     # save all the results
     if not os.path.isdir("metrics"):
         os.mkdir("metrics")
-    unified_metrics.to_csv(f"metrics/metrics_{args.target}.csv")
-    # mlflow.log_input(unified_metrics, context="unified metrics")
+
+    results_path = "metrics/results_multi_target.csv"
+    if os.path.exists(results_path):
+        existing_data = pd.read_csv(results_path)
+        unified_metrics = pd.concat([existing_data, unified_metrics])
+    unified_metrics.to_csv(results_path, index=False)
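Both scripts now end with the same append-or-create pattern for their results CSV: read the existing file if present, concatenate the new rows, and rewrite. If more scripts adopt it, the pattern could be factored into a shared helper — a sketch under that assumption (`append_results` is hypothetical, not part of the repo):

import os

import pandas as pd


def append_results(df: pd.DataFrame, path: str) -> None:
    """Append df to the CSV at path, creating folder and file if needed."""
    os.makedirs(os.path.dirname(path) or ".", exist_ok=True)
    if os.path.exists(path):
        # Keep previously saved rows and append the new ones below them.
        df = pd.concat([pd.read_csv(path), df])
    df.to_csv(path, index=False)


# e.g. append_results(unified_metrics, "metrics/results_baselines.csv")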