Skip to content

Commit

Permalink
feat: functional multiclass models
Browse files Browse the repository at this point in the history
  • Loading branch information
kbiniek committed May 26, 2024
1 parent cb69197 commit 4af718e
Show file tree
Hide file tree
Showing 5 changed files with 103 additions and 169 deletions.
22 changes: 3 additions & 19 deletions nmrcraft/models/model_configs.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
"gradient_boosting": {
"model_params": {"random_state": 42},
"hyperparameters": {
"loss": hp.choice("loss", ["log_loss", "exponential"]),
"loss": hp.choice("loss", ["log_loss"]),
"learning_rate": hp.uniform("learning_rate", 0.01, 0.5),
"n_estimators": hp.choice("n_estimators", range(10, 1000, 10)),
# "subsample": hp.uniform("subsample", 0.01, 1.0),
Expand All @@ -31,17 +31,9 @@
"logistic_regression": {
"model_params": {"random_state": 42},
"hyperparameters": {
"penalty": hp.choice("penalty", ["l1", "l2", "elasticnet", None]),
"C": hp.uniform("C", 0.01, 10.0),
"solver": hp.choice("solver", ["saga"]),
# lbfgs --> l2, None
# liblinear --> l1, l2
# newton-cg --> l2, None
# newton-cholesky --> l2, None
# sag --> l2, None
# saga --> l1, l2, elasticnet, None
"max_iter": hp.choice("max_iter", range(100, 1000, 100)),
"l1_ratio": hp.uniform("l1_ratio", 0.01, 1.0),
"solver": hp.choice("solver", ["newton-cg", "sag", "saga"]),
# "max_iter": hp.choice("max_iter", range(100, 1000, 100)),
},
},
"svc": {
Expand All @@ -58,12 +50,4 @@
# "max_iter": hp.choice("max_iter", range(100, 1000, 100)),
},
},
"gpc": {
"model_params": {"random_state": 42},
"hyperparameters": {
"n_restarts_optimizer": hp.choice(
"n_restarts_optimizer", range(0, 20)
),
},
},
}
2 changes: 0 additions & 2 deletions nmrcraft/models/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
from typing import Any

from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

Expand Down Expand Up @@ -53,7 +52,6 @@ def load_model(model_name: str, **kwargs: Any):
"gradient_boosting": GradientBoostingClassifier,
"logistic_regression": LogisticRegression,
"svc": SVC,
"gpc": GaussianProcessClassifier,
}
# TODO: put model config here

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -56,16 +56,15 @@
dataset_sizes = [
# 0.01,
0.1,
0.15
0.15,
# 0.5,
# 1.0,
]
models = [
"random_forest",
# "random_forest",
"logistic_regression",
# "gradient_boosting",
"svc",
# "gpc"
# "svc",
]

with mlflow.start_run():
Expand Down
97 changes: 97 additions & 0 deletions scripts/training/test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
import argparse

import mlflow
from sklearn.metrics import (
accuracy_score,
confusion_matrix,
f1_score,
)

from nmrcraft.data.dataset import DataLoader

# precision_score,
# recall_score,
from nmrcraft.models.model_configs import model_configs
from nmrcraft.models.models import load_model
from nmrcraft.training.hyperparameter_tune import HyperparameterTuner
from nmrcraft.utils.set_seed import set_seed

# Seed all RNGs once at import time so hyperparameter search and
# train/test splits are reproducible across runs.
set_seed()


def main(dataset_size, target, model_name):
    """Tune, train and evaluate a single classifier, logging to MLflow.

    Loads the NMR dataset, runs hyperparameter search for the chosen
    model, refits the best configuration on the training split and
    prints accuracy, macro-F1 and the confusion matrix on the test split.

    Args:
        dataset_size (float): Fraction of the full dataset to load.
        target (str): Target column specifier (e.g. "metal", "X3",
            or combinations such as "metal_1X_L").
        model_name (str): Key into ``model_configs`` identifying the
            classifier to train.
    """
    # TODO: better experiment naming
    mlflow.set_experiment("Ceci_nest_pas_un_experiment")

    with mlflow.start_run():
        config = model_configs[model_name]

        # NMR chemical-shift tensor components used as model inputs.
        feature_columns = [
            "M_sigma11_ppm",
            "M_sigma22_ppm",
            "M_sigma33_ppm",
            "E_sigma11_ppm",
            "E_sigma22_ppm",
            "E_sigma33_ppm",
        ]

        # BUG FIX: previously read the module-level ``args`` globals
        # (args.target / args.dataset_size) instead of the function
        # parameters, which made the parameters dead and broke any
        # programmatic (non-CLI) call of main().
        data_loader = DataLoader(
            feature_columns=feature_columns,
            target_columns=target,
            dataset_size=dataset_size,
            target_type="categorical",
        )

        # Load and preprocess data
        X_train, X_test, y_train, y_test, y_labels = data_loader.load_data()

        tuner = HyperparameterTuner(model_name, config, max_evals=1)
        best_params, _ = tuner.tune(X_train, y_train)

        # Fixed model_params take precedence over tuned hyperparameters
        # (the dict merge lets config["model_params"] override best_params).
        best_model = load_model(
            model_name, **{**best_params, **config["model_params"]}
        )
        best_model.fit(X_train, y_train)

        mlflow.log_params(best_params)
        mlflow.log_params(
            {
                "model_name": model_name,
                "dataset_size": dataset_size,
                "target": target,
            }
        )

        # Evaluate on the held-out split; macro averaging weights all
        # classes equally regardless of support.
        y_pred = best_model.predict(X_test)
        cm = confusion_matrix(y_test, y_pred)
        ac = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred, average="macro")
        print(f"Accuracy: {ac}, F1: {f1}, Confusion Matrix:\n{cm}")


if __name__ == "__main__":
    # CLI entry point: build the argument parser from a compact spec
    # table, then hand the parsed values to main().
    cli = argparse.ArgumentParser(
        description="Train a model with MLflow tracking."
    )
    for flag, value_type, default, help_text in (
        (
            "--dataset_size",
            float,
            0.01,
            "Fraction of dataset to use",
        ),
        (
            "--target",
            str,
            "X3",
            "Specify the target(s) to select (metal, X1-X4, L, E or combinations of them, e.g., metal_1X_L)",
        ),
        (
            "--model_name",
            str,
            "gradient_boosting",
            "Model name to load ('random_forest', 'logistic_regression', 'svc')",
        ),
    ):
        cli.add_argument(flag, type=value_type, default=default, help=help_text)

    # NOTE: keep the module-level name ``args`` — other code in this
    # file may reference it.
    args = cli.parse_args()

    main(args.dataset_size, args.target, args.model_name)
144 changes: 0 additions & 144 deletions scripts/training/train_metal.py

This file was deleted.

0 comments on commit 4af718e

Please sign in to comment.