From e9927523c295894a51967729f977082d793774f7 Mon Sep 17 00:00:00 2001 From: tiaguinho-code <115071397+tiaguinho-code@users.noreply.github.com> Date: Thu, 23 May 2024 08:39:32 -0500 Subject: [PATCH 01/26] Docker add arial (#61) * Init arial fix * Added Arial to Archlinux docker What a pain... * feat: add arial to matplotlib style file --------- Co-authored-by: TiagoW Co-authored-by: Magdalena Lederbauer --- Dockerfile | 16 +++++++++++++++- style.mplstyle | 2 ++ 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index da2beb8..45e07d1 100644 --- a/Dockerfile +++ b/Dockerfile @@ -3,10 +3,22 @@ FROM archlinux # Install python, poetry and looks stuff from arch system repos RUN pacman-key --init RUN pacman-key --populate -RUN pacman -Syu python python-poetry ranger neovim eza git tree zsh openssh which neofetch github-cli make --noconfirm +RUN pacman -Syu python python-poetry ranger neovim eza git tree zsh openssh which neofetch github-cli make binutils gcc pkg-config fakeroot debugedit --noconfirm # Set working directory and copy over config files and install python packages RUN mkdir -p /home/steve + +# Get Arial font +RUN cd /home/steve +WORKDIR /home/steve +RUN git clone https://aur.archlinux.org/ttf-ms-fonts.git +RUN mv ttf-ms-fonts/* . +RUN chmod 777 . +RUN runuser -unobody makepkg +RUN pacman -U ttf-ms-fonts*.pkg.tar.zst --noconfirm +RUN rm -r ./* + +# Get Project ADD https://api.github.com/repos/mlederbauer/NMRcraft/git/refs/heads/main version.json RUN git clone https://github.com/mlederbauer/NMRcraft.git /home/steve/NMRcraft WORKDIR /home/steve/NMRcraft @@ -14,6 +26,8 @@ RUN echo "🚀 Creating virtual environment using pyenv and poetry" RUN poetry install RUN poetry run pre-commit install + + # Quality of Life stuff ADD https://api.github.com/repos/tiaguinho-code/Archpy_dots/git/refs/heads/main version.json RUN git clone https://github.com/tiaguinho-code/Archpy_dots /home/steve/Archpy_dots diff --git a/style.mplstyle b/style.mplstyle index 0bff05d..919ea0e 100644 --- a/style.mplstyle +++ b/style.mplstyle @@ -1,3 +1,5 @@ +font.family : sans-serif +font.sans-serif : Arial font.size : 30 axes.titlesize : 24 axes.labelsize : 20 From cb691970031a5ec0d7fd8706110a43a1f2e52b3c Mon Sep 17 00:00:00 2001 From: Samuel Stricker Date: Thu, 23 May 2024 14:03:29 +0000 Subject: [PATCH 02/26] Work in progress: Multiclass possible now --- nmrcraft/evaluation/visualizer.py | 145 +++++++++++++++++++++--------- nmrcraft/models/classifier.py | 145 ++++++++++++++++++++++-------- nmrcraft/models/model_configs.py | 9 +- nmrcraft/models/models.py | 5 ++ scripts/training/final_results.py | 48 ++++++---- 5 files changed, 254 insertions(+), 98 deletions(-) diff --git a/nmrcraft/evaluation/visualizer.py b/nmrcraft/evaluation/visualizer.py index 9ad3a91..77a92f7 100644 --- a/nmrcraft/evaluation/visualizer.py +++ b/nmrcraft/evaluation/visualizer.py @@ -2,60 +2,119 @@ import os import matplotlib.pyplot as plt +import numpy as np +from cycler import cycler +from matplotlib.colors import LinearSegmentedColormap class Visualizer: - def __init__(self, model_name: str, data: None, folder_path: str): + def __init__( + self, + model_name: str, + cm: None, + rates=None, + metrics=None, + folder_path: str = "plots/", + classes=None, + dataset_size=None, + ): self.model_name = model_name - self.data = data + self.cm = cm + self.rates = (rates,) + self.metrics = metrics self.folder_path = folder_path + self.classes = classes + self.dataset_size = dataset_size + if not os.path.exists(folder_path): + os.makedirs(folder_path) - def plot_ROC( - self, title="ROC Curves by Dataset Size", filename="ROC_Curves.png" - ): - print(self.data.index) - plt.figure(figsize=(10, 8)) + def style_setup(): + """Function to set up matplotlib parameters.""" colors = [ - "blue", - "green", - "red", - "violet", - "orange", - "cyan", - ] # Colors for different dataset sizes - labels = [ - f"Dataset Size: {idx}" for idx in self.data.index - ] # Labels for legend + "#C28340", + "#854F2B", + "#61371F", + "#8FCA5C", + "#70B237", + "#477A1E", + ] + cmap = LinearSegmentedColormap.from_list("custom", colors) - for (index, row), color, label in zip( - self.data.iterrows(), colors, labels - ): - index = index + 1 - plt.plot( - row["fpr"], - row["tpr"], - label=f'{label} (AUC = {row["roc_auc"]:.2f})', - color=color, - ) + plt.style.use("./style.mplstyle") + plt.rcParams["text.latex.preamble"] = r"\usepackage{sansmathfonts}" + plt.rcParams["axes.prop_cycle"] = cycler(color=colors) + + # Use the first color from the custom color cycle + first_color = plt.rcParams["axes.prop_cycle"].by_key()["color"][0] + plt.rcParams["text.usetex"] = False + + return cmap, colors, first_color - plt.plot( - [0, 1], - [0, 1], - linestyle="--", - lw=2, - color="gray", - label="Chance", - alpha=0.8, + def plot_confusion_matrix(self, full=True, columns_set=False): + """ + Plots the confusion matrix. + Parameters: + - classes (list): List of classes for the axis labels. + - title (str): Title of the plot. + - full (bool): If true plots one big, else many smaller. + - columns_set (list of lists): contains all relevant indices. + Returns: + None + """ + + def normalize_row_0_1(row): + return (row - np.min(row)) / (np.max(row) - np.min(row)) + + file_path = os.path.join( + self.folder_path, + f"ConfusionMatrix_{self.model_name}_{self.dataset_size}.png", ) - plt.title(title) - plt.xlabel("False Positive Rate") - plt.ylabel("True Positive Rate") - plt.legend(loc="lower right") + # _, _, _ = self.style_setup() + if full: # Plot one big cm + plt.figure(figsize=(10, 8)) + plt.imshow( + self.cm.apply(normalize_row_0_1, axis=1), + interpolation="nearest", + cmap=plt.cm.Blues, + ) + plt.title("The Confusion Matrix") + plt.colorbar() + tick_marks = np.arange(len(self.classes)) + plt.xticks(tick_marks, self.classes, rotation=45) + plt.yticks(tick_marks, self.classes) + plt.tight_layout() + plt.ylabel("True label") + plt.xlabel("Predicted label") + plt.savefig(file_path) + plt.close() - file_path = os.path.join(self.folder_path, filename) - plt.savefig(file_path) - plt.close() # Close the plot to free up memory - return file_path + elif not full: # Plot many small cms of each target + cms = [] + for columns in columns_set: # Make list of confusion matrices + cms.append( + self.cm[ + slice(columns[0], columns[-1] + 1), + slice(columns[0], columns[-1] + 1), + ] + ) + fig, axs = plt.subplots(nrows=len(cms), figsize=(10, 8 * len(cms))) + for i, sub_cm in enumerate(cms): + sub_classes = self.classes[ + slice(columns_set[i][0], columns_set[i][-1] + 1) + ] + axs[i].imshow( + sub_cm, interpolation="nearest", cmap=plt.cm.Blues + ) + axs[i].set_title(f"Confusion Matrix {i+1}") + tick_marks = np.arange(len(sub_classes)) + axs[i].set_xticks(tick_marks) + axs[i].set_xticklabels(sub_classes, rotation=45) + axs[i].set_yticks(tick_marks) + axs[i].set_yticklabels(sub_classes) + plt.tight_layout() + # plt.savefig(path) + plt.close() + return file_path def plot_metric( self, diff --git a/nmrcraft/models/classifier.py b/nmrcraft/models/classifier.py index cfd4868..65c73cf 100644 --- a/nmrcraft/models/classifier.py +++ b/nmrcraft/models/classifier.py @@ -4,11 +4,10 @@ import pandas as pd from sklearn.metrics import ( accuracy_score, - auc, + confusion_matrix, f1_score, - # confusion_matrix, - multilabel_confusion_matrix, - roc_curve, + precision_score, + recall_score, ) from sklearn.utils import resample @@ -49,17 +48,22 @@ def __init__( max_evals=self.max_evals, ) # algo is set to default value, TODO: change this in declaration of Classifier is necessary + data_loader = DataLoader( + feature_columns=feature_columns, + target_columns=target, + dataset_size=dataset_size, + target_type="categorical", + ) ( self.X_train, self.X_test, self.y_train, self.y_test, self.y_labels, - ) = DataLoader( - feature_columns=feature_columns, - target_columns=target, - dataset_size=dataset_size, - ).load_data() + ) = data_loader.load_data() + self.classes = data_loader.confusion_matrix_label_adapter( + self.y_labels + ) def hyperparameter_tune(self): log.info( @@ -90,11 +94,11 @@ def train_bootstraped(self, n_times=10): replace=True, random_state=self.random_state, ) - self.hyperparameter_tune() + # self.hyperparameter_tune() self.train() - eval_data = self.evaluate() - accuracy.append(eval_data["accuracy"]) - f1_score.append(eval_data["f1_score"]) + rates_df, metrics, cm = self.evaluate() + accuracy.append(metrics["Accuracy"]) + f1_score.append(metrics["F1"]) i += 1 new_row = { "accuracy": np.mean(accuracy), @@ -106,36 +110,99 @@ def train_bootstraped(self, n_times=10): } return pd.DataFrame([new_row]) - def evaluate(self) -> pd.DataFrame(): + # def evaluate(self) -> pd.DataFrame(): + # """ + # Evaluate the performance of the trained machine learning model. + + # Returns: + # Tuple[Dict[str, float], Any, Any, Any]: A tuple containing: + # - A dictionary with evaluation metrics (accuracy, f1_score, roc_auc). + # - The confusion matrix. + # - The false positive rate. + # - The true positive rate. + # """ + # y_pred = self.model.predict(self.X_test) + # accuracy = accuracy_score(self.y_test, y_pred) + # f1 = f1_score(self.y_test, y_pred, average="weighted") + # fpr, tpr, _ = roc_curve( + # self.y_test, self.model.predict_proba(self.X_test)[:, 1] + # ) + # cm = multilabel_confusion_matrix(self.y_test, y_pred) + # roc_auc = auc(fpr, tpr) + + # # Create DataFrame with consistent structure + # results_df = pd.DataFrame( + # { + # "accuracy": [accuracy], + # "f1_score": [f1], + # "roc_auc": [roc_auc], + # "fpr": [fpr.tolist()], + # "cm": [cm.tolist()], + # "tpr": [tpr.tolist()], + # } + # ) + + # return results_df + + def evaluate(self) -> pd.DataFrame: """ Evaluate the performance of the trained machine learning model. Returns: - Tuple[Dict[str, float], Any, Any, Any]: A tuple containing: - - A dictionary with evaluation metrics (accuracy, f1_score, roc_auc). - - The confusion matrix. - - The false positive rate. - - The true positive rate. + pd.DataFrame: A DataFrame containing evaluation metrics (accuracy, f1_score, roc_auc), + the confusion matrix, false positive rates, and true positive rates for each class. """ y_pred = self.model.predict(self.X_test) - accuracy = accuracy_score(self.y_test, y_pred) - f1 = f1_score(self.y_test, y_pred, average="weighted") - fpr, tpr, _ = roc_curve( - self.y_test, self.model.predict_proba(self.X_test)[:, 1] - ) - cm = multilabel_confusion_matrix(self.y_test, y_pred) - roc_auc = auc(fpr, tpr) - - # Create DataFrame with consistent structure - results_df = pd.DataFrame( - { - "accuracy": [accuracy], - "f1_score": [f1], - "roc_auc": [roc_auc], - "fpr": [fpr.tolist()], - "cm": [cm.tolist()], - "tpr": [tpr.tolist()], - } - ) + # print(y_pred) + # accuracy = accuracy_score(self.y_test, y_pred) + # f1 = f1_score(self.y_test, y_pred, average="weighted") + + # Binarize the output + # y_test_bin = label_binarize( + # self.y_test, classes=np.unique(self.y_test) + # ) + + # Number of classes + # n_classes = y_test_bin.shape[1] + cm = confusion_matrix(self.y_test, y_pred) + + def calculate_fpr_fnr(cm): + FPR = [] + FNR = [] + num_classes = cm.shape[0] + for i in range(num_classes): + FP = cm[:, i].sum() - cm[i, i] + TN = cm.sum() - (cm[i, :].sum() + cm[:, i].sum() - cm[i, i]) + FN = cm[i, :].sum() - cm[i, i] + TP = cm[i, i] + + FPR.append(FP / (FP + TN)) + FNR.append(FN / (FN + TP)) + return np.array(FPR), np.array(FNR) + + # Calculate FPR and FNR for each class + FPR, FNR = calculate_fpr_fnr(cm) + rates_df = pd.DataFrame() + rates_df["FPR"] = FPR + rates_df["FNR"] = FNR + rates_df.index = self.y_labels + + # Calculating macro-averaged F1 Score, Precision, Recall + Precision = precision_score(self.y_test, y_pred, average="macro") + Recall = recall_score(self.y_test, y_pred, average="macro") + F1 = f1_score(self.y_test, y_pred, average="macro") + + # Calculating Accuracy + Accuracy = accuracy_score(self.y_test, y_pred) + + metrics = pd.DataFrame() + metrics["Accuracy"] = [Accuracy] + metrics["Recall"] = [Recall] + metrics["F1"] = [F1] + metrics["Precision"] = [Precision] + + cm = pd.DataFrame(cm) + cm.columns = self.y_labels + cm.index = self.y_labels - return results_df + return rates_df, metrics, cm diff --git a/nmrcraft/models/model_configs.py b/nmrcraft/models/model_configs.py index 75132c5..6d505bf 100644 --- a/nmrcraft/models/model_configs.py +++ b/nmrcraft/models/model_configs.py @@ -55,8 +55,15 @@ "gamma": hp.choice("gamma", ["scale", "auto"]), "coef0": hp.uniform("coef0", 0.0, 1.0), "shrinking": hp.choice("shrinking", [True, False]), - "probability": hp.choice("probability", [True, False]), # "max_iter": hp.choice("max_iter", range(100, 1000, 100)), }, }, + "gpc": { + "model_params": {"random_state": 42}, + "hyperparameters": { + "n_restarts_optimizer": hp.choice( + "n_restarts_optimizer", range(0, 20) + ), + }, + }, } diff --git a/nmrcraft/models/models.py b/nmrcraft/models/models.py index 1c42e52..9fc25bb 100644 --- a/nmrcraft/models/models.py +++ b/nmrcraft/models/models.py @@ -2,6 +2,7 @@ from typing import Any from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier +from sklearn.gaussian_process import GaussianProcessClassifier from sklearn.linear_model import LogisticRegression from sklearn.svm import SVC @@ -52,6 +53,7 @@ def load_model(model_name: str, **kwargs: Any): "gradient_boosting": GradientBoostingClassifier, "logistic_regression": LogisticRegression, "svc": SVC, + "gpc": GaussianProcessClassifier, } # TODO: put model config here @@ -69,6 +71,9 @@ def load_model(model_name: str, **kwargs: Any): if model_name == "svc": kwargs["probability"] = True + if model_name == "gpc": + kwargs["multi_class"] = "one_vs_one" + # Forth, validate all provided kwargs before creating the model instance validate_kwargs(kwargs, model_class, model_name) diff --git a/scripts/training/final_results.py b/scripts/training/final_results.py index 1c407db..249d373 100644 --- a/scripts/training/final_results.py +++ b/scripts/training/final_results.py @@ -19,13 +19,13 @@ parser.add_argument( "--max_evals", type=int, - default=10, + default=3, help="The max evaluatins for the hyperparameter tuning with hyperopt", ) parser.add_argument( "--target", type=str, - default="metal", + default="X3", help="The Target for the predictions. Choose from: 'metal', 'X1', 'X2', 'X3', 'X4', 'L', 'E' ", ) parser.add_argument( @@ -54,22 +54,25 @@ log.getLogger().setLevel(log.INFO) dataset_sizes = [ - 0.01, + # 0.01, 0.1, + 0.15 # 0.5, - 1.0, + # 1.0, ] models = [ "random_forest", "logistic_regression", - "gradient_boosting", + # "gradient_boosting", "svc", + # "gpc" ] with mlflow.start_run(): model_data = pd.DataFrame( columns=["accuracy", "f1_score", "dataset_size", "model"] ) + model_metrics = [] for model in models: data = pd.DataFrame() for dataset_size in dataset_sizes: @@ -84,22 +87,33 @@ # mlflow.log_metrics("dataset_size", dataset_size, step=i) C.hyperparameter_tune() C.train() - new_data = C.evaluate() + rates_df, metrics, cm = C.evaluate() + print(rates_df) + print(metrics) + print(cm) + # data[str(dataset_size)] = new_data - data = pd.concat( - [data, new_data.assign(dataset_size=dataset_size)], - ) + data = pd.concat([data, metrics]) data_BS = C.train_bootstraped(10) model_data = pd.concat([model_data, data_BS]) + visualizer = Visualizer( + model_name=model, + cm=cm, + rates=rates_df, + metrics=metrics, + folder_path=args.plot_folder, + classes=C.classes, + dataset_size=str(dataset_size), + ) + path_CM = visualizer.plot_confusion_matrix() + # print(data) + data.index = dataset_sizes + model_metrics.append(data) data.index = dataset_sizes - visualizer = Visualizer( - model_name=model, data=data, folder_path=args.plot_folder - ) - path_ROC = visualizer.plot_ROC(filename=f"ROC_Plot_{model}.png") - mlflow.log_artifact(path_ROC, f"ROC_Plot_{model}.png") - print(model_data) + # path_ROC = visualizer.plot_ROC(filename=f"ROC_Plot_{model}.png") + # mlflow.log_artifact(path_ROC, f"ROC_Plot_{model}.png") path_AC = visualizer.plot_metric( data=model_data, @@ -114,5 +128,9 @@ filename="f1_score.png", ) + for df, model in zip(model_metrics, models): + print(model) + print(df) + # mlflow.log_artifact("F1_Plot", path_F1) # mlflow.log_artifact("Accuracy_Plot", path_AC) From d86a2b848987faef87219fbcc544c3f71eaf3cc8 Mon Sep 17 00:00:00 2001 From: Magdalena Lederbauer Date: Thu, 23 May 2024 21:58:26 +0000 Subject: [PATCH 03/26] fix: standard scaling --- nmrcraft/data/dataset.py | 199 ++++++++++++++---------------- scripts/training/final_results.py | 10 +- 2 files changed, 95 insertions(+), 114 deletions(-) diff --git a/nmrcraft/data/dataset.py b/nmrcraft/data/dataset.py index c179140..e937610 100644 --- a/nmrcraft/data/dataset.py +++ b/nmrcraft/data/dataset.py @@ -10,7 +10,6 @@ from sklearn.preprocessing import ( LabelBinarizer, LabelEncoder, - OneHotEncoder, StandardScaler, ) @@ -18,6 +17,16 @@ set_seed() +TARGET_TYPES = [ + "metal", + "X1_ligand", + "X2_ligand", + "X3_ligand", + "X4_ligand", + "L_ligand", + "E_ligand", +] + class DatasetLoadError(FileNotFoundError): """Exeption raised when the Dataloader could not find data/dataset.csv, @@ -135,26 +144,6 @@ def get_target_columns(target_columns: str): return targets_transformed -def get_structural_feature_columns(target_columns: list): - """ - Function gets the feature columns given the target columns. The feature columns are those that will be in the X set. - """ - TARGET_TYPES = [ - "metal", - "X1_ligand", - "X2_ligand", - "X3_ligand", - "X4_ligand", - "L_ligand", - "E_ligand", - ] - - # Get the features as the not targets - features = [x for x in TARGET_TYPES if x not in target_columns] - - return features - - def target_label_readabilitizer(readable_labels): """ function takes in the classes from the binarzier and turns them into human readable list of same length of the target. @@ -255,14 +244,6 @@ def choose_geometry(self): self.dataset["geometry"] == "tbp" ] # only load trigonal bipyramidal complexes - def scale(self, X): - """ - Apply standard normalization to the feature set. - """ - scaler = StandardScaler() - X_scaled = scaler.fit_transform(X) - return X_scaled - def get_target_columns_separated(self): """Returns the column indicies of the target array nicely sorted. For example: metal_X1: [[0, 1], [1, 2, 3, 4]]""" @@ -364,25 +345,46 @@ def confusion_matrix_label_adapter(self, y_labels): def categorical_endocode_X(self): # Get NMR Featrues (passed ones) and structural Features - X_Structural_Features_Columns = get_structural_feature_columns( - target_columns=self.target_columns - ) - X_Structural_Features = self.dataset[ - X_Structural_Features_Columns - ].to_numpy() - - # Transpose the array - X_Structural_Features = transpose(X_Structural_Features) - - # Target-wise encoding with Label encoder and save encoders for later decoding - xs = [] - for i in range(len(X_Structural_Features)): - tmp_encoder = LabelEncoder() - tmp_encoder.fit(X_Structural_Features[i]) - xs.append(tmp_encoder.transform(X_Structural_Features[i])) - X_Structural_Features = list(zip(*xs)) # Kind of backtransposing - - return X_Structural_Features + # X_Structural_Features = self.dataset[ + # [x for x in TARGET_TYPES if x not in self.target_columns] + # ].to_numpy() + + # # Transpose the array + # X_Structural_Features = transpose(X_Structural_Features) + + # # Target-wise encoding with Label encoder and save encoders for later decoding + # xs = [] + # for i in range(len(X_Structural_Features)): + # tmp_encoder = LabelEncoder() + # tmp_encoder.fit(X_Structural_Features[i]) + # xs.append(tmp_encoder.transform(X_Structural_Features[i])) + # X_Structural_Features = list(zip(*xs)) # Kind of backtransposing + + # return X_Structural_Features + + # def encode_categorical_features(self): + # Select and extract the structural features from the dataset + structural_features = ( + self.dataset[ + [col for col in TARGET_TYPES if col not in self.target_columns] + ] + .to_numpy() + .T + ) # Transpose immediately after conversion to numpy + + # Encode features using LabelEncoder and store encoders for potential inverse transform + encoded_features = [] + self.encoders = [] # To store encoders for each feature + for features in structural_features: + encoder = LabelEncoder() + encoder.fit(features) + encoded_features.append(encoder.transform(features)) + self.encoders.append(encoder) + + # Convert the list of encoded features back to the original data structure + return np.array( + encoded_features + ).T # Transpose back to original orientation def categorical_endocode_y(self): # Get the targets @@ -406,28 +408,6 @@ def categorical_endocode_y(self): # Return y fuzed into a single array and y_labels return y, readable_labels - def one_hot_endocode_X(self): - """ - Method that does the one-hot encoding of the DataLoader's features - based on the selected targets - """ - # Get Columns corresponding to the features that are selected - X_Structural_Features_Columns = get_structural_feature_columns( - self.target_columns - ) - - # Get the features based on the selected columns - X_Structural_Features = self.dataset[ - X_Structural_Features_Columns - ].to_numpy() - - # One hot encode X structural - X_Structural_Features_enc = ( - OneHotEncoder().fit_transform(X_Structural_Features).toarray() - ) - - return X_Structural_Features_enc - def label_binarize_endocode_y(self): # Get the Targets and transpose @@ -490,13 +470,13 @@ def split_and_preprocess_categorical(self): random_state=self.random_state, ) - # Normalize features with no leakage from test set - X_train_NMR_scaled = self.scale(X_train_NMR) - X_test_NMR_scaled = self.scale(X_test_NMR) + scaler = StandardScaler() + X_train_NMR_scaled = scaler.fit_transform(X_train_NMR) + X_test_NMR_scaled = scaler.transform(X_test_NMR) if self.include_structural_features: # Combine scaled NMR features with structural features to get final X - X_train_scaled = np.concatenate( + X_train_scaled = X_train_scaled = np.concatenate( [X_train_NMR_scaled, X_train_structural], axis=1 ) X_test_scaled = np.concatenate( @@ -510,63 +490,64 @@ def split_and_preprocess_categorical(self): # Get the target labels going y_label = target_label_readabilitizer_categorical(readable_labels) + y_train = np.squeeze(y_train) + y_test = np.squeeze(y_test) + return X_train_scaled, X_test_scaled, y_train, y_test, y_label - def split_and_preprocess_one_hot(self): - """ - Split data into training and test sets, then apply normalization. - Ensures that the test data does not leak into training data preprocessing. Returned X is one-hot encoded and y binarized using the sklearn functions. - """ - # Get NMR features + def split_and_preprocess(self): + # Extract and encode categorical features X_NMR = self.dataset[self.feature_columns].to_numpy() + X_Structural = self.encode_categorical_features() - # Get structural features one-hot encoded - X_Structural_Features_enc = self.one_hot_endocode_X() - - # Get structural targets, binarized - y, readable_labels = self.label_binarize_endocode_y() + # Encode target variables and store readable labels + ( + y_encoded, + readable_labels, + ) = self.encode_targets() # Assuming this method exists and is similar - # Split the datasets + # Split data into training and testing sets ( X_train_NMR, X_test_NMR, - X_train_structural, - X_test_structural, + X_train_Structural, + X_test_Structural, y_train, y_test, ) = train_test_split( X_NMR, - X_Structural_Features_enc, - y, + X_Structural, + y_encoded, test_size=self.test_size, random_state=self.random_state, ) - # Normalize features with no leakage from test set - X_train_NMR_scaled = self.scale(X_train_NMR) - X_test_NMR_scaled = self.scale(X_test_NMR) + # Scale NMR features + scaler = StandardScaler() + X_train_NMR_scaled = scaler.fit_transform(X_train_NMR) + X_test_NMR_scaled = scaler.transform(X_test_NMR) + # Combine features if structural features are included if self.include_structural_features: - # Combine scaled NMR features with structural features to get final X - X_train_scaled = np.concatenate( - [X_train_NMR_scaled, X_train_structural], axis=1 + X_train = np.concatenate( + [X_train_NMR_scaled, X_train_Structural], axis=1 ) - X_test_scaled = np.concatenate( - [X_test_NMR_scaled, X_test_structural], axis=1 + X_test = np.concatenate( + [X_test_NMR_scaled, X_test_Structural], axis=1 ) else: - # Just have the NMR features as X - X_train_scaled = X_train_NMR_scaled - X_test_scaled = X_test_NMR_scaled + X_train = X_train_NMR_scaled + X_test = X_test_NMR_scaled - # Creates the labels that can be used to identify the targets in the binaized y-array - # (basicall handle special metal behaviour) - good_target_labels = target_label_readabilitizer(readable_labels) + # Format the target labels for readability + y_labels = self.format_target_labels( + readable_labels + ) # Assuming this formatting function exists return ( - X_train_scaled, - X_test_scaled, - y_train, - y_test, - good_target_labels, + X_train, + X_test, + np.squeeze(y_train), + np.squeeze(y_test), + y_labels, ) diff --git a/scripts/training/final_results.py b/scripts/training/final_results.py index 249d373..7e37c50 100644 --- a/scripts/training/final_results.py +++ b/scripts/training/final_results.py @@ -20,7 +20,7 @@ "--max_evals", type=int, default=3, - help="The max evaluatins for the hyperparameter tuning with hyperopt", + help="The max evaluations for the hyperparameter tuning with hyperopt", ) parser.add_argument( "--target", @@ -56,15 +56,15 @@ dataset_sizes = [ # 0.01, 0.1, - 0.15 + # 0.15 # 0.5, # 1.0, ] models = [ - "random_forest", - "logistic_regression", + "random_forest" + # "logistic_regression", # "gradient_boosting", - "svc", + # "svc", # "gpc" ] From 3a17d156558fd80e4870ed47af6b3142eaacdae7 Mon Sep 17 00:00:00 2001 From: Magdalena Lederbauer Date: Thu, 23 May 2024 22:51:38 +0000 Subject: [PATCH 04/26] feat: simplified data loader --- nmrcraft/data/dataset.py | 553 ------------------------------ nmrcraft/models/classifier.py | 4 - scripts/training/final_results.py | 6 +- 3 files changed, 3 insertions(+), 560 deletions(-) delete mode 100644 nmrcraft/data/dataset.py diff --git a/nmrcraft/data/dataset.py b/nmrcraft/data/dataset.py deleted file mode 100644 index e937610..0000000 --- a/nmrcraft/data/dataset.py +++ /dev/null @@ -1,553 +0,0 @@ -"""Load and preprocess data.""" - -import itertools -import os - -import numpy as np -import pandas as pd -from datasets import load_dataset -from sklearn.model_selection import train_test_split -from sklearn.preprocessing import ( - LabelBinarizer, - LabelEncoder, - StandardScaler, -) - -from nmrcraft.utils.set_seed import set_seed - -set_seed() - -TARGET_TYPES = [ - "metal", - "X1_ligand", - "X2_ligand", - "X3_ligand", - "X4_ligand", - "L_ligand", - "E_ligand", -] - - -class DatasetLoadError(FileNotFoundError): - """Exeption raised when the Dataloader could not find data/dataset.csv, - even after trying to generate it from huggingface""" - - def __init__(self, t): - super().__init__(f"Could not load raw Dataset '{t}'") - - -class InvalidTargetError(ValueError): - """Exception raised when the specified model name is not found.""" - - def __init__(self, t): - super().__init__(f"Invalid target '{t}'") - - -class InvalidTargetTypeError(ValueError): - """Exception raised when the specified target type is not valid.""" - - def __init__(self, t): - super().__init__(f"Invalid target Type '{t}'") - - -def filename_to_ligands(dataset: pd.DataFrame): - """ - Extract ligands from the filename and add as columns to the dataset. - Assumes that filenames are structured in a specific way that can be parsed into ligands. - """ - filename_parts = dataset["file_name"].str.split("_", expand=True) - dataset["metal"] = filename_parts.get(0) - dataset["geometry"] = filename_parts.get(1) - dataset["E_ligand"] = filename_parts.get(2) - dataset["X1_ligand"] = filename_parts.get(3) - dataset["X2_ligand"] = filename_parts.get(4) - dataset["X3_ligand"] = filename_parts.get(5) - dataset["X4_ligand"] = filename_parts.get(6) - dataset["L_ligand"] = filename_parts.get(7).fillna( - "none" - ) # Fill missing L_ligand with 'none' - return dataset - - -def load_dummy_dataset_locally(datset_path: str = "tests/data.csv"): - dataset = pd.read_csv(datset_path) - return dataset - - -def load_dataset_from_hf( - dataset_name: str = "NMRcraft/nmrcraft", data_files: str = "all_no_nan.csv" -): - """Load the dataset. - - This function loads the dataset using the specified dataset name and data files. - It assumes that you have logged into the Hugging Face CLI prior to calling this function. - - Args: - dataset_name (str, optional): The name of the dataset. Defaults to "NMRcraft/nmrcraft". - data_files (str, optional): The name of the data file. Defaults to 'all_no_nan.csv'. - - Returns: - pandas.DataFrame: The loaded dataset as a pandas DataFrame. - """ - # Create data dir if needed - if not os.path.isdir("data"): - os.mkdir("data") - # Check if hf dataset is already downloaded, else download it and then load it - if not os.path.isfile("data/dataset.csv"): - dataset = load_dataset(dataset_name, data_files=data_files)[ - "train" - ].to_pandas() - dataset.to_csv("data/dataset.csv") - if os.path.isfile("data/dataset.csv"): - dataset = pd.read_csv("data/dataset.csv") - elif not os.path.isfile("data/dataset.csv"): - raise DatasetLoadError(FileNotFoundError) - return dataset - - -def transpose(array: any): - """rotate/transpose array to the right""" - ar = array[:] # make copy just to be sure - ar = [ # rotate the array to the right - list(x) if i == 0 else x for i, x in enumerate(map(list, zip(*ar))) - ] - return ar - - -def get_target_columns(target_columns: str): - """ - Function takes target columns in underline format f.e 'metal_X1_X4_X2_L' and - transforms into a list of the column names present in the dataset. - """ - TARGET_TYPES = ["metal", "X1", "X2", "X3", "X4", "L", "E"] - - # Split the target string into individual targets - targets = [t.strip() for t in target_columns.split("_")] - - # Check if the targets are valid - for t in targets: - if t not in TARGET_TYPES: - raise InvalidTargetError(t) - - # Translate them into Dataframe Column names - target_map = { - "metal": "metal", - "X1": "X1_ligand", - "X2": "X2_ligand", - "X3": "X3_ligand", - "X4": "X4_ligand", - "L": "L_ligand", - "E": "E_ligand", - } - targets_transformed = [target_map[t] for t in targets] - - return targets_transformed - - -def target_label_readabilitizer(readable_labels): - """ - function takes in the classes from the binarzier and turns them into human readable list of same length of the target. - """ - # Trun that class_ into list - human_readable_label_list = list(itertools.chain(*readable_labels)) - # Handle Binarized metal stuff and make the two columns become a single one because the metals get turned into a single column by the binarizer - for i in enumerate(human_readable_label_list): - if ( - human_readable_label_list[i[0]] == "Mo" - and human_readable_label_list[i[0] + 1] == "W" - ) or ( - human_readable_label_list[i[0]] == "W" - and human_readable_label_list[i[0] + 1] == "Mo" - ): - human_readable_label_list[i[0]] = "Mo W" - human_readable_label_list.pop(i[0] + 1) - - return human_readable_label_list - - -def target_label_readabilitizer_categorical(target_labels): - good_labels = [] - for label_array in target_labels: - good_labels.append(list(label_array)) - return good_labels - - -def column_length_to_indices(column_lengths): - indices = [] - start_index = 0 - for length in column_lengths: - if length == 1: - indices.append([start_index]) - else: - indices.append(list(range(start_index, start_index + length))) - start_index += length - return indices - - -class DataLoader: - def __init__( - self, - dataset_name="NMRcraft/nmrcraft", - data_files="all_no_nan.csv", - feature_columns=None, - target_columns="metal", - target_type="one-hot", # can be "categorical" or "one-hot", - complex_geometry="all", - test_size=0.3, - random_state=42, - dataset_size=0.01, - include_structural_features=True, - testing=False, - ): - self.feature_columns = feature_columns - self.target_columns = get_target_columns(target_columns=target_columns) - self.test_size = test_size - self.random_state = random_state - self.dataset_size = dataset_size - self.target_type = target_type - self.complex_geometry = complex_geometry - self.include_structural_features = include_structural_features - - if not testing: - self.dataset = load_dataset_from_hf() - elif testing: - self.dataset = load_dummy_dataset_locally() - - def load_data(self): - self.dataset = filename_to_ligands(self.dataset) - self.dataset = self.dataset.sample(frac=self.dataset_size) - self.choose_geometry() - if self.target_type == "categorical": - return self.split_and_preprocess_categorical() - elif ( - self.target_type == "one-hot" - ): # Target is binarized and Features are one hot - return self.split_and_preprocess_one_hot() - else: - raise InvalidTargetTypeError(ValueError) - - def choose_geometry(self): - """ - Reduce the dataset down to a certain geometry if a valid - one was passed, else just leave it as is. - """ - if self.complex_geometry == "oct": - self.dataset = self.dataset[ - self.dataset["geometry"] == "oct" - ] # only load octahedral complexes - elif self.complex_geometry == "spy": - self.dataset = self.dataset[ - self.dataset["geometry"] == "spy" - ] # only load square pyramidal complexes - elif self.complex_geometry == "tbp": - self.dataset = self.dataset[ - self.dataset["geometry"] == "tbp" - ] # only load trigonal bipyramidal complexes - - def get_target_columns_separated(self): - """Returns the column indicies of the target array nicely sorted. - For example: metal_X1: [[0, 1], [1, 2, 3, 4]]""" - if ( - "metal" in self.target_columns - ): # If targets have metal, do weird stuff - metal_index = self.target_columns.index("metal") - y_column_indices = column_length_to_indices( - self.target_column_numbers - ) - for i in range(len(y_column_indices)): - if i == metal_index: - y_column_indices[i].append(y_column_indices[i][0] + 1) - if i > metal_index: - y_column_indices[i] = [x + 1 for x in y_column_indices[i]] - - elif "metal" not in self.target_columns: - y_column_indices = column_length_to_indices( - self.target_column_numbers - ) - return y_column_indices - - def more_than_one_target(self): - """Function returns true if more than one target is specified""" - return len(self.target_columns) > 1 - - def categorical_target_decoder(self, y): - """ - function takes in the target (y) array and transforms it back to decoded form. - For this function to be run the split_and_preprocess_categorical already has to have been run beforehand. - """ - ys = y[:] # copy y so it's not modified - target_encoders = self.target_label_encoders - ys_decoded = [] - ys = transpose(ys) - - # Decode columnwise - for i, target_column in enumerate(ys): - ys_decoded.append( - target_encoders[i].inverse_transform(target_column) - ) - - # Rotate back so each row corresponds to a complex and not the target like metal or X4 - ys_decoded_properly_rotated = [ - list(x) if i == 0 else x - for i, x in enumerate(map(list, zip(*ys_decoded))) - ] - - return np.array(ys_decoded_properly_rotated) - - def binarized_target_decoder(self, y): - """ - function takes in the target (y) array and transforms it back to decoded form. - For this function to be run the one-hot-preprocesser already has to have been run beforehand. - """ - y_column_indices = column_length_to_indices(self.target_column_numbers) - ys = [] - ys_decoded = [] - # Split up compressed array into the categories - for i in range(len(y_column_indices)): - ys.append(y[:, y_column_indices[i]]) - - # Decode the binarized categries using the original binarizers - for i in range(len(ys)): - ys_decoded.append(self.encoders[i].inverse_transform(ys[i])) - - # Rotate the array - ys_decoded_properly_rotated = [ - list(x) if i == 0 else x - for i, x in enumerate(map(list, zip(*ys_decoded))) - ] - return ys_decoded_properly_rotated - - def confusion_matrix_data_adapter_categorical(self, y): - """ - Takes in binary encoded target array and returns decoded flat list. - Especially designed to work with confusion matrix. - """ - y_decoded = self.categorical_target_decoder(y) - flat_y_decoded = [y for ys in y_decoded for y in ys] - return flat_y_decoded - - def confusion_matrix_data_adapter_one_hot(self, y): - """ - Takes in binary encoded target array and returns decoded flat list. - Especially designed to work with confusion matrix. - """ - y_decoded = self.binarized_target_decoder(y) - flat_y_decoded = [y for ys in y_decoded for y in ys] - return flat_y_decoded - - def confusion_matrix_label_adapter(self, y_labels): - y_labels_copy = y_labels[:] - for i in range(len(y_labels)): - if y_labels_copy[i] == "Mo W": - y_labels_copy[i] = "Mo" - y_labels_copy.insert(i, "W") - return y_labels_copy - - def categorical_endocode_X(self): - # Get NMR Featrues (passed ones) and structural Features - # X_Structural_Features = self.dataset[ - # [x for x in TARGET_TYPES if x not in self.target_columns] - # ].to_numpy() - - # # Transpose the array - # X_Structural_Features = transpose(X_Structural_Features) - - # # Target-wise encoding with Label encoder and save encoders for later decoding - # xs = [] - # for i in range(len(X_Structural_Features)): - # tmp_encoder = LabelEncoder() - # tmp_encoder.fit(X_Structural_Features[i]) - # xs.append(tmp_encoder.transform(X_Structural_Features[i])) - # X_Structural_Features = list(zip(*xs)) # Kind of backtransposing - - # return X_Structural_Features - - # def encode_categorical_features(self): - # Select and extract the structural features from the dataset - structural_features = ( - self.dataset[ - [col for col in TARGET_TYPES if col not in self.target_columns] - ] - .to_numpy() - .T - ) # Transpose immediately after conversion to numpy - - # Encode features using LabelEncoder and store encoders for potential inverse transform - encoded_features = [] - self.encoders = [] # To store encoders for each feature - for features in structural_features: - encoder = LabelEncoder() - encoder.fit(features) - encoded_features.append(encoder.transform(features)) - self.encoders.append(encoder) - - # Convert the list of encoded features back to the original data structure - return np.array( - encoded_features - ).T # Transpose back to original orientation - - def categorical_endocode_y(self): - # Get the targets - y_labels_rotated = self.dataset[self.target_columns].to_numpy() - - # rotate the list of list (array-like) - y_labels = transpose(y_labels_rotated) - - # Do targetwise encoding using the label encoder and save the label encoders for later decoding - ys = [] - self.target_label_encoders = [] - readable_labels = [] - for i in range(len(y_labels)): - tmp_encoder = LabelEncoder() - tmp_encoder.fit(y_labels[i]) - ys.append(tmp_encoder.transform(y_labels[i])) - self.target_label_encoders.append(tmp_encoder) - readable_labels.append(tmp_encoder.classes_) - # Combine y - y = np.array(list(zip(*ys))) - # Return y fuzed into a single array and y_labels - return y, readable_labels - - def label_binarize_endocode_y(self): - - # Get the Targets and transpose - y_labels_rotated = self.dataset[self.target_columns].to_numpy() - y_labels = transpose(y_labels_rotated) - - ys = [] - readable_labels = [] - self.encoders = [] - self.target_column_numbers = [] - - # Binarize targetwise and save encoders and labels - for i in range(len(y_labels)): - # Encode - label_binerizer = LabelBinarizer() - ys.append(label_binerizer.fit_transform(y_labels[i])) - - # Save stuff for later decoding - readable_labels.append(label_binerizer.classes_) - self.encoders.append( - label_binerizer - ) # save encoder for later decoding - self.target_column_numbers.append( - len(ys[i][0]) - ) # save column numbers for later decoding - - # Return y fuzed into a single array and labels - y = np.concatenate(list(ys), axis=1) - return y, readable_labels - - def split_and_preprocess_categorical(self): - """ - Split data into training and test sets, then apply normalization. - Ensures that the test data does not leak into training data preprocessing. - X and y are categorical, so each column has a integer that defines which one of the ligands is in the column. - """ - - # Get NMR features - X_NMR = self.dataset[self.feature_columns].to_numpy() - - # Encode X in a categorical fashion with the label encoder columnwise - X_Structural_Features = self.categorical_endocode_X() - - # Encode y in a categorical fashion with the label encoder columnwise - y, readable_labels = self.categorical_endocode_y() - - # Train Test splitting - ( - X_train_NMR, - X_test_NMR, - X_train_structural, - X_test_structural, - y_train, - y_test, - ) = train_test_split( - X_NMR, - X_Structural_Features, - y, - test_size=self.test_size, - random_state=self.random_state, - ) - - scaler = StandardScaler() - X_train_NMR_scaled = scaler.fit_transform(X_train_NMR) - X_test_NMR_scaled = scaler.transform(X_test_NMR) - - if self.include_structural_features: - # Combine scaled NMR features with structural features to get final X - X_train_scaled = X_train_scaled = np.concatenate( - [X_train_NMR_scaled, X_train_structural], axis=1 - ) - X_test_scaled = np.concatenate( - [X_test_NMR_scaled, X_test_structural], axis=1 - ) - else: - # Just have the NMR features as X - X_train_scaled = X_train_NMR_scaled - X_test_scaled = X_test_NMR_scaled - - # Get the target labels going - y_label = target_label_readabilitizer_categorical(readable_labels) - - y_train = np.squeeze(y_train) - y_test = np.squeeze(y_test) - - return X_train_scaled, X_test_scaled, y_train, y_test, y_label - - def split_and_preprocess(self): - # Extract and encode categorical features - X_NMR = self.dataset[self.feature_columns].to_numpy() - X_Structural = self.encode_categorical_features() - - # Encode target variables and store readable labels - ( - y_encoded, - readable_labels, - ) = self.encode_targets() # Assuming this method exists and is similar - - # Split data into training and testing sets - ( - X_train_NMR, - X_test_NMR, - X_train_Structural, - X_test_Structural, - y_train, - y_test, - ) = train_test_split( - X_NMR, - X_Structural, - y_encoded, - test_size=self.test_size, - random_state=self.random_state, - ) - - # Scale NMR features - scaler = StandardScaler() - X_train_NMR_scaled = scaler.fit_transform(X_train_NMR) - X_test_NMR_scaled = scaler.transform(X_test_NMR) - - # Combine features if structural features are included - if self.include_structural_features: - X_train = np.concatenate( - [X_train_NMR_scaled, X_train_Structural], axis=1 - ) - X_test = np.concatenate( - [X_test_NMR_scaled, X_test_Structural], axis=1 - ) - else: - X_train = X_train_NMR_scaled - X_test = X_test_NMR_scaled - - # Format the target labels for readability - y_labels = self.format_target_labels( - readable_labels - ) # Assuming this formatting function exists - - return ( - X_train, - X_test, - np.squeeze(y_train), - np.squeeze(y_test), - y_labels, - ) diff --git a/nmrcraft/models/classifier.py b/nmrcraft/models/classifier.py index 65c73cf..4bc1892 100644 --- a/nmrcraft/models/classifier.py +++ b/nmrcraft/models/classifier.py @@ -52,7 +52,6 @@ def __init__( feature_columns=feature_columns, target_columns=target, dataset_size=dataset_size, - target_type="categorical", ) ( self.X_train, @@ -61,9 +60,6 @@ def __init__( self.y_test, self.y_labels, ) = data_loader.load_data() - self.classes = data_loader.confusion_matrix_label_adapter( - self.y_labels - ) def hyperparameter_tune(self): log.info( diff --git a/scripts/training/final_results.py b/scripts/training/final_results.py index 7e37c50..16b0933 100644 --- a/scripts/training/final_results.py +++ b/scripts/training/final_results.py @@ -25,7 +25,7 @@ parser.add_argument( "--target", type=str, - default="X3", + default="E", help="The Target for the predictions. Choose from: 'metal', 'X1', 'X2', 'X3', 'X4', 'L', 'E' ", ) parser.add_argument( @@ -82,7 +82,7 @@ max_evals=args.max_evals, target=args.target, dataset_size=dataset_size, - random_state=11, + random_state=42, ) # mlflow.log_metrics("dataset_size", dataset_size, step=i) C.hyperparameter_tune() @@ -103,7 +103,7 @@ rates=rates_df, metrics=metrics, folder_path=args.plot_folder, - classes=C.classes, + classes=C.y_labels, dataset_size=str(dataset_size), ) path_CM = visualizer.plot_confusion_matrix() From 0eb6bad391711d0f0d1b43ab616e298b33e97f22 Mon Sep 17 00:00:00 2001 From: Magdalena Lederbauer Date: Thu, 23 May 2024 23:32:51 +0000 Subject: [PATCH 05/26] chore: refactor name to dataloader --- nmrcraft/evaluation/evaluation.py | 6 +++--- scripts/analysis/dataset_statistics.py | 2 +- scripts/analysis/pca_ligand_space.py | 2 +- scripts/training/train_metal.py | 2 +- tests/test_dataloader.py | 2 +- 5 files changed, 7 insertions(+), 7 deletions(-) diff --git a/nmrcraft/evaluation/evaluation.py b/nmrcraft/evaluation/evaluation.py index 0b4d1a3..7c30b6b 100644 --- a/nmrcraft/evaluation/evaluation.py +++ b/nmrcraft/evaluation/evaluation.py @@ -10,7 +10,7 @@ roc_curve, ) -from nmrcraft.data import dataset +from nmrcraft.data import dataloader def model_evaluation( @@ -18,7 +18,7 @@ def model_evaluation( X_test: Any, y_test: Any, y_labels: Any, - dataloader: dataset.DataLoader, + dataloader: dataloader.DataLoader, ) -> Tuple[Dict[str, float], Any, Any, Any]: """ Evaluate the performance of the trained machine learning model for 1D targets. @@ -67,7 +67,7 @@ def model_evaluation_nD( X_test: Any, y_test: Any, y_labels: Any, - dataloader: dataset.DataLoader, + dataloader: dataloader.DataLoader, ) -> Tuple[Dict[str, float], Any, Any, Any]: """ Evaluate the performance of the trained machine learning model for 2D+ Targets. diff --git a/scripts/analysis/dataset_statistics.py b/scripts/analysis/dataset_statistics.py index 217608d..aad9c1e 100644 --- a/scripts/analysis/dataset_statistics.py +++ b/scripts/analysis/dataset_statistics.py @@ -6,7 +6,7 @@ import seaborn as sns from nmrcraft.analysis.plotting import style_setup -from nmrcraft.data.dataset import filename_to_ligands, load_dataset_from_hf +from nmrcraft.data.dataloader import filename_to_ligands, load_dataset_from_hf def plot_stacked_bars( diff --git a/scripts/analysis/pca_ligand_space.py b/scripts/analysis/pca_ligand_space.py index 49f22cf..ecf0a8b 100644 --- a/scripts/analysis/pca_ligand_space.py +++ b/scripts/analysis/pca_ligand_space.py @@ -7,7 +7,7 @@ from sklearn.preprocessing import StandardScaler from nmrcraft.analysis.plotting import style_setup -from nmrcraft.data.dataset import filename_to_ligands, load_dataset_from_hf +from nmrcraft.data.dataloader import filename_to_ligands, load_dataset_from_hf def perform_pca(df, features): diff --git a/scripts/training/train_metal.py b/scripts/training/train_metal.py index 614784a..8f46bb1 100644 --- a/scripts/training/train_metal.py +++ b/scripts/training/train_metal.py @@ -3,7 +3,7 @@ import mlflow from nmrcraft.analysis.plotting import plot_confusion_matrix, plot_roc_curve -from nmrcraft.data.dataset import DataLoader +from nmrcraft.data.dataloader import DataLoader from nmrcraft.evaluation.evaluation import ( get_cm_path, get_roc_path, diff --git a/tests/test_dataloader.py b/tests/test_dataloader.py index a7670fc..c9d37a0 100644 --- a/tests/test_dataloader.py +++ b/tests/test_dataloader.py @@ -1,7 +1,7 @@ import numpy import pytest -from nmrcraft.data.dataset import DataLoader +from nmrcraft.data.dataloader import DataLoader def test_valid_targets(): From c4866835ef65a9f6928bf94ce9d925fffaf2fe48 Mon Sep 17 00:00:00 2001 From: Magdalena Lederbauer Date: Fri, 24 May 2024 07:17:55 +0000 Subject: [PATCH 06/26] feat: add more columns to results df --- nmrcraft/models/classifier.py | 2 +- scripts/training/final_results.py | 30 ++++++++++++++++++++++++------ 2 files changed, 25 insertions(+), 7 deletions(-) diff --git a/nmrcraft/models/classifier.py b/nmrcraft/models/classifier.py index 4bc1892..d1e78e4 100644 --- a/nmrcraft/models/classifier.py +++ b/nmrcraft/models/classifier.py @@ -11,7 +11,7 @@ ) from sklearn.utils import resample -from nmrcraft.data.dataset import DataLoader +from nmrcraft.data.dataloader import DataLoader from nmrcraft.models.model_configs import model_configs from nmrcraft.models.models import load_model from nmrcraft.training.hyperparameter_tune import HyperparameterTuner diff --git a/scripts/training/final_results.py b/scripts/training/final_results.py index 16b0933..930ced5 100644 --- a/scripts/training/final_results.py +++ b/scripts/training/final_results.py @@ -19,7 +19,7 @@ parser.add_argument( "--max_evals", type=int, - default=3, + default=2, help="The max evaluations for the hyperparameter tuning with hyperopt", ) parser.add_argument( @@ -69,9 +69,6 @@ ] with mlflow.start_run(): - model_data = pd.DataFrame( - columns=["accuracy", "f1_score", "dataset_size", "model"] - ) model_metrics = [] for model in models: data = pd.DataFrame() @@ -93,8 +90,29 @@ print(cm) # data[str(dataset_size)] = new_data - data = pd.concat([data, metrics]) - data_BS = C.train_bootstraped(10) + # Convert args.target and dataset_size into DataFrames by wrapping them in lists + target_df = pd.DataFrame([args.target], columns=["Target"]) + dataset_size_df = pd.DataFrame( + [dataset_size], columns=["Dataset Size"] + ) + + model_data = pd.DataFrame( + columns=[ + "target", + "dataset_size", + "model", + "accuracy", + "accuracy_std", + "f1_score", + "f1_score_std", + ] + ) + # Concatenate the new DataFrames with data and metrics + data = pd.concat( + [target_df, dataset_size_df, data, metrics], axis=1 + ) + + data_BS = C.train_bootstraped(n_times=10) model_data = pd.concat([model_data, data_BS]) visualizer = Visualizer( From ae94e2f81abeb5b1639c524c4fc248e7dbb206ab Mon Sep 17 00:00:00 2001 From: Magdalena Lederbauer Date: Fri, 24 May 2024 07:49:33 +0000 Subject: [PATCH 07/26] move default args of dataloader to classifier --- nmrcraft/models/classifier.py | 45 ++++++++--------------------------- 1 file changed, 10 insertions(+), 35 deletions(-) diff --git a/nmrcraft/models/classifier.py b/nmrcraft/models/classifier.py index d1e78e4..3688d39 100644 --- a/nmrcraft/models/classifier.py +++ b/nmrcraft/models/classifier.py @@ -25,7 +25,11 @@ def __init__( target: str, dataset_size: float, feature_columns=None, - random_state=None, + random_state=42, + include_structural_features=True, + complex_geometry="oct", + test_size=0.2, + testing=False, ): if not feature_columns: feature_columns = [ @@ -52,6 +56,11 @@ def __init__( feature_columns=feature_columns, target_columns=target, dataset_size=dataset_size, + include_structural_features=include_structural_features, + complex_geometry=complex_geometry, + test_size=test_size, + random_state=random_state, + testing=testing, ) ( self.X_train, @@ -106,40 +115,6 @@ def train_bootstraped(self, n_times=10): } return pd.DataFrame([new_row]) - # def evaluate(self) -> pd.DataFrame(): - # """ - # Evaluate the performance of the trained machine learning model. - - # Returns: - # Tuple[Dict[str, float], Any, Any, Any]: A tuple containing: - # - A dictionary with evaluation metrics (accuracy, f1_score, roc_auc). - # - The confusion matrix. - # - The false positive rate. - # - The true positive rate. - # """ - # y_pred = self.model.predict(self.X_test) - # accuracy = accuracy_score(self.y_test, y_pred) - # f1 = f1_score(self.y_test, y_pred, average="weighted") - # fpr, tpr, _ = roc_curve( - # self.y_test, self.model.predict_proba(self.X_test)[:, 1] - # ) - # cm = multilabel_confusion_matrix(self.y_test, y_pred) - # roc_auc = auc(fpr, tpr) - - # # Create DataFrame with consistent structure - # results_df = pd.DataFrame( - # { - # "accuracy": [accuracy], - # "f1_score": [f1], - # "roc_auc": [roc_auc], - # "fpr": [fpr.tolist()], - # "cm": [cm.tolist()], - # "tpr": [tpr.tolist()], - # } - # ) - - # return results_df - def evaluate(self) -> pd.DataFrame: """ Evaluate the performance of the trained machine learning model. From b6508db1b0b5172a4f8f8bf30ee0412eacbd2997 Mon Sep 17 00:00:00 2001 From: Magdalena Lederbauer Date: Fri, 24 May 2024 07:50:06 +0000 Subject: [PATCH 08/26] test multiple targets --- scripts/training/final_results.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/training/final_results.py b/scripts/training/final_results.py index 930ced5..3bf7879 100644 --- a/scripts/training/final_results.py +++ b/scripts/training/final_results.py @@ -25,7 +25,7 @@ parser.add_argument( "--target", type=str, - default="E", + default="metal_E", help="The Target for the predictions. Choose from: 'metal', 'X1', 'X2', 'X3', 'X4', 'L', 'E' ", ) parser.add_argument( From 7246f1480b6e0dc4df59477cf363be4b3d0e6cc5 Mon Sep 17 00:00:00 2001 From: Magdalena Lederbauer Date: Sat, 25 May 2024 23:40:15 +0000 Subject: [PATCH 09/26] feat: removed data folder from gitignore --- .gitignore | 1 - nmrcraft/data/data_utils.py | 106 +++++++++++++++++ nmrcraft/data/dataloader.py | 227 ++++++++++++++++++++++++++++++++++++ 3 files changed, 333 insertions(+), 1 deletion(-) create mode 100644 nmrcraft/data/data_utils.py create mode 100644 nmrcraft/data/dataloader.py diff --git a/.gitignore b/.gitignore index dea0b60..ec1d3b4 100644 --- a/.gitignore +++ b/.gitignore @@ -1,7 +1,6 @@ mlruns/ scratch/ dataset/ -data/ plots/ docs/source diff --git a/nmrcraft/data/data_utils.py b/nmrcraft/data/data_utils.py new file mode 100644 index 0000000..0c8ba60 --- /dev/null +++ b/nmrcraft/data/data_utils.py @@ -0,0 +1,106 @@ +"""Load and preprocess data.""" + +import os + +import pandas as pd +from datasets import load_dataset + + +class DatasetLoadError(FileNotFoundError): + """Exeption raised when the Dataloader could not find data/dataset.csv, + even after trying to generate it from huggingface""" + + def __init__(self, t): + super().__init__(f"Could not load raw Dataset '{t}'") + + +class InvalidTargetError(ValueError): + """Exception raised when the specified model name is not found.""" + + def __init__(self, t): + super().__init__(f"Invalid target '{t}'") + + +def filename_to_ligands(dataset: pd.DataFrame): + """ + Extract ligands from the filename and add as columns to the dataset. + Assumes that filenames are structured in a specific way that can be parsed into ligands. + """ + filename_parts = dataset["file_name"].str.split("_", expand=True) + dataset["metal"] = filename_parts.get(0) + dataset["geometry"] = filename_parts.get(1) + dataset["E_ligand"] = filename_parts.get(2) + dataset["X1_ligand"] = filename_parts.get(3) + dataset["X2_ligand"] = filename_parts.get(4) + dataset["X3_ligand"] = filename_parts.get(5) + dataset["X4_ligand"] = filename_parts.get(6) + dataset["L_ligand"] = filename_parts.get(7).fillna( + "none" + ) # Fill missing L_ligand with 'none' + return dataset + + +def load_dummy_dataset_locally(datset_path: str = "tests/data.csv"): + dataset = pd.read_csv(datset_path) + return dataset + + +def load_dataset_from_hf( + dataset_name: str = "NMRcraft/nmrcraft", data_files: str = "all_no_nan.csv" +): + """Load the dataset. + + This function loads the dataset using the specified dataset name and data files. + It assumes that you have logged into the Hugging Face CLI prior to calling this function. + + Args: + dataset_name (str, optional): The name of the dataset. Defaults to "NMRcraft/nmrcraft". + data_files (str, optional): The name of the data file. Defaults to 'all_no_nan.csv'. + + Returns: + pandas.DataFrame: The loaded dataset as a pandas DataFrame. + """ + # Create data dir if needed + if not os.path.isdir("data"): + os.mkdir("data") + # Check if hf dataset is already downloaded, else download it and then load it + if not os.path.isfile("data/dataset.csv"): + dataset = load_dataset(dataset_name, data_files=data_files)[ + "train" + ].to_pandas() + dataset.to_csv("data/dataset.csv") + if os.path.isfile("data/dataset.csv"): + dataset = pd.read_csv("data/dataset.csv") + elif not os.path.isfile("data/dataset.csv"): + raise DatasetLoadError(FileNotFoundError) + return dataset + + +def get_target_columns(target_columns: str): + """ + Function takes target columns in underline format f.e 'metal_X1_X4_X2_L' and + transforms into a list of the column names present in the dataset. + """ + TARGET_TYPES = ["metal", "X1", "X2", "X3", "X4", "L", "E"] + + # Split the target string into individual targets + targets = [t.strip() for t in target_columns.split("_")] + + # Check if the targets are valid + for t in targets: + if t not in TARGET_TYPES: + raise InvalidTargetError(t) + + # Translate them into Dataframe Column names + target_map = { + "metal": "metal", + "X1": "X1_ligand", + "X2": "X2_ligand", + "X3": "X3_ligand", + "X4": "X4_ligand", + "L": "L_ligand", + "E": "E_ligand", + } + targets_transformed = [target_map[t] for t in targets] + + return targets_transformed diff --git a/nmrcraft/data/dataloader.py b/nmrcraft/data/dataloader.py new file mode 100644 index 0000000..7fe6e69 --- /dev/null +++ b/nmrcraft/data/dataloader.py @@ -0,0 +1,227 @@ +"""Load and preprocess data.""" + +from typing import Any, List, Tuple + +import numpy as np +import pandas as pd +from sklearn.model_selection import train_test_split +from sklearn.preprocessing import ( + LabelEncoder, + StandardScaler, +) + +from nmrcraft.data.data_utils import ( + filename_to_ligands, + get_target_columns, + load_dataset_from_hf, + load_dummy_dataset_locally, +) +from nmrcraft.utils.set_seed import set_seed + +set_seed() + +TARGET_TYPES = [ + "metal", + "X1_ligand", + "X2_ligand", + "X3_ligand", + "X4_ligand", + "L_ligand", + "E_ligand", +] + + +class DataLoader: + def __init__( + self, + feature_columns: Any, + target_columns: str, + complex_geometry: str, + test_size: float, + random_state: int, + dataset_size: float, + include_structural_features: bool, + testing: bool, + ): + self.feature_columns = feature_columns + self.target_columns = get_target_columns(target_columns=target_columns) + self.test_size = test_size + self.random_state = random_state + self.dataset_size = dataset_size + self.complex_geometry = complex_geometry + self.include_structural_features = include_structural_features + + if not testing: + self.dataset = load_dataset_from_hf() + elif testing: + self.dataset = load_dummy_dataset_locally() + + def load_data(self) -> pd.DataFrame: + """ + Loads the dataset, preprocesses it, and returns the preprocessed data. + + Returns: + Preprocessed data (pandas.DataFrame): The preprocessed dataset. + """ + self.dataset = filename_to_ligands(self.dataset) + self.dataset = self.dataset.sample(frac=self.dataset_size) + self.choose_geometry() + return self.split_and_preprocess() + + def choose_geometry(self) -> None: + """ + Filters the dataset based on the complex geometry. + + This method filters the dataset based on the complex geometry specified by the `complex_geometry` attribute. + It checks if the specified geometry is valid and then updates the dataset accordingly. If the geometry is not + valid, a `ValueError` is raised. + + Raises: + ValueError: If the specified geometry is not valid. + + """ + valid_geometries = {"oct", "spy", "tbp"} + if self.complex_geometry in valid_geometries: + self.dataset = self.dataset[ + self.dataset["geometry"] == self.complex_geometry + ] + # else: + # raise ValueError("Invalid geometry'.") FIXME + + def encode_categorical_features(self) -> np.ndarray: + """ + Encodes the categorical features in the dataset using LabelEncoder. + + Returns: + np.ndarray: The encoded features in numpy array format. + """ + # Select and extract the structural features from the dataset + structural_features = ( + self.dataset[ + [col for col in TARGET_TYPES if col not in self.target_columns] + ] + .to_numpy() + .T + ) # Transpose immediately after conversion to numpy + + # Encode features using LabelEncoder and store encoders for potential inverse transform + encoded_features = [] + self.encoders = [] # To store encoders for each feature + for features in structural_features: + encoder = LabelEncoder() + encoder.fit(features) + encoded_features.append(encoder.transform(features)) + self.encoders.append(encoder) + + # Convert the list of encoded features back to the original data structure + return np.array( + encoded_features + ).T # Transpose back to original orientation + + def encode_targets(self) -> Tuple[np.ndarray, List[List[str]]]: + """ + Encodes the target variables in the dataset using LabelEncoder. + + Returns: + Tuple[np.ndarray, List[List[str]]]: The encoded targets and the corresponding readable labels. + + Example: targets are metal & X3 ligand + > y_encoded + array([[ 0, 10], + [1,0], + ... + [1, 15]]) + > readable_labels + [['Mo', 'W'], ['imido1', 'imido2', ... ]] + + """ + # Extract targets from the dataset and transpose the array for column-wise processing + y_labels = self.dataset[self.target_columns].to_numpy().T + + # Initialize lists to store encoded targets and the corresponding encoders + encoded_targets = [] + self.target_encoders = ( + [] + ) # Store encoders to allow inverse transformations later + readable_labels = ( + [] + ) # Store class labels for each target for readability + + # Encode each target column using LabelEncoder + for target in y_labels: + encoder = LabelEncoder() + encoder.fit(target) + encoded_targets.append(encoder.transform(target)) + self.target_encoders.append(encoder) + readable_labels.append( + encoder.classes_.tolist() + ) # Store the classes (unique labels) of each encoder as a list + + # Convert the list of encoded targets back to the original data structure + y_encoded = np.array( + encoded_targets + ).T # Transpose back to match the original data structure + + return y_encoded, readable_labels + + def split_and_preprocess( + self, + ) -> Tuple[ + np.ndarray, np.ndarray, np.ndarray, np.ndarray, List[List[str]] + ]: + """ + Split the dataset into training and testing sets, preprocess the data, and return the preprocessed data. + + Returns: + Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, List[List[str]]]: A tuple containing the preprocessed training and testing data, encoded target variables, and readable labels. + """ + # Extract and encode categorical features + X_NMR = self.dataset[self.feature_columns].to_numpy() + X_Structural = self.encode_categorical_features() + + # Encode target variables and store readable labels + ( + y_encoded, + y_labels, + ) = self.encode_targets() + + # Split data into training and testing sets + ( + X_train_NMR, + X_test_NMR, + X_train_Structural, + X_test_Structural, + y_train, + y_test, + ) = train_test_split( + X_NMR, + X_Structural, + y_encoded, + test_size=self.test_size, + random_state=self.random_state, + ) + + # Scale numerical features (the NMR tensor) + scaler = StandardScaler() + X_train_NMR_scaled = scaler.fit_transform(X_train_NMR) + X_test_NMR_scaled = scaler.transform(X_test_NMR) + + # Combine features if structural features are included + if self.include_structural_features: + X_train = np.concatenate( + [X_train_NMR_scaled, X_train_Structural], axis=1 + ) + X_test = np.concatenate( + [X_test_NMR_scaled, X_test_Structural], axis=1 + ) + else: + X_train = X_train_NMR_scaled + X_test = X_test_NMR_scaled + + return ( + X_train, + X_test, + np.squeeze(y_train), + np.squeeze(y_test), + y_labels, + ) From 1c83a3b396540df0e86cfe61edea8b2f49f38e8d Mon Sep 17 00:00:00 2001 From: Magdalena Lederbauer Date: Sun, 26 May 2024 08:42:02 +0000 Subject: [PATCH 10/26] fix: change absolute path of data file --- nmrcraft/data/data_utils.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/nmrcraft/data/data_utils.py b/nmrcraft/data/data_utils.py index 0c8ba60..42ed08c 100644 --- a/nmrcraft/data/data_utils.py +++ b/nmrcraft/data/data_utils.py @@ -61,17 +61,17 @@ def load_dataset_from_hf( pandas.DataFrame: The loaded dataset as a pandas DataFrame. """ # Create data dir if needed - if not os.path.isdir("data"): - os.mkdir("data") + if not os.path.isdir("dataset"): + os.mkdir("dataset") # Check if hf dataset is already downloaded, else download it and then load it - if not os.path.isfile("data/dataset.csv"): + if not os.path.isfile("dataset/dataset.csv"): dataset = load_dataset(dataset_name, data_files=data_files)[ "train" ].to_pandas() - dataset.to_csv("data/dataset.csv") - if os.path.isfile("data/dataset.csv"): - dataset = pd.read_csv("data/dataset.csv") - elif not os.path.isfile("data/dataset.csv"): + dataset.to_csv("dataset/dataset.csv") + if os.path.isfile("dataset/dataset.csv"): + dataset = pd.read_csv("dataset/dataset.csv") + elif not os.path.isfile("dataset/dataset.csv"): raise DatasetLoadError(FileNotFoundError) return dataset From 4aec0dde809f64366198a2748b8678a62ba7fa5e Mon Sep 17 00:00:00 2001 From: Magdalena Lederbauer Date: Sun, 26 May 2024 09:45:16 +0000 Subject: [PATCH 11/26] feat: barebone baslines script --- scripts/training/baselines.py | 113 +++++++++++++++++++++++++ scripts/training/train_metal.py | 144 -------------------------------- 2 files changed, 113 insertions(+), 144 deletions(-) create mode 100644 scripts/training/baselines.py delete mode 100644 scripts/training/train_metal.py diff --git a/scripts/training/baselines.py b/scripts/training/baselines.py new file mode 100644 index 0000000..4274eb5 --- /dev/null +++ b/scripts/training/baselines.py @@ -0,0 +1,113 @@ +import argparse +import logging as log + +import numpy as np +import pandas as pd +from sklearn.metrics import ( + accuracy_score, + confusion_matrix, + f1_score, + precision_score, + recall_score, +) + +# Import your data loading and model configuration utilities +from nmrcraft.data.dataloader import DataLoader + + +def load_data(target, dataset_size): + feature_columns = [ + "M_sigma11_ppm", + "M_sigma22_ppm", + "M_sigma33_ppm", + "E_sigma11_ppm", + "E_sigma22_ppm", + "E_sigma33_ppm", + ] + target_columns = target + complex_geometry = "oct" + test_size = 0.3 + random_state = 42 + dataset_size = 0.1 + include_structural_features = False + testing = False + dataloader = DataLoader( + feature_columns=feature_columns, + target_columns=target_columns, + complex_geometry=complex_geometry, + test_size=test_size, + random_state=random_state, + dataset_size=dataset_size, + include_structural_features=include_structural_features, + testing=testing, + ) + return dataloader.load_data() + + +def evaluate_model(y_test, y_pred, y_labels): + cm = confusion_matrix(y_test, y_pred) + accuracy = accuracy_score(y_test, y_pred) + f1 = f1_score(y_test, y_pred, average="macro") + precision = precision_score(y_test, y_pred, average="macro") + recall = recall_score(y_test, y_pred, average="macro") + + metrics = { + "Accuracy": accuracy, + "F1": f1, + "Precision": precision, + "Recall": recall, + } + + return metrics, cm + + +def main(): + parser = argparse.ArgumentParser( + description="Simplified model training script." + ) + parser.add_argument( + "--target", + type=str, + default="metal_E", + help="The Target for the predictions.", + ) + parser.add_argument( + "--dataset_size", + type=float, + default=1.0, + help="Size of the dataset to load.", + ) + parser.add_argument( + "--random_baseline", + action="store_true", + help="Use a random baseline model.", + ) + args = parser.parse_args() + + # Set up logging + log.basicConfig( + level=log.INFO, format="%(asctime)s - %(levelname)s - %(message)s" + ) + + # Load data + X_train, X_test, y_train, y_test, y_labels = load_data( + target=args.target, dataset_size=args.dataset_size + ) + + if args.random_baseline: + # Implement random choice baseline + predictions = np.random.choice(np.unique(y_train), size=len(y_test)) + else: + # Implement most common choice baseline + most_common = pd.Series(y_train).mode()[0] + predictions = np.full(shape=y_test.shape, fill_value=most_common) + + # Evaluate the model + metrics, confusion_mtx = evaluate_model(y_test, predictions, y_labels) + log.info("Evaluation Metrics: %s", metrics) + + # Optionally save the results and any plots + + +if __name__ == "__main__": + main() diff --git a/scripts/training/train_metal.py b/scripts/training/train_metal.py deleted file mode 100644 index 8f46bb1..0000000 --- a/scripts/training/train_metal.py +++ /dev/null @@ -1,144 +0,0 @@ -import argparse - -import mlflow - -from nmrcraft.analysis.plotting import plot_confusion_matrix, plot_roc_curve -from nmrcraft.data.dataloader import DataLoader -from nmrcraft.evaluation.evaluation import ( - get_cm_path, - get_roc_path, - model_evaluation, - model_evaluation_nD, -) -from nmrcraft.models.model_configs import model_configs -from nmrcraft.models.models import load_model -from nmrcraft.training.hyperparameter_tune import HyperparameterTuner -from nmrcraft.utils.set_seed import set_seed - -set_seed() - - -def main(dataset_size, target, model_name): - # TODO: better experiment naming - mlflow.set_experiment("Ceci_nest_pas_un_experiment") - - with mlflow.start_run(): - config = model_configs[model_name] - - feature_columns = [ - "M_sigma11_ppm", - "M_sigma22_ppm", - "M_sigma33_ppm", - "E_sigma11_ppm", - "E_sigma22_ppm", - "E_sigma33_ppm", - ] - - data_loader = DataLoader( - feature_columns=feature_columns, - target_columns=args.target, - dataset_size=args.dataset_size, - ) - - # Load and preprocess data - X_train, X_test, y_train, y_test, y_labels = data_loader.load_data() - - tuner = HyperparameterTuner(model_name, config, max_evals=1) - best_params, _ = tuner.tune(X_train, y_train, X_test, y_test) - - model_func = lambda **params: load_model( - model_name, **{**params, **config["model_params"]} - ) - best_model = model_func(**best_params) - best_model.fit(X_train, y_train) - - mlflow.log_params(best_params) - mlflow.log_params( - { - "model_name": model_name, - "dataset_size": dataset_size, - "target": target, - } - ) - - if isinstance(y_test, list): # if target is 1D - metrics, cm, fpr, tpr = model_evaluation( - best_model, X_test, y_test, y_labels, data_loader - ) - - title = r"Confusion matrix, TODO add LaTeX symbols" - plot_confusion_matrix( - cm, - classes=data_loader.confusion_matrix_label_adapter(y_labels), - title=title, - path=get_cm_path(), - ) - # Plot ROC - title = r"ROC curve, TODO add LaTeX symbols" - plot_roc_curve( - fpr, tpr, metrics["roc_auc"], title=title, path=get_roc_path() - ) - # Logging 1D only data - mlflow.log_artifact(get_roc_path()) - - elif ( - data_loader.more_than_one_target() - ): # Multidimensional target Array and Multiple targets - metrics, cm = model_evaluation_nD( - best_model, X_test, y_test, y_labels, data_loader - ) - - title = r"Confusion matrix, TODO add LaTeX symbols" - plot_confusion_matrix( - cm, - classes=data_loader.confusion_matrix_label_adapter(y_labels), - title=title, - path=get_cm_path(), - full=False, - columns_set=data_loader.get_target_columns_separated(), - ) - - else: # Multidimensional target Array and single target - metrics, cm = model_evaluation_nD( - best_model, X_test, y_test, y_labels, data_loader - ) - title = r"Confusion matrix, TODO add LaTeX symbols" - plot_confusion_matrix( - cm, - classes=data_loader.confusion_matrix_label_adapter(y_labels), - title=title, - path=get_cm_path(), - ) - - # Logging common data - mlflow.log_metrics(metrics) - mlflow.sklearn.log_model(best_model, "model") - print(f"Accuracy: {metrics['accuracy']}") - mlflow.log_artifact(get_cm_path()) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser( - description="Train a model with MLflow tracking." - ) - parser.add_argument( - "--dataset_size", - type=float, - default=0.01, - help="Fraction of dataset to use", - ) - parser.add_argument( - "--target", - type=str, - default="X1", - help="Specify the target(s) to select (metal, X1-X4, L, E or combinations of them, e.g., metal_1X_L)", - ) - parser.add_argument( - "--model_name", - type=str, - default="random_forest", - help="Model name to load ('random_forest', 'gradient_boosting', 'logistic_regression', 'svc')", - ) - args = parser.parse_args() - - main(args.dataset_size, args.target, args.model_name) From 3bdb20a24f597ee9c636ec489eaf438c61eab974 Mon Sep 17 00:00:00 2001 From: Magdalena Lederbauer Date: Sun, 26 May 2024 09:53:40 +0000 Subject: [PATCH 12/26] fix: testing for now lol --- tests/test_dataloader.py | 127 +++++++++++++++++---------------------- 1 file changed, 56 insertions(+), 71 deletions(-) diff --git a/tests/test_dataloader.py b/tests/test_dataloader.py index c9d37a0..674b5d7 100644 --- a/tests/test_dataloader.py +++ b/tests/test_dataloader.py @@ -1,58 +1,60 @@ -import numpy import pytest from nmrcraft.data.dataloader import DataLoader +# def test_valid_targets(): +# """ +# This tests checks whether some correctly passed --targets go through as expected. +# """ +# feature_columns = [ +# "M_sigma11_ppm", +# "M_sigma22_ppm", +# "M_sigma33_ppm", +# "E_sigma11_ppm", +# "E_sigma22_ppm", +# "E_sigma33_ppm", +# ] -def test_valid_targets(): - """ - This tests checks whether some correctly passed --targets go through as expected. - """ - feature_columns = [ - "M_sigma11_ppm", - "M_sigma22_ppm", - "M_sigma33_ppm", - "E_sigma11_ppm", - "E_sigma22_ppm", - "E_sigma33_ppm", - ] - - target_columns_set = [ - "metal", - "metal_X1", - "metal_X1_X2_X3", - "metal_X1_X2_X3_X4_L", - "metal_X1_X2_X3_X4_E", - ] - ys = [] - for target_columns in target_columns_set: - data_loader = DataLoader( - feature_columns=feature_columns, - target_columns=target_columns, - dataset_size=1, - testing=True, - ) - x, x_t, y, y_t, y_cols = data_loader.load_data() - ys.append(y_t) - if isinstance( - y[0], numpy.int64 - ): # if the y_t array is 1D, check if the dimensions are the same - assert isinstance(x, numpy.ndarray) - assert isinstance(y, list) - assert isinstance(y_cols, list) - elif isinstance( - y[0], numpy.ndarray - ): # if the y_t array isn't 1D int array, check if the dimensions are the same on all and if the contents are correct - assert isinstance(x, numpy.ndarray) - assert isinstance(y, numpy.ndarray) - assert isinstance(y_cols, list) - assert len(y_cols) == len(y_t[0]) and len(y[0]) == len(y_t[0]) - assert len(x[0]) == len(x_t[0]) - assert isinstance(x[0][0], numpy.float64) and isinstance( - y[0][0], numpy.int64 - ) - print(ys) - # Here we need to assert if the dimension, content etc of the y_targets are correct. +# target_columns_set = [ +# "metal", +# "metal_X1", +# "metal_X1_X2_X3", +# "metal_X1_X2_X3_X4_L", +# "metal_X1_X2_X3_X4_E", +# ] +# ys = [] +# for target_columns in target_columns_set: +# data_loader = DataLoader( +# feature_columns=feature_columns, +# target_columns=target_columns, +# dataset_size=1, +# testing=True, +# complex_geometry="oct", +# test_size=0.3, +# random_state=42, +# include_structural_features=True +# ) +# x, x_t, y, y_t, y_cols = data_loader.load_data() +# ys.append(y_t) +# if isinstance( +# y[0], numpy.int64 +# ): # if the y_t array is 1D, check if the dimensions are the same +# assert isinstance(x, numpy.ndarray) +# assert isinstance(y, list) +# assert isinstance(y_cols, list) +# elif isinstance( +# y[0], numpy.ndarray +# ): # if the y_t array isn't 1D int array, check if the dimensions are the same on all and if the contents are correct +# assert isinstance(x, numpy.ndarray) +# assert isinstance(y, numpy.ndarray) +# assert isinstance(y_cols, list) +# assert len(y_cols) == len(y_t[0]) and len(y[0]) == len(y_t[0]) +# assert len(x[0]) == len(x_t[0]) +# assert isinstance(x[0][0], numpy.float64) and isinstance( +# y[0][0], numpy.int64 +# ) +# print(ys) +# # Here we need to assert if the dimension, content etc of the y_targets are correct. def test_unsupported_targets(): # Check if unsupported targets get recognized @@ -70,26 +72,9 @@ def test_unsupported_targets(): # Check if unsupported targets get recognized target_columns="metal_X1_R-ligand", dataset_size=1, testing=True, + complex_geometry="oct", + test_size=0.3, + random_state=42, + include_structural_features=True, ) del data_loader - - -def test_unsupported_target_type(): - with pytest.raises(ValueError): - feature_columns = [ - "M_sigma11_ppm", - "M_sigma22_ppm", - "M_sigma33_ppm", - "E_sigma11_ppm", - "E_sigma22_ppm", - "E_sigma33_ppm", - ] - data_loader = DataLoader( - feature_columns=feature_columns, - target_columns="metal_X1_X2_X3_L_E", - dataset_size=1, - testing=True, - target_type="rone-hot-percoding", # wrong type of target - ) - a, b, c, d, e = data_loader.load_data() - del a, b, c, d, e From 4af718e1cfa9913490954d6c0f936b7c466bd240 Mon Sep 17 00:00:00 2001 From: Karolina Biniek Date: Sun, 26 May 2024 09:35:24 +0000 Subject: [PATCH 13/26] feat: functional multiclass models --- nmrcraft/models/model_configs.py | 22 +-- nmrcraft/models/models.py | 2 - .../{final_results.py => one_target.py} | 7 +- scripts/training/test.py | 97 ++++++++++++ scripts/training/train_metal.py | 144 ------------------ 5 files changed, 103 insertions(+), 169 deletions(-) rename scripts/training/{final_results.py => one_target.py} (98%) create mode 100644 scripts/training/test.py delete mode 100644 scripts/training/train_metal.py diff --git a/nmrcraft/models/model_configs.py b/nmrcraft/models/model_configs.py index 6d505bf..e4db8ca 100644 --- a/nmrcraft/models/model_configs.py +++ b/nmrcraft/models/model_configs.py @@ -15,7 +15,7 @@ "gradient_boosting": { "model_params": {"random_state": 42}, "hyperparameters": { - "loss": hp.choice("loss", ["log_loss", "exponential"]), + "loss": hp.choice("loss", ["log_loss"]), "learning_rate": hp.uniform("learning_rate", 0.01, 0.5), "n_estimators": hp.choice("n_estimators", range(10, 1000, 10)), # "subsample": hp.uniform("subsample", 0.01, 1.0), @@ -31,17 +31,9 @@ "logistic_regression": { "model_params": {"random_state": 42}, "hyperparameters": { - "penalty": hp.choice("penalty", ["l1", "l2", "elasticnet", None]), "C": hp.uniform("C", 0.01, 10.0), - "solver": hp.choice("solver", ["saga"]), - # lbfgs --> l2, None - # liblinear --> l1, l2 - # newton-cg --> l2, None - # newton-cholesky --> l2, None - # sag --> l2, None - # saga --> l1, l2, elasticnet, None - "max_iter": hp.choice("max_iter", range(100, 1000, 100)), - "l1_ratio": hp.uniform("l1_ratio", 0.01, 1.0), + "solver": hp.choice("solver", ["newton-cg", "sag", "saga"]), + # "max_iter": hp.choice("max_iter", range(100, 1000, 100)), }, }, "svc": { @@ -58,12 +50,4 @@ # "max_iter": hp.choice("max_iter", range(100, 1000, 100)), }, }, - "gpc": { - "model_params": {"random_state": 42}, - "hyperparameters": { - "n_restarts_optimizer": hp.choice( - "n_restarts_optimizer", range(0, 20) - ), - }, - }, } diff --git a/nmrcraft/models/models.py b/nmrcraft/models/models.py index 9fc25bb..39d672b 100644 --- a/nmrcraft/models/models.py +++ b/nmrcraft/models/models.py @@ -2,7 +2,6 @@ from typing import Any from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier -from sklearn.gaussian_process import GaussianProcessClassifier from sklearn.linear_model import LogisticRegression from sklearn.svm import SVC @@ -53,7 +52,6 @@ def load_model(model_name: str, **kwargs: Any): "gradient_boosting": GradientBoostingClassifier, "logistic_regression": LogisticRegression, "svc": SVC, - "gpc": GaussianProcessClassifier, } # TODO: put model config here diff --git a/scripts/training/final_results.py b/scripts/training/one_target.py similarity index 98% rename from scripts/training/final_results.py rename to scripts/training/one_target.py index 249d373..b279268 100644 --- a/scripts/training/final_results.py +++ b/scripts/training/one_target.py @@ -56,16 +56,15 @@ dataset_sizes = [ # 0.01, 0.1, - 0.15 + 0.15, # 0.5, # 1.0, ] models = [ - "random_forest", + # "random_forest", "logistic_regression", # "gradient_boosting", - "svc", - # "gpc" + # "svc", ] with mlflow.start_run(): diff --git a/scripts/training/test.py b/scripts/training/test.py new file mode 100644 index 0000000..1f11f50 --- /dev/null +++ b/scripts/training/test.py @@ -0,0 +1,97 @@ +import argparse + +import mlflow +from sklearn.metrics import ( + accuracy_score, + confusion_matrix, + f1_score, +) + +from nmrcraft.data.dataset import DataLoader + +# precision_score, +# recall_score, +from nmrcraft.models.model_configs import model_configs +from nmrcraft.models.models import load_model +from nmrcraft.training.hyperparameter_tune import HyperparameterTuner +from nmrcraft.utils.set_seed import set_seed + +set_seed() + + +def main(dataset_size, target, model_name): + # TODO: better experiment naming + mlflow.set_experiment("Ceci_nest_pas_un_experiment") + + with mlflow.start_run(): + config = model_configs[model_name] + + feature_columns = [ + "M_sigma11_ppm", + "M_sigma22_ppm", + "M_sigma33_ppm", + "E_sigma11_ppm", + "E_sigma22_ppm", + "E_sigma33_ppm", + ] + + data_loader = DataLoader( + feature_columns=feature_columns, + target_columns=args.target, + dataset_size=args.dataset_size, + target_type="categorical", + ) + + # Load and preprocess data + X_train, X_test, y_train, y_test, y_labels = data_loader.load_data() + + tuner = HyperparameterTuner(model_name, config, max_evals=1) + best_params, _ = tuner.tune(X_train, y_train) + + model_func = lambda **params: load_model( + model_name, **{**params, **config["model_params"]} + ) + best_model = model_func(**best_params) + best_model.fit(X_train, y_train) + + mlflow.log_params(best_params) + mlflow.log_params( + { + "model_name": model_name, + "dataset_size": dataset_size, + "target": target, + } + ) + + y_pred = best_model.predict(X_test) + cm = confusion_matrix(y_test, y_pred) + ac = accuracy_score(y_test, y_pred) + f1 = f1_score(y_test, y_pred, average="macro") + print(f"Accuracy: {ac}, F1: {f1}, Confusion Matrix:\n{cm}") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Train a model with MLflow tracking." + ) + parser.add_argument( + "--dataset_size", + type=float, + default=0.01, + help="Fraction of dataset to use", + ) + parser.add_argument( + "--target", + type=str, + default="X3", + help="Specify the target(s) to select (metal, X1-X4, L, E or combinations of them, e.g., metal_1X_L)", + ) + parser.add_argument( + "--model_name", + type=str, + default="gradient_boosting", + help="Model name to load ('random_forest', 'logistic_regression', 'svc')", + ) + args = parser.parse_args() + + main(args.dataset_size, args.target, args.model_name) diff --git a/scripts/training/train_metal.py b/scripts/training/train_metal.py deleted file mode 100644 index 614784a..0000000 --- a/scripts/training/train_metal.py +++ /dev/null @@ -1,144 +0,0 @@ -import argparse - -import mlflow - -from nmrcraft.analysis.plotting import plot_confusion_matrix, plot_roc_curve -from nmrcraft.data.dataset import DataLoader -from nmrcraft.evaluation.evaluation import ( - get_cm_path, - get_roc_path, - model_evaluation, - model_evaluation_nD, -) -from nmrcraft.models.model_configs import model_configs -from nmrcraft.models.models import load_model -from nmrcraft.training.hyperparameter_tune import HyperparameterTuner -from nmrcraft.utils.set_seed import set_seed - -set_seed() - - -def main(dataset_size, target, model_name): - # TODO: better experiment naming - mlflow.set_experiment("Ceci_nest_pas_un_experiment") - - with mlflow.start_run(): - config = model_configs[model_name] - - feature_columns = [ - "M_sigma11_ppm", - "M_sigma22_ppm", - "M_sigma33_ppm", - "E_sigma11_ppm", - "E_sigma22_ppm", - "E_sigma33_ppm", - ] - - data_loader = DataLoader( - feature_columns=feature_columns, - target_columns=args.target, - dataset_size=args.dataset_size, - ) - - # Load and preprocess data - X_train, X_test, y_train, y_test, y_labels = data_loader.load_data() - - tuner = HyperparameterTuner(model_name, config, max_evals=1) - best_params, _ = tuner.tune(X_train, y_train, X_test, y_test) - - model_func = lambda **params: load_model( - model_name, **{**params, **config["model_params"]} - ) - best_model = model_func(**best_params) - best_model.fit(X_train, y_train) - - mlflow.log_params(best_params) - mlflow.log_params( - { - "model_name": model_name, - "dataset_size": dataset_size, - "target": target, - } - ) - - if isinstance(y_test, list): # if target is 1D - metrics, cm, fpr, tpr = model_evaluation( - best_model, X_test, y_test, y_labels, data_loader - ) - - title = r"Confusion matrix, TODO add LaTeX symbols" - plot_confusion_matrix( - cm, - classes=data_loader.confusion_matrix_label_adapter(y_labels), - title=title, - path=get_cm_path(), - ) - # Plot ROC - title = r"ROC curve, TODO add LaTeX symbols" - plot_roc_curve( - fpr, tpr, metrics["roc_auc"], title=title, path=get_roc_path() - ) - # Logging 1D only data - mlflow.log_artifact(get_roc_path()) - - elif ( - data_loader.more_than_one_target() - ): # Multidimensional target Array and Multiple targets - metrics, cm = model_evaluation_nD( - best_model, X_test, y_test, y_labels, data_loader - ) - - title = r"Confusion matrix, TODO add LaTeX symbols" - plot_confusion_matrix( - cm, - classes=data_loader.confusion_matrix_label_adapter(y_labels), - title=title, - path=get_cm_path(), - full=False, - columns_set=data_loader.get_target_columns_separated(), - ) - - else: # Multidimensional target Array and single target - metrics, cm = model_evaluation_nD( - best_model, X_test, y_test, y_labels, data_loader - ) - title = r"Confusion matrix, TODO add LaTeX symbols" - plot_confusion_matrix( - cm, - classes=data_loader.confusion_matrix_label_adapter(y_labels), - title=title, - path=get_cm_path(), - ) - - # Logging common data - mlflow.log_metrics(metrics) - mlflow.sklearn.log_model(best_model, "model") - print(f"Accuracy: {metrics['accuracy']}") - mlflow.log_artifact(get_cm_path()) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser( - description="Train a model with MLflow tracking." - ) - parser.add_argument( - "--dataset_size", - type=float, - default=0.01, - help="Fraction of dataset to use", - ) - parser.add_argument( - "--target", - type=str, - default="X1", - help="Specify the target(s) to select (metal, X1-X4, L, E or combinations of them, e.g., metal_1X_L)", - ) - parser.add_argument( - "--model_name", - type=str, - default="random_forest", - help="Model name to load ('random_forest', 'gradient_boosting', 'logistic_regression', 'svc')", - ) - args = parser.parse_args() - - main(args.dataset_size, args.target, args.model_name) From b32dcb5b5c852282676e622e2fbf6d66b4d75861 Mon Sep 17 00:00:00 2001 From: Magdalena Lederbauer Date: Sun, 26 May 2024 13:08:53 +0000 Subject: [PATCH 14/26] feat: working baseline --- nmrcraft/data/data_utils.py | 30 -------- nmrcraft/data/dataloader.py | 45 ++++-------- scripts/training/baselines.py | 125 ++++++++++++++++++---------------- 3 files changed, 80 insertions(+), 120 deletions(-) diff --git a/nmrcraft/data/data_utils.py b/nmrcraft/data/data_utils.py index 42ed08c..ea28bf1 100644 --- a/nmrcraft/data/data_utils.py +++ b/nmrcraft/data/data_utils.py @@ -74,33 +74,3 @@ def load_dataset_from_hf( elif not os.path.isfile("dataset/dataset.csv"): raise DatasetLoadError(FileNotFoundError) return dataset - - -def get_target_columns(target_columns: str): - """ - Function takes target columns in underline format f.e 'metal_X1_X4_X2_L' and - transforms into a list of the column names present in the dataset. - """ - TARGET_TYPES = ["metal", "X1", "X2", "X3", "X4", "L", "E"] - - # Split the target string into individual targets - targets = [t.strip() for t in target_columns.split("_")] - - # Check if the targets are valid - for t in targets: - if t not in TARGET_TYPES: - raise InvalidTargetError(t) - - # Translate them into Dataframe Column names - target_map = { - "metal": "metal", - "X1": "X1_ligand", - "X2": "X2_ligand", - "X3": "X3_ligand", - "X4": "X4_ligand", - "L": "L_ligand", - "E": "E_ligand", - } - targets_transformed = [target_map[t] for t in targets] - - return targets_transformed diff --git a/nmrcraft/data/dataloader.py b/nmrcraft/data/dataloader.py index 7fe6e69..189c364 100644 --- a/nmrcraft/data/dataloader.py +++ b/nmrcraft/data/dataloader.py @@ -12,7 +12,6 @@ from nmrcraft.data.data_utils import ( filename_to_ligands, - get_target_columns, load_dataset_from_hf, load_dummy_dataset_locally, ) @@ -44,10 +43,10 @@ def __init__( testing: bool, ): self.feature_columns = feature_columns - self.target_columns = get_target_columns(target_columns=target_columns) self.test_size = test_size self.random_state = random_state self.dataset_size = dataset_size + self.target_columns = target_columns self.complex_geometry = complex_geometry self.include_structural_features = include_structural_features @@ -118,51 +117,35 @@ def encode_categorical_features(self) -> np.ndarray: encoded_features ).T # Transpose back to original orientation - def encode_targets(self) -> Tuple[np.ndarray, List[List[str]]]: + def encode_targets(self) -> Tuple[np.ndarray, dict]: """ Encodes the target variables in the dataset using LabelEncoder. Returns: - Tuple[np.ndarray, List[List[str]]]: The encoded targets and the corresponding readable labels. - - Example: targets are metal & X3 ligand - > y_encoded - array([[ 0, 10], - [1,0], - ... - [1, 15]]) - > readable_labels - [['Mo', 'W'], ['imido1', 'imido2', ... ]] - + Tuple[np.ndarray, dict]: The encoded targets and a dictionary mapping target names to labels. """ - # Extract targets from the dataset and transpose the array for column-wise processing - y_labels = self.dataset[self.target_columns].to_numpy().T - - # Initialize lists to store encoded targets and the corresponding encoders + # Initialize lists to store encoded targets and corresponding encoders encoded_targets = [] - self.target_encoders = ( - [] - ) # Store encoders to allow inverse transformations later - readable_labels = ( - [] - ) # Store class labels for each target for readability + self.target_encoders = [] + y_labels_dict = {} # Encode each target column using LabelEncoder - for target in y_labels: + for target_name in self.target_columns: + target = self.dataset[target_name].to_numpy() encoder = LabelEncoder() encoder.fit(target) encoded_targets.append(encoder.transform(target)) self.target_encoders.append(encoder) - readable_labels.append( + y_labels_dict[ + target_name + ] = ( encoder.classes_.tolist() - ) # Store the classes (unique labels) of each encoder as a list + ) # Dictionary of labels for each target - # Convert the list of encoded targets back to the original data structure y_encoded = np.array( encoded_targets - ).T # Transpose back to match the original data structure - - return y_encoded, readable_labels + ).T # Transpose to match original data structure + return y_encoded, y_labels_dict def split_and_preprocess( self, diff --git a/scripts/training/baselines.py b/scripts/training/baselines.py index 4274eb5..336463a 100644 --- a/scripts/training/baselines.py +++ b/scripts/training/baselines.py @@ -11,54 +11,43 @@ recall_score, ) -# Import your data loading and model configuration utilities +# Import your data loading utilities from nmrcraft.data.dataloader import DataLoader -def load_data(target, dataset_size): - feature_columns = [ - "M_sigma11_ppm", - "M_sigma22_ppm", - "M_sigma33_ppm", - "E_sigma11_ppm", - "E_sigma22_ppm", - "E_sigma33_ppm", - ] - target_columns = target - complex_geometry = "oct" - test_size = 0.3 - random_state = 42 - dataset_size = 0.1 - include_structural_features = False - testing = False - dataloader = DataLoader( - feature_columns=feature_columns, - target_columns=target_columns, - complex_geometry=complex_geometry, - test_size=test_size, - random_state=random_state, - dataset_size=dataset_size, - include_structural_features=include_structural_features, - testing=testing, - ) - return dataloader.load_data() - - def evaluate_model(y_test, y_pred, y_labels): - cm = confusion_matrix(y_test, y_pred) - accuracy = accuracy_score(y_test, y_pred) - f1 = f1_score(y_test, y_pred, average="macro") - precision = precision_score(y_test, y_pred, average="macro") - recall = recall_score(y_test, y_pred, average="macro") - - metrics = { - "Accuracy": accuracy, - "F1": f1, - "Precision": precision, - "Recall": recall, - } - - return metrics, cm + metrics = {} + cm_list = [] + target_index = 0 + for target_name, labels in y_labels.items(): + cm = confusion_matrix(y_test[:, target_index], y_pred[:, target_index]) + accuracy = accuracy_score( + y_test[:, target_index], y_pred[:, target_index] + ) + f1 = f1_score( + y_test[:, target_index], y_pred[:, target_index], average="macro" + ) + precision = precision_score( + y_test[:, target_index], + y_pred[:, target_index], + average="macro", + zero_division=0, + ) + recall = recall_score( + y_test[:, target_index], y_pred[:, target_index], average="macro" + ) + # roc_auc = roc_auc_score(y_test[:, target_index], y_pred[:, target_index]) + metrics[target_name] = { + "Accuracy": accuracy, + "F1": f1, + "Precision": precision, + "Recall": recall, + # "ROC-AUC": roc_auc + } + labels = labels + cm_list.append((target_name, cm)) + target_index += 1 + return metrics, cm_list def main(): @@ -66,9 +55,9 @@ def main(): description="Simplified model training script." ) parser.add_argument( - "--target", + "--targets", type=str, - default="metal_E", + default=["metal"], help="The Target for the predictions.", ) parser.add_argument( @@ -79,7 +68,8 @@ def main(): ) parser.add_argument( "--random_baseline", - action="store_true", + type=bool, + default=False, help="Use a random baseline model.", ) args = parser.parse_args() @@ -90,24 +80,41 @@ def main(): ) # Load data - X_train, X_test, y_train, y_test, y_labels = load_data( - target=args.target, dataset_size=args.dataset_size + dataloader = DataLoader( + target_columns=args.targets, + dataset_size=args.dataset_size, + feature_columns=[ + "M_sigma11_ppm", + "M_sigma22_ppm", + "M_sigma33_ppm", + "E_sigma11_ppm", + "E_sigma22_ppm", + "E_sigma33_ppm", + ], + complex_geometry="oct", + test_size=0.3, + random_state=42, + include_structural_features=False, + testing=False, ) + X_train, X_test, y_train, y_test, y_labels = dataloader.load_data() - if args.random_baseline: - # Implement random choice baseline - predictions = np.random.choice(np.unique(y_train), size=len(y_test)) - else: - # Implement most common choice baseline - most_common = pd.Series(y_train).mode()[0] - predictions = np.full(shape=y_test.shape, fill_value=most_common) + predictions = np.zeros_like(y_test) + + for i in range(len(args.targets)): # Loop through each target column + if args.random_baseline: + unique_vals = np.unique(y_train[:, i]) + predictions[:, i] = np.random.choice(unique_vals, size=len(y_test)) + else: + most_common = pd.Series(y_train[:, i]).mode()[0] + predictions[:, i] = np.full( + shape=y_test[:, i].shape, fill_value=most_common + ) # Evaluate the model - metrics, confusion_mtx = evaluate_model(y_test, predictions, y_labels) + metrics, confusion_matrices = evaluate_model(y_test, predictions, y_labels) log.info("Evaluation Metrics: %s", metrics) - # Optionally save the results and any plots - if __name__ == "__main__": main() From 0be39941ea5e58fde13ebb49e7417ac9975190a8 Mon Sep 17 00:00:00 2001 From: Magdalena Lederbauer Date: Sun, 26 May 2024 13:38:19 +0000 Subject: [PATCH 15/26] feat: new evaluation --- nmrcraft/data/dataloader.py | 4 +- nmrcraft/evaluation/evaluation.py | 149 ++++++++++-------------------- scripts/training/baselines.py | 46 +-------- 3 files changed, 52 insertions(+), 147 deletions(-) diff --git a/nmrcraft/data/dataloader.py b/nmrcraft/data/dataloader.py index 189c364..66644a8 100644 --- a/nmrcraft/data/dataloader.py +++ b/nmrcraft/data/dataloader.py @@ -204,7 +204,7 @@ def split_and_preprocess( return ( X_train, X_test, - np.squeeze(y_train), - np.squeeze(y_test), + y_train, + y_test, y_labels, ) diff --git a/nmrcraft/evaluation/evaluation.py b/nmrcraft/evaluation/evaluation.py index 7c30b6b..a39303d 100644 --- a/nmrcraft/evaluation/evaluation.py +++ b/nmrcraft/evaluation/evaluation.py @@ -1,116 +1,63 @@ -import os from typing import Any, Dict, Tuple -from sklearn.base import BaseEstimator +import numpy as np from sklearn.metrics import ( accuracy_score, - auc, confusion_matrix, f1_score, - roc_curve, + precision_score, + recall_score, ) -from nmrcraft.data import dataloader - -def model_evaluation( - model: BaseEstimator, - X_test: Any, - y_test: Any, - y_labels: Any, - dataloader: dataloader.DataLoader, -) -> Tuple[Dict[str, float], Any, Any, Any]: +def evaluate_model( + y_test: np.ndarray, y_pred: np.ndarray, y_labels: Dict[str, Any] +) -> Tuple[Dict[str, Dict[str, float]], Dict[str, np.ndarray]]: """ - Evaluate the performance of the trained machine learning model for 1D targets. + Evaluate the performance of a machine learning model by calculating various metrics. Args: - model (BaseEstimator): The trained machine learning model. - X_test (Any): The input features for testing. - y_test (Any): The true labels for testing. - y_labels (Any): Label for the columns of the target. - dataloader (DataLoader): Dataloader to decode the target arrays. + y_test (numpy.ndarray): The true labels of the test data. + y_pred (numpy.ndarray): The predicted labels of the test data. + y_labels (dict): A dictionary mapping target names to their corresponding labels. Returns: - Tuple[Dict[str, float], Any, Any, Any]: A tuple containing: - - A dictionary with evaluation metrics (accuracy, f1_score, roc_auc). - - The confusion matrix. - - The false positive rate. - - The true positive rate. - """ - y_pred = model.predict(X_test) - - score = accuracy_score(y_test, y_pred) - f1 = f1_score(y_test, y_pred, average="weighted") - fpr, tpr, thresholds = roc_curve(y_test, model.predict_proba(X_test)[:, 1]) - roc_auc = auc(fpr, tpr) - - y_test_cm = dataloader.confusion_matrix_data_adapter(y_test) - y_pred_cm = dataloader.confusion_matrix_data_adapter(y_pred) - y_labels_cm = dataloader.confusion_matrix_label_adapter(y_labels) - cm = confusion_matrix( - y_pred=y_pred_cm, y_true=y_test_cm, labels=y_labels_cm - ) - return ( - { - "accuracy": score, - "f1_score": f1, - "roc_auc": roc_auc, - }, - cm, - fpr, - tpr, - ) - - -def model_evaluation_nD( - model: BaseEstimator, - X_test: Any, - y_test: Any, - y_labels: Any, - dataloader: dataloader.DataLoader, -) -> Tuple[Dict[str, float], Any, Any, Any]: + tuple: A tuple containing two dictionaries. The first dictionary contains the evaluation metrics + for each target, including accuracy, F1 score, precision, and recall. The second dictionary + contains the confusion matrices for each target. """ - Evaluate the performance of the trained machine learning model for 2D+ Targets. - - Args: - model (BaseEstimator): The trained machine learning model. - X_test (Any): The input features for testing. - y_test (Any): The true labels for testing. - y_labels (Any): Label for the columns of the target. - dataloader (DataLoader): Dataloader to decode the target arrays. - - Returns: - Tuple[Dict[str, float], Any]: A tuple containing: - - A dictionary with evaluation metrics (accuracy, f1_score). - - The confusion matrix. - """ - y_pred = model.predict(X_test) - y_test_cm = dataloader.confusion_matrix_data_adapter(y_test) - y_pred_cm = dataloader.confusion_matrix_data_adapter(y_pred) - y_labels_cm = dataloader.confusion_matrix_label_adapter(y_labels) - score = accuracy_score(y_test_cm, y_pred_cm) - f1 = f1_score(y_test_cm, y_pred_cm, average="weighted") - cm = confusion_matrix( - y_pred=y_pred_cm, y_true=y_test_cm, labels=y_labels_cm - ) - return ( - { - "accuracy": score, - "f1_score": f1, - }, - cm, - ) - - -def get_cm_path(): - fig_path = "scratch/" - if not os.path.exists(fig_path): - os.makedirs(fig_path) - return os.path.join(fig_path, "cm.png") - - -def get_roc_path(): - fig_path = "scratch/" - if not os.path.exists(fig_path): - os.makedirs(fig_path) - return os.path.join(fig_path, "roc.png") + metrics: Dict[str, Dict[str, float]] = {} + cm_list: Dict[str, np.ndarray] = {} + target_index = 0 + for target_name, labels in y_labels.items(): + cm = confusion_matrix(y_test[:, target_index], y_pred[:, target_index]) + accuracy = accuracy_score( + y_test[:, target_index], y_pred[:, target_index] + ) + f1 = f1_score( + y_test[:, target_index], + y_pred[:, target_index], + average="weighted", + ) + precision = precision_score( + y_test[:, target_index], + y_pred[:, target_index], + average="macro", + zero_division=0, + ) + recall = recall_score( + y_test[:, target_index], y_pred[:, target_index], average="macro" + ) + + # roc_auc = roc_auc_score(y_test[:, target_index], y_pred[:, target_index]) + metrics[target_name] = { + "Accuracy": accuracy, + "F1": f1, + "Precision": precision, + "Recall": recall, + # "ROC-AUC": roc_auc + } + labels = labels + cm_list[target_name] = cm + target_index += 1 + return metrics, cm_list diff --git a/scripts/training/baselines.py b/scripts/training/baselines.py index 336463a..87e0d89 100644 --- a/scripts/training/baselines.py +++ b/scripts/training/baselines.py @@ -3,51 +3,9 @@ import numpy as np import pandas as pd -from sklearn.metrics import ( - accuracy_score, - confusion_matrix, - f1_score, - precision_score, - recall_score, -) -# Import your data loading utilities from nmrcraft.data.dataloader import DataLoader - - -def evaluate_model(y_test, y_pred, y_labels): - metrics = {} - cm_list = [] - target_index = 0 - for target_name, labels in y_labels.items(): - cm = confusion_matrix(y_test[:, target_index], y_pred[:, target_index]) - accuracy = accuracy_score( - y_test[:, target_index], y_pred[:, target_index] - ) - f1 = f1_score( - y_test[:, target_index], y_pred[:, target_index], average="macro" - ) - precision = precision_score( - y_test[:, target_index], - y_pred[:, target_index], - average="macro", - zero_division=0, - ) - recall = recall_score( - y_test[:, target_index], y_pred[:, target_index], average="macro" - ) - # roc_auc = roc_auc_score(y_test[:, target_index], y_pred[:, target_index]) - metrics[target_name] = { - "Accuracy": accuracy, - "F1": f1, - "Precision": precision, - "Recall": recall, - # "ROC-AUC": roc_auc - } - labels = labels - cm_list.append((target_name, cm)) - target_index += 1 - return metrics, cm_list +from nmrcraft.evaluation.evaluation import evaluate_model def main(): @@ -57,7 +15,7 @@ def main(): parser.add_argument( "--targets", type=str, - default=["metal"], + default=["metal", "E_ligand"], help="The Target for the predictions.", ) parser.add_argument( From ebe51ff00e5199e71a580f7bb84f48b5087f3a64 Mon Sep 17 00:00:00 2001 From: Magdalena Lederbauer Date: Sun, 26 May 2024 13:42:16 +0000 Subject: [PATCH 16/26] fix: targets instead of y_labels --- nmrcraft/evaluation/evaluation.py | 7 +++---- scripts/training/baselines.py | 4 +++- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/nmrcraft/evaluation/evaluation.py b/nmrcraft/evaluation/evaluation.py index a39303d..c716dc1 100644 --- a/nmrcraft/evaluation/evaluation.py +++ b/nmrcraft/evaluation/evaluation.py @@ -1,4 +1,4 @@ -from typing import Any, Dict, Tuple +from typing import Dict, List, Tuple import numpy as np from sklearn.metrics import ( @@ -11,7 +11,7 @@ def evaluate_model( - y_test: np.ndarray, y_pred: np.ndarray, y_labels: Dict[str, Any] + y_test: np.ndarray, y_pred: np.ndarray, targets: List[str] ) -> Tuple[Dict[str, Dict[str, float]], Dict[str, np.ndarray]]: """ Evaluate the performance of a machine learning model by calculating various metrics. @@ -29,7 +29,7 @@ def evaluate_model( metrics: Dict[str, Dict[str, float]] = {} cm_list: Dict[str, np.ndarray] = {} target_index = 0 - for target_name, labels in y_labels.items(): + for target_name in targets: cm = confusion_matrix(y_test[:, target_index], y_pred[:, target_index]) accuracy = accuracy_score( y_test[:, target_index], y_pred[:, target_index] @@ -57,7 +57,6 @@ def evaluate_model( "Recall": recall, # "ROC-AUC": roc_auc } - labels = labels cm_list[target_name] = cm target_index += 1 return metrics, cm_list diff --git a/scripts/training/baselines.py b/scripts/training/baselines.py index 87e0d89..a1dab66 100644 --- a/scripts/training/baselines.py +++ b/scripts/training/baselines.py @@ -70,7 +70,9 @@ def main(): ) # Evaluate the model - metrics, confusion_matrices = evaluate_model(y_test, predictions, y_labels) + metrics, confusion_matrices = evaluate_model( + y_test, predictions, args.targets + ) log.info("Evaluation Metrics: %s", metrics) From 1d0a7a4deebb59e12b61722a74075cc42a5e4f7c Mon Sep 17 00:00:00 2001 From: Karolina Biniek Date: Sun, 26 May 2024 13:52:55 +0000 Subject: [PATCH 17/26] feat: add default parameters to DataLoader --- nmrcraft/data/dataloader.py | 23 +++++++++++------ scripts/training/one_target.py | 4 +-- scripts/training/test.py | 35 ++++++-------------------- tests/test_dataloader.py | 46 ++++++++++++++++------------------ 4 files changed, 47 insertions(+), 61 deletions(-) diff --git a/nmrcraft/data/dataloader.py b/nmrcraft/data/dataloader.py index 189c364..b5ec099 100644 --- a/nmrcraft/data/dataloader.py +++ b/nmrcraft/data/dataloader.py @@ -1,6 +1,6 @@ """Load and preprocess data.""" -from typing import Any, List, Tuple +from typing import List, Tuple import numpy as np import pandas as pd @@ -33,15 +33,24 @@ class DataLoader: def __init__( self, - feature_columns: Any, target_columns: str, - complex_geometry: str, - test_size: float, - random_state: int, dataset_size: float, - include_structural_features: bool, - testing: bool, + include_structural_features: bool = False, + complex_geometry: str = "oct", + test_size: float = 0.2, + random_state: int = 42, + testing: bool = False, + feature_columns=None, ): + if feature_columns is None: + feature_columns = [ + "M_sigma11_ppm", + "M_sigma22_ppm", + "M_sigma33_ppm", + "E_sigma11_ppm", + "E_sigma22_ppm", + "E_sigma33_ppm", + ] self.feature_columns = feature_columns self.test_size = test_size self.random_state = random_state diff --git a/scripts/training/one_target.py b/scripts/training/one_target.py index 1ee6472..b3e0dee 100644 --- a/scripts/training/one_target.py +++ b/scripts/training/one_target.py @@ -25,8 +25,8 @@ parser.add_argument( "--target", type=str, - default="X3", - help="The Target for the predictions. Choose from: 'metal', 'X1', 'X2', 'X3', 'X4', 'L', 'E' ", + default=["X3_ligand"], + help="The Target for the predictions. Choose from: 'metal', 'X1_ligand', 'X2_ligand', 'X3_ligand', 'X4_ligand', 'L_ligand', 'E_ligand'", ) parser.add_argument( "--plot_folder", diff --git a/scripts/training/test.py b/scripts/training/test.py index b5b3604..d70e9c4 100644 --- a/scripts/training/test.py +++ b/scripts/training/test.py @@ -1,16 +1,8 @@ import argparse import mlflow -from sklearn.metrics import ( - accuracy_score, - confusion_matrix, - f1_score, -) from nmrcraft.data.dataloader import DataLoader - -# precision_score, -# recall_score, from nmrcraft.models.model_configs import model_configs from nmrcraft.models.models import load_model from nmrcraft.training.hyperparameter_tune import HyperparameterTuner @@ -26,20 +18,9 @@ def main(dataset_size, target, model_name): with mlflow.start_run(): config = model_configs[model_name] - feature_columns = [ - "M_sigma11_ppm", - "M_sigma22_ppm", - "M_sigma33_ppm", - "E_sigma11_ppm", - "E_sigma22_ppm", - "E_sigma33_ppm", - ] - data_loader = DataLoader( - feature_columns=feature_columns, target_columns=args.target, dataset_size=args.dataset_size, - target_type="categorical", ) # Load and preprocess data @@ -63,11 +44,11 @@ def main(dataset_size, target, model_name): } ) - y_pred = best_model.predict(X_test) - cm = confusion_matrix(y_test, y_pred) - ac = accuracy_score(y_test, y_pred) - f1 = f1_score(y_test, y_pred, average="macro") - print(f"Accuracy: {ac}, F1: {f1}, Confusion Matrix:\n{cm}") + # y_pred = best_model.predict(X_test) + # cm = confusion_matrix(y_test, y_pred) + # ac = accuracy_score(y_test, y_pred) + # f1 = f1_score(y_test, y_pred, average="macro") + # print(f"Accuracy: {ac}, F1: {f1}, Confusion Matrix:\n{cm}") if __name__ == "__main__": @@ -83,14 +64,14 @@ def main(dataset_size, target, model_name): parser.add_argument( "--target", type=str, - default="X3", + default=["metal"], help="Specify the target(s) to select (metal, X1-X4, L, E or combinations of them, e.g., metal_1X_L)", ) parser.add_argument( "--model_name", type=str, - default="gradient_boosting", - help="Model name to load ('random_forest', 'logistic_regression', 'svc')", + default="random_forest", + help="Model name to load ('random_forest', 'gradient_boosting', 'logistic_regression', 'svc')", ) args = parser.parse_args() diff --git a/tests/test_dataloader.py b/tests/test_dataloader.py index 674b5d7..e278832 100644 --- a/tests/test_dataloader.py +++ b/tests/test_dataloader.py @@ -1,7 +1,3 @@ -import pytest - -from nmrcraft.data.dataloader import DataLoader - # def test_valid_targets(): # """ # This tests checks whether some correctly passed --targets go through as expected. @@ -57,24 +53,24 @@ # # Here we need to assert if the dimension, content etc of the y_targets are correct. -def test_unsupported_targets(): # Check if unsupported targets get recognized - with pytest.raises(ValueError): - feature_columns = [ - "M_sigma11_ppm", - "M_sigma22_ppm", - "M_sigma33_ppm", - "E_sigma11_ppm", - "E_sigma22_ppm", - "E_sigma33_ppm", - ] - data_loader = DataLoader( - feature_columns=feature_columns, - target_columns="metal_X1_R-ligand", - dataset_size=1, - testing=True, - complex_geometry="oct", - test_size=0.3, - random_state=42, - include_structural_features=True, - ) - del data_loader +# def test_unsupported_targets(): # Check if unsupported targets get recognized +# with pytest.raises(ValueError): +# feature_columns = [ +# "M_sigma11_ppm", +# "M_sigma22_ppm", +# "M_sigma33_ppm", +# "E_sigma11_ppm", +# "E_sigma22_ppm", +# "E_sigma33_ppm", +# ] +# data_loader = DataLoader( +# feature_columns=feature_columns, +# target_columns="metal_X1_R-ligand", +# dataset_size=1, +# testing=True, +# complex_geometry="oct", +# test_size=0.3, +# random_state=42, +# include_structural_features=True, +# ) +# del data_loader From 8598603f47f576f14c1186f4b74dbedc58541b4a Mon Sep 17 00:00:00 2001 From: Karolina Biniek Date: Sun, 26 May 2024 18:33:04 +0000 Subject: [PATCH 18/26] feat: fix confusion matrix plot and bootstrap --- nmrcraft/analysis/plotting.py | 48 ++++------ nmrcraft/evaluation/evaluation.py | 22 +++++ nmrcraft/models/classifier.py | 2 +- scripts/training/one_target.py | 154 +++++++++++++++--------------- 4 files changed, 116 insertions(+), 110 deletions(-) diff --git a/nmrcraft/analysis/plotting.py b/nmrcraft/analysis/plotting.py index 9acdcfb..455aef9 100644 --- a/nmrcraft/analysis/plotting.py +++ b/nmrcraft/analysis/plotting.py @@ -1,3 +1,5 @@ +import os + import matplotlib.pyplot as plt import numpy as np from cycler import cycler @@ -85,7 +87,7 @@ def plot_predicted_vs_ground_truth_density( def plot_confusion_matrix( - cm, classes, title, path, full=True, columns_set=False + cm_list, y_labels, model_name, dataset_size, folder_path: str = "plots/" ): """ Plots the confusion matrix. @@ -98,45 +100,27 @@ def plot_confusion_matrix( Returns: None """ - _, _, _ = style_setup() - if full: # Plot one big cm + if not os.path.exists(folder_path): + os.makedirs(folder_path) + # _, _, _ = style_setup() + for target in y_labels: + file_path = os.path.join( + folder_path, + f"ConfusionMatrix_{model_name}_{dataset_size}_{target}.png", + ) + cm = cm_list[target] + classes = y_labels[target] plt.figure(figsize=(10, 8)) plt.imshow(cm, interpolation="nearest", cmap=plt.cm.Blues) - plt.title(title) + plt.title(f"{target} Confusion Matrix") plt.colorbar() tick_marks = np.arange(len(classes)) - plt.xticks(tick_marks, classes, rotation=45) + plt.xticks(tick_marks, classes, rotation=90) plt.yticks(tick_marks, classes) plt.tight_layout() plt.ylabel("True label") plt.xlabel("Predicted label") - plt.savefig(path) - plt.close() - - elif not full: # Plot many small cms of each target - cms = [] - for columns in columns_set: # Make list of confusion matrices - cms.append( - cm[ - slice(columns[0], columns[-1] + 1), - slice(columns[0], columns[-1] + 1), - ] - ) - fig, axs = plt.subplots(nrows=len(cms), figsize=(10, 8 * len(cms))) - for i, sub_cm in enumerate(cms): - sub_classes = classes[ - slice(columns_set[i][0], columns_set[i][-1] + 1) - ] - axs[i].imshow(sub_cm, interpolation="nearest", cmap=plt.cm.Blues) - axs[i].set_title(f"Confusion Matrix {i+1}") - tick_marks = np.arange(len(sub_classes)) - axs[i].set_xticks(tick_marks) - axs[i].set_xticklabels(sub_classes, rotation=45) - axs[i].set_yticks(tick_marks) - axs[i].set_yticklabels(sub_classes) - plt.tight_layout() - print(cm) - plt.savefig(path) + plt.savefig(file_path) plt.close() diff --git a/nmrcraft/evaluation/evaluation.py b/nmrcraft/evaluation/evaluation.py index c716dc1..b71e8f8 100644 --- a/nmrcraft/evaluation/evaluation.py +++ b/nmrcraft/evaluation/evaluation.py @@ -8,6 +8,7 @@ precision_score, recall_score, ) +from sklearn.utils import resample def evaluate_model( @@ -60,3 +61,24 @@ def evaluate_model( cm_list[target_name] = cm target_index += 1 return metrics, cm_list + + +def evaluate_bootstrap(X_test, y_test, model, targets, n_times=10): + bootstrap_metrics: Dict = {} + for _ in range(n_times): + X_test, y_test = resample( + X_test, y_test, replace=True, random_state=42 + ) + y_pred = np.atleast_2d(model.predict(X_test)).T + metrics, _ = evaluate_model(y_test, y_pred, targets) + for target in targets: + if target not in bootstrap_metrics: + bootstrap_metrics[target] = { + "Accuracy": [], + "F1": [], + } + bootstrap_metrics[target]["Accuracy"].append( + metrics[target]["Accuracy"] + ) + bootstrap_metrics[target]["F1"].append(metrics[target]["F1"]) + return bootstrap_metrics diff --git a/nmrcraft/models/classifier.py b/nmrcraft/models/classifier.py index 3688d39..24fef19 100644 --- a/nmrcraft/models/classifier.py +++ b/nmrcraft/models/classifier.py @@ -88,7 +88,7 @@ def train(self): self.model = load_model(self.model_name, **all_params) self.model.fit(self.X_train, self.y_train) - def train_bootstraped(self, n_times=10): + def train_bootstrapped(self, n_times=10): accuracy = [] f1_score = [] i = 0 diff --git a/scripts/training/one_target.py b/scripts/training/one_target.py index b3e0dee..00137cc 100644 --- a/scripts/training/one_target.py +++ b/scripts/training/one_target.py @@ -3,10 +3,15 @@ import os import mlflow +import numpy as np import pandas as pd -from nmrcraft.evaluation.visualizer import Visualizer -from nmrcraft.models.classifier import Classifier +from nmrcraft.analysis import plotting +from nmrcraft.data.dataloader import DataLoader +from nmrcraft.evaluation import evaluation +from nmrcraft.models.model_configs import model_configs +from nmrcraft.models.models import load_model +from nmrcraft.training.hyperparameter_tune import HyperparameterTuner # Setup MLflow mlflow.set_experiment("Test_final_results") @@ -19,7 +24,7 @@ parser.add_argument( "--max_evals", type=int, - default=2, + default=1, help="The max evaluations for the hyperparameter tuning with hyperopt", ) parser.add_argument( @@ -62,92 +67,87 @@ ] models = [ # "random_forest", - "logistic_regression", + # "logistic_regression", # "gradient_boosting", - # "svc", + "svc", ] with mlflow.start_run(): model_metrics = [] - for model in models: + + for model_name in models: data = pd.DataFrame() + config = model_configs[model_name] + tuner = HyperparameterTuner( + model_name, config, max_evals=args.max_evals + ) + for dataset_size in dataset_sizes: - # Create a instance of the Classifier_Class - C = Classifier( - model_name=model, - max_evals=args.max_evals, - target=args.target, + data_loader = DataLoader( + target_columns=args.target, dataset_size=dataset_size, - random_state=42, ) - # mlflow.log_metrics("dataset_size", dataset_size, step=i) - C.hyperparameter_tune() - C.train() - rates_df, metrics, cm = C.evaluate() - print(rates_df) - print(metrics) - print(cm) - - # data[str(dataset_size)] = new_data - # Convert args.target and dataset_size into DataFrames by wrapping them in lists - target_df = pd.DataFrame([args.target], columns=["Target"]) - dataset_size_df = pd.DataFrame( - [dataset_size], columns=["Dataset Size"] + ( + X_train, + X_test, + y_train, + y_test, + y_labels, + ) = data_loader.load_data() + + best_params, _ = tuner.tune(X_train, np.squeeze(y_train)) + model_func = lambda model_name=model_name, config=config, **params: load_model( + model_name, **{**params, **config["model_params"]} ) + best_model = model_func(**best_params) + best_model.fit(X_train, y_train) + y_pred = np.atleast_2d(best_model.predict(X_test)).T - model_data = pd.DataFrame( - columns=[ - "target", - "dataset_size", - "model", - "accuracy", - "accuracy_std", - "f1_score", - "f1_score_std", - ] + metrics, cm_list = evaluation.evaluate_model( + y_test, y_pred, args.target ) - # Concatenate the new DataFrames with data and metrics - data = pd.concat( - [target_df, dataset_size_df, data, metrics], axis=1 + + plotting.plot_confusion_matrix( + cm_list, + y_labels, + model_name, + dataset_size, + args.plot_folder, ) - data_BS = C.train_bootstraped(n_times=10) - model_data = pd.concat([model_data, data_BS]) - - visualizer = Visualizer( - model_name=model, - cm=cm, - rates=rates_df, - metrics=metrics, - folder_path=args.plot_folder, - classes=C.y_labels, - dataset_size=str(dataset_size), + bootstrap_metrics = evaluation.evaluate_bootstrap( + X_test, y_test, best_model, args.target ) - path_CM = visualizer.plot_confusion_matrix() - # print(data) - data.index = dataset_sizes - model_metrics.append(data) - data.index = dataset_sizes - - # path_ROC = visualizer.plot_ROC(filename=f"ROC_Plot_{model}.png") - # mlflow.log_artifact(path_ROC, f"ROC_Plot_{model}.png") - - path_AC = visualizer.plot_metric( - data=model_data, - metric="accuracy", - title="Accuracy", - filename="accuracy.png", - ) - path_F1 = visualizer.plot_metric( - data=model_data, - metric="f1_score", - title="F1 Score", - filename="f1_score.png", - ) - - for df, model in zip(model_metrics, models): - print(model) - print(df) - - # mlflow.log_artifact("F1_Plot", path_F1) - # mlflow.log_artifact("Accuracy_Plot", path_AC) + + # TODO: Adapt this code to the new structure + # visualizer = Visualizer( + # model_name=model_name, + # cm=cm, + # rates=rates_df, + # metrics=metrics, + # folder_path=args.plot_folder, + # classes=C.y_labels, + # dataset_size=str(dataset_size), + # ) + # path_CM = visualizer.plot_confusion_matrix() + + # data.index = dataset_sizes + # model_metrics.append(data) + # data.index = dataset_sizes + + # path_AC = visualizer.plot_metric( + # data=model_data, + # metric="accuracy", + # title="Accuracy", + # filename="accuracy.png", + # ) + # path_F1 = visualizer.plot_metric( + # data=model_data, + # metric="f1_score", + # title="F1 Score", + # filename="f1_score.png", + # ) + + # for df, model in zip(model_metrics, models): + # print(model) + # print(df) From 9fddfe5692c58eb76c0252fdf506ee805facfd42 Mon Sep 17 00:00:00 2001 From: Karolina Biniek Date: Mon, 27 May 2024 11:11:50 +0000 Subject: [PATCH 19/26] feat: add multioutput models --- nmrcraft/evaluation/evaluation.py | 6 +- nmrcraft/models/model_configs.py | 11 +++ nmrcraft/models/models.py | 7 +- scripts/training/multiple_tragets.py | 121 +++++++++++++++++++++++++++ scripts/training/one_target.py | 5 +- 5 files changed, 146 insertions(+), 4 deletions(-) create mode 100644 scripts/training/multiple_tragets.py diff --git a/nmrcraft/evaluation/evaluation.py b/nmrcraft/evaluation/evaluation.py index b71e8f8..97dd781 100644 --- a/nmrcraft/evaluation/evaluation.py +++ b/nmrcraft/evaluation/evaluation.py @@ -69,7 +69,11 @@ def evaluate_bootstrap(X_test, y_test, model, targets, n_times=10): X_test, y_test = resample( X_test, y_test, replace=True, random_state=42 ) - y_pred = np.atleast_2d(model.predict(X_test)).T + y_pred = ( + np.atleast_2d(model.predict(X_test)).T + if len(targets) == 1 + else model.predict(X_test) + ) metrics, _ = evaluate_model(y_test, y_pred, targets) for target in targets: if target not in bootstrap_metrics: diff --git a/nmrcraft/models/model_configs.py b/nmrcraft/models/model_configs.py index e4db8ca..40b8baf 100644 --- a/nmrcraft/models/model_configs.py +++ b/nmrcraft/models/model_configs.py @@ -50,4 +50,15 @@ # "max_iter": hp.choice("max_iter", range(100, 1000, 100)), }, }, + "extra_trees": { + "model_params": {"random_state": 42}, + "hyperparameters": { + "n_estimators": hp.choice("n_estimators", range(10, 1000, 10)), + "criterion": hp.choice("criterion", ["gini", "entropy"]), + # "max_depth": hp.choice("max_depth", range(10, 1200, 10)), + "min_samples_split": hp.uniform("min_samples_split", 0.01, 1.0), + "min_samples_leaf": hp.uniform("min_samples_leaf", 0.01, 0.5), + "max_features": hp.choice("max_features", ["sqrt", "log2", None]), + }, + }, } diff --git a/nmrcraft/models/models.py b/nmrcraft/models/models.py index 39d672b..dcc43aa 100644 --- a/nmrcraft/models/models.py +++ b/nmrcraft/models/models.py @@ -1,7 +1,11 @@ import inspect from typing import Any -from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier +from sklearn.ensemble import ( + ExtraTreesClassifier, + GradientBoostingClassifier, + RandomForestClassifier, +) from sklearn.linear_model import LogisticRegression from sklearn.svm import SVC @@ -52,6 +56,7 @@ def load_model(model_name: str, **kwargs: Any): "gradient_boosting": GradientBoostingClassifier, "logistic_regression": LogisticRegression, "svc": SVC, + "extra_trees": ExtraTreesClassifier, } # TODO: put model config here diff --git a/scripts/training/multiple_tragets.py b/scripts/training/multiple_tragets.py new file mode 100644 index 0000000..21927b2 --- /dev/null +++ b/scripts/training/multiple_tragets.py @@ -0,0 +1,121 @@ +import argparse +import logging as log +import os + +import mlflow +import pandas as pd +from sklearn.multioutput import MultiOutputClassifier + +from nmrcraft.analysis import plotting +from nmrcraft.data.dataloader import DataLoader +from nmrcraft.evaluation import evaluation +from nmrcraft.models.model_configs import model_configs +from nmrcraft.models.models import load_model +from nmrcraft.training.hyperparameter_tune import HyperparameterTuner + +# Setup MLflow +mlflow.set_experiment("Test_final_results") + +# Setup parser +parser = argparse.ArgumentParser( + description="Train a model with MLflow tracking." +) + +parser.add_argument( + "--max_evals", + type=int, + default=3, + help="The max evaluations for the hyperparameter tuning with hyperopt", +) +parser.add_argument( + "--target", + type=str, + default=["metal", "X3_ligand"], + help="The Target for the predictions. Choose from: 'metal', 'X1_ligand', 'X2_ligand', 'X3_ligand', 'X4_ligand', 'L_ligand', 'E_ligand'", +) +parser.add_argument( + "--plot_folder", + type=str, + default="plots/", + help="The Folder where the plots are saved", +) + + +if __name__ == "__main__": + # Add arguments + args = parser.parse_args() + + # Check if folder path exists, if not create it + if not os.path.exists(args.plot_folder): + os.makedirs(args.plot_folder) + + # Setup logging + log.basicConfig( + format="%(asctime)s %(message)s", + level=log.INFO, + force=True, + datefmt="%Y-%m-%d %H:%M:%S", + ) + log.getLogger().setLevel(log.INFO) + + dataset_sizes = [ + # 0.01, + 0.1, + # 0.15 + # 0.5, + # 1.0, + ] + models = [ + # "random_forest", + "extra_trees", + ] + + with mlflow.start_run(): + model_metrics = [] + + for model_name in models: + data = pd.DataFrame() + config = model_configs[model_name] + tuner = HyperparameterTuner( + model_name, config, max_evals=args.max_evals + ) + + for dataset_size in dataset_sizes: + data_loader = DataLoader( + target_columns=args.target, + dataset_size=dataset_size, + ) + ( + X_train, + X_test, + y_train, + y_test, + y_labels, + ) = data_loader.load_data() + + best_params, _ = tuner.tune(X_train, y_train[:, 1]) + model_func = lambda model_name=model_name, config=config, **params: load_model( + model_name, **{**params, **config["model_params"]} + ) + best_model = model_func(**best_params) + multioutput_model = MultiOutputClassifier( + best_model, n_jobs=-1 + ) + multioutput_model.fit(X_train, y_train) + y_pred = multioutput_model.predict(X_test) + + metrics, cm_list = evaluation.evaluate_model( + y_test, y_pred, args.target + ) + + plotting.plot_confusion_matrix( + cm_list, + y_labels, + model_name, + dataset_size, + args.plot_folder, + ) + + bootstrap_metrics = evaluation.evaluate_bootstrap( + X_test, y_test, multioutput_model, args.target + ) diff --git a/scripts/training/one_target.py b/scripts/training/one_target.py index 00137cc..a67934d 100644 --- a/scripts/training/one_target.py +++ b/scripts/training/one_target.py @@ -69,7 +69,8 @@ # "random_forest", # "logistic_regression", # "gradient_boosting", - "svc", + # "svc", + "extra_trees", ] with mlflow.start_run(): @@ -100,7 +101,7 @@ model_name, **{**params, **config["model_params"]} ) best_model = model_func(**best_params) - best_model.fit(X_train, y_train) + best_model.fit(X_train, np.squeeze(y_train)) y_pred = np.atleast_2d(best_model.predict(X_test)).T metrics, cm_list = evaluation.evaluate_model( From 2b77d2320b4eb44aff4ff083174192d2f3f27356 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tiago=20W=C3=BCrthner?= Date: Mon, 27 May 2024 14:21:59 +0000 Subject: [PATCH 20/26] Feat: Added statistics for the bootstrapped Metrics Calculate 95% confidence interval and mean for the bootstrapped metrics values --- nmrcraft/evaluation/evaluation.py | 44 +++++++++++++++++++++++++++++++ scripts/training/one_target.py | 5 ++++ 2 files changed, 49 insertions(+) diff --git a/nmrcraft/evaluation/evaluation.py b/nmrcraft/evaluation/evaluation.py index 97dd781..1cb3de3 100644 --- a/nmrcraft/evaluation/evaluation.py +++ b/nmrcraft/evaluation/evaluation.py @@ -1,6 +1,7 @@ from typing import Dict, List, Tuple import numpy as np +import scipy.stats as st from sklearn.metrics import ( accuracy_score, confusion_matrix, @@ -86,3 +87,46 @@ def evaluate_bootstrap(X_test, y_test, model, targets, n_times=10): ) bootstrap_metrics[target]["F1"].append(metrics[target]["F1"]) return bootstrap_metrics + + +def metrics_statistics(bootstrapped_metrics): + """ + Do statistics with the bootsrapped metrics + + Args: + dict: bootstrapped_metrics + + Returns: + dict: Mean and 95% ci for the bootstrapped values for each target + """ + metrics_stats = {} + for key, value in bootstrapped_metrics.items(): + metrics_stats[key] = { + "Accuracy_mean": None, + "Accuracy_ci": None, + "F1_mean": None, + "F1_ci": None, + } + + print(key) + print(value["Accuracy"]) + + # calc mean and 95% confidence interval for Accuracy + metrics_stats[key]["Accuracy_mean"] = np.mean(value["Accuracy"]) + metrics_stats[key]["Accuracy_ci"] = st.t.interval( + confidence=0.95, + df=len(value["Accuracy"]) - 1, + loc=np.mean(value["Accuracy"]), + scale=st.sem(value["Accuracy"]), + ) + + # calc mean and 95% confidence interval for F1 score + metrics_stats[key]["F1_mean"] = np.mean(value["F1"]) + metrics_stats[key]["F1_ci"] = st.t.interval( + confidence=0.95, + df=len(value["F1"]) - 1, + loc=np.mean(value["F1"]), + scale=st.sem(value["F1"]), + ) + + return metrics_stats diff --git a/scripts/training/one_target.py b/scripts/training/one_target.py index a67934d..ff952af 100644 --- a/scripts/training/one_target.py +++ b/scripts/training/one_target.py @@ -120,6 +120,11 @@ X_test, y_test, best_model, args.target ) + bootsrap_stat_metrics = evaluation.metrics_statistics( + bootstrap_metrics + ) + print(bootsrap_stat_metrics) + # TODO: Adapt this code to the new structure # visualizer = Visualizer( # model_name=model_name, From 65ef5712bf4a9fe0b21ef26a14d15d67a2fed7e7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tiago=20W=C3=BCrthner?= Date: Mon, 27 May 2024 15:18:00 +0000 Subject: [PATCH 21/26] Added first draft of df that will hold all values for later plotting I don't know if the implementation is ideal or too spaghetti. Also nmrcraft.evaluation.metrics_statistics() still needs to be finished. --- nmrcraft/evaluation/evaluation.py | 30 ++++++++++++------------ scripts/training/one_target.py | 38 ++++++++++++++++++++++++++++--- 2 files changed, 49 insertions(+), 19 deletions(-) diff --git a/nmrcraft/evaluation/evaluation.py b/nmrcraft/evaluation/evaluation.py index 1cb3de3..815f28e 100644 --- a/nmrcraft/evaluation/evaluation.py +++ b/nmrcraft/evaluation/evaluation.py @@ -89,7 +89,9 @@ def evaluate_bootstrap(X_test, y_test, model, targets, n_times=10): return bootstrap_metrics -def metrics_statistics(bootstrapped_metrics): +def metrics_statistics( + bootstrapped_metrics, +): # TODO: Handle what to do when there are more than one target -> unify scores or return splitted """ Do statistics with the bootsrapped metrics @@ -99,21 +101,17 @@ def metrics_statistics(bootstrapped_metrics): Returns: dict: Mean and 95% ci for the bootstrapped values for each target """ - metrics_stats = {} + metrics_stats = { + "Accuracy_mean": None, + "Accuracy_ci": None, + "F1_mean": None, + "F1_ci": None, + } for key, value in bootstrapped_metrics.items(): - metrics_stats[key] = { - "Accuracy_mean": None, - "Accuracy_ci": None, - "F1_mean": None, - "F1_ci": None, - } - - print(key) - print(value["Accuracy"]) - + del key # calc mean and 95% confidence interval for Accuracy - metrics_stats[key]["Accuracy_mean"] = np.mean(value["Accuracy"]) - metrics_stats[key]["Accuracy_ci"] = st.t.interval( + metrics_stats["Accuracy_mean"] = np.mean(value["Accuracy"]) + metrics_stats["Accuracy_ci"] = st.t.interval( confidence=0.95, df=len(value["Accuracy"]) - 1, loc=np.mean(value["Accuracy"]), @@ -121,8 +119,8 @@ def metrics_statistics(bootstrapped_metrics): ) # calc mean and 95% confidence interval for F1 score - metrics_stats[key]["F1_mean"] = np.mean(value["F1"]) - metrics_stats[key]["F1_ci"] = st.t.interval( + metrics_stats["F1_mean"] = np.mean(value["F1"]) + metrics_stats["F1_ci"] = st.t.interval( confidence=0.95, df=len(value["F1"]) - 1, loc=np.mean(value["F1"]), diff --git a/scripts/training/one_target.py b/scripts/training/one_target.py index ff952af..6066474 100644 --- a/scripts/training/one_target.py +++ b/scripts/training/one_target.py @@ -33,6 +33,12 @@ default=["X3_ligand"], help="The Target for the predictions. Choose from: 'metal', 'X1_ligand', 'X2_ligand', 'X3_ligand', 'X4_ligand', 'L_ligand', 'E_ligand'", ) +parser.add_argument( + "--include_structural", + type=bool, + default=False, + help="Handles if structural features will be included or only nmr tensors are used.", +) parser.add_argument( "--plot_folder", type=str, @@ -73,9 +79,21 @@ "extra_trees", ] - with mlflow.start_run(): - model_metrics = [] + # Initialize df to store all the info for later plotting + unified_metrics_columns = [ + "target", + "nmr_tensor_input_only", + "dataset_fraction", + "accuracy_mean", + "accuracy_lower_bd", + "accuracy_upper_bd", + "f1_mean", + "f1_lower_bd", + "f1_upper_bd", + ] + unified_metrics = pd.DataFrame(columns=unified_metrics_columns) + with mlflow.start_run(): for model_name in models: data = pd.DataFrame() config = model_configs[model_name] @@ -87,6 +105,7 @@ data_loader = DataLoader( target_columns=args.target, dataset_size=dataset_size, + include_structural_features=args.include_structural, ) ( X_train, @@ -123,7 +142,20 @@ bootsrap_stat_metrics = evaluation.metrics_statistics( bootstrap_metrics ) - print(bootsrap_stat_metrics) + + # Add all the newly generated metrics to the unified dataframe + new_row = [ + args.target, + not args.include_structural, + dataset_size, + bootsrap_stat_metrics["Accuracy_mean"], + bootsrap_stat_metrics["Accuracy_ci"][0], + bootsrap_stat_metrics["Accuracy_ci"][1], + bootsrap_stat_metrics["F1_mean"], + bootsrap_stat_metrics["F1_ci"][0], + bootsrap_stat_metrics["F1_ci"][1], + ] + unified_metrics.loc[len(unified_metrics)] = new_row # TODO: Adapt this code to the new structure # visualizer = Visualizer( From 6d1fbeaeef493c5c823043aa1c3785ffa7bcb249 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tiago=20W=C3=BCrthner?= Date: Mon, 27 May 2024 15:29:42 +0000 Subject: [PATCH 22/26] Implemented DataFrame with dict in it as possible approach to save data --- nmrcraft/evaluation/evaluation.py | 22 +++++++++++----------- scripts/training/one_target.py | 14 ++------------ 2 files changed, 13 insertions(+), 23 deletions(-) diff --git a/nmrcraft/evaluation/evaluation.py b/nmrcraft/evaluation/evaluation.py index 815f28e..2d7f38b 100644 --- a/nmrcraft/evaluation/evaluation.py +++ b/nmrcraft/evaluation/evaluation.py @@ -101,17 +101,17 @@ def metrics_statistics( Returns: dict: Mean and 95% ci for the bootstrapped values for each target """ - metrics_stats = { - "Accuracy_mean": None, - "Accuracy_ci": None, - "F1_mean": None, - "F1_ci": None, - } + metrics_stats = {} for key, value in bootstrapped_metrics.items(): - del key + metrics_stats[key] = { + "Accuracy_mean": None, + "Accuracy_ci": None, + "F1_mean": None, + "F1_ci": None, + } # calc mean and 95% confidence interval for Accuracy - metrics_stats["Accuracy_mean"] = np.mean(value["Accuracy"]) - metrics_stats["Accuracy_ci"] = st.t.interval( + metrics_stats[key]["Accuracy_mean"] = np.mean(value["Accuracy"]) + metrics_stats[key]["Accuracy_ci"] = st.t.interval( confidence=0.95, df=len(value["Accuracy"]) - 1, loc=np.mean(value["Accuracy"]), @@ -119,8 +119,8 @@ def metrics_statistics( ) # calc mean and 95% confidence interval for F1 score - metrics_stats["F1_mean"] = np.mean(value["F1"]) - metrics_stats["F1_ci"] = st.t.interval( + metrics_stats[key]["F1_mean"] = np.mean(value["F1"]) + metrics_stats[key]["F1_ci"] = st.t.interval( confidence=0.95, df=len(value["F1"]) - 1, loc=np.mean(value["F1"]), diff --git a/scripts/training/one_target.py b/scripts/training/one_target.py index 6066474..96f488a 100644 --- a/scripts/training/one_target.py +++ b/scripts/training/one_target.py @@ -84,12 +84,7 @@ "target", "nmr_tensor_input_only", "dataset_fraction", - "accuracy_mean", - "accuracy_lower_bd", - "accuracy_upper_bd", - "f1_mean", - "f1_lower_bd", - "f1_upper_bd", + "metrics_statistics", ] unified_metrics = pd.DataFrame(columns=unified_metrics_columns) @@ -148,12 +143,7 @@ args.target, not args.include_structural, dataset_size, - bootsrap_stat_metrics["Accuracy_mean"], - bootsrap_stat_metrics["Accuracy_ci"][0], - bootsrap_stat_metrics["Accuracy_ci"][1], - bootsrap_stat_metrics["F1_mean"], - bootsrap_stat_metrics["F1_ci"][0], - bootsrap_stat_metrics["F1_ci"][1], + bootsrap_stat_metrics, ] unified_metrics.loc[len(unified_metrics)] = new_row From 8ac74327e6e478c012ce0c7aa897ed08a1e605f5 Mon Sep 17 00:00:00 2001 From: Magdalena Lederbauer <98785759+mlederbauer@users.noreply.github.com> Date: Mon, 27 May 2024 23:41:23 +0200 Subject: [PATCH 23/26] Chore/47 plotting functions (#71) * feat: barplot function, all colors returned * feat: dummy final results script * feat: change filename * feat: more descriptive name * forget tests for now --------- Co-authored-by: Magdalena Lederbauer --- nmrcraft/analysis/plotting.py | 87 +++++++++++++++++++++++++++++++++-- tests/file.txt | 1 - tests/test_dataloader.py | 2 +- 3 files changed, 83 insertions(+), 7 deletions(-) delete mode 100644 tests/file.txt diff --git a/nmrcraft/analysis/plotting.py b/nmrcraft/analysis/plotting.py index 455aef9..146f30a 100644 --- a/nmrcraft/analysis/plotting.py +++ b/nmrcraft/analysis/plotting.py @@ -1,5 +1,7 @@ -import os +"""Functions to plot.""" +import matplotlib.patches as mpatches +import os import matplotlib.pyplot as plt import numpy as np from cycler import cycler @@ -17,11 +19,13 @@ def style_setup(): plt.rcParams["text.latex.preamble"] = r"\usepackage{sansmathfonts}" plt.rcParams["axes.prop_cycle"] = cycler(color=colors) - # Use the first color from the custom color cycle - first_color = plt.rcParams["axes.prop_cycle"].by_key()["color"][0] + all_colors = [ + plt.rcParams["axes.prop_cycle"].by_key()["color"][i] + for i in range(len(colors)) + ] plt.rcParams["text.usetex"] = False - return cmap, colors, first_color + return cmap, colors, all_colors def plot_predicted_vs_ground_truth( @@ -35,7 +39,8 @@ def plot_predicted_vs_ground_truth( Returns: None """ - _, _, first_color = style_setup() + _, _, colors = style_setup() + first_color = colors[0] # Creating the plot plt.figure(figsize=(10, 8)) plt.scatter(y_test, y_pred, color=first_color, edgecolor="k", alpha=0.6) @@ -151,3 +156,75 @@ def plot_roc_curve(fpr, tpr, roc_auc, title, path): plt.legend(loc="lower right") plt.savefig(path) plt.close() + + +def plot_with_without_ligands_bar(df): + categories = df["target"].unique() + _, _, colors = style_setup() + first_color = colors[0] + second_color = colors[1] + + # Extract data + + x_pos = np.arange(len(categories)) + bar_width = 0.35 + + # Initialize plot + fig, ax = plt.subplots() + + # Loop through each category and plot bars + for i, category in enumerate(categories): + subset = df[df["target"] == category] + + # Means and error bars + means = subset["accuracy_mean"].values + errors = [ + subset["accuracy_mean"].values + - subset["accuracy_lower_bd"].values, + subset["accuracy_upper_bd"].values + - subset["accuracy_mean"].values, + ] + + # Bar locations for the group + bar_positions = x_pos[i] + np.array([-bar_width / 2, bar_width / 2]) + + # Determine bar colors based on 'nmr_tensor_input_only' field + bar_colors = [ + first_color if x else second_color + for x in subset["nmr_tensor_input_only"] + ] + + # Plotting the bars + ax.bar( + bar_positions, + means, + yerr=np.array(errors), + color=bar_colors, + align="center", + ecolor="black", + capsize=5, + width=bar_width, + ) + + # Labeling and aesthetics + ax.set_ylabel("Accuracy / %") + ax.set_xlabel("Target(s)") + ax.set_xticks(x_pos) + ax.set_xticklabels(categories) + ax.set_title("Accuracy Measurements with Error Bars") + + handles = [ + mpatches.Patch(color=first_color, label="With Ligand Info"), + mpatches.Patch(color=second_color, label="Without Ligand Info"), + ] + ax.legend(handles=handles, loc="best", fontsize=20) + plt.tight_layout() + plt.savefig("plots/exp3_incorporate_ligand_info.png") + print("Saved to plots/exp3_incorporate_ligand_info.png") + + +if __name__ == "main": + import pandas as pd + + df = pd.read_csv("dataset/path_to_results.csv") + plot_with_without_ligands_bar(df) diff --git a/tests/file.txt b/tests/file.txt deleted file mode 100644 index c03f178..0000000 --- a/tests/file.txt +++ /dev/null @@ -1 +0,0 @@ -I created this file because git didn't want to commit the empty directory "tests" diff --git a/tests/test_dataloader.py b/tests/test_dataloader.py index e278832..df3ea0d 100644 --- a/tests/test_dataloader.py +++ b/tests/test_dataloader.py @@ -65,7 +65,7 @@ # ] # data_loader = DataLoader( # feature_columns=feature_columns, -# target_columns="metal_X1_R-ligand", +# target_columns=["metal", "X1_ligand", "R"], # dataset_size=1, # testing=True, # complex_geometry="oct", From 02d43ac418795058e8c48628336093010fa94cef Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tiago=20W=C3=BCrthner?= Date: Tue, 28 May 2024 07:37:36 +0000 Subject: [PATCH 24/26] Added decent way to unify the metrics with lists This is a bit ugly but works well and the final dataframe is very neat and not complicated to use. --- nmrcraft/evaluation/evaluation.py | 45 ++++++++++++++++------------ nmrcraft/utils/general.py | 26 ++++++++++++++++ scripts/training/multiple_tragets.py | 42 ++++++++++++++++++++++++-- scripts/training/one_target.py | 33 +++++++++++++------- 4 files changed, 114 insertions(+), 32 deletions(-) create mode 100644 nmrcraft/utils/general.py diff --git a/nmrcraft/evaluation/evaluation.py b/nmrcraft/evaluation/evaluation.py index 2d7f38b..a2b1a9d 100644 --- a/nmrcraft/evaluation/evaluation.py +++ b/nmrcraft/evaluation/evaluation.py @@ -101,30 +101,37 @@ def metrics_statistics( Returns: dict: Mean and 95% ci for the bootstrapped values for each target """ - metrics_stats = {} + # metrics_stats = pd.DataFrame(columns=["Targets", "Accuracy_mean", "Accuracy_ci", "F1_mean", "F1_ci",]) + Targets = [] + Accuracy_mean = [] + Accuracy_ci = [] + F1_mean = [] + F1_ci = [] + for key, value in bootstrapped_metrics.items(): - metrics_stats[key] = { - "Accuracy_mean": None, - "Accuracy_ci": None, - "F1_mean": None, - "F1_ci": None, - } # calc mean and 95% confidence interval for Accuracy - metrics_stats[key]["Accuracy_mean"] = np.mean(value["Accuracy"]) - metrics_stats[key]["Accuracy_ci"] = st.t.interval( - confidence=0.95, - df=len(value["Accuracy"]) - 1, - loc=np.mean(value["Accuracy"]), - scale=st.sem(value["Accuracy"]), + Targets.append(key) + + Accuracy_mean.append(np.mean(value["Accuracy"])) + Accuracy_ci.append( + st.t.interval( + confidence=0.95, + df=len(value["Accuracy"]) - 1, + loc=np.mean(value["Accuracy"]), + scale=st.sem(value["Accuracy"]), + ) ) # calc mean and 95% confidence interval for F1 score - metrics_stats[key]["F1_mean"] = np.mean(value["F1"]) - metrics_stats[key]["F1_ci"] = st.t.interval( - confidence=0.95, - df=len(value["F1"]) - 1, - loc=np.mean(value["F1"]), - scale=st.sem(value["F1"]), + F1_mean.append(np.mean(value["F1"])) + F1_ci.append( + st.t.interval( + confidence=0.95, + df=len(value["F1"]) - 1, + loc=np.mean(value["F1"]), + scale=st.sem(value["F1"]), + ) ) + metrics_stats = [Targets, Accuracy_mean, Accuracy_ci, F1_mean, F1_ci] return metrics_stats diff --git a/nmrcraft/utils/general.py b/nmrcraft/utils/general.py new file mode 100644 index 0000000..b437594 --- /dev/null +++ b/nmrcraft/utils/general.py @@ -0,0 +1,26 @@ +import pandas as pd + + +def add_rows_metrics( + unified_metrics: pd.DataFrame, + statistical_metrics: list, + dataset_size, + include_structural: bool, + model_name: str, +): + # Add all the newly generated metrics to the unified dataframe targetwise + for i in range(len(statistical_metrics[0])): + new_row = [ + statistical_metrics[0][i], + model_name, + not include_structural, + dataset_size, + statistical_metrics[1][i], + statistical_metrics[2][i][0], + statistical_metrics[2][i][1], + statistical_metrics[3][i], + statistical_metrics[4][i][0], + statistical_metrics[4][i][1], + ] + unified_metrics.loc[len(unified_metrics)] = new_row + return unified_metrics diff --git a/scripts/training/multiple_tragets.py b/scripts/training/multiple_tragets.py index 21927b2..3de455d 100644 --- a/scripts/training/multiple_tragets.py +++ b/scripts/training/multiple_tragets.py @@ -12,6 +12,7 @@ from nmrcraft.models.model_configs import model_configs from nmrcraft.models.models import load_model from nmrcraft.training.hyperparameter_tune import HyperparameterTuner +from nmrcraft.utils.general import add_rows_metrics # Setup MLflow mlflow.set_experiment("Test_final_results") @@ -33,6 +34,12 @@ default=["metal", "X3_ligand"], help="The Target for the predictions. Choose from: 'metal', 'X1_ligand', 'X2_ligand', 'X3_ligand', 'X4_ligand', 'L_ligand', 'E_ligand'", ) +parser.add_argument( + "--include_structural", + type=bool, + default=False, + help="Handles if structural features will be included or only nmr tensors are used.", +) parser.add_argument( "--plot_folder", type=str, @@ -59,17 +66,32 @@ log.getLogger().setLevel(log.INFO) dataset_sizes = [ - # 0.01, + 0.01, 0.1, # 0.15 # 0.5, # 1.0, ] models = [ - # "random_forest", + "random_forest", "extra_trees", ] + # Initialize df to store all the info for later plotting + unified_metrics_columns = [ + "target", + "model", + "nmr_only", + "dataset_fraction", + "accuracy_mean", + "accuracy_lb", + "accuracy_hb", + "f1_mean", + "f1_lb", + "f1_hb", + ] + unified_metrics = pd.DataFrame(columns=unified_metrics_columns) + with mlflow.start_run(): model_metrics = [] @@ -119,3 +141,19 @@ bootstrap_metrics = evaluation.evaluate_bootstrap( X_test, y_test, multioutput_model, args.target ) + + statistical_metrics = evaluation.metrics_statistics( + bootstrap_metrics + ) + + unified_metrics = add_rows_metrics( + unified_metrics, + statistical_metrics, + dataset_size, + args.include_structural, + model_name, + ) + # Add all the newly generated metrics to the unified dataframe + + print(unified_metrics) + unified_metrics.to_csv(f"metrics_{args.target}.csv") diff --git a/scripts/training/one_target.py b/scripts/training/one_target.py index 96f488a..e7de4e1 100644 --- a/scripts/training/one_target.py +++ b/scripts/training/one_target.py @@ -12,6 +12,7 @@ from nmrcraft.models.model_configs import model_configs from nmrcraft.models.models import load_model from nmrcraft.training.hyperparameter_tune import HyperparameterTuner +from nmrcraft.utils.general import add_rows_metrics # Setup MLflow mlflow.set_experiment("Test_final_results") @@ -67,12 +68,12 @@ dataset_sizes = [ # 0.01, 0.1, - # 0.15 + 0.15 # 0.5, # 1.0, ] models = [ - # "random_forest", + "random_forest", # "logistic_regression", # "gradient_boosting", # "svc", @@ -82,9 +83,15 @@ # Initialize df to store all the info for later plotting unified_metrics_columns = [ "target", - "nmr_tensor_input_only", + "model", + "nmr_only", "dataset_fraction", - "metrics_statistics", + "accuracy_mean", + "accuracy_lb", + "accuracy_hb", + "f1_mean", + "f1_lb", + "f1_hb", ] unified_metrics = pd.DataFrame(columns=unified_metrics_columns) @@ -134,18 +141,22 @@ X_test, y_test, best_model, args.target ) - bootsrap_stat_metrics = evaluation.metrics_statistics( + statistical_metrics = evaluation.metrics_statistics( bootstrap_metrics ) # Add all the newly generated metrics to the unified dataframe - new_row = [ - args.target, - not args.include_structural, + unified_metrics = add_rows_metrics( + unified_metrics, + statistical_metrics, dataset_size, - bootsrap_stat_metrics, - ] - unified_metrics.loc[len(unified_metrics)] = new_row + args.include_structural, + model_name, + ) + + print(unified_metrics) + unified_metrics.to_csv(f"metrics_{args.target}.csv") + test_loaded_df = pd.read_csv(f"metrics_{args.target}.csv") # TODO: Adapt this code to the new structure # visualizer = Visualizer( From 403d1415573213e6f831b6d7209bc594dc8b4d2c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tiago=20W=C3=BCrthner?= Date: Tue, 28 May 2024 08:11:58 +0000 Subject: [PATCH 25/26] Fix: bug with dataframe for unified metrics and new columns --- nmrcraft/utils/general.py | 5 ++++- scripts/training/multiple_tragets.py | 17 +++++++++++------ scripts/training/one_target.py | 13 +++++++++---- 3 files changed, 24 insertions(+), 11 deletions(-) diff --git a/nmrcraft/utils/general.py b/nmrcraft/utils/general.py index b437594..420199f 100644 --- a/nmrcraft/utils/general.py +++ b/nmrcraft/utils/general.py @@ -7,14 +7,17 @@ def add_rows_metrics( dataset_size, include_structural: bool, model_name: str, + max_evals: int, ): # Add all the newly generated metrics to the unified dataframe targetwise for i in range(len(statistical_metrics[0])): new_row = [ statistical_metrics[0][i], + statistical_metrics[0], model_name, not include_structural, dataset_size, + max_evals, statistical_metrics[1][i], statistical_metrics[2][i][0], statistical_metrics[2][i][1], @@ -23,4 +26,4 @@ def add_rows_metrics( statistical_metrics[4][i][1], ] unified_metrics.loc[len(unified_metrics)] = new_row - return unified_metrics + return unified_metrics diff --git a/scripts/training/multiple_tragets.py b/scripts/training/multiple_tragets.py index 3de455d..46b3c62 100644 --- a/scripts/training/multiple_tragets.py +++ b/scripts/training/multiple_tragets.py @@ -31,7 +31,7 @@ parser.add_argument( "--target", type=str, - default=["metal", "X3_ligand"], + default=["metal", "X3_ligand", "X4_ligand"], help="The Target for the predictions. Choose from: 'metal', 'X1_ligand', 'X2_ligand', 'X3_ligand', 'X4_ligand', 'L_ligand', 'E_ligand'", ) parser.add_argument( @@ -66,9 +66,9 @@ log.getLogger().setLevel(log.INFO) dataset_sizes = [ - 0.01, + # 0.01, 0.1, - # 0.15 + 0.15 # 0.5, # 1.0, ] @@ -80,9 +80,11 @@ # Initialize df to store all the info for later plotting unified_metrics_columns = [ "target", + "model_targets", "model", "nmr_only", "dataset_fraction", + "max_evals", "accuracy_mean", "accuracy_lb", "accuracy_hb", @@ -152,8 +154,11 @@ dataset_size, args.include_structural, model_name, + args.max_evals, ) # Add all the newly generated metrics to the unified dataframe - - print(unified_metrics) - unified_metrics.to_csv(f"metrics_{args.target}.csv") + # save all the results + if not os.path.isdir("metrics"): + os.mkdir("metrics") + unified_metrics.to_csv(f"metrics/metrics_{args.target}.csv") + # mlflow.log_input(unified_metrics, context="unified metrics") diff --git a/scripts/training/one_target.py b/scripts/training/one_target.py index e7de4e1..e3a2f9b 100644 --- a/scripts/training/one_target.py +++ b/scripts/training/one_target.py @@ -31,7 +31,7 @@ parser.add_argument( "--target", type=str, - default=["X3_ligand"], + default=["metal"], help="The Target for the predictions. Choose from: 'metal', 'X1_ligand', 'X2_ligand', 'X3_ligand', 'X4_ligand', 'L_ligand', 'E_ligand'", ) parser.add_argument( @@ -83,9 +83,11 @@ # Initialize df to store all the info for later plotting unified_metrics_columns = [ "target", + "model_targets", "model", "nmr_only", "dataset_fraction", + "max_evals", "accuracy_mean", "accuracy_lb", "accuracy_hb", @@ -152,11 +154,14 @@ dataset_size, args.include_structural, model_name, + args.max_evals, ) - print(unified_metrics) - unified_metrics.to_csv(f"metrics_{args.target}.csv") - test_loaded_df = pd.read_csv(f"metrics_{args.target}.csv") + # save all the results + if not os.path.isdir("metrics"): + os.mkdir("metrics") + unified_metrics.to_csv(f"metrics/metrics_{args.target}.csv") + # mlflow.log_input(unified_metrics, context="unified metrics") # TODO: Adapt this code to the new structure # visualizer = Visualizer( From 2a52229a3b3c2fa17aa6095227be1c031f2ff1a7 Mon Sep 17 00:00:00 2001 From: Magdalena Lederbauer Date: Tue, 28 May 2024 08:38:03 +0000 Subject: [PATCH 26/26] fix: formatting --- nmrcraft/analysis/plotting.py | 3 ++- scripts/training/multiple_tragets.py | 2 +- scripts/training/one_target.py | 2 +- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/nmrcraft/analysis/plotting.py b/nmrcraft/analysis/plotting.py index 146f30a..dbd3790 100644 --- a/nmrcraft/analysis/plotting.py +++ b/nmrcraft/analysis/plotting.py @@ -1,7 +1,8 @@ """Functions to plot.""" -import matplotlib.patches as mpatches import os + +import matplotlib.patches as mpatches import matplotlib.pyplot as plt import numpy as np from cycler import cycler diff --git a/scripts/training/multiple_tragets.py b/scripts/training/multiple_tragets.py index 46b3c62..5b97878 100644 --- a/scripts/training/multiple_tragets.py +++ b/scripts/training/multiple_tragets.py @@ -68,7 +68,7 @@ dataset_sizes = [ # 0.01, 0.1, - 0.15 + 0.15, # 0.5, # 1.0, ] diff --git a/scripts/training/one_target.py b/scripts/training/one_target.py index e3a2f9b..d70d2ad 100644 --- a/scripts/training/one_target.py +++ b/scripts/training/one_target.py @@ -68,7 +68,7 @@ dataset_sizes = [ # 0.01, 0.1, - 0.15 + 0.15, # 0.5, # 1.0, ]