0.74.5

felixbur · Dec 15, 2023 · 3313d8a · 3313d8a
1 parent 9024a75
commit 3313d8a
Show file tree

Hide file tree

Showing 7 changed files with 74 additions and 54 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,6 +1,10 @@
 Changelog
 =========
 
+Version 0.74.5
+--------------
+* added praat feature extractor for one sample
+
 Version 0.74.4
 --------------
 * fixed bug combining augmentations

diff --git a/nkululeko/constants.py b/nkululeko/constants.py
@@ -1,2 +1,2 @@
-VERSION="0.74.4"
+VERSION="0.74.5"
 SAMPLING_RATE = 16000
diff --git a/nkululeko/demo_predictor.py b/nkululeko/demo_predictor.py
@@ -42,6 +42,18 @@ def run_demo(self):
 
     def predict_signal(self, signal, sr):
         features = self.feature_extractor.extract_sample(signal, sr)
+        # optionally standardize the features, switched by FEATS/scale in the config
+        scale_feats = self.util.config_val("FEATS", "scale", False)
+        if scale_feats:
+            from sklearn.preprocessing import StandardScaler
+
+            # NOTE(review): the scaler is fitted on this sample only; if
+            # `features` holds a single row, every column is centered on its
+            # own value and becomes 0 -- presumably the scaler fitted on the
+            # training data should be reused here; confirm.
+            scaler = StandardScaler()
+            features = scaler.fit_transform(features)
+        features = np.nan_to_num(features)
         result_dict = self.model.predict_sample(features)
         keys = result_dict.keys()
         if self.label_encoder is not None:

diff --git a/nkululeko/experiment.py b/nkululeko/experiment.py
@@ -100,12 +100,16 @@ def load_datasets(self):
         dbs = ",".join(list(self.datasets.keys()))
         labels = self.util.config_val("DATA", "labels", False)
         if labels:
-            labels = ast.literal_eval(labels)
-            self.util.debug(f"Target labels (from config): {labels}")
+            self.labels = ast.literal_eval(labels)
+            # log the parsed list kept on the instance, not the raw config string
+            self.util.debug(f"Target labels (from config): {self.labels}")
         else:
-            labels = list(next(iter(self.datasets.values())).df[self.target].unique())
-            self.util.debug(f"Target labels (from database): {labels}")
+            self.labels = list(
+                next(iter(self.datasets.values())).df[self.target].unique()
+            )
+            # `labels` is False in this branch, so log the labels actually found
+            self.util.debug(f"Target labels (from database): {self.labels}")
-        glob_conf.set_labels(labels)
+        glob_conf.set_labels(self.labels)
         self.util.debug(f"loaded databases {dbs}")
 
     def _import_csv(self, storage):
 
     def _import_csv(self, storage):
@@ -589,6 +591,7 @@ def run(self):
         if save:
             # save the experiment for future use
             self.save(self.util.get_save_name())
+            # self.save_onnx(self.util.get_save_name())
 
         # self.__collect_reports()
         self.util.print_best_results(self.reports)
@@ -667,6 +670,7 @@ def load(self, filename):
         tmp_dict = pickle.load(f)
         f.close()
         self.__dict__.update(tmp_dict)
+        glob_conf.set_labels(self.labels)
 
     def save(self, filename):
         try:
@@ -675,3 +679,24 @@ def save(self, filename):
             f.close()
         except (AttributeError, TypeError, RuntimeError) as error:
             self.util.warn(f"Save experiment: Can't pickle local object: {error}")
+
+    def save_onnx(self, filename):
+        """Placeholder for exporting the best model to ONNX.
+
+        No conversion happens yet: the method prints which converter would
+        apply and then pickles the experiment's ``__dict__`` to ``filename``.
+
+        Args:
+            filename: path of the pickle file to write
+        """
+        model = self.runmgr.get_best_model()
+        if model.is_ANN():
+            print("converting to onnx from torch")
+        else:
+            # not used yet; the import only checks that skl2onnx is available
+            from skl2onnx import to_onnx
+
+            print("converting to onnx from sklearn")
+        # save the rest; the context manager closes the file even if pickling fails
+        with open(filename, "wb") as f:
+            pickle.dump(self.__dict__, f)
diff --git a/nkululeko/feat_extract/feats_praat.py b/nkululeko/feat_extract/feats_praat.py
@@ -2,6 +2,7 @@
 from nkululeko.feat_extract.featureset import Featureset
 import os
 import pandas as pd
+import numpy as np
 import nkululeko.glob_conf as glob_conf
 from nkululeko.feat_extract import feinberg_praat
 import ast
@@ -23,14 +24,10 @@ def extract(self):
         store = self.util.get_path("store")
         store_format = self.util.config_val("FEATS", "store_format", "pkl")
         storage = f"{store}{self.name}.{store_format}"
-        extract = self.util.config_val(
-            "FEATS", "needs_feature_extraction", False
-        )
+        extract = self.util.config_val("FEATS", "needs_feature_extraction", False)
         no_reuse = eval(self.util.config_val("FEATS", "no_reuse", "False"))
         if extract or no_reuse or not os.path.isfile(storage):
-            self.util.debug(
-                "extracting Praat features, this might take a while..."
-            )
+            self.util.debug("extracting Praat features, this might take a while...")
             self.df = feinberg_praat.compute_features(self.data_df.index)
             self.df = self.df.set_index(self.data_df.index)
             for i, col in enumerate(self.df.columns):
@@ -53,36 +50,39 @@ def extract(self):
         self.df = self.df.astype(float)
 
     def extract_sample(self, signal, sr):
-        self.util.error(
-            "feats_praat: extracting single samples not implemented yet"
-        )
-        feats = None
-        return feats
+        """Extract Praat features for one in-memory audio signal.
+
+        The signal is first written to a wav file in the working directory,
+        because ``feinberg_praat.compute_features`` is called with a file index.
+
+        Args:
+            signal: audio samples, handed to ``audiofile.write``
+            sr: sampling rate of ``signal``
+
+        Returns:
+            The features as a float numpy array.
+        """
+        import audiofile, audformat

-    # def filter(self):
-    #     # use only the features that are indexed in the target dataframes
-    #     self.df = self.df[self.df.index.isin(self.data_df.index)]
-    #     try:
-    #         # use only some features
-    #         selected_features = ast.literal_eval(
-    #             glob_conf.config["FEATS"]["praat.features"]
-    #         )
-    #         self.util.debug(
-    #             f"selecting features from Praat: {selected_features}"
-    #         )
-    #         sel_feats_df = pd.DataFrame()
-    #         hit = False
-    #         for feat in selected_features:
-    #             try:
-    #                 sel_feats_df[feat] = self.df[feat]
-    #                 hit = True
-    #             except KeyError:
-    #                 pass
-    #         if hit:
-    #             self.df = sel_feats_df
-    #             self.util.debug(
-    #                 "new feats shape after selecting Praat features:"
-    #                 f" {self.df.shape}"
-    #             )
-    #     except KeyError:
-    #         pass
+        # NOTE(review): fixed file name, overwritten on every call and never removed
+        tmp_audio_names = ["praat_audio_tmp.wav"]
+        audiofile.write(tmp_audio_names[0], signal, sr)
+        df = pd.DataFrame(index=tmp_audio_names)
+        index = audformat.utils.to_segmented_index(df.index, allow_nat=False)
+        df = feinberg_praat.compute_features(index)
+        # set_index returns a new frame; assign it like extract() does
+        df = df.set_index(index)
+        for col in df.columns:
+            if df[col].isnull().values.any():
+                self.util.debug(
+                    f"{col} includes {df[col].isnull().sum()} nan,"
+                    " inserting mean values"
+                )
+                mean_val = df[col].mean()
+                if not np.isnan(mean_val):
+                    df[col] = df[col].fillna(mean_val)
+                else:
+                    df[col] = df[col].fillna(0)
+        df = df.astype(float)
+        feats = df.to_numpy()
+        return feats
diff --git a/nkululeko/feat_extract/feinberg_praat.py b/nkululeko/feat_extract/feinberg_praat.py
@@ -199,28 +199,31 @@ def runPCA(df):
     # pickle.dump(x, f)
     # f.close()
 
-    x = StandardScaler().fit_transform(x)
+    # x = StandardScaler().fit_transform(x)
     if np.any(np.isnan(x)):
         print(
             f"Warning: {np.count_nonzero(np.isnan(x))} Nans in x, replacing" " with 0"
         )
         x[np.isnan(x)] = 0
-    if np.any(np.isfinite(x)):
-        print(f"Warning: {np.count_nonzero(np.isfinite(x))} infinite in x")
+    # if np.any(np.isfinite(x[0])):
+    #     print(f"Warning: {np.count_nonzero(np.isfinite(x))} finite in x")
 
     # PCA
     pca = PCA(n_components=2)
-    principalComponents = pca.fit_transform(x)
-    if np.any(np.isnan(principalComponents)):
-        print("pc is nan")
-        print(f"count: {np.count_nonzero(np.isnan(principalComponents))}")
-        print(principalComponents)
-        principalComponents = np.nan_to_num(principalComponents)
-
+    try:
+        principalComponents = pca.fit_transform(x)
+        if np.any(np.isnan(principalComponents)):
+            print("pc is nan")
+            print(f"count: {np.count_nonzero(np.isnan(principalComponents))}")
+            print(principalComponents)
+            principalComponents = np.nan_to_num(principalComponents)
+    except ValueError:
+        print("need more than one file for pca")
+        # fall back to one zero row per row of x so the row count still matches
+        principalComponents = np.zeros((len(x), 2))
     principalDf = pd.DataFrame(
         data=principalComponents, columns=["JitterPCA", "ShimmerPCA"]
     )
-
     return principalDf
 
 

diff --git a/nkululeko/models/model_mlp.py b/nkululeko/models/model_mlp.py
@@ -174,6 +174,9 @@ def predict_sample(self, features):
     def store(self):
         torch.save(self.model.state_dict(), self.store_path)
 
+    def store_as_onnx(self):
+        """Placeholder: exporting the model to ONNX is not implemented yet."""
+
     def load(self, run, epoch):
         self.set_id(run, epoch)
         dir = self.util.get_path("model_dir")