0.74.1

felixbur · Dec 14, 2023 · b4e5fe4 · b4e5fe4
1 parent 4a1fab6
commit b4e5fe4
Show file tree

Hide file tree

Showing 46 changed files with 510 additions and 172 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,6 +1,10 @@
 Changelog
 =========
 
+Version 0.74.1
+--------------
+* fixed various bugs with augmentation
+
 Version 0.74.0 
 --------------
 * added patience (early stopping)

diff --git a/ini_file.md b/ini_file.md
@@ -8,6 +8,7 @@
   - [Sections](#sections)
     - [EXP](#exp)
     - [DATA](#data)
+    - [AUGMENT](#augment)
     - [SEGMENT](#segment)
     - [FEATS](#feats)
     - [MODEL](#model)
@@ -119,18 +120,6 @@
   * min_dur_test = 3.5
 * **target_divide_by**: divide the target values by some factor, e.g., to make age smaller and encode years as values from 0 to 1
   * target_divide_by = 100
-* **augment**: select the samples to augment: either *train*, *test*, or *all*
-  * augment = train
-* **augment_result**: file name to store the augmented data (can then be added to training)
-  * augment_result = augment.csv
-* **random_splice**: select the samples to be random spliced: either *train*, *test*, or *all*
-  * random_splice = train
-* **random_splice_result**: file name to store the random spliced data (can then be added to training)
-  * random_splice_result = random_spliced.csv
-* **filter**: don't use all the data but only selected values from columns: [col, val]*
-  * filter = [['gender', 'female'], ['task', 'reading']]
-* **filter.sample_selection**: Which sample set to use for filtering
-  * filter.sample_selection = all # either all, train or test
 * **limit_samples**: maximum number of random N samples per sample selection
   * limit_samples = 20
 * **limit_samples_per_speaker**: maximum number of samples per speaker per sample selection
@@ -144,11 +133,23 @@
 * **check_vad**: check if the files contain speech, using [silero VAD](https://github.com/snakers4/silero-vad)
   * check_vad = True
 
+### AUGMENT
+* **augment**: select the samples to augment: either *train*, *test*, or *all*
+  * augment = train
+* **result**: file name to store the augmented data (can then be added to training)
+  * result = augment.csv
+* **random_splice**: select the samples to be random spliced: either *train*, *test*, or *all*
+  * random_splice = train
+* **result**: file name to store the random spliced data (can then be added to training); this is the same **result** key as above, shared by both augmentation methods
+  * result = augment.csv
+
+
+
 ### SEGMENT
 * **sample_selection**: select the samples to segment: either *train*, *test*, or *all*
   * sample_selection = all
-* **segment_target**: name of the extension that is added to the dataset names when storing the segmented data table with the *segment* module
-  * segment_target = _seg
+* **segment_result**: name of the segmented data table as a result
+  * segment_result = segmented.csv
 * **method**: select the model 
   * method = [silero](https://github.com/snakers4/silero-vad)
 * **min_length**: the minimum length of rest samples (in seconds)

diff --git a/nkululeko/augment.py b/nkululeko/augment.py
@@ -10,12 +10,8 @@
 
 
 def main(src_dir):
-    parser = argparse.ArgumentParser(
-        description="Call the nkululeko framework."
-    )
-    parser.add_argument(
-        "--config", default="exp.ini", help="The base configuration"
-    )
+    parser = argparse.ArgumentParser(description="Call the nkululeko framework.")
+    parser.add_argument("--config", default="exp.ini", help="The base configuration")
     args = parser.parse_args()
     if args.config is not None:
         config_file = args.config
@@ -48,24 +44,33 @@ def main(src_dir):
 
     # split into train and test
     expr.fill_train_and_tests()
-    util.debug(
-        f"train shape : {expr.df_train.shape}, test shape:{expr.df_test.shape}"
-    )
+    util.debug(f"train shape : {expr.df_train.shape}, test shape:{expr.df_test.shape}")
 
     # augment
-    augmenting = util.config_val("DATA", "augment", False)
+    augmenting = util.config_val("AUGMENT", "augment", False)
     if augmenting:
-        expr.augment()
+        df_ret = expr.augment()
 
-    random_splicing = util.config_val("DATA", "random_splice", False)
+    random_splicing = util.config_val("AUGMENT", "random_splice", False)
     if random_splicing:
-        expr.random_splice()
+        df_ret = expr.random_splice()
+
+    if (not augmenting) and (not random_splicing):
+        util.error("no augmentation selected")
+
+    # remove encoded labels
+    target = util.config_val("DATA", "target", "emotion")
+    if "class_label" in df_ret.columns:
+        df_ret = df_ret.drop(columns=[target])
+        df_ret = df_ret.rename(columns={"class_label": target})
+    # save file
+    filename = util.config_val("AUGMENT", "result", "augmented.csv")
 
+    df_ret.to_csv(f"{expr.data_dir}/{filename}")
+    util.debug(f"saved augmentation table to {filename} to {expr.data_dir}")
     print("DONE")
 
 
 if __name__ == "__main__":
     cwd = os.path.dirname(os.path.abspath(__file__))
-    main(
-        cwd
-    )  # use this if you want to state the config file path on command line
+    main(cwd)  # use this if you want to state the config file path on command line
diff --git a/nkululeko/augmenting/augmenter.py b/nkululeko/augmenting/augmenter.py
@@ -6,8 +6,14 @@
 import numpy as np
 import pandas as pd
 from audformat.utils import map_file_path
-from audiomentations import (AddGaussianNoise, AddGaussianSNR, Compose,
-                             PitchShift, Shift, TimeStretch)
+from audiomentations import (
+    AddGaussianNoise,
+    AddGaussianSNR,
+    Compose,
+    PitchShift,
+    Shift,
+    TimeStretch,
+)
 from nkululeko.util import Util
 from tqdm import tqdm
 
@@ -23,9 +29,7 @@ def __init__(self, df):
         # Define a standard transformation that randomly add augmentations to files
         self.audioment = Compose(
             [
-                AddGaussianNoise(
-                    min_amplitude=0.001, max_amplitude=0.015, p=0.5
-                ),
+                AddGaussianNoise(min_amplitude=0.001, max_amplitude=0.015, p=0.5),
                 # AddGaussianSNR(min_snr_db=10, max_snr_db=40, p=0.5),
                 TimeStretch(min_rate=0.8, max_rate=1.25, p=0.5),
                 PitchShift(min_semitones=-4, max_semitones=4, p=0.5),
@@ -50,25 +54,19 @@ def augment(self, sample_selection):
         audeer.mkdir(filepath)
         self.util.debug(f"augmenting {sample_selection} samples to {filepath}")
         newpath = ""
+        index_map = {}
         for i, f in enumerate(tqdm(files)):
             signal, sr = audiofile.read(f)
             filename = os.path.basename(f)
             parent = os.path.dirname(f).split("/")[-1]
             sig_aug = self.audioment(samples=signal, sample_rate=sr)
             newpath = f"{filepath}/{parent}/"
             audeer.mkdir(newpath)
-            audiofile.write(
-                f"{newpath}{filename}", signal=sig_aug, sampling_rate=sr
-            )
+            new_full_name = newpath + filename
+            audiofile.write(new_full_name, signal=sig_aug, sampling_rate=sr)
+            index_map[f] = new_full_name
         df_ret = self.df.copy()
-        df_ret = df_ret.set_index(
-            map_file_path(df_ret.index, lambda x: self.changepath(x, newpath))
-        )
-        aug_db_filename = self.util.config_val(
-            "DATA", "augment_result", "augment.csv"
-        )
-        target = self.util.config_val("DATA", "target", "emotion")
-        df_ret[target] = df_ret["class_label"]
-        df_ret = df_ret.drop(columns=["class_label"])
-        df_ret.to_csv(aug_db_filename)
+        file_index = df_ret.index.levels[0].map(lambda x: index_map[x]).values
+        df_ret = df_ret.set_index(df_ret.index.set_levels(file_index, level="file"))
+
         return df_ret
diff --git a/nkululeko/augmenting/randomsplicer.py b/nkululeko/augmenting/randomsplicer.py
@@ -33,10 +33,6 @@ def __init__(self, df):
         self.df = df
         self.util = Util("randomsplicer")
 
-    def changepath(self, fp, np):
-        fullpath = os.path.dirname(fp)
-        return fp.replace(fullpath, np)
-
     def run(self, sample_selection):
         """
         random splice the selected samples and return a dataframe with new files index.
@@ -52,10 +48,9 @@ def run(self, sample_selection):
         store = self.util.get_path("store")
         filepath = f"{store}randomspliced/"
         audeer.mkdir(filepath)
-        self.util.debug(
-            f"random splicing {sample_selection} samples to {filepath}"
-        )
+        self.util.debug(f"random splicing {sample_selection} samples to {filepath}")
         newpath = ""
+        index_map = {}
         for i, f in enumerate(tqdm(files)):
             signal, sr = af.read(f)
             filename = os.path.basename(f)
@@ -66,19 +61,13 @@ def run(self, sample_selection):
                 p_reverse=p_reverse,
                 top_db=top_db,
             )
-
             newpath = f"{filepath}/{parent}/"
             audeer.mkdir(newpath)
-            af.write(f"{newpath}{filename}", signal=sig_new, sampling_rate=sr)
+            new_full_name = newpath + filename
+            af.write(new_full_name, signal=sig_new, sampling_rate=sr)
+            index_map[f] = new_full_name
+
         df_ret = self.df.copy()
-        df_ret = df_ret.set_index(
-            map_file_path(df_ret.index, lambda x: self.changepath(x, newpath))
-        )
-        db_filename = self.util.config_val(
-            "DATA", "random_splice_result", "random_spliced.csv"
-        )
-        target = self.util.config_val("DATA", "target", "emotion")
-        df_ret[target] = df_ret["class_label"]
-        df_ret = df_ret.drop(columns=["class_label"])
-        df_ret.to_csv(db_filename)
+        file_index = df_ret.index.levels[0].map(lambda x: index_map[x]).values
+        df_ret = df_ret.set_index(df_ret.index.set_levels(file_index, level="file"))
         return df_ret
diff --git a/nkululeko/constants.py b/nkululeko/constants.py
@@ -1,2 +1,2 @@
-VERSION="0.74.0"
+VERSION="0.74.1"
 SAMPLING_RATE = 16000
diff --git a/nkululeko/data/dataset.py b/nkululeko/data/dataset.py
@@ -401,6 +401,8 @@ def split(self):
             self.util.debug(f"{self.name}: trying to reuse data splits")
             self.df_test = pd.read_pickle(storage_test)
             self.df_train = pd.read_pickle(storage_train)
+        else:
+            self.util.error(f"unknown split strategy: {split_strategy}")
 
         if self.df_test.shape[0] > 0:
             self.df_test = self.finish_up(self.df_test, storage_test)
@@ -596,7 +598,7 @@ def map_labels(self, df):
         return df
 
     def check_continuous_classification(self):
-        datatype = self.util.config_val("DATA", "type", "False")
+        datatype = self.util.config_val("DATA", "type", False)
         if self.util.exp_is_classification() and datatype == "continuous":
             return True
         return False
@@ -606,7 +608,7 @@ def map_continuous_classification(self, df):
         if self.check_continuous_classification():
             self.util.debug(f"{self.name}: binning continuous variable to categories")
             cat_vals = self.util.continuous_to_categorical(df[self.target])
-            df[self.target] = cat_vals
+            df[self.target] = cat_vals.values
             labels = ast.literal_eval(glob_conf.config["DATA"]["labels"])
             df["class_label"] = df[self.target]
             for i, l in enumerate(labels):

diff --git a/nkululeko/data/dataset_csv.py b/nkululeko/data/dataset_csv.py
@@ -17,9 +17,9 @@ def load(self):
         self.util.debug(f"loading {self.name}")
         self.got_target, self.got_speaker, self.got_gender = False, False, False
         data_file = self.util.config_val_data(self.name, "", "")
-        if not os.path.isabs(data_file):
-            exp_root = self.util.config_val("EXP", "root", "")
-            data_file = os.path.join(exp_root, data_file)
+        # if not os.path.isabs(data_file):
+        #     exp_root = self.util.config_val("EXP", "root", "")
+        #     data_file = os.path.join(exp_root, data_file)
         root = os.path.dirname(data_file)
         audio_path = self.util.config_val_data(self.name, "audio_path", "")
         df = audformat.utils.read_csv(data_file)
@@ -28,7 +28,7 @@ def load(self):
             col_dict = ast.literal_eval(rename_cols)
             df = df.rename(columns=col_dict)
         absolute_path = eval(
-            self.util.config_val_data(self.name, "absolute_path", True)
+            self.util.config_val_data(self.name, "absolute_path", "True")
         )
         if not absolute_path:
             # add the root folder to the relative paths of the files

diff --git a/nkululeko/experiment.py b/nkululeko/experiment.py
@@ -365,7 +365,7 @@ def augment(self):
         """
         from nkululeko.augmenting.augmenter import Augmenter
 
-        sample_selection = self.util.config_val("DATA", "augment", "train")
+        sample_selection = self.util.config_val("AUGMENT", "augment", "train")
         if sample_selection == "all":
             df = pd.concat([self.df_train, self.df_test])
         elif sample_selection == "train":
@@ -379,7 +379,8 @@ def augment(self):
             )
 
         augmenter = Augmenter(df)
-        augmenter.augment(sample_selection)
+        df_ret = augmenter.augment(sample_selection)
+        return df_ret
 
     def autopredict(self):
         """
@@ -459,7 +460,7 @@ def random_splice(self):
         """
         from nkululeko.augmenting.randomsplicer import Randomsplicer
 
-        sample_selection = self.util.config_val("DATA", "random_splice", "train")
+        sample_selection = self.util.config_val("AUGMENT", "random_splice", "train")
         if sample_selection == "all":
             df = pd.concat([self.df_train, self.df_test])
         elif sample_selection == "train":
@@ -472,7 +473,8 @@ def random_splice(self):
                 " should be [all | train | test]"
             )
         randomsplicer = Randomsplicer(df)
-        randomsplicer.run(sample_selection)
+        df_ret = randomsplicer.run(sample_selection)
+        return df_ret
 
     def analyse_features(self, needs_feats):
         """

diff --git a/nkululeko/models/model_xgb.py b/nkululeko/models/model_xgb.py
@@ -9,4 +9,4 @@ class XGB_model(Model):
 
     is_classifier = True
 
-    clf = XGBClassifier(use_label_encoder=False)  # set up the classifier
+    clf = XGBClassifier()  # set up the classifier
diff --git a/nkululeko/models/model_xgr.py b/nkululeko/models/model_xgr.py
@@ -9,4 +9,4 @@ class XGR_model(Model):
 
     is_classifier = False
 
-    clf = XGBRegressor(use_label_encoder=False)  # set up the regressor
+    clf = XGBRegressor()  # set up the regressor
Original file line number	Diff line number	Diff line change
Expand Up		@@ -9,4 +9,4 @@ class XGB_model(Model):

		is_classifier = True

		clf = XGBClassifier(use_label_encoder=False) # set up the classifier
		clf = XGBClassifier() # set up the classifier
Original file line number	Diff line number	Diff line change
Expand Up		@@ -9,4 +9,4 @@ class XGR_model(Model):

		is_classifier = False

		clf = XGBRegressor(use_label_encoder=False) # set up the regressor
		clf = XGBRegressor() # set up the regressor