0.92.0

felixbur · Nov 7, 2024 · cf9ff2f · cf9ff2f
1 parent e664003
commit cf9ff2f
Show file tree

Hide file tree

Showing 7 changed files with 170 additions and 27 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,6 +1,10 @@
 Changelog
 =========
 
+Version 0.92.0
+--------------
+* added first version of automatic speaker prediction/segmentation
+
 Version 0.91.3
 --------------
 * some additions for robustness

diff --git a/nkululeko/autopredict/ap_sid.py b/nkululeko/autopredict/ap_sid.py
@@ -2,13 +2,12 @@
 A predictor for sid - Speaker ID.
 """
 
-from pyannote.audio import Pipeline
-
-
 import numpy as np
+from pyannote.audio import Pipeline
+import torch
 
-import nkululeko.glob_conf as glob_conf
 from nkululeko.feature_extractor import FeatureExtractor
+import nkululeko.glob_conf as glob_conf
 from nkululeko.utils.util import Util
 
 
@@ -21,23 +20,29 @@ class SIDPredictor:
     def __init__(self, df):
+        """Load the pyannote speaker diarization pipeline.
+
+        Reads the Hugging Face access token from [MODEL][hf_token] and the
+        torch device from [MODEL][device] (default "cpu"). Reports an error
+        via ``self.util.error`` if no token is configured.
+
+        Args:
+            df: the dataframe whose samples should get speaker ids.
+        """
         self.df = df
         self.util = Util("sidPredictor")
+        # The section is spelled "MODEL", as in the error message below and in
+        # the pyannote segmenter; a differently cased section name would not
+        # match if the lookup is case-sensitive (as in configparser).
+        hf_token = self.util.config_val("MODEL", "hf_token", None)
+        if hf_token is None:
+            self.util.error(
+                "speaker id prediction needs huggingface token: [MODEL][hf_token]"
+            )
         self.pipeline = Pipeline.from_pretrained(
             "pyannote/speaker-diarization-3.1",
-            use_auth_token="HUGGINGFACE_ACCESS_TOKEN_GOES_HERE",
+            use_auth_token=hf_token,
         )
+        device = self.util.config_val("MODEL", "device", "cpu")
+        self.pipeline.to(torch.device(device))
 
     def predict(self, split_selection):
+        """Return a copy of the dataframe; speaker assignment is not implemented yet.
+
+        Args:
+            split_selection: name of the sample selection, only used for the
+                debug message here.
+
+        Returns:
+            An unmodified copy of ``self.df``.
+        """
-        self.util.debug(f"estimating PESQ for {split_selection} samples")
+        self.util.debug(f"estimating speaker id for {split_selection} samples")
         return_df = self.df.copy()
-        feats_name = "_".join(ast.literal_eval(glob_conf.config["DATA"]["databases"]))
-        self.feature_extractor = FeatureExtractor(
-            self.df, ["squim"], feats_name, split_selection
-        )
-        result_df = self.feature_extractor.extract()
-        # replace missing values by 0
-        result_df = result_df.fillna(0)
-        result_df = result_df.replace(np.nan, 0)
-        result_df.replace([np.inf, -np.inf], 0, inplace=True)
-        pred_vals = result_df.pesq * 100
-        return_df["pesq_pred"] = pred_vals.astype("int") / 100
+        # @todo
+        # 1) concat all audio files
+        # 2) get segmentations with pyannote
+        # 3) map pyannote segments with original ones and assign speaker id
+
         return return_df
+
+    def concat_files(self, df):
+        """Placeholder, not implemented yet (see the todo note below).
+
+        Args:
+            df: currently unused.
+        """
+        pass
+        # todo
+        # please use https://audeering.github.io/audiofile/usage.html#read-a-file
diff --git a/nkululeko/constants.py b/nkululeko/constants.py
@@ -1,2 +1,2 @@
-VERSION="0.91.3"
+VERSION="0.92.0"
 SAMPLING_RATE = 16000
diff --git a/nkululeko/experiment.py b/nkululeko/experiment.py
@@ -439,7 +439,12 @@ def autopredict(self):
             )
         targets = self.util.config_val_list("PREDICT", "targets", ["gender"])
         for target in targets:
-            if target == "gender":
+            if target == "speaker":
+                from nkululeko.autopredict.ap_sid import SIDPredictor
+
+                predictor = SIDPredictor(df)
+                df = predictor.predict(sample_selection)
+            elif target == "gender":
                 from nkululeko.autopredict.ap_gender import GenderPredictor
 
                 predictor = GenderPredictor(df)

diff --git a/nkululeko/predict.py b/nkululeko/predict.py
@@ -2,7 +2,7 @@
 # use some model and add automatically predicted labels to train and test splits
 # then save as a new dataset
 
-"""This script is used to call the nkululeko PREDICT framework. 
+r"""This script is used to call the nkululeko PREDICT framework.
 
 It loads a configuration file, creates a new experiment,
 and performs automatic prediction on the train and test datasets. The predicted labels are added to the datasets and

diff --git a/nkululeko/segment.py b/nkululeko/segment.py
@@ -1,5 +1,4 @@
-"""
-Segments the samples in the dataset into chunks based on voice activity detection using SILERO VAD [1].
+"""Segments the samples in the dataset into chunks based on voice activity detection using SILERO VAD [1].
 
 The segmentation results are saved to a file, and the distributions of the original and
 segmented durations are plotted.
@@ -15,7 +14,7 @@
 
 References:
     [1] https://github.com/snakers4/silero-vad
-    
+    [2] https://github.com/pyannote/pyannote-audio
 """
 
 import argparse
@@ -83,12 +82,15 @@ def main():
 
         segmenter = Silero_segmenter()
         df_seg = segmenter.segment_dataframe(df)
+    elif segmenter == "pyannote":
+        from nkululeko.segmenting.seg_pyannote import Pyannote_segmenter
 
+        segmenter = Pyannote_segmenter(config)
+        df_seg = segmenter.segment_dataframe(df)
     else:
-        util.error(f"unkown segmenter: {segmenter}")
+        util.error(f"unknown segmenter: {segmenter}")
 
     def calc_dur(x):
-
         starts = x[1]
         ends = x[2]
         return (ends - starts).total_seconds()
@@ -115,8 +117,6 @@ def calc_dur(x):
         df_seg = df_seg.drop(columns=[target])
         df_seg = df_seg.rename(columns={"class_label": target})
     # save file
-    # dataname = "_".join(expr.datasets.keys())
-    # name = f"{dataname}{segment_target}"
     df_seg.to_csv(f"{expr.data_dir}/{segmented_file}")
     util.debug(
         f"saved {segmented_file} to {expr.data_dir}, {num_after} samples (was"

diff --git a/nkululeko/segmenting/seg_pyannote.py b/nkululeko/segmenting/seg_pyannote.py
@@ -0,0 +1,129 @@
+"""seg_pyannote.py.
+
+Segment a dataset with the Pyannote segmenter.
+Also adds speaker ids to the segments.
+
+"""
+
+import pandas as pd
+from pyannote.audio import Pipeline
+import torch
+from tqdm import tqdm
+
+import audformat
+from audformat import segmented_index
+
+from nkululeko.utils.util import Util
+
+
+SAMPLING_RATE = 16000
+
+
+class Pyannote_segmenter:
+    """Segment audio files into speaker turns with a pyannote pipeline.
+
+    The pipeline "pyannote/speaker-diarization-3.1" is loaded with the token
+    configured under [MODEL][hf_token] and moved to the torch device
+    configured under [MODEL][device] (default "cpu").
+    """
+
+    def __init__(self, not_testing=True):
+        """Load the diarization pipeline.
+
+        Args:
+            not_testing: if truthy, ``segment_dataframe`` reads ``min_length``
+                from the [SEGMENT] configuration; otherwise it uses the fixed
+                value 2.
+        """
+        torch.set_num_threads(1)
+        # attribute name kept unchanged for backward compatibility
+        self.no_testing = not_testing
+        self.util = Util("pyannote_segmenter")
+        hf_token = self.util.config_val("MODEL", "hf_token", None)
+        if hf_token is None:
+            self.util.error(
+                "speaker id prediction needs huggingface token: [MODEL][hf_token]"
+            )
+        self.pipeline = Pipeline.from_pretrained(
+            "pyannote/speaker-diarization-3.1",
+            use_auth_token=hf_token,
+        )
+        device = self.util.config_val("MODEL", "device", "cpu")
+        if device == "cpu":
+            self.util.warn(
+                "running pyannote on CPU can be really slow, consider using a GPU"
+            )
+        self.pipeline.to(torch.device(device))
+
+    def get_segmentation_simple(self, file):
+        """Return all speaker turns of one file without length constraints.
+
+        Args:
+            file: one index entry of the dataframe; its first element is the
+                path handed to the pipeline.
+
+        Returns:
+            A tuple of the segmented index (file, start, end) and the list of
+            speaker labels, one label per segment.
+        """
+        path = file[0]
+        annotation = self.pipeline(path)
+        files, starts, ends, speakers = [], [], [], []
+        for turn, _, speaker in annotation.itertracks(yield_label=True):
+            files.append(path)
+            starts.append(turn.start)
+            ends.append(turn.end)
+            speakers.append(speaker)
+        return segmented_index(files, starts, ends), speakers
+
+    def get_segmentation(self, file, min_length, max_length):
+        """Return the speaker turns of one file, constrained in length.
+
+        Turns longer than ``max_length`` are cut into consecutive chunks of
+        ``max_length``. A tail shorter than ``min_length`` is merged into the
+        preceding chunk, so a chunk can be up to ``max_length + min_length``
+        long. Turns that are not split are kept only if they are longer than
+        ``min_length``.
+
+        Args:
+            file: one index entry of the dataframe; its first element is the
+                path handed to the pipeline.
+            min_length: shortest segment that is kept on its own.
+            max_length: length at which a turn is cut; must be positive.
+
+        Returns:
+            A tuple of the segmented index (file, start, end) and the list of
+            speaker labels, one label per segment.
+        """
+        path = file[0]
+        # the pipeline needs the audio path, not the whole index tuple
+        annotation = self.pipeline(path)
+        files, starts, ends, speakers = [], [], [], []
+
+        def add_segment(seg_start, seg_end, label):
+            files.append(path)
+            starts.append(seg_start)
+            ends.append(seg_end)
+            speakers.append(label)
+
+        for turn, _, speaker in annotation.itertracks(yield_label=True):
+            start, end = turn.start, turn.end
+            was_split = False
+            while end - start > max_length:
+                new_end = start + max_length
+                if end - new_end < min_length:
+                    # tail too short for its own segment: merge it
+                    new_end = end
+                add_segment(start, new_end, speaker)
+                start = new_end
+                was_split = True
+            remainder = end - start
+            if was_split:
+                # after splitting, the remainder is either empty (tail was
+                # merged) or at least min_length long and must not be lost
+                keep = remainder > 0
+            else:
+                keep = remainder > min_length
+            if keep:
+                add_segment(start, end, speaker)
+        return segmented_index(files, starts, ends), speakers
+
+    def segment_dataframe(self, df):
+        """Segment every file of a dataframe and add a "speaker" column.
+
+        If [SEGMENT][max_length] is set, turns are constrained in length via
+        ``get_segmentation``; otherwise ``get_segmentation_simple`` is used.
+        The values of each original row are copied to all of its segments.
+
+        Args:
+            df: dataframe whose index entries start with the file path.
+
+        Returns:
+            The per-file results combined with ``audformat.utils.concat``.
+        """
+        # NOTE(review): eval() executes whatever the configuration contains;
+        # acceptable for a trusted local config file, unsafe for external input.
+        max_length = eval(self.util.config_val("SEGMENT", "max_length", "False"))
+        if max_length:
+            if self.no_testing:
+                min_length = float(self.util.config_val("SEGMENT", "min_length", 2))
+            else:
+                min_length = 2
+            self.util.debug(f"segmenting with max length: {max_length+min_length}")
+        dfs = []
+        for file, values in tqdm(df.iterrows(), total=len(df)):
+            if max_length:
+                index, speakers = self.get_segmentation(file, min_length, max_length)
+            else:
+                index, speakers = self.get_segmentation_simple(file)
+            # separate name so the ``df`` parameter is not shadowed
+            seg_df = pd.DataFrame(
+                values.to_dict(),
+                index,
+            )
+            seg_df["speaker"] = speakers
+            dfs.append(seg_df)
+        return audformat.utils.concat(dfs)
+
+
+def main():
+    """Segment the sample file test_wavs/very_long.wav and print the result.
+
+    Adds a "duration" column (seconds per segment) and prints the first 100
+    rows of the segmented dataframe.
+    """
+    wav_files = pd.Series(["test_wavs/very_long.wav"])
+    sample_df = pd.DataFrame(index=wav_files)
+    sample_df["target"] = "anger"
+    sample_df.index = audformat.utils.to_segmented_index(
+        sample_df.index, allow_nat=False
+    )
+    pyannote_segmenter = Pyannote_segmenter(not_testing=False)
+    segmented_df = pyannote_segmenter.segment_dataframe(sample_df)
+
+    def seconds_of(entry):
+        # entry is one index tuple: position 1 is the start, position 2 the end
+        span = entry[2] - entry[1]
+        return span.total_seconds()
+
+    segmented_df["duration"] = segmented_df.index.to_series().map(seconds_of)
+    print(segmented_df.head(100))
+
+
+if __name__ == "__main__":
+    main()