Skip to content

Commit

Permalink
0.92.0
Browse files Browse the repository at this point in the history
  • Loading branch information
FBurkhardt committed Nov 7, 2024
1 parent e664003 commit cf9ff2f
Show file tree
Hide file tree
Showing 7 changed files with 170 additions and 27 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,10 @@
Changelog
=========

Version 0.92.0
--------------
* added first version of automatic speaker prediction/segmentation

Version 0.91.3
--------------
* some additions for robustness
Expand Down
39 changes: 22 additions & 17 deletions nkululeko/autopredict/ap_sid.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,12 @@
A predictor for sid - Speaker ID.
"""

from pyannote.audio import Pipeline


import numpy as np
from pyannote.audio import Pipeline
import torch

import nkululeko.glob_conf as glob_conf
from nkululeko.feature_extractor import FeatureExtractor
import nkululeko.glob_conf as glob_conf
from nkululeko.utils.util import Util


Expand All @@ -21,23 +20,29 @@ class SIDPredictor:
def __init__(self, df):
self.df = df
self.util = Util("sidPredictor")
hf_token = self.util.config_val("Model", "hf_token", None)
if hf_token is None:
self.util.error(
"speaker id prediction needs huggingface token: [MODEL][hf_token]"
)
self.pipeline = Pipeline.from_pretrained(
"pyannote/speaker-diarization-3.1",
use_auth_token="HUGGINGFACE_ACCESS_TOKEN_GOES_HERE",
use_auth_token=hf_token,
)
device = self.util.config_val("Model", "device", "cpu")
self.pipeline.to(torch.device(device))

def predict(self, split_selection):
self.util.debug(f"estimating PESQ for {split_selection} samples")
self.util.debug(f"estimating speaker id for {split_selection} samples")
return_df = self.df.copy()
feats_name = "_".join(ast.literal_eval(glob_conf.config["DATA"]["databases"]))
self.feature_extractor = FeatureExtractor(
self.df, ["squim"], feats_name, split_selection
)
result_df = self.feature_extractor.extract()
# replace missing values by 0
result_df = result_df.fillna(0)
result_df = result_df.replace(np.nan, 0)
result_df.replace([np.inf, -np.inf], 0, inplace=True)
pred_vals = result_df.pesq * 100
return_df["pesq_pred"] = pred_vals.astype("int") / 100
# @todo
# 1) concat all audio files
# 2) get segmentations with pyannote
# 3) map pyannote segments with original ones and assign speaker id

return return_df

def concat_files(self, df):
pass
# todo
# please use https://audeering.github.io/audiofile/usage.html#read-a-file
2 changes: 1 addition & 1 deletion nkululeko/constants.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
VERSION="0.91.3"
VERSION="0.92.0"
SAMPLING_RATE = 16000
7 changes: 6 additions & 1 deletion nkululeko/experiment.py
Original file line number Diff line number Diff line change
Expand Up @@ -439,7 +439,12 @@ def autopredict(self):
)
targets = self.util.config_val_list("PREDICT", "targets", ["gender"])
for target in targets:
if target == "gender":
if target == "speaker":
from nkululeko.autopredict.ap_sid import SIDPredictor

predictor = SIDPredictor(df)
df = predictor.predict(sample_selection)
elif target == "gender":
from nkululeko.autopredict.ap_gender import GenderPredictor

predictor = GenderPredictor(df)
Expand Down
2 changes: 1 addition & 1 deletion nkululeko/predict.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
# use some model and add automatically predicted labels to train and test splits
# then save as a new dataset

"""This script is used to call the nkululeko PREDICT framework.
r"""This script is used to call the nkululeko PREDICT framework.
It loads a configuration file, creates a new experiment,
and performs automatic prediction on the train and test datasets. The predicted labels are added to the datasets and
Expand Down
14 changes: 7 additions & 7 deletions nkululeko/segment.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
"""
Segments the samples in the dataset into chunks based on voice activity detection using SILERO VAD [1].
"""Segments the samples in the dataset into chunks based on voice activity detection using SILERO VAD [1].
The segmentation results are saved to a file, and the distributions of the original and
segmented durations are plotted.
Expand All @@ -15,7 +14,7 @@
References:
[1] https://github.com/snakers4/silero-vad
[2] https://github.com/pyannote/pyannote-audio
"""

import argparse
Expand Down Expand Up @@ -83,12 +82,15 @@ def main():

segmenter = Silero_segmenter()
df_seg = segmenter.segment_dataframe(df)
elif segmenter == "pyannote":
from nkululeko.segmenting.seg_pyannote import Pyannote_segmenter

segmenter = Pyannote_segmenter(config)
df_seg = segmenter.segment_dataframe(df)
else:
util.error(f"unkown segmenter: {segmenter}")
util.error(f"unknown segmenter: {segmenter}")

def calc_dur(x):

starts = x[1]
ends = x[2]
return (ends - starts).total_seconds()
Expand All @@ -115,8 +117,6 @@ def calc_dur(x):
df_seg = df_seg.drop(columns=[target])
df_seg = df_seg.rename(columns={"class_label": target})
# save file
# dataname = "_".join(expr.datasets.keys())
# name = f"{dataname}{segment_target}"
df_seg.to_csv(f"{expr.data_dir}/{segmented_file}")
util.debug(
f"saved {segmented_file} to {expr.data_dir}, {num_after} samples (was"
Expand Down
129 changes: 129 additions & 0 deletions nkululeko/segmenting/seg_pyannote.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
"""seg_pyannote.py.
Segment a dataset with the Pyannote segmenter.
Also adds speaker ids to the segments.
"""

import pandas as pd
from pyannote.audio import Pipeline
import torch
from tqdm import tqdm

import audformat
from audformat import segmented_index

from nkululeko.utils.util import Util


SAMPLING_RATE = 16000


class Pyannote_segmenter:
    """Segment audio files into speaker turns with a pyannote pipeline.

    Every turn reported by the ``pyannote/speaker-diarization-3.1``
    pipeline becomes one segment, labelled with the speaker reported for
    that turn.
    """

    def __init__(self, not_testing=True):
        """Load the pretrained pipeline and move it to the configured device.

        Args:
            not_testing: when truthy, ``segment_dataframe`` reads
                ``min_length`` from the ``[SEGMENT]`` configuration
                section; otherwise a fixed value of 2 is used.
        """
        torch.set_num_threads(1)
        # NOTE: the attribute keeps its historical name ``no_testing``.
        self.no_testing = not_testing
        self.util = Util("pyannote_segmenter")
        hf_token = self.util.config_val("MODEL", "hf_token", None)
        if hf_token is None:
            self.util.error(
                "speaker id prediction needs huggingface token: [MODEL][hf_token]"
            )
        self.pipeline = Pipeline.from_pretrained(
            "pyannote/speaker-diarization-3.1",
            use_auth_token=hf_token,
        )
        device = self.util.config_val("MODEL", "device", "cpu")
        if device == "cpu":
            self.util.warn(
                "running pyannote on CPU can be really slow, consider using a GPU"
            )
        self.pipeline.to(torch.device(device))

    @staticmethod
    def _split_turn(start, end, min_length, max_length):
        """Cut one turn into chunks of at most ``max_length``.

        A trailing remainder shorter than ``min_length`` is merged into the
        preceding chunk (which may then exceed ``max_length`` by less than
        ``min_length``). A turn that needs no splitting is kept only if it
        is longer than ``min_length``.

        Args:
            start: turn start.
            end: turn end.
            min_length: minimum length of a stand-alone chunk.
            max_length: maximum chunk length; must be positive.

        Returns:
            List of ``(start, end)`` tuples in temporal order, possibly empty.
        """
        chunks = []
        while end - start > max_length:
            chunk_end = start + max_length
            if end - chunk_end < min_length:
                # remainder too short to stand alone: absorb it here
                chunk_end = end
            chunks.append((start, chunk_end))
            start = chunk_end
        if chunks:
            # After splitting, whatever is left was not absorbed above and is
            # therefore at least ``min_length`` long; it must not be dropped.
            if end > start:
                chunks.append((start, end))
        elif end - start > min_length:
            chunks.append((start, end))
        return chunks

    def get_segmentation_simple(self, file):
        """Return one segment per speaker turn, without length constraints.

        Args:
            file: index entry whose first element is the audio file path.

        Returns:
            Tuple ``(seg_index, speakers)``: a segmented index over the turns
            and the list of speaker labels in the same order.
        """
        path = file[0]
        annotation = self.pipeline(path)
        files, starts, ends, speakers = [], [], [], []
        for turn, _, speaker in annotation.itertracks(yield_label=True):
            files.append(path)
            starts.append(turn.start)
            ends.append(turn.end)
            speakers.append(speaker)
        return segmented_index(files, starts, ends), speakers

    def get_segmentation(self, file, min_length, max_length):
        """Return speaker-turn segments constrained by min/max length.

        Args:
            file: index entry whose first element is the audio file path.
            min_length: minimum length of a stand-alone segment.
            max_length: maximum segment length before a turn is split.

        Returns:
            Tuple ``(seg_index, speakers)``: a segmented index over the
            resulting segments and the matching speaker labels.
        """
        path = file[0]
        # Pass the path, not the whole index tuple, to the pipeline
        # (same as get_segmentation_simple).
        annotation = self.pipeline(path)
        files, starts, ends, speakers = [], [], [], []
        for turn, _, speaker in annotation.itertracks(yield_label=True):
            for seg_start, seg_end in self._split_turn(
                turn.start, turn.end, min_length, max_length
            ):
                files.append(path)
                starts.append(seg_start)
                ends.append(seg_end)
                speakers.append(speaker)
        return segmented_index(files, starts, ends), speakers

    def segment_dataframe(self, df):
        """Segment every file of ``df`` and attach a ``speaker`` column.

        The column values of each original row are copied to all segments
        derived from that row.

        Args:
            df: dataframe whose index entries carry the file path as their
                first element.

        Returns:
            The concatenation of all per-file segment dataframes.
        """
        import ast

        # SECURITY: the configuration value used to be passed to ``eval``;
        # ``literal_eval`` accepts the same literals (numbers, ``False``)
        # without executing arbitrary code from the config file.
        max_length = ast.literal_eval(
            str(self.util.config_val("SEGMENT", "max_length", "False"))
        )
        if max_length:
            if self.no_testing:
                min_length = float(self.util.config_val("SEGMENT", "min_length", 2))
            else:
                min_length = 2
            self.util.debug(f"segmenting with max length: {max_length+min_length}")
        dfs = []
        for file, values in tqdm(df.iterrows(), total=len(df)):
            if max_length:
                index, speakers = self.get_segmentation(file, min_length, max_length)
            else:
                index, speakers = self.get_segmentation_simple(file)
            # separate name so the ``df`` parameter is not shadowed
            seg_df = pd.DataFrame(
                values.to_dict(),
                index,
            )
            seg_df["speaker"] = speakers
            dfs.append(seg_df)
        return audformat.utils.concat(dfs)


def main():
    """Segment a single sample file and print the segments with durations."""
    df_sample = pd.DataFrame(index=pd.Series(["test_wavs/very_long.wav"]))
    df_sample["target"] = "anger"
    df_sample.index = audformat.utils.to_segmented_index(
        df_sample.index, allow_nat=False
    )
    df_seg = Pyannote_segmenter(not_testing=False).segment_dataframe(df_sample)

    def calc_dur(x):
        # x is one index entry; positions 1 and 2 hold segment start and end
        seg_start, seg_end = x[1], x[2]
        return (seg_end - seg_start).total_seconds()

    index_entries = df_seg.index.to_series()
    df_seg["duration"] = index_entries.map(calc_dur)
    print(df_seg.head(100))


if __name__ == "__main__":
main()

0 comments on commit cf9ff2f

Please sign in to comment.