From 07e2b9a36daf3618301bc17524ff44c725282ae2 Mon Sep 17 00:00:00 2001 From: Fangjun Kuang Date: Wed, 10 Jan 2024 21:09:45 +0800 Subject: [PATCH] Support exporting models to onnx from 3D-Speaker (#522) --- .../workflows/export-3dspeaker-to-onnx.yaml | 45 +++++ scripts/3dspeaker/README.md | 5 + scripts/3dspeaker/export-onnx.py | 146 +++++++++++++++ scripts/3dspeaker/run.sh | 63 +++++++ scripts/3dspeaker/test-onnx.py | 173 ++++++++++++++++++ scripts/wespeaker/add_meta_data.py | 4 +- scripts/wespeaker/test.py | 12 +- ...eaker-embedding-extractor-wespeaker-impl.h | 2 +- ...dding-extractor-wespeaker-model-metadata.h | 2 +- ...ker-embedding-extractor-wespeaker-model.cc | 4 +- 10 files changed, 442 insertions(+), 14 deletions(-) create mode 100644 .github/workflows/export-3dspeaker-to-onnx.yaml create mode 100644 scripts/3dspeaker/README.md create mode 100755 scripts/3dspeaker/export-onnx.py create mode 100755 scripts/3dspeaker/run.sh create mode 100755 scripts/3dspeaker/test-onnx.py diff --git a/.github/workflows/export-3dspeaker-to-onnx.yaml b/.github/workflows/export-3dspeaker-to-onnx.yaml new file mode 100644 index 000000000..d64d8af22 --- /dev/null +++ b/.github/workflows/export-3dspeaker-to-onnx.yaml @@ -0,0 +1,45 @@ +name: export-3dspeaker-to-onnx + +on: + workflow_dispatch: + +concurrency: + group: export-3dspeaker-to-onnx-${{ github.ref }} + cancel-in-progress: true + +jobs: + export-3dspeaker-to-onnx: + if: github.repository_owner == 'k2-fsa' || github.repository_owner == 'csukuangfj' + name: export 3d-speaker to ONNX + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + os: [macos-latest] + python-version: ["3.8"] + + steps: + - uses: actions/checkout@v4 + + - name: Setup Python ${{ matrix.python-version }} + uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python-version }} + + - name: Run + shell: bash + run: | + cd scripts/3dspeaker + ./run.sh + + mv -v *.onnx ../.. + + - name: Release + uses: svenstaro/upload-release-action@v2 + with: + file_glob: true + file: ./*.onnx + overwrite: true + repo_name: k2-fsa/sherpa-onnx + repo_token: ${{ secrets.UPLOAD_GH_SHERPA_ONNX_TOKEN }} + tag: speaker-recongition-models diff --git a/scripts/3dspeaker/README.md b/scripts/3dspeaker/README.md new file mode 100644 index 000000000..0901317d5 --- /dev/null +++ b/scripts/3dspeaker/README.md @@ -0,0 +1,5 @@ +# Introduction + +This directory contains scripts +about exporting models from https://github.com/alibaba-damo-academy/3D-Speaker +to `onnx` so that they can be used in `sherpa-onnx`. diff --git a/scripts/3dspeaker/export-onnx.py b/scripts/3dspeaker/export-onnx.py new file mode 100755 index 000000000..f2c1b3ed1 --- /dev/null +++ b/scripts/3dspeaker/export-onnx.py @@ -0,0 +1,146 @@ +#!/usr/bin/env python3 +# Copyright 2023-2024 Xiaomi Corp. (authors: Fangjun Kuang) + +import argparse +import json +import os +import pathlib +import re +from typing import Dict + +import onnx +import torch +from infer_sv import supports +from modelscope.hub.snapshot_download import snapshot_download +from speakerlab.utils.builder import dynamic_import + + +def add_meta_data(filename: str, meta_data: Dict[str, str]): + """Add meta data to an ONNX model. It is changed in-place. + + Args: + filename: + Filename of the ONNX model to be changed. + meta_data: + Key-value pairs. 
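+
+    Example (hypothetical values, shown only to illustrate the expected
+    key/value format; the real entries are built in main() below):
+
+      add_meta_data(
+          filename="model.onnx",
+          meta_data={"framework": "3d-speaker", "sample_rate": 16000},
+      )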
+ """ + model = onnx.load(filename) + for key, value in meta_data.items(): + meta = model.metadata_props.add() + meta.key = key + meta.value = str(value) + + onnx.save(model, filename) + + +def get_args(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--model", + type=str, + required=True, + choices=[ + "speech_campplus_sv_en_voxceleb_16k", + "speech_campplus_sv_zh-cn_16k-common", + "speech_eres2net_sv_en_voxceleb_16k", + "speech_eres2net_sv_zh-cn_16k-common", + "speech_eres2net_base_200k_sv_zh-cn_16k-common", + "speech_eres2net_base_sv_zh-cn_3dspeaker_16k", + "speech_eres2net_large_sv_zh-cn_3dspeaker_16k", + ], + ) + return parser.parse_args() + + +@torch.no_grad() +def main(): + args = get_args() + local_model_dir = "pretrained" + model_id = f"damo/{args.model}" + conf = supports[model_id] + cache_dir = snapshot_download( + model_id, + revision=conf["revision"], + ) + cache_dir = pathlib.Path(cache_dir) + + save_dir = os.path.join(local_model_dir, model_id.split("/")[1]) + save_dir = pathlib.Path(save_dir) + save_dir.mkdir(exist_ok=True, parents=True) + + download_files = ["examples", conf["model_pt"]] + for src in cache_dir.glob("*"): + if re.search("|".join(download_files), src.name): + dst = save_dir / src.name + try: + dst.unlink() + except FileNotFoundError: + pass + dst.symlink_to(src) + pretrained_model = save_dir / conf["model_pt"] + pretrained_state = torch.load(pretrained_model, map_location="cpu") + + model = conf["model"] + embedding_model = dynamic_import(model["obj"])(**model["args"]) + embedding_model.load_state_dict(pretrained_state) + embedding_model.eval() + + with open(f"{cache_dir}/configuration.json") as f: + json_config = json.loads(f.read()) + print(json_config) + + T = 100 + C = 80 + x = torch.rand(1, T, C) + filename = f"{args.model}.onnx" + torch.onnx.export( + embedding_model, + x, + filename, + opset_version=13, + input_names=["x"], + output_names=["embedding"], + dynamic_axes={ + "x": {0: "N", 1: "T"}, + "embeddings": {0: "N"}, + }, + ) + + # all models from 3d-speaker expect input samples in the range + # [-1, 1] + normalize_samples = 1 + + # all models from 3d-speaker normalize the features by the global mean + feature_normalize_type = "global-mean" + sample_rate = json_config["model"]["model_config"]["sample_rate"] + + feat_dim = conf["model"]["args"]["feat_dim"] + assert feat_dim == 80, feat_dim + + output_dim = conf["model"]["args"]["embedding_size"] + + if "zh-cn" in args.model: + language = "Chinese" + elif "en" in args.model: + language = "English" + else: + raise ValueError(f"Unsupported language for model {args.model}") + + comment = f"This model is from damo/{args.model}" + url = f"https://www.modelscope.cn/models/damo/{args.model}/summary" + + meta_data = { + "framework": "3d-speaker", + "language": language, + "url": url, + "comment": comment, + "sample_rate": sample_rate, + "output_dim": output_dim, + "normalize_samples": normalize_samples, + "feature_normalize_type": feature_normalize_type, + } + print(meta_data) + add_meta_data(filename=filename, meta_data=meta_data) + + +main() diff --git a/scripts/3dspeaker/run.sh b/scripts/3dspeaker/run.sh new file mode 100755 index 000000000..f0e875dd0 --- /dev/null +++ b/scripts/3dspeaker/run.sh @@ -0,0 +1,63 @@ +#!/usr/bin/env bash + +set -e + +function install_3d_speaker() { + echo "Install 3D-Speaker" + git clone https://github.com/alibaba-damo-academy/3D-Speaker.git + pushd 3D-Speaker + pip install -q -r ./requirements.txt + pip install -q modelscope onnx onnxruntime 
kaldi-native-fbank + popd +} + +function download_test_data() { + wget -q https://github.com/csukuangfj/sr-data/raw/main/test/3d-speaker/speaker1_a_cn_16k.wav + wget -q https://github.com/csukuangfj/sr-data/raw/main/test/3d-speaker/speaker1_b_cn_16k.wav + wget -q https://github.com/csukuangfj/sr-data/raw/main/test/3d-speaker/speaker2_a_cn_16k.wav + + wget -q https://github.com/csukuangfj/sr-data/raw/main/test/3d-speaker/speaker1_a_en_16k.wav + wget -q https://github.com/csukuangfj/sr-data/raw/main/test/3d-speaker/speaker1_b_en_16k.wav + wget -q https://github.com/csukuangfj/sr-data/raw/main/test/3d-speaker/speaker2_a_en_16k.wav +} + +install_3d_speaker + +download_test_data + +export PYTHONPATH=$PWD/3D-Speaker:$PYTHONPATH +export PYTHONPATH=$PWD/3D-Speaker/speakerlab/bin:$PYTHONPATH + +models=( +speech_campplus_sv_en_voxceleb_16k +speech_campplus_sv_zh-cn_16k-common +speech_eres2net_sv_en_voxceleb_16k +speech_eres2net_sv_zh-cn_16k-common +speech_eres2net_base_200k_sv_zh-cn_16k-common +speech_eres2net_base_sv_zh-cn_3dspeaker_16k +speech_eres2net_large_sv_zh-cn_3dspeaker_16k +) +for model in ${models[@]}; do + echo "--------------------$model--------------------" + python3 ./export-onnx.py --model $model + + python3 ./test-onnx.py \ + --model ${model}.onnx \ + --file1 ./speaker1_a_cn_16k.wav \ + --file2 ./speaker1_b_cn_16k.wav + + python3 ./test-onnx.py \ + --model ${model}.onnx \ + --file1 ./speaker1_a_cn_16k.wav \ + --file2 ./speaker2_a_cn_16k.wav + + python3 ./test-onnx.py \ + --model ${model}.onnx \ + --file1 ./speaker1_a_en_16k.wav \ + --file2 ./speaker1_b_en_16k.wav + + python3 ./test-onnx.py \ + --model ${model}.onnx \ + --file1 ./speaker1_a_en_16k.wav \ + --file2 ./speaker2_a_en_16k.wav +done diff --git a/scripts/3dspeaker/test-onnx.py b/scripts/3dspeaker/test-onnx.py new file mode 100755 index 000000000..b2079e5e3 --- /dev/null +++ b/scripts/3dspeaker/test-onnx.py @@ -0,0 +1,173 @@ +#!/usr/bin/env python3 +# Copyright 2023-2024 Xiaomi Corp. (authors: Fangjun Kuang) + +""" +This script computes speaker similarity score in the range [0-1] +of two wave files using a speaker embedding model. +""" +import argparse +import wave +from pathlib import Path + +import kaldi_native_fbank as knf +import numpy as np +import onnxruntime as ort +from numpy.linalg import norm + + +def get_args(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--model", + type=str, + required=True, + help="Path to the input onnx model. Example value: model.onnx", + ) + + parser.add_argument( + "--file1", + type=str, + required=True, + help="Input wave 1", + ) + + parser.add_argument( + "--file2", + type=str, + required=True, + help="Input wave 2", + ) + + return parser.parse_args() + + +def read_wavefile(filename, expected_sample_rate: int = 16000) -> np.ndarray: + """ + Args: + filename: + Path to a wave file, which must be of 16-bit and 16kHz. + expected_sample_rate: + Expected sample rate of the wave file. + Returns: + Return a 1-D float32 array containing audio samples. Each sample is in + the range [-1, 1]. 
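+
+      Note: the file is decoded as 16-bit PCM and divided by 32768, so,
+      for example, an int16 sample of 16384 is returned as 0.5. Only the
+      first channel is used if the file has more than one channel.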
+ """ + filename = str(filename) + with wave.open(filename) as f: + wave_file_sample_rate = f.getframerate() + assert wave_file_sample_rate == expected_sample_rate, ( + wave_file_sample_rate, + expected_sample_rate, + ) + + num_channels = f.getnchannels() + assert f.getsampwidth() == 2, f.getsampwidth() # it is in bytes + num_samples = f.getnframes() + samples = f.readframes(num_samples) + samples_int16 = np.frombuffer(samples, dtype=np.int16) + samples_int16 = samples_int16.reshape(-1, num_channels)[:, 0] + samples_float32 = samples_int16.astype(np.float32) + + samples_float32 = samples_float32 / 32768 + + return samples_float32 + + +def compute_features(samples: np.ndarray, sample_rate: int) -> np.ndarray: + opts = knf.FbankOptions() + opts.frame_opts.dither = 0 + opts.frame_opts.samp_freq = sample_rate + opts.frame_opts.snip_edges = True + + opts.mel_opts.num_bins = 80 + opts.mel_opts.debug_mel = False + + fbank = knf.OnlineFbank(opts) + fbank.accept_waveform(sample_rate, samples) + fbank.input_finished() + + features = [] + for i in range(fbank.num_frames_ready): + f = fbank.get_frame(i) + features.append(f) + features = np.stack(features, axis=0) + + return features + + +class OnnxModel: + def __init__( + self, + filename: str, + ): + session_opts = ort.SessionOptions() + session_opts.inter_op_num_threads = 1 + session_opts.intra_op_num_threads = 1 + + self.session_opts = session_opts + + self.model = ort.InferenceSession( + filename, + sess_options=self.session_opts, + ) + + meta = self.model.get_modelmeta().custom_metadata_map + self.normalize_samples = int(meta["normalize_samples"]) + self.sample_rate = int(meta["sample_rate"]) + self.output_dim = int(meta["output_dim"]) + self.feature_normalize_type = meta["feature_normalize_type"] + + def __call__(self, x: np.ndarray) -> np.ndarray: + """ + Args: + x: + A 2-D float32 tensor of shape (T, C). + y: + A 1-D float32 tensor containing model output. 
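+
+        Note: x is expanded to shape (1, T, C) before it is fed to the
+        model, and y is the returned speaker embedding of shape
+        (output_dim,), i.e. the first (and only) output for the single
+        input utterance.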
+ """ + x = np.expand_dims(x, axis=0) + + return self.model.run( + [ + self.model.get_outputs()[0].name, + ], + { + self.model.get_inputs()[0].name: x, + }, + )[0][0] + + +def main(): + args = get_args() + print(args) + filename = Path(args.model) + file1 = Path(args.file1) + file2 = Path(args.file2) + assert filename.is_file(), filename + assert file1.is_file(), file1 + assert file2.is_file(), file2 + + model = OnnxModel(filename) + wave1 = read_wavefile(file1, model.sample_rate) + wave2 = read_wavefile(file2, model.sample_rate) + + if not model.normalize_samples: + wave1 = wave1 * 32768 + wave2 = wave2 * 32768 + + features1 = compute_features(wave1, model.sample_rate) + features2 = compute_features(wave2, model.sample_rate) + + if model.feature_normalize_type == "global-mean": + features1 -= features1.mean(axis=0, keepdims=True) + features2 -= features2.mean(axis=0, keepdims=True) + + output1 = model(features1) + output2 = model(features2) + + similarity = np.dot(output1, output2) / (norm(output1) * norm(output2)) + print(f"similarity in the range [0-1]: {similarity}") + + +if __name__ == "__main__": + main() diff --git a/scripts/wespeaker/add_meta_data.py b/scripts/wespeaker/add_meta_data.py index a6c331d5f..2ec77bded 100755 --- a/scripts/wespeaker/add_meta_data.py +++ b/scripts/wespeaker/add_meta_data.py @@ -124,7 +124,7 @@ def main(): # all models from wespeaker expect input samples in the range # [-32768, 32767] - normalize_features = 0 + normalize_samples = 0 meta_data = { "framework": "wespeaker", @@ -133,7 +133,7 @@ def main(): "comment": comment, "sample_rate": sample_rate, "output_dim": output_dim, - "normalize_features": normalize_features, + "normalize_samples": normalize_samples, } print(meta_data) add_meta_data(filename=str(model), meta_data=meta_data) diff --git a/scripts/wespeaker/test.py b/scripts/wespeaker/test.py index 4cbcd5713..698cded24 100755 --- a/scripts/wespeaker/test.py +++ b/scripts/wespeaker/test.py @@ -3,7 +3,7 @@ """ This script computes speaker similarity score in the range [0-1] -of two wave files using a speaker recognition model. +of two wave files using a speaker embedding model. 
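+
+The score is the cosine similarity of the two embeddings,
+
+    score = dot(e1, e2) / (norm(e1) * norm(e2)),
+
+the same formula used in scripts/3dspeaker/test-onnx.py.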
""" import argparse import wave @@ -54,8 +54,6 @@ def read_wavefile(filename, expected_sample_rate: int = 16000) -> np.ndarray: """ filename = str(filename) with wave.open(filename) as f: - # Note: If wave_file_sample_rate is different from - # recognizer.sample_rate, we will do resampling inside sherpa-ncnn wave_file_sample_rate = f.getframerate() assert wave_file_sample_rate == expected_sample_rate, ( wave_file_sample_rate, @@ -104,7 +102,7 @@ def __init__( ): session_opts = ort.SessionOptions() session_opts.inter_op_num_threads = 1 - session_opts.intra_op_num_threads = 4 + session_opts.intra_op_num_threads = 1 self.session_opts = session_opts @@ -114,7 +112,7 @@ def __init__( ) meta = self.model.get_modelmeta().custom_metadata_map - self.normalize_features = int(meta["normalize_features"]) + self.normalize_samples = int(meta["normalize_samples"]) self.sample_rate = int(meta["sample_rate"]) self.output_dim = int(meta["output_dim"]) @@ -151,7 +149,7 @@ def main(): wave1 = read_wavefile(file1, model.sample_rate) wave2 = read_wavefile(file2, model.sample_rate) - if not model.normalize_features: + if not model.normalize_samples: wave1 = wave1 * 32768 wave2 = wave2 * 32768 @@ -161,8 +159,6 @@ def main(): output1 = model(features1) output2 = model(features2) - print(output1.shape) - print(output2.shape) similarity = np.dot(output1, output2) / (norm(output1) * norm(output2)) print(f"similarity in the range [0-1]: {similarity}") diff --git a/sherpa-onnx/csrc/speaker-embedding-extractor-wespeaker-impl.h b/sherpa-onnx/csrc/speaker-embedding-extractor-wespeaker-impl.h index f69de5749..b408d9def 100644 --- a/sherpa-onnx/csrc/speaker-embedding-extractor-wespeaker-impl.h +++ b/sherpa-onnx/csrc/speaker-embedding-extractor-wespeaker-impl.h @@ -27,7 +27,7 @@ class SpeakerEmbeddingExtractorWeSpeakerImpl FeatureExtractorConfig feat_config; auto meta_data = model_.GetMetaData(); feat_config.sampling_rate = meta_data.sample_rate; - feat_config.normalize_samples = meta_data.normalize_features; + feat_config.normalize_samples = meta_data.normalize_samples; return std::make_unique(feat_config); } diff --git a/sherpa-onnx/csrc/speaker-embedding-extractor-wespeaker-model-metadata.h b/sherpa-onnx/csrc/speaker-embedding-extractor-wespeaker-model-metadata.h index 4d8997c4c..32ee76c6a 100644 --- a/sherpa-onnx/csrc/speaker-embedding-extractor-wespeaker-model-metadata.h +++ b/sherpa-onnx/csrc/speaker-embedding-extractor-wespeaker-model-metadata.h @@ -12,7 +12,7 @@ namespace sherpa_onnx { struct SpeakerEmbeddingExtractorWeSpeakerModelMetaData { int32_t output_dim = 0; int32_t sample_rate = 0; - int32_t normalize_features = 0; + int32_t normalize_samples = 0; std::string language; }; diff --git a/sherpa-onnx/csrc/speaker-embedding-extractor-wespeaker-model.cc b/sherpa-onnx/csrc/speaker-embedding-extractor-wespeaker-model.cc index b23cc95e1..b934f28a6 100644 --- a/sherpa-onnx/csrc/speaker-embedding-extractor-wespeaker-model.cc +++ b/sherpa-onnx/csrc/speaker-embedding-extractor-wespeaker-model.cc @@ -61,8 +61,8 @@ class SpeakerEmbeddingExtractorWeSpeakerModel::Impl { Ort::AllocatorWithDefaultOptions allocator; // used in the macro below SHERPA_ONNX_READ_META_DATA(meta_data_.output_dim, "output_dim"); SHERPA_ONNX_READ_META_DATA(meta_data_.sample_rate, "sample_rate"); - SHERPA_ONNX_READ_META_DATA(meta_data_.normalize_features, - "normalize_features"); + SHERPA_ONNX_READ_META_DATA(meta_data_.normalize_samples, + "normalize_samples"); SHERPA_ONNX_READ_META_DATA_STR(meta_data_.language, "language"); std::string framework;