diff --git a/.github/workflows/apk-speaker-diarization.yaml b/.github/workflows/apk-speaker-diarization.yaml index 19f0b99bc..8e422e13f 100644 --- a/.github/workflows/apk-speaker-diarization.yaml +++ b/.github/workflows/apk-speaker-diarization.yaml @@ -4,7 +4,6 @@ on: push: branches: - apk - - android-demo-speaker-diarization-2 workflow_dispatch: @@ -76,6 +75,11 @@ jobs: run: | cd scripts/apk + total=${{ matrix.total }} + index=${{ matrix.index }} + + python3 ./generate-speaker-diarization-apk-script.py --total $total --index $index + chmod +x build-apk-speaker-diarization.sh mv -v ./build-apk-speaker-diarization.sh ../.. diff --git a/.github/workflows/export-revai-segmentation-to-onnx.yaml b/.github/workflows/export-revai-segmentation-to-onnx.yaml new file mode 100644 index 000000000..f0f1594c6 --- /dev/null +++ b/.github/workflows/export-revai-segmentation-to-onnx.yaml @@ -0,0 +1,86 @@ +name: export-revai-segmentation-to-onnx + +on: + workflow_dispatch: + +concurrency: + group: export-revai-segmentation-to-onnx-${{ github.ref }} + cancel-in-progress: true + +jobs: + export-revai-segmentation-to-onnx: + if: github.repository_owner == 'k2-fsa' || github.repository_owner == 'csukuangfj' + name: export revai segmentation models to ONNX + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + os: [macos-latest] + python-version: ["3.10"] + + steps: + - uses: actions/checkout@v4 + + - name: Setup Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + + - name: Install pyannote + shell: bash + run: | + pip install pyannote.audio onnx==1.15.0 onnxruntime==1.16.3 + + - name: Run + shell: bash + run: | + d=sherpa-onnx-reverb-diarization-v1 + src=$PWD/$d + mkdir -p $src + + pushd scripts/pyannote/segmentation + ./run-revai.sh + cp ./*.onnx $src/ + cp ./README.md $src/ + cp ./LICENSE $src/ + cp ./run-revai.sh $src/run.sh + cp ./*.py $src/ + + popd + ls -lh $d + tar cjfv $d.tar.bz2 $d + + - 
name: Release + uses: svenstaro/upload-release-action@v2 + with: + file_glob: true + file: ./*.tar.bz2 + overwrite: true + repo_name: k2-fsa/sherpa-onnx + repo_token: ${{ secrets.UPLOAD_GH_SHERPA_ONNX_TOKEN }} + tag: speaker-segmentation-models + + - name: Publish to huggingface + env: + HF_TOKEN: ${{ secrets.HF_TOKEN }} + uses: nick-fields/retry@v3 + with: + max_attempts: 20 + timeout_seconds: 200 + shell: bash + command: | + git config --global user.email "csukuangfj@gmail.com" + git config --global user.name "Fangjun Kuang" + + d=sherpa-onnx-reverb-diarization-v1 + export GIT_LFS_SKIP_SMUDGE=1 + export GIT_CLONE_PROTECTION_ACTIVE=false + git clone https://huggingface.co/csukuangfj/$d huggingface + cp -v $d/* ./huggingface + cd huggingface + git lfs track "*.onnx" + git status + git add . + git status + git commit -m "add models" + git push https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/$d main diff --git a/scripts/apk/build-apk-speaker-diarization.sh b/scripts/apk/build-apk-speaker-diarization.sh.in similarity index 75% rename from scripts/apk/build-apk-speaker-diarization.sh rename to scripts/apk/build-apk-speaker-diarization.sh.in index 04e294c6c..2b2922172 100755 --- a/scripts/apk/build-apk-speaker-diarization.sh +++ b/scripts/apk/build-apk-speaker-diarization.sh.in @@ -31,15 +31,24 @@ log "====================x86====================" mkdir -p apks +{% for model in model_list %} + pushd ./android/SherpaOnnxSpeakerDiarization/app/src/main/assets/ -curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/sherpa-onnx-pyannote-segmentation-3-0.tar.bz2 -tar xvf sherpa-onnx-pyannote-segmentation-3-0.tar.bz2 -rm sherpa-onnx-pyannote-segmentation-3-0.tar.bz2 -mv sherpa-onnx-pyannote-segmentation-3-0/model.onnx segmentation.onnx -rm -rf sherpa-onnx-pyannote-segmentation-3-0 +ls -lh + +model_name={{ model.model_name }} +short_name={{ model.short_name }} -curl -SL -O 
https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-recongition-models/3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx +curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/$model_name.tar.bz2 +tar xvf $model_name.tar.bz2 +rm $model_name.tar.bz2 +mv $model_name/model.onnx segmentation.onnx +rm -rf $model_name + +if [ ! -f 3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx ]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-recongition-models/3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx +fi echo "pwd: $PWD" ls -lh @@ -65,9 +74,13 @@ for arch in arm64-v8a armeabi-v7a x86_64 x86; do ./gradlew build popd - mv android/SherpaOnnxSpeakerDiarization/app/build/outputs/apk/debug/app-debug.apk ./apks/sherpa-onnx-${SHERPA_ONNX_VERSION}-$arch-speaker-diarization-pyannote_audio-3dspeaker.apk + mv android/SherpaOnnxSpeakerDiarization/app/build/outputs/apk/debug/app-debug.apk ./apks/sherpa-onnx-${SHERPA_ONNX_VERSION}-$arch-speaker-diarization-$short_name-3dspeaker.apk ls -lh apks rm -v ./android/SherpaOnnxSpeakerDiarization/app/src/main/jniLibs/$arch/*.so done +rm -rf ./android/SherpaOnnxSpeakerDiarization/app/src/main/assets/segmentation.onnx + +{% endfor %} + ls -lh apks diff --git a/scripts/apk/generate-speaker-diarization-apk-script.py b/scripts/apk/generate-speaker-diarization-apk-script.py new file mode 100755 index 000000000..688189fad --- /dev/null +++ b/scripts/apk/generate-speaker-diarization-apk-script.py @@ -0,0 +1,89 @@ +#!/usr/bin/env python3 + +import argparse +from dataclasses import dataclass +from typing import List + +import jinja2 + + +def get_args(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--total", + type=int, + default=1, + help="Number of runners", + ) + parser.add_argument( + "--index", + type=int, + default=0, + help="Index of the current runner", + ) + return parser.parse_args() + + +@dataclass +class 
SpeakerSegmentationModel: +    model_name: str +    short_name: str = "" + + +def get_models() -> List[SpeakerSegmentationModel]: +    models = [ +        SpeakerSegmentationModel( +            model_name="sherpa-onnx-pyannote-segmentation-3-0", +            short_name="pyannote_audio", +        ), +        SpeakerSegmentationModel( +            model_name="sherpa-onnx-reverb-diarization-v1", +            short_name="revai_v1", +        ), +    ] + +    return models + + +def main(): +    args = get_args() +    index = args.index +    total = args.total +    assert 0 <= index < total, (index, total) + +    all_model_list = get_models() + +    num_models = len(all_model_list) + +    num_per_runner = num_models // total +    if num_per_runner <= 0: +        raise ValueError(f"num_models: {num_models}, num_runners: {total}") + +    start = index * num_per_runner +    end = start + num_per_runner + +    remaining = num_models - args.total * num_per_runner + +    print(f"{index}/{total}: {start}-{end}/{num_models}") + +    d = dict() +    d["model_list"] = all_model_list[start:end] +    if index < remaining: +        s = args.total * num_per_runner + index +        d["model_list"].append(all_model_list[s]) +        print(f"{s}/{num_models}") + +    filename_list = ["./build-apk-speaker-diarization.sh"] +    for filename in filename_list: +        environment = jinja2.Environment() +        with open(f"{filename}.in") as f: +            s = f.read() +        template = environment.from_string(s) + +        s = template.render(**d) +        with open(filename, "w") as f: +            print(s, file=f) + + +if __name__ == "__main__": +    main() diff --git a/scripts/pyannote/segmentation/export-onnx.py b/scripts/pyannote/segmentation/export-onnx.py index feb241a26..e360f0e70 100755 --- a/scripts/pyannote/segmentation/export-onnx.py +++ b/scripts/pyannote/segmentation/export-onnx.py @@ -1,5 +1,7 @@ #!/usr/bin/env python3 +# Copyright 2024 Xiaomi Corp. 
(authors: Fangjun Kuang) +import os from typing import Any, Dict import onnx @@ -35,6 +37,8 @@ def add_meta_data(filename: str, meta_data: Dict[str, Any]): def main(): # You can download ./pytorch_model.bin from # https://hf-mirror.com/csukuangfj/pyannote-models/tree/main/segmentation-3.0 + # or from + # https://huggingface.co/Revai/reverb-diarization-v1/tree/main pt_filename = "./pytorch_model.bin" model = Model.from_pretrained(pt_filename) model.eval() @@ -94,6 +98,22 @@ def main(): receptive_field_size = int(model.receptive_field.duration * 16000) receptive_field_shift = int(model.receptive_field.step * 16000) + is_revai = os.getenv("SHERPA_ONNX_IS_REVAI", "") + if is_revai == "": + url_1 = "https://huggingface.co/pyannote/segmentation-3.0" + url_2 = "https://huggingface.co/csukuangfj/pyannote-models/tree/main/segmentation-3.0" + license_url = ( + "https://huggingface.co/pyannote/segmentation-3.0/blob/main/LICENSE" + ) + model_author = "pyannote-audio" + else: + url_1 = "https://huggingface.co/Revai/reverb-diarization-v1" + url_2 = "https://huggingface.co/csukuangfj/sherpa-onnx-reverb-diarization-v1" + license_url = ( + "https://huggingface.co/Revai/reverb-diarization-v1/blob/main/LICENSE" + ) + model_author = "Revai" + meta_data = { "num_speakers": len(model.specifications.classes), "powerset_max_classes": model.specifications.powerset_max_classes, @@ -104,11 +124,11 @@ def main(): "receptive_field_shift": receptive_field_shift, "model_type": "pyannote-segmentation-3.0", "version": "1", - "model_author": "pyannote", + "model_author": model_author, "maintainer": "k2-fsa", - "url_1": "https://huggingface.co/pyannote/segmentation-3.0", - "url_2": "https://huggingface.co/csukuangfj/pyannote-models/tree/main/segmentation-3.0", - "license": "https://huggingface.co/pyannote/segmentation-3.0/blob/main/LICENSE", + "url_1": url_1, + "url_2": url_2, + "license": license_url, } add_meta_data(filename=filename, meta_data=meta_data) diff --git 
a/scripts/pyannote/segmentation/preprocess.sh b/scripts/pyannote/segmentation/preprocess.sh index 703420b15..aa423ee08 100755 --- a/scripts/pyannote/segmentation/preprocess.sh +++ b/scripts/pyannote/segmentation/preprocess.sh @@ -1,4 +1,5 @@ #!/usr/bin/env bash +# Copyright 2024 Xiaomi Corp. (authors: Fangjun Kuang) python3 -m onnxruntime.quantization.preprocess --input model.onnx --output tmp.preprocessed.onnx diff --git a/scripts/pyannote/segmentation/run-revai.sh b/scripts/pyannote/segmentation/run-revai.sh new file mode 100755 index 000000000..61f4fec29 --- /dev/null +++ b/scripts/pyannote/segmentation/run-revai.sh @@ -0,0 +1,48 @@ +#!/usr/bin/env bash +# Copyright 2024 Xiaomi Corp. (authors: Fangjun Kuang) + +export SHERPA_ONNX_IS_REVAI=1 + +set -ex +function install_pyannote() { + pip install pyannote.audio onnx onnxruntime +} + +function download_test_files() { + curl -SL -O https://huggingface.co/Revai/reverb-diarization-v1/resolve/main/pytorch_model.bin + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/lei-jun-test.wav +} + +install_pyannote +download_test_files + +./export-onnx.py +./preprocess.sh + +echo "----------torch----------" +./vad-torch.py + +echo "----------onnx model.onnx----------" +./vad-onnx.py --model ./model.onnx --wav ./lei-jun-test.wav + +echo "----------onnx model.int8.onnx----------" +./vad-onnx.py --model ./model.int8.onnx --wav ./lei-jun-test.wav + +curl -SL -O https://huggingface.co/Revai/reverb-diarization-v1/resolve/main/LICENSE + +cat >README.md << EOF +# Introduction + +Models in this file are converted from +https://huggingface.co/Revai/reverb-diarization-v1/tree/main + +Note that it is accessible under a non-commercial license. + +Please see ./LICENSE for details. 
+ +See also +https://www.rev.com/blog/speech-to-text-technology/introducing-reverb-open-source-asr-diarization + +EOF + + diff --git a/scripts/pyannote/segmentation/speaker-diarization-torch.py b/scripts/pyannote/segmentation/speaker-diarization-torch.py index 18a50ec08..ac64bf4ce 100755 --- a/scripts/pyannote/segmentation/speaker-diarization-torch.py +++ b/scripts/pyannote/segmentation/speaker-diarization-torch.py @@ -1,4 +1,5 @@ #!/usr/bin/env python3 +# Copyright 2024 Xiaomi Corp. (authors: Fangjun Kuang) """ Please refer to diff --git a/scripts/pyannote/segmentation/vad-onnx.py b/scripts/pyannote/segmentation/vad-onnx.py index 417b8a842..4a95f3bd0 100755 --- a/scripts/pyannote/segmentation/vad-onnx.py +++ b/scripts/pyannote/segmentation/vad-onnx.py @@ -216,6 +216,8 @@ def main(): is_active = classification[0] > onset start = None + if is_active: + start = 0 scale = m.receptive_field_shift / m.sample_rate scale_offset = m.receptive_field_size / m.sample_rate * 0.5