diff --git a/.github/workflows/wasm-simd-hf-space-en-asr-zipformer.yaml b/.github/workflows/wasm-simd-hf-space-en-asr-zipformer.yaml new file mode 100644 index 000000000..8bfcdf08b --- /dev/null +++ b/.github/workflows/wasm-simd-hf-space-en-asr-zipformer.yaml @@ -0,0 +1,142 @@ +name: wasm-simd-hf-space-en-asr-zipformer + +on: + release: + types: + - published + + workflow_dispatch: + +concurrency: + group: wasm-simd-hf-space-en-asr-zipformer-${{ github.ref }} + cancel-in-progress: true + +jobs: + wasm-simd-hf-space-en-asr-zipformer: + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + os: [ubuntu-latest] + + steps: + - uses: actions/checkout@v4 + with: + fetch-depth: 0 + - name: Install emsdk + uses: mymindstorm/setup-emsdk@v14 + + - name: View emsdk version + shell: bash + run: | + emcc -v + echo "--------------------" + emcc --check + + - name: Download model files + shell: bash + run: | + cd wasm/asr/assets + ls -lh + echo "----------" + + wget -q https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-zipformer-en-2023-06-21.tar.bz2 + tar xvf sherpa-onnx-streaming-zipformer-en-2023-06-21.tar.bz2 + rm sherpa-onnx-streaming-zipformer-en-2023-06-21.tar.bz2 + mv sherpa-onnx-streaming-zipformer-en-2023-06-21/encoder-epoch-99-avg-1.int8.onnx encoder.onnx + mv sherpa-onnx-streaming-zipformer-en-2023-06-21/decoder-epoch-99-avg-1.onnx decoder.onnx + mv sherpa-onnx-streaming-zipformer-en-2023-06-21/joiner-epoch-99-avg-1.onnx joiner.onnx + mv sherpa-onnx-streaming-zipformer-en-2023-06-21/tokens.txt ./ + + rm -rf sherpa-onnx-streaming-zipformer-en-2023-06-21 + + ls -lh + + - name: Build sherpa-onnx for WebAssembly (ASR) + shell: bash + run: | + ./build-wasm-simd-asr.sh + + - name: collect files + shell: bash + run: | + SHERPA_ONNX_VERSION=v$(grep "SHERPA_ONNX_VERSION" ./CMakeLists.txt | cut -d " " -f 2 | cut -d '"' -f 2) + + dst=sherpa-onnx-wasm-simd-${SHERPA_ONNX_VERSION}-en-asr-zipformer + mv 
build-wasm-simd-asr/install/bin/wasm/asr $dst + ls -lh $dst + tar cjfv ${dst}.tar.bz2 ./${dst} + + - name: Upload wasm files + uses: actions/upload-artifact@v4 + with: + name: sherpa-onnx-wasm-simd-en-asr-zipformer + path: ./sherpa-onnx-wasm-simd-*.tar.bz2 + + - name: Publish to ModelScope + env: + MS_TOKEN: ${{ secrets.MODEL_SCOPE_GIT_TOKEN }} + uses: nick-fields/retry@v2 + with: + max_attempts: 20 + timeout_seconds: 200 + shell: bash + command: | + SHERPA_ONNX_VERSION=v$(grep "SHERPA_ONNX_VERSION" ./CMakeLists.txt | cut -d " " -f 2 | cut -d '"' -f 2) + + git config --global user.email "csukuangfj@gmail.com" + git config --global user.name "Fangjun Kuang" + + rm -rf ms + export GIT_LFS_SKIP_SMUDGE=1 + + git clone https://www.modelscope.cn/studios/k2-fsa/web-assembly-asr-sherpa-onnx-en.git ms + cd ms + git fetch + git pull + git merge -m "merge remote" --ff origin main + + cp -v ../sherpa-onnx-wasm-simd-${SHERPA_ONNX_VERSION}-*/* . + + git status + git lfs track "*.data" + git lfs track "*.wasm" + ls -lh + + git add . + git commit -m "update model" + git push https://oauth2:${MS_TOKEN}@www.modelscope.cn/studios/k2-fsa/web-assembly-asr-sherpa-onnx-en.git + + - name: Publish to huggingface + env: + HF_TOKEN: ${{ secrets.HF_TOKEN }} + uses: nick-fields/retry@v2 + with: + max_attempts: 20 + timeout_seconds: 200 + shell: bash + command: | + SHERPA_ONNX_VERSION=v$(grep "SHERPA_ONNX_VERSION" ./CMakeLists.txt | cut -d " " -f 2 | cut -d '"' -f 2) + + git config --global user.email "csukuangfj@gmail.com" + git config --global user.name "Fangjun Kuang" + + rm -rf huggingface + export GIT_LFS_SKIP_SMUDGE=1 + + git clone https://huggingface.co/spaces/k2-fsa/web-assembly-asr-sherpa-onnx-en huggingface + cd huggingface + git fetch + git pull + git merge -m "merge remote" --ff origin main + + cp -v ../sherpa-onnx-wasm-simd-${SHERPA_ONNX_VERSION}-*/* . + + git status + git lfs track "*.data" + git lfs track "*.wasm" + ls -lh + + git add . 
+ git commit -m "update model" + git push https://csukuangfj:$HF_TOKEN@huggingface.co/spaces/k2-fsa/web-assembly-asr-sherpa-onnx-en main diff --git a/.github/workflows/wasm-simd-hf-space-zh-en-asr-paraformer.yaml b/.github/workflows/wasm-simd-hf-space-zh-en-asr-paraformer.yaml new file mode 100644 index 000000000..dc6ce30dd --- /dev/null +++ b/.github/workflows/wasm-simd-hf-space-zh-en-asr-paraformer.yaml @@ -0,0 +1,149 @@ +name: wasm-simd-hf-space-zh-en-asr-paraformer + +on: + release: + types: + - published + + workflow_dispatch: + +concurrency: + group: wasm-simd-hf-space-zh-en-asr-paraformer-${{ github.ref }} + cancel-in-progress: true + +jobs: + wasm-simd-hf-space-zh-en-asr-paraformer: + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + os: [ubuntu-latest] + + steps: + - uses: actions/checkout@v4 + with: + fetch-depth: 0 + - name: Install emsdk + uses: mymindstorm/setup-emsdk@v14 + + - name: View emsdk version + shell: bash + run: | + emcc -v + echo "--------------------" + emcc --check + + - name: Download model files + shell: bash + run: | + cd wasm/asr/assets + ls -lh + echo "----------" + + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-paraformer-bilingual-zh-en.tar.bz2 + tar xvf sherpa-onnx-streaming-paraformer-bilingual-zh-en.tar.bz2 + rm sherpa-onnx-streaming-paraformer-bilingual-zh-en.tar.bz2 + + mv sherpa-onnx-streaming-paraformer-bilingual-zh-en/encoder.int8.onnx encoder.onnx + mv sherpa-onnx-streaming-paraformer-bilingual-zh-en/decoder.int8.onnx decoder.onnx + mv sherpa-onnx-streaming-paraformer-bilingual-zh-en/tokens.txt ./ + + rm -rf sherpa-onnx-streaming-paraformer-bilingual-zh-en + + ls -lh + + cd ../ + + sed -i.bak s/"type = 0"/"type = 1"/g ./sherpa-onnx.js + sed -i.bak s/Zipformer/Paraformer/g ./index.html + + git diff + + - name: Build sherpa-onnx for WebAssembly (ASR) + shell: bash + run: | + ./build-wasm-simd-asr.sh + + - name: collect files + shell: bash + run: | + 
SHERPA_ONNX_VERSION=v$(grep "SHERPA_ONNX_VERSION" ./CMakeLists.txt | cut -d " " -f 2 | cut -d '"' -f 2) + + dst=sherpa-onnx-wasm-simd-${SHERPA_ONNX_VERSION}-zh-en-asr-paraformer + mv build-wasm-simd-asr/install/bin/wasm/asr $dst + ls -lh $dst + tar cjfv ${dst}.tar.bz2 ./${dst} + + - name: Upload wasm files + uses: actions/upload-artifact@v4 + with: + name: sherpa-onnx-wasm-simd-zh-en-asr-paraformer + path: ./sherpa-onnx-wasm-simd-*.tar.bz2 + + - name: Publish to ModelScope + env: + MS_TOKEN: ${{ secrets.MODEL_SCOPE_GIT_TOKEN }} + uses: nick-fields/retry@v2 + with: + max_attempts: 20 + timeout_seconds: 200 + shell: bash + command: | + SHERPA_ONNX_VERSION=v$(grep "SHERPA_ONNX_VERSION" ./CMakeLists.txt | cut -d " " -f 2 | cut -d '"' -f 2) + + git config --global user.email "csukuangfj@gmail.com" + git config --global user.name "Fangjun Kuang" + + rm -rf ms + export GIT_LFS_SKIP_SMUDGE=1 + + git clone https://www.modelscope.cn/studios/k2-fsa/web-assembly-asr-sherpa-onnx-zh-en-paraformer.git ms + cd ms + git fetch + git pull + git merge -m "merge remote" --ff origin main + + cp -v ../sherpa-onnx-wasm-simd-${SHERPA_ONNX_VERSION}-*/* . + + git status + git lfs track "*.data" + git lfs track "*.wasm" + ls -lh + + git add . 
+ git commit -m "update model" + git push https://oauth2:${MS_TOKEN}@www.modelscope.cn/studios/k2-fsa/web-assembly-asr-sherpa-onnx-zh-en-paraformer.git + + - name: Publish to huggingface + env: + HF_TOKEN: ${{ secrets.HF_TOKEN }} + uses: nick-fields/retry@v2 + with: + max_attempts: 20 + timeout_seconds: 200 + shell: bash + command: | + SHERPA_ONNX_VERSION=v$(grep "SHERPA_ONNX_VERSION" ./CMakeLists.txt | cut -d " " -f 2 | cut -d '"' -f 2) + + git config --global user.email "csukuangfj@gmail.com" + git config --global user.name "Fangjun Kuang" + + rm -rf huggingface + export GIT_LFS_SKIP_SMUDGE=1 + + git clone https://huggingface.co/spaces/k2-fsa/web-assembly-asr-sherpa-onnx-zh-en-paraformer huggingface + cd huggingface + git fetch + git pull + git merge -m "merge remote" --ff origin main + + cp -v ../sherpa-onnx-wasm-simd-${SHERPA_ONNX_VERSION}-*/* . + + git status + git lfs track "*.data" + git lfs track "*.wasm" + ls -lh + + git add . + git commit -m "update model" + git push https://csukuangfj:$HF_TOKEN@huggingface.co/spaces/k2-fsa/web-assembly-asr-sherpa-onnx-zh-en-paraformer main diff --git a/.github/workflows/wasm-simd-hf-space-zh-en-asr-zipformer.yaml b/.github/workflows/wasm-simd-hf-space-zh-en-asr-zipformer.yaml new file mode 100644 index 000000000..eb5262afa --- /dev/null +++ b/.github/workflows/wasm-simd-hf-space-zh-en-asr-zipformer.yaml @@ -0,0 +1,140 @@ +name: wasm-simd-hf-space-zh-en-asr-zipformer + +on: + release: + types: + - published + + workflow_dispatch: + +concurrency: + group: wasm-simd-hf-space-zh-en-asr-zipformer-${{ github.ref }} + cancel-in-progress: true + +jobs: + wasm-simd-hf-space-zh-en-asr-zipformer: + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + os: [ubuntu-latest] + + steps: + - uses: actions/checkout@v4 + with: + fetch-depth: 0 + - name: Install emsdk + uses: mymindstorm/setup-emsdk@v14 + + - name: View emsdk version + shell: bash + run: | + emcc -v + echo "--------------------" + emcc --check + + - name: 
Download model files + shell: bash + run: | + cd wasm/asr/assets + ls -lh + echo "----------" + wget -q https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20.tar.bz2 + tar xvf sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20.tar.bz2 + rm sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20.tar.bz2 + mv sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/encoder-epoch-99-avg-1.int8.onnx encoder.onnx + mv sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/decoder-epoch-99-avg-1.onnx decoder.onnx + mv sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/joiner-epoch-99-avg-1.int8.onnx joiner.onnx + mv sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/tokens.txt ./ + rm -rf sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/ + + ls -lh + + - name: Build sherpa-onnx for WebAssembly (ASR) + shell: bash + run: | + ./build-wasm-simd-asr.sh + + - name: collect files + shell: bash + run: | + SHERPA_ONNX_VERSION=v$(grep "SHERPA_ONNX_VERSION" ./CMakeLists.txt | cut -d " " -f 2 | cut -d '"' -f 2) + + dst=sherpa-onnx-wasm-simd-${SHERPA_ONNX_VERSION}-zh-en-asr-zipformer + mv build-wasm-simd-asr/install/bin/wasm/asr $dst + ls -lh $dst + tar cjfv ${dst}.tar.bz2 ./${dst} + + - name: Upload wasm files + uses: actions/upload-artifact@v4 + with: + name: sherpa-onnx-wasm-simd-zh-en-asr-zipformer + path: ./sherpa-onnx-wasm-simd-*.tar.bz2 + + - name: Publish to ModelScope + env: + MS_TOKEN: ${{ secrets.MODEL_SCOPE_GIT_TOKEN }} + uses: nick-fields/retry@v2 + with: + max_attempts: 20 + timeout_seconds: 200 + shell: bash + command: | + SHERPA_ONNX_VERSION=v$(grep "SHERPA_ONNX_VERSION" ./CMakeLists.txt | cut -d " " -f 2 | cut -d '"' -f 2) + + git config --global user.email "csukuangfj@gmail.com" + git config --global user.name "Fangjun Kuang" + + rm -rf ms + export GIT_LFS_SKIP_SMUDGE=1 + + git clone 
https://www.modelscope.cn/studios/k2-fsa/web-assembly-asr-sherpa-onnx-zh-en.git ms + cd ms + git fetch + git pull + git merge -m "merge remote" --ff origin main + + cp -v ../sherpa-onnx-wasm-simd-${SHERPA_ONNX_VERSION}-*/* . + + git status + git lfs track "*.data" + git lfs track "*.wasm" + ls -lh + + git add . + git commit -m "update model" + git push https://oauth2:${MS_TOKEN}@www.modelscope.cn/studios/k2-fsa/web-assembly-asr-sherpa-onnx-zh-en.git + + - name: Publish to huggingface + env: + HF_TOKEN: ${{ secrets.HF_TOKEN }} + uses: nick-fields/retry@v2 + with: + max_attempts: 20 + timeout_seconds: 200 + shell: bash + command: | + SHERPA_ONNX_VERSION=v$(grep "SHERPA_ONNX_VERSION" ./CMakeLists.txt | cut -d " " -f 2 | cut -d '"' -f 2) + + git config --global user.email "csukuangfj@gmail.com" + git config --global user.name "Fangjun Kuang" + + rm -rf huggingface + export GIT_LFS_SKIP_SMUDGE=1 + + git clone https://huggingface.co/spaces/k2-fsa/web-assembly-asr-sherpa-onnx-zh-en huggingface + cd huggingface + git fetch + git pull + git merge -m "merge remote" --ff origin main + + cp -v ../sherpa-onnx-wasm-simd-${SHERPA_ONNX_VERSION}-*/* . + + git status + git lfs track "*.data" + git lfs track "*.wasm" + ls -lh + + git add . 
+ git commit -m "update model" + git push https://csukuangfj:$HF_TOKEN@huggingface.co/spaces/k2-fsa/web-assembly-asr-sherpa-onnx-zh-en main diff --git a/.gitignore b/.gitignore index e87236843..47d10a97a 100644 --- a/.gitignore +++ b/.gitignore @@ -81,3 +81,6 @@ vits-piper-en_US-amy-low vits-piper-*-*-* log *.exe +vits-piper-* +vits-coqui-* +vits-mms-* diff --git a/CMakeLists.txt b/CMakeLists.txt index 03dbe3f78..fac8af55f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -22,6 +22,7 @@ option(SHERPA_ONNX_ENABLE_WEBSOCKET "Whether to build webscoket server/client" O option(SHERPA_ONNX_ENABLE_GPU "Enable ONNX Runtime GPU support" OFF) option(SHERPA_ONNX_ENABLE_WASM "Whether to enable WASM" OFF) option(SHERPA_ONNX_ENABLE_WASM_TTS "Whether to enable WASM for TTS" OFF) +option(SHERPA_ONNX_ENABLE_WASM_ASR "Whether to enable WASM for ASR" OFF) option(SHERPA_ONNX_ENABLE_BINARY "Whether to build binaries" ON) option(SHERPA_ONNX_LINK_LIBSTDCPP_STATICALLY "True to link libstdc++ statically. Used only when BUILD_SHARED_LIBS is OFF on Linux" ON) @@ -106,10 +107,17 @@ message(STATUS "SHERPA_ONNX_ENABLE_WEBSOCKET ${SHERPA_ONNX_ENABLE_WEBSOCKET}") message(STATUS "SHERPA_ONNX_ENABLE_GPU ${SHERPA_ONNX_ENABLE_GPU}") message(STATUS "SHERPA_ONNX_ENABLE_WASM ${SHERPA_ONNX_ENABLE_WASM}") message(STATUS "SHERPA_ONNX_ENABLE_WASM_TTS ${SHERPA_ONNX_ENABLE_WASM_TTS}") +message(STATUS "SHERPA_ONNX_ENABLE_WASM_ASR ${SHERPA_ONNX_ENABLE_WASM_ASR}") if(SHERPA_ONNX_ENABLE_WASM_TTS) if(NOT SHERPA_ONNX_ENABLE_WASM) - message(FATAL_ERROR "Please set SHERPA_ONNX_ENABLE_WASM to ON if you enable WASM for tts") + message(FATAL_ERROR "Please set SHERPA_ONNX_ENABLE_WASM to ON if you enable WASM for TTS") + endif() +endif() + +if(SHERPA_ONNX_ENABLE_WASM_ASR) + if(NOT SHERPA_ONNX_ENABLE_WASM) + message(FATAL_ERROR "Please set SHERPA_ONNX_ENABLE_WASM to ON if you enable WASM for ASR") endif() endif() diff --git a/build-wasm-simd-asr.sh b/build-wasm-simd-asr.sh new file mode 100755 index 000000000..b894087ac 
--- /dev/null +++ b/build-wasm-simd-asr.sh @@ -0,0 +1,61 @@ +#!/usr/bin/env bash +# Copyright (c) 2024 Xiaomi Corporation +# +# This script is to build sherpa-onnx for WebAssembly (ASR) + +set -ex + +if [ x"$EMSCRIPTEN" == x"" ]; then + if ! command -v emcc &> /dev/null; then + echo "Please install emscripten first" + echo "" + echo "You can use the following commands to install it:" + echo "" + echo "git clone https://github.com/emscripten-core/emsdk.git" + echo "cd emsdk" + echo "git pull" + echo "./emsdk install latest" + echo "./emsdk activate latest" + echo "source ./emsdk_env.sh" + exit 1 + else + EMSCRIPTEN=$(dirname $(realpath $(which emcc))) + fi +fi + +export EMSCRIPTEN=$EMSCRIPTEN +echo "EMSCRIPTEN: $EMSCRIPTEN" +if [ ! -f $EMSCRIPTEN/cmake/Modules/Platform/Emscripten.cmake ]; then + echo "Cannot find $EMSCRIPTEN/cmake/Modules/Platform/Emscripten.cmake" + echo "Please make sure you have installed emsdk correctly" + exit 1 +fi + +mkdir -p build-wasm-simd-asr +pushd build-wasm-simd-asr + +export SHERPA_ONNX_IS_USING_BUILD_WASM_SH=ON + +cmake \ + -DCMAKE_INSTALL_PREFIX=./install \ + -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_TOOLCHAIN_FILE=$EMSCRIPTEN/cmake/Modules/Platform/Emscripten.cmake \ + \ + -DSHERPA_ONNX_ENABLE_PYTHON=OFF \ + -DSHERPA_ONNX_ENABLE_TESTS=OFF \ + -DSHERPA_ONNX_ENABLE_CHECK=OFF \ + -DBUILD_SHARED_LIBS=OFF \ + -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \ + -DSHERPA_ONNX_ENABLE_JNI=OFF \ + -DSHERPA_ONNX_ENABLE_C_API=ON \ + -DSHERPA_ONNX_ENABLE_WEBSOCKET=OFF \ + -DSHERPA_ONNX_ENABLE_GPU=OFF \ + -DSHERPA_ONNX_ENABLE_WASM=ON \ + -DSHERPA_ONNX_ENABLE_WASM_ASR=ON \ + -DSHERPA_ONNX_ENABLE_BINARY=OFF \ + -DSHERPA_ONNX_LINK_LIBSTDCPP_STATICALLY=OFF \ + .. 
+make -j2 +make install + +ls -lh install/bin/wasm/asr diff --git a/build-wasm-simd-tts.sh b/build-wasm-simd-tts.sh index 085a4d892..6835e4c43 100755 --- a/build-wasm-simd-tts.sh +++ b/build-wasm-simd-tts.sh @@ -1,7 +1,7 @@ #!/usr/bin/env bash # Copyright (c) 2024 Xiaomi Corporation # -# This script is to build sherpa-onnx for WebAssembly +# This script is to build sherpa-onnx for WebAssembly (TTS) set -ex diff --git a/sherpa-onnx/csrc/alsa.cc b/sherpa-onnx/csrc/alsa.cc index 0370bbe0d..3c883331a 100644 --- a/sherpa-onnx/csrc/alsa.cc +++ b/sherpa-onnx/csrc/alsa.cc @@ -37,7 +37,7 @@ card 3: UACDemoV10 [UACDemoV1.0], device 0: USB Audio [USB Audio] and if you want to select card 3 and the device 0 on that card, please use: - hw:3,0 + plughw:3,0 )"; diff --git a/sherpa-onnx/csrc/online-zipformer-transducer-model.cc b/sherpa-onnx/csrc/online-zipformer-transducer-model.cc index 31234ae74..b7e16cb67 100644 --- a/sherpa-onnx/csrc/online-zipformer-transducer-model.cc +++ b/sherpa-onnx/csrc/online-zipformer-transducer-model.cc @@ -107,11 +107,12 @@ void OnlineZipformerTransducerModel::InitEncoder(void *model_data, if (config_.debug) { auto print = [](const std::vector &v, const char *name) { - fprintf(stderr, "%s: ", name); + std::ostringstream os; + os << name << ": "; for (auto i : v) { - fprintf(stderr, "%d ", i); + os << i << " "; } - fprintf(stderr, "\n"); + SHERPA_ONNX_LOGE("%s\n", os.str().c_str()); }; print(encoder_dims_, "encoder_dims"); print(attention_dims_, "attention_dims"); diff --git a/sherpa-onnx/csrc/online-zipformer2-ctc-model.cc b/sherpa-onnx/csrc/online-zipformer2-ctc-model.cc index 1146f00b2..aff4e5cb8 100644 --- a/sherpa-onnx/csrc/online-zipformer2-ctc-model.cc +++ b/sherpa-onnx/csrc/online-zipformer2-ctc-model.cc @@ -282,11 +282,12 @@ class OnlineZipformer2CtcModel::Impl { if (config_.debug) { auto print = [](const std::vector &v, const char *name) { - fprintf(stderr, "%s: ", name); + std::ostringstream os; + os << name << ": "; for (auto i : v) { - 
fprintf(stderr, "%d ", i); + os << i << " "; } - fprintf(stderr, "\n"); + SHERPA_ONNX_LOGE("%s\n", os.str().c_str()); }; print(encoder_dims_, "encoder_dims"); print(query_head_dims_, "query_head_dims"); diff --git a/sherpa-onnx/csrc/online-zipformer2-transducer-model.cc b/sherpa-onnx/csrc/online-zipformer2-transducer-model.cc index e818b0bc9..0d9ea08a4 100644 --- a/sherpa-onnx/csrc/online-zipformer2-transducer-model.cc +++ b/sherpa-onnx/csrc/online-zipformer2-transducer-model.cc @@ -111,11 +111,12 @@ void OnlineZipformer2TransducerModel::InitEncoder(void *model_data, if (config_.debug) { auto print = [](const std::vector &v, const char *name) { - fprintf(stderr, "%s: ", name); + std::ostringstream os; + os << name << ": "; for (auto i : v) { - fprintf(stderr, "%d ", i); + os << i << " "; } - fprintf(stderr, "\n"); + SHERPA_ONNX_LOGE("%s\n", os.str().c_str()); }; print(encoder_dims_, "encoder_dims"); print(query_head_dims_, "query_head_dims"); diff --git a/sherpa-onnx/csrc/sherpa-onnx-alsa.cc b/sherpa-onnx/csrc/sherpa-onnx-alsa.cc index 3301353ad..16d9bab42 100644 --- a/sherpa-onnx/csrc/sherpa-onnx-alsa.cc +++ b/sherpa-onnx/csrc/sherpa-onnx-alsa.cc @@ -54,10 +54,6 @@ card 3: UACDemoV10 [UACDemoV1.0], device 0: USB Audio [USB Audio] and if you want to select card 3 and the device 0 on that card, please use: - hw:3,0 - -or - plughw:3,0 as the device_name. 
diff --git a/wasm/CMakeLists.txt b/wasm/CMakeLists.txt index 2db9a3aa0..dc077a23d 100644 --- a/wasm/CMakeLists.txt +++ b/wasm/CMakeLists.txt @@ -1,3 +1,7 @@ if(SHERPA_ONNX_ENABLE_WASM_TTS) add_subdirectory(tts) endif() + +if(SHERPA_ONNX_ENABLE_WASM_ASR) + add_subdirectory(asr) +endif() diff --git a/wasm/asr/.gitignore b/wasm/asr/.gitignore new file mode 100644 index 000000000..751553b3a --- /dev/null +++ b/wasm/asr/.gitignore @@ -0,0 +1 @@ +*.bak diff --git a/wasm/asr/CMakeLists.txt b/wasm/asr/CMakeLists.txt new file mode 100644 index 000000000..db8a077eb --- /dev/null +++ b/wasm/asr/CMakeLists.txt @@ -0,0 +1,62 @@ +if(NOT $ENV{SHERPA_ONNX_IS_USING_BUILD_WASM_SH}) + message(FATAL_ERROR "Please use ./build-wasm-simd-asr.sh to build for wasm ASR") +endif() + +if(NOT EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/assets/encoder.onnx") + message(FATAL_ERROR "Please read ${CMAKE_CURRENT_SOURCE_DIR}/assets/README.md before you continue") +endif() + +set(exported_functions + MyPrint + # online ASR + AcceptWaveform + CreateOnlineRecognizer + CreateOnlineStream + DecodeOnlineStream + DestroyOnlineRecognizer + DestroyOnlineRecognizerResult + DestroyOnlineStream + GetOnlineStreamResult + InputFinished + IsEndpoint + IsOnlineStreamReady + Reset + # +) +set(mangled_exported_functions) +foreach(x IN LISTS exported_functions) + list(APPEND mangled_exported_functions "_${x}") +endforeach() +list(JOIN mangled_exported_functions "," all_exported_functions) + +include_directories(${CMAKE_SOURCE_DIR}) +set(MY_FLAGS " -s FORCE_FILESYSTEM=1 -s INITIAL_MEMORY=512MB -s ALLOW_MEMORY_GROWTH=1") +string(APPEND MY_FLAGS " -sSTACK_SIZE=10485760 ") # 10MB +string(APPEND MY_FLAGS " -sEXPORTED_FUNCTIONS=[_CopyHeap,_malloc,_free,${all_exported_functions}] ") +string(APPEND MY_FLAGS "--preload-file ${CMAKE_CURRENT_SOURCE_DIR}/assets@. 
") +string(APPEND MY_FLAGS " -sEXPORTED_RUNTIME_METHODS=['ccall','stringToUTF8','setValue','getValue'] ") + +set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${MY_FLAGS}") +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${MY_FLAGS}") +set(CMAKE_EXECUTBLE_LINKER_FLAGS "${CMAKE_EXECUTBLE_LINKER_FLAGS} ${MY_FLAGS}") + +if (NOT CMAKE_EXECUTABLE_SUFFIX STREQUAL ".js") + message(FATAL_ERROR "The default suffix for building executables should be .js!") +endif() +# set(CMAKE_EXECUTABLE_SUFFIX ".html") + +add_executable(sherpa-onnx-wasm-asr-main sherpa-onnx-wasm-asr-main.cc) +target_link_libraries(sherpa-onnx-wasm-asr-main sherpa-onnx-c-api) +install(TARGETS sherpa-onnx-wasm-asr-main DESTINATION bin/wasm/asr) + +install( + FILES + "$/sherpa-onnx-wasm-asr-main.js" + "index.html" + "sherpa-onnx.js" + "app.js" + "$/sherpa-onnx-wasm-asr-main.wasm" + "$/sherpa-onnx-wasm-asr-main.data" + DESTINATION + bin/wasm/asr +) diff --git a/wasm/asr/app.js b/wasm/asr/app.js new file mode 100644 index 000000000..cb27db97f --- /dev/null +++ b/wasm/asr/app.js @@ -0,0 +1,299 @@ +// This file copies and modifies code +// from https://mdn.github.io/web-dictaphone/scripts/app.js +// and https://gist.github.com/meziantou/edb7217fddfbb70e899e + +const startBtn = document.getElementById('startBtn'); +const stopBtn = document.getElementById('stopBtn'); +const clearBtn = document.getElementById('clearBtn'); +const hint = document.getElementById('hint'); +const soundClips = document.getElementById('sound-clips'); + +let textArea = document.getElementById('results'); + +let lastResult = ''; +let resultList = []; + +clearBtn.onclick = function() { + resultList = []; + textArea.value = getDisplayResult(); + textArea.scrollTop = textArea.scrollHeight; // auto scroll +}; + +function getDisplayResult() { + let i = 0; + let ans = ''; + for (let s in resultList) { + if (resultList[s] == '') { + continue; + } + + ans += '' + i + ': ' + resultList[s] + '\n'; + i += 1; + } + + if (lastResult.length > 0) { + ans += '' + i + ': ' + 
lastResult + '\n'; + } + return ans; +} + + +Module = {}; +Module.onRuntimeInitialized = function() { + console.log('inited!'); + hint.innerText = 'Model loaded! Please click start'; + + startBtn.disabled = false; + + recognizer = createRecognizer(); + console.log('recognizer is created!', recognizer); +}; + +let audioCtx; +let mediaStream; + +let expectedSampleRate = 16000; +let recordSampleRate; // the sampleRate of the microphone +let recorder = null; // the microphone +let leftchannel = []; // TODO: Use a single channel + +let recordingLength = 0; // number of samples so far + +let recognizer = null; +let recognizer_stream = null; + +if (navigator.mediaDevices.getUserMedia) { + console.log('getUserMedia supported.'); + + // see https://w3c.github.io/mediacapture-main/#dom-mediadevices-getusermedia + const constraints = {audio: true}; + + let onSuccess = function(stream) { + if (!audioCtx) { + audioCtx = new AudioContext({sampleRate: 16000}); + } + console.log(audioCtx); + recordSampleRate = audioCtx.sampleRate; + console.log('sample rate ' + recordSampleRate); + + // creates an audio node from the microphone incoming stream + mediaStream = audioCtx.createMediaStreamSource(stream); + console.log('media stream', mediaStream); + + // https://developer.mozilla.org/en-US/docs/Web/API/AudioContext/createScriptProcessor + // bufferSize: the onaudioprocess event is called when the buffer is full + var bufferSize = 4096; + var numberOfInputChannels = 1; + var numberOfOutputChannels = 2; + if (audioCtx.createScriptProcessor) { + recorder = audioCtx.createScriptProcessor( + bufferSize, numberOfInputChannels, numberOfOutputChannels); + } else { + recorder = audioCtx.createJavaScriptNode( + bufferSize, numberOfInputChannels, numberOfOutputChannels); + } + console.log('recorder', recorder); + + recorder.onaudioprocess = function(e) { + let samples = new Float32Array(e.inputBuffer.getChannelData(0)) + samples = downsampleBuffer(samples, expectedSampleRate); + + if 
(recognizer_stream == null) { + recognizer_stream = recognizer.createStream(); + } + + recognizer_stream.acceptWaveform(expectedSampleRate, samples); + while (recognizer.isReady(recognizer_stream)) { + recognizer.decode(recognizer_stream); + } + + let isEndpoint = recognizer.isEndpoint(recognizer_stream); + let result = recognizer.getResult(recognizer_stream); + + + if (result.length > 0 && lastResult != result) { + lastResult = result; + } + + if (isEndpoint) { + if (lastResult.length > 0) { + resultList.push(lastResult); + lastResult = ''; + } + recognizer.reset(recognizer_stream); + } + + textArea.value = getDisplayResult(); + textArea.scrollTop = textArea.scrollHeight; // auto scroll + + let buf = new Int16Array(samples.length); + for (var i = 0; i < samples.length; ++i) { + let s = samples[i]; + if (s >= 1) + s = 1; + else if (s <= -1) + s = -1; + + samples[i] = s; + buf[i] = s * 32767; + } + + leftchannel.push(buf); + recordingLength += bufferSize; + }; + + startBtn.onclick = function() { + mediaStream.connect(recorder); + recorder.connect(audioCtx.destination); + + console.log('recorder started'); + + stopBtn.disabled = false; + startBtn.disabled = true; + }; + + stopBtn.onclick = function() { + console.log('recorder stopped'); + + // stopBtn recording + recorder.disconnect(audioCtx.destination); + mediaStream.disconnect(recorder); + + startBtn.style.background = ''; + startBtn.style.color = ''; + // mediaRecorder.requestData(); + + stopBtn.disabled = true; + startBtn.disabled = false; + + var clipName = new Date().toISOString(); + + const clipContainer = document.createElement('article'); + const clipLabel = document.createElement('p'); + const audio = document.createElement('audio'); + const deleteButton = document.createElement('button'); + clipContainer.classList.add('clip'); + audio.setAttribute('controls', ''); + deleteButton.textContent = 'Delete'; + deleteButton.className = 'delete'; + + clipLabel.textContent = clipName; + + 
clipContainer.appendChild(audio); + + clipContainer.appendChild(clipLabel); + clipContainer.appendChild(deleteButton); + soundClips.appendChild(clipContainer); + + audio.controls = true; + let samples = flatten(leftchannel); + const blob = toWav(samples); + + leftchannel = []; + const audioURL = window.URL.createObjectURL(blob); + audio.src = audioURL; + console.log('recorder stopped'); + + deleteButton.onclick = function(e) { + let evtTgt = e.target; + evtTgt.parentNode.parentNode.removeChild(evtTgt.parentNode); + }; + + clipLabel.onclick = function() { + const existingName = clipLabel.textContent; + const newClipName = prompt('Enter a new name for your sound clip?'); + if (newClipName === null) { + clipLabel.textContent = existingName; + } else { + clipLabel.textContent = newClipName; + } + }; + }; + }; + + let onError = function(err) { + console.log('The following error occured: ' + err); + }; + + navigator.mediaDevices.getUserMedia(constraints).then(onSuccess, onError); +} else { + console.log('getUserMedia not supported on your browser!'); + alert('getUserMedia not supported on your browser!'); +} + + +// this function is copied/modified from +// https://gist.github.com/meziantou/edb7217fddfbb70e899e +function flatten(listOfSamples) { + let n = 0; + for (let i = 0; i < listOfSamples.length; ++i) { + n += listOfSamples[i].length; + } + let ans = new Int16Array(n); + + let offset = 0; + for (let i = 0; i < listOfSamples.length; ++i) { + ans.set(listOfSamples[i], offset); + offset += listOfSamples[i].length; + } + return ans; +} + +// this function is copied/modified from +// https://gist.github.com/meziantou/edb7217fddfbb70e899e +function toWav(samples) { + let buf = new ArrayBuffer(44 + samples.length * 2); + var view = new DataView(buf); + + // http://soundfile.sapp.org/doc/WaveFormat/ + // F F I R + view.setUint32(0, 0x46464952, true); // chunkID + view.setUint32(4, 36 + samples.length * 2, true); // chunkSize + // E V A W + view.setUint32(8, 0x45564157, 
true); // format + // + // t m f + view.setUint32(12, 0x20746d66, true); // subchunk1ID + view.setUint32(16, 16, true); // subchunk1Size, 16 for PCM + view.setUint32(20, 1, true); // audioFormat, 1 for PCM + view.setUint16(22, 1, true); // numChannels: 1 channel + view.setUint32(24, expectedSampleRate, true); // sampleRate + view.setUint32(28, expectedSampleRate * 2, true); // byteRate + view.setUint16(32, 2, true); // blockAlign + view.setUint16(34, 16, true); // bitsPerSample + view.setUint32(36, 0x61746164, true); // Subchunk2ID + view.setUint32(40, samples.length * 2, true); // subchunk2Size + + let offset = 44; + for (let i = 0; i < samples.length; ++i) { + view.setInt16(offset, samples[i], true); + offset += 2; + } + + return new Blob([view], {type: 'audio/wav'}); +} + +// this function is copied from +// https://github.com/awslabs/aws-lex-browser-audio-capture/blob/master/lib/worker.js#L46 +function downsampleBuffer(buffer, exportSampleRate) { + if (exportSampleRate === recordSampleRate) { + return buffer; + } + var sampleRateRatio = recordSampleRate / exportSampleRate; + var newLength = Math.round(buffer.length / sampleRateRatio); + var result = new Float32Array(newLength); + var offsetResult = 0; + var offsetBuffer = 0; + while (offsetResult < result.length) { + var nextOffsetBuffer = Math.round((offsetResult + 1) * sampleRateRatio); + var accum = 0, count = 0; + for (var i = offsetBuffer; i < nextOffsetBuffer && i < buffer.length; i++) { + accum += buffer[i]; + count++; + } + result[offsetResult] = accum / count; + offsetResult++; + offsetBuffer = nextOffsetBuffer; + } + return result; +}; diff --git a/wasm/asr/assets/.gitignore b/wasm/asr/assets/.gitignore new file mode 100644 index 000000000..e69de29bb diff --git a/wasm/asr/assets/README.md b/wasm/asr/assets/README.md new file mode 100644 index 000000000..2d4bb5c83 --- /dev/null +++ b/wasm/asr/assets/README.md @@ -0,0 +1,82 @@ +# Introduction + +Please refer to 
+https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models +or +https://k2-fsa.github.io/sherpa/onnx/pretrained_models/index.html +to download a model. + +# Streaming ASR + +## Transducer +```bash +cd sherpa-onnx/wasm/asr/assets + +wget -q https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20.tar.bz2 +tar xvf sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20.tar.bz2 +rm sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20.tar.bz2 + +# Note it is not an error that we rename encoder.int8.onnx to encoder.onnx + +mv sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/encoder-epoch-99-avg-1.int8.onnx encoder.onnx +mv sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/decoder-epoch-99-avg-1.onnx decoder.onnx +mv sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/joiner-epoch-99-avg-1.int8.onnx joiner.onnx +mv sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/tokens.txt ./ +rm -rf sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/ + +cd ../../.. + +./build-wasm-simd-asr.sh +``` + +You should have the following files in `assets` before you can run +`build-wasm-simd-asr.sh` + +``` +assets fangjun$ tree -L 1 +. 
+├── README.md
+├── decoder.onnx
+├── encoder.onnx
+├── joiner.onnx
+└── tokens.txt
+
+0 directories, 5 files
+```
+
+## Paraformer
+
+```
+wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-paraformer-bilingual-zh-en.tar.bz2
+tar xvf sherpa-onnx-streaming-paraformer-bilingual-zh-en.tar.bz2
+rm sherpa-onnx-streaming-paraformer-bilingual-zh-en.tar.bz2
+
+mv sherpa-onnx-streaming-paraformer-bilingual-zh-en/encoder.int8.onnx encoder.onnx
+mv sherpa-onnx-streaming-paraformer-bilingual-zh-en/decoder.int8.onnx decoder.onnx
+mv sherpa-onnx-streaming-paraformer-bilingual-zh-en/tokens.txt ./
+
+rm -rf sherpa-onnx-streaming-paraformer-bilingual-zh-en
+
+cd ../
+
+sed -i.bak s/"type = 0"/"type = 1"/g ./sherpa-onnx.js
+sed -i.bak s/Zipformer/Paraformer/g ./index.html
+
+cd ../..
+
+./build-wasm-simd-asr.sh
+```
+
+You should have the following files in `assets` before you can run
+`build-wasm-simd-asr.sh`
+
+```
+assets fangjun$ tree -L 1
+.
+├── README.md
+├── decoder.onnx
+├── encoder.onnx
+└── tokens.txt
+
+0 directories, 4 files
+```
diff --git a/wasm/asr/index.html b/wasm/asr/index.html
new file mode 100644
index 000000000..64661f115
--- /dev/null
+++ b/wasm/asr/index.html
@@ -0,0 +1,42 @@
+
+
+
+
+
+ Next-gen Kaldi WebAssembly with sherpa-onnx for speech recognition
+
+
+
+

+ Next-gen Kaldi + WebAssembly
+ ASR Demo with sherpa-onnx
+ (with Zipformer) +

+ +
+ Loading model ... ... +
+
+ + + +
+
+ +
+ +
+
+
+
+
+
+
diff --git a/wasm/asr/sherpa-onnx-wasm-asr-main.cc b/wasm/asr/sherpa-onnx-wasm-asr-main.cc
new file mode 100644
index 000000000..236766312
--- /dev/null
+++ b/wasm/asr/sherpa-onnx-wasm-asr-main.cc
@@ -0,0 +1,75 @@
+// wasm/sherpa-onnx-wasm-asr-main.cc
+//
+// Copyright (c) 2024 Xiaomi Corporation
+#include 
+
+#include 
+#include 
+
+#include "sherpa-onnx/c-api/c-api.h"
+
+// see also
+// https://emscripten.org/docs/porting/connecting_cpp_and_javascript/Interacting-with-code.html
+
+extern "C" {
+
+static_assert(sizeof(SherpaOnnxOnlineTransducerModelConfig) == 3 * 4, "");
+static_assert(sizeof(SherpaOnnxOnlineParaformerModelConfig) == 2 * 4, "");
+static_assert(sizeof(SherpaOnnxOnlineZipformer2CtcModelConfig) == 1 * 4, "");
+static_assert(sizeof(SherpaOnnxOnlineModelConfig) ==
+              sizeof(SherpaOnnxOnlineTransducerModelConfig) +
+              sizeof(SherpaOnnxOnlineParaformerModelConfig) +
+              sizeof(SherpaOnnxOnlineZipformer2CtcModelConfig) + 5 * 4,
+              "");
+static_assert(sizeof(SherpaOnnxFeatureConfig) == 2 * 4, "");
+static_assert(sizeof(SherpaOnnxOnlineRecognizerConfig) ==
+              sizeof(SherpaOnnxFeatureConfig) +
+              sizeof(SherpaOnnxOnlineModelConfig) + 8 * 4,
+              "");
+
+void MyPrint(SherpaOnnxOnlineRecognizerConfig *config) {
+  auto model_config = &config->model_config;
+  auto feat = &config->feat_config;
+  auto transducer_model_config = &model_config->transducer;
+  auto paraformer_model_config = &model_config->paraformer;
+  auto ctc_model_config = &model_config->zipformer2_ctc;
+
+  fprintf(stdout, "----------online transducer model config----------\n");
+  fprintf(stdout, "encoder: %s\n", transducer_model_config->encoder);
+  fprintf(stdout, "decoder: %s\n", transducer_model_config->decoder);
+  fprintf(stdout, "joiner: %s\n", transducer_model_config->joiner);
+
+  fprintf(stdout, "----------online paraformer model config----------\n");
+  fprintf(stdout, "encoder: %s\n", paraformer_model_config->encoder);
+  fprintf(stdout, "decoder: %s\n", paraformer_model_config->decoder);
+
fprintf(stdout, "----------online ctc model config----------\n"); + fprintf(stdout, "model: %s\n", ctc_model_config->model); + fprintf(stdout, "tokens: %s\n", model_config->tokens); + fprintf(stdout, "num_threads: %d\n", model_config->num_threads); + fprintf(stdout, "provider: %s\n", model_config->provider); + fprintf(stdout, "debug: %d\n", model_config->debug); + fprintf(stdout, "model type: %s\n", model_config->model_type); + + fprintf(stdout, "----------feat config----------\n"); + fprintf(stdout, "sample rate: %d\n", feat->sample_rate); + fprintf(stdout, "feat dim: %d\n", feat->feature_dim); + + fprintf(stdout, "----------recognizer config----------\n"); + fprintf(stdout, "decoding method: %s\n", config->decoding_method); + fprintf(stdout, "max active paths: %d\n", config->max_active_paths); + fprintf(stdout, "enable_endpoint: %d\n", config->enable_endpoint); + fprintf(stdout, "rule1_min_trailing_silence: %.2f\n", + config->rule1_min_trailing_silence); + fprintf(stdout, "rule2_min_trailing_silence: %.2f\n", + config->rule2_min_trailing_silence); + fprintf(stdout, "rule3_min_utterance_length: %.2f\n", + config->rule3_min_utterance_length); + fprintf(stdout, "hotwords_file: %s\n", config->hotwords_file); + fprintf(stdout, "hotwords_score: %.2f\n", config->hotwords_score); +} + +void CopyHeap(const char *src, int32_t num_bytes, char *dst) { + std::copy(src, src + num_bytes, dst); +} +} diff --git a/wasm/asr/sherpa-onnx.js b/wasm/asr/sherpa-onnx.js new file mode 100644 index 000000000..d08217226 --- /dev/null +++ b/wasm/asr/sherpa-onnx.js @@ -0,0 +1,381 @@ +function freeConfig(config) { + if ('buffer' in config) { + _free(config.buffer); + } + + if ('config' in config) { + freeConfig(config.config) + } + + if ('transducer' in config) { + freeConfig(config.transducer) + } + + if ('paraformer' in config) { + freeConfig(config.paraformer) + } + + if ('ctc' in config) { + freeConfig(config.ctc) + } + + if ('feat' in config) { + freeConfig(config.feat) + } + + if 
('model' in config) { + freeConfig(config.model) + } + + _free(config.ptr); +} + +// The user should free the returned pointers +function initSherpaOnnxOnlineTransducerModelConfig(config) { + let encoderLen = lengthBytesUTF8(config.encoder) + 1; + let decoderLen = lengthBytesUTF8(config.decoder) + 1; + let joinerLen = lengthBytesUTF8(config.joiner) + 1; + + let n = encoderLen + decoderLen + joinerLen; + + let buffer = _malloc(n); + + let len = 3 * 4; // 3 pointers + let ptr = _malloc(len); + + let offset = 0; + stringToUTF8(config.encoder, buffer + offset, encoderLen); + offset += encoderLen; + + stringToUTF8(config.decoder, buffer + offset, decoderLen); + offset += decoderLen; + + stringToUTF8(config.joiner, buffer + offset, joinerLen); + + offset = 0; + setValue(ptr, buffer + offset, 'i8*'); + offset += encoderLen; + + setValue(ptr + 4, buffer + offset, 'i8*'); + offset += decoderLen; + + setValue(ptr + 8, buffer + offset, 'i8*'); + + return { + buffer: buffer, ptr: ptr, len: len, + } +} + +function initSherpaOnnxOnlineParaformerModelConfig(config) { + let encoderLen = lengthBytesUTF8(config.encoder) + 1; + let decoderLen = lengthBytesUTF8(config.decoder) + 1; + + let n = encoderLen + decoderLen; + let buffer = _malloc(n); + + let len = 2 * 4; // 2 pointers + let ptr = _malloc(len); + + let offset = 0; + stringToUTF8(config.encoder, buffer + offset, encoderLen); + offset += encoderLen; + + stringToUTF8(config.decoder, buffer + offset, decoderLen); + + offset = 0; + setValue(ptr, buffer + offset, 'i8*'); + offset += encoderLen; + + setValue(ptr + 4, buffer + offset, 'i8*'); + + return { + buffer: buffer, ptr: ptr, len: len, + } +} + +function initSherpaOnnxOnlineZipformer2CtcModelConfig(config) { + let n = lengthBytesUTF8(config.model) + 1; + let buffer = _malloc(n); + + let len = 1 * 4; // 1 pointer + let ptr = _malloc(len); + + stringToUTF8(config.model, buffer, n); + + setValue(ptr, buffer, 'i8*'); + + return { + buffer: buffer, ptr: ptr, len: len, + } +} + 
+function initSherpaOnnxOnlineModelConfig(config) { + let transducer = initSherpaOnnxOnlineTransducerModelConfig(config.transducer); + let paraformer = initSherpaOnnxOnlineParaformerModelConfig(config.paraformer); + let ctc = initSherpaOnnxOnlineZipformer2CtcModelConfig(config.zipformer2Ctc); + + let len = transducer.len + paraformer.len + ctc.len + 5 * 4; + let ptr = _malloc(len); + + let offset = 0; + _CopyHeap(transducer.ptr, transducer.len, ptr + offset); + offset += transducer.len; + + _CopyHeap(paraformer.ptr, paraformer.len, ptr + offset); + offset += paraformer.len; + + _CopyHeap(ctc.ptr, ctc.len, ptr + offset); + offset += ctc.len; + + let tokensLen = lengthBytesUTF8(config.tokens) + 1; + let providerLen = lengthBytesUTF8(config.provider) + 1; + let modelTypeLen = lengthBytesUTF8(config.modelType) + 1; + let bufferLen = tokensLen + providerLen + modelTypeLen; + let buffer = _malloc(bufferLen); + + offset = 0; + stringToUTF8(config.tokens, buffer, tokensLen); + offset += tokensLen; + + stringToUTF8(config.provider, buffer + offset, providerLen); + offset += providerLen; + + stringToUTF8(config.modelType, buffer + offset, modelTypeLen); + + offset = transducer.len + paraformer.len + ctc.len; + setValue(ptr + offset, buffer, 'i8*'); // tokens + offset += 4; + + setValue(ptr + offset, config.numThreads, 'i32'); + offset += 4; + + setValue(ptr + offset, buffer + tokensLen, 'i8*'); // provider + offset += 4; + + setValue(ptr + offset, config.debug, 'i32'); + offset += 4; + + setValue(ptr + offset, buffer + tokensLen + providerLen, 'i8*'); // modelType + offset += 4; + + return { + buffer: buffer, ptr: ptr, len: len, transducer: transducer, + paraformer: paraformer, ctc: ctc + } +} + +function initSherpaOnnxFeatureConfig(config) { + let len = 2 * 4; // 2 pointers + let ptr = _malloc(len); + + setValue(ptr, config.sampleRate, 'i32'); + setValue(ptr + 4, config.featureDim, 'i32'); + return {ptr: ptr, len: len}; +} + +function 
initSherpaOnnxOnlineRecognizerConfig(config) { + let feat = initSherpaOnnxFeatureConfig(config.featConfig); + let model = initSherpaOnnxOnlineModelConfig(config.modelConfig); + + let len = feat.len + model.len + 8 * 4; + let ptr = _malloc(len); + + let offset = 0; + _CopyHeap(feat.ptr, feat.len, ptr + offset); + offset += feat.len; + + _CopyHeap(model.ptr, model.len, ptr + offset); + offset += model.len; + + let decodingMethodLen = lengthBytesUTF8(config.decodingMethod) + 1; + let hotwordsFileLen = lengthBytesUTF8(config.hotwordsFile) + 1; + let bufferLen = decodingMethodLen + hotwordsFileLen; + let buffer = _malloc(bufferLen); + + offset = 0; + stringToUTF8(config.decodingMethod, buffer, decodingMethodLen); + offset += decodingMethodLen; + + stringToUTF8(config.hotwordsFile, buffer + offset, hotwordsFileLen); + + offset = feat.len + model.len; + setValue(ptr + offset, buffer, 'i8*'); // decoding method + offset += 4; + + setValue(ptr + offset, config.maxActivePaths, 'i32'); + offset += 4; + + setValue(ptr + offset, config.enableEndpoint, 'i32'); + offset += 4; + + setValue(ptr + offset, config.rule1MinTrailingSilence, 'float'); + offset += 4; + + setValue(ptr + offset, config.rule2MinTrailingSilence, 'float'); + offset += 4; + + setValue(ptr + offset, config.rule3MinUtteranceLength, 'float'); + offset += 4; + + setValue(ptr + offset, buffer + decodingMethodLen, 'i8*'); + offset += 4; + + setValue(ptr + offset, config.hotwordsScore, 'float'); + offset += 4; + + return { + buffer: buffer, ptr: ptr, len: len, feat: feat, model: model + } +} + + +function createRecognizer() { + let onlineTransducerModelConfig = { + encoder: '', + decoder: '', + joiner: '', + } + + let onlineParaformerModelConfig = { + encoder: '', + decoder: '', + } + + let onlineZipformer2CtcModelConfig = { + model: '', + } + + let type = 0; + + switch (type) { + case 0: + // transducer + onlineTransducerModelConfig.encoder = './encoder.onnx'; + onlineTransducerModelConfig.decoder = './decoder.onnx'; 
+ onlineTransducerModelConfig.joiner = './joiner.onnx'; + break; + case 1: + // paraformer + onlineParaformerModelConfig.encoder = './encoder.onnx'; + onlineParaformerModelConfig.decoder = './decoder.onnx'; + break; + case 2: + // ctc + onlineZipformer2CtcModelConfig.model = './encoder.onnx'; + break; + } + + + let onlineModelConfig = { + transducer: onlineTransducerModelConfig, + paraformer: onlineParaformerModelConfig, + zipformer2Ctc: onlineZipformer2CtcModelConfig, + tokens: './tokens.txt', + numThreads: 1, + provider: 'cpu', + debug: 1, + modelType: '', + } + + let featureConfig = { + sampleRate: 16000, + featureDim: 80, + } + + let recognizerConfig = { + featConfig: featureConfig, + modelConfig: onlineModelConfig, + decodingMethod: 'greedy_search', + maxActivePaths: 4, + enableEndpoint: 1, + rule1MinTrailingSilence: 2.4, + rule2MinTrailingSilence: 1.2, + rule3MinUtteranceLength: 20, + hotwordsFile: '', + hotwordsScore: 1.5, + } + + return new OnlineRecognizer(recognizerConfig); +} + +class OnlineStream { + constructor(handle) { + this.handle = handle; + this.pointer = null; // buffer + this.n = 0; // buffer size + } + + free() { + if (this.handle) { + _DestroyOnlineStream(this.handle); + this.handle = null; + _free(this.pointer); + this.pointer = null; + this.n = 0; + } + } + + /** + * @param sampleRate {Number} + * @param samples {Float32Array} Containing samples in the range [-1, 1] + */ + acceptWaveform(sampleRate, samples) { + if (this.n < samples.length) { + _free(this.pointer); + this.pointer = _malloc(samples.length * samples.BYTES_PER_ELEMENT); + this.n = samples.length + } + + Module.HEAPF32.set(samples, this.pointer / samples.BYTES_PER_ELEMENT); + _AcceptWaveform(this.handle, sampleRate, this.pointer, samples.length); + } + + inputFinished() { + _InputFinished(this.handle); + } +}; + +class OnlineRecognizer { + constructor(configObj) { + let config = initSherpaOnnxOnlineRecognizerConfig(configObj) + let handle = _CreateOnlineRecognizer(config.ptr); 
+ + freeConfig(config); + + this.handle = handle; + } + + free() { + _DestroyOnlineRecognizer(this.handle); + this.handle = 0 + } + + createStream() { + let handle = _CreateOnlineStream(this.handle); + return new OnlineStream(handle); + } + + isReady(stream) { + return _IsOnlineStreamReady(this.handle, stream.handle) == 1; + } + + decode(stream) { + return _DecodeOnlineStream(this.handle, stream.handle); + } + + isEndpoint(stream) { + return _IsEndpoint(this.handle, stream.handle) == 1; + } + + reset(stream) { + _Reset(this.handle, stream.handle); + } + + getResult(stream) { + let r = _GetOnlineStreamResult(this.handle, stream.handle); + let textPtr = getValue(r, 'i8*'); + let text = UTF8ToString(textPtr); + _DestroyOnlineRecognizerResult(r); + return text; + } +}