Skip to content

Commit

Permalink
Add C# and JavaScript (wasm) API for MatchaTTS models (#1682)
Browse files Browse the repository at this point in the history
  • Loading branch information
csukuangfj authored Jan 5, 2025
1 parent 1ef9e5e commit 3eced3e
Show file tree
Hide file tree
Showing 26 changed files with 677 additions and 88 deletions.
32 changes: 21 additions & 11 deletions .github/scripts/test-dot-net.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,27 @@

cd dotnet-examples/

cd ./offline-speaker-diarization
cd ./offline-tts
./run-matcha-zh.sh
ls -lh *.wav
./run-matcha-en.sh
ls -lh *.wav
./run-aishell3.sh
ls -lh *.wav
./run-piper.sh
ls -lh *.wav
./run-hf-fanchen.sh
ls -lh *.wav
ls -lh

pushd ../..

mkdir tts

cp dotnet-examples/offline-tts/*.wav ./tts
popd

cd ../offline-speaker-diarization
./run.sh
rm -rfv *.onnx
rm -fv *.wav
Expand Down Expand Up @@ -76,14 +96,4 @@ cd ../spoken-language-identification
./run.sh
rm -rf sherpa-onnx-*

cd ../offline-tts
./run-aishell3.sh
./run-piper.sh
./run-hf-fanchen.sh
ls -lh

cd ../..

mkdir tts

cp dotnet-examples/offline-tts/*.wav ./tts
54 changes: 42 additions & 12 deletions .github/scripts/test-nodejs-npm.sh
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,48 @@ git status
ls -lh
ls -lh node_modules

# offline tts
#
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-zh-baker.tar.bz2
tar xvf matcha-icefall-zh-baker.tar.bz2
rm matcha-icefall-zh-baker.tar.bz2

curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx

node ./test-offline-tts-matcha-zh.js

rm -rf matcha-icefall-zh-baker
rm hifigan_v2.onnx

echo "---"

curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-en_US-ljspeech.tar.bz2
tar xvf matcha-icefall-en_US-ljspeech.tar.bz2
rm matcha-icefall-en_US-ljspeech.tar.bz2

wget https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx

node ./test-offline-tts-matcha-en.js

rm -rf matcha-icefall-en_US-ljspeech
rm hifigan_v2.onnx

echo "---"

curl -LS -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_US-amy-low.tar.bz2
tar xf vits-piper-en_US-amy-low.tar.bz2
node ./test-offline-tts-vits-en.js
rm -rf vits-piper-en_US-amy-low*

echo "---"

curl -LS -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-icefall-zh-aishell3.tar.bz2
tar xvf vits-icefall-zh-aishell3.tar.bz2
node ./test-offline-tts-vits-zh.js
rm -rf vits-icefall-zh-aishell3*

ls -lh *.wav

echo '-----speaker diarization----------'
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
tar xvf sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
Expand Down Expand Up @@ -147,15 +189,3 @@ tar xvf sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18.tar.bz2
rm sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18.tar.bz2
node ./test-online-zipformer2-ctc-hlg.js
rm -rf sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18

# offline tts

curl -LS -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_US-amy-low.tar.bz2
tar xf vits-piper-en_US-amy-low.tar.bz2
node ./test-offline-tts-en.js
rm -rf vits-piper-en_US-amy-low*

curl -LS -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-icefall-zh-aishell3.tar.bz2
tar xvf vits-icefall-zh-aishell3.tar.bz2
node ./test-offline-tts-zh.js
rm -rf vits-icefall-zh-aishell3*
44 changes: 44 additions & 0 deletions .github/workflows/test-dot-net.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,50 @@ jobs:
python-version: ["3.8"]

steps:
- name: Check space
shell: bash
run: |
df -h
- name: Free space
shell: bash
run: |
df -h
rm -rf /opt/hostedtoolcache
df -h
- name: Free more space
shell: bash
run: |
# https://github.com/orgs/community/discussions/25678
cd /opt
find . -maxdepth 1 -mindepth 1 '!' -path ./containerd '!' -path ./actionarchivecache '!' -path ./runner '!' -path ./runner-cache -exec rm -rf '{}' ';'
sudo rm -rf /usr/share/dotnet
sudo rm -rf "/usr/local/share/boost"
sudo rm -rf "$AGENT_TOOLSDIRECTORY"
- name: Free Disk Space (Ubuntu)
uses: jlumbroso/free-disk-space@main
with:
# this might remove tools that are actually needed,
# if set to "true" but frees about 6 GB
tool-cache: false

# all of these default to true, but feel free to set to
# "false" if necessary for your workflow
android: true
dotnet: false
haskell: true
large-packages: true
docker-images: false
swap-storage: true

- name: Check space
shell: bash
run: |
df -h
- uses: actions/checkout@v4
with:
fetch-depth: 0
Expand Down
109 changes: 82 additions & 27 deletions dotnet-examples/offline-tts-play/Program.cs
Original file line number Diff line number Diff line change
Expand Up @@ -21,48 +21,56 @@ class OfflineTtsPlayDemo
{
class Options
{

[Option("tts-rule-fsts", Required = false, Default = "", HelpText = "path to rule.fst")]
public string? RuleFsts { get; set; }
public string RuleFsts { get; set; } = string.Empty;

[Option("tts-rule-fars", Required = false, Default = "", HelpText = "path to rule.far")]
public string RuleFars { get; set; } = string.Empty;

[Option("vits-dict-dir", Required = false, Default = "", HelpText = "Path to the directory containing dict for jieba.")]
public string? DictDir { get; set; }
[Option("dict-dir", Required = false, Default = "", HelpText = "Path to the directory containing dict for jieba.")]
public string DictDir { get; set; } = string.Empty;

[Option("vits-data-dir", Required = false, Default = "", HelpText = "Path to the directory containing dict for espeak-ng.")]
public string? DataDir { get; set; }
[Option("data-dir", Required = false, Default = "", HelpText = "Path to the directory containing dict for espeak-ng.")]
public string DataDir { get; set; } = string.Empty;

[Option("vits-length-scale", Required = false, Default = 1, HelpText = "speech speed. Larger->Slower; Smaller->faster")]
public float LengthScale { get; set; }
[Option("length-scale", Required = false, Default = 1, HelpText = "speech speed. Larger->Slower; Smaller->faster")]
public float LengthScale { get; set; } = 1;

[Option("vits-noise-scale", Required = false, Default = 0.667f, HelpText = "noise_scale for VITS models")]
public float NoiseScale { get; set; }
[Option("noise-scale", Required = false, Default = 0.667f, HelpText = "noise_scale for VITS or Matcha models")]
public float NoiseScale { get; set; } = 0.667F;

[Option("vits-noise-scale-w", Required = false, Default = 0.8f, HelpText = "noise_scale_w for VITS models")]
public float NoiseScaleW { get; set; }
[Option("vits-noise-scale-w", Required = false, Default = 0.8F, HelpText = "noise_scale_w for VITS models")]
public float NoiseScaleW { get; set; } = 0.8F;

[Option("vits-lexicon", Required = false, Default = "", HelpText = "Path to lexicon.txt")]
public string? Lexicon { get; set; }
[Option("lexicon", Required = false, Default = "", HelpText = "Path to lexicon.txt")]
public string Lexicon { get; set; } = string.Empty;

[Option("vits-tokens", Required = false, Default = "", HelpText = "Path to tokens.txt")]
public string? Tokens { get; set; }
[Option("tokens", Required = true, Default = "", HelpText = "Path to tokens.txt")]
public string Tokens { get; set; } = string.Empty;

[Option("tts-max-num-sentences", Required = false, Default = 1, HelpText = "Maximum number of sentences that we process at a time.")]
public int MaxNumSentences { get; set; }
public int MaxNumSentences { get; set; } = 1;

[Option(Required = false, Default = 0, HelpText = "1 to show debug messages.")]
public int Debug { get; set; }
public int Debug { get; set; } = 0;

[Option("vits-model", Required = false, HelpText = "Path to VITS model")]
public string Model { get; set; } = string.Empty;

[Option("vits-model", Required = true, HelpText = "Path to VITS model")]
public string? Model { get; set; }
[Option("matcha-acoustic-model", Required = false, HelpText = "Path to the acoustic model of Matcha")]
public string AcousticModel { get; set; } = "";

[Option("matcha-vocoder", Required = false, HelpText = "Path to the vocoder model of Matcha")]
public string Vocoder { get; set; } = "";

[Option("sid", Required = false, Default = 0, HelpText = "Speaker ID")]
public int SpeakerId { get; set; }
public int SpeakerId { get; set; } = 0;

[Option("text", Required = true, HelpText = "Text to synthesize")]
public string? Text { get; set; }
public string Text { get; set; } = string.Empty;

[Option("output-filename", Required = true, Default = "./generated.wav", HelpText = "Path to save the generated audio")]
public string? OutputFilename { get; set; }
public string OutputFilename { get; set; } = "./generated.wav";
}

static void Main(string[] args)
Expand All @@ -78,15 +86,51 @@ static void Main(string[] args)
private static void DisplayHelp<T>(ParserResult<T> result, IEnumerable<Error> errs)
{
string usage = @"
# matcha-icefall-zh-baker
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-zh-baker.tar.bz2
tar xvf matcha-icefall-zh-baker.tar.bz2
rm matcha-icefall-zh-baker.tar.bz2
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx
dotnet run \
--matcha-acoustic-model=./matcha-icefall-zh-baker/model-steps-3.onnx \
--matcha-vocoder=./hifigan_v2.onnx \
--lexicon=./matcha-icefall-zh-baker/lexicon.txt \
--tokens=./matcha-icefall-zh-baker/tokens.txt \
--dict-dir=./matcha-icefall-zh-baker/dict \
--tts-rule-fsts=./matcha-icefall-zh-baker/phone.fst,./matcha-icefall-zh-baker/date.fst,./matcha-icefall-zh-baker/number.fst \
--debug=1 \
--output-filename=./matcha-zh.wav \
--text='某某银行的副行长和一些行政领导表示,他们去过长江和长白山; 经济不断增长。2024年12月31号,拨打110或者18920240511。123456块钱。'
# matcha-icefall-en_US-ljspeech
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-en_US-ljspeech.tar.bz2
tar xvf matcha-icefall-en_US-ljspeech.tar.bz2
rm matcha-icefall-en_US-ljspeech.tar.bz2
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx
dotnet run \
--matcha-acoustic-model=./matcha-icefall-en_US-ljspeech/model-steps-3.onnx \
--matcha-vocoder=./hifigan_v2.onnx \
--tokens=./matcha-icefall-zh-baker/tokens.txt \
--data-dir=./matcha-icefall-en_US-ljspeech/espeak-ng-data \
--debug=1 \
--output-filename=./matcha-zh.wav \
--text='Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar.'
# vits-aishell3
wget -qq https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-zh-aishell3.tar.bz2
tar xf vits-zh-aishell3.tar.bz2
dotnet run \
--vits-model=./vits-zh-aishell3/vits-aishell3.onnx \
--vits-tokens=./vits-zh-aishell3/tokens.txt \
--vits-lexicon=./vits-zh-aishell3/lexicon.txt \
--tokens=./vits-zh-aishell3/tokens.txt \
--lexicon=./vits-zh-aishell3/lexicon.txt \
--tts-rule-fsts=./vits-zh-aishell3/rule.fst \
--sid=66 \
--debug=1 \
Expand All @@ -100,8 +144,8 @@ tar xf vits-piper-en_US-amy-low.tar.bz2
dotnet run \
--vits-model=./vits-piper-en_US-amy-low/en_US-amy-low.onnx \
--vits-tokens=./vits-piper-en_US-amy-low/tokens.txt \
--vits-data-dir=./vits-piper-en_US-amy-low/espeak-ng-data \
---tokens=./vits-piper-en_US-amy-low/tokens.txt \
--data-dir=./vits-piper-en_US-amy-low/espeak-ng-data \
--debug=1 \
--output-filename=./amy.wav \
--text='This is a text to speech application in dotnet with Next Generation Kaldi'
Expand All @@ -124,6 +168,7 @@ to download more models.
private static void Run(Options options)
{
var config = new OfflineTtsConfig();

config.Model.Vits.Model = options.Model;
config.Model.Vits.Lexicon = options.Lexicon;
config.Model.Vits.Tokens = options.Tokens;
Expand All @@ -132,6 +177,16 @@ private static void Run(Options options)
config.Model.Vits.NoiseScale = options.NoiseScale;
config.Model.Vits.NoiseScaleW = options.NoiseScaleW;
config.Model.Vits.LengthScale = options.LengthScale;

config.Model.Matcha.AcousticModel = options.AcousticModel;
config.Model.Matcha.Vocoder = options.Vocoder;
config.Model.Matcha.Lexicon = options.Lexicon;
config.Model.Matcha.Tokens = options.Tokens;
config.Model.Matcha.DataDir = options.DataDir;
config.Model.Matcha.DictDir = options.DictDir;
config.Model.Matcha.NoiseScale = options.NoiseScale;
config.Model.Matcha.LengthScale = options.LengthScale;

config.Model.NumThreads = 1;
config.Model.Debug = options.Debug;
config.Model.Provider = "cpu";
Expand Down
4 changes: 2 additions & 2 deletions dotnet-examples/offline-tts-play/run-hf-fanchen.sh
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,8 @@ fi

dotnet run \
--vits-model=./vits-zh-hf-fanchen-C/vits-zh-hf-fanchen-C.onnx \
--vits-tokens=./vits-zh-hf-fanchen-C/tokens.txt \
--vits-lexicon=./vits-zh-hf-fanchen-C/lexicon.txt \
--tokens=./vits-zh-hf-fanchen-C/tokens.txt \
--lexicon=./vits-zh-hf-fanchen-C/lexicon.txt \
--tts-rule-fsts=./vits-zh-hf-fanchen-C/phone.fst,./vits-zh-hf-fanchen-C/date.fst,./vits-zh-hf-fanchen-C/number.fst \
--vits-dict-dir=./vits-zh-hf-fanchen-C/dict \
--sid=100 \
Expand Down
26 changes: 26 additions & 0 deletions dotnet-examples/offline-tts-play/run-matcha-en.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
#!/usr/bin/env bash
set -ex


# please visit
# https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/matcha.html#matcha-icefall-en-us-ljspeech-american-english-1-female-speaker
# matcha.html#matcha-icefall-en-us-ljspeech-american-english-1-female-speaker
# to download more models
if [ ! -f ./matcha-icefall-en_US-ljspeech/model-steps-3.onnx ]; then
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-en_US-ljspeech.tar.bz2
tar xf matcha-icefall-en_US-ljspeech.tar.bz2
rm matcha-icefall-en_US-ljspeech.tar.bz2
fi

if [ ! -f ./hifigan_v2.onnx ]; then
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx
fi

dotnet run \
--matcha-acoustic-model=./matcha-icefall-en_US-ljspeech/model-steps-3.onnx \
--matcha-vocoder=./hifigan_v2.onnx \
--tokens=./matcha-icefall-en_US-ljspeech/tokens.txt \
--data-dir=./matcha-icefall-en_US-ljspeech/espeak-ng-data \
--debug=1 \
--output-filename=./matcha-en.wav \
--text='Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar.'
27 changes: 27 additions & 0 deletions dotnet-examples/offline-tts-play/run-matcha-zh.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
#!/usr/bin/env bash
set -ex

# please visit
# https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/matcha.html#matcha-icefall-zh-baker-chinese-1-female-speaker
# to download more models
if [ ! -f ./matcha-icefall-zh-baker/model-steps-3.onnx ]; then
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-zh-baker.tar.bz2
tar xvf matcha-icefall-zh-baker.tar.bz2
rm matcha-icefall-zh-baker.tar.bz2
fi

if [ ! -f ./hifigan_v2.onnx ]; then
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx
fi


dotnet run \
--matcha-acoustic-model=./matcha-icefall-zh-baker/model-steps-3.onnx \
--matcha-vocoder=./hifigan_v2.onnx \
--lexicon=./matcha-icefall-zh-baker/lexicon.txt \
--tokens=./matcha-icefall-zh-baker/tokens.txt \
--dict-dir=./matcha-icefall-zh-baker/dict \
--tts-rule-fsts=./matcha-icefall-zh-baker/phone.fst,./matcha-icefall-zh-baker/date.fst,./matcha-icefall-zh-baker/number.fst \
--debug=1 \
--output-filename=./matcha-zh.wav \
--text="某某银行的副行长和一些行政领导表示,他们去过长江和长白山; 经济不断增长。2024年12月31号,拨打110或者18920240511。123456块钱。"
Loading

0 comments on commit 3eced3e

Please sign in to comment.