Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add Java API for Matcha-TTS models. #1673

Merged
merged 2 commits into from
Jan 2, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions .github/workflows/run-java-test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -235,6 +235,13 @@ jobs:
shell: bash
run: |
cd ./java-api-examples

./run-non-streaming-tts-matcha-zh.sh
./run-non-streaming-tts-matcha-en.sh

rm -rf matcha-icefall-*
rm hifigan_v2.onnx

./run-non-streaming-tts-piper-en.sh
rm -rf vits-piper-*

Expand Down
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -126,3 +126,4 @@ sherpa-onnx-moonshine-base-en-int8
harmony-os/SherpaOnnxHar/sherpa_onnx/LICENSE
harmony-os/SherpaOnnxHar/sherpa_onnx/CHANGELOG.md
matcha-icefall-zh-baker
matcha-icefall-en_US-ljspeech
60 changes: 60 additions & 0 deletions java-api-examples/NonStreamingTtsMatchaEn.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
// Copyright 2025 Xiaomi Corporation

// This file shows how to use a matcha English model
// to convert text to speech
import com.k2fsa.sherpa.onnx.*;

/**
 * Example program: converts a fixed English sentence to speech with a Matcha acoustic model plus
 * a vocoder, saves the result to {@code tts-matcha-en.wav}, and prints timing statistics.
 *
 * <p>The model files referenced below must already exist in the current working directory.
 */
public class NonStreamingTtsMatchaEn {
  /**
   * Builds the TTS configuration, generates audio for the sample text, writes the wave file and
   * reports elapsed time, audio duration and real-time factor.
   *
   * @param args ignored
   */
  public static void main(String[] args) {
    // please visit
    // https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/matcha.html#matcha-icefall-en-us-ljspeech-american-english-1-female-speaker
    // to download model files
    String acousticModel = "./matcha-icefall-en_US-ljspeech/model-steps-3.onnx";
    String vocoder = "./hifigan_v2.onnx";
    String tokens = "./matcha-icefall-en_US-ljspeech/tokens.txt";
    String dataDir = "./matcha-icefall-en_US-ljspeech/espeak-ng-data";
    String text =
        "Today as always, men fall into two groups: slaves and free men. Whoever does not have"
            + " two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a"
            + " businessman, an official, or a scholar.";

    OfflineTtsMatchaModelConfig matchaModelConfig =
        OfflineTtsMatchaModelConfig.builder()
            .setAcousticModel(acousticModel)
            .setVocoder(vocoder)
            .setTokens(tokens)
            .setDataDir(dataDir)
            .build();

    OfflineTtsModelConfig modelConfig =
        OfflineTtsModelConfig.builder()
            .setMatcha(matchaModelConfig)
            .setNumThreads(1)
            .setDebug(true)
            .build();

    OfflineTtsConfig config = OfflineTtsConfig.builder().setModel(modelConfig).build();
    OfflineTts tts = new OfflineTts(config);

    // Release the TTS object even when generation or saving throws.
    try {
      int sid = 0;
      float speed = 1.0f;
      long start = System.currentTimeMillis();
      GeneratedAudio audio = tts.generate(text, sid, speed);
      long stop = System.currentTimeMillis();

      float timeElapsedSeconds = (stop - start) / 1000.0f;

      // Duration in seconds = number of samples / samples per second.
      float audioDuration = audio.getSamples().length / (float) audio.getSampleRate();
      // RTF below 1 means synthesis ran faster than playback would take.
      float realTimeFactor = timeElapsedSeconds / audioDuration;

      String waveFilename = "tts-matcha-en.wav";
      audio.save(waveFilename);
      System.out.printf("-- elapsed : %.3f seconds\n", timeElapsedSeconds);
      // Report the audio duration itself (previously the elapsed time was printed here).
      System.out.printf("-- audio duration: %.3f seconds\n", audioDuration);
      System.out.printf("-- real-time factor (RTF): %.3f\n", realTimeFactor);
      System.out.printf("-- text: %s\n", text);
      System.out.printf("-- Saved to %s\n", waveFilename);
    } finally {
      tts.release();
    }
  }
}
66 changes: 66 additions & 0 deletions java-api-examples/NonStreamingTtsMatchaZh.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
// Copyright 2025 Xiaomi Corporation

// This file shows how to use a matcha Chinese TTS model
// to convert text to speech
import com.k2fsa.sherpa.onnx.*;

/**
 * Example program: converts a fixed Chinese sentence to speech with a Matcha acoustic model plus
 * a vocoder, saves the result to {@code tts-matcha-zh.wav}, and prints timing statistics.
 *
 * <p>The model files referenced below must already exist in the current working directory.
 */
public class NonStreamingTtsMatchaZh {
  /**
   * Builds the TTS configuration (including the rule FST list), generates audio for the sample
   * text, writes the wave file and reports elapsed time, audio duration and real-time factor.
   *
   * @param args ignored
   */
  public static void main(String[] args) {
    // please visit
    // https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/matcha.html#matcha-icefall-zh-baker-chinese-1-female-speaker
    // to download model files
    String acousticModel = "./matcha-icefall-zh-baker/model-steps-3.onnx";
    String vocoder = "./hifigan_v2.onnx";
    String tokens = "./matcha-icefall-zh-baker/tokens.txt";
    String lexicon = "./matcha-icefall-zh-baker/lexicon.txt";
    String dictDir = "./matcha-icefall-zh-baker/dict";
    // Comma-separated list of FST files passed to the TTS config via setRuleFsts.
    String ruleFsts =
        "./matcha-icefall-zh-baker/phone.fst,./matcha-icefall-zh-baker/date.fst,./matcha-icefall-zh-baker/number.fst";
    String text =
        "某某银行的副行长和一些行政领导表示,他们去过长江"
            + "和长白山; 经济不断增长。"
            + "2024年12月31号,拨打110或者18920240511。"
            + "123456块钱。";

    OfflineTtsMatchaModelConfig matchaModelConfig =
        OfflineTtsMatchaModelConfig.builder()
            .setAcousticModel(acousticModel)
            .setVocoder(vocoder)
            .setTokens(tokens)
            .setLexicon(lexicon)
            .setDictDir(dictDir)
            .build();

    OfflineTtsModelConfig modelConfig =
        OfflineTtsModelConfig.builder()
            .setMatcha(matchaModelConfig)
            .setNumThreads(1)
            .setDebug(true)
            .build();

    OfflineTtsConfig config =
        OfflineTtsConfig.builder().setModel(modelConfig).setRuleFsts(ruleFsts).build();
    OfflineTts tts = new OfflineTts(config);

    // Release the TTS object even when generation or saving throws.
    try {
      int sid = 0;
      float speed = 1.0f;
      long start = System.currentTimeMillis();
      GeneratedAudio audio = tts.generate(text, sid, speed);
      long stop = System.currentTimeMillis();

      float timeElapsedSeconds = (stop - start) / 1000.0f;

      // Duration in seconds = number of samples / samples per second.
      float audioDuration = audio.getSamples().length / (float) audio.getSampleRate();
      // RTF below 1 means synthesis ran faster than playback would take.
      float realTimeFactor = timeElapsedSeconds / audioDuration;

      String waveFilename = "tts-matcha-zh.wav";
      audio.save(waveFilename);
      System.out.printf("-- elapsed : %.3f seconds\n", timeElapsedSeconds);
      // Report the audio duration itself (previously the elapsed time was printed here).
      System.out.printf("-- audio duration: %.3f seconds\n", audioDuration);
      System.out.printf("-- real-time factor (RTF): %.3f\n", realTimeFactor);
      System.out.printf("-- text: %s\n", text);
      System.out.printf("-- Saved to %s\n", waveFilename);
    } finally {
      tts.release();
    }
  }
}
45 changes: 45 additions & 0 deletions java-api-examples/run-non-streaming-tts-matcha-en.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
#!/usr/bin/env bash
#
# Runs the NonStreamingTtsMatchaEn.java example.
#
# Steps:
#   1. Build the JNI shared library under ../build if neither the .dylib nor the .so exists.
#   2. Build sherpa-onnx.jar if it does not exist.
#   3. Download the English Matcha model and the vocoder if they are not present.
#   4. Run the example with java.

set -ex

# Step 1: build the JNI library only when no previous build output is found.
if [[ ! -f ../build/lib/libsherpa-onnx-jni.dylib && ! -f ../build/lib/libsherpa-onnx-jni.so ]]; then
  mkdir -p ../build
  pushd ../build
  cmake \
    -DSHERPA_ONNX_ENABLE_PYTHON=OFF \
    -DSHERPA_ONNX_ENABLE_TESTS=OFF \
    -DSHERPA_ONNX_ENABLE_CHECK=OFF \
    -DBUILD_SHARED_LIBS=ON \
    -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \
    -DSHERPA_ONNX_ENABLE_JNI=ON \
    ..

  make -j4
  ls -lh lib
  popd
fi

# Step 2: build the Java API jar only when it is missing.
if [ ! -f ../sherpa-onnx/java-api/build/sherpa-onnx.jar ]; then
  pushd ../sherpa-onnx/java-api
  make
  popd
fi

# Step 3: fetch the model files.
# please visit
# https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/matcha.html#matcha-icefall-en-us-ljspeech-american-english-1-female-speaker
# to download more models
#
# --fail (-f) makes curl exit non-zero on an HTTP error instead of saving the
# error page under the target file name, which a later run would mistake for
# an already-downloaded model.
if [ ! -f ./matcha-icefall-en_US-ljspeech/model-steps-3.onnx ]; then
  curl -fSL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-en_US-ljspeech.tar.bz2
  tar xvf matcha-icefall-en_US-ljspeech.tar.bz2
  rm matcha-icefall-en_US-ljspeech.tar.bz2
fi

if [ ! -f ./hifigan_v2.onnx ]; then
  curl -fSL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx
fi

# Step 4: run the example. $PWD is quoted so a working directory containing
# spaces is passed to java as a single argument.
java \
  -Djava.library.path="$PWD/../build/lib" \
  -cp ../sherpa-onnx/java-api/build/sherpa-onnx.jar \
  NonStreamingTtsMatchaEn.java
44 changes: 44 additions & 0 deletions java-api-examples/run-non-streaming-tts-matcha-zh.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
#!/usr/bin/env bash
#
# Runs the NonStreamingTtsMatchaZh.java example.
#
# Steps:
#   1. Build the JNI shared library under ../build if neither the .dylib nor the .so exists.
#   2. Build sherpa-onnx.jar if it does not exist.
#   3. Download the Chinese Matcha model and the vocoder if they are not present.
#   4. Run the example with java.

set -ex

# Step 1: build the JNI library only when no previous build output is found.
if [[ ! -f ../build/lib/libsherpa-onnx-jni.dylib && ! -f ../build/lib/libsherpa-onnx-jni.so ]]; then
  mkdir -p ../build
  pushd ../build
  cmake \
    -DSHERPA_ONNX_ENABLE_PYTHON=OFF \
    -DSHERPA_ONNX_ENABLE_TESTS=OFF \
    -DSHERPA_ONNX_ENABLE_CHECK=OFF \
    -DBUILD_SHARED_LIBS=ON \
    -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \
    -DSHERPA_ONNX_ENABLE_JNI=ON \
    ..

  make -j4
  ls -lh lib
  popd
fi

# Step 2: build the Java API jar only when it is missing.
if [ ! -f ../sherpa-onnx/java-api/build/sherpa-onnx.jar ]; then
  pushd ../sherpa-onnx/java-api
  make
  popd
fi

# Step 3: fetch the model files.
# please visit
# https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/matcha.html#matcha-icefall-zh-baker-chinese-1-female-speaker
# to download more models
#
# --fail (-f) makes curl exit non-zero on an HTTP error instead of saving the
# error page under the target file name, which a later run would mistake for
# an already-downloaded model.
if [ ! -f ./matcha-icefall-zh-baker/model-steps-3.onnx ]; then
  curl -fSL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-zh-baker.tar.bz2
  tar xvf matcha-icefall-zh-baker.tar.bz2
  rm matcha-icefall-zh-baker.tar.bz2
fi

if [ ! -f ./hifigan_v2.onnx ]; then
  curl -fSL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx
fi

# Step 4: run the example. $PWD is quoted so a working directory containing
# spaces is passed to java as a single argument.
java \
  -Djava.library.path="$PWD/../build/lib" \
  -cp ../sherpa-onnx/java-api/build/sherpa-onnx.jar \
  NonStreamingTtsMatchaZh.java
5 changes: 5 additions & 0 deletions sherpa-onnx/csrc/piper-phonemize-lexicon.cc
Original file line number Diff line number Diff line change
Expand Up @@ -366,6 +366,11 @@ template PiperPhonemizeLexicon::PiperPhonemizeLexicon(
#endif

#if __OHOS__
template PiperPhonemizeLexicon::PiperPhonemizeLexicon(
NativeResourceManager *mgr, const std::string &tokens,
const std::string &data_dir,
const OfflineTtsVitsModelMetaData &vits_meta_data);

template PiperPhonemizeLexicon::PiperPhonemizeLexicon(
NativeResourceManager *mgr, const std::string &tokens,
const std::string &data_dir,
Expand Down
1 change: 1 addition & 0 deletions sherpa-onnx/java-api/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ java_files += OfflineRecognizerResult.java
java_files += OfflineStream.java
java_files += OfflineRecognizer.java

java_files += OfflineTtsMatchaModelConfig.java
java_files += OfflineTtsVitsModelConfig.java
java_files += OfflineTtsModelConfig.java
java_files += OfflineTtsConfig.java
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
// Copyright 2025 Xiaomi Corporation

package com.k2fsa.sherpa.onnx;

/**
 * Immutable holder for the settings of a Matcha TTS model.
 *
 * <p>Instances are created through {@link #builder()}. Every string setting defaults to the empty
 * string and both scale settings default to {@code 1.0f}.
 */
public class OfflineTtsMatchaModelConfig {
  private final String acousticModel;
  private final String vocoder;
  private final String lexicon;
  private final String tokens;
  private final String dataDir;
  private final String dictDir;
  private final float noiseScale;
  private final float lengthScale;

  /** Copies every value out of the given builder. */
  private OfflineTtsMatchaModelConfig(Builder b) {
    acousticModel = b.acousticModel;
    vocoder = b.vocoder;
    lexicon = b.lexicon;
    tokens = b.tokens;
    dataDir = b.dataDir;
    dictDir = b.dictDir;
    noiseScale = b.noiseScale;
    lengthScale = b.lengthScale;
  }

  /**
   * Creates a builder pre-populated with the default values.
   *
   * @return a new {@link Builder}
   */
  public static Builder builder() {
    return new Builder();
  }

  /** @return the configured acoustic model value */
  public String getAcousticModel() {
    return this.acousticModel;
  }

  /** @return the configured vocoder value */
  public String getVocoder() {
    return this.vocoder;
  }

  /** @return the configured lexicon value */
  public String getLexicon() {
    return this.lexicon;
  }

  /** @return the configured tokens value */
  public String getTokens() {
    return this.tokens;
  }

  /** @return the configured data directory value */
  public String getDataDir() {
    return this.dataDir;
  }

  /** @return the configured dictionary directory value */
  public String getDictDir() {
    return this.dictDir;
  }

  /** @return the configured length scale */
  public float getLengthScale() {
    return this.lengthScale;
  }

  /** @return the configured noise scale */
  public float getNoiseScale() {
    return this.noiseScale;
  }

  /**
   * Fluent builder for {@link OfflineTtsMatchaModelConfig}. Each setter stores its argument and
   * returns the same builder so calls can be chained.
   */
  public static class Builder {
    private String acousticModel = "";
    private String vocoder = "";
    private String lexicon = "";
    private String tokens = "";
    private String dataDir = "";
    private String dictDir = "";
    private float noiseScale = 1.0f;
    private float lengthScale = 1.0f;

    /**
     * Creates a config from the values currently held by this builder.
     *
     * @return a new {@link OfflineTtsMatchaModelConfig}
     */
    public OfflineTtsMatchaModelConfig build() {
      OfflineTtsMatchaModelConfig result = new OfflineTtsMatchaModelConfig(this);
      return result;
    }

    /**
     * @param acousticModel value to store for the acoustic model
     * @return this builder
     */
    public Builder setAcousticModel(String acousticModel) {
      this.acousticModel = acousticModel;
      return this;
    }

    /**
     * @param vocoder value to store for the vocoder
     * @return this builder
     */
    public Builder setVocoder(String vocoder) {
      this.vocoder = vocoder;
      return this;
    }

    /**
     * @param tokens value to store for the tokens
     * @return this builder
     */
    public Builder setTokens(String tokens) {
      this.tokens = tokens;
      return this;
    }

    /**
     * @param lexicon value to store for the lexicon
     * @return this builder
     */
    public Builder setLexicon(String lexicon) {
      this.lexicon = lexicon;
      return this;
    }

    /**
     * @param dataDir value to store for the data directory
     * @return this builder
     */
    public Builder setDataDir(String dataDir) {
      this.dataDir = dataDir;
      return this;
    }

    /**
     * @param dictDir value to store for the dictionary directory
     * @return this builder
     */
    public Builder setDictDir(String dictDir) {
      this.dictDir = dictDir;
      return this;
    }

    /**
     * @param noiseScale value to store for the noise scale
     * @return this builder
     */
    public Builder setNoiseScale(float noiseScale) {
      this.noiseScale = noiseScale;
      return this;
    }

    /**
     * @param lengthScale value to store for the length scale
     * @return this builder
     */
    public Builder setLengthScale(float lengthScale) {
      this.lengthScale = lengthScale;
      return this;
    }
  }
}
Loading
Loading