From b0b444abecccb5657033ac3fefcdae6e4f457296 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=B3bert=20Kjaran?=
Date: Thu, 21 Dec 2023 14:28:23 +0000
Subject: [PATCH] Add CMake option to disable/enable TTS support

---
Reviewer notes (text between the separator above and the first file diff
is not part of the commit message):

* SHERPA_ONNX_ENABLE_TTS is a new top-level option and defaults to ON.
* With the option OFF this patch skips including espeak-ng-for-piper and
  piper-phonemize, drops the TTS sources (including wave-writer.cc) from
  sherpa-onnx-core, does not link piper_phonemize, and does not build the
  sherpa-onnx-offline-tts* executables, piper-phonemize-test.cc or the
  offline-tts*.cc Python binding sources.
* The TTS entry points of the C API move unchanged from c-api.cc to the
  new c-api-tts.cc, which is only compiled when the option is ON.
  SherpaOnnxWriteWave moves with them and its declaration sits inside the
  new #if block in c-api.h, so the C API has no WAV writer when TTS is OFF.
* c-api.h hides the TTS declarations unless SHERPA_ONNX_ENABLE_TTS is
  defined. The macro is exported as a PUBLIC compile definition of the
  sherpa-onnx-c-api target; code that includes the installed header
  without linking through that CMake target has to define it itself.

 CMakeLists.txt                         | 13 ++--
 sherpa-onnx/c-api/CMakeLists.txt       | 10 ++-
 sherpa-onnx/c-api/c-api-tts.cc         | 98 ++++++++++++++++++++++++++
 sherpa-onnx/c-api/c-api.cc             | 88 -----------------------
 sherpa-onnx/c-api/c-api.h              |  4 ++
 sherpa-onnx/csrc/CMakeLists.txt        | 79 +++++++++++++++------
 sherpa-onnx/python/csrc/CMakeLists.txt | 15 ++--
 7 files changed, 188 insertions(+), 119 deletions(-)
 create mode 100644 sherpa-onnx/c-api/c-api-tts.cc

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 19c27189d..5b05c9b50 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -21,6 +21,7 @@ option(SHERPA_ONNX_ENABLE_C_API "Whether to build C API" ON)
 option(SHERPA_ONNX_ENABLE_WEBSOCKET "Whether to build webscoket server/client" ON)
 option(SHERPA_ONNX_ENABLE_GPU "Enable ONNX Runtime GPU support" OFF)
 option(SHERPA_ONNX_LINK_LIBSTDCPP_STATICALLY "True to link libstdc++ statically. Used only when BUILD_SHARED_LIBS is OFF on Linux" ON)
+option(SHERPA_ONNX_ENABLE_TTS "Whether to build with TTS capability" ON)
 
 set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib")
 set(CMAKE_LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib")
@@ -99,6 +100,8 @@ message(STATUS "SHERPA_ONNX_ENABLE_JNI ${SHERPA_ONNX_ENABLE_JNI}")
 message(STATUS "SHERPA_ONNX_ENABLE_C_API ${SHERPA_ONNX_ENABLE_C_API}")
 message(STATUS "SHERPA_ONNX_ENABLE_WEBSOCKET ${SHERPA_ONNX_ENABLE_WEBSOCKET}")
 message(STATUS "SHERPA_ONNX_ENABLE_GPU ${SHERPA_ONNX_ENABLE_GPU}")
+message(STATUS "SHERPA_ONNX_ENABLE_TTS ${SHERPA_ONNX_ENABLE_TTS}")
+
 
 if(NOT CMAKE_CXX_STANDARD)
   set(CMAKE_CXX_STANDARD 14 CACHE STRING "The C++ version to be used.")
@@ -193,10 +196,12 @@ if(SHERPA_ONNX_ENABLE_WEBSOCKET)
   include(asio)
 endif()
 
-include(espeak-ng-for-piper)
-set(ESPEAK_NG_DIR ${espeak_ng_SOURCE_DIR})
-message(STATUS "ESPEAK_NG_DIR: ${ESPEAK_NG_DIR}")
-include(piper-phonemize)
+if(SHERPA_ONNX_ENABLE_TTS)
+  include(espeak-ng-for-piper)
+  set(ESPEAK_NG_DIR ${espeak_ng_SOURCE_DIR})
+  message(STATUS "ESPEAK_NG_DIR: ${ESPEAK_NG_DIR}")
+  include(piper-phonemize)
+endif()
 
 add_subdirectory(sherpa-onnx)
 
diff --git a/sherpa-onnx/c-api/CMakeLists.txt b/sherpa-onnx/c-api/CMakeLists.txt
index c0da8ca8a..4f406db05 100644
--- a/sherpa-onnx/c-api/CMakeLists.txt
+++ b/sherpa-onnx/c-api/CMakeLists.txt
@@ -1,5 +1,10 @@
 include_directories(${CMAKE_SOURCE_DIR})
 add_library(sherpa-onnx-c-api c-api.cc)
+
+if(SHERPA_ONNX_ENABLE_TTS)
+  target_sources(sherpa-onnx-c-api PRIVATE c-api-tts.cc)
+endif()
+
 target_link_libraries(sherpa-onnx-c-api sherpa-onnx-core)
 
 if(BUILD_SHARED_LIBS)
@@ -7,9 +12,12 @@ if(BUILD_SHARED_LIBS)
   target_compile_definitions(sherpa-onnx-c-api PRIVATE SHERPA_ONNX_BUILD_MAIN_LIB=1)
 endif()
 
+if(SHERPA_ONNX_ENABLE_TTS)
+  target_compile_definitions(sherpa-onnx-c-api PUBLIC SHERPA_ONNX_ENABLE_TTS=1)
+endif()
+
 install(TARGETS sherpa-onnx-c-api DESTINATION lib)
 
 install(FILES c-api.h
   DESTINATION include/sherpa-onnx/c-api
 )
-
diff --git a/sherpa-onnx/c-api/c-api-tts.cc b/sherpa-onnx/c-api/c-api-tts.cc
new file mode 100644
index 000000000..a5cb3242a
--- /dev/null
+++ b/sherpa-onnx/c-api/c-api-tts.cc
@@ -0,0 +1,98 @@
+// sherpa-onnx/c-api/c-api-tts.cc
+//
+// Copyright (c) 2023 Xiaomi Corporation
+#include "sherpa-onnx/c-api/c-api.h"
+
+#include <algorithm>
+#include <memory>
+
+#include "sherpa-onnx/csrc/offline-tts.h"
+#include "sherpa-onnx/csrc/wave-writer.h"
+
+#define SHERPA_ONNX_OR(x, y) (x ? x : y)
+
+struct SherpaOnnxOfflineTts {
+  std::unique_ptr<sherpa_onnx::OfflineTts> impl;
+};
+
+SherpaOnnxOfflineTts *SherpaOnnxCreateOfflineTts(
+    const SherpaOnnxOfflineTtsConfig *config) {
+  sherpa_onnx::OfflineTtsConfig tts_config;
+
+  tts_config.model.vits.model = SHERPA_ONNX_OR(config->model.vits.model, "");
+  tts_config.model.vits.lexicon =
+      SHERPA_ONNX_OR(config->model.vits.lexicon, "");
+  tts_config.model.vits.tokens = SHERPA_ONNX_OR(config->model.vits.tokens, "");
+  tts_config.model.vits.data_dir =
+      SHERPA_ONNX_OR(config->model.vits.data_dir, "");
+  tts_config.model.vits.noise_scale =
+      SHERPA_ONNX_OR(config->model.vits.noise_scale, 0.667);
+  tts_config.model.vits.noise_scale_w =
+      SHERPA_ONNX_OR(config->model.vits.noise_scale_w, 0.8);
+  tts_config.model.vits.length_scale =
+      SHERPA_ONNX_OR(config->model.vits.length_scale, 1.0);
+
+  tts_config.model.num_threads = SHERPA_ONNX_OR(config->model.num_threads, 1);
+  tts_config.model.debug = config->model.debug;
+  tts_config.model.provider = SHERPA_ONNX_OR(config->model.provider, "cpu");
+  tts_config.rule_fsts = SHERPA_ONNX_OR(config->rule_fsts, "");
+  tts_config.max_num_sentences = SHERPA_ONNX_OR(config->max_num_sentences, 2);
+
+  if (tts_config.model.debug) {
+    fprintf(stderr, "%s\n", tts_config.ToString().c_str());
+  }
+
+  SherpaOnnxOfflineTts *tts = new SherpaOnnxOfflineTts;
+
+  tts->impl = std::make_unique<sherpa_onnx::OfflineTts>(tts_config);
+
+  return tts;
+}
+
+void SherpaOnnxDestroyOfflineTts(SherpaOnnxOfflineTts *tts) { delete tts; }
+
+int32_t SherpaOnnxOfflineTtsSampleRate(const SherpaOnnxOfflineTts *tts) {
+  return tts->impl->SampleRate();
+}
+
+const SherpaOnnxGeneratedAudio *SherpaOnnxOfflineTtsGenerate(
+    const SherpaOnnxOfflineTts *tts, const char *text, int32_t sid,
+    float speed) {
+  return SherpaOnnxOfflineTtsGenerateWithCallback(tts, text, sid, speed,
+                                                  nullptr);
+}
+
+const SherpaOnnxGeneratedAudio *SherpaOnnxOfflineTtsGenerateWithCallback(
+    const SherpaOnnxOfflineTts *tts, const char *text, int32_t sid, float speed,
+    SherpaOnnxGeneratedAudioCallback callback) {
+  sherpa_onnx::GeneratedAudio audio =
+      tts->impl->Generate(text, sid, speed, callback);
+
+  if (audio.samples.empty()) {
+    return nullptr;
+  }
+
+  SherpaOnnxGeneratedAudio *ans = new SherpaOnnxGeneratedAudio;
+
+  float *samples = new float[audio.samples.size()];
+  std::copy(audio.samples.begin(), audio.samples.end(), samples);
+
+  ans->samples = samples;
+  ans->n = audio.samples.size();
+  ans->sample_rate = audio.sample_rate;
+
+  return ans;
+}
+
+void SherpaOnnxDestroyOfflineTtsGeneratedAudio(
+    const SherpaOnnxGeneratedAudio *p) {
+  if (p) {
+    delete[] p->samples;
+    delete p;
+  }
+}
+
+int32_t SherpaOnnxWriteWave(const float *samples, int32_t n,
+                            int32_t sample_rate, const char *filename) {
+  return sherpa_onnx::WriteWave(filename, sample_rate, samples, n);
+}
diff --git a/sherpa-onnx/c-api/c-api.cc b/sherpa-onnx/c-api/c-api.cc
index b8bffab9c..ed9be9f70 100644
--- a/sherpa-onnx/c-api/c-api.cc
+++ b/sherpa-onnx/c-api/c-api.cc
@@ -12,10 +12,8 @@
 #include "sherpa-onnx/csrc/circular-buffer.h"
 #include "sherpa-onnx/csrc/display.h"
 #include "sherpa-onnx/csrc/offline-recognizer.h"
-#include "sherpa-onnx/csrc/offline-tts.h"
 #include "sherpa-onnx/csrc/online-recognizer.h"
 #include "sherpa-onnx/csrc/voice-activity-detector.h"
-#include "sherpa-onnx/csrc/wave-writer.h"
 
 struct SherpaOnnxOnlineRecognizer {
   std::unique_ptr<sherpa_onnx::OnlineRecognizer> impl;
@@ -534,89 +532,3 @@ void SherpaOnnxDestroySpeechSegment(const SherpaOnnxSpeechSegment *p) {
 void SherpaOnnxVoiceActivityDetectorReset(SherpaOnnxVoiceActivityDetector *p) {
   p->impl->Reset();
 }
-
-struct SherpaOnnxOfflineTts {
-  std::unique_ptr<sherpa_onnx::OfflineTts> impl;
-};
-
-SherpaOnnxOfflineTts *SherpaOnnxCreateOfflineTts(
-    const SherpaOnnxOfflineTtsConfig *config) {
-  sherpa_onnx::OfflineTtsConfig tts_config;
-
-  tts_config.model.vits.model = SHERPA_ONNX_OR(config->model.vits.model, "");
-  tts_config.model.vits.lexicon =
-      SHERPA_ONNX_OR(config->model.vits.lexicon, "");
-  tts_config.model.vits.tokens = SHERPA_ONNX_OR(config->model.vits.tokens, "");
-  tts_config.model.vits.data_dir =
-      SHERPA_ONNX_OR(config->model.vits.data_dir, "");
-  tts_config.model.vits.noise_scale =
-      SHERPA_ONNX_OR(config->model.vits.noise_scale, 0.667);
-  tts_config.model.vits.noise_scale_w =
-      SHERPA_ONNX_OR(config->model.vits.noise_scale_w, 0.8);
-  tts_config.model.vits.length_scale =
-      SHERPA_ONNX_OR(config->model.vits.length_scale, 1.0);
-
-  tts_config.model.num_threads = SHERPA_ONNX_OR(config->model.num_threads, 1);
-  tts_config.model.debug = config->model.debug;
-  tts_config.model.provider = SHERPA_ONNX_OR(config->model.provider, "cpu");
-  tts_config.rule_fsts = SHERPA_ONNX_OR(config->rule_fsts, "");
-  tts_config.max_num_sentences = SHERPA_ONNX_OR(config->max_num_sentences, 2);
-
-  if (tts_config.model.debug) {
-    fprintf(stderr, "%s\n", tts_config.ToString().c_str());
-  }
-
-  SherpaOnnxOfflineTts *tts = new SherpaOnnxOfflineTts;
-
-  tts->impl = std::make_unique<sherpa_onnx::OfflineTts>(tts_config);
-
-  return tts;
-}
-
-void SherpaOnnxDestroyOfflineTts(SherpaOnnxOfflineTts *tts) { delete tts; }
-
-int32_t SherpaOnnxOfflineTtsSampleRate(const SherpaOnnxOfflineTts *tts) {
-  return tts->impl->SampleRate();
-}
-
-const SherpaOnnxGeneratedAudio *SherpaOnnxOfflineTtsGenerate(
-    const SherpaOnnxOfflineTts *tts, const char *text, int32_t sid,
-    float speed) {
-  return SherpaOnnxOfflineTtsGenerateWithCallback(tts, text, sid, speed,
-                                                  nullptr);
-}
-
-const SherpaOnnxGeneratedAudio *SherpaOnnxOfflineTtsGenerateWithCallback(
-    const SherpaOnnxOfflineTts *tts, const char *text, int32_t sid, float speed,
-    SherpaOnnxGeneratedAudioCallback callback) {
-  sherpa_onnx::GeneratedAudio audio =
-      tts->impl->Generate(text, sid, speed, callback);
-
-  if (audio.samples.empty()) {
-    return nullptr;
-  }
-
-  SherpaOnnxGeneratedAudio *ans = new SherpaOnnxGeneratedAudio;
-
-  float *samples = new float[audio.samples.size()];
-  std::copy(audio.samples.begin(), audio.samples.end(), samples);
-
-  ans->samples = samples;
-  ans->n = audio.samples.size();
-  ans->sample_rate = audio.sample_rate;
-
-  return ans;
-}
-
-void SherpaOnnxDestroyOfflineTtsGeneratedAudio(
-    const SherpaOnnxGeneratedAudio *p) {
-  if (p) {
-    delete[] p->samples;
-    delete p;
-  }
-}
-
-int32_t SherpaOnnxWriteWave(const float *samples, int32_t n,
-                            int32_t sample_rate, const char *filename) {
-  return sherpa_onnx::WriteWave(filename, sample_rate, samples, n);
-}
diff --git a/sherpa-onnx/c-api/c-api.h b/sherpa-onnx/c-api/c-api.h
index 11971808c..3d840f025 100644
--- a/sherpa-onnx/c-api/c-api.h
+++ b/sherpa-onnx/c-api/c-api.h
@@ -600,6 +600,8 @@ SHERPA_ONNX_API void SherpaOnnxDestroySpeechSegment(
 SHERPA_ONNX_API void SherpaOnnxVoiceActivityDetectorReset(
     SherpaOnnxVoiceActivityDetector *p);
 
+#if defined(SHERPA_ONNX_ENABLE_TTS)
+
 // ============================================================
 // For offline Text-to-Speech (i.e., non-streaming TTS)
 // ============================================================
@@ -677,6 +679,8 @@ SHERPA_ONNX_API int32_t SherpaOnnxWriteWave(const float *samples, int32_t n,
                                             int32_t sample_rate,
                                             const char *filename);
 
+#endif  // SHERPA_ONNX_ENABLE_TTS
+
 #if defined(__GNUC__)
 #pragma GCC diagnostic pop
 #endif
diff --git a/sherpa-onnx/csrc/CMakeLists.txt b/sherpa-onnx/csrc/CMakeLists.txt
index c114e08fb..bdde9fbb8 100644
--- a/sherpa-onnx/csrc/CMakeLists.txt
+++ b/sherpa-onnx/csrc/CMakeLists.txt
@@ -41,7 +41,6 @@ set(sources
   offline-transducer-model-config.cc
   offline-transducer-model.cc
   offline-transducer-modified-beam-search-decoder.cc
-  offline-tts-character-frontend.cc
   offline-wenet-ctc-model-config.cc
   offline-wenet-ctc-model.cc
   offline-whisper-greedy-search-decoder.cc
@@ -75,7 +74,6 @@ set(sources
   packed-sequence.cc
   pad-sequence.cc
   parse-options.cc
-  piper-phonemize-lexicon.cc
   provider.cc
   resample.cc
   session.cc
@@ -94,15 +92,20 @@ set(sources
   wave-reader.cc
 )
 
-list(APPEND sources
-  lexicon.cc
-  offline-tts-impl.cc
-  offline-tts-model-config.cc
-  offline-tts-vits-model-config.cc
-  offline-tts-vits-model.cc
-  offline-tts.cc
-  wave-writer.cc
-)
+
+if(SHERPA_ONNX_ENABLE_TTS)
+  list(APPEND sources
+    lexicon.cc
+    offline-tts-character-frontend.cc
+    offline-tts-impl.cc
+    offline-tts-model-config.cc
+    offline-tts-vits-model-config.cc
+    offline-tts-vits-model.cc
+    offline-tts.cc
+    wave-writer.cc
+    piper-phonemize-lexicon.cc
+  )
+endif()
 
 if(SHERPA_ONNX_ENABLE_CHECK)
   list(APPEND sources log.cc)
@@ -143,7 +146,9 @@ if(SHERPA_ONNX_ENABLE_GPU)
   )
 endif()
 
-target_link_libraries(sherpa-onnx-core piper_phonemize)
+if(SHERPA_ONNX_ENABLE_TTS)
+  target_link_libraries(sherpa-onnx-core piper_phonemize)
+endif()
 
 if(SHERPA_ONNX_ENABLE_CHECK)
   target_compile_definitions(sherpa-onnx-core PUBLIC SHERPA_ONNX_ENABLE_CHECK=1)
@@ -164,15 +169,23 @@ endif()
 add_executable(sherpa-onnx sherpa-onnx.cc)
 add_executable(sherpa-onnx-offline sherpa-onnx-offline.cc)
 add_executable(sherpa-onnx-offline-parallel sherpa-onnx-offline-parallel.cc)
-add_executable(sherpa-onnx-offline-tts sherpa-onnx-offline-tts.cc)
+
+if(SHERPA_ONNX_ENABLE_TTS)
+  add_executable(sherpa-onnx-offline-tts sherpa-onnx-offline-tts.cc)
+endif()
 
 set(main_exes
   sherpa-onnx
   sherpa-onnx-offline
   sherpa-onnx-offline-parallel
-  sherpa-onnx-offline-tts
 )
 
+if(SHERPA_ONNX_ENABLE_TTS)
+  list(APPEND main_exes
+    sherpa-onnx-offline-tts
+  )
+endif()
+
 foreach(exe IN LISTS main_exes)
   target_link_libraries(${exe} sherpa-onnx-core)
 endforeach()
@@ -207,12 +220,21 @@ install(
 
 if(SHERPA_ONNX_HAS_ALSA)
   add_executable(sherpa-onnx-alsa sherpa-onnx-alsa.cc alsa.cc)
-  add_executable(sherpa-onnx-offline-tts-play-alsa sherpa-onnx-offline-tts-play-alsa.cc alsa-play.cc)
+
+  if(SHERPA_ONNX_ENABLE_TTS)
+    add_executable(sherpa-onnx-offline-tts-play-alsa sherpa-onnx-offline-tts-play-alsa.cc alsa-play.cc)
+  endif()
 
   set(exes
     sherpa-onnx-alsa
-    sherpa-onnx-offline-tts-play-alsa
   )
+
+  if(SHERPA_ONNX_ENABLE_TTS)
+    list(APPEND exes
+      sherpa-onnx-offline-tts-play-alsa
+    )
+  endif()
+
   foreach(exe IN LISTS exes)
     target_link_libraries(${exe} sherpa-onnx-core)
   endforeach()
@@ -246,10 +268,12 @@ if(SHERPA_ONNX_HAS_ALSA)
 endif()
 
 if(SHERPA_ONNX_ENABLE_PORTAUDIO)
-  add_executable(sherpa-onnx-offline-tts-play
-    sherpa-onnx-offline-tts-play.cc
-    microphone.cc
-  )
+  if(SHERPA_ONNX_ENABLE_TTS)
+    add_executable(sherpa-onnx-offline-tts-play
+      sherpa-onnx-offline-tts-play.cc
+      microphone.cc
+    )
+  endif()
 
   add_executable(sherpa-onnx-microphone
     sherpa-onnx-microphone.cc
@@ -278,12 +302,18 @@ if(SHERPA_ONNX_ENABLE_PORTAUDIO)
   endif()
 
   set(exes
-    sherpa-onnx-offline-tts-play
     sherpa-onnx-microphone
     sherpa-onnx-microphone-offline
     sherpa-onnx-vad-microphone
     sherpa-onnx-vad-microphone-offline-asr
   )
+
+  if(SHERPA_ONNX_ENABLE_TTS)
+    list(APPEND exes
+      sherpa-onnx-offline-tts-play
+    )
+  endif()
+
   foreach(exe IN LISTS exes)
     target_link_libraries(${exe} ${PA_LIB} sherpa-onnx-core)
   endforeach()
@@ -377,7 +407,6 @@ if(SHERPA_ONNX_ENABLE_TESTS)
     context-graph-test.cc
     packed-sequence-test.cc
     pad-sequence-test.cc
-    piper-phonemize-test.cc
     slice-test.cc
     stack-test.cc
     transpose-test.cc
@@ -385,6 +414,12 @@ if(SHERPA_ONNX_ENABLE_TESTS)
     utfcpp-test.cc
   )
 
+  if(SHERPA_ONNX_ENABLE_TTS)
+    list(APPEND sherpa_onnx_test_srcs
+      piper-phonemize-test.cc
+    )
+  endif()
+
   function(sherpa_onnx_add_test source)
     get_filename_component(name ${source} NAME_WE)
     set(target_name ${name})
diff --git a/sherpa-onnx/python/csrc/CMakeLists.txt b/sherpa-onnx/python/csrc/CMakeLists.txt
index e346922e5..85c35546d 100644
--- a/sherpa-onnx/python/csrc/CMakeLists.txt
+++ b/sherpa-onnx/python/csrc/CMakeLists.txt
@@ -1,6 +1,6 @@
 include_directories(${CMAKE_SOURCE_DIR})
 
-pybind11_add_module(_sherpa_onnx
+set(sources
   circular-buffer.cc
   display.cc
   endpoint.cc
@@ -14,9 +14,6 @@ pybind11_add_module(_sherpa_onnx
   offline-stream.cc
   offline-tdnn-model-config.cc
   offline-transducer-model-config.cc
-  offline-tts-model-config.cc
-  offline-tts-vits-model-config.cc
-  offline-tts.cc
   offline-wenet-ctc-model-config.cc
   offline-whisper-model-config.cc
   offline-zipformer-ctc-model-config.cc
@@ -34,6 +31,16 @@ pybind11_add_module(_sherpa_onnx
   voice-activity-detector.cc
 )
 
+if(SHERPA_ONNX_ENABLE_TTS)
+  list(APPEND sources
+    offline-tts-model-config.cc
+    offline-tts-vits-model-config.cc
+    offline-tts.cc
+  )
+endif()
+
+pybind11_add_module(_sherpa_onnx ${sources})
+
 if(APPLE)
   execute_process(
     COMMAND "${PYTHON_EXECUTABLE}" -c "from distutils.sysconfig import get_python_lib; print(get_python_lib())"