From ae29139a6e2f772d42126541a39e43ce5a7a8ab2 Mon Sep 17 00:00:00 2001 From: Fangjun Kuang Date: Sun, 5 Nov 2023 12:56:54 +0800 Subject: [PATCH] Support Chinese polyphones in TTS --- sherpa-onnx/csrc/lexicon.cc | 46 ++++++++++++++++++++++++++++++++++++- sherpa-onnx/csrc/lexicon.h | 4 ++++ 2 files changed, 49 insertions(+), 1 deletion(-) diff --git a/sherpa-onnx/csrc/lexicon.cc b/sherpa-onnx/csrc/lexicon.cc index b74892531..fce7bc8de 100644 --- a/sherpa-onnx/csrc/lexicon.cc +++ b/sherpa-onnx/csrc/lexicon.cc @@ -17,6 +17,8 @@ #include "android/asset_manager_jni.h" #endif +#include + #include "sherpa-onnx/csrc/macros.h" #include "sherpa-onnx/csrc/onnx-utils.h" #include "sherpa-onnx/csrc/text-utils.h" @@ -147,7 +149,36 @@ std::vector Lexicon::ConvertTextToTokenIds( std::vector Lexicon::ConvertTextToTokenIdsChinese( const std::string &text) const { - std::vector words = SplitUtf8(text); + std::vector words; + if (pattern_) { + // Handle polyphones + size_t pos = 0; + auto begin = std::sregex_iterator(text.begin(), text.end(), *pattern_); + auto end = std::sregex_iterator(); + for (std::sregex_iterator i = begin; i != end; ++i) { + std::smatch match = *i; + if (pos < match.position()) { + auto this_segment = text.substr(pos, match.position() - pos); + auto this_segment_words = SplitUtf8(this_segment); + words.insert(words.end(), this_segment_words.begin(), + this_segment_words.end()); + pos = match.position() + match.length(); + } else if (pos == match.position()) { + pos = match.position() + match.length(); + } + + words.push_back(match.str()); + } + + if (pos < text.size()) { + auto this_segment = text.substr(pos, text.size() - pos); + auto this_segment_words = SplitUtf8(this_segment); + words.insert(words.end(), this_segment_words.begin(), + this_segment_words.end()); + } + } else { + words = SplitUtf8(text); + } if (debug_) { fprintf(stderr, "Input text in string: %s\n", text.c_str()); @@ -272,6 +303,9 @@ void Lexicon::InitLexicon(std::istream &is) { std::string line; std::string phone; + std::ostringstream os; + std::string sep; + while (std::getline(is, line)) { std::istringstream iss(line); @@ -293,8 +327,18 @@ void Lexicon::InitLexicon(std::istream &is) { if (ids.empty()) { continue; } + if (language_ == Language::kChinese && word.size() > 3) { + // this is not a single word; + os << sep << word; + sep = "|"; + } + word2ids_.insert({std::move(word), std::move(ids)}); } + + if (!sep.empty()) { + pattern_ = std::make_unique(os.str()); + } } void Lexicon::InitPunctuations(const std::string &punctuations) { diff --git a/sherpa-onnx/csrc/lexicon.h b/sherpa-onnx/csrc/lexicon.h index af993712a..4cab72d0d 100644 --- a/sherpa-onnx/csrc/lexicon.h +++ b/sherpa-onnx/csrc/lexicon.h @@ -7,6 +7,7 @@ #include #include +#include #include #include #include @@ -79,6 +80,9 @@ class Lexicon { Language language_; bool debug_; bool is_piper_; + + // for Chinese polyphones + std::unique_ptr pattern_; }; } // namespace sherpa_onnx