Skip to content

Commit

Permalink
Support Chinese polyphones in TTS
Browse files Browse the repository at this point in the history
  • Loading branch information
csukuangfj committed Nov 5, 2023
1 parent 606cb26 commit ae29139
Show file tree
Hide file tree
Showing 2 changed files with 49 additions and 1 deletion.
46 changes: 45 additions & 1 deletion sherpa-onnx/csrc/lexicon.cc
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@
#include "android/asset_manager_jni.h"
#endif

#include <regex>

#include "sherpa-onnx/csrc/macros.h"
#include "sherpa-onnx/csrc/onnx-utils.h"
#include "sherpa-onnx/csrc/text-utils.h"
Expand Down Expand Up @@ -147,7 +149,36 @@ std::vector<int64_t> Lexicon::ConvertTextToTokenIds(

std::vector<int64_t> Lexicon::ConvertTextToTokenIdsChinese(
const std::string &text) const {
std::vector<std::string> words = SplitUtf8(text);
std::vector<std::string> words;
// Fill `words` from `text`. When a pattern is available (InitLexicon()
// compiles one from the Chinese lexicon entries longer than 3 bytes), every
// match of the pattern is kept as a single element of `words`; all remaining
// text is split with SplitUtf8().
if (pattern_) {
// Handle polyphones
// Byte offset in `text` just past the last match handled so far.
size_t pos = 0;
auto begin = std::sregex_iterator(text.begin(), text.end(), *pattern_);
auto end = std::sregex_iterator();
for (std::sregex_iterator i = begin; i != end; ++i) {
std::smatch match = *i;
// NOTE(review): match.position() returns a signed difference_type while
// `pos` is size_t, so the two comparisons below mix signedness.
if (pos < match.position()) {
// Unmatched text lies between `pos` and the start of this match: split it
// with SplitUtf8() and append the pieces before the match itself.
auto this_segment = text.substr(pos, match.position() - pos);
auto this_segment_words = SplitUtf8(this_segment);
words.insert(words.end(), this_segment_words.begin(),
this_segment_words.end());
pos = match.position() + match.length();
} else if (pos == match.position()) {
// No unmatched text precedes this match; only advance past it.
pos = match.position() + match.length();
}

// The whole match is appended as one element of `words`.
words.push_back(match.str());
}

// Split whatever follows the last match (or the entire text when the
// pattern never matched).
if (pos < text.size()) {
auto this_segment = text.substr(pos, text.size() - pos);
auto this_segment_words = SplitUtf8(this_segment);
words.insert(words.end(), this_segment_words.begin(),
this_segment_words.end());
}
} else {
// No pattern was built: split the whole text with SplitUtf8().
words = SplitUtf8(text);
}

if (debug_) {
fprintf(stderr, "Input text in string: %s\n", text.c_str());
Expand Down Expand Up @@ -272,6 +303,9 @@ void Lexicon::InitLexicon(std::istream &is) {
std::string line;
std::string phone;

// Collects the Chinese lexicon entries longer than 3 bytes, joined by "|";
// the resulting string is compiled into pattern_ after the loop below.
std::ostringstream os;
// Empty until the first entry has been written to `os`, "|" afterwards, so
// the separator only appears between entries.
std::string sep;

while (std::getline(is, line)) {
std::istringstream iss(line);

Expand All @@ -293,8 +327,18 @@ void Lexicon::InitLexicon(std::istream &is) {
if (ids.empty()) {
continue;
}
// word.size() counts bytes, not characters.
// NOTE(review): "> 3" presumably means "more than one Chinese character",
// assuming each character occupies 3 bytes in UTF-8 -- confirm how entries
// containing 4-byte or ASCII characters should be treated.
if (language_ == Language::kChinese && word.size() > 3) {
// Treated as a multi-character entry: append it as one alternative of the
// pattern string.
// NOTE(review): `word` is written verbatim; any regex metacharacter in a
// lexicon entry is not escaped before the string is compiled below.
os << sep << word;
sep = "|";
}

word2ids_.insert({std::move(word), std::move(ids)});
}

// `sep` is non-empty only if at least one entry was written to `os` in the
// loop above; otherwise pattern_ is not assigned here.
// NOTE(review): std::regex defaults to the ECMAScript grammar, which tries
// alternatives left to right, so when one entry is a prefix of another the
// lexicon order decides which one matches -- confirm this is intended.
// The constructor throws std::regex_error if the string is not a valid
// pattern.
if (!sep.empty()) {
pattern_ = std::make_unique<std::regex>(os.str());
}
}

void Lexicon::InitPunctuations(const std::string &punctuations) {
Expand Down
4 changes: 4 additions & 0 deletions sherpa-onnx/csrc/lexicon.h
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

#include <cstdint>
#include <iostream>
#include <regex>
#include <string>
#include <unordered_map>
#include <unordered_set>
Expand Down Expand Up @@ -79,6 +80,9 @@ class Lexicon {
Language language_;
bool debug_;
bool is_piper_;

// For Chinese polyphones: a regex alternation of the Chinese lexicon entries
// longer than 3 bytes. ConvertTextToTokenIdsChinese() uses it to keep those
// entries whole when splitting text. InitLexicon() assigns it only when at
// least one such entry exists.
// NOTE(review): std::unique_ptr requires <memory>, which is not among the
// includes visible in this hunk -- confirm it is included.
std::unique_ptr<std::regex> pattern_;
};

} // namespace sherpa_onnx
Expand Down

0 comments on commit ae29139

Please sign in to comment.