diff --git a/src/python_run/piper/__main__.py b/src/python_run/piper/__main__.py
index 15109f8e..4e859354 100644
--- a/src/python_run/piper/__main__.py
+++ b/src/python_run/piper/__main__.py
@@ -36,6 +36,12 @@ def main() -> None:
         action="store_true",
         help="Stream raw audio to stdout",
     )
+    parser.add_argument(
+        "-p",
+        "--phoneme-input",
+        action="store_true",
+        help="Flag to use pure phonemes as input",
+    )
     #
     parser.add_argument("-s", "--speaker", type=int, help="Id of speaker (default: 0)")
     parser.add_argument(
@@ -107,6 +113,7 @@ def main() -> None:
     # Load voice
     voice = PiperVoice.load(args.model, config_path=args.config, use_cuda=args.cuda)
     synthesize_args = {
+        "phoneme_input": args.phoneme_input,
         "speaker_id": args.speaker,
         "length_scale": args.length_scale,
         "noise_scale": args.noise_scale,
diff --git a/src/python_run/piper/voice.py b/src/python_run/piper/voice.py
index 0360c273..9a0be7e4 100644
--- a/src/python_run/piper/voice.py
+++ b/src/python_run/piper/voice.py
@@ -90,6 +90,7 @@ def synthesize(
         self,
         text: str,
         wav_file: wave.Wave_write,
+        phoneme_input: bool = False,
         speaker_id: Optional[int] = None,
         length_scale: Optional[float] = None,
         noise_scale: Optional[float] = None,
@@ -103,6 +104,7 @@ def synthesize(
 
         for audio_bytes in self.synthesize_stream_raw(
             text,
+            phoneme_input=phoneme_input,
             speaker_id=speaker_id,
             length_scale=length_scale,
             noise_scale=noise_scale,
@@ -114,6 +116,7 @@ def synthesize(
     def synthesize_stream_raw(
         self,
         text: str,
+        phoneme_input: bool = False,
         speaker_id: Optional[int] = None,
         length_scale: Optional[float] = None,
         noise_scale: Optional[float] = None,
@@ -121,8 +124,11 @@ def synthesize_stream_raw(
         sentence_silence: float = 0.0,
     ) -> Iterable[bytes]:
         """Synthesize raw audio per sentence from text."""
-        sentence_phonemes = self.phonemize(text)
-
+        if phoneme_input:
+            # Input is already phonemes: skip phonemize() and use one sentence, one item per character
+            sentence_phonemes = [list(text)]
+        else:
+            sentence_phonemes = self.phonemize(text)
         # 16-bit mono
         num_silence_samples = int(sentence_silence * self.config.sample_rate)
         silence_bytes = bytes(num_silence_samples * 2)