diff --git a/.changeset/light-tools-jump.md b/.changeset/light-tools-jump.md new file mode 100644 index 000000000..fecff7279 --- /dev/null +++ b/.changeset/light-tools-jump.md @@ -0,0 +1,5 @@ +--- +"livekit-plugins-azure": minor +--- + +Azure TTS Prosody SSML support #912 diff --git a/livekit-agents/livekit/agents/pipeline/pipeline_agent.py b/livekit-agents/livekit/agents/pipeline/pipeline_agent.py index 5a0489c30..ed05b4613 100644 --- a/livekit-agents/livekit/agents/pipeline/pipeline_agent.py +++ b/livekit-agents/livekit/agents/pipeline/pipeline_agent.py @@ -718,7 +718,7 @@ def _commit_user_question_if_needed() -> None: if tool_calls: extra_tools_messages.append( - ChatMessage.create_tool_calls(tool_calls, content=collected_text) + ChatMessage.create_tool_calls(tool_calls, text=collected_text) ) extra_tools_messages.extend(tool_calls_results_msg) diff --git a/livekit-plugins/livekit-plugins-azure/livekit/plugins/azure/tts.py b/livekit-plugins/livekit-plugins-azure/livekit/plugins/azure/tts.py index 618f51f1f..9502b4a51 100644 --- a/livekit-plugins/livekit-plugins-azure/livekit/plugins/azure/tts.py +++ b/livekit-plugins/livekit-plugins-azure/livekit/plugins/azure/tts.py @@ -17,9 +17,10 @@ from dataclasses import dataclass from typing import Literal -import azure.cognitiveservices.speech as speechsdk # type: ignore from livekit.agents import tts, utils +import azure.cognitiveservices.speech as speechsdk # type: ignore + AZURE_SAMPLE_RATE: int = 16000 AZURE_BITS_PER_SAMPLE: int = 16 AZURE_NUM_CHANNELS: int = 1 @@ -65,7 +66,13 @@ def validate(self) -> None: "Prosody volume must be one of 'silent', 'x-soft', 'soft', 'medium', 'loud', 'x-loud'" ) - if self.pitch and self.pitch not in ["x-low", "low", "medium", "high", "x-high"]: + if self.pitch and self.pitch not in [ + "x-low", + "low", + "medium", + "high", + "x-high", + ]: raise ValueError( "Prosody pitch must be one of 'x-low', 'low', 'medium', 'high', 'x-high'" ) @@ -153,25 +160,22 @@ async def _main_task(self): stream=stream_callback, ) - def _create_ssml_text(text: str, opts: _TTSOptions) -> str: - ssml = f'' - prosody_ssml = " speechsdk.SpeechSynthesisResult: if self._opts.prosody: - ssml_text = _create_ssml_text(self._text, self._opts) - return synthesizer.speak_ssml_async(ssml_text).get() + ssml = f'' + prosody_ssml = "