diff --git a/.changeset/light-tools-jump.md b/.changeset/light-tools-jump.md
new file mode 100644
index 000000000..fecff7279
--- /dev/null
+++ b/.changeset/light-tools-jump.md
@@ -0,0 +1,5 @@
+---
+"livekit-plugins-azure": minor
+---
+
+Add prosody SSML support to Azure TTS (#912)
diff --git a/livekit-agents/livekit/agents/pipeline/pipeline_agent.py b/livekit-agents/livekit/agents/pipeline/pipeline_agent.py
index 5a0489c30..ed05b4613 100644
--- a/livekit-agents/livekit/agents/pipeline/pipeline_agent.py
+++ b/livekit-agents/livekit/agents/pipeline/pipeline_agent.py
@@ -718,7 +718,7 @@ def _commit_user_question_if_needed() -> None:
if tool_calls:
extra_tools_messages.append(
- ChatMessage.create_tool_calls(tool_calls, content=collected_text)
+ ChatMessage.create_tool_calls(tool_calls, text=collected_text)
)
extra_tools_messages.extend(tool_calls_results_msg)
diff --git a/livekit-plugins/livekit-plugins-azure/livekit/plugins/azure/tts.py b/livekit-plugins/livekit-plugins-azure/livekit/plugins/azure/tts.py
index 618f51f1f..9502b4a51 100644
--- a/livekit-plugins/livekit-plugins-azure/livekit/plugins/azure/tts.py
+++ b/livekit-plugins/livekit-plugins-azure/livekit/plugins/azure/tts.py
@@ -17,9 +17,10 @@
from dataclasses import dataclass
from typing import Literal
-import azure.cognitiveservices.speech as speechsdk # type: ignore
from livekit.agents import tts, utils
+import azure.cognitiveservices.speech as speechsdk # type: ignore
+
AZURE_SAMPLE_RATE: int = 16000
AZURE_BITS_PER_SAMPLE: int = 16
AZURE_NUM_CHANNELS: int = 1
@@ -65,7 +66,13 @@ def validate(self) -> None:
"Prosody volume must be one of 'silent', 'x-soft', 'soft', 'medium', 'loud', 'x-loud'"
)
- if self.pitch and self.pitch not in ["x-low", "low", "medium", "high", "x-high"]:
+ if self.pitch and self.pitch not in [
+ "x-low",
+ "low",
+ "medium",
+ "high",
+ "x-high",
+ ]:
raise ValueError(
"Prosody pitch must be one of 'x-low', 'low', 'medium', 'high', 'x-high'"
)
@@ -153,25 +160,22 @@ async def _main_task(self):
stream=stream_callback,
)
- def _create_ssml_text(text: str, opts: _TTSOptions) -> str:
- ssml = f''
- prosody_ssml = ""
- ssml += prosody_ssml
- ssml += text
- ssml += ""
- return ssml
-
def _synthesize() -> speechsdk.SpeechSynthesisResult:
if self._opts.prosody:
- ssml_text = _create_ssml_text(self._text, self._opts)
- return synthesizer.speak_ssml_async(ssml_text).get()
+ ssml = f''
+ prosody_ssml = ""
+ ssml += prosody_ssml
+ ssml += self._text
+ ssml += ""
+ return synthesizer.speak_ssml_async(ssml).get() # type: ignore
+
return synthesizer.speak_text_async(self._text).get() # type: ignore
result = None