updates

livekit · Jan 2, 2025
commit 97f5040
1 parent 4800261
Show file tree

Hide file tree

Showing 3 changed files with 20 additions and 37 deletions.
diff --git a/livekit-agents/livekit/agents/multimodal/agent_playout.py b/livekit-agents/livekit/agents/multimodal/agent_playout.py
@@ -161,14 +161,14 @@ async def _capture_task():
             await self._source.wait_for_playout()
 
         async def _stt_stream_co() -> None:
-            if stt_stream is not None:
+            if stt_stream and self._stt_forwarder is not None:
                 async for ev in stt_stream:
                     self._stt_forwarder.update(ev)
 
                     if ev.type == stt.SpeechEventType.FINAL_TRANSCRIPT:
-                        self.emit("final_transcript", ev)
+                        self.emit("final_transcript", ev.alternatives[0].text)
                     elif ev.type == stt.SpeechEventType.INTERIM_TRANSCRIPT:
-                        self.emit("interim_transcript", ev)
+                        self.emit("interim_transcript", ev.alternatives[0].text)
 
         read_text_task = asyncio.create_task(_play_text_stream())
 

diff --git a/livekit-agents/livekit/agents/multimodal/multimodal_agent.py b/livekit-agents/livekit/agents/multimodal/multimodal_agent.py
@@ -434,8 +434,8 @@ def _on_playout_stopped(interrupted: bool) -> None:
 
                     self._emit_speech_committed("agent", collected_text, interrupted)
 
-        def _on_final_transcript(ev: stt.SpeechEvent):
-            self._emit_speech_committed("agent", ev.alternatives[0].text)
+        def _on_final_transcript(text: str):
+            self._emit_speech_committed("agent", text)
 
         self._agent_playout.on("playout_started", _on_playout_started)
         self._agent_playout.on("playout_stopped", _on_playout_stopped)
@@ -486,9 +486,6 @@ def _subscribe_to_microphone(self, *args, **kwargs) -> None:
                 and publication.track != self._subscribed_track
             ):
                 self._subscribed_track = publication.track  # type: ignore
-                stream_24khz = rtc.AudioStream(
-                    self._subscribed_track, sample_rate=24000, num_channels=1
-                )  # type: ignore
                 self._stt_forwarder = STTSegmentsForwarder(
                     room=self._room,
                     participant=self._linked_participant,
@@ -499,20 +496,20 @@ def _subscribe_to_microphone(self, *args, **kwargs) -> None:
                     self._recognize_atask.cancel()
 
                 self._recognize_atask = asyncio.create_task(
-                    self._recognize_task(stream_24khz)
+                    self._recognize_task(self._subscribed_track)  # type: ignore
                 )
                 break
 
     @utils.log_exceptions(logger=logger)
-    async def _recognize_task(self, audio_stream: rtc.AudioStream) -> None:
+    async def _recognize_task(self, track: rtc.LocalAudioTrack) -> None:
         """
         Receive the frames from the user audio stream.
         """
-
+        stream_24khz = rtc.AudioStream(track, sample_rate=24000, num_channels=1)
         stt_stream = self._stt.stream() if self._stt is not None else None
 
         async def _micro_task() -> None:
-            async for ev in audio_stream:
+            async for ev in stream_24khz:
                 if stt_stream is not None:
                     stt_stream.push_frame(ev.frame)
                 self._input_audio_ch.send_nowait(ev.frame)
@@ -523,9 +520,9 @@ async def _stt_stream_co() -> None:
                     self._stt_forwarder.update(ev)
 
                     if ev.type == stt.SpeechEventType.FINAL_TRANSCRIPT:
-                        self.emit("final_transcript", ev)
+                        self.emit("final_transcript", ev.alternatives[0].text)
                     elif ev.type == stt.SpeechEventType.INTERIM_TRANSCRIPT:
-                        self.emit("interim_transcript", ev)
+                        self.emit("interim_transcript", ev.alternatives[0].text)
 
         tasks = [
             asyncio.create_task(_micro_task()),
@@ -544,8 +541,8 @@ def _ensure_session(self) -> aiohttp.ClientSession:
 
         return self._http_session
 
-    def _on_final_transcript(self, ev: stt.SpeechEvent):
-        self._emit_speech_committed("user", ev.alternatives[0].text)
+    def _on_final_transcript(self, text: str):
+        self._emit_speech_committed("user", text)
 
     def _emit_speech_committed(
         self, speaker: Literal["user", "agent"], msg: str, interrupted: bool = False

diff --git a/livekit-plugins/livekit-plugins-google/livekit/plugins/google/beta/realtime/stt.py b/livekit-plugins/livekit-plugins-google/livekit/plugins/google/beta/realtime/stt.py
@@ -36,27 +36,13 @@
 SYSTEM_INSTRUCTIONS = """
 You are an **Audio Transcriber**. Your task is to convert audio content into accurate and precise text.
 
-**Guidelines:**
-
-1. **Transcription Only:**
-   - Transcribe spoken words exactly as they are.
-   - Exclude any non-speech sounds (e.g., background noise, music).
-
-2. **Response Format:**
-   - Provide only the transcription without any additional text or explanations.
-   - If the audio is unclear or inaudible, respond with: `...`
-
-3. **Accuracy:**
-   - Ensure the transcription is free from errors.
-   - Maintain the original meaning and context of the speech.
-
-4. **Clarity:**
-   - Use proper punctuation and formatting to enhance readability.
-   - Preserve the original speaker's intent and tone as much as possible.
-
-**Do Not:**
-- Add any explanations, comments, or additional information.
-- Include timestamps, speaker labels, or annotations unless specified.
+- Transcribe verbatim; exclude non-speech sounds.
+- Provide only transcription; no extra text or explanations.
+- If audio is unclear, respond with: `...`
+- Ensure error-free transcription, preserving meaning and context.
+- Use proper punctuation and formatting.
+- Do not add explanations, comments, or extra information.
+- Do not include timestamps, speaker labels, or annotations unless specified.
 """