Skip to content

Commit

Permalink
updates
Browse files Browse the repository at this point in the history
  • Loading branch information
jayeshp19 committed Jan 2, 2025
1 parent 4800261 commit 97f5040
Show file tree
Hide file tree
Showing 3 changed files with 20 additions and 37 deletions.
6 changes: 3 additions & 3 deletions livekit-agents/livekit/agents/multimodal/agent_playout.py
Original file line number Diff line number Diff line change
Expand Up @@ -161,14 +161,14 @@ async def _capture_task():
await self._source.wait_for_playout()

async def _stt_stream_co() -> None:
if stt_stream is not None:
if stt_stream and self._stt_forwarder is not None:
async for ev in stt_stream:
self._stt_forwarder.update(ev)

if ev.type == stt.SpeechEventType.FINAL_TRANSCRIPT:
self.emit("final_transcript", ev)
self.emit("final_transcript", ev.alternatives[0].text)
elif ev.type == stt.SpeechEventType.INTERIM_TRANSCRIPT:
self.emit("interim_transcript", ev)
self.emit("interim_transcript", ev.alternatives[0].text)

read_text_task = asyncio.create_task(_play_text_stream())

Expand Down
23 changes: 10 additions & 13 deletions livekit-agents/livekit/agents/multimodal/multimodal_agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -434,8 +434,8 @@ def _on_playout_stopped(interrupted: bool) -> None:

self._emit_speech_committed("agent", collected_text, interrupted)

def _on_final_transcript(ev: stt.SpeechEvent):
self._emit_speech_committed("agent", ev.alternatives[0].text)
def _on_final_transcript(text: str):
self._emit_speech_committed("agent", text)

self._agent_playout.on("playout_started", _on_playout_started)
self._agent_playout.on("playout_stopped", _on_playout_stopped)
Expand Down Expand Up @@ -486,9 +486,6 @@ def _subscribe_to_microphone(self, *args, **kwargs) -> None:
and publication.track != self._subscribed_track
):
self._subscribed_track = publication.track # type: ignore
stream_24khz = rtc.AudioStream(
self._subscribed_track, sample_rate=24000, num_channels=1
) # type: ignore
self._stt_forwarder = STTSegmentsForwarder(
room=self._room,
participant=self._linked_participant,
Expand All @@ -499,20 +496,20 @@ def _subscribe_to_microphone(self, *args, **kwargs) -> None:
self._recognize_atask.cancel()

self._recognize_atask = asyncio.create_task(
self._recognize_task(stream_24khz)
self._recognize_task(self._subscribed_track) # type: ignore
)
break

@utils.log_exceptions(logger=logger)
async def _recognize_task(self, audio_stream: rtc.AudioStream) -> None:
async def _recognize_task(self, track: rtc.LocalAudioTrack) -> None:
"""
Receive the frames from the user audio stream.
"""

stream_24khz = rtc.AudioStream(track, sample_rate=24000, num_channels=1)
stt_stream = self._stt.stream() if self._stt is not None else None

async def _micro_task() -> None:
async for ev in audio_stream:
async for ev in stream_24khz:
if stt_stream is not None:
stt_stream.push_frame(ev.frame)
self._input_audio_ch.send_nowait(ev.frame)
Expand All @@ -523,9 +520,9 @@ async def _stt_stream_co() -> None:
self._stt_forwarder.update(ev)

if ev.type == stt.SpeechEventType.FINAL_TRANSCRIPT:
self.emit("final_transcript", ev)
self.emit("final_transcript", ev.alternatives[0].text)
elif ev.type == stt.SpeechEventType.INTERIM_TRANSCRIPT:
self.emit("interim_transcript", ev)
self.emit("interim_transcript", ev.alternatives[0].text)

tasks = [
asyncio.create_task(_micro_task()),
Expand All @@ -544,8 +541,8 @@ def _ensure_session(self) -> aiohttp.ClientSession:

return self._http_session

def _on_final_transcript(self, ev: stt.SpeechEvent):
self._emit_speech_committed("user", ev.alternatives[0].text)
def _on_final_transcript(self, text: str):
self._emit_speech_committed("user", text)

def _emit_speech_committed(
self, speaker: Literal["user", "agent"], msg: str, interrupted: bool = False
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,27 +36,13 @@
SYSTEM_INSTRUCTIONS = """
You are an **Audio Transcriber**. Your task is to convert audio content into accurate and precise text.
**Guidelines:**
1. **Transcription Only:**
- Transcribe spoken words exactly as they are.
- Exclude any non-speech sounds (e.g., background noise, music).
2. **Response Format:**
- Provide only the transcription without any additional text or explanations.
- If the audio is unclear or inaudible, respond with: `...`
3. **Accuracy:**
- Ensure the transcription is free from errors.
- Maintain the original meaning and context of the speech.
4. **Clarity:**
- Use proper punctuation and formatting to enhance readability.
- Preserve the original speaker's intent and tone as much as possible.
**Do Not:**
- Add any explanations, comments, or additional information.
- Include timestamps, speaker labels, or annotations unless specified.
- Transcribe verbatim; exclude non-speech sounds.
- Provide only transcription; no extra text or explanations.
- If audio is unclear, respond with: `...`
- Ensure error-free transcription, preserving meaning and context.
- Use proper punctuation and formatting.
- Do not add explanations, comments, or extra information.
- Do not include timestamps, speaker labels, or annotations unless specified.
"""


Expand Down

0 comments on commit 97f5040

Please sign in to comment.