diff --git a/.changeset/good-horses-explain.md b/.changeset/good-horses-explain.md new file mode 100644 index 000000000..979d1fe3b --- /dev/null +++ b/.changeset/good-horses-explain.md @@ -0,0 +1,5 @@ +--- +"livekit-agents": patch +--- + +voice-pipeline: avoid stacked replies when interruptions is disallowed diff --git a/livekit-agents/livekit/agents/pipeline/pipeline_agent.py b/livekit-agents/livekit/agents/pipeline/pipeline_agent.py index 13bad4106..4d380f66c 100644 --- a/livekit-agents/livekit/agents/pipeline/pipeline_agent.py +++ b/livekit-agents/livekit/agents/pipeline/pipeline_agent.py @@ -443,8 +443,11 @@ def _on_final_transcript(ev: stt.SpeechEvent) -> None: ) + new_transcript if self._opts.preemptive_synthesis: - if not self._synthesize_agent_reply(): - return + if ( + self._playing_speech is None + or self._playing_speech.allow_interruptions + ): + self._synthesize_agent_reply() self._deferred_validation.on_human_final_transcript(new_transcript) @@ -509,17 +512,11 @@ def _on_playout_stopped(interrupted: bool) -> None: self._speech_q_changed.clear() - def _synthesize_agent_reply(self) -> bool: + def _synthesize_agent_reply(self): """Synthesize the agent reply to the user question, also make sure only one reply is synthesized/played at a time""" if self._pending_agent_reply is not None: - if not self._pending_agent_reply.allow_interruptions: - logger.debug( - "ignoring reply synthesis since interruptions are not allowed" - ) - return False - self._pending_agent_reply.interrupt() if self._human_input is not None and not self._human_input.speaking: @@ -535,8 +532,6 @@ def _synthesize_agent_reply(self) -> bool: self._synthesize_answer_task(self._agent_reply_task, new_handle) ) - return True - @utils.log_exceptions(logger=logger) async def _synthesize_answer_task( self, old_task: asyncio.Task[None], handle: SpeechHandle @@ -801,12 +796,21 @@ def _synthesize_agent_speech( def _validate_reply_if_possible(self) -> None: """Check if the new agent speech should be played""" + if ( + self._playing_speech is not None + and not self._playing_speech.allow_interruptions + ): + logger.debug( + "skipping validation, agent is speaking and does not allow interruptions", + extra={"speech_id": self._playing_speech.id}, + ) + return + if self._pending_agent_reply is None: if self._opts.preemptive_synthesis or not self._transcribed_text: return - if not self._synthesize_agent_reply(): - return + self._synthesize_agent_reply() assert self._pending_agent_reply is not None