From ce1c058bf802fec2767cb43e8bc2d09c3ffaa389 Mon Sep 17 00:00:00 2001
From: jerad fields
Date: Mon, 16 Dec 2024 12:49:56 -0600
Subject: [PATCH 01/46] use onnx turn detector model (#1231)

Co-authored-by: David Zhao
---
 .changeset/loud-onions-invent.md | 5 +++++
 .../livekit-plugins-turn-detector/README.md | 2 +-
 .../livekit/plugins/turn_detector/__init__.py | 7 +++++--
 .../livekit/plugins/turn_detector/eou.py | 10 +++++---
 livekit-plugins/livekit-plugins-turn-detector/setup.py | 1 +
 5 files changed, 19 insertions(+), 6 deletions(-)
 create mode 100644 .changeset/loud-onions-invent.md

diff --git a/.changeset/loud-onions-invent.md b/.changeset/loud-onions-invent.md
new file mode 100644
index 000000000..dcedf95b4
--- /dev/null
+++ b/.changeset/loud-onions-invent.md
@@ -0,0 +1,5 @@
+---
+"livekit-plugins-turn-detector": patch
+---
+
+use quantized onnx version of turn detector model
diff --git a/livekit-plugins/livekit-plugins-turn-detector/README.md b/livekit-plugins/livekit-plugins-turn-detector/README.md
index 988706784..859b803cf 100644
--- a/livekit-plugins/livekit-plugins-turn-detector/README.md
+++ b/livekit-plugins/livekit-plugins-turn-detector/README.md
@@ -35,7 +35,7 @@ python my_agent.py download-files

 ## Model system requirements

-The end-of-turn model is optimized to run on CPUs with modest system requirements. It is designed to run on the same server hosting your agents. On a 4-core server instance, it completes inference in under 100ms with minimal CPU usage.
+The end-of-turn model is optimized to run on CPUs with modest system requirements. It is designed to run on the same server hosting your agents. On a 4-core server instance, it completes inference in ~50ms with minimal CPU usage.

 The model requires 1.5GB of RAM and runs within a shared inference server, supporting multiple concurrent sessions.
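A minimal sketch of wiring this plugin into a pipeline agent, for context (the `EOUModel` class and the companion plugins shown are assumptions based on this plugin's exports, not part of this patch; run `python my_agent.py download-files` first so the quantized ONNX weights and tokenizer are cached locally):

    from livekit.agents.pipeline import VoicePipelineAgent
    from livekit.plugins import deepgram, openai, silero, turn_detector

    # the end-of-utterance model replaces a fixed silence timeout with a
    # model-based decision about whether the user has finished speaking
    agent = VoicePipelineAgent(
        vad=silero.VAD.load(),
        stt=deepgram.STT(),
        llm=openai.LLM(),
        tts=openai.TTS(),
        turn_detector=turn_detector.EOUModel(),
    )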
diff --git a/livekit-plugins/livekit-plugins-turn-detector/livekit/plugins/turn_detector/__init__.py b/livekit-plugins/livekit-plugins-turn-detector/livekit/plugins/turn_detector/__init__.py
index 6ca7eecbb..32692361a 100644
--- a/livekit-plugins/livekit-plugins-turn-detector/livekit/plugins/turn_detector/__init__.py
+++ b/livekit-plugins/livekit-plugins-turn-detector/livekit/plugins/turn_detector/__init__.py
@@ -27,11 +27,14 @@ def __init__(self):
         super().__init__(__name__, __version__, __package__, logger)

     def download_files(self) -> None:
-        from transformers import AutoModelForCausalLM, AutoTokenizer
+        from optimum.onnxruntime import ORTModelForCausalLM
+        from transformers import AutoTokenizer

         from .eou import HG_MODEL

-        AutoModelForCausalLM.from_pretrained(HG_MODEL)
+        ORTModelForCausalLM.from_pretrained(
+            HG_MODEL, use_cache=False, use_io_binding=False
+        )
         AutoTokenizer.from_pretrained(HG_MODEL)
diff --git a/livekit-plugins/livekit-plugins-turn-detector/livekit/plugins/turn_detector/eou.py b/livekit-plugins/livekit-plugins-turn-detector/livekit/plugins/turn_detector/eou.py
index d5f21799e..afbc09415 100644
--- a/livekit-plugins/livekit-plugins-turn-detector/livekit/plugins/turn_detector/eou.py
+++ b/livekit-plugins/livekit-plugins-turn-detector/livekit/plugins/turn_detector/eou.py
@@ -56,11 +56,15 @@ def _format_chat_ctx(self, chat_ctx: dict):
     def initialize(self) -> None:
         from huggingface_hub import errors
-        from transformers import AutoModelForCausalLM, AutoTokenizer
+        from optimum.onnxruntime import ORTModelForCausalLM
+        from transformers import AutoTokenizer

         try:
-            self._model = AutoModelForCausalLM.from_pretrained(
-                HG_MODEL, local_files_only=True
+            self._model = ORTModelForCausalLM.from_pretrained(
+                HG_MODEL,
+                local_files_only=True,
+                use_io_binding=False,
+                use_cache=False,
             )
             self._tokenizer = AutoTokenizer.from_pretrained(
                 HG_MODEL, local_files_only=True
diff --git a/livekit-plugins/livekit-plugins-turn-detector/setup.py b/livekit-plugins/livekit-plugins-turn-detector/setup.py
index a73d4c797..b26b8e536 100644
--- a/livekit-plugins/livekit-plugins-turn-detector/setup.py
+++ b/livekit-plugins/livekit-plugins-turn-detector/setup.py
@@ -54,6 +54,7 @@
         "transformers>=4.46",
         "numpy>=1.26",
         "torch>=2.5.1",
+        "optimum[onnxruntime]>=1.23.3",
     ],
     package_data={"livekit.plugins.turn_detector": ["py.typed"]},
     project_urls={

From aa933d2bb5ba2131cd54a5d3a91b44cd7c16f303 Mon Sep 17 00:00:00 2001
From: David Zhao
Date: Mon, 16 Dec 2024 17:09:29 -0800
Subject: [PATCH 02/46] streaming audio decoder, enables receiving compressed audio from TTS services (#1236)

---
 .changeset/empty-sheep-pump.md | 6 +
 .github/workflows/tests.yml | 32 +---
 livekit-agents/livekit/agents/stt/stt.py | 2 +-
 .../livekit/agents/utils/codecs/__init__.py | 3 +-
 .../livekit/agents/utils/codecs/decoder.py | 159 ++++++++++++++++++
 livekit-agents/setup.py | 2 +-
 .../livekit-plugins-deepgram/setup.py | 2 +-
 tests/.gitattributes | 1 +
 tests/change-sophie.opus | 3 +
 tests/test_decoder.py | 140 +++++++++++++++
 tests/test_stt.py | 2 +
 11 files changed, 325 insertions(+), 27 deletions(-)
 create mode 100644 .changeset/empty-sheep-pump.md
 create mode 100644 livekit-agents/livekit/agents/utils/codecs/decoder.py
 create mode 100644 tests/change-sophie.opus
 create mode 100644 tests/test_decoder.py

diff --git a/.changeset/empty-sheep-pump.md b/.changeset/empty-sheep-pump.md
new file mode 100644
index 000000000..06c854c20
--- /dev/null
+++ b/.changeset/empty-sheep-pump.md
@@ -0,0 +1,6 @@
+---
+"livekit-plugins-deepgram": patch
+"livekit-agents": patch
+---
+
+added streaming audio decoder for compressed audio.
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 302e5ad71..2fac6f9a2 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -18,15 +18,15 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        os:
-          [
-            macos-14-large,
+        os: [
+            # disabled Intel Macs due to pytorch 2.3+ not supporting it
+            # macos-14-large,
             macos-14,
             windows-2019,
             ubuntu-20.04,
            namespace-profile-default-arm64,
           ]
-        python_version: ["3.12"]
+        python_version: ["3.9", "3.12"]
         test_group: ["base"]
         include:
           # Include llm, stt, and tts tests only on Ubuntu 20.04 with Python 3.9
@@ -60,11 +60,8 @@ jobs:
             ${{ runner.os }}-cache

       - uses: actions/setup-python@v5
-        # brew will install python as part of ffmpeg install on MacOS
-        # installing system Python could cause a conflict with `Could not symlink bin/idle3`
-        if: ${{ matrix.os != 'macos-14-large' }}
         with:
-          python-version: "3.12"
+          python-version: ${{ matrix.python_version }}
           cache: "pip"

       - name: Install ffmpeg (Linux)
@@ -91,20 +88,9 @@ jobs:
       - name: Install packages
         shell: bash
         run: |
-          pip3 install pytest pytest-asyncio pytest-timeout './livekit-agents[codecs]' psutil
-          pip3 install -r ./tests/test-requirements.txt
-          pip3 install ./livekit-agents \
-            ./livekit-plugins/livekit-plugins-openai \
-            ./livekit-plugins/livekit-plugins-deepgram \
-            ./livekit-plugins/livekit-plugins-google \
-            ./livekit-plugins/livekit-plugins-nltk \
-            ./livekit-plugins/livekit-plugins-silero \
-            ./livekit-plugins/livekit-plugins-elevenlabs \
-            ./livekit-plugins/livekit-plugins-cartesia \
-            ./livekit-plugins/livekit-plugins-azure \
-            ./livekit-plugins/livekit-plugins-anthropic \
-            ./livekit-plugins/livekit-plugins-assemblyai \
-            ./livekit-plugins/livekit-plugins-fal
+          pip install pytest pytest-asyncio pytest-timeout './livekit-agents[codecs]' psutil
+          pip install -r ./tests/test-requirements.txt
+          ./livekit-plugins/install_local.sh

       - name: Run tests
         shell: bash
@@ -131,7 +117,7 @@

           case "${{ matrix.test_group }}" in
             base)
-              test_files="test_aio.py test_tokenizer.py test_vad.py test_ipc.py test_tts_fallback.py test_stt_fallback.py test_message_change.py"
+              test_files="test_aio.py test_tokenizer.py test_vad.py test_ipc.py test_tts_fallback.py test_stt_fallback.py test_message_change.py test_decoder.py"
               ;;
             llm)
               test_files="test_llm.py"
diff --git a/livekit-agents/livekit/agents/stt/stt.py b/livekit-agents/livekit/agents/stt/stt.py
index c1922bc56..e2f79f93c 100644
--- a/livekit-agents/livekit/agents/stt/stt.py
+++ b/livekit-agents/livekit/agents/stt/stt.py
@@ -295,7 +295,7 @@ def flush(self) -> None:
         self._input_ch.send_nowait(self._FlushSentinel())

     def end_input(self) -> None:
-        """Mark the end of input, no more text will be pushed"""
+        """Mark the end of input, no more audio will be pushed"""
         self.flush()
         self._input_ch.close()

diff --git a/livekit-agents/livekit/agents/utils/codecs/__init__.py b/livekit-agents/livekit/agents/utils/codecs/__init__.py
index 35f19332a..ad2f77b91 100644
--- a/livekit-agents/livekit/agents/utils/codecs/__init__.py
+++ b/livekit-agents/livekit/agents/utils/codecs/__init__.py
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
+from .decoder import AudioStreamDecoder, StreamBuffer
 from .mp3 import Mp3StreamDecoder

-__all__ = ["Mp3StreamDecoder"]
+__all__ = ["Mp3StreamDecoder", "AudioStreamDecoder", "StreamBuffer"]
diff --git a/livekit-agents/livekit/agents/utils/codecs/decoder.py b/livekit-agents/livekit/agents/utils/codecs/decoder.py
new file mode 100644
index 000000000..01367c055
--- /dev/null
+++ b/livekit-agents/livekit/agents/utils/codecs/decoder.py
@@ -0,0 +1,159 @@
+# Copyright 2024 LiveKit, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import asyncio
+import io
+from typing import AsyncIterator
+
+from livekit.agents.utils import aio
+
+try:
+    # preload to ensure faster startup
+    import av  # noqa
+except ImportError:
+    pass
+import threading
+
+from livekit import rtc
+
+
+class StreamBuffer:
+    """
+    A thread-safe buffer that behaves like an IO stream.
+    Allows writing from one thread and reading from another.
+    """
+
+    def __init__(self):
+        self._buffer = io.BytesIO()
+        self._lock = threading.Lock()
+        self._data_available = threading.Condition(self._lock)
+        self._eof = False  # EOF flag to signal no more writes
+
+    def write(self, data: bytes):
+        """Write data to the buffer from a writer thread."""
+        with self._data_available:  # Lock and notify readers
+            self._buffer.seek(0, io.SEEK_END)  # Move to the end
+            self._buffer.write(data)
+            self._data_available.notify_all()  # Notify waiting readers
+
+    def read(self, size: int = -1) -> bytes:
+        """Read data from the buffer in a reader thread."""
+        if self._buffer.closed:
+            return b""
+
+        with self._data_available:
+            while True:
+                self._buffer.seek(0)  # Rewind for reading
+                data = self._buffer.read(size)
+
+                # If data is available, return it
+                if data:
+                    # Shrink the buffer to remove already-read data
+                    remaining = self._buffer.read()
+                    self._buffer = io.BytesIO(remaining)
+                    return data
+
+                # If EOF is signaled and no data remains, return EOF
+                if self._eof:
+                    return b""
+
+                # Wait for more data
+                self._data_available.wait()
+
+    def end_input(self):
+        """Signal that no more data will be written."""
+        with self._data_available:
+            self._eof = True
+            self._data_available.notify_all()
+
+    def close(self):
+        self._buffer.close()
+
+
+class AudioStreamDecoder:
+    """A class that can be used to decode an audio stream into PCM AudioFrames.
+
+    Decoders are stateful and should not be reused across multiple streams. Each decoder
+    is designed to decode a single stream.
+    """
+
+    def __init__(self):
+        try:
+            import av  # noqa
+        except ImportError:
+            raise ImportError(
+                "You haven't included the 'codecs' optional dependencies. Please install the 'codecs' extra by running `pip install livekit-agents[codecs]`"
+            )
+
+        self._output_ch = aio.Chan[rtc.AudioFrame]()
+        self._closed = False
+        self._started = False
+        self._output_finished = False
+        self._input_buf = StreamBuffer()
+        self._loop = asyncio.get_event_loop()
+
+    def push(self, chunk: bytes):
+        self._input_buf.write(chunk)
+        if not self._started:
+            self._started = True
+            self._loop.run_in_executor(None, self._decode_loop)
+
+    def end_input(self):
+        self._input_buf.end_input()
+
+    def _decode_loop(self):
+        container = av.open(self._input_buf)
+        # use a default of None so a missing audio stream doesn't raise StopIteration
+        audio_stream = next(
+            (s for s in container.streams if s.type == "audio"), None
+        )
+        try:
+            if audio_stream is None:
+                # no audio stream found in the container
+                return
+            resampler = av.AudioResampler(
+                # convert to signed 16-bit little endian
+                format="s16",
+                layout="mono",
+                rate=audio_stream.rate,
+            )
+            for frame in container.decode(audio_stream):
+                if self._closed:
+                    return
+                for resampled_frame in resampler.resample(frame):
+                    nchannels = len(resampled_frame.layout.channels)
+                    data = resampled_frame.to_ndarray().tobytes()
+                    self._output_ch.send_nowait(
+                        rtc.AudioFrame(
+                            data=data,
+                            num_channels=nchannels,
+                            sample_rate=resampled_frame.sample_rate,
+                            # integer division: samples_per_channel must be an int
+                            samples_per_channel=resampled_frame.samples // nchannels,
+                        )
+                    )
+        finally:
+            self._output_finished = True

+    def __aiter__(self) -> AsyncIterator[rtc.AudioFrame]:
+        return self
+
+    async def __anext__(self) -> rtc.AudioFrame:
+        if self._output_finished and self._output_ch.empty():
+            raise StopAsyncIteration
+        return await self._output_ch.__anext__()
+
+    async def aclose(self):
+        if self._closed:
+            return
+        self._closed = True
+        self._input_buf.close()
+        self._output_ch.close()
diff --git a/livekit-agents/setup.py b/livekit-agents/setup.py
index bf662dc34..9ff541808 100644
--- a/livekit-agents/setup.py
+++ b/livekit-agents/setup.py
@@ -66,7 +66,7 @@
         ':sys_platform!="win32"': [
             "aiodns~=3.2"
         ],  # use default aiohttp resolver on windows
-        "codecs": ["av>=11.0.0"],
+        "codecs": ["av>=12.0.0", "numpy>=1.26.0"],
         "images": ["pillow>=10.3.0"],
     },
     package_data={"livekit.agents": ["py.typed"]},
diff --git a/livekit-plugins/livekit-plugins-deepgram/setup.py b/livekit-plugins/livekit-plugins-deepgram/setup.py
index 077c6d659..8a583611d 100644
--- a/livekit-plugins/livekit-plugins-deepgram/setup.py
+++ b/livekit-plugins/livekit-plugins-deepgram/setup.py
@@ -47,7 +47,7 @@
     license="Apache-2.0",
     packages=setuptools.find_namespace_packages(include=["livekit.*"]),
     python_requires=">=3.9.0",
-    install_requires=["livekit-agents>=0.11.3", "numpy~=1.21"],
+    install_requires=["livekit-agents>=0.12.2", "numpy>=1.26"],
     package_data={"livekit.plugins.deepgram": ["py.typed"]},
     project_urls={
         "Documentation": "https://docs.livekit.io",
diff --git a/tests/.gitattributes b/tests/.gitattributes
index 9a8911093..83117e69b 100644
--- a/tests/.gitattributes
+++ b/tests/.gitattributes
@@ -1,4 +1,5 @@
 long.mp3 filter=lfs diff=lfs merge=lfs -text
 change-sophie.wav filter=lfs diff=lfs merge=lfs -text
+change-sophie.opus filter=lfs diff=lfs merge=lfs -text
 hearts.rgba filter=lfs diff=lfs merge=lfs -text
 hearts.jpg filter=lfs diff=lfs merge=lfs -text
diff --git a/tests/change-sophie.opus b/tests/change-sophie.opus
new file mode 100644
index 000000000..5112fcab5
--- /dev/null
+++ b/tests/change-sophie.opus
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7a2eb5667dc35714b4cb70324d3722f89580885ee5e51be5f2c793e7893d9a24
+size 48905
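A minimal usage sketch of the new decoder, for orientation before the tests below (the chunk source is hypothetical; any compressed stream, such as mp3 or opus bytes from a TTS HTTP response, is handled the same way):

    from livekit.agents.utils.codecs import AudioStreamDecoder

    async def decode_all(chunks):
        # chunks: an async iterator of compressed audio bytes
        decoder = AudioStreamDecoder()
        async for chunk in chunks:
            decoder.push(chunk)  # decoding starts on the first push
        decoder.end_input()
        async for frame in decoder:
            ...  # each frame is a mono, 16-bit PCM rtc.AudioFrame
        await decoder.aclose()

diff --git a/tests/test_decoder.py b/tests/test_decoder.py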
new file mode 100644
index 000000000..c5ecacce8
--- /dev/null
+++ b/tests/test_decoder.py
@@ -0,0 +1,140 @@
+import os
+import threading
+import time
+from concurrent.futures import ThreadPoolExecutor
+
+import aiohttp
+import pytest
+from livekit.agents.stt import SpeechEventType
+from livekit.agents.utils.codecs import AudioStreamDecoder, StreamBuffer
+from livekit.plugins import deepgram
+
+from .utils import wer
+
+TEST_AUDIO_FILEPATH = os.path.join(os.path.dirname(__file__), "change-sophie.opus")
+
+
+@pytest.mark.asyncio
+async def test_decode_and_transcribe():
+    # Skip if test file doesn't exist
+    if not os.path.exists(TEST_AUDIO_FILEPATH):
+        pytest.skip(f"Test file not found: {TEST_AUDIO_FILEPATH}")
+
+    decoder = AudioStreamDecoder()
+    with open(TEST_AUDIO_FILEPATH, "rb") as f:
+        opus_data = f.read()
+        decoder.push(opus_data)
+        decoder.end_input()
+
+    session = aiohttp.ClientSession()
+    stt = deepgram.STT(http_session=session)
+    stream = stt.stream()
+
+    # Push frames to STT
+    async for frame in decoder:
+        stream.push_frame(frame)
+
+    # Mark end of input
+    stream.end_input()
+
+    # Collect results
+    final_text = ""
+    async for event in stream:
+        if event.type == SpeechEventType.FINAL_TRANSCRIPT:
+            if event.alternatives:
+                if final_text:
+                    final_text += " "
+                final_text += event.alternatives[0].text
+
+    await decoder.aclose()
+    await stream.aclose()
+    await session.close()
+
+    # Verify the transcription
+    expected_text = "the people that are crazy enough to think they can change the world are the ones who do"
+    assert wer(final_text, expected_text) < 0.2
+
+
+def test_stream_buffer():
+    buffer = StreamBuffer()
+    data_chunks = [b"hello", b"world", b"test", b"data"]
+    received_data = bytearray()
+    write_completed = threading.Event()
+
+    def writer():
+        for chunk in data_chunks:
+            buffer.write(chunk)
+            time.sleep(0.01)  # Simulate some processing time
+        buffer.end_input()
+        write_completed.set()
+
+    def reader():
+        while True:
+            data = buffer.read(4)  # Read in small chunks
+            if not data:  # EOF
+                break
+            received_data.extend(data)
+
+    # Run writer and reader in separate threads
+    with ThreadPoolExecutor(max_workers=2) as executor:
+        reader_future = executor.submit(reader)
+        writer_future = executor.submit(writer)
+
+        # Wait for both threads to complete
+        writer_future.result()
+        reader_future.result()
+
+    # Verify that all data was received correctly
+    expected_data = b"".join(data_chunks)
+    assert bytes(received_data) == expected_data
+
+
+def test_stream_buffer_large_chunks():
+    buffer = StreamBuffer()
+    large_chunk = b"x" * 1024 * 1024  # 1MB chunk
+    num_chunks = 5
+    total_size = 0
+    write_completed = threading.Event()
+
+    def writer():
+        nonlocal total_size
+        for _ in range(num_chunks):
+            buffer.write(large_chunk)
+            total_size += len(large_chunk)
+        buffer.end_input()
+        write_completed.set()
+
+    received_size = 0
+
+    def reader():
+        nonlocal received_size
+        while True:
+            chunk = buffer.read(8192)  # Read in 8KB chunks
+            if not chunk:
+                break
+            received_size += len(chunk)
+
+    # Run writer and reader in separate threads
+    with ThreadPoolExecutor(max_workers=2) as executor:
+        reader_future = executor.submit(reader)
+        writer_future = executor.submit(writer)
+
+        # Wait for both threads to complete
+        writer_future.result()
+        reader_future.result()
+
+    assert received_size == total_size
+    assert total_size == num_chunks * len(large_chunk)
+
+
+def test_stream_buffer_early_close():
+    buffer = StreamBuffer()
+
+    # Write some data
+    buffer.write(b"test data")
+
+    # Close the buffer
+    buffer.close()
+
+    # Reading from closed buffer should return empty bytes
+    assert buffer.read() == b""
diff --git a/tests/test_stt.py b/tests/test_stt.py
index 836cfd20a..d1f340b1e 100644
--- a/tests/test_stt.py
+++ b/tests/test_stt.py
@@ -108,6 +108,8 @@ async def _stream_output():
                 continue

             if event.type == agents.stt.SpeechEventType.FINAL_TRANSCRIPT:
+                if text != "":
+                    text += " "
                 text += event.alternatives[0].text
                 # ensure STT is tagging languages correctly
                 language = event.alternatives[0].language

From 52880aa9876df77e55aefae7947bbd7a4a5867f3 Mon Sep 17 00:00:00 2001
From: Ishimwe Prince
Date: Tue, 17 Dec 2024 12:56:44 +0200
Subject: [PATCH 03/46] fix: fix `imgui` setup (#1226)

---
 .changeset/fix-imgui-setup.md | 5 +++++
 .../livekit-plugins-browser/src/CMakeLists.txt | 9 ++++++++-
 2 files changed, 13 insertions(+), 1 deletion(-)
 create mode 100644 .changeset/fix-imgui-setup.md

diff --git a/.changeset/fix-imgui-setup.md b/.changeset/fix-imgui-setup.md
new file mode 100644
index 000000000..a6e52168e
--- /dev/null
+++ b/.changeset/fix-imgui-setup.md
@@ -0,0 +1,5 @@
+---
+"livekit-plugins-browser": patch
+---
+
+fix: fix `imgui` setup
diff --git a/livekit-plugins/livekit-plugins-browser/src/CMakeLists.txt b/livekit-plugins/livekit-plugins-browser/src/CMakeLists.txt
index 298ee3c37..f236519cb 100644
--- a/livekit-plugins/livekit-plugins-browser/src/CMakeLists.txt
+++ b/livekit-plugins/livekit-plugins-browser/src/CMakeLists.txt
@@ -11,8 +11,15 @@ set(GLFW_INSTALL OFF CACHE BOOL "" FORCE)
 FetchContent_Declare(glfw GIT_REPOSITORY https://github.com/glfw/glfw.git GIT_TAG 3.4)
 FetchContent_MakeAvailable(glfw)

-FetchContent_Declare(imgui GIT_REPOSITORY https://github.com/ocornut/imgui GIT_TAG origin/docking)
+FetchContent_Declare(
+  imgui
+  GIT_REPOSITORY https://github.com/ocornut/imgui
+  GIT_TAG origin/docking
+  GIT_SHALLOW TRUE
+)
 FetchContent_GetProperties(imgui)
+FetchContent_Populate(imgui)
+
 FetchContent_MakeAvailable(imgui)
 file(GLOB IMGUI_SOURCES ${imgui_SOURCE_DIR}/*.cpp)
 add_library(imgui STATIC ${IMGUI_SOURCES}

From 891d5e7bd6329fa9d2df06e725eb70d656948687 Mon Sep 17 00:00:00 2001
From: Jayesh Parmar <60539217+jayeshp19@users.noreply.github.com>
Date: Tue, 17 Dec 2024 18:00:45 +0530
Subject: [PATCH 04/46] fix: correctly parse function argument types (#1221)

---
 .changeset/nervous-years-sell.md | 7 +
 .github/workflows/tests.yml | 2 +-
 .../livekit/agents/llm/function_context.py | 38 ++--
 .../livekit/plugins/anthropic/llm.py | 17 +-
 .../livekit/plugins/openai/_oai_api.py | 31 +--
 tests/test_build_func_desc.py | 51 +++++
 tests/test_create_func.py | 209 ++++++++++++++++++
 7 files changed, 313 insertions(+), 42 deletions(-)
 create mode 100644 .changeset/nervous-years-sell.md
 create mode 100644 tests/test_build_func_desc.py
 create mode 100644 tests/test_create_func.py

diff --git a/.changeset/nervous-years-sell.md b/.changeset/nervous-years-sell.md
new file mode 100644
index 000000000..a7829fe92
--- /dev/null
+++ b/.changeset/nervous-years-sell.md
@@ -0,0 +1,7 @@
+---
+"livekit-plugins-anthropic": patch
+"livekit-plugins-openai": patch
+"livekit-agents": patch
+---
+
+fix: correctly parse function argument types
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 2fac6f9a2..2da4754b0 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -117,7 +117,7 @@

           case "${{ matrix.test_group }}" in
             base)
-              test_files="test_aio.py test_tokenizer.py test_vad.py test_ipc.py test_tts_fallback.py test_stt_fallback.py test_message_change.py test_decoder.py"
+              test_files="test_aio.py test_tokenizer.py test_vad.py test_ipc.py test_tts_fallback.py test_stt_fallback.py test_message_change.py test_decoder.py test_build_func_desc.py test_create_func.py"
               ;;
             llm)
               test_files="test_llm.py"
diff --git a/livekit-agents/livekit/agents/llm/function_context.py b/livekit-agents/livekit/agents/llm/function_context.py
index 4290d121e..aa4df9842 100644
--- a/livekit-agents/livekit/agents/llm/function_context.py
+++ b/livekit-agents/livekit/agents/llm/function_context.py
@@ -18,9 +18,10 @@
 import enum
 import functools
 import inspect
+import types
 import typing
 from dataclasses import dataclass
-from typing import Any, Callable, Tuple
+from typing import Any, Callable, Optional, Tuple

 from ..log import logger
@@ -54,7 +55,6 @@ class FunctionArgInfo:
     type: type
     default: Any
     choices: tuple | None
-    is_optional: bool


 @dataclass(frozen=True)
@@ -169,15 +169,13 @@ def _register_ai_function(self, fnc: Callable) -> None:
             )

             desc = type_info.description if type_info else ""
-            choices = type_info.choices if type_info else None
+            choices = type_info.choices if type_info else ()

-            is_optional, optional_inner = _is_optional_type(inner_th)
-            if is_optional:
-                # when the type is optional, only the inner type is relevant
-                # the argument info for default would be None
-                inner_th = optional_inner
-
-            if issubclass(inner_th, enum.Enum) and not choices:
+            if (
+                isinstance(inner_th, type)
+                and issubclass(inner_th, enum.Enum)
+                and not choices
+            ):
                 # the enum must be a str or int (and at least one value)
                 # this is verified by is_type_supported
                 choices = tuple([item.value for item in inner_th])
@@ -189,7 +187,6 @@
                 type=inner_th,
                 default=param.default,
                 choices=choices,
-                is_optional=is_optional,
             )

         self._fncs[metadata.name] = FunctionInfo(
@@ -225,7 +222,8 @@ def _extract_types(annotation: type) -> tuple[type, TypeInfo | None]:

     is_optional, optional_inner = _is_optional_type(annotation)
     if is_optional:
-        return _extract_types(optional_inner)
+        inner_type, info = _extract_types(optional_inner)
+        return Optional[inner_type], info  # type: ignore

     return annotation, None
@@ -293,17 +291,15 @@ def is_type_supported(t: type) -> bool:
 def _is_optional_type(typ) -> Tuple[bool, Any]:
     """return is_optional, inner_type"""
     origin = typing.get_origin(typ)
+    if origin is None or origin is list:
+        return False, typ

-    if origin in {typing.Union, getattr(__builtins__, "UnionType", typing.Union)}:
+    if origin in {typing.Union, getattr(types, "UnionType", typing.Union)}:
         args = typing.get_args(typ)
         is_optional = type(None) in args
-
-        inner_arg = None
-        for arg in args:
-            if arg is not type(None):
-                inner_arg = arg
-                break
-
-        return is_optional, inner_arg
+        non_none_args = [a for a in args if a is not type(None)]
+        if is_optional and len(non_none_args) == 1:
+            # Exactly one non-None type + None means optional
+            return True, non_none_args[0]

     return False, None
diff --git a/livekit-plugins/livekit-plugins-anthropic/livekit/plugins/anthropic/llm.py b/livekit-plugins/livekit-plugins-anthropic/livekit/plugins/anthropic/llm.py
index b48d6ec58..9678c9381 100644
--- a/livekit-plugins/livekit-plugins-anthropic/livekit/plugins/anthropic/llm.py
+++ b/livekit-plugins/livekit-plugins-anthropic/livekit/plugins/anthropic/llm.py
@@ -41,6 +41,7 @@
     utils,
 )
 from livekit.agents.llm import ToolChoice
+from livekit.agents.llm.function_context import _is_optional_type
 from livekit.agents.types import DEFAULT_API_CONNECT_OPTIONS, APIConnectOptions

 import anthropic
@@ -517,13 +518,15 @@ def _create_ai_function_info(
             continue

         arg_value = parsed_arguments[arg_info.name]
-        if get_origin(arg_info.type) is not None:
+        is_optional, inner_th = _is_optional_type(arg_info.type)
+
+        if get_origin(inner_th) is not None:
             if not isinstance(arg_value, list):
                 raise ValueError(
                     f"AI function {fnc_name} argument {arg_info.name} should be a list"
                 )

-            inner_type = get_args(arg_info.type)[0]
+            inner_type = get_args(inner_th)[0]
             sanitized_value = [
                 _sanitize_primitive(
                     value=v, expected_type=inner_type, choices=arg_info.choices
                 )
                 for v in arg_value
             ]
         else:
             sanitized_value = _sanitize_primitive(
-                value=arg_value, expected_type=arg_info.type, choices=arg_info.choices
+                value=arg_value, expected_type=inner_th, choices=arg_info.choices
             )

         sanitized_arguments[arg_info.name] = sanitized_value
@@ -568,8 +571,10 @@ def type2str(t: type) -> str:
         if arg_info.description:
             p["description"] = arg_info.description

-        if get_origin(arg_info.type) is list:
-            inner_type = get_args(arg_info.type)[0]
+        is_optional, inner_th = _is_optional_type(arg_info.type)
+
+        if get_origin(inner_th) is list:
+            inner_type = get_args(inner_th)[0]
             p["type"] = "array"
             p["items"] = {}
             p["items"]["type"] = type2str(inner_type)

             if arg_info.choices:
                 p["items"]["enum"] = arg_info.choices
         else:
-            p["type"] = type2str(arg_info.type)
+            p["type"] = type2str(inner_th)

             if arg_info.choices:
                 p["enum"] = arg_info.choices
diff --git a/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/_oai_api.py b/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/_oai_api.py
index b82c29de9..8bf05a19f 100644
--- a/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/_oai_api.py
+++ b/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/_oai_api.py
@@ -20,6 +20,7 @@
 from typing import Any

 from livekit.agents.llm import function_context, llm
+from livekit.agents.llm.function_context import _is_optional_type

 __all__ = ["build_oai_function_description", "create_ai_function_info"]
@@ -55,28 +56,28 @@ def create_ai_function_info(
             continue

         arg_value = parsed_arguments[arg_info.name]
-        if typing.get_origin(arg_info.type) is not None:
+        is_optional, inner_th = _is_optional_type(arg_info.type)
+
+        if typing.get_origin(inner_th) is not None:
             if not isinstance(arg_value, list):
                 raise ValueError(
                     f"AI function {fnc_name} argument {arg_info.name} should be a list"
                 )

-            inner_type = typing.get_args(arg_info.type)[0]
+            inner_type = typing.get_args(inner_th)[0]
             sanitized_value = [
                 _sanitize_primitive(
                     value=v,
                     expected_type=inner_type,
                     choices=arg_info.choices,
-                    is_optional=arg_info.is_optional,
                 )
                 for v in arg_value
             ]
         else:
             sanitized_value = _sanitize_primitive(
                 value=arg_value,
-                expected_type=arg_info.type,
+                expected_type=inner_th,
                 choices=arg_info.choices,
-                is_optional=arg_info.is_optional,
             )

         sanitized_arguments[arg_info.name] = sanitized_value
@@ -109,8 +110,10 @@ def type2str(t: type) -> str:
         if arg_info.description:
             p["description"] = arg_info.description

-        if typing.get_origin(arg_info.type) is list:
-            inner_type = typing.get_args(arg_info.type)[0]
+        is_optional, inner_th = _is_optional_type(arg_info.type)
+
+        if typing.get_origin(inner_th) is list:
+            inner_type = typing.get_args(inner_th)[0]
             p["type"] = "array"
             p["items"] = {}
             p["items"]["type"] = type2str(inner_type)
+
             if arg_info.choices:
                 p["items"]["enum"] = arg_info.choices
         else:
-            p["type"] = type2str(arg_info.type)
+            p["type"] = type2str(inner_th)

             if arg_info.choices:
                 p["enum"] = arg_info.choices
-                if arg_info.type is int and arg_info.choices and capabilities is not None:
-                    if not capabilities.supports_choices_on_int:
+                if (
+                    inner_th is int
+                    and capabilities
+                    and not capabilities.supports_choices_on_int
+                ):
                     raise ValueError(
                         f"Parameter '{arg_info.name}' uses 'choices' with 'int', which is not supported by this model."
                     )
@@ -153,11 +159,8 @@ def type2str(t: type) -> str:
 def _sanitize_primitive(
-    *, value: Any, expected_type: type, choices: tuple | None, is_optional: bool = False
+    *, value: Any, expected_type: type, choices: tuple | None
 ) -> Any:
-    if is_optional and value is None:
-        return None
-
     if expected_type is str:
         if not isinstance(value, str):
             raise ValueError(f"expected str, got {type(value)}")
diff --git a/tests/test_build_func_desc.py b/tests/test_build_func_desc.py
new file mode 100644
index 000000000..67659df3b
--- /dev/null
+++ b/tests/test_build_func_desc.py
@@ -0,0 +1,51 @@
+import sys
+from inspect import _empty
+from typing import List, Optional, Union
+
+import pytest
+from livekit.agents.llm import FunctionArgInfo, FunctionInfo
+from livekit.agents.llm.function_context import _is_optional_type
+from livekit.plugins.openai import _oai_api
+
+
+def test_typing():
+    assert _is_optional_type(Optional[int]) == (True, int)
+    assert _is_optional_type(Union[str, None]) == (True, str)
+    if sys.version_info >= (3, 10):
+        assert _is_optional_type(float | None) == (True, float)
+    assert _is_optional_type(Union[str, int]) == (False, None)
+
+
+@pytest.mark.parametrize(
+    ("arg_typ", "oai_type"),
+    [
+        pytest.param(int, "number", id="int"),
+        pytest.param(Optional[int], "number", id="optional[int]"),
+        pytest.param(Union[None, int], "number", id="union[none, int]"),
+        pytest.param(Union[str, None], "string", id="union[str, none]"),
+        pytest.param(List[int], "array", id="list[int]"),
+        pytest.param(Optional[List[int]], "array", id="optional[list[int]]"),
+    ],
+)
+def test_description_building(arg_typ: type, oai_type: str):
+    fi = FunctionInfo(
+        name="foo",
+        description="foo",
+        auto_retry=False,
+        callable=lambda: None,
+        arguments={
+            "arg": FunctionArgInfo(
+                name="foo",
+                description="foo",
+                type=arg_typ,
+                default=_empty,
+                choices=(),
+            ),
+        },
+    )
+    assert (
+        _oai_api.build_oai_function_description(fi)["function"]["parameters"][
+            "properties"
+        ]["foo"]["type"]
+        == oai_type
+    )
diff --git a/tests/test_create_func.py b/tests/test_create_func.py
new file mode 100644
index 000000000..97583fb36
--- /dev/null
+++ b/tests/test_create_func.py
@@ -0,0 +1,209 @@
+import enum
+from inspect import _empty
+from typing import Annotated, List, Optional
+
+import pytest
+from livekit.agents import llm
+from livekit.plugins.openai import _oai_api
+
+
+def test_func_basic():
+    class TestFunctionContext(llm.FunctionContext):
+        @llm.ai_callable(name="test_function", description="A simple test function")
+        def test_fn(
+            self, param: Annotated[str, llm.TypeInfo(description="A string parameter")]
+        ):
+            pass
+
+    fnc_ctx = TestFunctionContext()
+    assert (
+        "test_function" in fnc_ctx.ai_functions
+    ), "Function should be registered in ai_functions"
+
+    fnc_info = fnc_ctx.ai_functions["test_function"]
+    build_info = _oai_api.build_oai_function_description(fnc_info)
+    assert fnc_info.name == build_info["function"]["name"]
+    assert fnc_info.description == build_info["function"]["description"]
+    assert not fnc_info.auto_retry
+    assert "param" in fnc_info.arguments
+    assert "param" in build_info["function"]["parameters"]["properties"]
+    assert "param" in build_info["function"]["parameters"]["required"]
+
+    arg_info = fnc_info.arguments["param"]
+    build_arg_info = build_info["function"]["parameters"]["properties"]["param"]
+
+    assert arg_info.name == "param"
+    assert arg_info.description == "A string parameter"
+    assert arg_info.type is str
+    assert arg_info.default is _empty
+    assert arg_info.choices == ()
+    assert build_arg_info["description"] == arg_info.description
+    assert build_arg_info["type"] == "string"
+
+
+def test_func_duplicate():
+    class TestFunctionContext(llm.FunctionContext):
+        @llm.ai_callable(name="duplicate_function")
+        def fn1(self):
+            pass
+
+        @llm.ai_callable(name="duplicate_function")
+        def fn2(self):
+            pass
+
+    with pytest.raises(
+        ValueError, match="duplicate ai_callable name: duplicate_function"
+    ):
+        TestFunctionContext()
+
+
+def test_func_with_optional_parameter():
+    class TestFunctionContext(llm.FunctionContext):
+        @llm.ai_callable(
+            name="optional_function", description="Function with optional parameter"
+        )
+        def optional_fn(
+            self,
+            param: Annotated[
+                Optional[int], llm.TypeInfo(description="An optional integer parameter")
+            ] = None,
+            param2: Optional[List[str]] = None,
+            param3: str = "A string",
+        ):
+            pass
+
+    fnc_ctx = TestFunctionContext()
+    assert (
+        "optional_function" in fnc_ctx.ai_functions
+    ), "Function should be registered in ai_functions"
+
+    fnc_info = fnc_ctx.ai_functions["optional_function"]
+    build_info = _oai_api.build_oai_function_description(fnc_info)
+    print(build_info)
+    assert fnc_info.name == build_info["function"]["name"]
+    assert fnc_info.description == build_info["function"]["description"]
+    assert "param" in fnc_info.arguments
+    assert "param2" in fnc_info.arguments
+    assert "param3" in fnc_info.arguments
+    assert "param" in build_info["function"]["parameters"]["properties"]
+    assert "param2" in build_info["function"]["parameters"]["properties"]
+    assert "param3" in build_info["function"]["parameters"]["properties"]
+    assert "param" not in build_info["function"]["parameters"]["required"]
+    assert "param2" not in build_info["function"]["parameters"]["required"]
+    assert "param3" not in build_info["function"]["parameters"]["required"]
+
+    # Check 'param'
+    arg_info = fnc_info.arguments["param"]
+    build_arg_info = build_info["function"]["parameters"]["properties"]["param"]
+
+    assert arg_info.name == "param"
+    assert arg_info.description == "An optional integer parameter"
+    assert arg_info.type == Optional[int]
+    assert arg_info.default is None
+    assert arg_info.choices == ()
+    assert build_arg_info["description"] == arg_info.description
+    assert build_arg_info["type"] == "number"
+
+    # Check 'param2'
+    arg_info = fnc_info.arguments["param2"]
+    build_arg_info = build_info["function"]["parameters"]["properties"]["param2"]
+
+    assert arg_info.name == "param2"
+    assert arg_info.description == ""
+    assert arg_info.type == Optional[List[str]]
+    assert arg_info.default is None
+    assert arg_info.choices == ()
+    assert build_arg_info["type"] == "array"
+    assert build_arg_info["items"]["type"] == "string"
+
+    # check 'param3'
+    arg_info = fnc_info.arguments["param3"]
+    build_arg_info = build_info["function"]["parameters"]["properties"]["param3"]
+
+    assert arg_info.name == "param3"
+    assert arg_info.description == ""
+    assert arg_info.type is str
+    assert arg_info.default == "A string"
+    assert arg_info.choices == ()
+    assert build_arg_info["type"] == "string"
+
+
+def test_func_with_list_parameter():
+    class TestFunctionContext(llm.FunctionContext):
+        @llm.ai_callable(
+            name="list_function", description="Function with list parameter"
+        )
+        def list_fn(
+            self,
+            items: Annotated[List[str], llm.TypeInfo(description="A list of strings")],
+        ):
+            pass
+
+    fnc_ctx = TestFunctionContext()
+    assert (
+        "list_function" in fnc_ctx.ai_functions
+    ), "Function should be registered in ai_functions"
+
+    fnc_info = fnc_ctx.ai_functions["list_function"]
+    build_info = _oai_api.build_oai_function_description(fnc_info)
+    assert fnc_info.name == build_info["function"]["name"]
+    assert fnc_info.description == build_info["function"]["description"]
+    assert not fnc_info.auto_retry
+    assert "items" in fnc_info.arguments
+    assert "items" in build_info["function"]["parameters"]["properties"]
+    assert "items" in build_info["function"]["parameters"]["required"]
+
+    arg_info = fnc_info.arguments["items"]
+    build_arg_info = build_info["function"]["parameters"]["properties"]["items"]
+
+    assert arg_info.name == "items"
+    assert arg_info.description == "A list of strings"
+    assert arg_info.type is List[str]
+    assert arg_info.default is _empty
+    assert arg_info.choices == ()
+    assert build_arg_info["description"] == arg_info.description
+    assert build_arg_info["type"] == "array"
+    assert build_arg_info["items"]["type"] == "string"
+
+
+def test_func_with_enum_parameter():
+    class Status(enum.Enum):
+        ACTIVE = "active"
+        INACTIVE = "inactive"
+        PENDING = "pending"
+
+    class TestFunctionContext(llm.FunctionContext):
+        @llm.ai_callable(
+            name="enum_function", description="Function with enum parameter"
+        )
+        def enum_fn(
+            self,
+            status: Annotated[Status, llm.TypeInfo(description="Status of the entity")],
+        ):
+            pass
+
+    fnc_ctx = TestFunctionContext()
+    assert (
+        "enum_function" in fnc_ctx.ai_functions
+    ), "Function should be registered in ai_functions"
+
+    fnc_info = fnc_ctx.ai_functions["enum_function"]
+    build_info = _oai_api.build_oai_function_description(fnc_info)
+    assert fnc_info.name == build_info["function"]["name"]
+    assert fnc_info.description == build_info["function"]["description"]
+    assert not fnc_info.auto_retry
+    assert "status" in fnc_info.arguments
+    assert "status" in build_info["function"]["parameters"]["properties"]
+    assert "status" in build_info["function"]["parameters"]["required"]
+
+    arg_info = fnc_info.arguments["status"]
+    build_arg_info = build_info["function"]["parameters"]["properties"]["status"]
+
+    assert arg_info.name == "status"
+    assert arg_info.description == "Status of the entity"
+    assert arg_info.type is str  # Enum values are converted to their underlying type
+    assert arg_info.default is _empty
+    assert arg_info.choices == ("active", "inactive", "pending")
+    assert build_arg_info["description"] == arg_info.description
+    assert build_arg_info["type"] == "string"
+    assert build_arg_info["enum"] == arg_info.choices

From c6e9fa87e72fd08cfc76b5efabd06492ba609b51 Mon Sep 17 00:00:00 2001
From: David Zhao
Date: Tue, 17 Dec 2024 08:35:22 -0800
Subject: [PATCH 05/46] fix azure stt language autodetection (#1246)

---
 .changeset/twenty-dragons-shave.md | 5 +++++
 .../livekit/plugins/azure/stt.py | 2 +-
 tests/test_decoder.py | 11 ++++++++++-
 3 files changed, 16 insertions(+), 2 deletions(-)
 create mode 100644 .changeset/twenty-dragons-shave.md

diff --git a/.changeset/twenty-dragons-shave.md b/.changeset/twenty-dragons-shave.md
new file mode 100644
index 000000000..ceaa8890c
--- /dev/null
+++ b/.changeset/twenty-dragons-shave.md
@@ -0,0 +1,5 @@
+---
+"livekit-plugins-azure": patch
+---
+
+fix azure stt language autodetection
diff --git a/livekit-plugins/livekit-plugins-azure/livekit/plugins/azure/stt.py b/livekit-plugins/livekit-plugins-azure/livekit/plugins/azure/stt.py
index d705a7f2c..309cc9c5c 100644
--- a/livekit-plugins/livekit-plugins-azure/livekit/plugins/azure/stt.py
+++ b/livekit-plugins/livekit-plugins-azure/livekit/plugins/azure/stt.py
@@ -330,7 +330,7 @@ def _create_speech_recognizer(
     )

     auto_detect_source_language_config = None
-    if config.languages and len(config.languages) > 1:
+    if config.languages and len(config.languages) >= 1:
         auto_detect_source_language_config = (
             speechsdk.languageconfig.AutoDetectSourceLanguageConfig(
                 languages=config.languages
diff --git a/tests/test_decoder.py b/tests/test_decoder.py
index c5ecacce8..10b5b521d 100644
--- a/tests/test_decoder.py
+++ b/tests/test_decoder.py
@@ -90,29 +90,37 @@ def reader():


 def test_stream_buffer_large_chunks():
+    import hashlib
+
     buffer = StreamBuffer()
-    large_chunk = b"x" * 1024 * 1024  # 1MB chunk
+    large_chunk = os.urandom(1024 * 1024)  # 1MB of random bytes
     num_chunks = 5
     total_size = 0
     write_completed = threading.Event()
+    input_hasher = hashlib.sha256()

     def writer():
         nonlocal total_size
         for _ in range(num_chunks):
             buffer.write(large_chunk)
             total_size += len(large_chunk)
+            input_hasher.update(large_chunk)
         buffer.end_input()
         write_completed.set()

     received_size = 0
+    output_hasher = hashlib.sha256()

     def reader():
         nonlocal received_size
+        # allow writer to start first
+        time.sleep(1)
         while True:
             chunk = buffer.read(8192)  # Read in 8KB chunks
             if not chunk:
                 break
             received_size += len(chunk)
+            output_hasher.update(chunk)

     # Run writer and reader in separate threads
     with ThreadPoolExecutor(max_workers=2) as executor:
@@ -125,6 +133,7 @@ def reader():

     assert received_size == total_size
     assert total_size == num_chunks * len(large_chunk)
+    assert input_hasher.hexdigest() == output_hasher.hexdigest()


 def test_stream_buffer_early_close():

From 8bc8d14b3650844a14a906184e085bb316ceac49 Mon Sep 17 00:00:00 2001
From: David Zhao
Date: Tue, 17 Dec 2024 08:35:38 -0800
Subject: [PATCH 06/46] Include instructions on enabling Google APIs (#1243)

---
 examples/voice-pipeline-agent/gemini_voice_agent.py | 6 +++++-
 livekit-plugins/livekit-plugins-google/README.md | 5 +++++
 2 files changed, 10 insertions(+), 1 deletion(-)

diff --git a/examples/voice-pipeline-agent/gemini_voice_agent.py b/examples/voice-pipeline-agent/gemini_voice_agent.py
index 5b3d62171..bb3641c6b 100644
--- a/examples/voice-pipeline-agent/gemini_voice_agent.py
+++ b/examples/voice-pipeline-agent/gemini_voice_agent.py
@@ -27,7 +27,11 @@ def prewarm(proc: JobProcess):
 # 2. save your service account credentials and set the following environments:
 #    * GOOGLE_APPLICATION_CREDENTIALS to the path of the service account key file
 #    * GOOGLE_CLOUD_PROJECT to your Google Cloud project ID
-#
+# 3. ensure the following services are enabled on your Google Cloud project:
+#    * Vertex AI
+#    * Cloud Speech-to-Text API
+#    * Cloud Text-to-Speech API
+
 # Read more about authentication with Google: https://cloud.google.com/docs/authentication/application-default-credentials
diff --git a/livekit-plugins/livekit-plugins-google/README.md b/livekit-plugins/livekit-plugins-google/README.md
index b0fffb41e..383fe1a62 100644
--- a/livekit-plugins/livekit-plugins-google/README.md
+++ b/livekit-plugins/livekit-plugins-google/README.md
@@ -11,3 +11,8 @@ pip install livekit-plugins-google
 ## Pre-requisites

 For credentials, you'll need a Google Cloud account and obtain the correct credentials. Credentials can be passed directly or via Application Default Credentials as specified in [How Application Default Credentials works](https://cloud.google.com/docs/authentication/application-default-credentials).
+
+To use the STT and TTS APIs, you'll need to enable the respective services for your Google Cloud project.
+
+- Cloud Speech-to-Text API
+- Cloud Text-to-Speech API

From c7efb63ff4f83da3fd5d0fde8207410ad977696b Mon Sep 17 00:00:00 2001
From: Wills Manley <48636156+willsmanley@users.noreply.github.com>
Date: Tue, 17 Dec 2024 17:59:21 -0600
Subject: [PATCH 07/46] added cached_token_details to multimodalllmmetrics (#1248)

---
 livekit-agents/livekit/agents/metrics/base.py | 6 ++++++
 .../livekit/plugins/openai/realtime/realtime_model.py | 4 ++++
 2 files changed, 10 insertions(+)

diff --git a/livekit-agents/livekit/agents/metrics/base.py b/livekit-agents/livekit/agents/metrics/base.py
index 78d09e4f2..d524b02b8 100644
--- a/livekit-agents/livekit/agents/metrics/base.py
+++ b/livekit-agents/livekit/agents/metrics/base.py
@@ -108,11 +108,17 @@ class MultimodalLLMError(Error):

 @dataclass
 class MultimodalLLMMetrics(LLMMetrics):
+    @dataclass
+    class CachedTokenDetails:
+        text_tokens: int
+        audio_tokens: int
+
     @dataclass
     class InputTokenDetails:
         cached_tokens: int
         text_tokens: int
         audio_tokens: int
+        cached_tokens_details: MultimodalLLMMetrics.CachedTokenDetails

     @dataclass
     class OutputTokenDetails:
diff --git a/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/realtime/realtime_model.py b/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/realtime/realtime_model.py
index 06e6930a8..c7d2a5d5f 100644
--- a/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/realtime/realtime_model.py
+++ b/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/realtime/realtime_model.py
@@ -1568,6 +1568,10 @@ def _handle_response_done(self, response_done: api_proto.ServerEvent.ResponseDon
                 audio_tokens=usage.get("input_token_details", {}).get(
                     "audio_tokens", 0
                 ),
+                cached_tokens_details=MultimodalLLMMetrics.CachedTokenDetails(
+                    text_tokens=usage.get("input_token_details", {}).get("cached_tokens_details", {}).get("text_tokens", 0),
+                    audio_tokens=usage.get("input_token_details", {}).get("cached_tokens_details", {}).get("audio_tokens", 0),
+                ),
             ),
             output_token_details=MultimodalLLMMetrics.OutputTokenDetails(
                 text_tokens=usage.get("output_token_details", {}).get("text_tokens", 0),
From 4bce8dbd6628e7d30f50750287e6ad81a5a84af3 Mon Sep 17 00:00:00 2001
From: David Zhao
Date: Tue, 17 Dec 2024 17:01:52 -0800
Subject: [PATCH 08/46] updated default realtime model to gpt-4o-realtime-preview-2024-12-17 (#1250)

---
 .changeset/thin-carpets-thank.md | 5 +++
 .github/workflows/tests.yml | 2 +-
 .../function_calling_weather.py | 34 +++++++++++--------
 .../plugins/openai/realtime/api_proto.py | 16 +++++++++
 .../plugins/openai/realtime/realtime_model.py | 20 ++++++-----
 5 files changed, 54 insertions(+), 23 deletions(-)
 create mode 100644 .changeset/thin-carpets-thank.md

diff --git a/.changeset/thin-carpets-thank.md b/.changeset/thin-carpets-thank.md
new file mode 100644
index 000000000..809ac6fa5
--- /dev/null
+++ b/.changeset/thin-carpets-thank.md
@@ -0,0 +1,5 @@
+---
+"livekit-plugins-openai": patch
+---
+
+update default realtime model to gpt-4o-realtime-preview-2024-12-17
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 2da4754b0..d2a26cbf2 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -117,7 +117,7 @@

           case "${{ matrix.test_group }}" in
             base)
-              test_files="test_aio.py test_tokenizer.py test_vad.py test_ipc.py test_tts_fallback.py test_stt_fallback.py test_message_change.py test_decoder.py test_build_func_desc.py test_create_func.py"
+              test_files="test_aio.py test_tokenizer.py test_vad.py test_ipc.py test_tts_fallback.py test_stt_fallback.py test_message_change.py test_build_func_desc.py test_create_func.py"
               ;;
             llm)
               test_files="test_llm.py"
diff --git a/examples/voice-pipeline-agent/function_calling_weather.py b/examples/voice-pipeline-agent/function_calling_weather.py
index 4e1784ad2..e8add68d0 100644
--- a/examples/voice-pipeline-agent/function_calling_weather.py
+++ b/examples/voice-pipeline-agent/function_calling_weather.py
@@ -1,5 +1,6 @@
 import logging
 import random
+import urllib
 from typing import Annotated

 import aiohttp
@@ -34,14 +35,11 @@ async def get_weather(
     ],
 ):
     """Called when the user asks about the weather. This function will return the weather for the given location."""
-
-    # Example of a filler message while waiting for the function call to complete.
-    # NOTE: This message illustrates how the agent can engage users by using the `say()` method
-    # while awaiting the completion of the function call. To create a more dynamic and engaging
-    # interaction, consider varying the responses based on context or user input.
+    # When a function call is running, there are a couple of options to inform the user
+    # that it might take a while:
+    # Option 1: you can use .say to send a filler message immediately after the call is triggered
+    # Option 2: you can prompt the agent to return a text response when it's making a function call
     call_ctx = AgentCallContext.get_current()
-    # message = f"Let me check the weather in {location} for you."
-    message = f"Here is the weather in {location}: "
     filler_messages = [
         "Let me check the weather in {location} for you.",
         "Let me see what the weather is like in {location} right now.",
@@ -54,22 +52,25 @@ async def get_weather(
     # of the chat context of the function call for answer synthesis
     speech_handle = await call_ctx.agent.say(message, add_to_chat_ctx=True)  # noqa: F841

-    # To wait for the speech to finish
-    # await speech_handle.join()
-
-    logger.info(f"getting weather for {location}")
-    url = f"https://wttr.in/{location}?format=%C+%t"
+    logger.info(f"getting weather for {location}")
+    url = f"https://wttr.in/{urllib.parse.quote(location)}?format=%C+%t"
+    weather_data = ""
     async with aiohttp.ClientSession() as session:
         async with session.get(url) as response:
             if response.status == 200:
-                weather_data = await response.text()
                 # response from the function call is returned to the LLM
-                return f"The weather in {location} is {weather_data}."
+                weather_data = (
+                    f"The weather in {location} is {await response.text()}."
+                )
             else:
                 raise Exception(
                     f"Failed to get weather data, status code: {response.status}"
                 )

+    # To wait for the speech to finish before giving results of the function call
+    await speech_handle.join()
+    return weather_data


 def prewarm_process(proc: JobProcess):
     # preload silero VAD in memory to speed up session start
@@ -82,7 +83,11 @@ async def entrypoint(ctx: JobContext):
     initial_chat_ctx = llm.ChatContext().append(
         text=(
             "You are a weather assistant created by LiveKit. Your interface with users will be voice. "
-            "You will provide weather information for a given location."
+            "You will provide weather information for a given location. "
+            # when using option 1, you can suppress the agent's own filler response with the prompt
+            "do not say anything while waiting for the function call to complete."
+            # uncomment this to use option 2
+            # "when performing function calls, let user know that you are checking the weather."
         ),
         role="system",
     )
@@ -95,6 +100,7 @@
         fnc_ctx=fnc_ctx,
         chat_ctx=initial_chat_ctx,
    )
+
     # Start the assistant. This will automatically publish a microphone track and listen to the participant.
     agent.start(ctx.room, participant)
     await agent.say(
diff --git a/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/realtime/api_proto.py b/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/realtime/api_proto.py
index 506add5ef..2bf9778d3 100644
--- a/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/realtime/api_proto.py
+++ b/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/realtime/api_proto.py
@@ -27,6 +27,16 @@ class FunctionToolChoice(TypedDict):
     "in_progress", "completed", "incomplete", "cancelled", "failed"
 ]

+# https://platform.openai.com/docs/models/gp#gpt-4o-realtime
+OpenAIModel = Literal[
+    "gpt-4o-realtime-preview",
+    "gpt-4o-realtime-preview-2024-10-01",
+    "gpt-4o-realtime-preview-2024-12-17",
+    "gpt-4o-mini-realtime-preview",
+    "gpt-4o-mini-realtime-preview-2024-12-17",
+]
+DefaultOpenAIModel = "gpt-4o-realtime-preview"
+

 class TextContent(TypedDict):
     type: Literal["text"]
@@ -145,6 +155,12 @@ class InputTokenDetails(TypedDict):
     cached_tokens: int
     text_tokens: int
     audio_tokens: int
+    cached_tokens_details: CachedTokenDetails
+
+
+class CachedTokenDetails(TypedDict):
+    text_tokens: int
+    audio_tokens: int


 class OutputTokenDetails(TypedDict):
diff --git a/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/realtime/realtime_model.py b/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/realtime/realtime_model.py
index c7d2a5d5f..83b2cbfa6 100644
--- a/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/realtime/realtime_model.py
+++ b/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/realtime/realtime_model.py
@@ -152,7 +152,7 @@ class RealtimeError:

 @dataclass
 class _ModelOptions:
-    model: str | None
+    model: api_proto.OpenAIModel | str
     modalities: list[api_proto.Modality]
     instructions: str
     voice: api_proto.Voice
     input_audio_format: api_proto.AudioFormat
@@ -182,6 +182,7 @@ class _ContentPtr(TypedDict):
     prefix_padding_ms=300,
     silence_duration_ms=500,
 )
+
 DEFAULT_INPUT_AUDIO_TRANSCRIPTION = InputTranscriptionOptions(model="whisper-1")
@@ -192,7 +193,7 @@
         *,
         instructions: str = "",
         modalities: list[api_proto.Modality] = ["text", "audio"],
-        model: str = "gpt-4o-realtime-preview-2024-10-01",
+        model: api_proto.OpenAIModel | str = api_proto.DefaultOpenAIModel,
         voice: api_proto.Voice = "alloy",
         input_audio_format: api_proto.AudioFormat = "pcm16",
         output_audio_format: api_proto.AudioFormat = "pcm16",
@@ -235,7 +236,7 @@
         *,
         instructions: str = "",
         modalities: list[api_proto.Modality] = ["text", "audio"],
-        model: str | None = "gpt-4o-realtime-preview-2024-10-01",
+        model: api_proto.OpenAIModel | str = api_proto.DefaultOpenAIModel,
         voice: api_proto.Voice = "alloy",
         input_audio_format: api_proto.AudioFormat = "pcm16",
         output_audio_format: api_proto.AudioFormat = "pcm16",
@@ -1548,6 +1549,7 @@ def _handle_response_done(self, response_done: api_proto.ServerEvent.ResponseDon
         duration = time.time() - response._created_timestamp

         usage = response.usage or {}  # type: ignore
+        input_token_details = usage.get("input_token_details", {})
         metrics = MultimodalLLMMetrics(
             timestamp=response._created_timestamp,
             request_id=response.id,
@@ -1561,16 +1563,18 @@ def _handle_response_done(self, response_done: api_proto.ServerEvent.ResponseDon
             tokens_per_second=usage.get("output_tokens", 0) / duration,
             error=metrics_error,
             input_token_details=MultimodalLLMMetrics.InputTokenDetails(
-                cached_tokens=usage.get("input_token_details", {}).get(
-                    "cached_tokens", 0
-                ),
+                cached_tokens=input_token_details.get("cached_tokens", 0),
                 text_tokens=usage.get("input_token_details", {}).get("text_tokens", 0),
                 audio_tokens=usage.get("input_token_details", {}).get(
                     "audio_tokens", 0
                 ),
                 cached_tokens_details=MultimodalLLMMetrics.CachedTokenDetails(
-                    text_tokens=usage.get("input_token_details", {}).get("cached_tokens_details", {}).get("text_tokens", 0),
-                    audio_tokens=usage.get("input_token_details", {}).get("cached_tokens_details", {}).get("audio_tokens", 0),
+                    text_tokens=input_token_details.get(
+                        "cached_tokens_details", {}
+                    ).get("text_tokens", 0),
+                    audio_tokens=input_token_details.get(
+                        "cached_tokens_details", {}
+                    ).get("audio_tokens", 0),
                 ),
             ),
             output_token_details=MultimodalLLMMetrics.OutputTokenDetails(

From e4c3454acca1e0494a5eee04ba6f7425f685d406 Mon Sep 17 00:00:00 2001
From: Long Chen
Date: Wed, 18 Dec 2024 15:08:10 +0800
Subject: [PATCH 09/46] fix: filter out empty message for set chat ctx in realtime model (#1245)

---
 .changeset/grumpy-dancers-develop.md | 5 +++++
 .../plugins/openai/realtime/realtime_model.py | 12 +++++++-----
 2 files changed, 12 insertions(+), 5 deletions(-)
 create mode 100644 .changeset/grumpy-dancers-develop.md

diff --git a/.changeset/grumpy-dancers-develop.md b/.changeset/grumpy-dancers-develop.md
new file mode 100644
index 000000000..c5563f597
--- /dev/null
+++ b/.changeset/grumpy-dancers-develop.md
@@ -0,0 +1,5 @@
+---
+"livekit-plugins-openai": patch
+---
+
+filter out empty message for set chat ctx in realtime model
diff --git a/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/realtime/realtime_model.py b/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/realtime/realtime_model.py
index 83b2cbfa6..c99294d1a 100644
--- a/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/realtime/realtime_model.py
+++ b/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/realtime/realtime_model.py
@@ -507,10 +507,6 @@ def create(
             message_content = message.content
             tool_call_id = message.tool_call_id
-            if not tool_call_id and message_content is None:
-                # not a function call while the message content is None
-                fut.set_result(False)
-                return fut
             event: api_proto.ClientEvent.ConversationItemCreate | None = None
             if tool_call_id:
                 if message.role == "tool":
@@ -952,8 +948,14 @@ async def set_chat_ctx(self, new_ctx: llm.ChatContext) -> None:
         """
         original_ctx = self._remote_conversation_items.to_chat_context()

+        # filter out messages that are not function calls and whose content is None
+        filtered_messages = [
+            msg
+            for msg in new_ctx.messages
+            if msg.tool_call_id or msg.content is not None
+        ]
         changes = utils._compute_changes(
-            original_ctx.messages, new_ctx.messages, key_fnc=lambda x: x.id
+            original_ctx.messages, filtered_messages, key_fnc=lambda x: x.id
         )
         logger.debug(
             "sync chat context",

From 4d7a04530b3337d95a7883af254984105c86fb2a Mon Sep 17 00:00:00 2001
From: Long Chen
Date: Wed, 18 Dec 2024 18:29:36 +0800
Subject: [PATCH 10/46] fix: add session_updated event for realtime model (#1253)

---
 .changeset/real-squids-warn.md | 5 ++
 .../plugins/openai/realtime/__init__.py | 2 +
 .../plugins/openai/realtime/realtime_model.py | 72 +++++++++++++++----
 3 files changed, 67 insertions(+), 12 deletions(-)
 create mode 100644 .changeset/real-squids-warn.md

diff --git a/.changeset/real-squids-warn.md b/.changeset/real-squids-warn.md
new file mode 100644
index 000000000..43c5d096d
--- /dev/null
+++ b/.changeset/real-squids-warn.md
@@ -0,0 +1,5 @@
+---
+"livekit-plugins-openai": patch
+---
+
+add session_updated event for RealtimeSession
diff --git a/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/realtime/__init__.py b/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/realtime/__init__.py
index ac9b866d6..471deef37 100644
--- a/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/realtime/__init__.py
+++ b/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/realtime/__init__.py
@@ -11,6 +11,7 @@
     RealtimeOutput,
     RealtimeResponse,
     RealtimeSession,
+    RealtimeSessionOptions,
     RealtimeToolCall,
     ServerVadOptions,
 )
@@ -25,6 +26,7 @@
     "RealtimeSession",
     "RealtimeModel",
     "RealtimeError",
+    "RealtimeSessionOptions",
    "ServerVadOptions",
     "InputTranscriptionOptions",
     "ConversationItemCreated",
diff --git a/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/realtime/realtime_model.py b/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/realtime/realtime_model.py
index c99294d1a..04bf14ac5 100644
--- a/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/realtime/realtime_model.py
+++ b/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/realtime/realtime_model.py
@@ -21,6 +21,7 @@

 EventTypes = Literal[
     "start_session",
+    "session_updated",
     "error",
     "input_speech_started",
     "input_speech_stopped",
@@ -151,18 +152,22 @@ class RealtimeError:

 @dataclass
-class _ModelOptions:
+class RealtimeSessionOptions:
     model: api_proto.OpenAIModel | str
     modalities: list[api_proto.Modality]
     instructions: str
     voice: api_proto.Voice
     input_audio_format: api_proto.AudioFormat
     output_audio_format: api_proto.AudioFormat
-    input_audio_transcription: InputTranscriptionOptions
-    turn_detection: ServerVadOptions
+    input_audio_transcription: InputTranscriptionOptions | None
+    turn_detection: ServerVadOptions | None
     tool_choice: api_proto.ToolChoice
     temperature: float
     max_response_output_tokens: int | Literal["inf"]
+
+
+@dataclass
+class _ModelOptions(RealtimeSessionOptions):
     api_key: str | None
     base_url: str
     entra_token: str | None
@@ -897,12 +902,19 @@ def session_update(
                 function_data["type"] = "function"
                 tools.append(function_data)

-        server_vad_opts: api_proto.ServerVad = {
-            "type": "server_vad",
-            "threshold": self._opts.turn_detection.threshold,
-            "prefix_padding_ms": self._opts.turn_detection.prefix_padding_ms,
-            "silence_duration_ms": self._opts.turn_detection.silence_duration_ms,
-        }
+        server_vad_opts: api_proto.ServerVad | None = None
+        if self._opts.turn_detection is not None:
+            server_vad_opts = {
+                "type": "server_vad",
+                "threshold": self._opts.turn_detection.threshold,
+                "prefix_padding_ms": self._opts.turn_detection.prefix_padding_ms,
+                "silence_duration_ms": self._opts.turn_detection.silence_duration_ms,
+            }
+        input_audio_transcription_opts: api_proto.InputAudioTranscription | None = None
+        if self._opts.input_audio_transcription is not None:
+            input_audio_transcription_opts = {
+                "model": self._opts.input_audio_transcription.model,
+            }

         session_data: api_proto.ClientEvent.SessionUpdateData = {
             "modalities": self._opts.modalities,
             "instructions": self._opts.instructions,
             "voice": self._opts.voice,
             "input_audio_format": self._opts.input_audio_format,
             "output_audio_format": self._opts.output_audio_format,
             "input_audio_transcription": {
                 "model":
self._opts.input_audio_transcription.model, - }, + "input_audio_transcription": input_audio_transcription_opts, "turn_detection": server_vad_opts, "tools": tools, "tool_choice": self._opts.tool_choice, @@ -1105,6 +1115,8 @@ async def _recv_task(): if event == "session.created": self._handle_session_created(data) + if event == "session.updated": + self._handle_session_updated(data) elif event == "error": self._handle_error(data) elif event == "input_audio_buffer.speech_started": @@ -1173,6 +1185,42 @@ def _handle_session_created( ): self._session_id = session_created["session"]["id"] + def _handle_session_updated( + self, session_updated: api_proto.ServerEvent.SessionUpdated + ): + session = session_updated["session"] + if session["turn_detection"] is None: + turn_detection = None + else: + turn_detection = ServerVadOptions( + threshold=session["turn_detection"]["threshold"], + prefix_padding_ms=session["turn_detection"]["prefix_padding_ms"], + silence_duration_ms=session["turn_detection"]["silence_duration_ms"], + ) + if session["input_audio_transcription"] is None: + input_audio_transcription = None + else: + input_audio_transcription = InputTranscriptionOptions( + model=session["input_audio_transcription"]["model"], + ) + + self.emit( + "session_updated", + RealtimeSessionOptions( + model=session["model"], + modalities=session["modalities"], + instructions=session["instructions"], + voice=session["voice"], + input_audio_format=session["input_audio_format"], + output_audio_format=session["output_audio_format"], + input_audio_transcription=input_audio_transcription, + turn_detection=turn_detection, + tool_choice=session["tool_choice"], + temperature=session["temperature"], + max_response_output_tokens=session["max_response_output_tokens"], + ), + ) + def _handle_error(self, error: api_proto.ServerEvent.Error): logger.error( "OpenAI S2S error %s", From 7e8c08986bc4966d421532c3022d26e23b2b4445 Mon Sep 17 00:00:00 2001 From: Ben Cherry Date: Wed, 18 Dec 2024 09:57:07 -0800 Subject: [PATCH 11/46] Add JPEG quality param to image encoder (#1249) --- .changeset/gorgeous-days-retire.md | 5 +++ .../livekit/agents/llm/chat_context.py | 19 ++++++++-- .../livekit/agents/utils/images/image.py | 37 +++++++++++++++---- 3 files changed, 51 insertions(+), 10 deletions(-) create mode 100644 .changeset/gorgeous-days-retire.md diff --git a/.changeset/gorgeous-days-retire.md b/.changeset/gorgeous-days-retire.md new file mode 100644 index 000000000..fa28e85a8 --- /dev/null +++ b/.changeset/gorgeous-days-retire.md @@ -0,0 +1,5 @@ +--- +"livekit-agents": patch +--- + +Add JPEG quality param to image encoder diff --git a/livekit-agents/livekit/agents/llm/chat_context.py b/livekit-agents/livekit/agents/llm/chat_context.py index 07e36d6c0..ccde86bba 100644 --- a/livekit-agents/livekit/agents/llm/chat_context.py +++ b/livekit-agents/livekit/agents/llm/chat_context.py @@ -32,13 +32,26 @@ class ChatImage: You may need to consult your LLM provider's documentation on supported URL types. 
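The `_handle_session_updated` hook added above re-emits the server's acknowledged configuration as a `RealtimeSessionOptions` snapshot, and the constructor now accepts the typed `OpenAIModel` literals. A minimal consumer sketch, assuming a `RealtimeSession` obtained elsewhere in your agent (the handler body and printed fields are illustrative only):

```python
from livekit.plugins.openai import realtime

# any OpenAIModel literal (or a plain string) is accepted for `model`
model = realtime.RealtimeModel(model="gpt-4o-mini-realtime-preview")


def attach_listeners(session: realtime.RealtimeSession) -> None:
    @session.on("session_updated")
    def _on_session_updated(opts: realtime.RealtimeSessionOptions) -> None:
        # turn_detection comes back as None when server VAD is disabled
        print("voice:", opts.voice, "turn detection:", opts.turn_detection)
```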
```python - # With a VideoFrame, which will be automatically converted to a data URL internally + # Pass a VideoFrame directly, which will be automatically converted to a JPEG data URL internally async for event in rtc.VideoStream(video_track): chat_image = ChatImage(image=event.frame) # this instance is now available for your ChatContext - # With a data URL - chat_image = ChatImage(image=f"data:image/jpeg;base64,{base64_encoded_image}") + # Encode your VideoFrame yourself for more control, and pass the result as a data URL (see EncodeOptions for more details) + from livekit.agents.utils.images import encode, EncodeOptions, ResizeOptions + + image_bytes = encode( + event.frame, + EncodeOptions( + format="PNG", + resize_options=ResizeOptions( + width=512, height=512, strategy="scale_aspect_fit" + ), + ), + ) + chat_image = ChatImage( + image=f"data:image/png;base64,{base64.b64encode(image_bytes).decode('utf-8')}" + ) # With an external URL chat_image = ChatImage(image="https://example.com/image.jpg") diff --git a/livekit-agents/livekit/agents/utils/images/image.py b/livekit-agents/livekit/agents/utils/images/image.py index bcc0a5b5f..dd9aac739 100644 --- a/livekit-agents/livekit/agents/utils/images/image.py +++ b/livekit-agents/livekit/agents/utils/images/image.py @@ -25,26 +25,42 @@ @dataclass class EncodeOptions: + """Options for encoding rtc.VideoFrame to portable image formats.""" + format: Literal["JPEG", "PNG"] = "JPEG" + """The format to encode the image.""" + resize_options: Optional["ResizeOptions"] = None + """Options for resizing the image.""" + + quality: Optional[int] = 75 + """Image compression quality, 0-100. Only applies to JPEG.""" @dataclass class ResizeOptions: + """Options for resizing rtc.VideoFrame as part of encoding to a portable image format.""" + width: int + """The desired resize width (in pixels)""" + height: int + """The desired height to resize the image to.""" + strategy: Literal[ - # Fit the image into the provided dimensions, with letterboxing "center_aspect_fit", - # Fill the provided dimensions, with cropping "center_aspect_cover", - # Fit the image into the provided dimensions, preserving its original aspect ratio "scale_aspect_fit", - # Fill the provided dimensions, preserving its original aspect ratio (image will be larger than the provided dimensions) "scale_aspect_cover", - # Precisely resize the image to the provided dimensions "skew", ] + """The strategy to use when resizing the image: + - center_aspect_fit: Fit the image into the provided dimensions, with letterboxing + - center_aspect_cover: Fill the provided dimensions, with cropping + - scale_aspect_fit: Fit the image into the provided dimensions, preserving its original aspect ratio + - scale_aspect_cover: Fill the provided dimensions, preserving its original aspect ratio (image will be larger than the provided dimensions) + - skew: Precisely resize the image to the provided dimensions + """ def import_pil(): @@ -57,12 +73,19 @@ def import_pil(): ) -def encode(frame: rtc.VideoFrame, options: EncodeOptions): +def encode(frame: rtc.VideoFrame, options: EncodeOptions) -> bytes: + """Encode a rtc.VideoFrame to a portable image format (JPEG or PNG). + + See EncodeOptions for more details.
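A usage sketch for the new `quality` parameter (the `frame` variable and the values chosen are illustrative): lowering `quality` shrinks JPEG output at some cost in fidelity, and PNG ignores it entirely.

```python
from livekit.agents.utils.images import EncodeOptions, ResizeOptions, encode

# assumes `frame` is an rtc.VideoFrame, e.g. taken from an rtc.VideoStream
jpeg_bytes = encode(
    frame,
    EncodeOptions(
        format="JPEG",
        quality=50,  # 0-100; applies to JPEG only
        resize_options=ResizeOptions(
            width=1024, height=1024, strategy="scale_aspect_fit"
        ),
    ),
)
```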
+ """ import_pil() img = _image_from_frame(frame) resized = _resize_image(img, options) buffer = io.BytesIO() - resized.save(buffer, options.format) + kwargs = {} + if options.format == "JPEG" and options.quality is not None: + kwargs["quality"] = options.quality + resized.save(buffer, options.format, **kwargs) buffer.seek(0) return buffer.read() From e32278b9c31f62f56225382ad636b508f5948d52 Mon Sep 17 00:00:00 2001 From: jerad fields Date: Thu, 19 Dec 2024 00:21:28 -0600 Subject: [PATCH 12/46] use plain onnxruntime for turn detector, remove pytorch (#1257) --- .changeset/yellow-kings-hear.md | 5 +++ .../livekit/plugins/turn_detector/__init__.py | 7 ++-- .../livekit/plugins/turn_detector/eou.py | 36 ++++++++++++------- .../livekit-plugins-turn-detector/setup.py | 5 ++- 4 files changed, 33 insertions(+), 20 deletions(-) create mode 100644 .changeset/yellow-kings-hear.md diff --git a/.changeset/yellow-kings-hear.md b/.changeset/yellow-kings-hear.md new file mode 100644 index 000000000..582956a37 --- /dev/null +++ b/.changeset/yellow-kings-hear.md @@ -0,0 +1,5 @@ +--- +"livekit-plugins-turn-detector": patch +--- + +use onnxruntime for turn detection and remove pytorch dependency diff --git a/livekit-plugins/livekit-plugins-turn-detector/livekit/plugins/turn_detector/__init__.py b/livekit-plugins/livekit-plugins-turn-detector/livekit/plugins/turn_detector/__init__.py index 32692361a..54d7a90af 100644 --- a/livekit-plugins/livekit-plugins-turn-detector/livekit/plugins/turn_detector/__init__.py +++ b/livekit-plugins/livekit-plugins-turn-detector/livekit/plugins/turn_detector/__init__.py @@ -27,15 +27,12 @@ def __init__(self): super().__init__(__name__, __version__, __package__, logger) def download_files(self) -> None: - from optimum.onnxruntime import ORTModelForCausalLM from transformers import AutoTokenizer - from .eou import HG_MODEL + from .eou import HG_MODEL, ONNX_FILENAME, _download_from_hf_hub - ORTModelForCausalLM.from_pretrained( - HG_MODEL, use_cache=False, use_io_binding=False - ) AutoTokenizer.from_pretrained(HG_MODEL) + _download_from_hf_hub(HG_MODEL, ONNX_FILENAME) Plugin.register_plugin(EOUPlugin()) diff --git a/livekit-plugins/livekit-plugins-turn-detector/livekit/plugins/turn_detector/eou.py b/livekit-plugins/livekit-plugins-turn-detector/livekit/plugins/turn_detector/eou.py index afbc09415..acb915ab5 100644 --- a/livekit-plugins/livekit-plugins-turn-detector/livekit/plugins/turn_detector/eou.py +++ b/livekit-plugins/livekit-plugins-turn-detector/livekit/plugins/turn_detector/eou.py @@ -13,10 +13,18 @@ from .log import logger HG_MODEL = "livekit/turn-detector" +ONNX_FILENAME = "model_quantized.onnx" PUNCS = string.punctuation.replace("'", "") MAX_HISTORY = 4 +def _download_from_hf_hub(repo_id, filename, **kwargs): + from huggingface_hub import hf_hub_download + + local_path = hf_hub_download(repo_id=repo_id, filename=filename, **kwargs) + return local_path + + def _softmax(logits: np.ndarray) -> np.ndarray: exp_logits = np.exp(logits - np.max(logits)) return exp_logits / np.sum(exp_logits) @@ -55,17 +63,18 @@ def _format_chat_ctx(self, chat_ctx: dict): return text def initialize(self) -> None: + import onnxruntime as ort from huggingface_hub import errors - from optimum.onnxruntime import ORTModelForCausalLM from transformers import AutoTokenizer try: - self._model = ORTModelForCausalLM.from_pretrained( - HG_MODEL, - local_files_only=True, - use_io_binding=False, - use_cache=False, + local_path_onnx = _download_from_hf_hub( + HG_MODEL, ONNX_FILENAME, local_files_only=True + ) 
+ self._session = ort.InferenceSession( + local_path_onnx, providers=["CPUExecutionProvider"] ) + self._tokenizer = AutoTokenizer.from_pretrained( HG_MODEL, local_files_only=True ) @@ -94,13 +103,17 @@ def run(self, data: bytes) -> bytes | None: inputs = self._tokenizer( text, add_special_tokens=False, - return_tensors="pt", + return_tensors="np", ) - outputs = self._model(**inputs) - logits = outputs.logits[0, -1, :].detach().numpy() - output_probs = _softmax(logits) - eou_probability = output_probs[self._eou_index] + input_dict = {"input_ids": inputs["input_ids"]} + + # Run inference + outputs = self._session.run(["logits"], input_dict) + + logits = outputs[0][0, -1, :] + probs = _softmax(logits) + eou_probability = probs[self._eou_index] end_time = time.perf_counter() @@ -112,7 +125,6 @@ def run(self, data: bytes) -> bytes | None: "duration": round(end_time - start_time, 3), }, ) - return json.dumps({"eou_probability": float(eou_probability)}).encode() diff --git a/livekit-plugins/livekit-plugins-turn-detector/setup.py b/livekit-plugins/livekit-plugins-turn-detector/setup.py index b26b8e536..7b9b4b192 100644 --- a/livekit-plugins/livekit-plugins-turn-detector/setup.py +++ b/livekit-plugins/livekit-plugins-turn-detector/setup.py @@ -51,10 +51,9 @@ python_requires=">=3.9.0", install_requires=[ "livekit-agents>=0.11", - "transformers>=4.46", + "transformers>=4.47.1", "numpy>=1.26", - "torch>=2.5.1", - "optimum[onnxruntime]>=1.23.3", + "onnxruntime>=1.18", ], package_data={"livekit.plugins.turn_detector": ["py.typed"]}, project_urls={ From c57b4ccfb8fe65f146949771ca068cb56a380b28 Mon Sep 17 00:00:00 2001 From: Jayesh Parmar <60539217+jayeshp19@users.noreply.github.com> Date: Fri, 20 Dec 2024 13:42:45 +0530 Subject: [PATCH 13/46] Move `create_ai_function_info` to function_context.py (#1260) --- .changeset/clever-lies-explode.md | 7 ++ livekit-agents/livekit/agents/llm/__init__.py | 2 + .../livekit/agents/llm/function_context.py | 94 ++++++++++++++++++ .../livekit/plugins/anthropic/llm.py | 95 +----------------- .../livekit/plugins/assemblyai/stt.py | 2 + .../livekit/plugins/openai/_oai_api.py | 96 +------------------ .../livekit/plugins/openai/llm.py | 9 +- .../plugins/openai/realtime/realtime_model.py | 5 +- 8 files changed, 116 insertions(+), 194 deletions(-) create mode 100644 .changeset/clever-lies-explode.md diff --git a/.changeset/clever-lies-explode.md b/.changeset/clever-lies-explode.md new file mode 100644 index 000000000..1bf7ea69d --- /dev/null +++ b/.changeset/clever-lies-explode.md @@ -0,0 +1,7 @@ +--- +"livekit-plugins-anthropic": patch +"livekit-plugins-openai": patch +"livekit-agents": patch +--- + +Moved create_ai_function_info to function_context.py for better reusability and reduce repetation diff --git a/livekit-agents/livekit/agents/llm/__init__.py b/livekit-agents/livekit/agents/llm/__init__.py index acc5b0ce6..d3a06f520 100644 --- a/livekit-agents/livekit/agents/llm/__init__.py +++ b/livekit-agents/livekit/agents/llm/__init__.py @@ -15,6 +15,7 @@ FunctionContext, FunctionInfo, TypeInfo, + _create_ai_function_info, ai_callable, ) from .llm import ( @@ -54,4 +55,5 @@ "FallbackAdapter", "AvailabilityChangedEvent", "ToolChoice", + "_create_ai_function_info", ] diff --git a/livekit-agents/livekit/agents/llm/function_context.py b/livekit-agents/livekit/agents/llm/function_context.py index aa4df9842..4470492fe 100644 --- a/livekit-agents/livekit/agents/llm/function_context.py +++ b/livekit-agents/livekit/agents/llm/function_context.py @@ -18,6 +18,7 @@ import enum import 
functools import inspect +import json import types import typing from dataclasses import dataclass @@ -303,3 +304,96 @@ def _is_optional_type(typ) -> Tuple[bool, Any]: return True, non_none_args[0] return False, None + + +def _create_ai_function_info( + fnc_ctx: FunctionContext, + tool_call_id: str, + fnc_name: str, + raw_arguments: str, # JSON string +) -> FunctionCallInfo: + if fnc_name not in fnc_ctx.ai_functions: + raise ValueError(f"AI function {fnc_name} not found") + + parsed_arguments: dict[str, Any] = {} + try: + if raw_arguments: # ignore empty string + parsed_arguments = json.loads(raw_arguments) + except json.JSONDecodeError: + raise ValueError( + f"AI function {fnc_name} received invalid JSON arguments - {raw_arguments}" + ) + + fnc_info = fnc_ctx.ai_functions[fnc_name] + + # Ensure all necessary arguments are present and of the correct type. + sanitized_arguments: dict[str, Any] = {} + for arg_info in fnc_info.arguments.values(): + if arg_info.name not in parsed_arguments: + if arg_info.default is inspect.Parameter.empty: + raise ValueError( + f"AI function {fnc_name} missing required argument {arg_info.name}" + ) + continue + + arg_value = parsed_arguments[arg_info.name] + is_optional, inner_th = _is_optional_type(arg_info.type) + + if typing.get_origin(inner_th) is not None: + if not isinstance(arg_value, list): + raise ValueError( + f"AI function {fnc_name} argument {arg_info.name} should be a list" + ) + + inner_type = typing.get_args(inner_th)[0] + sanitized_value = [ + _sanitize_primitive( + value=v, + expected_type=inner_type, + choices=arg_info.choices, + ) + for v in arg_value + ] + else: + sanitized_value = _sanitize_primitive( + value=arg_value, + expected_type=inner_th, + choices=arg_info.choices, + ) + + sanitized_arguments[arg_info.name] = sanitized_value + + return FunctionCallInfo( + tool_call_id=tool_call_id, + raw_arguments=raw_arguments, + function_info=fnc_info, + arguments=sanitized_arguments, + ) + + +def _sanitize_primitive( + *, value: Any, expected_type: type, choices: tuple | None +) -> Any: + if expected_type is str: + if not isinstance(value, str): + raise ValueError(f"expected str, got {type(value)}") + elif expected_type in (int, float): + if not isinstance(value, (int, float)): + raise ValueError(f"expected number, got {type(value)}") + + if expected_type is int: + if value % 1 != 0: + raise ValueError("expected int, got float") + + value = int(value) + elif expected_type is float: + value = float(value) + + elif expected_type is bool: + if not isinstance(value, bool): + raise ValueError(f"expected bool, got {type(value)}") + + if choices and value not in choices: + raise ValueError(f"invalid value {value}, not in {choices}") + + return value diff --git a/livekit-plugins/livekit-plugins-anthropic/livekit/plugins/anthropic/llm.py b/livekit-plugins/livekit-plugins-anthropic/livekit/plugins/anthropic/llm.py index 9678c9381..69b468d23 100644 --- a/livekit-plugins/livekit-plugins-anthropic/livekit/plugins/anthropic/llm.py +++ b/livekit-plugins/livekit-plugins-anthropic/livekit/plugins/anthropic/llm.py @@ -24,7 +24,6 @@ Awaitable, List, Literal, - Tuple, Union, cast, get_args, @@ -41,7 +40,10 @@ utils, ) from livekit.agents.llm import ToolChoice -from livekit.agents.llm.function_context import _is_optional_type +from livekit.agents.llm.function_context import ( + _create_ai_function_info, + _is_optional_type, +) from livekit.agents.types import DEFAULT_API_CONNECT_OPTIONS, APIConnectOptions import anthropic @@ -487,67 +489,6 @@ def 
_build_anthropic_image_content( ) -def _create_ai_function_info( - fnc_ctx: llm.function_context.FunctionContext, - tool_call_id: str, - fnc_name: str, - raw_arguments: str, # JSON string -) -> llm.function_context.FunctionCallInfo: - if fnc_name not in fnc_ctx.ai_functions: - raise ValueError(f"AI function {fnc_name} not found") - - parsed_arguments: dict[str, Any] = {} - try: - if raw_arguments: # ignore empty string - parsed_arguments = json.loads(raw_arguments) - except json.JSONDecodeError: - raise ValueError( - f"AI function {fnc_name} received invalid JSON arguments - {raw_arguments}" - ) - - fnc_info = fnc_ctx.ai_functions[fnc_name] - - # Ensure all necessary arguments are present and of the correct type. - sanitized_arguments: dict[str, Any] = {} - for arg_info in fnc_info.arguments.values(): - if arg_info.name not in parsed_arguments: - if arg_info.default is inspect.Parameter.empty: - raise ValueError( - f"AI function {fnc_name} missing required argument {arg_info.name}" - ) - continue - - arg_value = parsed_arguments[arg_info.name] - is_optional, inner_th = _is_optional_type(arg_info.type) - - if get_origin(inner_th) is not None: - if not isinstance(arg_value, list): - raise ValueError( - f"AI function {fnc_name} argument {arg_info.name} should be a list" - ) - - inner_type = get_args(inner_th)[0] - sanitized_value = [ - _sanitize_primitive( - value=v, expected_type=inner_type, choices=arg_info.choices - ) - for v in arg_value - ] - else: - sanitized_value = _sanitize_primitive( - value=arg_value, expected_type=inner_th, choices=arg_info.choices - ) - - sanitized_arguments[arg_info.name] = sanitized_value - - return llm.function_context.FunctionCallInfo( - tool_call_id=tool_call_id, - raw_arguments=raw_arguments, - function_info=fnc_info, - arguments=sanitized_arguments, - ) - - def _build_function_description( fnc_info: llm.function_context.FunctionInfo, ) -> anthropic.types.ToolParam: @@ -598,31 +539,3 @@ def type2str(t: type) -> str: "description": fnc_info.description, "input_schema": input_schema, } - - -def _sanitize_primitive( - *, value: Any, expected_type: type, choices: Tuple[Any] | None -) -> Any: - if expected_type is str: - if not isinstance(value, str): - raise ValueError(f"expected str, got {type(value)}") - elif expected_type in (int, float): - if not isinstance(value, (int, float)): - raise ValueError(f"expected number, got {type(value)}") - - if expected_type is int: - if value % 1 != 0: - raise ValueError("expected int, got float") - - value = int(value) - elif expected_type is float: - value = float(value) - - elif expected_type is bool: - if not isinstance(value, bool): - raise ValueError(f"expected bool, got {type(value)}") - - if choices and value not in choices: - raise ValueError(f"invalid value {value}, not in {choices}") - - return value diff --git a/livekit-plugins/livekit-plugins-assemblyai/livekit/plugins/assemblyai/stt.py b/livekit-plugins/livekit-plugins-assemblyai/livekit/plugins/assemblyai/stt.py index a87eaf542..acef65b6a 100644 --- a/livekit-plugins/livekit-plugins-assemblyai/livekit/plugins/assemblyai/stt.py +++ b/livekit-plugins/livekit-plugins-assemblyai/livekit/plugins/assemblyai/stt.py @@ -289,6 +289,8 @@ async def recv_task(ws: aiohttp.ClientWebSocketResponse): except Exception: logger.exception("failed to process AssemblyAI message") + ws: aiohttp.ClientWebSocketResponse | None = None + while True: try: ws = await self._connect_ws() diff --git a/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/_oai_api.py 
b/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/_oai_api.py index 8bf05a19f..8dbc3a33e 100644 --- a/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/_oai_api.py +++ b/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/_oai_api.py @@ -15,79 +15,13 @@ from __future__ import annotations import inspect -import json import typing from typing import Any from livekit.agents.llm import function_context, llm from livekit.agents.llm.function_context import _is_optional_type -__all__ = ["build_oai_function_description", "create_ai_function_info"] - - -def create_ai_function_info( - fnc_ctx: function_context.FunctionContext, - tool_call_id: str, - fnc_name: str, - raw_arguments: str, # JSON string -) -> function_context.FunctionCallInfo: - if fnc_name not in fnc_ctx.ai_functions: - raise ValueError(f"AI function {fnc_name} not found") - - parsed_arguments: dict[str, Any] = {} - try: - if raw_arguments: # ignore empty string - parsed_arguments = json.loads(raw_arguments) - except json.JSONDecodeError: - raise ValueError( - f"AI function {fnc_name} received invalid JSON arguments - {raw_arguments}" - ) - - fnc_info = fnc_ctx.ai_functions[fnc_name] - - # Ensure all necessary arguments are present and of the correct type. - sanitized_arguments: dict[str, Any] = {} - for arg_info in fnc_info.arguments.values(): - if arg_info.name not in parsed_arguments: - if arg_info.default is inspect.Parameter.empty: - raise ValueError( - f"AI function {fnc_name} missing required argument {arg_info.name}" - ) - continue - - arg_value = parsed_arguments[arg_info.name] - is_optional, inner_th = _is_optional_type(arg_info.type) - - if typing.get_origin(inner_th) is not None: - if not isinstance(arg_value, list): - raise ValueError( - f"AI function {fnc_name} argument {arg_info.name} should be a list" - ) - - inner_type = typing.get_args(inner_th)[0] - sanitized_value = [ - _sanitize_primitive( - value=v, - expected_type=inner_type, - choices=arg_info.choices, - ) - for v in arg_value - ] - else: - sanitized_value = _sanitize_primitive( - value=arg_value, - expected_type=inner_th, - choices=arg_info.choices, - ) - - sanitized_arguments[arg_info.name] = sanitized_value - - return function_context.FunctionCallInfo( - tool_call_id=tool_call_id, - raw_arguments=raw_arguments, - function_info=fnc_info, - arguments=sanitized_arguments, - ) +__all__ = ["build_oai_function_description"] def build_oai_function_description( @@ -156,31 +90,3 @@ def type2str(t: type) -> str: }, }, } - - -def _sanitize_primitive( - *, value: Any, expected_type: type, choices: tuple | None -) -> Any: - if expected_type is str: - if not isinstance(value, str): - raise ValueError(f"expected str, got {type(value)}") - elif expected_type in (int, float): - if not isinstance(value, (int, float)): - raise ValueError(f"expected number, got {type(value)}") - - if expected_type is int: - if value % 1 != 0: - raise ValueError("expected int, got float") - - value = int(value) - elif expected_type is float: - value = float(value) - - elif expected_type is bool: - if not isinstance(value, bool): - raise ValueError(f"expected bool, got {type(value)}") - - if choices and value not in choices: - raise ValueError(f"invalid value {value}, not in {choices}") - - return value diff --git a/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/llm.py b/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/llm.py index 7dfbaff24..bcff2cfa9 100644 --- a/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/llm.py 
+++ b/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/llm.py @@ -29,17 +29,14 @@ APITimeoutError, llm, ) -from livekit.agents.llm import ToolChoice +from livekit.agents.llm import ToolChoice, _create_ai_function_info from livekit.agents.types import DEFAULT_API_CONNECT_OPTIONS, APIConnectOptions import openai from openai.types.chat import ChatCompletionChunk, ChatCompletionMessageParam from openai.types.chat.chat_completion_chunk import Choice -from ._oai_api import ( - build_oai_function_description, - create_ai_function_info, -) +from ._oai_api import build_oai_function_description from .log import logger from .models import ( CerebrasChatModels, @@ -840,7 +837,7 @@ def _try_build_function(self, id: str, choice: Choice) -> llm.ChatChunk | None: ) return None - fnc_info = create_ai_function_info( + fnc_info = _create_ai_function_info( self._fnc_ctx, self._tool_call_id, self._fnc_name, self._fnc_raw_arguments ) diff --git a/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/realtime/realtime_model.py b/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/realtime/realtime_model.py index 04bf14ac5..26bc2649b 100644 --- a/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/realtime/realtime_model.py +++ b/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/realtime/realtime_model.py @@ -12,10 +12,11 @@ import aiohttp from livekit import rtc from livekit.agents import llm, utils +from livekit.agents.llm.function_context import _create_ai_function_info from livekit.agents.metrics import MultimodalLLMError, MultimodalLLMMetrics from typing_extensions import TypedDict -from .._oai_api import build_oai_function_description, create_ai_function_info +from .._oai_api import build_oai_function_description from . 
import api_proto, remote_items from .log import logger @@ -1521,7 +1522,7 @@ def _handle_response_output_item_done( item = response_output_done["item"] assert item["type"] == "function_call" - fnc_call_info = create_ai_function_info( + fnc_call_info = _create_ai_function_info( self._fnc_ctx, item["call_id"], item["name"], From 7941263f4919b46646becef71ee00bd42831440b Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Fri, 20 Dec 2024 12:33:35 -0600 Subject: [PATCH 14/46] Version Packages (#1218) Co-authored-by: github-actions[bot] --- .changeset/clever-lies-explode.md | 7 ----- .changeset/empty-sheep-pump.md | 6 ----- .changeset/famous-points-tickle.md | 5 ---- .changeset/fix-imgui-setup.md | 5 ---- .changeset/gorgeous-days-retire.md | 5 ---- .changeset/great-lizards-pump.md | 9 ------- .changeset/grumpy-dancers-develop.md | 5 ---- .changeset/loud-onions-invent.md | 5 ---- .changeset/nervous-years-sell.md | 7 ----- .changeset/real-squids-warn.md | 5 ---- .changeset/strange-snakes-hug.md | 5 ---- .changeset/thin-carpets-thank.md | 5 ---- .changeset/tiny-papayas-film.md | 9 ------- .changeset/twenty-dragons-shave.md | 5 ---- .changeset/warm-pillows-grow.md | 5 ---- .changeset/yellow-kings-hear.md | 5 ---- .../participant-entrypoint/requirements.txt | 2 +- examples/simple-color/requirements.txt | 2 +- examples/speech-to-text/requirements.txt | 4 +-- examples/text-to-speech/requirements.txt | 4 +-- .../voice-pipeline-agent/requirements.txt | 4 +-- livekit-agents/CHANGELOG.md | 20 ++++++++++++++ livekit-agents/livekit/agents/version.py | 2 +- livekit-agents/package.json | 2 +- .../livekit-plugins-anthropic/CHANGELOG.md | 16 ++++++++++++ .../livekit/plugins/anthropic/version.py | 2 +- .../livekit-plugins-anthropic/package.json | 2 +- .../livekit-plugins-azure/CHANGELOG.md | 6 +++++ .../livekit/plugins/azure/version.py | 2 +- .../livekit-plugins-azure/package.json | 2 +- .../livekit-plugins-browser/CHANGELOG.md | 6 +++++ .../livekit/plugins/browser/version.py | 2 +- .../livekit-plugins-browser/package.json | 2 +- .../livekit-plugins-deepgram/CHANGELOG.md | 8 ++++++ .../livekit/plugins/deepgram/version.py | 2 +- .../livekit-plugins-deepgram/package.json | 2 +- .../livekit-plugins-openai/CHANGELOG.md | 26 +++++++++++++++++++ .../livekit/plugins/openai/version.py | 2 +- .../livekit-plugins-openai/package.json | 2 +- .../CHANGELOG.md | 8 ++++++ .../livekit/plugins/turn_detector/version.py | 2 +- .../package.json | 2 +- 42 files changed, 112 insertions(+), 115 deletions(-) delete mode 100644 .changeset/clever-lies-explode.md delete mode 100644 .changeset/empty-sheep-pump.md delete mode 100644 .changeset/famous-points-tickle.md delete mode 100644 .changeset/fix-imgui-setup.md delete mode 100644 .changeset/gorgeous-days-retire.md delete mode 100644 .changeset/great-lizards-pump.md delete mode 100644 .changeset/grumpy-dancers-develop.md delete mode 100644 .changeset/loud-onions-invent.md delete mode 100644 .changeset/nervous-years-sell.md delete mode 100644 .changeset/real-squids-warn.md delete mode 100644 .changeset/strange-snakes-hug.md delete mode 100644 .changeset/thin-carpets-thank.md delete mode 100644 .changeset/tiny-papayas-film.md delete mode 100644 .changeset/twenty-dragons-shave.md delete mode 100644 .changeset/warm-pillows-grow.md delete mode 100644 .changeset/yellow-kings-hear.md diff --git a/.changeset/clever-lies-explode.md b/.changeset/clever-lies-explode.md deleted file mode 100644 index 1bf7ea69d..000000000 --- 
a/.changeset/clever-lies-explode.md +++ /dev/null @@ -1,7 +0,0 @@ ---- -"livekit-plugins-anthropic": patch -"livekit-plugins-openai": patch -"livekit-agents": patch ---- - -Moved create_ai_function_info to function_context.py for better reusability and reduce repetation diff --git a/.changeset/empty-sheep-pump.md b/.changeset/empty-sheep-pump.md deleted file mode 100644 index 06c854c20..000000000 --- a/.changeset/empty-sheep-pump.md +++ /dev/null @@ -1,6 +0,0 @@ ---- -"livekit-plugins-deepgram": patch -"livekit-agents": patch ---- - -added streaming audio decoder for compressed audio. diff --git a/.changeset/famous-points-tickle.md b/.changeset/famous-points-tickle.md deleted file mode 100644 index 48df9b431..000000000 --- a/.changeset/famous-points-tickle.md +++ /dev/null @@ -1,5 +0,0 @@ ---- -"livekit-plugins-openai": patch ---- - -add on_duplicate option for multimodal agent response create diff --git a/.changeset/fix-imgui-setup.md b/.changeset/fix-imgui-setup.md deleted file mode 100644 index a6e52168e..000000000 --- a/.changeset/fix-imgui-setup.md +++ /dev/null @@ -1,5 +0,0 @@ ---- -"livekit-plugins-browser": patch ---- - -fix: fix `imgui` setup diff --git a/.changeset/gorgeous-days-retire.md b/.changeset/gorgeous-days-retire.md deleted file mode 100644 index fa28e85a8..000000000 --- a/.changeset/gorgeous-days-retire.md +++ /dev/null @@ -1,5 +0,0 @@ ---- -"livekit-agents": patch ---- - -Add JPEG quality param to image encoder diff --git a/.changeset/great-lizards-pump.md b/.changeset/great-lizards-pump.md deleted file mode 100644 index a9542b8be..000000000 --- a/.changeset/great-lizards-pump.md +++ /dev/null @@ -1,9 +0,0 @@ ---- -"livekit-agents": patch -"livekit-plugins-anthropic": patch -"livekit-plugins-openai": patch ---- - -Add support for OpenAI's "detail" parameter to ChatImage - -Add support for data URLs on ChatImage in the Anthropic plugin. 
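The changeset retired just above landed OpenAI's "detail" knob on `ChatImage` (#1213). A short sketch, assuming the knob surfaces as the `inference_detail` field; the field name, URL, and value here are illustrative rather than confirmed by this patch:

```python
from livekit.agents import llm

# "high" asks OpenAI for full-resolution image analysis; "low"/"auto" also exist
image = llm.ChatImage(
    image="https://example.com/photo.jpg",
    inference_detail="high",  # assumed field name from #1213
)

chat_ctx = llm.ChatContext().append(
    role="user", text="What is shown here?", images=[image]
)
```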
diff --git a/.changeset/grumpy-dancers-develop.md b/.changeset/grumpy-dancers-develop.md deleted file mode 100644 index c5563f597..000000000 --- a/.changeset/grumpy-dancers-develop.md +++ /dev/null @@ -1,5 +0,0 @@ ---- -"livekit-plugins-openai": patch ---- - -filter out empty message for set chat ctx in realtime model diff --git a/.changeset/loud-onions-invent.md b/.changeset/loud-onions-invent.md deleted file mode 100644 index dcedf95b4..000000000 --- a/.changeset/loud-onions-invent.md +++ /dev/null @@ -1,5 +0,0 @@ ---- -"livekit-plugins-turn-detector": patch ---- - -use quantized onnx version of turn detector model diff --git a/.changeset/nervous-years-sell.md b/.changeset/nervous-years-sell.md deleted file mode 100644 index a7829fe92..000000000 --- a/.changeset/nervous-years-sell.md +++ /dev/null @@ -1,7 +0,0 @@ ---- -"livekit-plugins-anthropic": patch -"livekit-plugins-openai": patch -"livekit-agents": patch ---- - -fix: correctly parse function argument types diff --git a/.changeset/real-squids-warn.md b/.changeset/real-squids-warn.md deleted file mode 100644 index 43c5d096d..000000000 --- a/.changeset/real-squids-warn.md +++ /dev/null @@ -1,5 +0,0 @@ ---- -"livekit-plugins-openai": patch ---- - -add session_updated event for RealtimeSession diff --git a/.changeset/strange-snakes-hug.md b/.changeset/strange-snakes-hug.md deleted file mode 100644 index 1753e0133..000000000 --- a/.changeset/strange-snakes-hug.md +++ /dev/null @@ -1,5 +0,0 @@ ---- -"livekit-plugins-openai": patch ---- - -added llama 3.3 70b to model definitions diff --git a/.changeset/thin-carpets-thank.md b/.changeset/thin-carpets-thank.md deleted file mode 100644 index 809ac6fa5..000000000 --- a/.changeset/thin-carpets-thank.md +++ /dev/null @@ -1,5 +0,0 @@ ---- -"livekit-plugins-openai": patch ---- - -update default realtime model to gpt-4o-realtime-preview-2024-12-17 diff --git a/.changeset/tiny-papayas-film.md b/.changeset/tiny-papayas-film.md deleted file mode 100644 index 07ccea04c..000000000 --- a/.changeset/tiny-papayas-film.md +++ /dev/null @@ -1,9 +0,0 @@ ---- -"livekit-agents": patch -"livekit-plugins-anthropic": patch -"livekit-plugins-openai": patch ---- - -Fix center_aspect_fit bug, add scale_aspect_fit and scale_aspect_fill resizing options. - -Make scale_aspect_fit the new default resizing option for video frames. 
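Also retired here are the function-argument parsing fixes that feed the consolidated `_create_ai_function_info` helper (#1221, #1260). A minimal sketch of that helper, assuming a `FunctionContext` with one registered function; the function body, arguments, and tool-call id are illustrative:

```python
from livekit.agents import llm

fnc_ctx = llm.FunctionContext()


@fnc_ctx.ai_callable(description="Get the weather for a location")
def get_weather(location: str) -> str:
    return f"sunny in {location}"


# raw_arguments is the JSON string a provider returns for a tool call;
# the helper parses it, validates argument types, and applies defaults
call_info = llm._create_ai_function_info(
    fnc_ctx, "call_1", "get_weather", '{"location": "Tokyo"}'
)
assert call_info.arguments == {"location": "Tokyo"}
```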
diff --git a/.changeset/twenty-dragons-shave.md b/.changeset/twenty-dragons-shave.md deleted file mode 100644 index ceaa8890c..000000000 --- a/.changeset/twenty-dragons-shave.md +++ /dev/null @@ -1,5 +0,0 @@ ---- -"livekit-plugins-azure": patch ---- - -fix azure stt language autodetection diff --git a/.changeset/warm-pillows-grow.md b/.changeset/warm-pillows-grow.md deleted file mode 100644 index f0f29092a..000000000 --- a/.changeset/warm-pillows-grow.md +++ /dev/null @@ -1,5 +0,0 @@ ---- -"livekit-plugins-deepgram": patch ---- - -Support Deepgram TTS diff --git a/.changeset/yellow-kings-hear.md b/.changeset/yellow-kings-hear.md deleted file mode 100644 index 582956a37..000000000 --- a/.changeset/yellow-kings-hear.md +++ /dev/null @@ -1,5 +0,0 @@ ---- -"livekit-plugins-turn-detector": patch ---- - -use onnxruntime for turn detection and remove pytorch dependency diff --git a/examples/participant-entrypoint/requirements.txt b/examples/participant-entrypoint/requirements.txt index 5616d9626..53a52b16a 100644 --- a/examples/participant-entrypoint/requirements.txt +++ b/examples/participant-entrypoint/requirements.txt @@ -1,2 +1,2 @@ -livekit-agents>=0.12.2 +livekit-agents>=0.12.3 python-dotenv~=1.0 diff --git a/examples/simple-color/requirements.txt b/examples/simple-color/requirements.txt index 5616d9626..53a52b16a 100644 --- a/examples/simple-color/requirements.txt +++ b/examples/simple-color/requirements.txt @@ -1,2 +1,2 @@ -livekit-agents>=0.12.2 +livekit-agents>=0.12.3 python-dotenv~=1.0 diff --git a/examples/speech-to-text/requirements.txt b/examples/speech-to-text/requirements.txt index e74eccacc..53ee39eb8 100644 --- a/examples/speech-to-text/requirements.txt +++ b/examples/speech-to-text/requirements.txt @@ -1,3 +1,3 @@ -livekit-agents>=0.12.2 -livekit-plugins-deepgram>=0.6.14 +livekit-agents>=0.12.3 +livekit-plugins-deepgram>=0.6.15 python-dotenv~=1.0 diff --git a/examples/text-to-speech/requirements.txt b/examples/text-to-speech/requirements.txt index 8e983ef04..e5e0d8ddd 100644 --- a/examples/text-to-speech/requirements.txt +++ b/examples/text-to-speech/requirements.txt @@ -1,5 +1,5 @@ -livekit-agents>=0.12.2 -livekit-plugins-openai>=0.10.10 +livekit-agents>=0.12.3 +livekit-plugins-openai>=0.10.11 livekit-plugins-cartesia>=0.4.5 livekit-plugins-elevenlabs>=0.7.9 python-dotenv~=1.0 diff --git a/examples/voice-pipeline-agent/requirements.txt b/examples/voice-pipeline-agent/requirements.txt index a4cdff1ef..c8942df19 100644 --- a/examples/voice-pipeline-agent/requirements.txt +++ b/examples/voice-pipeline-agent/requirements.txt @@ -1,5 +1,5 @@ -livekit-agents>=0.12.2 -livekit-plugins-deepgram>=0.6.14 +livekit-agents>=0.12.3 +livekit-plugins-deepgram>=0.6.15 livekit-plugins-google>=0.8.1 livekit-plugins-openai[vertex]>=0.10.10 livekit-plugins-silero>=0.7.4 diff --git a/livekit-agents/CHANGELOG.md b/livekit-agents/CHANGELOG.md index 8a65f0234..83a2959c1 100644 --- a/livekit-agents/CHANGELOG.md +++ b/livekit-agents/CHANGELOG.md @@ -1,5 +1,25 @@ # livekit-agents +## 0.12.3 + +### Patch Changes + +- Moved create_ai_function_info to function_context.py for better reusability and reduced repetition - [#1260](https://github.com/livekit/agents/pull/1260) ([@jayeshp19](https://github.com/jayeshp19)) + +- added streaming audio decoder for compressed audio.
- [#1236](https://github.com/livekit/agents/pull/1236) ([@davidzhao](https://github.com/davidzhao)) + +- Add JPEG quality param to image encoder - [#1249](https://github.com/livekit/agents/pull/1249) ([@bcherry](https://github.com/bcherry)) + +- Add support for OpenAI's "detail" parameter to ChatImage - [#1213](https://github.com/livekit/agents/pull/1213) ([@bcherry](https://github.com/bcherry)) + + Add support for data URLs on ChatImage in the Anthropic plugin. + +- fix: correctly parse function argument types - [#1221](https://github.com/livekit/agents/pull/1221) ([@jayeshp19](https://github.com/jayeshp19)) + +- Fix center_aspect_fit bug, add scale_aspect_fit and scale_aspect_fill resizing options. - [#1222](https://github.com/livekit/agents/pull/1222) ([@bcherry](https://github.com/bcherry)) + + Make scale_aspect_fit the new default resizing option for video frames. + ## 0.12.2 ### Patch Changes diff --git a/livekit-agents/livekit/agents/version.py b/livekit-agents/livekit/agents/version.py index 769b5d67e..55829dea7 100644 --- a/livekit-agents/livekit/agents/version.py +++ b/livekit-agents/livekit/agents/version.py @@ -12,4 +12,4 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "0.12.2" +__version__ = "0.12.3" diff --git a/livekit-agents/package.json b/livekit-agents/package.json index 172ad6196..c23feb751 100644 --- a/livekit-agents/package.json +++ b/livekit-agents/package.json @@ -1,5 +1,5 @@ { "name": "livekit-agents", "private": true, - "version": "0.12.2" + "version": "0.12.3" } diff --git a/livekit-plugins/livekit-plugins-anthropic/CHANGELOG.md b/livekit-plugins/livekit-plugins-anthropic/CHANGELOG.md index ab0944df0..f540e9641 100644 --- a/livekit-plugins/livekit-plugins-anthropic/CHANGELOG.md +++ b/livekit-plugins/livekit-plugins-anthropic/CHANGELOG.md @@ -1,5 +1,21 @@ # livekit-plugins-anthropic +## 0.2.8 + +### Patch Changes + +- Moved create_ai_function_info to function_context.py for better reusability and reduced repetition - [#1260](https://github.com/livekit/agents/pull/1260) ([@jayeshp19](https://github.com/jayeshp19)) + +- Add support for OpenAI's "detail" parameter to ChatImage - [#1213](https://github.com/livekit/agents/pull/1213) ([@bcherry](https://github.com/bcherry)) + + Add support for data URLs on ChatImage in the Anthropic plugin. + +- fix: correctly parse function argument types - [#1221](https://github.com/livekit/agents/pull/1221) ([@jayeshp19](https://github.com/jayeshp19)) + +- Fix center_aspect_fit bug, add scale_aspect_fit and scale_aspect_fill resizing options. - [#1222](https://github.com/livekit/agents/pull/1222) ([@bcherry](https://github.com/bcherry)) + + Make scale_aspect_fit the new default resizing option for video frames. + ## 0.2.7 ### Patch Changes diff --git a/livekit-plugins/livekit-plugins-anthropic/livekit/plugins/anthropic/version.py b/livekit-plugins/livekit-plugins-anthropic/livekit/plugins/anthropic/version.py index c75e497a4..e558b382c 100644 --- a/livekit-plugins/livekit-plugins-anthropic/livekit/plugins/anthropic/version.py +++ b/livekit-plugins/livekit-plugins-anthropic/livekit/plugins/anthropic/version.py @@ -12,4 +12,4 @@ # See the License for the specific language governing permissions and # limitations under the License.
-__version__ = "0.2.7" +__version__ = "0.2.8" diff --git a/livekit-plugins/livekit-plugins-anthropic/package.json b/livekit-plugins/livekit-plugins-anthropic/package.json index a4f8b5235..ad2ba63a2 100644 --- a/livekit-plugins/livekit-plugins-anthropic/package.json +++ b/livekit-plugins/livekit-plugins-anthropic/package.json @@ -1,5 +1,5 @@ { "name": "livekit-plugins-anthropic", "private": true, - "version": "0.2.7" + "version": "0.2.8" } diff --git a/livekit-plugins/livekit-plugins-azure/CHANGELOG.md b/livekit-plugins/livekit-plugins-azure/CHANGELOG.md index 9a5897906..5d4ab532b 100644 --- a/livekit-plugins/livekit-plugins-azure/CHANGELOG.md +++ b/livekit-plugins/livekit-plugins-azure/CHANGELOG.md @@ -1,5 +1,11 @@ # livekit-plugins-azure +## 0.5.1 + +### Patch Changes + +- fix azure stt language autodetection - [#1246](https://github.com/livekit/agents/pull/1246) ([@davidzhao](https://github.com/davidzhao)) + ## 0.5.0 ### Minor Changes diff --git a/livekit-plugins/livekit-plugins-azure/livekit/plugins/azure/version.py b/livekit-plugins/livekit-plugins-azure/livekit/plugins/azure/version.py index 63a2bd75e..79283902f 100644 --- a/livekit-plugins/livekit-plugins-azure/livekit/plugins/azure/version.py +++ b/livekit-plugins/livekit-plugins-azure/livekit/plugins/azure/version.py @@ -12,4 +12,4 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "0.5.0" +__version__ = "0.5.1" diff --git a/livekit-plugins/livekit-plugins-azure/package.json b/livekit-plugins/livekit-plugins-azure/package.json index dc0b821de..cdd81c035 100644 --- a/livekit-plugins/livekit-plugins-azure/package.json +++ b/livekit-plugins/livekit-plugins-azure/package.json @@ -1,5 +1,5 @@ { "name": "livekit-plugins-azure", "private": true, - "version": "0.5.0" + "version": "0.5.1" } diff --git a/livekit-plugins/livekit-plugins-browser/CHANGELOG.md b/livekit-plugins/livekit-plugins-browser/CHANGELOG.md index e13c5455f..498a259c3 100644 --- a/livekit-plugins/livekit-plugins-browser/CHANGELOG.md +++ b/livekit-plugins/livekit-plugins-browser/CHANGELOG.md @@ -1,5 +1,11 @@ # livekit-plugins-browser +## 0.0.5 + +### Patch Changes + +- fix: fix `imgui` setup - [#1226](https://github.com/livekit/agents/pull/1226) ([@mbukeRepo](https://github.com/mbukeRepo)) + ## 0.0.4 ### Patch Changes diff --git a/livekit-plugins/livekit-plugins-browser/livekit/plugins/browser/version.py b/livekit-plugins/livekit-plugins-browser/livekit/plugins/browser/version.py index 1308acf66..0f8366140 100644 --- a/livekit-plugins/livekit-plugins-browser/livekit/plugins/browser/version.py +++ b/livekit-plugins/livekit-plugins-browser/livekit/plugins/browser/version.py @@ -12,4 +12,4 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-__version__ = "0.0.4" +__version__ = "0.0.5" diff --git a/livekit-plugins/livekit-plugins-browser/package.json b/livekit-plugins/livekit-plugins-browser/package.json index 5340f768c..f28e403c5 100644 --- a/livekit-plugins/livekit-plugins-browser/package.json +++ b/livekit-plugins/livekit-plugins-browser/package.json @@ -1,5 +1,5 @@ { "name": "livekit-plugins-browser", "private": true, - "version": "0.0.4" + "version": "0.0.5" } diff --git a/livekit-plugins/livekit-plugins-deepgram/CHANGELOG.md b/livekit-plugins/livekit-plugins-deepgram/CHANGELOG.md index 6836c4522..9c624c19f 100644 --- a/livekit-plugins/livekit-plugins-deepgram/CHANGELOG.md +++ b/livekit-plugins/livekit-plugins-deepgram/CHANGELOG.md @@ -1,5 +1,13 @@ # livekit-plugins-deepgram +## 0.6.15 + +### Patch Changes + +- added streaming audio decoder for compressed audio. - [#1236](https://github.com/livekit/agents/pull/1236) ([@davidzhao](https://github.com/davidzhao)) + +- Support Deepgram TTS - [#1201](https://github.com/livekit/agents/pull/1201) ([@jayeshp19](https://github.com/jayeshp19)) + ## 0.6.14 ### Patch Changes diff --git a/livekit-plugins/livekit-plugins-deepgram/livekit/plugins/deepgram/version.py b/livekit-plugins/livekit-plugins-deepgram/livekit/plugins/deepgram/version.py index 63f6f8624..c83922d4e 100644 --- a/livekit-plugins/livekit-plugins-deepgram/livekit/plugins/deepgram/version.py +++ b/livekit-plugins/livekit-plugins-deepgram/livekit/plugins/deepgram/version.py @@ -12,4 +12,4 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "0.6.14" +__version__ = "0.6.15" diff --git a/livekit-plugins/livekit-plugins-deepgram/package.json b/livekit-plugins/livekit-plugins-deepgram/package.json index 1259f3ad3..65cf7a26a 100644 --- a/livekit-plugins/livekit-plugins-deepgram/package.json +++ b/livekit-plugins/livekit-plugins-deepgram/package.json @@ -1,5 +1,5 @@ { "name": "livekit-plugins-deepgram", "private": true, - "version": "0.6.14" + "version": "0.6.15" } diff --git a/livekit-plugins/livekit-plugins-openai/CHANGELOG.md b/livekit-plugins/livekit-plugins-openai/CHANGELOG.md index 3d2783b7c..d9f42cc0c 100644 --- a/livekit-plugins/livekit-plugins-openai/CHANGELOG.md +++ b/livekit-plugins/livekit-plugins-openai/CHANGELOG.md @@ -1,5 +1,31 @@ # livekit-plugins-openai +## 0.10.11 + +### Patch Changes + +- Moved create_ai_function_info to function_context.py for better reusability and reduce repetation - [#1260](https://github.com/livekit/agents/pull/1260) ([@jayeshp19](https://github.com/jayeshp19)) + +- add on_duplicate option for multimodal agent response create - [#1204](https://github.com/livekit/agents/pull/1204) ([@longcw](https://github.com/longcw)) + +- Add support for OpenAI's "detail" parameter to ChatImage - [#1213](https://github.com/livekit/agents/pull/1213) ([@bcherry](https://github.com/bcherry)) + + Add support for data URLs on ChatImage in the Anthropic plugin. 
+ +- filter out empty message for set chat ctx in realtime model - [#1245](https://github.com/livekit/agents/pull/1245) ([@longcw](https://github.com/longcw)) + +- fix: correctly parse function argument types - [#1221](https://github.com/livekit/agents/pull/1221) ([@jayeshp19](https://github.com/jayeshp19)) + +- add session_updated event for RealtimeSession - [#1253](https://github.com/livekit/agents/pull/1253) ([@longcw](https://github.com/longcw)) + +- added llama 3.3 70b to model definitions - [#1233](https://github.com/livekit/agents/pull/1233) ([@davidzhao](https://github.com/davidzhao)) + +- update default realtime model to gpt-4o-realtime-preview-2024-12-17 - [#1250](https://github.com/livekit/agents/pull/1250) ([@davidzhao](https://github.com/davidzhao)) + +- Fix center_aspect_fit bug, add scale_aspect_fit and scale_aspect_fill resizing options. - [#1222](https://github.com/livekit/agents/pull/1222) ([@bcherry](https://github.com/bcherry)) + + Make scale_aspect_fit the new default resizing option for video frames. + ## 0.10.10 ### Patch Changes diff --git a/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/version.py b/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/version.py index 9a14e871f..613650a21 100644 --- a/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/version.py +++ b/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/version.py @@ -12,4 +12,4 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "0.10.10" +__version__ = "0.10.11" diff --git a/livekit-plugins/livekit-plugins-openai/package.json b/livekit-plugins/livekit-plugins-openai/package.json index b9238338f..a5087740b 100644 --- a/livekit-plugins/livekit-plugins-openai/package.json +++ b/livekit-plugins/livekit-plugins-openai/package.json @@ -1,5 +1,5 @@ { "name": "livekit-plugins-openai", "private": true, - "version": "0.10.10" + "version": "0.10.11" } diff --git a/livekit-plugins/livekit-plugins-turn-detector/CHANGELOG.md b/livekit-plugins/livekit-plugins-turn-detector/CHANGELOG.md index 201e0f662..0bc8544b5 100644 --- a/livekit-plugins/livekit-plugins-turn-detector/CHANGELOG.md +++ b/livekit-plugins/livekit-plugins-turn-detector/CHANGELOG.md @@ -1,5 +1,13 @@ # livekit-plugins-eou +## 0.3.3 + +### Patch Changes + +- use quantized onnx version of turn detector model - [#1231](https://github.com/livekit/agents/pull/1231) ([@jeradf](https://github.com/jeradf)) + +- use onnxruntime for turn detection and remove pytorch dependency - [#1257](https://github.com/livekit/agents/pull/1257) ([@jeradf](https://github.com/jeradf)) + ## 0.3.2 ### Patch Changes diff --git a/livekit-plugins/livekit-plugins-turn-detector/livekit/plugins/turn_detector/version.py b/livekit-plugins/livekit-plugins-turn-detector/livekit/plugins/turn_detector/version.py index adb9a59d4..6b8f1ef90 100644 --- a/livekit-plugins/livekit-plugins-turn-detector/livekit/plugins/turn_detector/version.py +++ b/livekit-plugins/livekit-plugins-turn-detector/livekit/plugins/turn_detector/version.py @@ -12,4 +12,4 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-__version__ = "0.3.2" +__version__ = "0.3.3" diff --git a/livekit-plugins/livekit-plugins-turn-detector/package.json b/livekit-plugins/livekit-plugins-turn-detector/package.json index 6e6bfea47..acd5d4300 100644 --- a/livekit-plugins/livekit-plugins-turn-detector/package.json +++ b/livekit-plugins/livekit-plugins-turn-detector/package.json @@ -1,5 +1,5 @@ { "name": "livekit-plugins-turn-detector", "private": true, - "version": "0.3.2" + "version": "0.3.3" } From 0ccc4b02a7c812fabcbd149c9ca7291eb0dfe14a Mon Sep 17 00:00:00 2001 From: David Zhao Date: Fri, 20 Dec 2024 23:40:54 -0800 Subject: [PATCH 15/46] Revert to upload-artifacts@v3, update dependency versions (#1267) Co-authored-by: github-actions <41898282+github-actions[bot]@users.noreply.github.com> --- .changeset/pre.json | 25 +++++++++++++++++++ .github/workflows/build-package.yml | 4 +-- .github/workflows/publish-package.yml | 2 +- .../livekit-plugins-anthropic/setup.py | 2 +- .../livekit-plugins-assemblyai/setup.py | 2 +- .../livekit-plugins-azure/setup.py | 2 +- .../livekit-plugins-browser/setup.py | 2 +- .../livekit-plugins-cartesia/setup.py | 2 +- .../livekit-plugins-clova/setup.py | 2 +- .../livekit-plugins-deepgram/setup.py | 2 +- .../livekit-plugins-elevenlabs/setup.py | 2 +- livekit-plugins/livekit-plugins-fal/setup.py | 2 +- .../livekit-plugins-google/setup.py | 2 +- .../livekit-plugins-llama-index/setup.py | 2 +- .../livekit-plugins-openai/setup.py | 2 +- .../livekit-plugins-playht/setup.py | 2 +- livekit-plugins/livekit-plugins-rag/setup.py | 2 +- .../livekit-plugins-silero/setup.py | 2 +- .../livekit-plugins-turn-detector/setup.py | 2 +- 19 files changed, 44 insertions(+), 19 deletions(-) create mode 100644 .changeset/pre.json diff --git a/.changeset/pre.json b/.changeset/pre.json new file mode 100644 index 000000000..1bfc38cea --- /dev/null +++ b/.changeset/pre.json @@ -0,0 +1,25 @@ +{ + "mode": "pre", + "tag": "dev", + "initialVersions": { + "livekit-agents": "0.12.3", + "livekit-plugins-anthropic": "0.2.8", + "livekit-plugins-assemblyai": "0.2.1", + "livekit-plugins-azure": "0.5.1", + "livekit-plugins-browser": "0.0.5", + "livekit-plugins-cartesia": "0.4.5", + "livekit-plugins-deepgram": "0.6.15", + "livekit-plugins-elevenlabs": "0.7.9", + "livekit-plugins-fal": "0.2.2", + "livekit-plugins-google": "0.8.1", + "livekit-plugins-llama-index": "0.2.2", + "livekit-plugins-minimal": "0.2.1", + "livekit-plugins-nltk": "0.7.3", + "livekit-plugins-openai": "0.10.11", + "livekit-plugins-playht": "1.0.3", + "livekit-plugins-rag": "0.2.3", + "livekit-plugins-silero": "0.7.4", + "livekit-plugins-turn-detector": "0.3.3" + }, + "changesets": [] +} diff --git a/.github/workflows/build-package.yml b/.github/workflows/build-package.yml index f0f721f72..7593c01be 100644 --- a/.github/workflows/build-package.yml +++ b/.github/workflows/build-package.yml @@ -47,7 +47,7 @@ jobs: run: python -m build - name: Upload distribution package - uses: actions/upload-artifact@v4 + uses: actions/upload-artifact@v3 with: name: ${{ inputs.artifact_name }} path: "${{ startsWith(inputs.package, 'livekit-plugin') && 'livekit-plugins/' || '' }}${{ inputs.package }}/dist/" @@ -82,7 +82,7 @@ jobs: CIBW_BUILD_VERBOSITY: 3 - name: Upload distribution package - uses: actions/upload-artifact@v4 + uses: actions/upload-artifact@v3 with: name: ${{ inputs.artifact_name }} path: livekit-plugins/livekit-plugins-browser/dist/ diff --git a/.github/workflows/publish-package.yml b/.github/workflows/publish-package.yml index 669f37d68..05feeb366 100644 --- 
a/.github/workflows/publish-package.yml +++ b/.github/workflows/publish-package.yml @@ -96,7 +96,7 @@ jobs: steps: - name: Download all the dists - uses: actions/download-artifact@v4 + uses: actions/download-artifact@v3 with: name: python-package-distributions path: dist/ diff --git a/livekit-plugins/livekit-plugins-anthropic/setup.py b/livekit-plugins/livekit-plugins-anthropic/setup.py index 5a21aeb5c..4d9c3a1ba 100644 --- a/livekit-plugins/livekit-plugins-anthropic/setup.py +++ b/livekit-plugins/livekit-plugins-anthropic/setup.py @@ -49,7 +49,7 @@ license="Apache-2.0", packages=setuptools.find_namespace_packages(include=["livekit.*"]), python_requires=">=3.9.0", - install_requires=["livekit-agents>=0.11", "anthropic>=0.34"], + install_requires=["livekit-agents>=0.12.3", "anthropic>=0.34"], package_data={"livekit.plugins.anthropic": ["py.typed"]}, project_urls={ "Documentation": "https://docs.livekit.io", diff --git a/livekit-plugins/livekit-plugins-assemblyai/setup.py b/livekit-plugins/livekit-plugins-assemblyai/setup.py index 8cd008a0c..edd7e5494 100644 --- a/livekit-plugins/livekit-plugins-assemblyai/setup.py +++ b/livekit-plugins/livekit-plugins-assemblyai/setup.py @@ -48,7 +48,7 @@ packages=setuptools.find_namespace_packages(include=["livekit.*"]), python_requires=">=3.9.0", install_requires=[ - "livekit-agents>=0.11", + "livekit-agents>=0.12.3", ], package_data={}, project_urls={ diff --git a/livekit-plugins/livekit-plugins-azure/setup.py b/livekit-plugins/livekit-plugins-azure/setup.py index 288de7187..e854fc492 100644 --- a/livekit-plugins/livekit-plugins-azure/setup.py +++ b/livekit-plugins/livekit-plugins-azure/setup.py @@ -46,7 +46,7 @@ packages=setuptools.find_namespace_packages(include=["livekit.*"]), python_requires=">=3.9.0", install_requires=[ - "livekit-agents>=0.11", + "livekit-agents>=0.12.3", "azure-cognitiveservices-speech>=1.41.0", ], package_data={}, diff --git a/livekit-plugins/livekit-plugins-browser/setup.py b/livekit-plugins/livekit-plugins-browser/setup.py index 8eafd27d8..088259ebf 100644 --- a/livekit-plugins/livekit-plugins-browser/setup.py +++ b/livekit-plugins/livekit-plugins-browser/setup.py @@ -113,7 +113,7 @@ def build_extension(self, ext: CMakeExtension) -> None: cmdclass={"build_ext": CMakeBuild}, packages=setuptools.find_namespace_packages(include=["livekit.*"]), python_requires=">=3.9.0", - install_requires=["livekit-agents>=0.11"], + install_requires=["livekit-agents>=0.12.3"], package_data={ "livekit.plugins.browser": ["py.typed"], "livekit.plugins.browser.resources": ["**", "lkcef_app.app"], diff --git a/livekit-plugins/livekit-plugins-cartesia/setup.py b/livekit-plugins/livekit-plugins-cartesia/setup.py index e4ce007f9..8044f23c6 100644 --- a/livekit-plugins/livekit-plugins-cartesia/setup.py +++ b/livekit-plugins/livekit-plugins-cartesia/setup.py @@ -47,7 +47,7 @@ license="Apache-2.0", packages=setuptools.find_namespace_packages(include=["livekit.*"]), python_requires=">=3.9.0", - install_requires=["livekit-agents>=0.11"], + install_requires=["livekit-agents>=0.12.3"], project_urls={ "Documentation": "https://docs.livekit.io", "Website": "https://livekit.io/", diff --git a/livekit-plugins/livekit-plugins-clova/setup.py b/livekit-plugins/livekit-plugins-clova/setup.py index 254fd1cba..08abcf970 100644 --- a/livekit-plugins/livekit-plugins-clova/setup.py +++ b/livekit-plugins/livekit-plugins-clova/setup.py @@ -47,7 +47,7 @@ license="Apache-2.0", packages=setuptools.find_namespace_packages(include=["livekit.*"]), python_requires=">=3.9.0", - 
install_requires=["livekit-agents>=0.11", "pydub~=0.25.1"], + install_requires=["livekit-agents>=0.12.3", "pydub~=0.25.1"], project_urls={ "Documentation": "https://docs.livekit.io", "Website": "https://livekit.io/", diff --git a/livekit-plugins/livekit-plugins-deepgram/setup.py b/livekit-plugins/livekit-plugins-deepgram/setup.py index 8a583611d..b9316b839 100644 --- a/livekit-plugins/livekit-plugins-deepgram/setup.py +++ b/livekit-plugins/livekit-plugins-deepgram/setup.py @@ -47,7 +47,7 @@ license="Apache-2.0", packages=setuptools.find_namespace_packages(include=["livekit.*"]), python_requires=">=3.9.0", - install_requires=["livekit-agents>=0.12.2", "numpy>=1.26"], + install_requires=["livekit-agents>=0.12.3", "numpy>=1.26"], package_data={"livekit.plugins.deepgram": ["py.typed"]}, project_urls={ "Documentation": "https://docs.livekit.io", diff --git a/livekit-plugins/livekit-plugins-elevenlabs/setup.py b/livekit-plugins/livekit-plugins-elevenlabs/setup.py index ba5400e84..829739fe2 100644 --- a/livekit-plugins/livekit-plugins-elevenlabs/setup.py +++ b/livekit-plugins/livekit-plugins-elevenlabs/setup.py @@ -49,7 +49,7 @@ license="Apache-2.0", packages=setuptools.find_namespace_packages(include=["livekit.*"]), python_requires=">=3.9.0", - install_requires=["livekit-agents[codecs]>=0.11"], + install_requires=["livekit-agents[codecs]>=0.12.3"], package_data={"livekit.plugins.elevenlabs": ["py.typed"]}, project_urls={ "Documentation": "https://docs.livekit.io", diff --git a/livekit-plugins/livekit-plugins-fal/setup.py b/livekit-plugins/livekit-plugins-fal/setup.py index 014251d0c..760607daf 100644 --- a/livekit-plugins/livekit-plugins-fal/setup.py +++ b/livekit-plugins/livekit-plugins-fal/setup.py @@ -47,7 +47,7 @@ license="Apache-2.0", packages=setuptools.find_namespace_packages(include=["livekit.*"]), python_requires=">=3.9.0", - install_requires=["livekit-agents>=0.11", "fal_client"], + install_requires=["livekit-agents>=0.12.3", "fal_client"], package_data={"livekit.plugins.fal": ["py.typed"]}, project_urls={ "Documentation": "https://docs.livekit.io", diff --git a/livekit-plugins/livekit-plugins-google/setup.py b/livekit-plugins/livekit-plugins-google/setup.py index b6e72949b..87646895f 100644 --- a/livekit-plugins/livekit-plugins-google/setup.py +++ b/livekit-plugins/livekit-plugins-google/setup.py @@ -51,7 +51,7 @@ "google-auth >= 2, < 3", "google-cloud-speech >= 2, < 3", "google-cloud-texttospeech >= 2, < 3", - "livekit-agents>=0.11", + "livekit-agents>=0.12.3", ], package_data={"livekit.plugins.google": ["py.typed"]}, project_urls={ diff --git a/livekit-plugins/livekit-plugins-llama-index/setup.py b/livekit-plugins/livekit-plugins-llama-index/setup.py index 98b0babab..acc39333d 100644 --- a/livekit-plugins/livekit-plugins-llama-index/setup.py +++ b/livekit-plugins/livekit-plugins-llama-index/setup.py @@ -49,7 +49,7 @@ license="Apache-2.0", packages=setuptools.find_namespace_packages(include=["livekit.*"]), python_requires=">=3.9.0", - install_requires=["livekit-agents>=0.11"], + install_requires=["livekit-agents>=0.12.3"], package_data={"livekit.plugins.llama_index": ["py.typed"]}, project_urls={ "Documentation": "https://docs.livekit.io", diff --git a/livekit-plugins/livekit-plugins-openai/setup.py b/livekit-plugins/livekit-plugins-openai/setup.py index a7b6cdf19..eb9d6d0fe 100644 --- a/livekit-plugins/livekit-plugins-openai/setup.py +++ b/livekit-plugins/livekit-plugins-openai/setup.py @@ -48,7 +48,7 @@ packages=setuptools.find_namespace_packages(include=["livekit.*"]), 
python_requires=">=3.9.0", install_requires=[ - "livekit-agents[codecs, images]>=0.11", + "livekit-agents[codecs, images]>=0.12.3", "openai>=1.50", ], extras_require={ diff --git a/livekit-plugins/livekit-plugins-playht/setup.py b/livekit-plugins/livekit-plugins-playht/setup.py index ea5c7bf77..eb41a5b89 100644 --- a/livekit-plugins/livekit-plugins-playht/setup.py +++ b/livekit-plugins/livekit-plugins-playht/setup.py @@ -32,7 +32,7 @@ packages=setuptools.find_namespace_packages(include=["livekit.*"]), python_requires=">=3.9.0", install_requires=[ - "livekit-agents[codecs]>=0.11", + "livekit-agents[codecs]>=0.12.3", "pyht", "aiohttp", "livekit", diff --git a/livekit-plugins/livekit-plugins-rag/setup.py b/livekit-plugins/livekit-plugins-rag/setup.py index 55c8223a8..00ae59c86 100644 --- a/livekit-plugins/livekit-plugins-rag/setup.py +++ b/livekit-plugins/livekit-plugins-rag/setup.py @@ -47,7 +47,7 @@ license="Apache-2.0", packages=setuptools.find_namespace_packages(include=["livekit.*"]), python_requires=">=3.9.0", - install_requires=["livekit-agents>=0.11", "annoy>=1.17"], + install_requires=["livekit-agents>=0.12.3", "annoy>=1.17"], package_data={"livekit.plugins.rag": ["py.typed"]}, project_urls={ "Documentation": "https://docs.livekit.io", diff --git a/livekit-plugins/livekit-plugins-silero/setup.py b/livekit-plugins/livekit-plugins-silero/setup.py index c5202db9c..52bc41ba2 100644 --- a/livekit-plugins/livekit-plugins-silero/setup.py +++ b/livekit-plugins/livekit-plugins-silero/setup.py @@ -47,7 +47,7 @@ license="Apache-2.0", packages=setuptools.find_namespace_packages(include=["livekit.*"]), python_requires=">=3.9.0", - install_requires=["livekit-agents>=0.11", "onnxruntime>=1.18", "numpy>=1.26"], + install_requires=["livekit-agents>=0.12.3", "onnxruntime>=1.18", "numpy>=1.26"], package_data={ "livekit.plugins.silero.resources": ["silero_vad.onnx"], "livekit.plugins.silero": ["py.typed"], diff --git a/livekit-plugins/livekit-plugins-turn-detector/setup.py b/livekit-plugins/livekit-plugins-turn-detector/setup.py index 7b9b4b192..f53e82135 100644 --- a/livekit-plugins/livekit-plugins-turn-detector/setup.py +++ b/livekit-plugins/livekit-plugins-turn-detector/setup.py @@ -50,7 +50,7 @@ packages=setuptools.find_namespace_packages(include=["livekit.*"]), python_requires=">=3.9.0", install_requires=[ - "livekit-agents>=0.11", + "livekit-agents>=0.12.3", "transformers>=4.47.1", "numpy>=1.26", "onnxruntime>=1.18", From af777bef0586f53b26bffe1c7834b405cf77429c Mon Sep 17 00:00:00 2001 From: github-actions <41898282+github-actions[bot]@users.noreply.github.com> Date: Sat, 21 Dec 2024 09:03:42 +0000 Subject: [PATCH 16/46] Exit pre release mode --- .changeset/pre.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.changeset/pre.json b/.changeset/pre.json index 1bfc38cea..c3a216b74 100644 --- a/.changeset/pre.json +++ b/.changeset/pre.json @@ -1,5 +1,5 @@ { - "mode": "pre", + "mode": "exit", "tag": "dev", "initialVersions": { "livekit-agents": "0.12.3", From 50d0a716da00ff4b9b8b82ccc0560653bbd3c974 Mon Sep 17 00:00:00 2001 From: lukasIO Date: Sat, 21 Dec 2024 17:06:59 +0100 Subject: [PATCH 17/46] Update to v4 versions with multiple artifact download (#1268) --- .github/workflows/build-package.yml | 4 ++-- .github/workflows/publish-package.yml | 11 ++++++----- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/.github/workflows/build-package.yml b/.github/workflows/build-package.yml index 7593c01be..f0f721f72 100644 --- a/.github/workflows/build-package.yml +++ 
b/.github/workflows/build-package.yml @@ -47,7 +47,7 @@ jobs: run: python -m build - name: Upload distribution package - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: name: ${{ inputs.artifact_name }} path: "${{ startsWith(inputs.package, 'livekit-plugin') && 'livekit-plugins/' || '' }}${{ inputs.package }}/dist/" @@ -82,7 +82,7 @@ jobs: CIBW_BUILD_VERBOSITY: 3 - name: Upload distribution package - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: name: ${{ inputs.artifact_name }} path: livekit-plugins/livekit-plugins-browser/dist/ diff --git a/.github/workflows/publish-package.yml b/.github/workflows/publish-package.yml index 05feeb366..6724f50aa 100644 --- a/.github/workflows/publish-package.yml +++ b/.github/workflows/publish-package.yml @@ -27,7 +27,7 @@ jobs: submodules: true lfs: true env: - GITHUB_TOKEN: ${{ secrets.CHANGESETS_PUSH_PAT }} + GITHUB_TOKEN: ${{ secrets.CHANGESETS_PUSH_DEPLOY_KEY }} - uses: pnpm/action-setup@v4 - name: Use Node.js 20 @@ -84,7 +84,7 @@ jobs: uses: livekit/agents/.github/workflows/build-package.yml@main with: package: ${{ matrix.package.name }} - artifact_name: python-package-distributions + artifact_name: python-package-dist-${{matrix.package.name}} publish: needs: @@ -96,10 +96,11 @@ jobs: steps: - name: Download all the dists - uses: actions/download-artifact@v3 + uses: actions/download-artifact@v4 with: - name: python-package-distributions - path: dist/ + path: dist + pattern: python-package-dist-* + merge-multiple: true - name: Publish package uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29 From dec87af9c8b65ae9b4d85c003bb8a4c73f877954 Mon Sep 17 00:00:00 2001 From: aoife cassidy Date: Sat, 21 Dec 2024 18:20:09 +0200 Subject: [PATCH 18/46] ci: use ssh key and remove references to GITHUB_TOKEN (#1269) --- .github/workflows/publish-package.yml | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/.github/workflows/publish-package.yml b/.github/workflows/publish-package.yml index 6724f50aa..5e5200417 100644 --- a/.github/workflows/publish-package.yml +++ b/.github/workflows/publish-package.yml @@ -26,8 +26,7 @@ jobs: with: submodules: true lfs: true - env: - GITHUB_TOKEN: ${{ secrets.CHANGESETS_PUSH_DEPLOY_KEY }} + ssh-key: ${{ secrets.CHANGESETS_PUSH_DEPLOY_KEY }} - uses: pnpm/action-setup@v4 - name: Use Node.js 20 @@ -50,8 +49,6 @@ jobs: set +e pnpm changeset pre ${{ github.ref == 'refs/heads/main' && 'exit' || 'enter dev' }} echo "exitcode=$?" 
>> $GITHUB_OUTPUT - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - name: Add changes if: ${{ steps.release_mode.outputs.exitcode == '0' }} @@ -67,8 +64,6 @@ jobs: with: version: pnpm ci:version publish: pnpm ci:publish - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - name: debug packages run: echo "${{ steps.changesets.outputs.publishedPackages }}" From a4f0bdb51cf5bb2a8ddbf372bc8ed034c5df0676 Mon Sep 17 00:00:00 2001 From: Long Chen Date: Sun, 22 Dec 2024 16:05:46 +0800 Subject: [PATCH 19/46] fix: avoid duplicated chat ctx for function call with messages (#1254) --- .changeset/curvy-knives-promise.md | 5 +++++ .../voice-pipeline-agent/function_calling_weather.py | 4 ++-- .../livekit/agents/pipeline/pipeline_agent.py | 11 ++++++++++- .../livekit/agents/pipeline/speech_handle.py | 8 ++++++++ 4 files changed, 25 insertions(+), 3 deletions(-) create mode 100644 .changeset/curvy-knives-promise.md diff --git a/.changeset/curvy-knives-promise.md b/.changeset/curvy-knives-promise.md new file mode 100644 index 000000000..a4d79c4ca --- /dev/null +++ b/.changeset/curvy-knives-promise.md @@ -0,0 +1,5 @@ +--- +"livekit-agents": patch +--- + +avoid duplicated chat ctx for function calls with messages diff --git a/examples/voice-pipeline-agent/function_calling_weather.py b/examples/voice-pipeline-agent/function_calling_weather.py index e8add68d0..7f1ba5fa5 100644 --- a/examples/voice-pipeline-agent/function_calling_weather.py +++ b/examples/voice-pipeline-agent/function_calling_weather.py @@ -67,8 +67,8 @@ async def get_weather( f"Failed to get weather data, status code: {response.status}" ) - # To wait for the speech to finish before giving results of the function call - await speech_handle.join() + # (optional) To wait for the speech to finish before giving results of the function call + # await speech_handle.join() return weather_data diff --git a/livekit-agents/livekit/agents/pipeline/pipeline_agent.py b/livekit-agents/livekit/agents/pipeline/pipeline_agent.py index a08291ea4..5493583b7 100644 --- a/livekit-agents/livekit/agents/pipeline/pipeline_agent.py +++ b/livekit-agents/livekit/agents/pipeline/pipeline_agent.py @@ -801,12 +801,20 @@ def _commit_user_question_if_needed() -> None: speech_handle.source.function_calls ) + message_id_committed: str | None = None if ( collected_text and speech_handle.add_to_chat_ctx and (not user_question or speech_handle.user_committed) ): if speech_handle.extra_tools_messages: + msgs = self._chat_ctx.messages + if msgs and msgs[-1].id == speech_handle.fnc_text_message_id: + # remove text message alongside function calls if it's the last in the ctx + msgs.pop() + elif speech_handle.extra_tools_messages[0].tool_calls: + # remove the content of the tool call message + speech_handle.extra_tools_messages[0].content = "" self._chat_ctx.messages.extend(speech_handle.extra_tools_messages) if interrupted: @@ -814,7 +822,7 @@ def _commit_user_question_if_needed() -> None: msg = ChatMessage.create(text=collected_text, role="assistant") self._chat_ctx.messages.append(msg) - + message_id_committed = msg.id speech_handle.mark_speech_committed() if interrupted: @@ -914,6 +922,7 @@ async def _execute_function_calls() -> None: add_to_chat_ctx=speech_handle.add_to_chat_ctx, extra_tools_messages=extra_tools_messages, fnc_nested_depth=speech_handle.fnc_nested_depth + 1, + fnc_text_message_id=message_id_committed, ) # synthesize the tool speech with the chat ctx from llm_stream diff --git a/livekit-agents/livekit/agents/pipeline/speech_handle.py 
b/livekit-agents/livekit/agents/pipeline/speech_handle.py index d1c64b5c9..d36eb7aee 100644 --- a/livekit-agents/livekit/agents/pipeline/speech_handle.py +++ b/livekit-agents/livekit/agents/pipeline/speech_handle.py @@ -19,6 +19,7 @@ def __init__( user_question: str, fnc_nested_depth: int = 0, extra_tools_messages: list[ChatMessage] | None = None, + fnc_text_message_id: str | None = None, ) -> None: self._id = id self._allow_interruptions = allow_interruptions @@ -41,6 +42,7 @@ def __init__( # nested speech handle and function calls self._fnc_nested_depth = fnc_nested_depth self._fnc_extra_tools_messages: list[ChatMessage] | None = extra_tools_messages + self._fnc_text_message_id: str | None = fnc_text_message_id self._nested_speech_handles: list[SpeechHandle] = [] self._nested_speech_changed = asyncio.Event() @@ -82,6 +84,7 @@ def create_tool_speech( add_to_chat_ctx: bool, fnc_nested_depth: int, extra_tools_messages: list[ChatMessage], + fnc_text_message_id: str | None = None, ) -> SpeechHandle: return SpeechHandle( id=utils.shortuuid(), @@ -91,6 +94,7 @@ def create_tool_speech( user_question="", fnc_nested_depth=fnc_nested_depth, extra_tools_messages=extra_tools_messages, + fnc_text_message_id=fnc_text_message_id, ) async def wait_for_initialization(self) -> None: @@ -200,6 +204,10 @@ def fnc_nested_depth(self) -> int: def extra_tools_messages(self) -> list[ChatMessage] | None: return self._fnc_extra_tools_messages + @property + def fnc_text_message_id(self) -> str | None: + return self._fnc_text_message_id + def add_nested_speech(self, speech_handle: SpeechHandle) -> None: self._nested_speech_handles.append(speech_handle) self._nested_speech_changed.set() From 49f14dd71dc1708e4f99e8ea12124f213b84dbf4 Mon Sep 17 00:00:00 2001 From: lukasIO Date: Sun, 22 Dec 2024 12:18:16 +0100 Subject: [PATCH 20/46] ci: re-add GITHUB_TOKEN to publish workflow (#1272) --- .github/workflows/publish-package.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/publish-package.yml b/.github/workflows/publish-package.yml index 5e5200417..61692429e 100644 --- a/.github/workflows/publish-package.yml +++ b/.github/workflows/publish-package.yml @@ -49,6 +49,8 @@ jobs: set +e pnpm changeset pre ${{ github.ref == 'refs/heads/main' && 'exit' || 'enter dev' }} echo "exitcode=$?" 
>> $GITHUB_OUTPUT + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - name: Add changes if: ${{ steps.release_mode.outputs.exitcode == '0' }} @@ -64,6 +66,8 @@ jobs: with: version: pnpm ci:version publish: pnpm ci:publish + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - name: debug packages run: echo "${{ steps.changesets.outputs.publishedPackages }}" From b6542abe5f42f5a49bc845301b57e91f049b7cd8 Mon Sep 17 00:00:00 2001 From: Mike McLaughlin <23640224+mike-r-mclaughlin@users.noreply.github.com> Date: Sun, 22 Dec 2024 07:54:44 -0600 Subject: [PATCH 21/46] examples: updated, tested, and added @dsa's Hive moderation agent (#1263) --- examples/hive-moderation-agent/README.md | 41 +++++ examples/hive-moderation-agent/agent.py | 163 ++++++++++++++++++ .../hive_data_classes.py | 95 ++++++++++ .../hive-moderation-agent/requirements.txt | 5 + 4 files changed, 304 insertions(+) create mode 100644 examples/hive-moderation-agent/README.md create mode 100644 examples/hive-moderation-agent/agent.py create mode 100644 examples/hive-moderation-agent/hive_data_classes.py create mode 100644 examples/hive-moderation-agent/requirements.txt diff --git a/examples/hive-moderation-agent/README.md b/examples/hive-moderation-agent/README.md new file mode 100644 index 000000000..8f48218bb --- /dev/null +++ b/examples/hive-moderation-agent/README.md @@ -0,0 +1,41 @@ +# LiveKit realtime moderation agent using Hive + +This is an agent that performs visual moderation of every participant's video in a room. It does this moderation using the Visual Content Moderation model from [Hive](https://thehive.ai) [[docs](https://docs.thehive.ai/docs/visual-content-moderation#visual-content-moderation)]. + +## Prerequisites + +Before running this agent, you'll need: + +1. A LiveKit Cloud project (or a self-hosted LiveKit server). +2. An API key from Hive to access the above mentioned model. + +## Configuration + +Currently, this agent is configured entirely from the `agent.py` source code and the environment. + +### Environment Variables + +| configuration | description | example value | +|---------------|-------------|---------------| +| `LIVEKIT_URL` | Your LiveKit URL | `wss://test-abc123de.livekit.cloud` | +| `LIVEKIT_API_KEY` | Your LiveKit API key | | +| `LIVEKIT_API_SECRET` | Your LiveKit API secret | | +| `HIVE_API_KEY` | The API key from Hive to access the `Visual Content Moderation` model | `abc1deFgHIjK23KLMNOp45QrsTuv6wx8` | + +### Code + +| configuration | description | example value | +|---------------|-------------|---------------| +| `MOD_FRAME_INTERVAL` | Minimum number of seconds to wait between frames | 5.0 | +| `HIVE_HEADERS` | The headers to send with every request to the Hive API | `{}` | +| `CONFIDENCE_THRESHOLD` | The minimum score Hive's moderation class must meet before it is considered a problem | 0.9 | + +## Running + +Run this code like you would any other [LiveKit agent](https://docs.livekit.io/agents/build/anatomy/#starting-the-worker): + +``` +python3 agent.py start +``` + +Once running, the agent will join all new LiveKit rooms by default and begin moderation. 
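+
+The agent reads all of its configuration from the environment. Since `agent.py` calls
+`load_dotenv()`, a `.env` file in the working directory also works; a minimal sketch
+(all values below are placeholders):
+
+```
+LIVEKIT_URL=wss://test-abc123de.livekit.cloud
+LIVEKIT_API_KEY=<your LiveKit API key>
+LIVEKIT_API_SECRET=<your LiveKit API secret>
+HIVE_API_KEY=<your Hive API key>
+```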
diff --git a/examples/hive-moderation-agent/agent.py b/examples/hive-moderation-agent/agent.py
new file mode 100644
index 000000000..bf0b23b07
--- /dev/null
+++ b/examples/hive-moderation-agent/agent.py
@@ -0,0 +1,163 @@
+"""
+LiveKit agent that connects to a room and performs visual moderation on the video
+of all participants using the Visual Content Moderation model from Hive
+(https://docs.thehive.ai/docs/visual-content-moderation#visual-content-moderation).
+
+The agent periodically sends a frame from the participant's video to Hive's API
+for a moderation check. If the results of that check show a confidence score
+of 0.9 or higher for any of the positive classes, it logs the result and adds a
+message to the room's chat. This can easily be extended to take additional
+actions like removing a participant or ending a livestream, etc.
+"""
+
+import asyncio
+import logging
+import os
+import time
+from io import BytesIO
+
+import aiohttp
+from dotenv import load_dotenv
+from hive_data_classes import HiveResponse, from_dict
+from livekit import agents, rtc
+from PIL import Image
+
+load_dotenv()
+
+MOD_FRAME_INTERVAL = 5.0  # check 1 frame every 5 seconds
+"""
+How often to check a frame (in seconds)
+"""
+
+HIVE_HEADERS = {
+    "Authorization": f"Token {os.getenv('HIVE_API_KEY')}",
+    "accept": "application/json",
+}
+"""
+The default headers included with every request to thehive.ai
+"""
+
+CONFIDENCE_THRESHOLD = 0.9
+"""
+The threshold level for scores returned by thehive.ai. See details in this doc:
+https://docs.thehive.ai/docs/visual-content-moderation#choosing-thresholds-for-visual-moderation
+"""
+
+
+logger = logging.getLogger("hive-moderation-agent")
+logger.setLevel(logging.INFO)
+
+
+async def request_fnc(req: agents.JobRequest):
+    """
+    The request handler for the agent. We use this to set the name of the
+    agent that is displayed to users
+    """
+    # accept the job request and name the agent participant so users know what this is
+    await req.accept(
+        name="Moderator",
+        identity="hive-moderator",
+    )
+
+
+async def entrypoint(ctx: agents.JobContext):
+    """
+    The entrypoint of the agent. This is called every time the moderator
+    agent joins a room.
+    """
+
+    # connect to the room and automatically subscribe to all participants' video
+    await ctx.connect(auto_subscribe=agents.AutoSubscribe.VIDEO_ONLY)
+    chat = rtc.ChatManager(ctx.room)
+
+    @ctx.room.on("track_subscribed")
+    def on_track_subscribed(
+        track: rtc.Track,
+        _publication: rtc.TrackPublication,
+        participant: rtc.RemoteParticipant,
+    ):
+        """
+        Event handler for video tracks. We automatically subscribe to all video
+        tracks when a participant joins the room. This event is triggered
+        once we have completed subscription to that video track.
+        This creates a background task to process frames from each track
+        """
+        asyncio.create_task(process_track(participant, track))
+
+    async def process_track(participant: rtc.RemoteParticipant, track: rtc.VideoTrack):
+        """
+        This function is running in a background task once for each video track
+        (i.e., once for each participant). It handles processing a frame
+        from the video once every MOD_FRAME_INTERVAL seconds.
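+        Frames that arrive between checks are read from the stream but not processed.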
+ """ + + video_stream = rtc.VideoStream(track) + last_processed_time = 0 + async for frame in video_stream: + current_time = time.time() + if (current_time - last_processed_time) >= MOD_FRAME_INTERVAL: + last_processed_time = current_time + await check_frame(participant, frame) + + async def check_frame(participant: rtc.RemoteParticipant, frame: rtc.VideoFrame): + """ + Uses thehive.ai API to check the frame for any classifications we care about + """ + + # get the current frame and convert to png format + argb_frame = frame.frame.convert(rtc.VideoBufferType.RGBA) + image = Image.frombytes( + "RGBA", (argb_frame.width, argb_frame.height), argb_frame.data + ) + buffer = BytesIO() + image.save(buffer, format="PNG") + buffer.seek(0) # reset buffer position to beginning after writing + + data = aiohttp.FormData() + data.add_field("image", buffer, filename="image.png", content_type="image/png") + + # submit the image to Hive + logger.info("submitting image to hive") + async with aiohttp.ClientSession() as session: + async with session.post( + "https://api.thehive.ai/api/v2/task/sync", + headers=HIVE_HEADERS, + data=data, + ) as response: + response.raise_for_status() + response_dict = await response.json() + hive_response: HiveResponse = from_dict(HiveResponse, response_dict) + if ( + hive_response.code == 200 + and len(hive_response.status) > 0 + and len(hive_response.status[0].response.output) > 0 + ): + results = hive_response.status[0].response.output[0].classes + # filter to anything with a confidence score > threshold + for mod_class in results: + if mod_class.class_[0:4] == "yes_": + # TODO: should also include "general_nsfw" class + if mod_class.score >= CONFIDENCE_THRESHOLD: + class_name = mod_class.class_[4:] + message = ( + 'FOUND %s for participant "%s" (confidence score: %0.3f)' + % ( + class_name, + participant.identity, + mod_class.score, + ) + ) + logger.info(message) + await chat.send_message(message) + + await ctx.wait_for_participant() + await chat.send_message( + "I'm a moderation agent," + "I will detect and notify you of all inappropriate material in your video stream" + ) + + +if __name__ == "__main__": + logging.basicConfig(level=logging.INFO) + + agents.cli.run_app(agents.WorkerOptions(entrypoint, request_fnc=request_fnc)) diff --git a/examples/hive-moderation-agent/hive_data_classes.py b/examples/hive-moderation-agent/hive_data_classes.py new file mode 100644 index 000000000..a1773435d --- /dev/null +++ b/examples/hive-moderation-agent/hive_data_classes.py @@ -0,0 +1,95 @@ +from dataclasses import dataclass, is_dataclass +from typing import List, get_type_hints + + +def from_dict(cls, data): + if is_dataclass(cls) and isinstance(data, dict): + # Get type hints for all fields in the dataclass + field_types = get_type_hints(cls) + # Special handling for reserved words like 'class' + reserved_word_mappings = {"class": "class_"} # Map 'class' to 'class_' + processed_data = {} + for key, value in data.items(): + # Check if the key is a reserved word and map it accordingly + field_name = reserved_word_mappings.get(key, key) + # Only include keys that have corresponding fields in the dataclass + if field_name in field_types: + field_type = field_types[field_name] + # Determine if the field_type is itself a dataclass + if is_dataclass(field_type): + processed_value = from_dict(field_type, value) + elif hasattr(field_type, "__origin__") and issubclass( + field_type.__origin__, List + ): + # Handle List fields, assuming all elements are of the same type + item_type = 
+                    processed_value = [from_dict(item_type, item) for item in value]
+                else:
+                    processed_value = value
+                processed_data[field_name] = processed_value
+        return cls(**processed_data)
+    elif isinstance(data, list):
+        # This assumes that the function was called with a list type as `cls`,
+        # which might not work as expected without context on the list's element type.
+        # A better approach might be needed for handling lists of dataclasses.
+        return [
+            from_dict(cls.__args__[0], item) if hasattr(cls, "__args__") else item
+            for item in data
+        ]
+    else:
+        return data
+
+
+@dataclass
+class Status:
+    code: str
+    message: str
+
+
+@dataclass
+class ModInput:
+    id: str
+    charge: float
+    config_tag: str
+    config_version: float
+    created_on: str
+    model: str
+    model_type: str
+    model_version: float
+    project_id: int
+    user_id: int
+
+
+@dataclass
+class ModClass:
+    class_: str
+    score: float
+
+
+@dataclass
+class ModOutput:
+    time: int
+    classes: List[ModClass]
+
+
+@dataclass
+class Response:
+    input: ModInput
+    output: List[ModOutput]
+
+
+@dataclass
+class ModResponse:
+    status: Status
+    response: Response
+
+
+@dataclass
+class HiveResponse:
+    id: str
+    code: int
+    project_id: int
+    user_id: int
+    created_on: str
+    status: List[ModResponse]
+    from_cache: bool
diff --git a/examples/hive-moderation-agent/requirements.txt b/examples/hive-moderation-agent/requirements.txt
new file mode 100644
index 000000000..517a8283f
--- /dev/null
+++ b/examples/hive-moderation-agent/requirements.txt
@@ -0,0 +1,5 @@
+livekit
+livekit-agents
+python-dotenv
+Pillow
+aiohttp
\ No newline at end of file
From a76c21becb45305bfdd94c8347369091d40abe21 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Th=C3=A9o=20Monnom?=
Date: Mon, 23 Dec 2024 00:58:43 +0100
Subject: [PATCH 22/46] fix unknown `metadata` & `store` fields on OpenAI-like API (#1276)

---
 .changeset/lazy-dragons-give.md                          | 5 +++++
 .../livekit-plugins-openai/livekit/plugins/openai/llm.py | 9 +++++++--
 2 files changed, 12 insertions(+), 2 deletions(-)
 create mode 100644 .changeset/lazy-dragons-give.md

diff --git a/.changeset/lazy-dragons-give.md b/.changeset/lazy-dragons-give.md
new file mode 100644
index 000000000..6eb6a3db5
--- /dev/null
+++ b/.changeset/lazy-dragons-give.md
@@ -0,0 +1,5 @@
+---
+"livekit-plugins-openai": patch
+---
+
+fix unknown `metadata` & `store` fields on OpenAI-like API
diff --git a/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/llm.py b/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/llm.py
index bcff2cfa9..6f7cbccb1 100644
--- a/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/llm.py
+++ b/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/llm.py
@@ -730,6 +730,13 @@ async def _run(self) -> None:
         else:
             opts["tool_choice"] = self._tool_choice

+        if self._llm._opts.metadata is not None:
+            # some OpenAI-like APIs don't support having a `metadata` field.
(Even None) + opts["metadata"] = self._llm._opts.metadata + + if self._llm._opts.store is not None: + opts["store"] = self._llm._opts.store + user = self._user or openai.NOT_GIVEN messages = _build_oai_context(self._chat_ctx, id(self)) stream = await self._client.chat.completions.create( @@ -740,8 +747,6 @@ async def _run(self) -> None: stream_options={"include_usage": True}, stream=True, user=user, - store=self._llm._opts.store, - metadata=self._llm._opts.metadata, **opts, ) From 0f685455411b093140294b97e1f9dc153a3e9baa Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Sun, 22 Dec 2024 18:33:51 -0600 Subject: [PATCH 23/46] Version Packages (#1273) Co-authored-by: github-actions[bot] --- .changeset/curvy-knives-promise.md | 5 ---- .changeset/lazy-dragons-give.md | 5 ---- .changeset/pre.json | 25 ------------------- .../participant-entrypoint/requirements.txt | 2 +- examples/simple-color/requirements.txt | 2 +- examples/speech-to-text/requirements.txt | 2 +- examples/text-to-speech/requirements.txt | 4 +-- .../voice-pipeline-agent/requirements.txt | 2 +- livekit-agents/CHANGELOG.md | 6 +++++ livekit-agents/livekit/agents/version.py | 2 +- livekit-agents/package.json | 2 +- .../livekit-plugins-openai/CHANGELOG.md | 6 +++++ .../livekit/plugins/openai/version.py | 2 +- .../livekit-plugins-openai/package.json | 2 +- 14 files changed, 22 insertions(+), 45 deletions(-) delete mode 100644 .changeset/curvy-knives-promise.md delete mode 100644 .changeset/lazy-dragons-give.md delete mode 100644 .changeset/pre.json diff --git a/.changeset/curvy-knives-promise.md b/.changeset/curvy-knives-promise.md deleted file mode 100644 index a4d79c4ca..000000000 --- a/.changeset/curvy-knives-promise.md +++ /dev/null @@ -1,5 +0,0 @@ ---- -"livekit-agents": patch ---- - -avoid duplicated chat ctx for function calls with messages diff --git a/.changeset/lazy-dragons-give.md b/.changeset/lazy-dragons-give.md deleted file mode 100644 index 6eb6a3db5..000000000 --- a/.changeset/lazy-dragons-give.md +++ /dev/null @@ -1,5 +0,0 @@ ---- -"livekit-plugins-openai": patch ---- - -fix unknown `metadata` & `store` fields on OpenAI-like API diff --git a/.changeset/pre.json b/.changeset/pre.json deleted file mode 100644 index c3a216b74..000000000 --- a/.changeset/pre.json +++ /dev/null @@ -1,25 +0,0 @@ -{ - "mode": "exit", - "tag": "dev", - "initialVersions": { - "livekit-agents": "0.12.3", - "livekit-plugins-anthropic": "0.2.8", - "livekit-plugins-assemblyai": "0.2.1", - "livekit-plugins-azure": "0.5.1", - "livekit-plugins-browser": "0.0.5", - "livekit-plugins-cartesia": "0.4.5", - "livekit-plugins-deepgram": "0.6.15", - "livekit-plugins-elevenlabs": "0.7.9", - "livekit-plugins-fal": "0.2.2", - "livekit-plugins-google": "0.8.1", - "livekit-plugins-llama-index": "0.2.2", - "livekit-plugins-minimal": "0.2.1", - "livekit-plugins-nltk": "0.7.3", - "livekit-plugins-openai": "0.10.11", - "livekit-plugins-playht": "1.0.3", - "livekit-plugins-rag": "0.2.3", - "livekit-plugins-silero": "0.7.4", - "livekit-plugins-turn-detector": "0.3.3" - }, - "changesets": [] -} diff --git a/examples/participant-entrypoint/requirements.txt b/examples/participant-entrypoint/requirements.txt index 53a52b16a..5e6395561 100644 --- a/examples/participant-entrypoint/requirements.txt +++ b/examples/participant-entrypoint/requirements.txt @@ -1,2 +1,2 @@ -livekit-agents>=0.12.3 +livekit-agents>=0.12.4 python-dotenv~=1.0 diff --git a/examples/simple-color/requirements.txt 
b/examples/simple-color/requirements.txt index 53a52b16a..5e6395561 100644 --- a/examples/simple-color/requirements.txt +++ b/examples/simple-color/requirements.txt @@ -1,2 +1,2 @@ -livekit-agents>=0.12.3 +livekit-agents>=0.12.4 python-dotenv~=1.0 diff --git a/examples/speech-to-text/requirements.txt b/examples/speech-to-text/requirements.txt index 53ee39eb8..0a18a4bb6 100644 --- a/examples/speech-to-text/requirements.txt +++ b/examples/speech-to-text/requirements.txt @@ -1,3 +1,3 @@ -livekit-agents>=0.12.3 +livekit-agents>=0.12.4 livekit-plugins-deepgram>=0.6.15 python-dotenv~=1.0 diff --git a/examples/text-to-speech/requirements.txt b/examples/text-to-speech/requirements.txt index e5e0d8ddd..6a534b331 100644 --- a/examples/text-to-speech/requirements.txt +++ b/examples/text-to-speech/requirements.txt @@ -1,5 +1,5 @@ -livekit-agents>=0.12.3 -livekit-plugins-openai>=0.10.11 +livekit-agents>=0.12.4 +livekit-plugins-openai>=0.10.12 livekit-plugins-cartesia>=0.4.5 livekit-plugins-elevenlabs>=0.7.9 python-dotenv~=1.0 diff --git a/examples/voice-pipeline-agent/requirements.txt b/examples/voice-pipeline-agent/requirements.txt index c8942df19..77975fb53 100644 --- a/examples/voice-pipeline-agent/requirements.txt +++ b/examples/voice-pipeline-agent/requirements.txt @@ -1,4 +1,4 @@ -livekit-agents>=0.12.3 +livekit-agents>=0.12.4 livekit-plugins-deepgram>=0.6.15 livekit-plugins-google>=0.8.1 livekit-plugins-openai[vertex]>=0.10.10 diff --git a/livekit-agents/CHANGELOG.md b/livekit-agents/CHANGELOG.md index 83a2959c1..5bd84faf9 100644 --- a/livekit-agents/CHANGELOG.md +++ b/livekit-agents/CHANGELOG.md @@ -1,5 +1,11 @@ # livekit-agents +## 0.12.4 + +### Patch Changes + +- avoid duplicated chat ctx for function calls with messages - [#1254](https://github.com/livekit/agents/pull/1254) ([@longcw](https://github.com/longcw)) + ## 0.12.3 ### Patch Changes diff --git a/livekit-agents/livekit/agents/version.py b/livekit-agents/livekit/agents/version.py index 55829dea7..ee001ea03 100644 --- a/livekit-agents/livekit/agents/version.py +++ b/livekit-agents/livekit/agents/version.py @@ -12,4 +12,4 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-__version__ = "0.12.3" +__version__ = "0.12.4" diff --git a/livekit-agents/package.json b/livekit-agents/package.json index c23feb751..212896f2b 100644 --- a/livekit-agents/package.json +++ b/livekit-agents/package.json @@ -1,5 +1,5 @@ { "name": "livekit-agents", "private": true, - "version": "0.12.3" + "version": "0.12.4" } diff --git a/livekit-plugins/livekit-plugins-openai/CHANGELOG.md b/livekit-plugins/livekit-plugins-openai/CHANGELOG.md index d9f42cc0c..02ff2f06f 100644 --- a/livekit-plugins/livekit-plugins-openai/CHANGELOG.md +++ b/livekit-plugins/livekit-plugins-openai/CHANGELOG.md @@ -1,5 +1,11 @@ # livekit-plugins-openai +## 0.10.12 + +### Patch Changes + +- fix unknown `metadata` & `store` fields on OpenAI-like API - [#1276](https://github.com/livekit/agents/pull/1276) ([@theomonnom](https://github.com/theomonnom)) + ## 0.10.11 ### Patch Changes diff --git a/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/version.py b/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/version.py index 613650a21..16e535380 100644 --- a/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/version.py +++ b/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/version.py @@ -12,4 +12,4 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "0.10.11" +__version__ = "0.10.12" diff --git a/livekit-plugins/livekit-plugins-openai/package.json b/livekit-plugins/livekit-plugins-openai/package.json index a5087740b..bfe2370d0 100644 --- a/livekit-plugins/livekit-plugins-openai/package.json +++ b/livekit-plugins/livekit-plugins-openai/package.json @@ -1,5 +1,5 @@ { "name": "livekit-plugins-openai", "private": true, - "version": "0.10.11" + "version": "0.10.12" } From 4b7230330cbad47f31efee562bd64438225be405 Mon Sep 17 00:00:00 2001 From: Long Chen Date: Mon, 23 Dec 2024 10:53:33 +0800 Subject: [PATCH 24/46] fix: check fnc_text_message_id it not None (#1271) --- .../livekit/agents/pipeline/pipeline_agent.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/livekit-agents/livekit/agents/pipeline/pipeline_agent.py b/livekit-agents/livekit/agents/pipeline/pipeline_agent.py index 5493583b7..65d7e83f8 100644 --- a/livekit-agents/livekit/agents/pipeline/pipeline_agent.py +++ b/livekit-agents/livekit/agents/pipeline/pipeline_agent.py @@ -808,13 +808,15 @@ def _commit_user_question_if_needed() -> None: and (not user_question or speech_handle.user_committed) ): if speech_handle.extra_tools_messages: - msgs = self._chat_ctx.messages - if msgs and msgs[-1].id == speech_handle.fnc_text_message_id: - # remove text message alongside function calls if it's the last in the ctx - msgs.pop() - elif speech_handle.extra_tools_messages[0].tool_calls: - # remove the content of the tool call message - speech_handle.extra_tools_messages[0].content = "" + if speech_handle.fnc_text_message_id is not None: + # there is a message alongside the function calls + msgs = self._chat_ctx.messages + if msgs and msgs[-1].id == speech_handle.fnc_text_message_id: + # replace it with the tool call message if it's the last in the ctx + msgs.pop() + elif speech_handle.extra_tools_messages[0].tool_calls: + # remove the content of the tool call message + speech_handle.extra_tools_messages[0].content = "" self._chat_ctx.messages.extend(speech_handle.extra_tools_messages) if interrupted: From f0175c4e15091db931b2a926d6f607dbf3994246 Mon Sep 17 00:00:00 2001 From: Long Chen Date: Mon, 23 Dec 2024 10:54:22 +0800 
Subject: [PATCH 25/46] fix: set USE_DOCSTRING as default for ai_callable (#1266) --- .changeset/nasty-rings-wave.md | 5 ++++ .../livekit/agents/llm/function_context.py | 14 +++++------ tests/test_create_func.py | 23 +++++++++++++++++-- 3 files changed, 32 insertions(+), 10 deletions(-) create mode 100644 .changeset/nasty-rings-wave.md diff --git a/.changeset/nasty-rings-wave.md b/.changeset/nasty-rings-wave.md new file mode 100644 index 000000000..cbbcb7979 --- /dev/null +++ b/.changeset/nasty-rings-wave.md @@ -0,0 +1,5 @@ +--- +"livekit-agents": patch +--- + +set USE_DOCSTRING as default for ai_callable diff --git a/livekit-agents/livekit/agents/llm/function_context.py b/livekit-agents/livekit/agents/llm/function_context.py index 4470492fe..59604fc8d 100644 --- a/livekit-agents/livekit/agents/llm/function_context.py +++ b/livekit-agents/livekit/agents/llm/function_context.py @@ -105,7 +105,7 @@ class CalledFunction: def ai_callable( *, name: str | None = None, - description: str | _UseDocMarker | None = None, + description: str | _UseDocMarker = USE_DOCSTRING, auto_retry: bool = False, ) -> Callable: def deco(f): @@ -127,7 +127,7 @@ def ai_callable( self, *, name: str | None = None, - description: str | _UseDocMarker | None = None, + description: str | _UseDocMarker = USE_DOCSTRING, auto_retry: bool = True, ) -> Callable: def deco(f): @@ -243,19 +243,17 @@ def _extract_types(annotation: type) -> tuple[type, TypeInfo | None]: def _set_metadata( f: Callable, name: str | None = None, - desc: str | _UseDocMarker | None = None, + desc: str | _UseDocMarker = USE_DOCSTRING, auto_retry: bool = False, ) -> None: - if desc is None: - desc = "" - if isinstance(desc, _UseDocMarker): - desc = inspect.getdoc(f) - if desc is None: + docstring = inspect.getdoc(f) + if docstring is None: raise ValueError( f"missing docstring for function {f.__name__}, " "use explicit description or provide docstring" ) + desc = docstring metadata = _AIFncMetadata( name=name or f.__name__, description=desc, auto_retry=auto_retry diff --git a/tests/test_create_func.py b/tests/test_create_func.py index 97583fb36..a81d31d93 100644 --- a/tests/test_create_func.py +++ b/tests/test_create_func.py @@ -43,11 +43,15 @@ def test_fn( def test_func_duplicate(): class TestFunctionContext(llm.FunctionContext): - @llm.ai_callable(name="duplicate_function") + @llm.ai_callable( + name="duplicate_function", description="A simple test function" + ) def fn1(self): pass - @llm.ai_callable(name="duplicate_function") + @llm.ai_callable( + name="duplicate_function", description="A simple test function" + ) def fn2(self): pass @@ -57,6 +61,21 @@ def fn2(self): TestFunctionContext() +def test_func_with_docstring(): + class TestFunctionContext(llm.FunctionContext): + @llm.ai_callable() + def test_fn(self): + """A simple test function""" + pass + + fnc_ctx = TestFunctionContext() + assert ( + "test_fn" in fnc_ctx.ai_functions + ), "Function should be registered in ai_functions" + + assert fnc_ctx.ai_functions["test_fn"].description == "A simple test function" + + def test_func_with_optional_parameter(): class TestFunctionContext(llm.FunctionContext): @llm.ai_callable( From 12047cda44b9d43e06a49388ca39b1c780ff9768 Mon Sep 17 00:00:00 2001 From: David Zhao Date: Sun, 22 Dec 2024 21:55:44 -0800 Subject: [PATCH 26/46] Add jinja2 dependency to turn detector (#1277) --- .changeset/dirty-mails-reflect.md | 5 +++++ .changeset/four-rockets-accept.md | 5 +++++ .../livekit/agents/pipeline/pipeline_agent.py | 13 ++++++++----- 
.../livekit-plugins-turn-detector/setup.py | 1 + 4 files changed, 19 insertions(+), 5 deletions(-) create mode 100644 .changeset/dirty-mails-reflect.md create mode 100644 .changeset/four-rockets-accept.md diff --git a/.changeset/dirty-mails-reflect.md b/.changeset/dirty-mails-reflect.md new file mode 100644 index 000000000..34eedc25a --- /dev/null +++ b/.changeset/dirty-mails-reflect.md @@ -0,0 +1,5 @@ +--- +"livekit-agents": patch +--- + +make max_endpoint_delay configurable diff --git a/.changeset/four-rockets-accept.md b/.changeset/four-rockets-accept.md new file mode 100644 index 000000000..a200e141d --- /dev/null +++ b/.changeset/four-rockets-accept.md @@ -0,0 +1,5 @@ +--- +"livekit-plugins-turn-detector": patch +--- + +add jinja2 dependency to turn detector diff --git a/livekit-agents/livekit/agents/pipeline/pipeline_agent.py b/livekit-agents/livekit/agents/pipeline/pipeline_agent.py index 65d7e83f8..3b9f8e83b 100644 --- a/livekit-agents/livekit/agents/pipeline/pipeline_agent.py +++ b/livekit-agents/livekit/agents/pipeline/pipeline_agent.py @@ -130,6 +130,7 @@ class _ImplOptions: int_speech_duration: float int_min_words: int min_endpointing_delay: float + max_endpointing_delay: float max_nested_fnc_calls: int preemptive_synthesis: bool before_llm_cb: BeforeLLMCallback @@ -190,6 +191,7 @@ def __init__( interrupt_speech_duration: float = 0.5, interrupt_min_words: int = 0, min_endpointing_delay: float = 0.5, + max_endpointing_delay: float = 6.0, max_nested_fnc_calls: int = 1, preemptive_synthesis: bool = False, transcription: AgentTranscriptionOptions = AgentTranscriptionOptions(), @@ -247,6 +249,7 @@ def __init__( int_speech_duration=interrupt_speech_duration, int_min_words=interrupt_min_words, min_endpointing_delay=min_endpointing_delay, + max_endpointing_delay=max_endpointing_delay, max_nested_fnc_calls=max_nested_fnc_calls, preemptive_synthesis=preemptive_synthesis, transcription=transcription, @@ -293,7 +296,8 @@ def __init__( self._deferred_validation = _DeferredReplyValidation( self._validate_reply_if_possible, - self._opts.min_endpointing_delay, + min_endpointing_delay=self._opts.min_endpointing_delay, + max_endpointing_delay=self._opts.max_endpointing_delay, turn_detector=self._turn_detector, agent=self, ) @@ -1120,15 +1124,13 @@ class _DeferredReplyValidation: PUNCTUATION = ".!?" 
PUNCTUATION_REDUCE_FACTOR = 0.75 - # Long delay to use when the model thinks the user is still speaking - UNLIKELY_ENDPOINT_DELAY = 6 - FINAL_TRANSCRIPT_TIMEOUT = 5 def __init__( self, validate_fnc: Callable[[], None], min_endpointing_delay: float, + max_endpointing_delay: float, turn_detector: _TurnDetector | None, agent: VoicePipelineAgent, ) -> None: @@ -1144,6 +1146,7 @@ def __init__( self._agent = agent self._end_of_speech_delay = min_endpointing_delay + self._max_endpointing_delay = max_endpointing_delay @property def validating(self) -> bool: @@ -1237,7 +1240,7 @@ async def _run_task(chat_ctx: ChatContext, delay: float) -> None: unlikely_threshold = self._turn_detector.unlikely_threshold() elasped = time.perf_counter() - start_time if eot_prob < unlikely_threshold: - delay = self.UNLIKELY_ENDPOINT_DELAY + delay = self._max_endpointing_delay delay = max(0, delay - elasped) await asyncio.sleep(delay) diff --git a/livekit-plugins/livekit-plugins-turn-detector/setup.py b/livekit-plugins/livekit-plugins-turn-detector/setup.py index f53e82135..1585ed0cf 100644 --- a/livekit-plugins/livekit-plugins-turn-detector/setup.py +++ b/livekit-plugins/livekit-plugins-turn-detector/setup.py @@ -54,6 +54,7 @@ "transformers>=4.47.1", "numpy>=1.26", "onnxruntime>=1.18", + "jinja2", ], package_data={"livekit.plugins.turn_detector": ["py.typed"]}, project_urls={ From 8afc3955e9e1158199bcd3eafc29c5d350ca65db Mon Sep 17 00:00:00 2001 From: Tina Nguyen <72938484+tinalenguyen@users.noreply.github.com> Date: Mon, 23 Dec 2024 01:35:02 -0500 Subject: [PATCH 27/46] added ConversationPersistor() to document events/transcriptions in external file (#1209) --- examples/conversation_persistor.py | 213 +++++++++++++++++++++++++++++ 1 file changed, 213 insertions(+) create mode 100644 examples/conversation_persistor.py diff --git a/examples/conversation_persistor.py b/examples/conversation_persistor.py new file mode 100644 index 000000000..0d9909b63 --- /dev/null +++ b/examples/conversation_persistor.py @@ -0,0 +1,213 @@ +import asyncio +import logging +from dataclasses import dataclass +from datetime import datetime +from typing import Union + +import aiofiles +from dotenv import load_dotenv +from livekit.agents import ( + AutoSubscribe, + JobContext, + WorkerOptions, + cli, + multimodal, + utils, +) +from livekit.agents.llm import ChatMessage +from livekit.agents.multimodal.multimodal_agent import EventTypes +from livekit.plugins import openai + + +@dataclass +class EventLog: + eventname: str | None + """name of recorded event""" + time: str = datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f")[:-3] + """time the event is recorded""" + + +@dataclass +class TranscriptionLog: + role: str | None + """role of the speaker""" + transcription: str | None + """transcription of speech""" + time: str = datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f")[:-3] + """time the event is recorded""" + + +class ConversationPersistor(utils.EventEmitter[EventTypes]): + def __init__( + self, + *, + model: multimodal.MultimodalAgent | None, + log: str | None, + transcriptions_only: bool = False, + ): + """ + Initializes a ConversationPersistor instance which records the events and transcriptions of a MultimodalAgent. 
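+        Logged events are queued and appended to the file asynchronously by a background task.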
+
+        Args:
+            model (multimodal.MultimodalAgent): an instance of a MultimodalAgent
+            log (str): name of the external file to record events in
+            transcriptions_only (bool): a boolean variable to determine if only transcriptions will be recorded, False by default
+            user_transcriptions (list): list of user transcriptions
+            agent_transcriptions (list): list of agent transcriptions
+            events (list): list of all events
+            log_q (asyncio.Queue): a queue of EventLog and TranscriptionLog
+
+        """
+        super().__init__()
+
+        self._model = model
+        self._log = log
+        self._transcriptions_only = transcriptions_only
+
+        self._user_transcriptions = []
+        self._agent_transcriptions = []
+        self._events = []
+
+        self._log_q = asyncio.Queue[Union[EventLog, TranscriptionLog, None]]()
+
+    @property
+    def log(self) -> str | None:
+        return self._log
+
+    @property
+    def model(self) -> multimodal.MultimodalAgent | None:
+        return self._model
+
+    @property
+    def user_transcriptions(self) -> list:
+        return self._user_transcriptions
+
+    @property
+    def agent_transcriptions(self) -> list:
+        return self._agent_transcriptions
+
+    @property
+    def events(self) -> list:
+        return self._events
+
+    @log.setter
+    def log(self, newlog: str | None) -> None:
+        self._log = newlog
+
+    async def _main_atask(self) -> None:
+        # Writes to file asynchronously
+        while True:
+            log = await self._log_q.get()
+
+            if log is None:
+                break
+
+            async with aiofiles.open(self._log, "a") as file:
+                if type(log) is EventLog and not self._transcriptions_only:
+                    self._events.append(log)
+                    await file.write("\n" + log.time + " " + log.eventname)
+
+                if type(log) is TranscriptionLog:
+                    if log.role == "user":
+                        self._user_transcriptions.append(log)
+                    else:
+                        self._agent_transcriptions.append(log)
+
+                    await file.write(
+                        "\n" + log.time + " " + log.role + " " + log.transcription
+                    )
+
+    async def aclose(self) -> None:
+        # Exits
+        self._log_q.put_nowait(None)
+        await self._main_task
+
+    def start(self) -> None:
+        # Listens for emitted MultimodalAgent events
+        self._main_task = asyncio.create_task(self._main_atask())
+
+        @self._model.on("user_started_speaking")
+        def _user_started_speaking():
+            event = EventLog(eventname="user_started_speaking")
+            self._log_q.put_nowait(event)
+
+        @self._model.on("user_stopped_speaking")
+        def _user_stopped_speaking():
+            event = EventLog(eventname="user_stopped_speaking")
+            self._log_q.put_nowait(event)
+
+        @self._model.on("agent_started_speaking")
+        def _agent_started_speaking():
+            event = EventLog(eventname="agent_started_speaking")
+            self._log_q.put_nowait(event)
+
+        @self._model.on("agent_stopped_speaking")
+        def _agent_stopped_speaking():
+            transcription = TranscriptionLog(
+                role="agent",
+                transcription=(self._model._playing_handle._tr_fwd.played_text)[1:],
+            )
+            self._log_q.put_nowait(transcription)
+
+            event = EventLog(eventname="agent_stopped_speaking")
+            self._log_q.put_nowait(event)
+
+        @self._model.on("user_speech_committed")
+        def _user_speech_committed(user_msg: ChatMessage):
+            transcription = TranscriptionLog(
+                role="user", transcription=user_msg.content
+            )
+            self._log_q.put_nowait(transcription)
+
+            event = EventLog(eventname="user_speech_committed")
+            self._log_q.put_nowait(event)
+
+        @self._model.on("agent_speech_committed")
+        def _agent_speech_committed():
+            event = EventLog(eventname="agent_speech_committed")
+            self._log_q.put_nowait(event)
+
+        @self._model.on("agent_speech_interrupted")
+        def _agent_speech_interrupted():
+            event = EventLog(eventname="agent_speech_interrupted")
+            self._log_q.put_nowait(event)
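+
+        # the two handlers below record the MultimodalAgent's function-calling
+        # activity: "function_calls_collected" fires when the model gathers tool
+        # calls, "function_calls_finished" once they have completed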
@self._model.on("function_calls_collected") + def _function_calls_collected(): + event = EventLog(eventname="function_calls_collected") + self._log_q.put_nowait(event) + + @self._model.on("function_calls_finished") + def _function_calls_finished(): + event = EventLog(eventname="function_calls_finished") + self._log_q.put_nowait(event) + + +load_dotenv() + +logger = logging.getLogger("my-worker") +logger.setLevel(logging.INFO) + + +async def entrypoint(ctx: JobContext): + agent = multimodal.MultimodalAgent( + model=openai.realtime.RealtimeModel( + voice="alloy", + temperature=0.8, + instructions="You are a helpful assistant.", + turn_detection=openai.realtime.ServerVadOptions( + threshold=0.6, prefix_padding_ms=200, silence_duration_ms=500 + ), + ), + ) + + cp = ConversationPersistor(model=agent, log="log.txt") + cp.start() + + await ctx.connect(auto_subscribe=AutoSubscribe.AUDIO_ONLY) + participant = await ctx.wait_for_participant() + agent.start(ctx.room, participant) + + +if __name__ == "__main__": + cli.run_app(WorkerOptions(entrypoint_fnc=entrypoint)) From 797253de7cff9ffae223b7d38006698cee17f51d Mon Sep 17 00:00:00 2001 From: Juan Mugica Gonzalez <47819159+jmugicagonz@users.noreply.github.com> Date: Mon, 23 Dec 2024 07:36:47 +0100 Subject: [PATCH 28/46] Substitute google error for warning (#1280) --- .../livekit-plugins-openai/livekit/plugins/openai/llm.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/llm.py b/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/llm.py index 6f7cbccb1..8e3dda787 100644 --- a/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/llm.py +++ b/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/llm.py @@ -220,8 +220,8 @@ def with_vertex( location = location _gac = os.environ.get("GOOGLE_APPLICATION_CREDENTIALS") if _gac is None: - raise ValueError( - "`GOOGLE_APPLICATION_CREDENTIALS` environment variable is not set. please set it to the path of the service account key file." + logger.warning( + "`GOOGLE_APPLICATION_CREDENTIALS` environment variable is not set. please set it to the path of the service account key file. Otherwise, use any of the other Google Cloud auth methods." 
) try: From 42b0e683e08fbc6624b38dfab2cb947357f4638c Mon Sep 17 00:00:00 2001 From: David Zhao Date: Mon, 23 Dec 2024 00:19:41 -0800 Subject: [PATCH 29/46] fix: do not log process warning when process not found (#1281) --- .changeset/six-wasps-pay.md | 5 +++++ livekit-agents/livekit/agents/ipc/supervised_proc.py | 5 +++++ 2 files changed, 10 insertions(+) create mode 100644 .changeset/six-wasps-pay.md diff --git a/.changeset/six-wasps-pay.md b/.changeset/six-wasps-pay.md new file mode 100644 index 000000000..03ccb0dd8 --- /dev/null +++ b/.changeset/six-wasps-pay.md @@ -0,0 +1,5 @@ +--- +"livekit-agents": patch +--- + +fix: do not log process warning when process not found diff --git a/livekit-agents/livekit/agents/ipc/supervised_proc.py b/livekit-agents/livekit/agents/ipc/supervised_proc.py index e93f46a9e..e56119876 100644 --- a/livekit-agents/livekit/agents/ipc/supervised_proc.py +++ b/livekit-agents/livekit/agents/ipc/supervised_proc.py @@ -378,11 +378,16 @@ async def _memory_monitor_task(self) -> None: ) except (psutil.NoSuchProcess, psutil.AccessDenied) as e: + if self._closing or self._kill_sent: + return + logger.warning( "Failed to get memory info for process", extra=self.logging_extra(), exc_info=e, ) + # don't bother rechecking if we cannot get process info + return except Exception: if self._closing or self._kill_sent: return From c7881f3776faa2dc4cea0bda4fd832173c00ac17 Mon Sep 17 00:00:00 2001 From: Juan Mugica Gonzalez <47819159+jmugicagonz@users.noreply.github.com> Date: Mon, 23 Dec 2024 09:21:17 +0100 Subject: [PATCH 30/46] fix context when functions have been called (#1279) Co-authored-by: David Zhao --- .changeset/witty-fishes-stare.md | 5 +++++ livekit-agents/livekit/agents/pipeline/pipeline_agent.py | 5 +++++ 2 files changed, 10 insertions(+) create mode 100644 .changeset/witty-fishes-stare.md diff --git a/.changeset/witty-fishes-stare.md b/.changeset/witty-fishes-stare.md new file mode 100644 index 000000000..4f82113d7 --- /dev/null +++ b/.changeset/witty-fishes-stare.md @@ -0,0 +1,5 @@ +--- +"livekit-agents": patch +--- + +fix context when functions have been called diff --git a/livekit-agents/livekit/agents/pipeline/pipeline_agent.py b/livekit-agents/livekit/agents/pipeline/pipeline_agent.py index 3b9f8e83b..7b5c28e79 100644 --- a/livekit-agents/livekit/agents/pipeline/pipeline_agent.py +++ b/livekit-agents/livekit/agents/pipeline/pipeline_agent.py @@ -702,6 +702,11 @@ async def _synthesize_answer_task( not playing_speech.user_question or playing_speech.user_committed ) and not playing_speech.speech_committed: # the speech is playing but not committed yet, add it to the chat context for this new reply synthesis + # First add the previous function call message if any + if playing_speech.extra_tools_messages: + copied_ctx.messages.extend(playing_speech.extra_tools_messages) + + # Then add the previous assistant message copied_ctx.messages.append( ChatMessage.create( text=playing_speech.synthesis_handle.tts_forwarder.played_text, From 799a53d519cf0ab79e7ad00751d2aa69f7a311a4 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Mon, 23 Dec 2024 12:58:11 -0600 Subject: [PATCH 31/46] Version Packages (#1278) Co-authored-by: github-actions[bot] --- .changeset/dirty-mails-reflect.md | 5 ----- .changeset/four-rockets-accept.md | 5 ----- .changeset/nasty-rings-wave.md | 5 ----- .changeset/six-wasps-pay.md | 5 ----- .changeset/witty-fishes-stare.md | 5 ----- examples/participant-entrypoint/requirements.txt | 2 +- 
examples/simple-color/requirements.txt | 2 +- examples/speech-to-text/requirements.txt | 2 +- examples/text-to-speech/requirements.txt | 2 +- examples/voice-pipeline-agent/requirements.txt | 2 +- livekit-agents/CHANGELOG.md | 12 ++++++++++++ livekit-agents/livekit/agents/version.py | 2 +- livekit-agents/package.json | 2 +- .../livekit-plugins-turn-detector/CHANGELOG.md | 6 ++++++ .../livekit/plugins/turn_detector/version.py | 2 +- .../livekit-plugins-turn-detector/package.json | 2 +- 16 files changed, 27 insertions(+), 34 deletions(-) delete mode 100644 .changeset/dirty-mails-reflect.md delete mode 100644 .changeset/four-rockets-accept.md delete mode 100644 .changeset/nasty-rings-wave.md delete mode 100644 .changeset/six-wasps-pay.md delete mode 100644 .changeset/witty-fishes-stare.md diff --git a/.changeset/dirty-mails-reflect.md b/.changeset/dirty-mails-reflect.md deleted file mode 100644 index 34eedc25a..000000000 --- a/.changeset/dirty-mails-reflect.md +++ /dev/null @@ -1,5 +0,0 @@ ---- -"livekit-agents": patch ---- - -make max_endpoint_delay configurable diff --git a/.changeset/four-rockets-accept.md b/.changeset/four-rockets-accept.md deleted file mode 100644 index a200e141d..000000000 --- a/.changeset/four-rockets-accept.md +++ /dev/null @@ -1,5 +0,0 @@ ---- -"livekit-plugins-turn-detector": patch ---- - -add jinja2 dependency to turn detector diff --git a/.changeset/nasty-rings-wave.md b/.changeset/nasty-rings-wave.md deleted file mode 100644 index cbbcb7979..000000000 --- a/.changeset/nasty-rings-wave.md +++ /dev/null @@ -1,5 +0,0 @@ ---- -"livekit-agents": patch ---- - -set USE_DOCSTRING as default for ai_callable diff --git a/.changeset/six-wasps-pay.md b/.changeset/six-wasps-pay.md deleted file mode 100644 index 03ccb0dd8..000000000 --- a/.changeset/six-wasps-pay.md +++ /dev/null @@ -1,5 +0,0 @@ ---- -"livekit-agents": patch ---- - -fix: do not log process warning when process not found diff --git a/.changeset/witty-fishes-stare.md b/.changeset/witty-fishes-stare.md deleted file mode 100644 index 4f82113d7..000000000 --- a/.changeset/witty-fishes-stare.md +++ /dev/null @@ -1,5 +0,0 @@ ---- -"livekit-agents": patch ---- - -fix context when functions have been called diff --git a/examples/participant-entrypoint/requirements.txt b/examples/participant-entrypoint/requirements.txt index 5e6395561..a92be36b8 100644 --- a/examples/participant-entrypoint/requirements.txt +++ b/examples/participant-entrypoint/requirements.txt @@ -1,2 +1,2 @@ -livekit-agents>=0.12.4 +livekit-agents>=0.12.5 python-dotenv~=1.0 diff --git a/examples/simple-color/requirements.txt b/examples/simple-color/requirements.txt index 5e6395561..a92be36b8 100644 --- a/examples/simple-color/requirements.txt +++ b/examples/simple-color/requirements.txt @@ -1,2 +1,2 @@ -livekit-agents>=0.12.4 +livekit-agents>=0.12.5 python-dotenv~=1.0 diff --git a/examples/speech-to-text/requirements.txt b/examples/speech-to-text/requirements.txt index 0a18a4bb6..e58a682b3 100644 --- a/examples/speech-to-text/requirements.txt +++ b/examples/speech-to-text/requirements.txt @@ -1,3 +1,3 @@ -livekit-agents>=0.12.4 +livekit-agents>=0.12.5 livekit-plugins-deepgram>=0.6.15 python-dotenv~=1.0 diff --git a/examples/text-to-speech/requirements.txt b/examples/text-to-speech/requirements.txt index 6a534b331..f025ab277 100644 --- a/examples/text-to-speech/requirements.txt +++ b/examples/text-to-speech/requirements.txt @@ -1,4 +1,4 @@ -livekit-agents>=0.12.4 +livekit-agents>=0.12.5 livekit-plugins-openai>=0.10.12 livekit-plugins-cartesia>=0.4.5 
livekit-plugins-elevenlabs>=0.7.9 diff --git a/examples/voice-pipeline-agent/requirements.txt b/examples/voice-pipeline-agent/requirements.txt index 77975fb53..481cb0136 100644 --- a/examples/voice-pipeline-agent/requirements.txt +++ b/examples/voice-pipeline-agent/requirements.txt @@ -1,4 +1,4 @@ -livekit-agents>=0.12.4 +livekit-agents>=0.12.5 livekit-plugins-deepgram>=0.6.15 livekit-plugins-google>=0.8.1 livekit-plugins-openai[vertex]>=0.10.10 diff --git a/livekit-agents/CHANGELOG.md b/livekit-agents/CHANGELOG.md index 5bd84faf9..b04f10f1d 100644 --- a/livekit-agents/CHANGELOG.md +++ b/livekit-agents/CHANGELOG.md @@ -1,5 +1,17 @@ # livekit-agents +## 0.12.5 + +### Patch Changes + +- make max_endpoint_delay configurable - [#1277](https://github.com/livekit/agents/pull/1277) ([@davidzhao](https://github.com/davidzhao)) + +- set USE_DOCSTRING as default for ai_callable - [#1266](https://github.com/livekit/agents/pull/1266) ([@longcw](https://github.com/longcw)) + +- fix: do not log process warning when process not found - [#1281](https://github.com/livekit/agents/pull/1281) ([@davidzhao](https://github.com/davidzhao)) + +- fix context when functions have been called - [#1279](https://github.com/livekit/agents/pull/1279) ([@jmugicagonz](https://github.com/jmugicagonz)) + ## 0.12.4 ### Patch Changes diff --git a/livekit-agents/livekit/agents/version.py b/livekit-agents/livekit/agents/version.py index ee001ea03..93e989e31 100644 --- a/livekit-agents/livekit/agents/version.py +++ b/livekit-agents/livekit/agents/version.py @@ -12,4 +12,4 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "0.12.4" +__version__ = "0.12.5" diff --git a/livekit-agents/package.json b/livekit-agents/package.json index 212896f2b..4986b2889 100644 --- a/livekit-agents/package.json +++ b/livekit-agents/package.json @@ -1,5 +1,5 @@ { "name": "livekit-agents", "private": true, - "version": "0.12.4" + "version": "0.12.5" } diff --git a/livekit-plugins/livekit-plugins-turn-detector/CHANGELOG.md b/livekit-plugins/livekit-plugins-turn-detector/CHANGELOG.md index 0bc8544b5..2d38bf347 100644 --- a/livekit-plugins/livekit-plugins-turn-detector/CHANGELOG.md +++ b/livekit-plugins/livekit-plugins-turn-detector/CHANGELOG.md @@ -1,5 +1,11 @@ # livekit-plugins-eou +## 0.3.4 + +### Patch Changes + +- add jinja2 dependency to turn detector - [#1277](https://github.com/livekit/agents/pull/1277) ([@davidzhao](https://github.com/davidzhao)) + ## 0.3.3 ### Patch Changes diff --git a/livekit-plugins/livekit-plugins-turn-detector/livekit/plugins/turn_detector/version.py b/livekit-plugins/livekit-plugins-turn-detector/livekit/plugins/turn_detector/version.py index 6b8f1ef90..bcfe9b179 100644 --- a/livekit-plugins/livekit-plugins-turn-detector/livekit/plugins/turn_detector/version.py +++ b/livekit-plugins/livekit-plugins-turn-detector/livekit/plugins/turn_detector/version.py @@ -12,4 +12,4 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-__version__ = "0.3.3" +__version__ = "0.3.4" diff --git a/livekit-plugins/livekit-plugins-turn-detector/package.json b/livekit-plugins/livekit-plugins-turn-detector/package.json index acd5d4300..82d16bb89 100644 --- a/livekit-plugins/livekit-plugins-turn-detector/package.json +++ b/livekit-plugins/livekit-plugins-turn-detector/package.json @@ -1,5 +1,5 @@ { "name": "livekit-plugins-turn-detector", "private": true, - "version": "0.3.3" + "version": "0.3.4" } From 37bbfccb0166b174c3cb399497f6b7465f97311b Mon Sep 17 00:00:00 2001 From: aoife cassidy Date: Mon, 23 Dec 2024 23:16:31 +0200 Subject: [PATCH 32/46] fix(turn-detector): fix int32/64 errors on Windows (#1285) --- .changeset/tricky-spiders-change.md | 5 +++++ .../livekit/plugins/turn_detector/eou.py | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) create mode 100644 .changeset/tricky-spiders-change.md diff --git a/.changeset/tricky-spiders-change.md b/.changeset/tricky-spiders-change.md new file mode 100644 index 000000000..a017624fc --- /dev/null +++ b/.changeset/tricky-spiders-change.md @@ -0,0 +1,5 @@ +--- +"livekit-plugins-turn-detector": patch +--- + +fix int32/64 errors on Windows diff --git a/livekit-plugins/livekit-plugins-turn-detector/livekit/plugins/turn_detector/eou.py b/livekit-plugins/livekit-plugins-turn-detector/livekit/plugins/turn_detector/eou.py index acb915ab5..8c8090946 100644 --- a/livekit-plugins/livekit-plugins-turn-detector/livekit/plugins/turn_detector/eou.py +++ b/livekit-plugins/livekit-plugins-turn-detector/livekit/plugins/turn_detector/eou.py @@ -106,7 +106,7 @@ def run(self, data: bytes) -> bytes | None: return_tensors="np", ) - input_dict = {"input_ids": inputs["input_ids"]} + input_dict = {"input_ids": np.array(inputs["input_ids"], dtype=np.int64)} # Run inference outputs = self._session.run(["logits"], input_dict) From c89960882146897049a393db0de8903bea0d54fc Mon Sep 17 00:00:00 2001 From: David Zhao Date: Tue, 24 Dec 2024 10:34:21 -0800 Subject: [PATCH 33/46] improve interruption handling, avoid agent from getting stuck (#1290) --- .changeset/silent-oranges-warn.md | 5 +++++ .../livekit/agents/pipeline/pipeline_agent.py | 21 ++++++++++++++----- 2 files changed, 21 insertions(+), 5 deletions(-) create mode 100644 .changeset/silent-oranges-warn.md diff --git a/.changeset/silent-oranges-warn.md b/.changeset/silent-oranges-warn.md new file mode 100644 index 000000000..e7bcd0189 --- /dev/null +++ b/.changeset/silent-oranges-warn.md @@ -0,0 +1,5 @@ +--- +"livekit-agents": patch +--- + +improve interruption handling, avoid agent from getting stuck diff --git a/livekit-agents/livekit/agents/pipeline/pipeline_agent.py b/livekit-agents/livekit/agents/pipeline/pipeline_agent.py index 7b5c28e79..b2a223bd0 100644 --- a/livekit-agents/livekit/agents/pipeline/pipeline_agent.py +++ b/livekit-agents/livekit/agents/pipeline/pipeline_agent.py @@ -714,6 +714,9 @@ async def _synthesize_answer_task( ) ) + # we want to add this question even if it's empty. during false positive interruptions, + # adding an empty user message gives the LLM context so it could continue from where + # it had been interrupted. 
copied_ctx.messages.append( ChatMessage.create(text=handle.user_question, role="user") ) @@ -1035,7 +1038,7 @@ async def _llm_stream_to_str_generator( def _validate_reply_if_possible(self) -> None: """Check if the new agent speech should be played""" - if self._playing_speech is not None: + if self._playing_speech and not self._playing_speech.interrupted: should_ignore_input = False if not self._playing_speech.allow_interruptions: should_ignore_input = True @@ -1049,19 +1052,24 @@ def _validate_reply_if_possible(self) -> None: "interrupt threshold is not met", extra={"speech_id": self._playing_speech.id}, ) + if should_ignore_input: self._transcribed_text = "" return if self._pending_agent_reply is None: - if self._opts.preemptive_synthesis or not self._transcribed_text: + if self._opts.preemptive_synthesis: return + # as long as we don't have a pending reply, we need to synthesize it + # in order to keep the conversation flowing. + # transcript could be empty at this moment, if the user interrupted the agent + # but did not generate any transcribed text. self._synthesize_agent_reply() assert self._pending_agent_reply is not None - # in some bad timing, we could end up with two pushed agent replies inside the speech queue. + # due to timing, we could end up with two pushed agent replies inside the speech queue. # so make sure we directly interrupt every reply when validating a new one for speech in self._speech_q: if not speech.is_reply: @@ -1072,7 +1080,10 @@ def _validate_reply_if_possible(self) -> None: logger.debug( "validated agent reply", - extra={"speech_id": self._pending_agent_reply.id}, + extra={ + "speech_id": self._pending_agent_reply.id, + "text": self._transcribed_text, + }, ) if self._last_speech_time is not None: @@ -1101,7 +1112,7 @@ def _interrupt_if_possible(self) -> None: def _should_interrupt(self) -> bool: if self._playing_speech is None: - return True + return False if ( not self._playing_speech.allow_interruptions From ee0850937e773f41f337e287d5022b51004bdbc3 Mon Sep 17 00:00:00 2001 From: Juan Mugica Gonzalez <47819159+jmugicagonz@users.noreply.github.com> Date: Tue, 24 Dec 2024 20:30:47 +0100 Subject: [PATCH 34/46] encode boost words (#1284) Co-authored-by: David Zhao --- .changeset/hot-trainers-press.md | 5 +++++ .../livekit/plugins/assemblyai/stt.py | 4 +++- 2 files changed, 8 insertions(+), 1 deletion(-) create mode 100644 .changeset/hot-trainers-press.md diff --git a/.changeset/hot-trainers-press.md b/.changeset/hot-trainers-press.md new file mode 100644 index 000000000..326150914 --- /dev/null +++ b/.changeset/hot-trainers-press.md @@ -0,0 +1,5 @@ +--- +"livekit-plugins-assemblyai": patch +--- + +assemblyai: encode boost words diff --git a/livekit-plugins/livekit-plugins-assemblyai/livekit/plugins/assemblyai/stt.py b/livekit-plugins/livekit-plugins-assemblyai/livekit/plugins/assemblyai/stt.py index acef65b6a..8fc51c774 100644 --- a/livekit-plugins/livekit-plugins-assemblyai/livekit/plugins/assemblyai/stt.py +++ b/livekit-plugins/livekit-plugins-assemblyai/livekit/plugins/assemblyai/stt.py @@ -318,7 +318,9 @@ async def recv_task(ws: aiohttp.ClientWebSocketResponse): async def _connect_ws(self) -> aiohttp.ClientWebSocketResponse: live_config = { "sample_rate": self._opts.sample_rate, - "word_boost": self._opts.word_boost, + "word_boost": json.dumps(self._opts.word_boost) + if self._opts.word_boost is not None + else None, "encoding": self._opts.encoding, "disable_partial_transcripts": self._opts.disable_partial_transcripts, "enable_extra_session_information": 
self._opts.enable_extra_session_information, From ffeee077f31f0c42f9bfd705bd8d45618d321e0c Mon Sep 17 00:00:00 2001 From: David Zhao Date: Tue, 24 Dec 2024 11:36:00 -0800 Subject: [PATCH 35/46] Ensure STT exceptions are being propagated (#1291) Co-authored-by: jayesh --- .changeset/giant-ways-invite.md | 8 ++++++++ livekit-agents/livekit/agents/_exceptions.py | 6 +++--- .../livekit/plugins/assemblyai/stt.py | 14 ++++++++++++-- .../livekit/plugins/azure/stt.py | 5 ++++- .../livekit/plugins/deepgram/stt.py | 10 +++++++++- .../livekit/plugins/google/stt.py | 5 ++++- 6 files changed, 40 insertions(+), 8 deletions(-) create mode 100644 .changeset/giant-ways-invite.md diff --git a/.changeset/giant-ways-invite.md b/.changeset/giant-ways-invite.md new file mode 100644 index 000000000..5644cb581 --- /dev/null +++ b/.changeset/giant-ways-invite.md @@ -0,0 +1,8 @@ +--- +"livekit-plugins-assemblyai": patch +"livekit-plugins-deepgram": patch +"livekit-plugins-google": patch +"livekit-plugins-azure": patch +--- + +fix: Ensure STT exceptions are being propagated diff --git a/livekit-agents/livekit/agents/_exceptions.py b/livekit-agents/livekit/agents/_exceptions.py index 128efacee..a6d987e7d 100644 --- a/livekit-agents/livekit/agents/_exceptions.py +++ b/livekit-agents/livekit/agents/_exceptions.py @@ -48,9 +48,9 @@ def __init__( self, message: str, *, - status_code: int, - request_id: str | None, - body: object | None, + status_code: int = -1, + request_id: str | None = None, + body: object | None = None, ) -> None: super().__init__(message, body=body) diff --git a/livekit-plugins/livekit-plugins-assemblyai/livekit/plugins/assemblyai/stt.py b/livekit-plugins/livekit-plugins-assemblyai/livekit/plugins/assemblyai/stt.py index 8fc51c774..40c359fd8 100644 --- a/livekit-plugins/livekit-plugins-assemblyai/livekit/plugins/assemblyai/stt.py +++ b/livekit-plugins/livekit-plugins-assemblyai/livekit/plugins/assemblyai/stt.py @@ -25,7 +25,13 @@ from urllib.parse import urlencode import aiohttp -from livekit.agents import DEFAULT_API_CONNECT_OPTIONS, APIConnectOptions, stt, utils +from livekit.agents import ( + DEFAULT_API_CONNECT_OPTIONS, + APIConnectOptions, + APIStatusError, + stt, + utils, +) from livekit.agents.stt import SpeechEvent from livekit.agents.utils import AudioBuffer @@ -274,7 +280,7 @@ async def recv_task(ws: aiohttp.ClientWebSocketResponse): if closing_ws: # close is expected, see SpeechStream.aclose return - raise Exception( + raise APIStatusError( "AssemblyAI connection closed unexpectedly", ) # this will trigger a reconnection, see the _run loop @@ -305,6 +311,10 @@ async def recv_task(ws: aiohttp.ClientWebSocketResponse): [asyncio.gather(*tasks), wait_reconnect_task], return_when=asyncio.FIRST_COMPLETED, ) # type: ignore + for task in done: + if task != wait_reconnect_task: + task.result() + if wait_reconnect_task not in done: break diff --git a/livekit-plugins/livekit-plugins-azure/livekit/plugins/azure/stt.py b/livekit-plugins/livekit-plugins-azure/livekit/plugins/azure/stt.py index 309cc9c5c..2bda776fd 100644 --- a/livekit-plugins/livekit-plugins-azure/livekit/plugins/azure/stt.py +++ b/livekit-plugins/livekit-plugins-azure/livekit/plugins/azure/stt.py @@ -199,10 +199,13 @@ async def process_input(): wait_reconnect_task = asyncio.create_task(self._reconnect_event.wait()) try: - await asyncio.wait( + done, _ = await asyncio.wait( [process_input_task, wait_reconnect_task], return_when=asyncio.FIRST_COMPLETED, ) + for task in done: + if task != wait_reconnect_task: + task.result() finally: 
await utils.aio.gracefully_cancel( process_input_task, wait_reconnect_task diff --git a/livekit-plugins/livekit-plugins-deepgram/livekit/plugins/deepgram/stt.py b/livekit-plugins/livekit-plugins-deepgram/livekit/plugins/deepgram/stt.py index 2ae6d74fe..d45966e4e 100644 --- a/livekit-plugins/livekit-plugins-deepgram/livekit/plugins/deepgram/stt.py +++ b/livekit-plugins/livekit-plugins-deepgram/livekit/plugins/deepgram/stt.py @@ -471,7 +471,9 @@ async def recv_task(ws: aiohttp.ClientWebSocketResponse): return # this will trigger a reconnection, see the _run loop - raise Exception("deepgram connection closed unexpectedly") + raise APIStatusError( + message="deepgram connection closed unexpectedly" + ) if msg.type != aiohttp.WSMsgType.TEXT: logger.warning("unexpected deepgram message type %s", msg.type) @@ -498,6 +500,12 @@ async def recv_task(ws: aiohttp.ClientWebSocketResponse): [asyncio.gather(*tasks), wait_reconnect_task], return_when=asyncio.FIRST_COMPLETED, ) # type: ignore + + # propagate exceptions from completed tasks + for task in done: + if task != wait_reconnect_task: + task.result() + if wait_reconnect_task not in done: break diff --git a/livekit-plugins/livekit-plugins-google/livekit/plugins/google/stt.py b/livekit-plugins/livekit-plugins-google/livekit/plugins/google/stt.py index de4ac6251..7fe2a527d 100644 --- a/livekit-plugins/livekit-plugins-google/livekit/plugins/google/stt.py +++ b/livekit-plugins/livekit-plugins-google/livekit/plugins/google/stt.py @@ -438,10 +438,13 @@ async def process_stream(stream): process_stream_task = asyncio.create_task(process_stream(stream)) wait_reconnect_task = asyncio.create_task(self._reconnect_event.wait()) try: - await asyncio.wait( + done, _ = await asyncio.wait( [process_stream_task, wait_reconnect_task], return_when=asyncio.FIRST_COMPLETED, ) + for task in done: + if task != wait_reconnect_task: + task.result() finally: await utils.aio.gracefully_cancel( process_stream_task, wait_reconnect_task From baae79b2ea9ec8300c9f47ea42cf830a66c2e41d Mon Sep 17 00:00:00 2001 From: Jayesh Parmar <60539217+jayeshp19@users.noreply.github.com> Date: Wed, 25 Dec 2024 01:20:54 +0530 Subject: [PATCH 36/46] Support PlayHT/PlayAI TTS (#1174) Co-authored-by: David Zhao --- .changeset/khaki-stingrays-train.md | 5 + .github/workflows/ci.yml | 1 + .github/workflows/tests.yml | 3 + .../livekit/agents/utils/codecs/mp3.py | 14 + livekit-plugins/install_local.sh | 1 + .../CHANGELOG.md | 0 .../livekit-plugins-playai/README.md | 13 + .../livekit/plugins/playai}/__init__.py | 13 +- .../livekit/plugins/playai/log.py | 5 + .../livekit/plugins/playai/models.py | 9 + .../livekit/plugins/playai/py.typed | 0 .../livekit/plugins/playai/tts.py | 296 ++++++++++++++++++ .../livekit/plugins/playai}/version.py | 0 .../livekit-plugins-playai/package.json | 5 + .../pyproject.toml | 0 .../setup.py | 12 +- .../livekit-plugins-playht/README.md | 13 - .../livekit/plugins/playht/log.py | 3 - .../livekit/plugins/playht/models.py | 20 -- .../livekit/plugins/playht/tts.py | 238 -------------- .../livekit-plugins-playht/package.json | 5 - tests/test_tts.py | 12 +- 22 files changed, 372 insertions(+), 296 deletions(-) create mode 100644 .changeset/khaki-stingrays-train.md rename livekit-plugins/{livekit-plugins-playht => livekit-plugins-playai}/CHANGELOG.md (100%) create mode 100644 livekit-plugins/livekit-plugins-playai/README.md rename livekit-plugins/{livekit-plugins-playht/livekit/plugins/playht => livekit-plugins-playai/livekit/plugins/playai}/__init__.py (58%) create mode 100644 
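Patch 35 applies one pattern across the AssemblyAI, Azure, Deepgram, and Google plugins, and it is easy to miss why it is needed: `asyncio.wait()` never raises the awaited tasks' exceptions itself, so each completed task's `.result()` must be called explicitly to re-raise them. A stripped-down sketch of that pattern, assuming a generic worker task and reconnect event:

```python
import asyncio


async def wait_and_propagate(
    work: asyncio.Task, reconnect_event: asyncio.Event
) -> bool:
    """Returns True if a reconnect was requested; re-raises errors from work."""
    wait_reconnect = asyncio.create_task(reconnect_event.wait())
    try:
        done, _ = await asyncio.wait(
            [work, wait_reconnect], return_when=asyncio.FIRST_COMPLETED
        )
        for task in done:
            if task is not wait_reconnect:
                task.result()  # no-op on success, re-raises on failure
        return wait_reconnect in done
    finally:
        wait_reconnect.cancel()
```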
livekit-plugins/livekit-plugins-playai/livekit/plugins/playai/log.py create mode 100644 livekit-plugins/livekit-plugins-playai/livekit/plugins/playai/models.py create mode 100644 livekit-plugins/livekit-plugins-playai/livekit/plugins/playai/py.typed create mode 100644 livekit-plugins/livekit-plugins-playai/livekit/plugins/playai/tts.py rename livekit-plugins/{livekit-plugins-playht/livekit/plugins/playht => livekit-plugins-playai/livekit/plugins/playai}/version.py (100%) create mode 100644 livekit-plugins/livekit-plugins-playai/package.json rename livekit-plugins/{livekit-plugins-playht => livekit-plugins-playai}/pyproject.toml (100%) rename livekit-plugins/{livekit-plugins-playht => livekit-plugins-playai}/setup.py (86%) delete mode 100644 livekit-plugins/livekit-plugins-playht/README.md delete mode 100644 livekit-plugins/livekit-plugins-playht/livekit/plugins/playht/log.py delete mode 100644 livekit-plugins/livekit-plugins-playht/livekit/plugins/playht/models.py delete mode 100644 livekit-plugins/livekit-plugins-playht/livekit/plugins/playht/tts.py delete mode 100644 livekit-plugins/livekit-plugins-playht/package.json diff --git a/.changeset/khaki-stingrays-train.md b/.changeset/khaki-stingrays-train.md new file mode 100644 index 000000000..ca99f9fa7 --- /dev/null +++ b/.changeset/khaki-stingrays-train.md @@ -0,0 +1,5 @@ +--- +"livekit-plugins-playai": patch +--- + +Support PlayAI TTS engine. diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 5f048347d..9eb72c55c 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -80,4 +80,5 @@ jobs: -p livekit.plugins.azure \ -p livekit.plugins.anthropic \ -p livekit.plugins.fal \ + -p livekit.plugins.playai \ -p livekit.plugins.assemblyai diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index d2a26cbf2..25f72cc33 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -77,6 +77,7 @@ jobs: sudo dpkg -i libssl1.1_1.1.1-1ubuntu2.1_arm64.deb sudo dpkg -i libssl-dev_1.1.1-1ubuntu2.1_arm64.deb + - name: Install ffmpeg (macOS) if: ${{ startsWith(matrix.os, 'macos') }} run: brew install ffmpeg @@ -109,6 +110,8 @@ jobs: GROQ_API_KEY: ${{ secrets.GROQ_API_KEY }} ASSEMBLYAI_API_KEY: ${{ secrets.ASSEMBLYAI_API_KEY }} FAL_KEY: ${{ secrets.FAL_KEY }} + PLAYHT_API_KEY: ${{ secrets.PLAYHT_API_KEY }} + PLAYHT_USER_ID: ${{ secrets.PLAYHT_USER_ID }} GOOGLE_APPLICATION_CREDENTIALS: google.json PYTEST_ADDOPTS: "--color=yes" working-directory: tests diff --git a/livekit-agents/livekit/agents/utils/codecs/mp3.py b/livekit-agents/livekit/agents/utils/codecs/mp3.py index 6f3b1aa45..2f2321028 100644 --- a/livekit-agents/livekit/agents/utils/codecs/mp3.py +++ b/livekit-agents/livekit/agents/utils/codecs/mp3.py @@ -39,6 +39,20 @@ def __init__(self): self._codec = av.CodecContext.create("mp3", "r") # noqa def decode_chunk(self, chunk: bytes) -> List[rtc.AudioFrame]: + # Skip ID3v2 header if present + if chunk.startswith(b"ID3"): + # ID3v2 header is 10 bytes long + # The size is encoded in the next 4 bytes (bytes 6-9) + # Each byte only uses 7 bits (most significant bit is always 0) + if len(chunk) >= 10: + size = ( + ((chunk[6] & 0x7F) << 21) + | ((chunk[7] & 0x7F) << 14) + | ((chunk[8] & 0x7F) << 7) + | (chunk[9] & 0x7F) + ) + chunk = chunk[10 + size :] + packets = self._codec.parse(chunk) result: List[rtc.AudioFrame] = [] for packet in packets: diff --git a/livekit-plugins/install_local.sh b/livekit-plugins/install_local.sh index 79ec29f0d..3e6a1cee4 100755 --- a/livekit-plugins/install_local.sh 
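The ID3v2 handling added to `Mp3StreamDecoder.decode_chunk` relies on the tag's size bytes being "synchsafe": each of the four bytes contributes only its low 7 bits, hence the 21/14/7/0 shifts. A worked example of the same computation, independent of the plugin code:

```python
def id3v2_payload(chunk: bytes) -> bytes:
    """Strips a leading ID3v2 tag, mirroring the mp3.py change."""
    if not chunk.startswith(b"ID3") or len(chunk) < 10:
        return chunk
    size = (
        ((chunk[6] & 0x7F) << 21)
        | ((chunk[7] & 0x7F) << 14)
        | ((chunk[8] & 0x7F) << 7)
        | (chunk[9] & 0x7F)
    )
    return chunk[10 + size :]


# a tag declaring a 257-byte body: 257 = (2 << 7) | 1, stored synchsafe
# as the four bytes 0x00 0x00 0x02 0x01
header = b"ID3\x04\x00\x00" + bytes([0x00, 0x00, 0x02, 0x01])
data = header + bytes(257) + b"\xff\xfbAUDIO"
assert id3v2_payload(data) == b"\xff\xfbAUDIO"
```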
+++ b/livekit-plugins/install_local.sh @@ -17,5 +17,6 @@ pip install \ "${SCRIPT_DIR}/livekit-plugins-nltk" \ "${SCRIPT_DIR}/livekit-plugins-openai" \ "${SCRIPT_DIR}/livekit-plugins-rag" \ + "${SCRIPT_DIR}/livekit-plugins-playai" \ "${SCRIPT_DIR}/livekit-plugins-silero" \ "${SCRIPT_DIR}/livekit-plugins-turn-detector" diff --git a/livekit-plugins/livekit-plugins-playht/CHANGELOG.md b/livekit-plugins/livekit-plugins-playai/CHANGELOG.md similarity index 100% rename from livekit-plugins/livekit-plugins-playht/CHANGELOG.md rename to livekit-plugins/livekit-plugins-playai/CHANGELOG.md diff --git a/livekit-plugins/livekit-plugins-playai/README.md b/livekit-plugins/livekit-plugins-playai/README.md new file mode 100644 index 000000000..5561dbe66 --- /dev/null +++ b/livekit-plugins/livekit-plugins-playai/README.md @@ -0,0 +1,13 @@ +# LiveKit Plugins PlayAI/PlayHT + +Agent Framework plugin for voice synthesis with [PlayAI](https://play.ai/) API. + +## Installation + +```bash +pip install livekit-plugins-playai +``` + +## Pre-requisites + +You'll need USER ID and API Secret KEY from PlayHT. It can be set as an environment variable: `PLAYHT_USER_ID`, `PLAYHT_API_KEY` get it from [here](https://play.ht/studio/api-access) diff --git a/livekit-plugins/livekit-plugins-playht/livekit/plugins/playht/__init__.py b/livekit-plugins/livekit-plugins-playai/livekit/plugins/playai/__init__.py similarity index 58% rename from livekit-plugins/livekit-plugins-playht/livekit/plugins/playht/__init__.py rename to livekit-plugins/livekit-plugins-playai/livekit/plugins/playai/__init__.py index 82229c316..033d9363e 100644 --- a/livekit-plugins/livekit-plugins-playht/livekit/plugins/playht/__init__.py +++ b/livekit-plugins/livekit-plugins-playai/livekit/plugins/playai/__init__.py @@ -1,27 +1,20 @@ -from .models import TTSEngines -from .tts import DEFAULT_VOICE, TTS, Voice +from .tts import TTS from .version import __version__ __all__ = [ "TTS", - "Voice", - "DEFAULT_VOICE", - "TTSEngines", "__version__", ] from livekit.agents import Plugin -class PlayHTPlugin(Plugin): +class PlayAIPlugin(Plugin): def __init__(self) -> None: super().__init__(__name__, __version__, __package__) - def download_files(self) -> None: - self.download_files(self) - -Plugin.register_plugin(PlayHTPlugin()) +Plugin.register_plugin(PlayAIPlugin()) # Cleanup docs of unexported modules _module = dir() diff --git a/livekit-plugins/livekit-plugins-playai/livekit/plugins/playai/log.py b/livekit-plugins/livekit-plugins-playai/livekit/plugins/playai/log.py new file mode 100644 index 000000000..decd14a99 --- /dev/null +++ b/livekit-plugins/livekit-plugins-playai/livekit/plugins/playai/log.py @@ -0,0 +1,5 @@ +import logging + +logger = logging.getLogger("livekit.plugins.playai") +# suppress verbose websocket logs +logging.getLogger("websockets.client").setLevel(logging.INFO) diff --git a/livekit-plugins/livekit-plugins-playai/livekit/plugins/playai/models.py b/livekit-plugins/livekit-plugins-playai/livekit/plugins/playai/models.py new file mode 100644 index 000000000..1dc6dfce8 --- /dev/null +++ b/livekit-plugins/livekit-plugins-playai/livekit/plugins/playai/models.py @@ -0,0 +1,9 @@ +from typing import Literal + +from pyht.client import Format # type: ignore + +TTSModel = Literal["Play3.0-mini-ws", "PlayDialog-ws", "Play3.0-mini", "PlayDialog"] +FORMAT = Literal["mp3"] +format_mapping = { + "mp3": Format.FORMAT_MP3, +} diff --git a/livekit-plugins/livekit-plugins-playai/livekit/plugins/playai/py.typed 
b/livekit-plugins/livekit-plugins-playai/livekit/plugins/playai/py.typed new file mode 100644 index 000000000..e69de29bb diff --git a/livekit-plugins/livekit-plugins-playai/livekit/plugins/playai/tts.py b/livekit-plugins/livekit-plugins-playai/livekit/plugins/playai/tts.py new file mode 100644 index 000000000..464f3f418 --- /dev/null +++ b/livekit-plugins/livekit-plugins-playai/livekit/plugins/playai/tts.py @@ -0,0 +1,296 @@ +from __future__ import annotations + +import asyncio +import os +import weakref +from dataclasses import dataclass, fields + +from livekit import rtc +from livekit.agents import ( + DEFAULT_API_CONNECT_OPTIONS, + APIConnectionError, + APIConnectOptions, + tokenize, + tts, + utils, +) +from pyht import AsyncClient as PlayHTAsyncClient # type: ignore +from pyht.client import Format, Language, TTSOptions # type: ignore + +from .log import logger +from .models import TTSModel + +NUM_CHANNELS = 1 + + +@dataclass +class _Options: + model: TTSModel | str + tts_options: TTSOptions + word_tokenizer: tokenize.WordTokenizer + + +class TTS(tts.TTS): + def __init__( + self, + *, + api_key: str | None = None, + user_id: str | None = None, + voice: str = "s3://voice-cloning-zero-shot/d9ff78ba-d016-47f6-b0ef-dd630f59414e/female-cs/manifest.json", + language: str = "english", + sample_rate: int = 24000, + model: TTSModel | str = "Play3.0-mini-ws", + word_tokenizer: tokenize.WordTokenizer = tokenize.basic.WordTokenizer( + ignore_punctuation=False + ), + **kwargs, + ) -> None: + """ + Initialize the PlayAI TTS engine. + + Args: + api_key (str): PlayAI API key. + user_id (str): PlayAI user ID. + voice (str): Voice manifest URL. + model (TTSModel): TTS model, defaults to "Play3.0-mini-ws". + language (str): language, defaults to "english". + sample_rate (int): sample rate (Hz), A number greater than or equal to 8000, and must be less than or equal to 48000 + word_tokenizer (tokenize.WordTokenizer): Tokenizer for processing text. Defaults to basic WordTokenizer. + **kwargs: Additional options. + """ + + super().__init__( + capabilities=tts.TTSCapabilities( + streaming=False, + ), + sample_rate=sample_rate, + num_channels=1, + ) + + api_key = api_key or os.environ.get("PLAYHT_API_KEY") + user_id = user_id or os.environ.get("PLAYHT_USER_ID") + + if not api_key or not user_id: + raise ValueError( + "PlayHT API key and user ID are required. Set environment variables PLAYHT_API_KEY and PLAYHT_USER_ID or pass them explicitly." + ) + _validate_kwargs(kwargs) + self._config = TTSOptions( + voice=voice, + format=Format.FORMAT_MP3, # Default format for now + sample_rate=sample_rate, + language=Language(language), + **kwargs, + ) + + self._opts = _Options( + model=model, + tts_options=self._config, + word_tokenizer=word_tokenizer, + ) + + # Initialize client + self._client = PlayHTAsyncClient( + user_id=user_id, + api_key=api_key, + ) + self._streams = weakref.WeakSet[SynthesizeStream]() + + def update_options( + self, + *, + voice: str | None = None, + model: TTSModel | str | None = None, + language: str | None = None, + **kwargs, + ) -> None: + """ + Update the TTS options. 
+ """ + updates = {} + if voice is not None: + updates["voice"] = voice + if language is not None: + updates["language"] = Language(language) + tts_kwargs = {k: v for k, v in kwargs.items()} + + self._config = _update_options(self._config, **updates, **tts_kwargs) + + if model is not None: + self._opts.model = model + + for stream in self._streams: + stream._config = _update_options(stream._config, **updates, **tts_kwargs) + if model is not None: + stream._opts.model = model + + def synthesize( + self, + text: str, + *, + conn_options: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS, + ) -> "ChunkedStream": + return ChunkedStream( + tts=self, + input_text=text, + conn_options=conn_options, + opts=self._opts, + ) + + def stream( + self, *, conn_options: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS + ) -> "SynthesizeStream": + stream = SynthesizeStream( + tts=self, + conn_options=conn_options, + opts=self._opts, + ) + self._streams.add(stream) + return stream + + +class ChunkedStream(tts.ChunkedStream): + def __init__( + self, + *, + tts: TTS, + input_text: str, + conn_options: APIConnectOptions, + opts: _Options, + ) -> None: + super().__init__(tts=tts, input_text=input_text, conn_options=conn_options) + self._client = tts._client + self._opts = opts + self._config = self._opts.tts_options + self._mp3_decoder = utils.codecs.Mp3StreamDecoder() + + async def _run(self) -> None: + request_id = utils.shortuuid() + bstream = utils.audio.AudioByteStream( + sample_rate=self._config.sample_rate, num_channels=NUM_CHANNELS + ) + + try: + async for chunk in self._client.tts( + text=self._input_text, + options=self._config, + voice_engine=self._opts.model, + streaming=True, + ): + for frame in self._mp3_decoder.decode_chunk(chunk): + for frame in bstream.write(frame.data.tobytes()): + self._event_ch.send_nowait( + tts.SynthesizedAudio( + request_id=request_id, + frame=frame, + ) + ) + for frame in bstream.flush(): + self._event_ch.send_nowait( + tts.SynthesizedAudio(request_id=request_id, frame=frame) + ) + except Exception as e: + raise APIConnectionError() from e + + +class SynthesizeStream(tts.SynthesizeStream): + def __init__( + self, + *, + tts: TTS, + conn_options: APIConnectOptions, + opts: _Options, + ): + super().__init__(tts=tts, conn_options=conn_options) + self._client = tts._client + self._opts = opts + self._config = self._opts.tts_options + self._segments_ch = utils.aio.Chan[tokenize.WordStream]() + self._mp3_decoder = utils.codecs.Mp3StreamDecoder() + + async def _run(self) -> None: + request_id = utils.shortuuid() + segment_id = utils.shortuuid() + bstream = utils.audio.AudioByteStream( + sample_rate=self._config.sample_rate, + num_channels=NUM_CHANNELS, + ) + last_frame: rtc.AudioFrame | None = None + + def _send_last_frame(*, segment_id: str, is_final: bool) -> None: + nonlocal last_frame + if last_frame is not None: + self._event_ch.send_nowait( + tts.SynthesizedAudio( + request_id=request_id, + segment_id=segment_id, + frame=last_frame, + is_final=is_final, + ) + ) + last_frame = None + + input_task = asyncio.create_task(self._tokenize_input()) + try: + text_stream = await self._create_text_stream() + async for chunk in self._client.stream_tts_input( + text_stream=text_stream, + options=self._config, + voice_engine=self._opts.model, + ): + for frame in self._mp3_decoder.decode_chunk(chunk): + for frame in bstream.write(frame.data.tobytes()): + _send_last_frame(segment_id=segment_id, is_final=False) + last_frame = frame + + for frame in bstream.flush(): + 
_send_last_frame(segment_id=segment_id, is_final=False) + last_frame = frame + _send_last_frame(segment_id=segment_id, is_final=True) + except Exception as e: + raise APIConnectionError() from e + finally: + await utils.aio.gracefully_cancel(input_task) + self._client.close() + + @utils.log_exceptions(logger=logger) + async def _tokenize_input(self): + # Converts incoming text into WordStreams and sends them into _segments_ch + word_stream = None + async for input in self._input_ch: + if isinstance(input, str): + if word_stream is None: + word_stream = self._opts.word_tokenizer.stream() + self._segments_ch.send_nowait(word_stream) + word_stream.push_text(input) + elif isinstance(input, self._FlushSentinel): + if word_stream: + word_stream.end_input() + word_stream = None + self._segments_ch.close() + + @utils.log_exceptions(logger=logger) + async def _create_text_stream(self): + async def text_stream(): + async for word_stream in self._segments_ch: + async for word in word_stream: + yield word.token + + return text_stream() + + +def _update_options(config: TTSOptions, **kwargs) -> TTSOptions: + _validate_kwargs(kwargs) + for k, v in kwargs.items(): + if v is not None: + setattr(config, k, v) + return config + + +def _validate_kwargs(kwargs: dict) -> None: + valid_keys = {field.name for field in fields(TTSOptions)} + invalid_keys = set(kwargs.keys()) - valid_keys + if invalid_keys: + raise ValueError( + f"Invalid parameters: {invalid_keys}. Allowed parameters: {valid_keys}" + ) diff --git a/livekit-plugins/livekit-plugins-playht/livekit/plugins/playht/version.py b/livekit-plugins/livekit-plugins-playai/livekit/plugins/playai/version.py similarity index 100% rename from livekit-plugins/livekit-plugins-playht/livekit/plugins/playht/version.py rename to livekit-plugins/livekit-plugins-playai/livekit/plugins/playai/version.py diff --git a/livekit-plugins/livekit-plugins-playai/package.json b/livekit-plugins/livekit-plugins-playai/package.json new file mode 100644 index 000000000..043890665 --- /dev/null +++ b/livekit-plugins/livekit-plugins-playai/package.json @@ -0,0 +1,5 @@ +{ + "name": "livekit-plugins-playai", + "private": true, + "version": "1.0.3" +} diff --git a/livekit-plugins/livekit-plugins-playht/pyproject.toml b/livekit-plugins/livekit-plugins-playai/pyproject.toml similarity index 100% rename from livekit-plugins/livekit-plugins-playht/pyproject.toml rename to livekit-plugins/livekit-plugins-playai/pyproject.toml diff --git a/livekit-plugins/livekit-plugins-playht/setup.py b/livekit-plugins/livekit-plugins-playai/setup.py similarity index 86% rename from livekit-plugins/livekit-plugins-playht/setup.py rename to livekit-plugins/livekit-plugins-playai/setup.py index eb41a5b89..76c2d2ba5 100644 --- a/livekit-plugins/livekit-plugins-playht/setup.py +++ b/livekit-plugins/livekit-plugins-playai/setup.py @@ -6,14 +6,14 @@ here = pathlib.Path(__file__).parent.resolve() about = {} -with open(os.path.join(here, "livekit", "plugins", "playht", "version.py"), "r") as f: +with open(os.path.join(here, "livekit", "plugins", "playai", "version.py"), "r") as f: exec(f.read(), about) setuptools.setup( - name="livekit-plugins-playht", + name="livekit-plugins-playai", version=about["__version__"], - description="Agent Framework plugin for voice synthesis with PlayHT's API.", + description="Agent Framework plugin for voice synthesis with PlayAI's API.", long_description=(here / "README.md").read_text(encoding="utf-8"), long_description_content_type="text/markdown", 
url="https://github.com/livekit/agents", @@ -27,17 +27,17 @@ "Programming Language :: Python :: 3.12", "Programming Language :: Python :: 3 :: Only", ], - keywords=["webrtc", "realtime", "audio", "livekit", "playHT"], + keywords=["webrtc", "realtime", "audio", "livekit", "playHT", "playAI"], license="Apache-2.0", packages=setuptools.find_namespace_packages(include=["livekit.*"]), python_requires=">=3.9.0", install_requires=[ "livekit-agents[codecs]>=0.12.3", - "pyht", + "pyht>=0.1.10", "aiohttp", "livekit", ], - package_data={"livekit.plugins.playht": ["py.typed"]}, + package_data={"livekit.plugins.playai": ["py.typed"]}, project_urls={ "Documentation": "https://docs.livekit.io", "Website": "https://livekit.io/", diff --git a/livekit-plugins/livekit-plugins-playht/README.md b/livekit-plugins/livekit-plugins-playht/README.md deleted file mode 100644 index 53badc144..000000000 --- a/livekit-plugins/livekit-plugins-playht/README.md +++ /dev/null @@ -1,13 +0,0 @@ -# LiveKit Plugins PlayHT - -Agent Framework plugin for voice synthesis with [PlayHT](https://play.ht/) API. - -## Installation - -```bash -pip install livekit-plugins-playht -``` - -## Pre-requisites - -You'll need USER ID and API Secret KEY from PlayHT. It can be set as an environment variable: `PLAYHT_USER_ID`, `PLAYHT_API_KEY` \ No newline at end of file diff --git a/livekit-plugins/livekit-plugins-playht/livekit/plugins/playht/log.py b/livekit-plugins/livekit-plugins-playht/livekit/plugins/playht/log.py deleted file mode 100644 index 18a81836e..000000000 --- a/livekit-plugins/livekit-plugins-playht/livekit/plugins/playht/log.py +++ /dev/null @@ -1,3 +0,0 @@ -import logging - -logger = logging.getLogger("livekit.custom_tts_plugins.playht") diff --git a/livekit-plugins/livekit-plugins-playht/livekit/plugins/playht/models.py b/livekit-plugins/livekit-plugins-playht/livekit/plugins/playht/models.py deleted file mode 100644 index 6ffe63a5b..000000000 --- a/livekit-plugins/livekit-plugins-playht/livekit/plugins/playht/models.py +++ /dev/null @@ -1,20 +0,0 @@ -from typing import Literal - -TTSEngines = Literal[ - "PlayHT2.0", - "PlayHT1.0", - "PlayHT2.0-turbo", - "Play3.0-mini", -] - -TTSEncoding = Literal[ - "mp3_22050_32", - "mp3_44100_32", - "mp3_44100_64", - "mp3_44100_96", - "mp3_44100_128", - "mp3_44100_192", - "pcm_16000", - "pcm_22050", - "pcm_44100", -] diff --git a/livekit-plugins/livekit-plugins-playht/livekit/plugins/playht/tts.py b/livekit-plugins/livekit-plugins-playht/livekit/plugins/playht/tts.py deleted file mode 100644 index 982565da7..000000000 --- a/livekit-plugins/livekit-plugins-playht/livekit/plugins/playht/tts.py +++ /dev/null @@ -1,238 +0,0 @@ -from __future__ import annotations - -import asyncio -import os -from dataclasses import dataclass -from typing import Any, List, Literal - -import aiohttp -from livekit.agents import ( - DEFAULT_API_CONNECT_OPTIONS, - APIConnectionError, - APIConnectOptions, - APIStatusError, - APITimeoutError, - tts, - utils, -) - -from .log import logger -from .models import TTSEncoding, TTSEngines - -_Encoding = Literal["mp3", "pcm"] - - -def _sample_rate_from_format(output_format: TTSEncoding) -> int: - split = output_format.split("_") - return int(split[1]) - - -def _encoding_from_format(output_format: TTSEncoding) -> _Encoding: - if output_format.startswith("mp3"): - return "mp3" - elif output_format.startswith("pcm"): - return "pcm" - elif output_format.startswith("wav"): - return "pcm" - - raise ValueError(f"Unknown format: {output_format}") - - -@dataclass -class Voice: - id: 
str - name: str - voice_engine: TTSEngines - - -DEFAULT_VOICE = Voice( - id="s3://peregrine-voices/mel22/manifest.json", - name="Will", - voice_engine="Play3.0-mini", -) - -ACCEPT_HEADER = { - "mp3": "audio/mpeg", - "wav": "audio/wav", - "ogg": "audio/ogg", - "flac": "audio/flac", - "mulaw": "audio/basic", # commonly used for mulaw -} - - -API_BASE_URL_V2 = "https://api.play.ht/api/v2" -AUTHORIZATION_HEADER = "AUTHORIZATION" -USERID_HEADER = "X-USER-ID" -PLAYHT_TTS_CHANNELS = 1 - -_TTSEncoding = Literal["mp3", "wav", "ogg", "flac", "mulaw"] - - -@dataclass -class _TTSOptions: - api_key: str - user_id: str - voice: Voice - base_url: str - sample_rate: int - encoding: _TTSEncoding - - -class TTS(tts.TTS): - def __init__( - self, - *, - voice: Voice = DEFAULT_VOICE, - api_key: str | None = None, - user_id: str | None = None, - base_url: str | None = None, - sample_rate: int = 24000, - encoding: _TTSEncoding = "wav", - http_session: aiohttp.ClientSession | None = None, - ) -> None: - super().__init__( - capabilities=tts.TTSCapabilities( - streaming=False, - ), - sample_rate=sample_rate, - num_channels=PLAYHT_TTS_CHANNELS, - ) - api_key = api_key or os.environ.get("PLAYHT_API_KEY") - if not api_key: - raise ValueError("PLAYHT_API_KEY must be set") - - user_id = user_id or os.environ.get("PLAYHT_USER_ID") - if not user_id: - raise ValueError("PLAYHT_USER_ID mus be set") - - self._opts = _TTSOptions( - voice=voice, - user_id=user_id, - api_key=api_key, - base_url=base_url or API_BASE_URL_V2, - sample_rate=sample_rate, - encoding=encoding, - ) - self._session = http_session - - def _ensure_session(self) -> aiohttp.ClientSession: - if not self._session: - self._session = utils.http_context.http_session() - - return self._session - - async def list_voices(self) -> List[Voice]: - async with self._ensure_session().get( - f"{self._opts.base_url}/voices", - headers={ - "accept": "application/json", - AUTHORIZATION_HEADER: self._opts.api_key, - USERID_HEADER: self._opts.user_id, - }, - ) as resp: - return _dict_to_voices_list(await resp.json()) - - def synthesize( - self, - text: str, - *, - conn_options: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS, - ) -> "ChunkedStream": - return ChunkedStream( - tts=self, - input_text=text, - conn_options=conn_options, - opts=self._opts, - session=self._ensure_session(), - ) - - -class ChunkedStream(tts.ChunkedStream): - """Synthesize using the chunked api endpoint""" - - def __init__( - self, - tts: TTS, - input_text: str, - opts: _TTSOptions, - conn_options: APIConnectOptions, - session: aiohttp.ClientSession, - ) -> None: - super().__init__(tts=tts, input_text=input_text, conn_options=conn_options) - self._opts, self._session = opts, session - - async def _run(self) -> None: - stream = utils.audio.AudioByteStream( - sample_rate=self._opts.sample_rate, num_channels=1 - ) - self._mp3_decoder = utils.codecs.Mp3StreamDecoder() - request_id = utils.shortuuid() - url = f"{API_BASE_URL_V2}/tts/stream" - headers = { - "accept": ACCEPT_HEADER[self._opts.encoding], - "content-type": "application/json", - AUTHORIZATION_HEADER: self._opts.api_key, - USERID_HEADER: self._opts.user_id, - } - json_data = { - "text": self._input_text, - "output_format": self._opts.encoding, - "sample_rate": self._opts.sample_rate, - "voice": self._opts.voice.id, - } - try: - async with self._session.post( - url=url, headers=headers, json=json_data - ) as resp: - if not resp.content_type.startswith("audio/"): - content = await resp.text() - logger.error("playHT returned non-audio data: %s", 
content) - return - - encoding = _encoding_from_format(self._opts.encoding) - if encoding == "mp3": - async for bytes_data, _ in resp.content.iter_chunks(): - for frame in self._mp3_decoder.decode_chunk(bytes_data): - self._event_ch.send_nowait( - tts.SynthesizedAudio( - request_id=request_id, - frame=frame, - ) - ) - else: - async for bytes_data, _ in resp.content.iter_chunks(): - for frame in stream.write(bytes_data): - self._event_ch.send_nowait( - tts.SynthesizedAudio( - request_id=request_id, - frame=frame, - ) - ) - - for frame in stream.flush(): - self._event_ch.send_nowait( - tts.SynthesizedAudio(request_id=request_id, frame=frame) - ) - - except asyncio.TimeoutError as e: - raise APITimeoutError() from e - except aiohttp.ClientResponseError as e: - raise APIStatusError( - message=e.message, - status_code=e.status, - request_id=None, - body=None, - ) from e - except Exception as e: - raise APIConnectionError() from e - - -def _dict_to_voices_list(data: dict[str, Any]): - voices: List[Voice] = [] - for voice in data["text"]: - voices.append( - Voice( - id=voice["id"], name=voice["name"], voice_engine=voice["voice_engine"] - ) - ) - return voices diff --git a/livekit-plugins/livekit-plugins-playht/package.json b/livekit-plugins/livekit-plugins-playht/package.json deleted file mode 100644 index fee72ee05..000000000 --- a/livekit-plugins/livekit-plugins-playht/package.json +++ /dev/null @@ -1,5 +0,0 @@ -{ - "name": "livekit-plugins-playht", - "private": true, - "version": "1.0.3" -} \ No newline at end of file diff --git a/tests/test_tts.py b/tests/test_tts.py index b0fbd8034..91f8035b5 100644 --- a/tests/test_tts.py +++ b/tests/test_tts.py @@ -10,7 +10,15 @@ from livekit import agents from livekit.agents import APIConnectionError, tokenize, tts from livekit.agents.utils import AudioBuffer, merge_frames -from livekit.plugins import azure, cartesia, deepgram, elevenlabs, google, openai +from livekit.plugins import ( + azure, + cartesia, + deepgram, + elevenlabs, + google, + openai, + playai, +) from .conftest import TEST_CONNECT_OPTIONS from .fake_tts import FakeTTS @@ -44,6 +52,7 @@ async def _assert_valid_synthesized_audio( pytest.param(lambda: azure.TTS(), id="azure"), pytest.param(lambda: cartesia.TTS(), id="cartesia"), pytest.param(lambda: deepgram.TTS(), id="deepgram"), + pytest.param(lambda: playai.TTS(), id="playai"), ] @@ -89,6 +98,7 @@ async def test_synthesize(tts_factory): id="azure.stream", ), pytest.param(lambda: deepgram.TTS(), id="deepgram"), + pytest.param(lambda: playai.TTS(), id="playai"), ] From 32383939025453616d23c82ccc8c725ee87a9870 Mon Sep 17 00:00:00 2001 From: David Zhao Date: Wed, 25 Dec 2024 13:55:16 -0800 Subject: [PATCH 37/46] improved handling of LLM errors, do not retry if already began (#1298) --- .changeset/gorgeous-sheep-grow.md | 7 +++++ livekit-agents/livekit/agents/_exceptions.py | 31 ++++++++++++++----- livekit-agents/livekit/agents/llm/llm.py | 2 +- .../livekit/plugins/anthropic/llm.py | 6 ++-- .../livekit/plugins/cartesia/tts.py | 5 ++- .../livekit/plugins/deepgram/tts.py | 10 ++++-- .../livekit/plugins/elevenlabs/tts.py | 5 +-- .../livekit/plugins/openai/llm.py | 6 ++-- 8 files changed, 54 insertions(+), 18 deletions(-) create mode 100644 .changeset/gorgeous-sheep-grow.md diff --git a/.changeset/gorgeous-sheep-grow.md b/.changeset/gorgeous-sheep-grow.md new file mode 100644 index 000000000..5bdc7cc5f --- /dev/null +++ b/.changeset/gorgeous-sheep-grow.md @@ -0,0 +1,7 @@ +--- +"livekit-plugins-anthropic": patch +"livekit-plugins-openai": patch 
+"livekit-agents": patch +--- + +improved handling of LLM errors, do not retry if already began diff --git a/livekit-agents/livekit/agents/_exceptions.py b/livekit-agents/livekit/agents/_exceptions.py index a6d987e7d..74a1ab3c1 100644 --- a/livekit-agents/livekit/agents/_exceptions.py +++ b/livekit-agents/livekit/agents/_exceptions.py @@ -23,16 +23,22 @@ class APIError(Exception): body: object | None """The API response body, if available. - + If the API returned a valid json, the body will contains the decodede result. """ - def __init__(self, message: str, *, body: object | None) -> None: + retryable: bool = False + """Whether the error can be retried.""" + + def __init__( + self, message: str, *, body: object | None, retryable: bool = True + ) -> None: super().__init__(message) self.message = message self.body = body + self.retryable = retryable class APIStatusError(APIError): @@ -51,8 +57,15 @@ def __init__( status_code: int = -1, request_id: str | None = None, body: object | None = None, + retryable: bool | None = None, ) -> None: - super().__init__(message, body=body) + if retryable is None: + retryable = True + # 4xx errors are not retryable + if status_code >= 400 and status_code < 500: + retryable = False + + super().__init__(message, body=body, retryable=retryable) self.status_code = status_code self.request_id = request_id @@ -61,12 +74,16 @@ def __init__( class APIConnectionError(APIError): """Raised when an API request failed due to a connection error.""" - def __init__(self, message: str = "Connection error.") -> None: - super().__init__(message, body=None) + def __init__( + self, message: str = "Connection error.", *, retryable: bool = True + ) -> None: + super().__init__(message, body=None, retryable=retryable) class APITimeoutError(APIConnectionError): """Raised when an API request timed out.""" - def __init__(self, message: str = "Request timed out.") -> None: - super().__init__(message) + def __init__( + self, message: str = "Request timed out.", *, retryable: bool = True + ) -> None: + super().__init__(message, retryable=retryable) diff --git a/livekit-agents/livekit/agents/llm/llm.py b/livekit-agents/livekit/agents/llm/llm.py index 351fcc9b1..099e3139c 100644 --- a/livekit-agents/livekit/agents/llm/llm.py +++ b/livekit-agents/livekit/agents/llm/llm.py @@ -148,7 +148,7 @@ async def _main_task(self) -> None: try: return await self._run() except APIError as e: - if self._conn_options.max_retry == 0: + if self._conn_options.max_retry == 0 or not e.retryable: raise elif i == self._conn_options.max_retry: raise APIConnectionError( diff --git a/livekit-plugins/livekit-plugins-anthropic/livekit/plugins/anthropic/llm.py b/livekit-plugins/livekit-plugins-anthropic/livekit/plugins/anthropic/llm.py index 69b468d23..3af490211 100644 --- a/livekit-plugins/livekit-plugins-anthropic/livekit/plugins/anthropic/llm.py +++ b/livekit-plugins/livekit-plugins-anthropic/livekit/plugins/anthropic/llm.py @@ -206,6 +206,7 @@ def __init__( self._output_tokens = 0 async def _run(self) -> None: + retryable = True try: if not self._anthropic_stream: self._anthropic_stream = await self._awaitable_anthropic_stream @@ -215,6 +216,7 @@ async def _run(self) -> None: chat_chunk = self._parse_event(event) if chat_chunk is not None: self._event_ch.send_nowait(chat_chunk) + retryable = False self._event_ch.send_nowait( llm.ChatChunk( @@ -227,7 +229,7 @@ async def _run(self) -> None: ) ) except anthropic.APITimeoutError: - raise APITimeoutError() + raise APITimeoutError(retryable=retryable) except 
anthropic.APIStatusError as e: raise APIStatusError( e.message, @@ -236,7 +238,7 @@ async def _run(self) -> None: body=e.body, ) except Exception as e: - raise APIConnectionError() from e + raise APIConnectionError(retryable=retryable) from e def _parse_event( self, event: anthropic.types.RawMessageStreamEvent diff --git a/livekit-plugins/livekit-plugins-cartesia/livekit/plugins/cartesia/tts.py b/livekit-plugins/livekit-plugins-cartesia/livekit/plugins/cartesia/tts.py index dd76473c7..eae3a0679 100644 --- a/livekit-plugins/livekit-plugins-cartesia/livekit/plugins/cartesia/tts.py +++ b/livekit-plugins/livekit-plugins-cartesia/livekit/plugins/cartesia/tts.py @@ -312,7 +312,10 @@ def _send_last_frame(*, segment_id: str, is_final: bool) -> None: aiohttp.WSMsgType.CLOSE, aiohttp.WSMsgType.CLOSING, ): - raise Exception("Cartesia connection closed unexpectedly") + raise APIStatusError( + "Cartesia connection closed unexpectedly", + request_id=request_id, + ) if msg.type != aiohttp.WSMsgType.TEXT: logger.warning("unexpected Cartesia message type %s", msg.type) diff --git a/livekit-plugins/livekit-plugins-deepgram/livekit/plugins/deepgram/tts.py b/livekit-plugins/livekit-plugins-deepgram/livekit/plugins/deepgram/tts.py index 56d7405a7..401c26be7 100644 --- a/livekit-plugins/livekit-plugins-deepgram/livekit/plugins/deepgram/tts.py +++ b/livekit-plugins/livekit-plugins-deepgram/livekit/plugins/deepgram/tts.py @@ -314,8 +314,9 @@ def _send_last_frame(*, segment_id: str, is_final: bool) -> None: aiohttp.WSMsgType.CLOSING, ): if not closing_ws: - raise Exception( - "Deepgram websocket connection closed unexpectedly" + raise APIStatusError( + "Deepgram websocket connection closed unexpectedly", + request_id=request_id, ) return @@ -393,7 +394,10 @@ async def _connection_timeout(): raise APITimeoutError() from e except aiohttp.ClientResponseError as e: raise APIStatusError( - message=e.message, status_code=e.status, request_id=None, body=None + message=e.message, + status_code=e.status, + request_id=request_id, + body=None, ) from e except Exception as e: raise APIConnectionError() from e diff --git a/livekit-plugins/livekit-plugins-elevenlabs/livekit/plugins/elevenlabs/tts.py b/livekit-plugins/livekit-plugins-elevenlabs/livekit/plugins/elevenlabs/tts.py index 0c5490707..948d42758 100644 --- a/livekit-plugins/livekit-plugins-elevenlabs/livekit/plugins/elevenlabs/tts.py +++ b/livekit-plugins/livekit-plugins-elevenlabs/livekit/plugins/elevenlabs/tts.py @@ -469,8 +469,9 @@ def _send_last_frame(*, segment_id: str, is_final: bool) -> None: aiohttp.WSMsgType.CLOSING, ): if not eos_sent: - raise Exception( - "11labs connection closed unexpectedly, not all tokens have been consumed" + raise APIStatusError( + "11labs connection closed unexpectedly, not all tokens have been consumed", + request_id=request_id, ) return diff --git a/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/llm.py b/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/llm.py index 8e3dda787..37526dd4b 100644 --- a/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/llm.py +++ b/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/llm.py @@ -706,6 +706,7 @@ async def _run(self) -> None: self._fnc_name: str | None = None self._fnc_raw_arguments: str | None = None self._tool_index: int | None = None + retryable = True try: opts: dict[str, Any] = dict() @@ -755,6 +756,7 @@ async def _run(self) -> None: for choice in chunk.choices: chat_chunk = self._parse_choice(chunk.id, choice) if chat_chunk is not None: 
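The thread running through patch 37 generalizes beyond any one provider: a stream keeps a `retryable` flag that flips to `False` once any chunk has been delivered downstream, because retrying after partial output would duplicate tokens or audio, and 4xx status codes are likewise marked non-retryable. A provider-agnostic sketch (this `APIError` is simplified relative to `_exceptions.py`):

```python
from typing import AsyncIterator


class APIError(Exception):
    def __init__(self, message: str, *, retryable: bool = True) -> None:
        super().__init__(message)
        self.retryable = retryable


async def relay(chunks: AsyncIterator[str]) -> AsyncIterator[str]:
    retryable = True
    try:
        async for chunk in chunks:
            yield chunk
            retryable = False  # output observed downstream; a retry would duplicate it
    except Exception as e:
        raise APIError("stream failed", retryable=retryable) from e
```

A retry loop can then consult `e.retryable` before re-attempting, which is exactly what `LLMStream._main_task` does after this change.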
+ retryable = False self._event_ch.send_nowait(chat_chunk) if chunk.usage is not None: @@ -771,7 +773,7 @@ async def _run(self) -> None: ) except openai.APITimeoutError: - raise APITimeoutError() + raise APITimeoutError(retryable=retryable) except openai.APIStatusError as e: raise APIStatusError( e.message, @@ -780,7 +782,7 @@ async def _run(self) -> None: body=e.body, ) except Exception as e: - raise APIConnectionError() from e + raise APIConnectionError(retryable=retryable) from e def _parse_choice(self, id: str, choice: Choice) -> llm.ChatChunk | None: delta = choice.delta From 66152a49649e234f1914f3cd7cb96b1a872b6a7f Mon Sep 17 00:00:00 2001 From: Sahil Suman <34382211+sahilsuman933@users.noreply.github.com> Date: Sat, 28 Dec 2024 11:52:45 +0530 Subject: [PATCH 38/46] Broken Link (#1300) --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index c9ffa9b78..11664a68f 100644 --- a/README.md +++ b/README.md @@ -122,7 +122,7 @@ Documentation on the framework and how to use it can be found [here](https://doc | Voice agent using the new OpenAI Realtime API | [demo](https://playground.livekit.io) | [code](https://github.com/livekit-examples/realtime-playground) | | Super fast voice agent using Cerebras hosted Llama 3.1 | [demo](https://cerebras.vercel.app) | [code](https://github.com/dsa/fast-voice-assistant/) | | Voice agent using Cartesia's Sonic model | [demo](https://cartesia-assistant.vercel.app/) | [code](https://github.com/livekit-examples/cartesia-voice-agent) | -| Agent that looks up the current weather via function call | N/A | [code](https://github.com/livekit-examples/cartesia-voice-agent) | +| Agent that looks up the current weather via function call | N/A | [code](https://github.com/livekit/agents/blob/main/examples/voice-pipeline-agent/function_calling_weather.py) | | Voice Agent using Gemini 2.0 Flash | N/A | [code](https://github.com/livekit-examples/voice-pipeline-agent/gemini_voice_agent.py) | | Voice agent with custom turn-detection model | N/A | [code](https://github.com/livekit/agents/blob/main/examples/voice-pipeline-agent/turn_detector.py) | | Voice agent that performs a RAG-based lookup | N/A | [code](https://github.com/livekit/agents/tree/main/examples/voice-pipeline-agent/simple-rag) | From 83dc84ec25e1464d9a0cb72ef7ad7157bbb4c53b Mon Sep 17 00:00:00 2001 From: Hamdan <96612374+s-hamdananwar@users.noreply.github.com> Date: Sat, 28 Dec 2024 04:50:18 -0800 Subject: [PATCH 39/46] expose worker_id in jobcontext (#1307) --- .changeset/eight-lemons-hear.md | 5 +++++ livekit-agents/livekit/agents/cli/proto.py | 2 ++ livekit-agents/livekit/agents/ipc/proto.py | 2 ++ livekit-agents/livekit/agents/job.py | 6 ++++++ livekit-agents/livekit/agents/worker.py | 2 ++ tests/test_ipc.py | 1 + 6 files changed, 18 insertions(+) create mode 100644 .changeset/eight-lemons-hear.md diff --git a/.changeset/eight-lemons-hear.md b/.changeset/eight-lemons-hear.md new file mode 100644 index 000000000..38a3f1b1b --- /dev/null +++ b/.changeset/eight-lemons-hear.md @@ -0,0 +1,5 @@ +--- +"livekit-agents": patch +--- + +expose worker_id in jobcontext diff --git a/livekit-agents/livekit/agents/cli/proto.py b/livekit-agents/livekit/agents/cli/proto.py index f7753c579..761690783 100644 --- a/livekit-agents/livekit/agents/cli/proto.py +++ b/livekit-agents/livekit/agents/cli/proto.py @@ -52,6 +52,7 @@ def write(self, b: io.BytesIO) -> None: channel.write_string(b, accept_args.metadata) channel.write_string(b, running_job.url) channel.write_string(b, 
running_job.token) + channel.write_string(b, running_job.worker_id) channel.write_int(b, self.reload_count) @@ -69,6 +70,7 @@ def read(self, b: io.BytesIO) -> None: job=job, url=channel.read_string(b), token=channel.read_string(b), + worker_id=channel.read_string(b), ) ) diff --git a/livekit-agents/livekit/agents/ipc/proto.py b/livekit-agents/livekit/agents/ipc/proto.py index c878b4f23..509964b55 100644 --- a/livekit-agents/livekit/agents/ipc/proto.py +++ b/livekit-agents/livekit/agents/ipc/proto.py @@ -90,6 +90,7 @@ def write(self, b: io.BytesIO) -> None: channel.write_string(b, accept_args.metadata) channel.write_string(b, self.running_job.url) channel.write_string(b, self.running_job.token) + channel.write_string(b, self.running_job.worker_id) def read(self, b: io.BytesIO) -> None: job = agent.Job() @@ -103,6 +104,7 @@ def read(self, b: io.BytesIO) -> None: job=job, url=channel.read_string(b), token=channel.read_string(b), + worker_id=channel.read_string(b), ) diff --git a/livekit-agents/livekit/agents/job.py b/livekit-agents/livekit/agents/job.py index 64be850f0..b54f8358c 100644 --- a/livekit-agents/livekit/agents/job.py +++ b/livekit-agents/livekit/agents/job.py @@ -68,6 +68,7 @@ class RunningJobInfo: job: agent.Job url: str token: str + worker_id: str DEFAULT_PARTICIPANT_KINDS: list[rtc.ParticipantKind.ValueType] = [ @@ -123,6 +124,11 @@ def job(self) -> agent.Job: """Returns the current job that the worker is executing.""" return self._info.job + @property + def worker_id(self) -> str: + """Returns the id of the worker.""" + return self._info.worker_id + @property def room(self) -> rtc.Room: """The Room object is the main interface that the worker should interact with. diff --git a/livekit-agents/livekit/agents/worker.py b/livekit-agents/livekit/agents/worker.py index 4708a34d3..54ad75470 100644 --- a/livekit-agents/livekit/agents/worker.py +++ b/livekit-agents/livekit/agents/worker.py @@ -621,6 +621,7 @@ async def _reload_jobs(self, jobs: list[RunningJobInfo]) -> None: job=aj.job, url=url, token=jwt.encode(decoded, self._opts.api_secret, algorithm="HS256"), + worker_id=aj.worker_id, ) await self._proc_pool.launch_job(running_info) @@ -692,6 +693,7 @@ async def _on_accept(args: JobAcceptArguments) -> None: job=msg.job, url=job_assign.url or self._opts.ws_url, token=job_assign.token, + worker_id=self._id, ) await self._proc_pool.launch_job(running_info) diff --git a/tests/test_ipc.py b/tests/test_ipc.py index d964c9f55..4e1fd4fe7 100644 --- a/tests/test_ipc.py +++ b/tests/test_ipc.py @@ -114,6 +114,7 @@ def _generate_fake_job() -> job.RunningJobInfo: url="fake_url", token="fake_token", accept_arguments=job.JobAcceptArguments(name="", identity="", metadata=""), + worker_id="fake_id", ) From 97d9bce766602d815ad305a077b651b6062b5a51 Mon Sep 17 00:00:00 2001 From: Long Chen Date: Sat, 28 Dec 2024 20:54:04 +0800 Subject: [PATCH 40/46] fix: add manual interrupt for pipeline agent (#1294) --- .changeset/slow-walls-bake.md | 5 ++ .../livekit/agents/pipeline/pipeline_agent.py | 49 ++++++++++++++++--- .../livekit/agents/pipeline/speech_handle.py | 19 ++++--- 3 files changed, 60 insertions(+), 13 deletions(-) create mode 100644 .changeset/slow-walls-bake.md diff --git a/.changeset/slow-walls-bake.md b/.changeset/slow-walls-bake.md new file mode 100644 index 000000000..11df23f75 --- /dev/null +++ b/.changeset/slow-walls-bake.md @@ -0,0 +1,5 @@ +--- +"livekit-agents": patch +--- + +add manual interrupt method for pipeline agent diff --git 
a/livekit-agents/livekit/agents/pipeline/pipeline_agent.py b/livekit-agents/livekit/agents/pipeline/pipeline_agent.py index b2a223bd0..2c7dc1363 100644 --- a/livekit-agents/livekit/agents/pipeline/pipeline_agent.py +++ b/livekit-agents/livekit/agents/pipeline/pipeline_agent.py @@ -473,7 +473,7 @@ async def say( synthesis_handle = self._synthesize_agent_speech(new_handle.id, source) new_handle.initialize(source=source, synthesis_handle=synthesis_handle) - if self._playing_speech and not self._playing_speech.nested_speech_finished: + if self._playing_speech and not self._playing_speech.nested_speech_done: self._playing_speech.add_nested_speech(new_handle) else: self._add_speech_for_playout(new_handle) @@ -497,6 +497,23 @@ async def say( return new_handle + def interrupt(self, interrupt_all: bool = True) -> None: + """Interrupt the current speech + + Args: + interrupt_all: Whether to interrupt all pending speech + """ + if interrupt_all: + # interrupt all pending speech + if self._pending_agent_reply is not None: + self._pending_agent_reply.cancel(cancel_nested=True) + for speech in self._speech_q: + speech.cancel(cancel_nested=True) + + # interrupt the playing speech + if self._playing_speech is not None: + self._playing_speech.cancel() + def _update_state(self, state: AgentState, delay: float = 0.0): """Set the current state of the agent""" @@ -956,19 +973,31 @@ async def _execute_function_calls() -> None: self.emit("function_calls_finished", called_fncs) _CallContextVar.reset(tk) + if not is_using_tools: + speech_handle._set_done() + return + fnc_task = asyncio.create_task(_execute_function_calls()) - while not speech_handle.nested_speech_finished: - event_wait_task = asyncio.create_task( + while not speech_handle.nested_speech_done: + nesting_changed = asyncio.create_task( speech_handle.nested_speech_changed.wait() ) + nesting_done_fut: asyncio.Future = speech_handle._nested_speech_done_fut await asyncio.wait( - [event_wait_task, fnc_task], return_when=asyncio.FIRST_COMPLETED + [nesting_changed, fnc_task, nesting_done_fut], + return_when=asyncio.FIRST_COMPLETED, ) - if not event_wait_task.done(): - event_wait_task.cancel() + if not nesting_changed.done(): + nesting_changed.cancel() while speech_handle.nested_speech_handles: speech = speech_handle.nested_speech_handles[0] + if speech_handle.nested_speech_done: + # in case tool speech is added after nested speech done + speech.cancel(cancel_nested=True) + speech_handle.nested_speech_handles.pop(0) + continue + self._playing_speech = speech await self._play_speech(speech) speech_handle.nested_speech_handles.pop(0) @@ -977,7 +1006,13 @@ async def _execute_function_calls() -> None: speech_handle.nested_speech_changed.clear() # break if the function calls task is done if fnc_task.done(): - speech_handle.mark_nested_speech_finished() + speech_handle.mark_nested_speech_done() + + if not fnc_task.done(): + logger.debug( + "cancelling function calls task", extra={"speech_id": speech_handle.id} + ) + fnc_task.cancel() # mark the speech as done speech_handle._set_done() diff --git a/livekit-agents/livekit/agents/pipeline/speech_handle.py b/livekit-agents/livekit/agents/pipeline/speech_handle.py index d36eb7aee..cd1f39dec 100644 --- a/livekit-agents/livekit/agents/pipeline/speech_handle.py +++ b/livekit-agents/livekit/agents/pipeline/speech_handle.py @@ -46,7 +46,7 @@ def __init__( self._nested_speech_handles: list[SpeechHandle] = [] self._nested_speech_changed = asyncio.Event() - self._nested_speech_finished = False + self._nested_speech_done_fut = 
asyncio.Future[None]() @staticmethod def create_assistant_reply( @@ -190,12 +190,17 @@ def interrupt(self) -> None: raise RuntimeError("interruptions are not allowed") self.cancel() - def cancel(self) -> None: + def cancel(self, cancel_nested: bool = False) -> None: self._init_fut.cancel() if self._synthesis_handle is not None: self._synthesis_handle.interrupt() + if cancel_nested: + for speech in self._nested_speech_handles: + speech.cancel(cancel_nested=True) + self.mark_nested_speech_done() + @property def fnc_nested_depth(self) -> int: return self._fnc_nested_depth @@ -221,8 +226,10 @@ def nested_speech_changed(self) -> asyncio.Event: return self._nested_speech_changed @property - def nested_speech_finished(self) -> bool: - return self._nested_speech_finished + def nested_speech_done(self) -> bool: + return self._nested_speech_done_fut.done() - def mark_nested_speech_finished(self) -> None: - self._nested_speech_finished = True + def mark_nested_speech_done(self) -> None: + if self._nested_speech_done_fut.done(): + return + self._nested_speech_done_fut.set_result(None) From bd36bc989d819f5da003a0a34fc9f570e100583c Mon Sep 17 00:00:00 2001 From: martin-purplefish Date: Sat, 28 Dec 2024 22:27:13 -0500 Subject: [PATCH 41/46] Do not pass function context if at max depth (#1306) --- .changeset/khaki-candles-rest.md | 5 +++++ .../livekit/agents/pipeline/pipeline_agent.py | 19 ++++++++++++++++++- 2 files changed, 23 insertions(+), 1 deletion(-) create mode 100644 .changeset/khaki-candles-rest.md diff --git a/.changeset/khaki-candles-rest.md b/.changeset/khaki-candles-rest.md new file mode 100644 index 000000000..91afec21c --- /dev/null +++ b/.changeset/khaki-candles-rest.md @@ -0,0 +1,5 @@ +--- +"livekit-agents": patch +--- + +Do not pass function context if at max depth diff --git a/livekit-agents/livekit/agents/pipeline/pipeline_agent.py b/livekit-agents/livekit/agents/pipeline/pipeline_agent.py index 2c7dc1363..872bd5d4b 100644 --- a/livekit-agents/livekit/agents/pipeline/pipeline_agent.py +++ b/livekit-agents/livekit/agents/pipeline/pipeline_agent.py @@ -960,7 +960,24 @@ async def _execute_function_calls() -> None: chat_ctx = call_ctx.chat_ctx.copy() chat_ctx.messages.extend(extra_tools_messages) chat_ctx.messages.extend(call_ctx.extra_chat_messages) - answer_llm_stream = self._llm.chat(chat_ctx=chat_ctx, fnc_ctx=self.fnc_ctx) + fnc_ctx = self.fnc_ctx + if ( + fnc_ctx + and new_speech_handle.fnc_nested_depth + >= self._opts.max_nested_fnc_calls + ): + logger.warning( + "max function calls nested depth reached, not propagating fnc ctx", + extra={ + "speech_id": speech_handle.id, + "fnc_nested_depth": speech_handle.fnc_nested_depth, + }, + ) + fnc_ctx = None + answer_llm_stream = self._llm.chat( + chat_ctx=chat_ctx, + fnc_ctx=fnc_ctx, + ) synthesis_handle = self._synthesize_agent_speech( new_speech_handle.id, answer_llm_stream From b7f289560260598a4aeb1915d3bcba28a8013245 Mon Sep 17 00:00:00 2001 From: Jayesh Parmar <60539217+jayeshp19@users.noreply.github.com> Date: Mon, 30 Dec 2024 21:09:48 +0530 Subject: [PATCH 42/46] Support Gemini Live API (#1240) --- .changeset/thirty-coats-tie.md | 7 + examples/multimodal_agent/gemini_agent.py | 68 +++ .../openai_agent.py} | 0 livekit-agents/livekit/agents/cli/log.py | 1 + .../livekit/agents/multimodal/__init__.py | 14 +- .../agents/multimodal/multimodal_agent.py | 129 ++++-- .../livekit/plugins/google/__init__.py | 4 +- .../livekit/plugins/google/beta/__init__.py | 3 + .../plugins/google/beta/realtime/__init__.py | 15 + 
.../plugins/google/beta/realtime/api_proto.py | 79 ++++ .../google/beta/realtime/realtime_api.py | 424 ++++++++++++++++++ .../livekit-plugins-google/setup.py | 1 + .../plugins/openai/realtime/__init__.py | 4 - .../plugins/openai/realtime/realtime_model.py | 35 +- 14 files changed, 741 insertions(+), 43 deletions(-) create mode 100644 .changeset/thirty-coats-tie.md create mode 100644 examples/multimodal_agent/gemini_agent.py rename examples/{multimodal_agent.py => multimodal_agent/openai_agent.py} (100%) create mode 100644 livekit-plugins/livekit-plugins-google/livekit/plugins/google/beta/__init__.py create mode 100644 livekit-plugins/livekit-plugins-google/livekit/plugins/google/beta/realtime/__init__.py create mode 100644 livekit-plugins/livekit-plugins-google/livekit/plugins/google/beta/realtime/api_proto.py create mode 100644 livekit-plugins/livekit-plugins-google/livekit/plugins/google/beta/realtime/realtime_api.py diff --git a/.changeset/thirty-coats-tie.md b/.changeset/thirty-coats-tie.md new file mode 100644 index 000000000..f0c6a9e67 --- /dev/null +++ b/.changeset/thirty-coats-tie.md @@ -0,0 +1,7 @@ +--- +"livekit-plugins-google": minor +"livekit-plugins-openai": patch +"livekit-agents": patch +--- + +make multimodal class generic and support gemini live api diff --git a/examples/multimodal_agent/gemini_agent.py b/examples/multimodal_agent/gemini_agent.py new file mode 100644 index 000000000..81a474609 --- /dev/null +++ b/examples/multimodal_agent/gemini_agent.py @@ -0,0 +1,68 @@ +from __future__ import annotations + +import logging +from typing import Annotated + +import aiohttp +from dotenv import load_dotenv +from livekit.agents import ( + AutoSubscribe, + JobContext, + WorkerOptions, + WorkerType, + cli, + llm, + multimodal, +) +from livekit.plugins import google + +load_dotenv() + +logger = logging.getLogger("my-worker") +logger.setLevel(logging.INFO) + + +async def entrypoint(ctx: JobContext): + logger.info("starting entrypoint") + + fnc_ctx = llm.FunctionContext() + + @fnc_ctx.ai_callable() + async def get_weather( + location: Annotated[ + str, llm.TypeInfo(description="The location to get the weather for") + ], + ): + """Called when the user asks about the weather. This function will return the weather for the given location.""" + logger.info(f"getting weather for {location}") + url = f"https://wttr.in/{location}?format=%C+%t" + async with aiohttp.ClientSession() as session: + async with session.get(url) as response: + if response.status == 200: + weather_data = await response.text() + # response from the function call is returned to the LLM + return f"The weather in {location} is {weather_data}."
+ else: + raise Exception( + f"Failed to get weather data, status code: {response.status}" + ) + + await ctx.connect(auto_subscribe=AutoSubscribe.AUDIO_ONLY) + participant = await ctx.wait_for_participant() + + chat_ctx = llm.ChatContext() + + agent = multimodal.MultimodalAgent( + model=google.beta.realtime.RealtimeModel( + voice="Charon", + temperature=0.8, + instructions="You are a helpful assistant", + ), + fnc_ctx=fnc_ctx, + chat_ctx=chat_ctx, + ) + agent.start(ctx.room, participant) + + +if __name__ == "__main__": + cli.run_app(WorkerOptions(entrypoint_fnc=entrypoint, worker_type=WorkerType.ROOM)) diff --git a/examples/multimodal_agent.py b/examples/multimodal_agent/openai_agent.py similarity index 100% rename from examples/multimodal_agent.py rename to examples/multimodal_agent/openai_agent.py diff --git a/livekit-agents/livekit/agents/cli/log.py b/livekit-agents/livekit/agents/cli/log.py index dc16bfdfa..c4b5e5e52 100644 --- a/livekit-agents/livekit/agents/cli/log.py +++ b/livekit-agents/livekit/agents/cli/log.py @@ -18,6 +18,7 @@ "openai", "watchfiles", "anthropic", + "websockets.client", ] diff --git a/livekit-agents/livekit/agents/multimodal/__init__.py b/livekit-agents/livekit/agents/multimodal/__init__.py index d165c082a..f741e168a 100644 --- a/livekit-agents/livekit/agents/multimodal/__init__.py +++ b/livekit-agents/livekit/agents/multimodal/__init__.py @@ -1,3 +1,13 @@ -from .multimodal_agent import AgentTranscriptionOptions, MultimodalAgent +from .multimodal_agent import ( + AgentTranscriptionOptions, + MultimodalAgent, + _RealtimeAPI, + _RealtimeAPISession, +) -__all__ = ["MultimodalAgent", "AgentTranscriptionOptions"] +__all__ = [ + "MultimodalAgent", + "AgentTranscriptionOptions", + "_RealtimeAPI", + "_RealtimeAPISession", +] diff --git a/livekit-agents/livekit/agents/multimodal/multimodal_agent.py b/livekit-agents/livekit/agents/multimodal/multimodal_agent.py index ee3a2d992..f02bb2e64 100644 --- a/livekit-agents/livekit/agents/multimodal/multimodal_agent.py +++ b/livekit-agents/livekit/agents/multimodal/multimodal_agent.py @@ -2,7 +2,17 @@ import asyncio from dataclasses import dataclass -from typing import Callable, Literal, Protocol +from typing import ( + Any, + AsyncIterable, + Callable, + Literal, + Optional, + Protocol, + TypeVar, + Union, + overload, +) import aiohttp from livekit import rtc @@ -28,6 +38,76 @@ ] +class _InputTranscriptionProto(Protocol): + item_id: str + """id of the item""" + transcript: str + """transcript of the input audio""" + + +class _ContentProto(Protocol): + response_id: str + item_id: str + output_index: int + content_index: int + text: str + audio: list[rtc.AudioFrame] + text_stream: AsyncIterable[str] + audio_stream: AsyncIterable[rtc.AudioFrame] + content_type: Literal["text", "audio"] + + +class _CapabilitiesProto(Protocol): + supports_truncate: bool + + +class _RealtimeAPI(Protocol): + """Realtime API protocol""" + + @property + def capabilities(self) -> _CapabilitiesProto: ... + def session( + self, + *, + chat_ctx: llm.ChatContext | None = None, + fnc_ctx: llm.FunctionContext | None = None, + ) -> _RealtimeAPISession: + """ + Create a new realtime session with the given chat and function contexts. + """ + pass + + +T = TypeVar("T", bound=Callable[..., Any]) + + +class _RealtimeAPISession(Protocol): + async def set_chat_ctx(self, ctx: llm.ChatContext) -> None: ... + @overload + def on(self, event: str, callback: None = None) -> Callable[[T], T]: ... + @overload + def on(self, event: str, callback: T) -> T: ... 
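+    # The two overloads above let `on` be used either as a decorator when
+    # only an event name is given, or as a plain call when a callback is
+    # passed. An illustrative sketch (the event name is one of the session's
+    # emitted events, e.g. those handled in MultimodalAgent below):
+    #
+    #     @session.on("input_speech_started")
+    #     def _on_speech_started(): ...
+    #
+    #     session.on("input_speech_started", _on_speech_started)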
+ def on( + self, event: str, callback: Optional[T] = None + ) -> Union[T, Callable[[T], T]]: ... + + def _push_audio(self, frame: rtc.AudioFrame) -> None: ... + @property + def fnc_ctx(self) -> llm.FunctionContext | None: ... + @fnc_ctx.setter + def fnc_ctx(self, value: llm.FunctionContext | None) -> None: ... + def chat_ctx_copy(self) -> llm.ChatContext: ... + def _recover_from_text_response(self, item_id: str) -> None: ... + def _update_conversation_item_content( + self, + item_id: str, + content: llm.ChatContent | list[llm.ChatContent] | None = None, + ) -> None: ... + def _truncate_conversation_item( + self, item_id: str, content_index: int, audio_end_ms: int + ) -> None: ... + + @dataclass(frozen=True) class AgentTranscriptionOptions: user_transcription: bool = True @@ -50,9 +130,6 @@ class AgentTranscriptionOptions: representing the hyphenated parts of the word.""" -class S2SModel(Protocol): ... - - @dataclass(frozen=True) class _ImplOptions: transcription: AgentTranscriptionOptions @@ -62,7 +139,7 @@ class MultimodalAgent(utils.EventEmitter[EventTypes]): def __init__( self, *, - model: S2SModel, + model: _RealtimeAPI, vad: vad.VAD | None = None, chat_ctx: llm.ChatContext | None = None, fnc_ctx: llm.FunctionContext | None = None, @@ -73,7 +150,7 @@ def __init__( """Create a new MultimodalAgent. Args: - model: S2SModel instance. + model: RealtimeAPI instance. vad: Voice Activity Detection (VAD) instance. chat_ctx: Chat context for the assistant. fnc_ctx: Function context for the assistant. @@ -89,10 +166,6 @@ def __init__( super().__init__() self._loop = loop or asyncio.get_event_loop() - from livekit.plugins.openai import realtime - - assert isinstance(model, realtime.RealtimeModel) - self._model = model self._vad = vad self._chat_ctx = chat_ctx @@ -177,13 +250,8 @@ async def _init_and_start(): # Schedule the initialization and start task asyncio.create_task(_init_and_start()) - from livekit.plugins.openai import realtime - @self._session.on("response_content_added") - def _on_content_added(message: realtime.RealtimeContent): - if message.content_type == "text": - return - + def _on_content_added(message: _ContentProto): tr_fwd = transcription.TTSSegmentsForwarder( room=self._room, participant=self._room.local_participant, @@ -202,7 +270,7 @@ def _on_content_added(message: realtime.RealtimeContent): ) @self._session.on("response_content_done") - def _response_content_done(message: realtime.RealtimeContent): + def _response_content_done(message: _ContentProto): if message.content_type == "text": if self._text_response_retries >= self._max_text_response_retries: raise RuntimeError( @@ -236,9 +304,7 @@ def _input_speech_committed(): ) @self._session.on("input_speech_transcription_completed") - def _input_speech_transcription_completed( - ev: realtime.InputTranscriptionCompleted, - ): + def _input_speech_transcription_completed(ev: _InputTranscriptionProto): self._stt_forwarder.update( stt.SpeechEvent( type=stt.SpeechEventType.FINAL_TRANSCRIPT, @@ -248,6 +314,7 @@ def _input_speech_transcription_completed( user_msg = ChatMessage.create( text=ev.transcript, role="user", id=ev.item_id ) + self._session._update_conversation_item_content( ev.item_id, user_msg.content ) @@ -265,11 +332,14 @@ def _input_speech_started(): if self._playing_handle is not None and not self._playing_handle.done(): self._playing_handle.interrupt() - self._session.conversation.item.truncate( - item_id=self._playing_handle.item_id, - content_index=self._playing_handle.content_index, - 
audio_end_ms=int(self._playing_handle.audio_samples / 24000 * 1000), - ) + if self._model.capabilities.supports_truncate: + self._session._truncate_conversation_item( + item_id=self._playing_handle.item_id, + content_index=self._playing_handle.content_index, + audio_end_ms=int( + self._playing_handle.audio_samples / 24000 * 1000 + ), + ) @self._session.on("input_speech_stopped") def _input_speech_stopped(): @@ -330,9 +400,10 @@ def _on_playout_stopped(interrupted: bool) -> None: role="assistant", id=self._playing_handle.item_id, ) - self._session._update_conversation_item_content( - self._playing_handle.item_id, msg.content - ) + if self._model.capabilities.supports_truncate: + self._session._update_conversation_item_content( + self._playing_handle.item_id, msg.content + ) if interrupted: self.emit("agent_speech_interrupted", msg) @@ -366,7 +437,7 @@ def _on_playout_stopped(interrupted: bool) -> None: ) async for frame in self._input_audio_ch: for f in bstream.write(frame.data.tobytes()): - self._session.input_audio_buffer.append(f) + self._session._push_audio(f) def _on_participant_connected(self, participant: rtc.RemoteParticipant): if self._linked_participant is None: diff --git a/livekit-plugins/livekit-plugins-google/livekit/plugins/google/__init__.py b/livekit-plugins/livekit-plugins-google/livekit/plugins/google/__init__.py index ca754bd30..88e163634 100644 --- a/livekit-plugins/livekit-plugins-google/livekit/plugins/google/__init__.py +++ b/livekit-plugins/livekit-plugins-google/livekit/plugins/google/__init__.py @@ -12,12 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. +from . import beta from .stt import STT, SpeechStream from .tts import TTS from .version import __version__ -__all__ = ["STT", "TTS", "SpeechStream", "__version__"] - +__all__ = ["STT", "TTS", "SpeechStream", "__version__", "beta"] from livekit.agents import Plugin from .log import logger diff --git a/livekit-plugins/livekit-plugins-google/livekit/plugins/google/beta/__init__.py b/livekit-plugins/livekit-plugins-google/livekit/plugins/google/beta/__init__.py new file mode 100644 index 000000000..89cb122c8 --- /dev/null +++ b/livekit-plugins/livekit-plugins-google/livekit/plugins/google/beta/__init__.py @@ -0,0 +1,3 @@ +from . 
import realtime + +__all__ = ["realtime"] diff --git a/livekit-plugins/livekit-plugins-google/livekit/plugins/google/beta/realtime/__init__.py b/livekit-plugins/livekit-plugins-google/livekit/plugins/google/beta/realtime/__init__.py new file mode 100644 index 000000000..e95a86917 --- /dev/null +++ b/livekit-plugins/livekit-plugins-google/livekit/plugins/google/beta/realtime/__init__.py @@ -0,0 +1,15 @@ +from .api_proto import ( + ClientEvents, + LiveAPIModels, + ResponseModality, + Voice, +) +from .realtime_api import RealtimeModel + +__all__ = [ + "RealtimeModel", + "ClientEvents", + "LiveAPIModels", + "ResponseModality", + "Voice", +] diff --git a/livekit-plugins/livekit-plugins-google/livekit/plugins/google/beta/realtime/api_proto.py b/livekit-plugins/livekit-plugins-google/livekit/plugins/google/beta/realtime/api_proto.py new file mode 100644 index 000000000..c02fb3859 --- /dev/null +++ b/livekit-plugins/livekit-plugins-google/livekit/plugins/google/beta/realtime/api_proto.py @@ -0,0 +1,79 @@ +from __future__ import annotations + +import inspect +from typing import Any, Dict, List, Literal, Sequence, Union + +from google.genai import types # type: ignore + +LiveAPIModels = Literal["gemini-2.0-flash-exp"] + +Voice = Literal["Puck", "Charon", "Kore", "Fenrir", "Aoede"] +ResponseModality = Literal["AUDIO", "TEXT"] + + +ClientEvents = Union[ + types.ContentListUnion, + types.ContentListUnionDict, + types.LiveClientContentOrDict, + types.LiveClientRealtimeInput, + types.LiveClientRealtimeInputOrDict, + types.LiveClientToolResponseOrDict, + types.FunctionResponseOrDict, + Sequence[types.FunctionResponseOrDict], +] + + +JSON_SCHEMA_TYPE_MAP = { + str: "string", + int: "integer", + float: "number", + bool: "boolean", + dict: "object", + list: "array", +} + + +def _build_parameters(arguments: Dict[str, Any]) -> types.SchemaDict: + properties: Dict[str, types.SchemaDict] = {} + required: List[str] = [] + + for arg_name, arg_info in arguments.items(): + py_type = arg_info.type + if py_type not in JSON_SCHEMA_TYPE_MAP: + raise ValueError(f"Unsupported type: {py_type}") + + prop: types.SchemaDict = { + "type": JSON_SCHEMA_TYPE_MAP[py_type], + "description": arg_info.description, + } + + if arg_info.choices: + prop["enum"] = arg_info.choices + + properties[arg_name] = prop + + if arg_info.default is inspect.Parameter.empty: + required.append(arg_name) + + parameters: types.SchemaDict = {"type": "object", "properties": properties} + + if required: + parameters["required"] = required + + return parameters + + +def _build_tools(fnc_ctx: Any) -> List[types.FunctionDeclarationDict]: + function_declarations: List[types.FunctionDeclarationDict] = [] + for fnc_info in fnc_ctx.ai_functions.values(): + parameters = _build_parameters(fnc_info.arguments) + + func_decl: types.FunctionDeclarationDict = { + "name": fnc_info.name, + "description": fnc_info.description, + "parameters": parameters, + } + + function_declarations.append(func_decl) + + return function_declarations diff --git a/livekit-plugins/livekit-plugins-google/livekit/plugins/google/beta/realtime/realtime_api.py b/livekit-plugins/livekit-plugins-google/livekit/plugins/google/beta/realtime/realtime_api.py new file mode 100644 index 000000000..40bb0d7a1 --- /dev/null +++ b/livekit-plugins/livekit-plugins-google/livekit/plugins/google/beta/realtime/realtime_api.py @@ -0,0 +1,424 @@ +from __future__ import annotations + +import asyncio +import base64 +import json +import os +from dataclasses import dataclass +from typing import AsyncIterable, Literal + 
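+# This module bridges LiveKit audio to the Gemini Live API: frames pushed
+# through GeminiRealtimeSession._push_audio() are sent over the live
+# connection as base64-encoded PCM, and streamed responses are surfaced
+# as GeminiContent via the "response_content_added" and
+# "response_content_done" events consumed by MultimodalAgent.
+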
+from livekit import rtc +from livekit.agents import llm, utils +from livekit.agents.llm.function_context import _create_ai_function_info + +from google import genai  # type: ignore +from google.genai.types import (  # type: ignore + FunctionResponse, + GenerationConfigDict, + LiveClientToolResponse, + LiveConnectConfigDict, + PrebuiltVoiceConfig, + SpeechConfig, + VoiceConfig, +) + +from ...log import logger +from .api_proto import ( + ClientEvents, + LiveAPIModels, + ResponseModality, + Voice, + _build_tools, +) + +EventTypes = Literal[ + "start_session", + "input_speech_started", + "response_content_added", + "response_content_done", + "function_calls_collected", + "function_calls_finished", + "function_calls_cancelled", +] + + +@dataclass +class GeminiContent: + response_id: str + item_id: str + output_index: int + content_index: int + text: str + audio: list[rtc.AudioFrame] + text_stream: AsyncIterable[str] + audio_stream: AsyncIterable[rtc.AudioFrame] + content_type: Literal["text", "audio"] + + +@dataclass +class Capabilities: + supports_truncate: bool + + +@dataclass +class ModelOptions: + model: LiveAPIModels | str + api_key: str | None + voice: Voice | str + response_modalities: ResponseModality + vertexai: bool + project: str | None + location: str | None + candidate_count: int + temperature: float | None + max_output_tokens: int | None + top_p: float | None + top_k: int | None + presence_penalty: float | None + frequency_penalty: float | None + instructions: str + + +class RealtimeModel: + def __init__( + self, + *, + instructions: str = "", + model: LiveAPIModels | str = "gemini-2.0-flash-exp", + api_key: str | None = None, + voice: Voice | str = "Puck", + modalities: ResponseModality = "AUDIO", + vertexai: bool = False, + project: str | None = None, + location: str | None = None, + candidate_count: int = 1, + temperature: float | None = None, + max_output_tokens: int | None = None, + top_p: float | None = None, + top_k: int | None = None, + presence_penalty: float | None = None, + frequency_penalty: float | None = None, + loop: asyncio.AbstractEventLoop | None = None, + ): + """ + Initializes a RealtimeModel instance for interacting with Google's Realtime API. + + Args: + instructions (str, optional): Initial system instructions for the model. Defaults to "". + api_key (str or None, optional): Google API key. If None, will attempt to read it from the environment variable GOOGLE_API_KEY. Not required when using VertexAI. + modalities (ResponseModality, optional): Response modality to use, either "AUDIO" or "TEXT". Defaults to "AUDIO". + model (str or None, optional): The name of the model to use. Defaults to "gemini-2.0-flash-exp". + voice (api_proto.Voice, optional): Voice setting for audio outputs. Defaults to "Puck". + temperature (float, optional): Sampling temperature for response generation. Defaults to None. + vertexai (bool, optional): Whether to use VertexAI for the API. Defaults to False. + project (str or None, optional): The project to use for the API (VertexAI only). Defaults to None. + location (str or None, optional): The location to use for the API (VertexAI only). Defaults to None. + candidate_count (int, optional): The number of candidate responses to generate. Defaults to 1.
+ top_p (float, optional): The top-p value for response generation + top_k (int, optional): The top-k value for response generation + presence_penalty (float, optional): The presence penalty for response generation + frequency_penalty (float, optional): The frequency penalty for response generation + loop (asyncio.AbstractEventLoop or None, optional): Event loop to use for async operations. If None, the current event loop is used. + + Raises: + ValueError: If the API key is not provided and cannot be found in environment variables. + """ + super().__init__() + self._capabilities = Capabilities( + supports_truncate=False, + ) + self._model = model + self._loop = loop or asyncio.get_event_loop() + self._api_key = api_key or os.environ.get("GOOGLE_API_KEY") + self._vertexai = vertexai + self._project_id = project or os.environ.get("GOOGLE_PROJECT") + self._location = location or os.environ.get("GOOGLE_LOCATION") + if self._api_key is None and not self._vertexai: + raise ValueError("GOOGLE_API_KEY is not set") + + self._rt_sessions: list[GeminiRealtimeSession] = [] + self._opts = ModelOptions( + model=model, + api_key=api_key, + voice=voice, + response_modalities=modalities, + vertexai=vertexai, + project=project, + location=location, + candidate_count=candidate_count, + temperature=temperature, + max_output_tokens=max_output_tokens, + top_p=top_p, + top_k=top_k, + presence_penalty=presence_penalty, + frequency_penalty=frequency_penalty, + instructions=instructions, + ) + + @property + def sessions(self) -> list[GeminiRealtimeSession]: + return self._rt_sessions + + @property + def capabilities(self) -> Capabilities: + return self._capabilities + + def session( + self, + *, + chat_ctx: llm.ChatContext | None = None, + fnc_ctx: llm.FunctionContext | None = None, + ) -> GeminiRealtimeSession: + session = GeminiRealtimeSession( + opts=self._opts, + chat_ctx=chat_ctx or llm.ChatContext(), + fnc_ctx=fnc_ctx, + loop=self._loop, + ) + self._rt_sessions.append(session) + + return session + + async def aclose(self) -> None: + for session in self._rt_sessions: + await session.aclose() + + +class GeminiRealtimeSession(utils.EventEmitter[EventTypes]): + def __init__( + self, + *, + opts: ModelOptions, + chat_ctx: llm.ChatContext, + fnc_ctx: llm.FunctionContext | None, + loop: asyncio.AbstractEventLoop, + ): + """ + Initializes a GeminiRealtimeSession instance for interacting with Google's Realtime API. + + Args: + opts (ModelOptions): The model options for the session. + chat_ctx (llm.ChatContext): The chat context for the session. + fnc_ctx (llm.FunctionContext or None): The function context for the session. + loop (asyncio.AbstractEventLoop): The event loop for the session. 
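+
+        Note:
+            Sessions are normally obtained from RealtimeModel.session()
+            rather than constructed directly. A minimal sketch, assuming a
+            configured model:
+
+                model = RealtimeModel(voice="Puck", instructions="...")
+                session = model.session(chat_ctx=llm.ChatContext())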
+ """ + super().__init__() + self._loop = loop + self._opts = opts + self._chat_ctx = chat_ctx + self._fnc_ctx = fnc_ctx + self._fnc_tasks = utils.aio.TaskSet() + + tools = [] + if self._fnc_ctx is not None: + functions = _build_tools(self._fnc_ctx) + tools.append({"function_declarations": functions}) + + self._config = LiveConnectConfigDict( + model=self._opts.model, + response_modalities=self._opts.response_modalities, + generation_config=GenerationConfigDict( + candidate_count=self._opts.candidate_count, + temperature=self._opts.temperature, + max_output_tokens=self._opts.max_output_tokens, + top_p=self._opts.top_p, + top_k=self._opts.top_k, + presence_penalty=self._opts.presence_penalty, + frequency_penalty=self._opts.frequency_penalty, + ), + system_instruction=self._opts.instructions, + speech_config=SpeechConfig( + voice_config=VoiceConfig( + prebuilt_voice_config=PrebuiltVoiceConfig( + voice_name=self._opts.voice + ) + ) + ), + tools=tools, + ) + self._client = genai.Client( + http_options={"api_version": "v1alpha"}, + api_key=self._opts.api_key, + vertexai=self._opts.vertexai, + project=self._opts.project, + location=self._opts.location, + ) + self._main_atask = asyncio.create_task( + self._main_task(), name="gemini-realtime-session" + ) + # dummy task to wait for the session to be initialized # TODO: sync chat ctx + self._init_sync_task = asyncio.create_task( + asyncio.sleep(0), name="gemini-realtime-session-init" + ) + self._send_ch = utils.aio.Chan[ClientEvents]() + self._active_response_id = None + + async def aclose(self) -> None: + if self._send_ch.closed: + return + + self._send_ch.close() + await self._main_atask + + @property + def fnc_ctx(self) -> llm.FunctionContext | None: + return self._fnc_ctx + + @fnc_ctx.setter + def fnc_ctx(self, value: llm.FunctionContext | None) -> None: + self._fnc_ctx = value + + def _push_audio(self, frame: rtc.AudioFrame) -> None: + data = base64.b64encode(frame.data).decode("utf-8") + self._queue_msg({"mime_type": "audio/pcm", "data": data}) + + def _queue_msg(self, msg: dict) -> None: + self._send_ch.send_nowait(msg) + + def chat_ctx_copy(self) -> llm.ChatContext: + return self._chat_ctx.copy() + + async def set_chat_ctx(self, ctx: llm.ChatContext) -> None: + self._chat_ctx = ctx.copy() + + @utils.log_exceptions(logger=logger) + async def _main_task(self): + @utils.log_exceptions(logger=logger) + async def _send_task(): + async for msg in self._send_ch: + await self._session.send(msg) + + await self._session.send(".", end_of_turn=True) + + @utils.log_exceptions(logger=logger) + async def _recv_task(): + while True: + async for response in self._session.receive(): + if self._active_response_id is None: + self._active_response_id = utils.shortuuid() + text_stream = utils.aio.Chan[str]() + audio_stream = utils.aio.Chan[rtc.AudioFrame]() + content = GeminiContent( + response_id=self._active_response_id, + item_id=self._active_response_id, + output_index=0, + content_index=0, + text="", + audio=[], + text_stream=text_stream, + audio_stream=audio_stream, + content_type=self._opts.response_modalities, + ) + self.emit("response_content_added", content) + + server_content = response.server_content + if server_content: + model_turn = server_content.model_turn + if model_turn: + for part in model_turn.parts: + if part.text: + content.text_stream.send_nowait(part.text) + if part.inline_data: + frame = rtc.AudioFrame( + data=part.inline_data.data, + sample_rate=24000, + num_channels=1, + samples_per_channel=len(part.inline_data.data) + // 2, + ) + 
content.audio_stream.send_nowait(frame) + + if server_content.interrupted or server_content.turn_complete: + for stream in (content.text_stream, content.audio_stream): + if isinstance(stream, utils.aio.Chan): + stream.close() + + if server_content.interrupted: + self.emit("input_speech_started") + elif server_content.turn_complete: + self.emit("response_content_done", content) + + self._active_response_id = None + + if response.tool_call: + if self._fnc_ctx is None: + raise ValueError("Function context is not set") + fnc_calls = [] + for fnc_call in response.tool_call.function_calls: + fnc_call_info = _create_ai_function_info( + self._fnc_ctx, + fnc_call.id, + fnc_call.name, + json.dumps(fnc_call.args), + ) + fnc_calls.append(fnc_call_info) + + self.emit("function_calls_collected", fnc_calls) + + for fnc_call_info in fnc_calls: + self._fnc_tasks.create_task( + self._run_fnc_task(fnc_call_info, content.item_id) + ) + + # Handle function call cancellations + if response.tool_call_cancellation: + logger.warning( + "function call cancelled", + extra={ + "function_call_ids": response.tool_call_cancellation.function_call_ids, + }, + ) + self.emit( + "function_calls_cancelled", + response.tool_call_cancellation.function_call_ids, + ) + + async with self._client.aio.live.connect( + model=self._opts.model, config=self._config + ) as session: + self._session = session + tasks = [ + asyncio.create_task(_send_task(), name="gemini-realtime-send"), + asyncio.create_task(_recv_task(), name="gemini-realtime-recv"), + ] + + try: + await asyncio.gather(*tasks) + finally: + await utils.aio.gracefully_cancel(*tasks) + await self._session.close() + + @utils.log_exceptions(logger=logger) + async def _run_fnc_task(self, fnc_call_info: llm.FunctionCallInfo, item_id: str): + logger.debug( + "executing ai function", + extra={ + "function": fnc_call_info.function_info.name, + }, + ) + + called_fnc = fnc_call_info.execute() + try: + await called_fnc.task + except Exception as e: + logger.exception( + "error executing ai function", + extra={ + "function": fnc_call_info.function_info.name, + }, + exc_info=e, + ) + tool_call = llm.ChatMessage.create_tool_from_called_function(called_fnc) + if tool_call.content is not None: + tool_response = LiveClientToolResponse( + function_responses=[ + FunctionResponse( + name=tool_call.name, + id=tool_call.tool_call_id, + response={"result": tool_call.content}, + ) + ] + ) + await self._session.send(tool_response) + + self.emit("function_calls_finished", [called_fnc]) diff --git a/livekit-plugins/livekit-plugins-google/setup.py b/livekit-plugins/livekit-plugins-google/setup.py index 87646895f..0db8addce 100644 --- a/livekit-plugins/livekit-plugins-google/setup.py +++ b/livekit-plugins/livekit-plugins-google/setup.py @@ -51,6 +51,7 @@ "google-auth >= 2, < 3", "google-cloud-speech >= 2, < 3", "google-cloud-texttospeech >= 2, < 3", + "google-genai >= 0.3.0", "livekit-agents>=0.12.3", ], package_data={"livekit.plugins.google": ["py.typed"]}, diff --git a/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/realtime/__init__.py b/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/realtime/__init__.py index 471deef37..fbb453609 100644 --- a/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/realtime/__init__.py +++ b/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/realtime/__init__.py @@ -2,8 +2,6 @@ from .realtime_model import ( DEFAULT_INPUT_AUDIO_TRANSCRIPTION, DEFAULT_SERVER_VAD_OPTIONS, - InputTranscriptionCompleted, - 
InputTranscriptionFailed, InputTranscriptionOptions, RealtimeContent, RealtimeError, @@ -17,8 +15,6 @@ ) __all__ = [ - "InputTranscriptionCompleted", - "InputTranscriptionFailed", "RealtimeContent", "RealtimeOutput", "RealtimeResponse", diff --git a/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/realtime/realtime_model.py b/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/realtime/realtime_model.py index 26bc2649b..10d7abc1f 100644 --- a/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/realtime/realtime_model.py +++ b/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/realtime/realtime_model.py @@ -4,6 +4,7 @@ import base64 import os import time +import weakref from copy import deepcopy from dataclasses import dataclass from typing import AsyncIterable, Literal, Optional, Union, cast, overload @@ -105,8 +106,11 @@ class RealtimeToolCall: """id of the tool call""" -# TODO(theomonnom): add the content type directly inside RealtimeContent? -# text/audio/transcript? +@dataclass +class Capabilities: + supports_truncate: bool + + @dataclass class RealtimeContent: response_id: str @@ -284,6 +288,9 @@ def __init__( ValueError: If the API key is not provided and cannot be found in environment variables. """ super().__init__() + self._capabilities = Capabilities( + supports_truncate=True, + ) self._base_url = base_url is_azure = ( @@ -322,7 +329,7 @@ def __init__( ) self._loop = loop or asyncio.get_event_loop() - self._rt_sessions: list[RealtimeSession] = [] + self._rt_sessions = weakref.WeakSet[RealtimeSession]() self._http_session = http_session @classmethod @@ -427,9 +434,13 @@ def _ensure_session(self) -> aiohttp.ClientSession: return self._http_session @property - def sessions(self) -> list[RealtimeSession]: + def sessions(self) -> weakref.WeakSet[RealtimeSession]: return self._rt_sessions + @property + def capabilities(self) -> Capabilities: + return self._capabilities + def session( self, *, @@ -475,7 +486,7 @@ def session( http_session=self._ensure_session(), loop=self._loop, ) - self._rt_sessions.append(new_session) + self._rt_sessions.add(new_session) return new_session async def aclose(self) -> None: @@ -854,6 +865,9 @@ def conversation(self) -> Conversation: def input_audio_buffer(self) -> InputAudioBuffer: return RealtimeSession.InputAudioBuffer(self) + def _push_audio(self, frame: rtc.AudioFrame) -> None: + self.input_audio_buffer.append(frame) + @property def response(self) -> Response: return RealtimeSession.Response(self) @@ -1023,6 +1037,15 @@ def _recover_from_text_response(self, item_id: str | None = None) -> None: self.conversation.item.create(self._create_empty_user_audio_message(1.0)) self.response.create(on_duplicate="keep_both") + def _truncate_conversation_item( + self, item_id: str, content_index: int, audio_end_ms: int + ) -> None: + self.conversation.item.truncate( + item_id=item_id, + content_index=content_index, + audio_end_ms=audio_end_ms, + ) + def _update_conversation_item_content( self, item_id: str, content: llm.ChatContent | list[llm.ChatContent] | None ) -> None: @@ -1662,7 +1685,7 @@ async def _run_fnc_task(self, fnc_call_info: llm.FunctionCallInfo, item_id: str) "function": fnc_call_info.function_info.name, }, ) - if called_fnc.result is not None: + if tool_call.content is not None: create_fut = self.conversation.item.create( tool_call, previous_item_id=item_id, From bcbe7dd0f8ffdb55b2741d1b17637f1016a71030 Mon Sep 17 00:00:00 2001 From: Long Chen Date: Tue, 31 Dec 2024 11:56:51 +0800 Subject: [PATCH 43/46] 
avoid duplicate say in function call example (#1317) --- .../function_calling_weather.py | 31 ++++++++++++------- 1 file changed, 19 insertions(+), 12 deletions(-) diff --git a/examples/voice-pipeline-agent/function_calling_weather.py b/examples/voice-pipeline-agent/function_calling_weather.py index 7f1ba5fa5..88358e419 100644 --- a/examples/voice-pipeline-agent/function_calling_weather.py +++ b/examples/voice-pipeline-agent/function_calling_weather.py @@ -39,18 +39,25 @@ async def get_weather( # that it might take awhile: # Option 1: you can use .say filler message immediately after the call is triggered # Option 2: you can prompt the agent to return a text response when it's making a function call - call_ctx = AgentCallContext.get_current() - filler_messages = [ - "Let me check the weather in {location} for you.", - "Let me see what the weather is like in {location} right now.", - # LLM will complete this sentence if it is added to the end of the chat context - "The current weather in {location} is ", - ] - message = random.choice(filler_messages).format(location=location) - - # NOTE: set add_to_chat_ctx=True will add the message to the end - # of the chat context of the function call for answer synthesis - speech_handle = await call_ctx.agent.say(message, add_to_chat_ctx=True) # noqa: F841 + agent = AgentCallContext.get_current().agent + + if ( + not agent.chat_ctx.messages + or agent.chat_ctx.messages[-1].role != "assistant" + ): + # skip if assistant already said something + filler_messages = [ + "Let me check the weather in {location} for you.", + "Let me see what the weather is like in {location} right now.", + # LLM will complete this sentence if it is added to the end of the chat context + "The current weather in {location} is ", + ] + message = random.choice(filler_messages).format(location=location) + logger.info(f"saying filler message: {message}") + + # NOTE: set add_to_chat_ctx=True will add the message to the end + # of the chat context of the function call for answer synthesis + speech_handle = await agent.say(message, add_to_chat_ctx=True) # noqa: F841 logger.info(f"getting weather for {location}") url = f"https://wttr.in/{urllib.parse.quote(location)}?format=%C+%t" From aedbb82a5130142c760a482e264a44f135ae5f65 Mon Sep 17 00:00:00 2001 From: David Zhao Date: Mon, 30 Dec 2024 21:12:46 -0800 Subject: [PATCH 44/46] avoid warnings when function depth matches limit (#1316) --- .changeset/quiet-dots-fly.md | 5 +++++ .../function_calling_weather.py | 7 ++++++- .../livekit/agents/pipeline/pipeline_agent.py | 15 ++++++++------- 3 files changed, 19 insertions(+), 8 deletions(-) create mode 100644 .changeset/quiet-dots-fly.md diff --git a/.changeset/quiet-dots-fly.md b/.changeset/quiet-dots-fly.md new file mode 100644 index 000000000..3f7208c3e --- /dev/null +++ b/.changeset/quiet-dots-fly.md @@ -0,0 +1,5 @@ +--- +"livekit-agents": patch +--- + +avoid warnings when function depth matches limit diff --git a/examples/voice-pipeline-agent/function_calling_weather.py b/examples/voice-pipeline-agent/function_calling_weather.py index 88358e419..f39705f17 100644 --- a/examples/voice-pipeline-agent/function_calling_weather.py +++ b/examples/voice-pipeline-agent/function_calling_weather.py @@ -1,5 +1,6 @@ import logging import random +import re import urllib from typing import Annotated @@ -35,6 +36,9 @@ async def get_weather( ], ): """Called when the user asks about the weather. 
This function will return the weather for the given location.""" + # Clean the location string of special characters + location = re.sub(r"[^a-zA-Z0-9]+", " ", location).strip() + # When a function call is running, there are a couple of options to inform the user # that it might take awhile: # Option 1: you can use .say filler message immediately after the call is triggered @@ -69,6 +73,7 @@ async def get_weather( weather_data = ( f"The weather in {location} is {await response.text()}." ) + logger.info(f"weather data: {weather_data}") else: raise Exception( f"Failed to get weather data, status code: {response.status}" @@ -92,7 +97,7 @@ async def entrypoint(ctx: JobContext): "You are a weather assistant created by LiveKit. Your interface with users will be voice. " "You will provide weather information for a given location. " # when using option 1, you can suppress from the agent with prompt - "do not say anything while waiting for the function call to complete." + "do not return any text while calling the function." # uncomment this to use option 2 # "when performing function calls, let user know that you are checking the weather." ), diff --git a/livekit-agents/livekit/agents/pipeline/pipeline_agent.py b/livekit-agents/livekit/agents/pipeline/pipeline_agent.py index 872bd5d4b..e6f65e772 100644 --- a/livekit-agents/livekit/agents/pipeline/pipeline_agent.py +++ b/livekit-agents/livekit/agents/pipeline/pipeline_agent.py @@ -966,13 +966,14 @@ async def _execute_function_calls() -> None: and new_speech_handle.fnc_nested_depth >= self._opts.max_nested_fnc_calls ): - logger.warning( - "max function calls nested depth reached, not propagating fnc ctx", - extra={ - "speech_id": speech_handle.id, - "fnc_nested_depth": speech_handle.fnc_nested_depth, - }, - ) + if len(fnc_ctx.ai_functions) > 1: + logger.info( + "max function calls nested depth reached, dropping function context. 
increase max_nested_fnc_calls to enable additional nesting.", + extra={ + "speech_id": speech_handle.id, + "fnc_nested_depth": speech_handle.fnc_nested_depth, + }, + ) fnc_ctx = None answer_llm_stream = self._llm.chat( chat_ctx=chat_ctx, From 924b79e0a0305be5fa3c77cfbd8fcab63bfd7de7 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Mon, 30 Dec 2024 23:15:36 -0600 Subject: [PATCH 45/46] Version Packages (#1286) Co-authored-by: github-actions[bot] --- .changeset/eight-lemons-hear.md | 5 ----- .changeset/giant-ways-invite.md | 8 -------- .changeset/gorgeous-sheep-grow.md | 7 ------- .changeset/hot-trainers-press.md | 5 ----- .changeset/khaki-candles-rest.md | 5 ----- .changeset/khaki-stingrays-train.md | 5 ----- .changeset/quiet-dots-fly.md | 5 ----- .changeset/silent-oranges-warn.md | 5 ----- .changeset/slow-walls-bake.md | 5 ----- .changeset/thirty-coats-tie.md | 7 ------- .changeset/tricky-spiders-change.md | 5 ----- .../participant-entrypoint/requirements.txt | 2 +- examples/simple-color/requirements.txt | 2 +- examples/speech-to-text/requirements.txt | 4 ++-- examples/text-to-speech/requirements.txt | 4 ++-- examples/voice-pipeline-agent/requirements.txt | 6 +++--- livekit-agents/CHANGELOG.md | 18 ++++++++++++++++++ livekit-agents/livekit/agents/version.py | 2 +- livekit-agents/package.json | 2 +- .../livekit-plugins-anthropic/CHANGELOG.md | 6 ++++++ .../livekit/plugins/anthropic/version.py | 2 +- .../livekit-plugins-anthropic/package.json | 2 +- .../livekit-plugins-assemblyai/CHANGELOG.md | 8 ++++++++ .../livekit/plugins/assemblyai/version.py | 2 +- .../livekit-plugins-assemblyai/package.json | 2 +- .../livekit-plugins-azure/CHANGELOG.md | 6 ++++++ .../livekit/plugins/azure/version.py | 2 +- .../livekit-plugins-azure/package.json | 2 +- .../livekit-plugins-deepgram/CHANGELOG.md | 6 ++++++ .../livekit/plugins/deepgram/version.py | 2 +- .../livekit-plugins-deepgram/package.json | 2 +- .../livekit-plugins-google/CHANGELOG.md | 10 ++++++++++ .../livekit/plugins/google/version.py | 2 +- .../livekit-plugins-google/package.json | 2 +- .../livekit-plugins-openai/CHANGELOG.md | 8 ++++++++ .../livekit/plugins/openai/version.py | 2 +- .../livekit-plugins-openai/package.json | 2 +- .../livekit-plugins-playai/CHANGELOG.md | 6 ++++++ .../livekit/plugins/playai/version.py | 2 +- .../livekit-plugins-playai/package.json | 2 +- .../livekit-plugins-turn-detector/CHANGELOG.md | 6 ++++++ .../livekit/plugins/turn_detector/version.py | 2 +- .../livekit-plugins-turn-detector/package.json | 2 +- 43 files changed, 101 insertions(+), 89 deletions(-) delete mode 100644 .changeset/eight-lemons-hear.md delete mode 100644 .changeset/giant-ways-invite.md delete mode 100644 .changeset/gorgeous-sheep-grow.md delete mode 100644 .changeset/hot-trainers-press.md delete mode 100644 .changeset/khaki-candles-rest.md delete mode 100644 .changeset/khaki-stingrays-train.md delete mode 100644 .changeset/quiet-dots-fly.md delete mode 100644 .changeset/silent-oranges-warn.md delete mode 100644 .changeset/slow-walls-bake.md delete mode 100644 .changeset/thirty-coats-tie.md delete mode 100644 .changeset/tricky-spiders-change.md diff --git a/.changeset/eight-lemons-hear.md b/.changeset/eight-lemons-hear.md deleted file mode 100644 index 38a3f1b1b..000000000 --- a/.changeset/eight-lemons-hear.md +++ /dev/null @@ -1,5 +0,0 @@ ---- -"livekit-agents": patch ---- - -expose worker_id in jobcontext diff --git a/.changeset/giant-ways-invite.md b/.changeset/giant-ways-invite.md 
deleted file mode 100644 index 5644cb581..000000000 --- a/.changeset/giant-ways-invite.md +++ /dev/null @@ -1,8 +0,0 @@ ---- -"livekit-plugins-assemblyai": patch -"livekit-plugins-deepgram": patch -"livekit-plugins-google": patch -"livekit-plugins-azure": patch ---- - -fix: Ensure STT exceptions are being propagated diff --git a/.changeset/gorgeous-sheep-grow.md b/.changeset/gorgeous-sheep-grow.md deleted file mode 100644 index 5bdc7cc5f..000000000 --- a/.changeset/gorgeous-sheep-grow.md +++ /dev/null @@ -1,7 +0,0 @@ ---- -"livekit-plugins-anthropic": patch -"livekit-plugins-openai": patch -"livekit-agents": patch ---- - -improved handling of LLM errors, do not retry if already began diff --git a/.changeset/hot-trainers-press.md b/.changeset/hot-trainers-press.md deleted file mode 100644 index 326150914..000000000 --- a/.changeset/hot-trainers-press.md +++ /dev/null @@ -1,5 +0,0 @@ ---- -"livekit-plugins-assemblyai": patch ---- - -assemblyai: encode boost words diff --git a/.changeset/khaki-candles-rest.md b/.changeset/khaki-candles-rest.md deleted file mode 100644 index 91afec21c..000000000 --- a/.changeset/khaki-candles-rest.md +++ /dev/null @@ -1,5 +0,0 @@ ---- -"livekit-agents": patch ---- - -Do not pass function context if at max depth diff --git a/.changeset/khaki-stingrays-train.md b/.changeset/khaki-stingrays-train.md deleted file mode 100644 index ca99f9fa7..000000000 --- a/.changeset/khaki-stingrays-train.md +++ /dev/null @@ -1,5 +0,0 @@ ---- -"livekit-plugins-playai": patch ---- - -Support PlayAI TTS engine. diff --git a/.changeset/quiet-dots-fly.md b/.changeset/quiet-dots-fly.md deleted file mode 100644 index 3f7208c3e..000000000 --- a/.changeset/quiet-dots-fly.md +++ /dev/null @@ -1,5 +0,0 @@ ---- -"livekit-agents": patch ---- - -avoid warnings when function depth matches limit diff --git a/.changeset/silent-oranges-warn.md b/.changeset/silent-oranges-warn.md deleted file mode 100644 index e7bcd0189..000000000 --- a/.changeset/silent-oranges-warn.md +++ /dev/null @@ -1,5 +0,0 @@ ---- -"livekit-agents": patch ---- - -improve interruption handling, avoid agent from getting stuck diff --git a/.changeset/slow-walls-bake.md b/.changeset/slow-walls-bake.md deleted file mode 100644 index 11df23f75..000000000 --- a/.changeset/slow-walls-bake.md +++ /dev/null @@ -1,5 +0,0 @@ ---- -"livekit-agents": patch ---- - -add manual interrupt method for pipeline agent diff --git a/.changeset/thirty-coats-tie.md b/.changeset/thirty-coats-tie.md deleted file mode 100644 index f0c6a9e67..000000000 --- a/.changeset/thirty-coats-tie.md +++ /dev/null @@ -1,7 +0,0 @@ ---- -"livekit-plugins-google": minor -"livekit-plugins-openai": patch -"livekit-agents": patch ---- - -make multimodal class generic and support gemini live api diff --git a/.changeset/tricky-spiders-change.md b/.changeset/tricky-spiders-change.md deleted file mode 100644 index a017624fc..000000000 --- a/.changeset/tricky-spiders-change.md +++ /dev/null @@ -1,5 +0,0 @@ ---- -"livekit-plugins-turn-detector": patch ---- - -fix int32/64 errors on Windows diff --git a/examples/participant-entrypoint/requirements.txt b/examples/participant-entrypoint/requirements.txt index a92be36b8..77c8959d1 100644 --- a/examples/participant-entrypoint/requirements.txt +++ b/examples/participant-entrypoint/requirements.txt @@ -1,2 +1,2 @@ -livekit-agents>=0.12.5 +livekit-agents>=0.12.6 python-dotenv~=1.0 diff --git a/examples/simple-color/requirements.txt b/examples/simple-color/requirements.txt index a92be36b8..77c8959d1 100644 --- 
a/examples/simple-color/requirements.txt
+++ b/examples/simple-color/requirements.txt
@@ -1,2 +1,2 @@
-livekit-agents>=0.12.5
+livekit-agents>=0.12.6
 python-dotenv~=1.0
diff --git a/examples/speech-to-text/requirements.txt b/examples/speech-to-text/requirements.txt
index e58a682b3..b9f8e9fb0 100644
--- a/examples/speech-to-text/requirements.txt
+++ b/examples/speech-to-text/requirements.txt
@@ -1,3 +1,3 @@
-livekit-agents>=0.12.5
-livekit-plugins-deepgram>=0.6.15
+livekit-agents>=0.12.6
+livekit-plugins-deepgram>=0.6.16
 python-dotenv~=1.0
diff --git a/examples/text-to-speech/requirements.txt b/examples/text-to-speech/requirements.txt
index f025ab277..f03f7fa49 100644
--- a/examples/text-to-speech/requirements.txt
+++ b/examples/text-to-speech/requirements.txt
@@ -1,5 +1,5 @@
-livekit-agents>=0.12.5
-livekit-plugins-openai>=0.10.12
+livekit-agents>=0.12.6
+livekit-plugins-openai>=0.10.13
 livekit-plugins-cartesia>=0.4.5
 livekit-plugins-elevenlabs>=0.7.9
 python-dotenv~=1.0
diff --git a/examples/voice-pipeline-agent/requirements.txt b/examples/voice-pipeline-agent/requirements.txt
index 481cb0136..cf97c8314 100644
--- a/examples/voice-pipeline-agent/requirements.txt
+++ b/examples/voice-pipeline-agent/requirements.txt
@@ -1,6 +1,6 @@
-livekit-agents>=0.12.5
-livekit-plugins-deepgram>=0.6.15
-livekit-plugins-google>=0.8.1
+livekit-agents>=0.12.6
+livekit-plugins-deepgram>=0.6.16
+livekit-plugins-google>=0.9.0
 livekit-plugins-openai[vertex]>=0.10.10
 livekit-plugins-silero>=0.7.4
 livekit-plugins-rag>=0.2.3
diff --git a/livekit-agents/CHANGELOG.md b/livekit-agents/CHANGELOG.md
index b04f10f1d..d9c3770d4 100644
--- a/livekit-agents/CHANGELOG.md
+++ b/livekit-agents/CHANGELOG.md
@@ -1,5 +1,23 @@
 # livekit-agents
 
+## 0.12.6
+
+### Patch Changes
+
+- expose worker_id in jobcontext - [#1307](https://github.com/livekit/agents/pull/1307) ([@s-hamdananwar](https://github.com/s-hamdananwar))
+
+- improved handling of LLM errors, do not retry if already began - [#1298](https://github.com/livekit/agents/pull/1298) ([@davidzhao](https://github.com/davidzhao))
+
+- Do not pass function context if at max depth - [#1306](https://github.com/livekit/agents/pull/1306) ([@martin-purplefish](https://github.com/martin-purplefish))
+
+- avoid warnings when function depth matches limit - [#1316](https://github.com/livekit/agents/pull/1316) ([@davidzhao](https://github.com/davidzhao))
+
+- improve interruption handling, avoid agent from getting stuck - [#1290](https://github.com/livekit/agents/pull/1290) ([@davidzhao](https://github.com/davidzhao))
+
+- add manual interrupt method for pipeline agent - [#1294](https://github.com/livekit/agents/pull/1294) ([@longcw](https://github.com/longcw))
+
+- make multimodal class generic and support gemini live api - [#1240](https://github.com/livekit/agents/pull/1240) ([@jayeshp19](https://github.com/jayeshp19))
+
 ## 0.12.5
 
 ### Patch Changes
diff --git a/livekit-agents/livekit/agents/version.py b/livekit-agents/livekit/agents/version.py
index 93e989e31..0696f486e 100644
--- a/livekit-agents/livekit/agents/version.py
+++ b/livekit-agents/livekit/agents/version.py
@@ -12,4 +12,4 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-__version__ = "0.12.5"
+__version__ = "0.12.6"
diff --git a/livekit-agents/package.json b/livekit-agents/package.json
index 4986b2889..c321ac852 100644
--- a/livekit-agents/package.json
+++ b/livekit-agents/package.json
@@ -1,5 +1,5 @@
 {
   "name": "livekit-agents",
   "private": true,
-  "version": "0.12.5"
+  "version": "0.12.6"
 }
diff --git a/livekit-plugins/livekit-plugins-anthropic/CHANGELOG.md b/livekit-plugins/livekit-plugins-anthropic/CHANGELOG.md
index f540e9641..3b75922f3 100644
--- a/livekit-plugins/livekit-plugins-anthropic/CHANGELOG.md
+++ b/livekit-plugins/livekit-plugins-anthropic/CHANGELOG.md
@@ -1,5 +1,11 @@
 # livekit-plugins-anthropic
 
+## 0.2.9
+
+### Patch Changes
+
+- improved handling of LLM errors, do not retry if already began - [#1298](https://github.com/livekit/agents/pull/1298) ([@davidzhao](https://github.com/davidzhao))
+
 ## 0.2.8
 
 ### Patch Changes
diff --git a/livekit-plugins/livekit-plugins-anthropic/livekit/plugins/anthropic/version.py b/livekit-plugins/livekit-plugins-anthropic/livekit/plugins/anthropic/version.py
index e558b382c..bd4a8d004 100644
--- a/livekit-plugins/livekit-plugins-anthropic/livekit/plugins/anthropic/version.py
+++ b/livekit-plugins/livekit-plugins-anthropic/livekit/plugins/anthropic/version.py
@@ -12,4 +12,4 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-__version__ = "0.2.8"
+__version__ = "0.2.9"
diff --git a/livekit-plugins/livekit-plugins-anthropic/package.json b/livekit-plugins/livekit-plugins-anthropic/package.json
index ad2ba63a2..eb8866886 100644
--- a/livekit-plugins/livekit-plugins-anthropic/package.json
+++ b/livekit-plugins/livekit-plugins-anthropic/package.json
@@ -1,5 +1,5 @@
 {
   "name": "livekit-plugins-anthropic",
   "private": true,
-  "version": "0.2.8"
+  "version": "0.2.9"
 }
diff --git a/livekit-plugins/livekit-plugins-assemblyai/CHANGELOG.md b/livekit-plugins/livekit-plugins-assemblyai/CHANGELOG.md
index 5a5f68908..71d63e941 100644
--- a/livekit-plugins/livekit-plugins-assemblyai/CHANGELOG.md
+++ b/livekit-plugins/livekit-plugins-assemblyai/CHANGELOG.md
@@ -1,5 +1,13 @@
 # livekit-plugins-assemblyai
 
+## 0.2.2
+
+### Patch Changes
+
+- fix: Ensure STT exceptions are being propagated - [#1291](https://github.com/livekit/agents/pull/1291) ([@davidzhao](https://github.com/davidzhao))
+
+- assemblyai: encode boost words - [#1284](https://github.com/livekit/agents/pull/1284) ([@jmugicagonz](https://github.com/jmugicagonz))
+
 ## 0.2.1
 
 ### Patch Changes
diff --git a/livekit-plugins/livekit-plugins-assemblyai/livekit/plugins/assemblyai/version.py b/livekit-plugins/livekit-plugins-assemblyai/livekit/plugins/assemblyai/version.py
index 875ee5214..2985d9da1 100644
--- a/livekit-plugins/livekit-plugins-assemblyai/livekit/plugins/assemblyai/version.py
+++ b/livekit-plugins/livekit-plugins-assemblyai/livekit/plugins/assemblyai/version.py
@@ -12,4 +12,4 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-__version__ = "0.2.1"
+__version__ = "0.2.2"
diff --git a/livekit-plugins/livekit-plugins-assemblyai/package.json b/livekit-plugins/livekit-plugins-assemblyai/package.json
index 992070917..8b0962663 100644
--- a/livekit-plugins/livekit-plugins-assemblyai/package.json
+++ b/livekit-plugins/livekit-plugins-assemblyai/package.json
@@ -1,5 +1,5 @@
 {
   "name": "livekit-plugins-assemblyai",
   "private": true,
-  "version": "0.2.1"
+  "version": "0.2.2"
 }
diff --git a/livekit-plugins/livekit-plugins-azure/CHANGELOG.md b/livekit-plugins/livekit-plugins-azure/CHANGELOG.md
index 5d4ab532b..414181cbd 100644
--- a/livekit-plugins/livekit-plugins-azure/CHANGELOG.md
+++ b/livekit-plugins/livekit-plugins-azure/CHANGELOG.md
@@ -1,5 +1,11 @@
 # livekit-plugins-azure
 
+## 0.5.2
+
+### Patch Changes
+
+- fix: Ensure STT exceptions are being propagated - [#1291](https://github.com/livekit/agents/pull/1291) ([@davidzhao](https://github.com/davidzhao))
+
 ## 0.5.1
 
 ### Patch Changes
diff --git a/livekit-plugins/livekit-plugins-azure/livekit/plugins/azure/version.py b/livekit-plugins/livekit-plugins-azure/livekit/plugins/azure/version.py
index 79283902f..ec65e487a 100644
--- a/livekit-plugins/livekit-plugins-azure/livekit/plugins/azure/version.py
+++ b/livekit-plugins/livekit-plugins-azure/livekit/plugins/azure/version.py
@@ -12,4 +12,4 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-__version__ = "0.5.1"
+__version__ = "0.5.2"
diff --git a/livekit-plugins/livekit-plugins-azure/package.json b/livekit-plugins/livekit-plugins-azure/package.json
index cdd81c035..45561032c 100644
--- a/livekit-plugins/livekit-plugins-azure/package.json
+++ b/livekit-plugins/livekit-plugins-azure/package.json
@@ -1,5 +1,5 @@
 {
   "name": "livekit-plugins-azure",
   "private": true,
-  "version": "0.5.1"
+  "version": "0.5.2"
 }
diff --git a/livekit-plugins/livekit-plugins-deepgram/CHANGELOG.md b/livekit-plugins/livekit-plugins-deepgram/CHANGELOG.md
index 9c624c19f..617d61f38 100644
--- a/livekit-plugins/livekit-plugins-deepgram/CHANGELOG.md
+++ b/livekit-plugins/livekit-plugins-deepgram/CHANGELOG.md
@@ -1,5 +1,11 @@
 # livekit-plugins-deepgram
 
+## 0.6.16
+
+### Patch Changes
+
+- fix: Ensure STT exceptions are being propagated - [#1291](https://github.com/livekit/agents/pull/1291) ([@davidzhao](https://github.com/davidzhao))
+
 ## 0.6.15
 
 ### Patch Changes
diff --git a/livekit-plugins/livekit-plugins-deepgram/livekit/plugins/deepgram/version.py b/livekit-plugins/livekit-plugins-deepgram/livekit/plugins/deepgram/version.py
index c83922d4e..e1df9b637 100644
--- a/livekit-plugins/livekit-plugins-deepgram/livekit/plugins/deepgram/version.py
+++ b/livekit-plugins/livekit-plugins-deepgram/livekit/plugins/deepgram/version.py
@@ -12,4 +12,4 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-__version__ = "0.6.15"
+__version__ = "0.6.16"
diff --git a/livekit-plugins/livekit-plugins-deepgram/package.json b/livekit-plugins/livekit-plugins-deepgram/package.json
index 65cf7a26a..3a0a81159 100644
--- a/livekit-plugins/livekit-plugins-deepgram/package.json
+++ b/livekit-plugins/livekit-plugins-deepgram/package.json
@@ -1,5 +1,5 @@
 {
   "name": "livekit-plugins-deepgram",
   "private": true,
-  "version": "0.6.15"
+  "version": "0.6.16"
 }
diff --git a/livekit-plugins/livekit-plugins-google/CHANGELOG.md b/livekit-plugins/livekit-plugins-google/CHANGELOG.md
index 82ccd17ff..8867829ea 100644
--- a/livekit-plugins/livekit-plugins-google/CHANGELOG.md
+++ b/livekit-plugins/livekit-plugins-google/CHANGELOG.md
@@ -1,5 +1,15 @@
 # livekit-plugins-google
 
+## 0.9.0
+
+### Minor Changes
+
+- make multimodal class generic and support gemini live api - [#1240](https://github.com/livekit/agents/pull/1240) ([@jayeshp19](https://github.com/jayeshp19))
+
+### Patch Changes
+
+- fix: Ensure STT exceptions are being propagated - [#1291](https://github.com/livekit/agents/pull/1291) ([@davidzhao](https://github.com/davidzhao))
+
 ## 0.8.1
 
 ### Patch Changes
diff --git a/livekit-plugins/livekit-plugins-google/livekit/plugins/google/version.py b/livekit-plugins/livekit-plugins-google/livekit/plugins/google/version.py
index eb38535e3..654ad56ec 100644
--- a/livekit-plugins/livekit-plugins-google/livekit/plugins/google/version.py
+++ b/livekit-plugins/livekit-plugins-google/livekit/plugins/google/version.py
@@ -12,4 +12,4 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-__version__ = "0.8.1"
+__version__ = "0.9.0"
diff --git a/livekit-plugins/livekit-plugins-google/package.json b/livekit-plugins/livekit-plugins-google/package.json
index c79ee66c4..17bc59ac6 100644
--- a/livekit-plugins/livekit-plugins-google/package.json
+++ b/livekit-plugins/livekit-plugins-google/package.json
@@ -1,5 +1,5 @@
 {
   "name": "livekit-plugins-google",
   "private": true,
-  "version": "0.8.1"
+  "version": "0.9.0"
 }
diff --git a/livekit-plugins/livekit-plugins-openai/CHANGELOG.md b/livekit-plugins/livekit-plugins-openai/CHANGELOG.md
index 02ff2f06f..1e363b412 100644
--- a/livekit-plugins/livekit-plugins-openai/CHANGELOG.md
+++ b/livekit-plugins/livekit-plugins-openai/CHANGELOG.md
@@ -1,5 +1,13 @@
 # livekit-plugins-openai
 
+## 0.10.13
+
+### Patch Changes
+
+- improved handling of LLM errors, do not retry if already began - [#1298](https://github.com/livekit/agents/pull/1298) ([@davidzhao](https://github.com/davidzhao))
+
+- make multimodal class generic and support gemini live api - [#1240](https://github.com/livekit/agents/pull/1240) ([@jayeshp19](https://github.com/jayeshp19))
+
 ## 0.10.12
 
 ### Patch Changes
diff --git a/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/version.py b/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/version.py
index 16e535380..c1fcb43b8 100644
--- a/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/version.py
+++ b/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/version.py
@@ -12,4 +12,4 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-__version__ = "0.10.12"
+__version__ = "0.10.13"
diff --git a/livekit-plugins/livekit-plugins-openai/package.json b/livekit-plugins/livekit-plugins-openai/package.json
index bfe2370d0..e23704cba 100644
--- a/livekit-plugins/livekit-plugins-openai/package.json
+++ b/livekit-plugins/livekit-plugins-openai/package.json
@@ -1,5 +1,5 @@
 {
   "name": "livekit-plugins-openai",
   "private": true,
-  "version": "0.10.12"
+  "version": "0.10.13"
 }
diff --git a/livekit-plugins/livekit-plugins-playai/CHANGELOG.md b/livekit-plugins/livekit-plugins-playai/CHANGELOG.md
index 84c891ab4..8fd61d2cf 100644
--- a/livekit-plugins/livekit-plugins-playai/CHANGELOG.md
+++ b/livekit-plugins/livekit-plugins-playai/CHANGELOG.md
@@ -1,5 +1,11 @@
 # livekit-plugins-playai
 
+## 1.0.4
+
+### Patch Changes
+
+- Support PlayAI TTS engine. - [#1174](https://github.com/livekit/agents/pull/1174) ([@jayeshp19](https://github.com/jayeshp19))
+
 ## 1.0.3
 
 ### Patch Changes
diff --git a/livekit-plugins/livekit-plugins-playai/livekit/plugins/playai/version.py b/livekit-plugins/livekit-plugins-playai/livekit/plugins/playai/version.py
index 976498ab9..92192eed4 100644
--- a/livekit-plugins/livekit-plugins-playai/livekit/plugins/playai/version.py
+++ b/livekit-plugins/livekit-plugins-playai/livekit/plugins/playai/version.py
@@ -1 +1 @@
-__version__ = "1.0.3"
+__version__ = "1.0.4"
diff --git a/livekit-plugins/livekit-plugins-playai/package.json b/livekit-plugins/livekit-plugins-playai/package.json
index 043890665..a4879d16b 100644
--- a/livekit-plugins/livekit-plugins-playai/package.json
+++ b/livekit-plugins/livekit-plugins-playai/package.json
@@ -1,5 +1,5 @@
 {
   "name": "livekit-plugins-playai",
   "private": true,
-  "version": "1.0.3"
+  "version": "1.0.4"
 }
diff --git a/livekit-plugins/livekit-plugins-turn-detector/CHANGELOG.md b/livekit-plugins/livekit-plugins-turn-detector/CHANGELOG.md
index 2d38bf347..46a9a7fe5 100644
--- a/livekit-plugins/livekit-plugins-turn-detector/CHANGELOG.md
+++ b/livekit-plugins/livekit-plugins-turn-detector/CHANGELOG.md
@@ -1,5 +1,11 @@
 # livekit-plugins-eou
 
+## 0.3.5
+
+### Patch Changes
+
+- fix int32/64 errors on Windows - [#1285](https://github.com/livekit/agents/pull/1285) ([@nbsp](https://github.com/nbsp))
+
 ## 0.3.4
 
 ### Patch Changes
diff --git a/livekit-plugins/livekit-plugins-turn-detector/livekit/plugins/turn_detector/version.py b/livekit-plugins/livekit-plugins-turn-detector/livekit/plugins/turn_detector/version.py
index bcfe9b179..4be9d79b7 100644
--- a/livekit-plugins/livekit-plugins-turn-detector/livekit/plugins/turn_detector/version.py
+++ b/livekit-plugins/livekit-plugins-turn-detector/livekit/plugins/turn_detector/version.py
@@ -12,4 +12,4 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-__version__ = "0.3.4"
+__version__ = "0.3.5"
diff --git a/livekit-plugins/livekit-plugins-turn-detector/package.json b/livekit-plugins/livekit-plugins-turn-detector/package.json
index 82d16bb89..264da83bf 100644
--- a/livekit-plugins/livekit-plugins-turn-detector/package.json
+++ b/livekit-plugins/livekit-plugins-turn-detector/package.json
@@ -1,5 +1,5 @@
 {
   "name": "livekit-plugins-turn-detector",
   "private": true,
-  "version": "0.3.4"
+  "version": "0.3.5"
 }

From 1ab8d88749c9e42ddb952741e4fb2a65bb6645e8 Mon Sep 17 00:00:00 2001
From: David Zhao
Date: Tue, 31 Dec 2024 00:08:28 -0800
Subject: [PATCH 46/46] rename `multimodal_agent` directory for consistency (#1318)

---
 examples/{multimodal_agent => multimodal-agent}/gemini_agent.py | 0
 examples/{multimodal_agent => multimodal-agent}/openai_agent.py | 0
 2 files changed, 0 insertions(+), 0 deletions(-)
 rename examples/{multimodal_agent => multimodal-agent}/gemini_agent.py (100%)
 rename examples/{multimodal_agent => multimodal-agent}/openai_agent.py (100%)

diff --git a/examples/multimodal_agent/gemini_agent.py b/examples/multimodal-agent/gemini_agent.py
similarity index 100%
rename from examples/multimodal_agent/gemini_agent.py
rename to examples/multimodal-agent/gemini_agent.py
diff --git a/examples/multimodal_agent/openai_agent.py b/examples/multimodal-agent/openai_agent.py
similarity index 100%
rename from examples/multimodal_agent/openai_agent.py
rename to examples/multimodal-agent/openai_agent.py