From ce1c058bf802fec2767cb43e8bc2d09c3ffaa389 Mon Sep 17 00:00:00 2001
From: jerad fields
Date: Mon, 16 Dec 2024 12:49:56 -0600
Subject: [PATCH 01/46] use onnx turn detector model (#1231)

Co-authored-by: David Zhao
---
 .changeset/loud-onions-invent.md | 5 +++++
 .../livekit-plugins-turn-detector/README.md | 2 +-
 .../livekit/plugins/turn_detector/__init__.py | 7 +++++--
 .../livekit/plugins/turn_detector/eou.py | 10 +++++---
 livekit-plugins/livekit-plugins-turn-detector/setup.py | 1 +
 5 files changed, 19 insertions(+), 6 deletions(-)
 create mode 100644 .changeset/loud-onions-invent.md

diff --git a/.changeset/loud-onions-invent.md b/.changeset/loud-onions-invent.md
new file mode 100644
index 000000000..dcedf95b4
--- /dev/null
+++ b/.changeset/loud-onions-invent.md
@@ -0,0 +1,5 @@
+---
+"livekit-plugins-turn-detector": patch
+---
+
+use quantized onnx version of turn detector model
diff --git a/livekit-plugins/livekit-plugins-turn-detector/README.md b/livekit-plugins/livekit-plugins-turn-detector/README.md
index 988706784..859b803cf 100644
--- a/livekit-plugins/livekit-plugins-turn-detector/README.md
+++ b/livekit-plugins/livekit-plugins-turn-detector/README.md
@@ -35,7 +35,7 @@ python my_agent.py download-files

 ## Model system requirements

-The end-of-turn model is optimized to run on CPUs with modest system requirements. It is designed to run on the same server hosting your agents. On a 4-core server instance, it completes inference in under 100ms with minimal CPU usage.
+The end-of-turn model is optimized to run on CPUs with modest system requirements. It is designed to run on the same server hosting your agents. On a 4-core server instance, it completes inference in ~50ms with minimal CPU usage.

 The model requires 1.5GB of RAM and runs within a shared inference server, supporting multiple concurrent sessions.
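A minimal sketch of wiring this plugin into a pipeline agent, for context (the `EOUModel` class and the companion plugins shown are assumptions based on this plugin's exports, not part of this patch; run `python my_agent.py download-files` first so the quantized ONNX weights and tokenizer are cached locally):

    from livekit.agents.pipeline import VoicePipelineAgent
    from livekit.plugins import deepgram, openai, silero, turn_detector

    # the end-of-utterance model replaces a fixed silence timeout with a
    # model-based decision about whether the user has finished speaking
    agent = VoicePipelineAgent(
        vad=silero.VAD.load(),
        stt=deepgram.STT(),
        llm=openai.LLM(),
        tts=openai.TTS(),
        turn_detector=turn_detector.EOUModel(),
    )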
diff --git a/livekit-plugins/livekit-plugins-turn-detector/livekit/plugins/turn_detector/__init__.py b/livekit-plugins/livekit-plugins-turn-detector/livekit/plugins/turn_detector/__init__.py
index 6ca7eecbb..32692361a 100644
--- a/livekit-plugins/livekit-plugins-turn-detector/livekit/plugins/turn_detector/__init__.py
+++ b/livekit-plugins/livekit-plugins-turn-detector/livekit/plugins/turn_detector/__init__.py
@@ -27,11 +27,14 @@ def __init__(self):
         super().__init__(__name__, __version__, __package__, logger)

     def download_files(self) -> None:
-        from transformers import AutoModelForCausalLM, AutoTokenizer
+        from optimum.onnxruntime import ORTModelForCausalLM
+        from transformers import AutoTokenizer

         from .eou import HG_MODEL

-        AutoModelForCausalLM.from_pretrained(HG_MODEL)
+        ORTModelForCausalLM.from_pretrained(
+            HG_MODEL, use_cache=False, use_io_binding=False
+        )
         AutoTokenizer.from_pretrained(HG_MODEL)
diff --git a/livekit-plugins/livekit-plugins-turn-detector/livekit/plugins/turn_detector/eou.py b/livekit-plugins/livekit-plugins-turn-detector/livekit/plugins/turn_detector/eou.py
index d5f21799e..afbc09415 100644
--- a/livekit-plugins/livekit-plugins-turn-detector/livekit/plugins/turn_detector/eou.py
+++ b/livekit-plugins/livekit-plugins-turn-detector/livekit/plugins/turn_detector/eou.py
@@ -56,11 +56,15 @@ def _format_chat_ctx(self, chat_ctx: dict):
     def initialize(self) -> None:
         from huggingface_hub import errors
-        from transformers import AutoModelForCausalLM, AutoTokenizer
+        from optimum.onnxruntime import ORTModelForCausalLM
+        from transformers import AutoTokenizer

         try:
-            self._model = AutoModelForCausalLM.from_pretrained(
-                HG_MODEL, local_files_only=True
+            self._model = ORTModelForCausalLM.from_pretrained(
+                HG_MODEL,
+                local_files_only=True,
+                use_io_binding=False,
+                use_cache=False,
             )
             self._tokenizer = AutoTokenizer.from_pretrained(
                 HG_MODEL, local_files_only=True
diff --git a/livekit-plugins/livekit-plugins-turn-detector/setup.py b/livekit-plugins/livekit-plugins-turn-detector/setup.py
index a73d4c797..b26b8e536 100644
--- a/livekit-plugins/livekit-plugins-turn-detector/setup.py
+++ b/livekit-plugins/livekit-plugins-turn-detector/setup.py
@@ -54,6 +54,7 @@
         "transformers>=4.46",
         "numpy>=1.26",
         "torch>=2.5.1",
+        "optimum[onnxruntime]>=1.23.3",
     ],
     package_data={"livekit.plugins.turn_detector": ["py.typed"]},
     project_urls={

From aa933d2bb5ba2131cd54a5d3a91b44cd7c16f303 Mon Sep 17 00:00:00 2001
From: David Zhao
Date: Mon, 16 Dec 2024 17:09:29 -0800
Subject: [PATCH 02/46] streaming audio decoder, enables receiving compressed audio from TTS services (#1236)

---
 .changeset/empty-sheep-pump.md | 6 +
 .github/workflows/tests.yml | 32 +---
 livekit-agents/livekit/agents/stt/stt.py | 2 +-
 .../livekit/agents/utils/codecs/__init__.py | 3 +-
 .../livekit/agents/utils/codecs/decoder.py | 159 ++++++++++++++++++
 livekit-agents/setup.py | 2 +-
 .../livekit-plugins-deepgram/setup.py | 2 +-
 tests/.gitattributes | 1 +
 tests/change-sophie.opus | 3 +
 tests/test_decoder.py | 140 +++++++++++++++
 tests/test_stt.py | 2 +
 11 files changed, 325 insertions(+), 27 deletions(-)
 create mode 100644 .changeset/empty-sheep-pump.md
 create mode 100644 livekit-agents/livekit/agents/utils/codecs/decoder.py
 create mode 100644 tests/change-sophie.opus
 create mode 100644 tests/test_decoder.py

diff --git a/.changeset/empty-sheep-pump.md b/.changeset/empty-sheep-pump.md
new file mode 100644
index 000000000..06c854c20
--- /dev/null
+++ b/.changeset/empty-sheep-pump.md
@@ -0,0 +1,6 @@
+---
+"livekit-plugins-deepgram": patch
+"livekit-agents": patch
+---
+
+added streaming audio decoder for compressed audio.
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 302e5ad71..2fac6f9a2 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -18,15 +18,15 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        os:
-          [
-            macos-14-large,
+        os: [
+            # disabled Intel Macs due to pytorch 2.3+ not supporting it
+            # macos-14-large,
             macos-14,
             windows-2019,
             ubuntu-20.04,
            namespace-profile-default-arm64,
           ]
-        python_version: ["3.12"]
+        python_version: ["3.9", "3.12"]
         test_group: ["base"]
         include:
           # Include llm, stt, and tts tests only on Ubuntu 20.04 with Python 3.9
@@ -60,11 +60,8 @@ jobs:
             ${{ runner.os }}-cache

       - uses: actions/setup-python@v5
-        # brew will install python as part of ffmpeg install on MacOS
-        # installing system Python could cause a conflict with `Could not symlink bin/idle3`
-        if: ${{ matrix.os != 'macos-14-large' }}
         with:
-          python-version: "3.12"
+          python-version: ${{ matrix.python_version }}
           cache: "pip"

       - name: Install ffmpeg (Linux)
@@ -91,20 +88,9 @@ jobs:
       - name: Install packages
         shell: bash
         run: |
-          pip3 install pytest pytest-asyncio pytest-timeout './livekit-agents[codecs]' psutil
-          pip3 install -r ./tests/test-requirements.txt
-          pip3 install ./livekit-agents \
-            ./livekit-plugins/livekit-plugins-openai \
-            ./livekit-plugins/livekit-plugins-deepgram \
-            ./livekit-plugins/livekit-plugins-google \
-            ./livekit-plugins/livekit-plugins-nltk \
-            ./livekit-plugins/livekit-plugins-silero \
-            ./livekit-plugins/livekit-plugins-elevenlabs \
-            ./livekit-plugins/livekit-plugins-cartesia \
-            ./livekit-plugins/livekit-plugins-azure \
-            ./livekit-plugins/livekit-plugins-anthropic \
-            ./livekit-plugins/livekit-plugins-assemblyai \
-            ./livekit-plugins/livekit-plugins-fal
+          pip install pytest pytest-asyncio pytest-timeout './livekit-agents[codecs]' psutil
+          pip install -r ./tests/test-requirements.txt
+          ./livekit-plugins/install_local.sh

       - name: Run tests
         shell: bash
@@ -131,7 +117,7 @@

           case "${{ matrix.test_group }}" in
             base)
-              test_files="test_aio.py test_tokenizer.py test_vad.py test_ipc.py test_tts_fallback.py test_stt_fallback.py test_message_change.py"
+              test_files="test_aio.py test_tokenizer.py test_vad.py test_ipc.py test_tts_fallback.py test_stt_fallback.py test_message_change.py test_decoder.py"
               ;;
             llm)
               test_files="test_llm.py"
diff --git a/livekit-agents/livekit/agents/stt/stt.py b/livekit-agents/livekit/agents/stt/stt.py
index c1922bc56..e2f79f93c 100644
--- a/livekit-agents/livekit/agents/stt/stt.py
+++ b/livekit-agents/livekit/agents/stt/stt.py
@@ -295,7 +295,7 @@ def flush(self) -> None:
         self._input_ch.send_nowait(self._FlushSentinel())

     def end_input(self) -> None:
-        """Mark the end of input, no more text will be pushed"""
+        """Mark the end of input, no more audio will be pushed"""
         self.flush()
         self._input_ch.close()

diff --git a/livekit-agents/livekit/agents/utils/codecs/__init__.py b/livekit-agents/livekit/agents/utils/codecs/__init__.py
index 35f19332a..ad2f77b91 100644
--- a/livekit-agents/livekit/agents/utils/codecs/__init__.py
+++ b/livekit-agents/livekit/agents/utils/codecs/__init__.py
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
+from .decoder import AudioStreamDecoder, StreamBuffer
 from .mp3 import Mp3StreamDecoder

-__all__ = ["Mp3StreamDecoder"]
+__all__ = ["Mp3StreamDecoder", "AudioStreamDecoder", "StreamBuffer"]
diff --git a/livekit-agents/livekit/agents/utils/codecs/decoder.py b/livekit-agents/livekit/agents/utils/codecs/decoder.py
new file mode 100644
index 000000000..01367c055
--- /dev/null
+++ b/livekit-agents/livekit/agents/utils/codecs/decoder.py
@@ -0,0 +1,159 @@
+# Copyright 2024 LiveKit, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import asyncio
+import io
+from typing import AsyncIterator
+
+from livekit.agents.utils import aio
+
+try:
+    # preload to ensure faster startup
+    import av  # noqa
+except ImportError:
+    pass
+import threading
+
+from livekit import rtc
+
+
+class StreamBuffer:
+    """
+    A thread-safe buffer that behaves like an IO stream.
+    Allows writing from one thread and reading from another.
+    """
+
+    def __init__(self):
+        self._buffer = io.BytesIO()
+        self._lock = threading.Lock()
+        self._data_available = threading.Condition(self._lock)
+        self._eof = False  # EOF flag to signal no more writes
+
+    def write(self, data: bytes):
+        """Write data to the buffer from a writer thread."""
+        with self._data_available:  # Lock and notify readers
+            self._buffer.seek(0, io.SEEK_END)  # Move to the end
+            self._buffer.write(data)
+            self._data_available.notify_all()  # Notify waiting readers
+
+    def read(self, size: int = -1) -> bytes:
+        """Read data from the buffer in a reader thread."""
+        if self._buffer.closed:
+            return b""
+
+        with self._data_available:
+            while True:
+                self._buffer.seek(0)  # Rewind for reading
+                data = self._buffer.read(size)
+
+                # If data is available, return it
+                if data:
+                    # Shrink the buffer to remove already-read data
+                    remaining = self._buffer.read()
+                    self._buffer = io.BytesIO(remaining)
+                    return data
+
+                # If EOF is signaled and no data remains, return EOF
+                if self._eof:
+                    return b""
+
+                # Wait for more data
+                self._data_available.wait()
+
+    def end_input(self):
+        """Signal that no more data will be written."""
+        with self._data_available:
+            self._eof = True
+            self._data_available.notify_all()
+
+    def close(self):
+        self._buffer.close()
+
+
+class AudioStreamDecoder:
+    """A class that can be used to decode an audio stream into PCM AudioFrames.
+
+    Decoders are stateful and should not be reused across multiple streams. Each decoder
+    is designed to decode a single stream.
+    """
+
+    def __init__(self):
+        try:
+            import av  # noqa
+        except ImportError:
+            raise ImportError(
+                "You haven't included the 'codecs' optional dependencies. Please install the 'codecs' extra by running `pip install livekit-agents[codecs]`"
+            )
+
+        self._output_ch = aio.Chan[rtc.AudioFrame]()
+        self._closed = False
+        self._started = False
+        self._output_finished = False
+        self._input_buf = StreamBuffer()
+        self._loop = asyncio.get_event_loop()
+
+    def push(self, chunk: bytes):
+        self._input_buf.write(chunk)
+        if not self._started:
+            self._started = True
+            self._loop.run_in_executor(None, self._decode_loop)
+
+    def end_input(self):
+        self._input_buf.end_input()
+
+    def _decode_loop(self):
+        container = av.open(self._input_buf)
+        # use a default of None so a missing audio stream doesn't raise StopIteration
+        audio_stream = next(
+            (s for s in container.streams if s.type == "audio"), None
+        )
+        try:
+            if audio_stream is None:
+                # no audio stream found in the container
+                return
+            resampler = av.AudioResampler(
+                # convert to signed 16-bit little endian
+                format="s16",
+                layout="mono",
+                rate=audio_stream.rate,
+            )
+            for frame in container.decode(audio_stream):
+                if self._closed:
+                    return
+                for resampled_frame in resampler.resample(frame):
+                    nchannels = len(resampled_frame.layout.channels)
+                    data = resampled_frame.to_ndarray().tobytes()
+                    self._output_ch.send_nowait(
+                        rtc.AudioFrame(
+                            data=data,
+                            num_channels=nchannels,
+                            sample_rate=resampled_frame.sample_rate,
+                            # integer division: samples_per_channel must be an int
+                            samples_per_channel=resampled_frame.samples // nchannels,
+                        )
+                    )
+        finally:
+            self._output_finished = True

+    def __aiter__(self) -> AsyncIterator[rtc.AudioFrame]:
+        return self
+
+    async def __anext__(self) -> rtc.AudioFrame:
+        if self._output_finished and self._output_ch.empty():
+            raise StopAsyncIteration
+        return await self._output_ch.__anext__()
+
+    async def aclose(self):
+        if self._closed:
+            return
+        self._closed = True
+        self._input_buf.close()
+        self._output_ch.close()
diff --git a/livekit-agents/setup.py b/livekit-agents/setup.py
index bf662dc34..9ff541808 100644
--- a/livekit-agents/setup.py
+++ b/livekit-agents/setup.py
@@ -66,7 +66,7 @@
         ':sys_platform!="win32"': [
             "aiodns~=3.2"
         ],  # use default aiohttp resolver on windows
-        "codecs": ["av>=11.0.0"],
+        "codecs": ["av>=12.0.0", "numpy>=1.26.0"],
         "images": ["pillow>=10.3.0"],
     },
     package_data={"livekit.agents": ["py.typed"]},
diff --git a/livekit-plugins/livekit-plugins-deepgram/setup.py b/livekit-plugins/livekit-plugins-deepgram/setup.py
index 077c6d659..8a583611d 100644
--- a/livekit-plugins/livekit-plugins-deepgram/setup.py
+++ b/livekit-plugins/livekit-plugins-deepgram/setup.py
@@ -47,7 +47,7 @@
     license="Apache-2.0",
     packages=setuptools.find_namespace_packages(include=["livekit.*"]),
     python_requires=">=3.9.0",
-    install_requires=["livekit-agents>=0.11.3", "numpy~=1.21"],
+    install_requires=["livekit-agents>=0.12.2", "numpy>=1.26"],
     package_data={"livekit.plugins.deepgram": ["py.typed"]},
     project_urls={
         "Documentation": "https://docs.livekit.io",
diff --git a/tests/.gitattributes b/tests/.gitattributes
index 9a8911093..83117e69b 100644
--- a/tests/.gitattributes
+++ b/tests/.gitattributes
@@ -1,4 +1,5 @@
 long.mp3 filter=lfs diff=lfs merge=lfs -text
 change-sophie.wav filter=lfs diff=lfs merge=lfs -text
+change-sophie.opus filter=lfs diff=lfs merge=lfs -text
 hearts.rgba filter=lfs diff=lfs merge=lfs -text
 hearts.jpg filter=lfs diff=lfs merge=lfs -text
diff --git a/tests/change-sophie.opus b/tests/change-sophie.opus
new file mode 100644
index 000000000..5112fcab5
--- /dev/null
+++ b/tests/change-sophie.opus
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7a2eb5667dc35714b4cb70324d3722f89580885ee5e51be5f2c793e7893d9a24
+size 48905
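A minimal usage sketch of the new decoder, for orientation before the tests below (the chunk source is hypothetical; any compressed stream, such as mp3 or opus bytes from a TTS HTTP response, is handled the same way):

    from livekit.agents.utils.codecs import AudioStreamDecoder

    async def decode_all(chunks):
        # chunks: an async iterator of compressed audio bytes
        decoder = AudioStreamDecoder()
        async for chunk in chunks:
            decoder.push(chunk)  # decoding starts on the first push
        decoder.end_input()
        async for frame in decoder:
            ...  # each frame is a mono, 16-bit PCM rtc.AudioFrame
        await decoder.aclose()

diff --git a/tests/test_decoder.py b/tests/test_decoder.py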
new file mode 100644
index 000000000..c5ecacce8
--- /dev/null
+++ b/tests/test_decoder.py
@@ -0,0 +1,140 @@
+import os
+import threading
+import time
+from concurrent.futures import ThreadPoolExecutor
+
+import aiohttp
+import pytest
+from livekit.agents.stt import SpeechEventType
+from livekit.agents.utils.codecs import AudioStreamDecoder, StreamBuffer
+from livekit.plugins import deepgram
+
+from .utils import wer
+
+TEST_AUDIO_FILEPATH = os.path.join(os.path.dirname(__file__), "change-sophie.opus")
+
+
+@pytest.mark.asyncio
+async def test_decode_and_transcribe():
+    # Skip if test file doesn't exist
+    if not os.path.exists(TEST_AUDIO_FILEPATH):
+        pytest.skip(f"Test file not found: {TEST_AUDIO_FILEPATH}")
+
+    decoder = AudioStreamDecoder()
+    with open(TEST_AUDIO_FILEPATH, "rb") as f:
+        opus_data = f.read()
+        decoder.push(opus_data)
+        decoder.end_input()
+
+    session = aiohttp.ClientSession()
+    stt = deepgram.STT(http_session=session)
+    stream = stt.stream()
+
+    # Push frames to STT
+    async for frame in decoder:
+        stream.push_frame(frame)
+
+    # Mark end of input
+    stream.end_input()
+
+    # Collect results
+    final_text = ""
+    async for event in stream:
+        if event.type == SpeechEventType.FINAL_TRANSCRIPT:
+            if event.alternatives:
+                if final_text:
+                    final_text += " "
+                final_text += event.alternatives[0].text
+
+    await decoder.aclose()
+    await stream.aclose()
+    await session.close()
+
+    # Verify the transcription
+    expected_text = "the people that are crazy enough to think they can change the world are the ones who do"
+    assert wer(final_text, expected_text) < 0.2
+
+
+def test_stream_buffer():
+    buffer = StreamBuffer()
+    data_chunks = [b"hello", b"world", b"test", b"data"]
+    received_data = bytearray()
+    write_completed = threading.Event()
+
+    def writer():
+        for chunk in data_chunks:
+            buffer.write(chunk)
+            time.sleep(0.01)  # Simulate some processing time
+        buffer.end_input()
+        write_completed.set()
+
+    def reader():
+        while True:
+            data = buffer.read(4)  # Read in small chunks
+            if not data:  # EOF
+                break
+            received_data.extend(data)
+
+    # Run writer and reader in separate threads
+    with ThreadPoolExecutor(max_workers=2) as executor:
+        reader_future = executor.submit(reader)
+        writer_future = executor.submit(writer)
+
+        # Wait for both threads to complete
+        writer_future.result()
+        reader_future.result()
+
+    # Verify that all data was received correctly
+    expected_data = b"".join(data_chunks)
+    assert bytes(received_data) == expected_data
+
+
+def test_stream_buffer_large_chunks():
+    buffer = StreamBuffer()
+    large_chunk = b"x" * 1024 * 1024  # 1MB chunk
+    num_chunks = 5
+    total_size = 0
+    write_completed = threading.Event()
+
+    def writer():
+        nonlocal total_size
+        for _ in range(num_chunks):
+            buffer.write(large_chunk)
+            total_size += len(large_chunk)
+        buffer.end_input()
+        write_completed.set()
+
+    received_size = 0
+
+    def reader():
+        nonlocal received_size
+        while True:
+            chunk = buffer.read(8192)  # Read in 8KB chunks
+            if not chunk:
+                break
+            received_size += len(chunk)
+
+    # Run writer and reader in separate threads
+    with ThreadPoolExecutor(max_workers=2) as executor:
+        reader_future = executor.submit(reader)
+        writer_future = executor.submit(writer)
+
+        # Wait for both threads to complete
+        writer_future.result()
+        reader_future.result()
+
+    assert received_size == total_size
+    assert total_size == num_chunks * len(large_chunk)
+
+
+def test_stream_buffer_early_close():
+    buffer = StreamBuffer()
+
+    # Write some data
+    buffer.write(b"test data")
+
+    # Close the buffer
+    buffer.close()
+
+    # Reading from closed buffer should return empty bytes
+    assert buffer.read() == b""
diff --git a/tests/test_stt.py b/tests/test_stt.py
index 836cfd20a..d1f340b1e 100644
--- a/tests/test_stt.py
+++ b/tests/test_stt.py
@@ -108,6 +108,8 @@ async def _stream_output():
                 continue

             if event.type == agents.stt.SpeechEventType.FINAL_TRANSCRIPT:
+                if text != "":
+                    text += " "
                 text += event.alternatives[0].text
                 # ensure STT is tagging languages correctly
                 language = event.alternatives[0].language

From 52880aa9876df77e55aefae7947bbd7a4a5867f3 Mon Sep 17 00:00:00 2001
From: Ishimwe Prince
Date: Tue, 17 Dec 2024 12:56:44 +0200
Subject: [PATCH 03/46] fix: fix `imgui` setup (#1226)

---
 .changeset/fix-imgui-setup.md | 5 +++++
 .../livekit-plugins-browser/src/CMakeLists.txt | 9 ++++++++-
 2 files changed, 13 insertions(+), 1 deletion(-)
 create mode 100644 .changeset/fix-imgui-setup.md

diff --git a/.changeset/fix-imgui-setup.md b/.changeset/fix-imgui-setup.md
new file mode 100644
index 000000000..a6e52168e
--- /dev/null
+++ b/.changeset/fix-imgui-setup.md
@@ -0,0 +1,5 @@
+---
+"livekit-plugins-browser": patch
+---
+
+fix: fix `imgui` setup
diff --git a/livekit-plugins/livekit-plugins-browser/src/CMakeLists.txt b/livekit-plugins/livekit-plugins-browser/src/CMakeLists.txt
index 298ee3c37..f236519cb 100644
--- a/livekit-plugins/livekit-plugins-browser/src/CMakeLists.txt
+++ b/livekit-plugins/livekit-plugins-browser/src/CMakeLists.txt
@@ -11,8 +11,15 @@ set(GLFW_INSTALL OFF CACHE BOOL "" FORCE)
 FetchContent_Declare(glfw GIT_REPOSITORY https://github.com/glfw/glfw.git GIT_TAG 3.4)
 FetchContent_MakeAvailable(glfw)

-FetchContent_Declare(imgui GIT_REPOSITORY https://github.com/ocornut/imgui GIT_TAG origin/docking)
+FetchContent_Declare(
+  imgui
+  GIT_REPOSITORY https://github.com/ocornut/imgui
+  GIT_TAG origin/docking
+  GIT_SHALLOW TRUE
+)
 FetchContent_GetProperties(imgui)
+FetchContent_Populate(imgui)
+
 FetchContent_MakeAvailable(imgui)
 file(GLOB IMGUI_SOURCES ${imgui_SOURCE_DIR}/*.cpp)
 add_library(imgui STATIC ${IMGUI_SOURCES}

From 891d5e7bd6329fa9d2df06e725eb70d656948687 Mon Sep 17 00:00:00 2001
From: Jayesh Parmar <60539217+jayeshp19@users.noreply.github.com>
Date: Tue, 17 Dec 2024 18:00:45 +0530
Subject: [PATCH 04/46] fix: correctly parse function argument types (#1221)

---
 .changeset/nervous-years-sell.md | 7 +
 .github/workflows/tests.yml | 2 +-
 .../livekit/agents/llm/function_context.py | 38 ++--
 .../livekit/plugins/anthropic/llm.py | 17 +-
 .../livekit/plugins/openai/_oai_api.py | 31 +--
 tests/test_build_func_desc.py | 51 +++++
 tests/test_create_func.py | 209 ++++++++++++++++++
 7 files changed, 313 insertions(+), 42 deletions(-)
 create mode 100644 .changeset/nervous-years-sell.md
 create mode 100644 tests/test_build_func_desc.py
 create mode 100644 tests/test_create_func.py

diff --git a/.changeset/nervous-years-sell.md b/.changeset/nervous-years-sell.md
new file mode 100644
index 000000000..a7829fe92
--- /dev/null
+++ b/.changeset/nervous-years-sell.md
@@ -0,0 +1,7 @@
+---
+"livekit-plugins-anthropic": patch
+"livekit-plugins-openai": patch
+"livekit-agents": patch
+---
+
+fix: correctly parse function argument types
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 2fac6f9a2..2da4754b0 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -117,7 +117,7 @@

           case "${{ matrix.test_group }}" in
             base)
-              test_files="test_aio.py test_tokenizer.py test_vad.py test_ipc.py test_tts_fallback.py test_stt_fallback.py test_message_change.py test_decoder.py"
+              test_files="test_aio.py test_tokenizer.py test_vad.py test_ipc.py test_tts_fallback.py test_stt_fallback.py test_message_change.py test_decoder.py test_build_func_desc.py test_create_func.py"
               ;;
             llm)
               test_files="test_llm.py"
diff --git a/livekit-agents/livekit/agents/llm/function_context.py b/livekit-agents/livekit/agents/llm/function_context.py
index 4290d121e..aa4df9842 100644
--- a/livekit-agents/livekit/agents/llm/function_context.py
+++ b/livekit-agents/livekit/agents/llm/function_context.py
@@ -18,9 +18,10 @@
 import enum
 import functools
 import inspect
+import types
 import typing
 from dataclasses import dataclass
-from typing import Any, Callable, Tuple
+from typing import Any, Callable, Optional, Tuple

 from ..log import logger
@@ -54,7 +55,6 @@ class FunctionArgInfo:
     type: type
     default: Any
     choices: tuple | None
-    is_optional: bool


 @dataclass(frozen=True)
@@ -169,15 +169,13 @@ def _register_ai_function(self, fnc: Callable) -> None:
             )

             desc = type_info.description if type_info else ""
-            choices = type_info.choices if type_info else None
+            choices = type_info.choices if type_info else ()

-            is_optional, optional_inner = _is_optional_type(inner_th)
-            if is_optional:
-                # when the type is optional, only the inner type is relevant
-                # the argument info for default would be None
-                inner_th = optional_inner
-
-            if issubclass(inner_th, enum.Enum) and not choices:
+            if (
+                isinstance(inner_th, type)
+                and issubclass(inner_th, enum.Enum)
+                and not choices
+            ):
                 # the enum must be a str or int (and at least one value)
                 # this is verified by is_type_supported
                 choices = tuple([item.value for item in inner_th])
@@ -189,7 +187,6 @@
                 type=inner_th,
                 default=param.default,
                 choices=choices,
-                is_optional=is_optional,
             )

         self._fncs[metadata.name] = FunctionInfo(
@@ -225,7 +222,8 @@ def _extract_types(annotation: type) -> tuple[type, TypeInfo | None]:

     is_optional, optional_inner = _is_optional_type(annotation)
     if is_optional:
-        return _extract_types(optional_inner)
+        inner_type, info = _extract_types(optional_inner)
+        return Optional[inner_type], info  # type: ignore

     return annotation, None
@@ -293,17 +291,15 @@ def is_type_supported(t: type) -> bool:
 def _is_optional_type(typ) -> Tuple[bool, Any]:
     """return is_optional, inner_type"""
     origin = typing.get_origin(typ)
+    if origin is None or origin is list:
+        return False, typ

-    if origin in {typing.Union, getattr(__builtins__, "UnionType", typing.Union)}:
+    if origin in {typing.Union, getattr(types, "UnionType", typing.Union)}:
         args = typing.get_args(typ)
         is_optional = type(None) in args
-
-        inner_arg = None
-        for arg in args:
-            if arg is not type(None):
-                inner_arg = arg
-                break
-
-        return is_optional, inner_arg
+        non_none_args = [a for a in args if a is not type(None)]
+        if is_optional and len(non_none_args) == 1:
+            # Exactly one non-None type + None means optional
+            return True, non_none_args[0]

     return False, None
diff --git a/livekit-plugins/livekit-plugins-anthropic/livekit/plugins/anthropic/llm.py b/livekit-plugins/livekit-plugins-anthropic/livekit/plugins/anthropic/llm.py
index b48d6ec58..9678c9381 100644
--- a/livekit-plugins/livekit-plugins-anthropic/livekit/plugins/anthropic/llm.py
+++ b/livekit-plugins/livekit-plugins-anthropic/livekit/plugins/anthropic/llm.py
@@ -41,6 +41,7 @@
     utils,
 )
 from livekit.agents.llm import ToolChoice
+from livekit.agents.llm.function_context import _is_optional_type
 from livekit.agents.types import DEFAULT_API_CONNECT_OPTIONS, APIConnectOptions

 import anthropic
@@ -517,13 +518,15 @@ def _create_ai_function_info(
             continue

         arg_value = parsed_arguments[arg_info.name]
-        if get_origin(arg_info.type) is not None:
+        is_optional, inner_th = _is_optional_type(arg_info.type)
+
+        if get_origin(inner_th) is not None:
             if not isinstance(arg_value, list):
                 raise ValueError(
                     f"AI function {fnc_name} argument {arg_info.name} should be a list"
                 )

-            inner_type = get_args(arg_info.type)[0]
+            inner_type = get_args(inner_th)[0]
             sanitized_value = [
                 _sanitize_primitive(
                     value=v, expected_type=inner_type, choices=arg_info.choices
                 )
                 for v in arg_value
             ]
         else:
             sanitized_value = _sanitize_primitive(
-                value=arg_value, expected_type=arg_info.type, choices=arg_info.choices
+                value=arg_value, expected_type=inner_th, choices=arg_info.choices
             )

         sanitized_arguments[arg_info.name] = sanitized_value
@@ -568,8 +571,10 @@ def type2str(t: type) -> str:
         if arg_info.description:
             p["description"] = arg_info.description

-        if get_origin(arg_info.type) is list:
-            inner_type = get_args(arg_info.type)[0]
+        is_optional, inner_th = _is_optional_type(arg_info.type)
+
+        if get_origin(inner_th) is list:
+            inner_type = get_args(inner_th)[0]
             p["type"] = "array"
             p["items"] = {}
             p["items"]["type"] = type2str(inner_type)

             if arg_info.choices:
                 p["items"]["enum"] = arg_info.choices
         else:
-            p["type"] = type2str(arg_info.type)
+            p["type"] = type2str(inner_th)

             if arg_info.choices:
                 p["enum"] = arg_info.choices
diff --git a/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/_oai_api.py b/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/_oai_api.py
index b82c29de9..8bf05a19f 100644
--- a/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/_oai_api.py
+++ b/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/_oai_api.py
@@ -20,6 +20,7 @@
 from typing import Any

 from livekit.agents.llm import function_context, llm
+from livekit.agents.llm.function_context import _is_optional_type

 __all__ = ["build_oai_function_description", "create_ai_function_info"]
@@ -55,28 +56,28 @@ def create_ai_function_info(
             continue

         arg_value = parsed_arguments[arg_info.name]
-        if typing.get_origin(arg_info.type) is not None:
+        is_optional, inner_th = _is_optional_type(arg_info.type)
+
+        if typing.get_origin(inner_th) is not None:
             if not isinstance(arg_value, list):
                 raise ValueError(
                     f"AI function {fnc_name} argument {arg_info.name} should be a list"
                 )

-            inner_type = typing.get_args(arg_info.type)[0]
+            inner_type = typing.get_args(inner_th)[0]
             sanitized_value = [
                 _sanitize_primitive(
                     value=v,
                     expected_type=inner_type,
                     choices=arg_info.choices,
-                    is_optional=arg_info.is_optional,
                 )
                 for v in arg_value
             ]
         else:
             sanitized_value = _sanitize_primitive(
                 value=arg_value,
-                expected_type=arg_info.type,
+                expected_type=inner_th,
                 choices=arg_info.choices,
-                is_optional=arg_info.is_optional,
             )

         sanitized_arguments[arg_info.name] = sanitized_value
@@ -109,8 +110,10 @@ def type2str(t: type) -> str:
         if arg_info.description:
             p["description"] = arg_info.description

-        if typing.get_origin(arg_info.type) is list:
-            inner_type = typing.get_args(arg_info.type)[0]
+        is_optional, inner_th = _is_optional_type(arg_info.type)
+
+        if typing.get_origin(inner_th) is list:
+            inner_type = typing.get_args(inner_th)[0]
             p["type"] = "array"
             p["items"] = {}
             p["items"]["type"] = type2str(inner_type)
+
             if arg_info.choices:
                 p["items"]["enum"] = arg_info.choices
         else:
-            p["type"] = type2str(arg_info.type)
+            p["type"] = type2str(inner_th)

             if arg_info.choices:
                 p["enum"] = arg_info.choices
-                if arg_info.type is int and arg_info.choices and capabilities is not None:
-                    if not capabilities.supports_choices_on_int:
+                if (
+                    inner_th is int
+                    and capabilities
+                    and not capabilities.supports_choices_on_int
+                ):
                     raise ValueError(
                         f"Parameter '{arg_info.name}' uses 'choices' with 'int', which is not supported by this model."
                     )
@@ -153,11 +159,8 @@ def type2str(t: type) -> str:
 def _sanitize_primitive(
-    *, value: Any, expected_type: type, choices: tuple | None, is_optional: bool = False
+    *, value: Any, expected_type: type, choices: tuple | None
 ) -> Any:
-    if is_optional and value is None:
-        return None
-
     if expected_type is str:
         if not isinstance(value, str):
             raise ValueError(f"expected str, got {type(value)}")
diff --git a/tests/test_build_func_desc.py b/tests/test_build_func_desc.py
new file mode 100644
index 000000000..67659df3b
--- /dev/null
+++ b/tests/test_build_func_desc.py
@@ -0,0 +1,51 @@
+import sys
+from inspect import _empty
+from typing import List, Optional, Union
+
+import pytest
+from livekit.agents.llm import FunctionArgInfo, FunctionInfo
+from livekit.agents.llm.function_context import _is_optional_type
+from livekit.plugins.openai import _oai_api
+
+
+def test_typing():
+    assert _is_optional_type(Optional[int]) == (True, int)
+    assert _is_optional_type(Union[str, None]) == (True, str)
+    if sys.version_info >= (3, 10):
+        assert _is_optional_type(float | None) == (True, float)
+    assert _is_optional_type(Union[str, int]) == (False, None)
+
+
+@pytest.mark.parametrize(
+    ("arg_typ", "oai_type"),
+    [
+        pytest.param(int, "number", id="int"),
+        pytest.param(Optional[int], "number", id="optional[int]"),
+        pytest.param(Union[None, int], "number", id="union[none, int]"),
+        pytest.param(Union[str, None], "string", id="union[str, none]"),
+        pytest.param(List[int], "array", id="list[int]"),
+        pytest.param(Optional[List[int]], "array", id="optional[list[int]]"),
+    ],
+)
+def test_description_building(arg_typ: type, oai_type: str):
+    fi = FunctionInfo(
+        name="foo",
+        description="foo",
+        auto_retry=False,
+        callable=lambda: None,
+        arguments={
+            "arg": FunctionArgInfo(
+                name="foo",
+                description="foo",
+                type=arg_typ,
+                default=_empty,
+                choices=(),
+            ),
+        },
+    )
+    assert (
+        _oai_api.build_oai_function_description(fi)["function"]["parameters"][
+            "properties"
+        ]["foo"]["type"]
+        == oai_type
+    )
diff --git a/tests/test_create_func.py b/tests/test_create_func.py
new file mode 100644
index 000000000..97583fb36
--- /dev/null
+++ b/tests/test_create_func.py
@@ -0,0 +1,209 @@
+import enum
+from inspect import _empty
+from typing import Annotated, List, Optional
+
+import pytest
+from livekit.agents import llm
+from livekit.plugins.openai import _oai_api
+
+
+def test_func_basic():
+    class TestFunctionContext(llm.FunctionContext):
+        @llm.ai_callable(name="test_function", description="A simple test function")
+        def test_fn(
+            self, param: Annotated[str, llm.TypeInfo(description="A string parameter")]
+        ):
+            pass
+
+    fnc_ctx = TestFunctionContext()
+    assert (
+        "test_function" in fnc_ctx.ai_functions
+    ), "Function should be registered in ai_functions"
+
+    fnc_info = fnc_ctx.ai_functions["test_function"]
+    build_info = _oai_api.build_oai_function_description(fnc_info)
+    assert fnc_info.name == build_info["function"]["name"]
+    assert fnc_info.description == build_info["function"]["description"]
+    assert not fnc_info.auto_retry
+    assert "param" in fnc_info.arguments
+    assert "param" in build_info["function"]["parameters"]["properties"]
+    assert "param" in build_info["function"]["parameters"]["required"]
+
+    arg_info = fnc_info.arguments["param"]
+    build_arg_info = build_info["function"]["parameters"]["properties"]["param"]
+
+    assert arg_info.name == "param"
+    assert arg_info.description == "A string parameter"
+    assert arg_info.type is str
+    assert arg_info.default is _empty
+    assert arg_info.choices == ()
+    assert build_arg_info["description"] == arg_info.description
+    assert build_arg_info["type"] == "string"
+
+
+def test_func_duplicate():
+    class TestFunctionContext(llm.FunctionContext):
+        @llm.ai_callable(name="duplicate_function")
+        def fn1(self):
+            pass
+
+        @llm.ai_callable(name="duplicate_function")
+        def fn2(self):
+            pass
+
+    with pytest.raises(
+        ValueError, match="duplicate ai_callable name: duplicate_function"
+    ):
+        TestFunctionContext()
+
+
+def test_func_with_optional_parameter():
+    class TestFunctionContext(llm.FunctionContext):
+        @llm.ai_callable(
+            name="optional_function", description="Function with optional parameter"
+        )
+        def optional_fn(
+            self,
+            param: Annotated[
+                Optional[int], llm.TypeInfo(description="An optional integer parameter")
+            ] = None,
+            param2: Optional[List[str]] = None,
+            param3: str = "A string",
+        ):
+            pass
+
+    fnc_ctx = TestFunctionContext()
+    assert (
+        "optional_function" in fnc_ctx.ai_functions
+    ), "Function should be registered in ai_functions"
+
+    fnc_info = fnc_ctx.ai_functions["optional_function"]
+    build_info = _oai_api.build_oai_function_description(fnc_info)
+    print(build_info)
+    assert fnc_info.name == build_info["function"]["name"]
+    assert fnc_info.description == build_info["function"]["description"]
+    assert "param" in fnc_info.arguments
+    assert "param2" in fnc_info.arguments
+    assert "param3" in fnc_info.arguments
+    assert "param" in build_info["function"]["parameters"]["properties"]
+    assert "param2" in build_info["function"]["parameters"]["properties"]
+    assert "param3" in build_info["function"]["parameters"]["properties"]
+    assert "param" not in build_info["function"]["parameters"]["required"]
+    assert "param2" not in build_info["function"]["parameters"]["required"]
+    assert "param3" not in build_info["function"]["parameters"]["required"]
+
+    # Check 'param'
+    arg_info = fnc_info.arguments["param"]
+    build_arg_info = build_info["function"]["parameters"]["properties"]["param"]
+
+    assert arg_info.name == "param"
+    assert arg_info.description == "An optional integer parameter"
+    assert arg_info.type == Optional[int]
+    assert arg_info.default is None
+    assert arg_info.choices == ()
+    assert build_arg_info["description"] == arg_info.description
+    assert build_arg_info["type"] == "number"
+
+    # Check 'param2'
+    arg_info = fnc_info.arguments["param2"]
+    build_arg_info = build_info["function"]["parameters"]["properties"]["param2"]
+
+    assert arg_info.name == "param2"
+    assert arg_info.description == ""
+    assert arg_info.type == Optional[List[str]]
+    assert arg_info.default is None
+    assert arg_info.choices == ()
+    assert build_arg_info["type"] == "array"
+    assert build_arg_info["items"]["type"] == "string"
+
+    # check 'param3'
+    arg_info = fnc_info.arguments["param3"]
+    build_arg_info = build_info["function"]["parameters"]["properties"]["param3"]
+
+    assert arg_info.name == "param3"
+    assert arg_info.description == ""
+    assert arg_info.type is str
+    assert arg_info.default == "A string"
+    assert arg_info.choices == ()
+    assert build_arg_info["type"] == "string"
+
+
+def test_func_with_list_parameter():
+    class TestFunctionContext(llm.FunctionContext):
+        @llm.ai_callable(
+            name="list_function", description="Function with list parameter"
+        )
+        def list_fn(
+            self,
+            items: Annotated[List[str], llm.TypeInfo(description="A list of strings")],
+        ):
+            pass
+
+    fnc_ctx = TestFunctionContext()
+    assert (
+        "list_function" in fnc_ctx.ai_functions
+    ), "Function should be registered in ai_functions"
+
+    fnc_info = fnc_ctx.ai_functions["list_function"]
+    build_info = _oai_api.build_oai_function_description(fnc_info)
+    assert fnc_info.name == build_info["function"]["name"]
+    assert fnc_info.description == build_info["function"]["description"]
+    assert not fnc_info.auto_retry
+    assert "items" in fnc_info.arguments
+    assert "items" in build_info["function"]["parameters"]["properties"]
+    assert "items" in build_info["function"]["parameters"]["required"]
+
+    arg_info = fnc_info.arguments["items"]
+    build_arg_info = build_info["function"]["parameters"]["properties"]["items"]
+
+    assert arg_info.name == "items"
+    assert arg_info.description == "A list of strings"
+    assert arg_info.type is List[str]
+    assert arg_info.default is _empty
+    assert arg_info.choices == ()
+    assert build_arg_info["description"] == arg_info.description
+    assert build_arg_info["type"] == "array"
+    assert build_arg_info["items"]["type"] == "string"
+
+
+def test_func_with_enum_parameter():
+    class Status(enum.Enum):
+        ACTIVE = "active"
+        INACTIVE = "inactive"
+        PENDING = "pending"
+
+    class TestFunctionContext(llm.FunctionContext):
+        @llm.ai_callable(
+            name="enum_function", description="Function with enum parameter"
+        )
+        def enum_fn(
+            self,
+            status: Annotated[Status, llm.TypeInfo(description="Status of the entity")],
+        ):
+            pass
+
+    fnc_ctx = TestFunctionContext()
+    assert (
+        "enum_function" in fnc_ctx.ai_functions
+    ), "Function should be registered in ai_functions"
+
+    fnc_info = fnc_ctx.ai_functions["enum_function"]
+    build_info = _oai_api.build_oai_function_description(fnc_info)
+    assert fnc_info.name == build_info["function"]["name"]
+    assert fnc_info.description == build_info["function"]["description"]
+    assert not fnc_info.auto_retry
+    assert "status" in fnc_info.arguments
+    assert "status" in build_info["function"]["parameters"]["properties"]
+    assert "status" in build_info["function"]["parameters"]["required"]
+
+    arg_info = fnc_info.arguments["status"]
+    build_arg_info = build_info["function"]["parameters"]["properties"]["status"]
+
+    assert arg_info.name == "status"
+    assert arg_info.description == "Status of the entity"
+    assert arg_info.type is str  # Enum values are converted to their underlying type
+    assert arg_info.default is _empty
+    assert arg_info.choices == ("active", "inactive", "pending")
+    assert build_arg_info["description"] == arg_info.description
+    assert build_arg_info["type"] == "string"
+    assert build_arg_info["enum"] == arg_info.choices

From c6e9fa87e72fd08cfc76b5efabd06492ba609b51 Mon Sep 17 00:00:00 2001
From: David Zhao
Date: Tue, 17 Dec 2024 08:35:22 -0800
Subject: [PATCH 05/46] fix azure stt language autodetection (#1246)

---
 .changeset/twenty-dragons-shave.md | 5 +++++
 .../livekit/plugins/azure/stt.py | 2 +-
 tests/test_decoder.py | 11 ++++++++++-
 3 files changed, 16 insertions(+), 2 deletions(-)
 create mode 100644 .changeset/twenty-dragons-shave.md

diff --git a/.changeset/twenty-dragons-shave.md b/.changeset/twenty-dragons-shave.md
new file mode 100644
index 000000000..ceaa8890c
--- /dev/null
+++ b/.changeset/twenty-dragons-shave.md
@@ -0,0 +1,5 @@
+---
+"livekit-plugins-azure": patch
+---
+
+fix azure stt language autodetection
diff --git a/livekit-plugins/livekit-plugins-azure/livekit/plugins/azure/stt.py b/livekit-plugins/livekit-plugins-azure/livekit/plugins/azure/stt.py
index d705a7f2c..309cc9c5c 100644
--- a/livekit-plugins/livekit-plugins-azure/livekit/plugins/azure/stt.py
+++ b/livekit-plugins/livekit-plugins-azure/livekit/plugins/azure/stt.py
@@ -330,7 +330,7 @@ def _create_speech_recognizer(
     )

     auto_detect_source_language_config = None
-    if config.languages and len(config.languages) > 1:
+    if config.languages and len(config.languages) >= 1:
         auto_detect_source_language_config = (
             speechsdk.languageconfig.AutoDetectSourceLanguageConfig(
                 languages=config.languages
diff --git a/tests/test_decoder.py b/tests/test_decoder.py
index c5ecacce8..10b5b521d 100644
--- a/tests/test_decoder.py
+++ b/tests/test_decoder.py
@@ -90,29 +90,37 @@ def reader():


 def test_stream_buffer_large_chunks():
+    import hashlib
+
     buffer = StreamBuffer()
-    large_chunk = b"x" * 1024 * 1024  # 1MB chunk
+    large_chunk = os.urandom(1024 * 1024)  # 1MB of random bytes
     num_chunks = 5
     total_size = 0
     write_completed = threading.Event()
+    input_hasher = hashlib.sha256()

     def writer():
         nonlocal total_size
         for _ in range(num_chunks):
             buffer.write(large_chunk)
             total_size += len(large_chunk)
+            input_hasher.update(large_chunk)
         buffer.end_input()
         write_completed.set()

     received_size = 0
+    output_hasher = hashlib.sha256()

     def reader():
         nonlocal received_size
+        # allow writer to start first
+        time.sleep(1)
         while True:
             chunk = buffer.read(8192)  # Read in 8KB chunks
             if not chunk:
                 break
             received_size += len(chunk)
+            output_hasher.update(chunk)

     # Run writer and reader in separate threads
     with ThreadPoolExecutor(max_workers=2) as executor:
@@ -125,6 +133,7 @@ def reader():

     assert received_size == total_size
     assert total_size == num_chunks * len(large_chunk)
+    assert input_hasher.hexdigest() == output_hasher.hexdigest()


 def test_stream_buffer_early_close():

From 8bc8d14b3650844a14a906184e085bb316ceac49 Mon Sep 17 00:00:00 2001
From: David Zhao
Date: Tue, 17 Dec 2024 08:35:38 -0800
Subject: [PATCH 06/46] Include instructions on enabling Google APIs (#1243)

---
 examples/voice-pipeline-agent/gemini_voice_agent.py | 6 +++++-
 livekit-plugins/livekit-plugins-google/README.md | 5 +++++
 2 files changed, 10 insertions(+), 1 deletion(-)

diff --git a/examples/voice-pipeline-agent/gemini_voice_agent.py b/examples/voice-pipeline-agent/gemini_voice_agent.py
index 5b3d62171..bb3641c6b 100644
--- a/examples/voice-pipeline-agent/gemini_voice_agent.py
+++ b/examples/voice-pipeline-agent/gemini_voice_agent.py
@@ -27,7 +27,11 @@ def prewarm(proc: JobProcess):
 # 2. save your service account credentials and set the following environments:
 #    * GOOGLE_APPLICATION_CREDENTIALS to the path of the service account key file
 #    * GOOGLE_CLOUD_PROJECT to your Google Cloud project ID
-#
+# 3. ensure the following services are enabled on your Google Cloud project:
+#    * Vertex AI
+#    * Cloud Speech-to-Text API
+#    * Cloud Text-to-Speech API
+
 # Read more about authentication with Google: https://cloud.google.com/docs/authentication/application-default-credentials
diff --git a/livekit-plugins/livekit-plugins-google/README.md b/livekit-plugins/livekit-plugins-google/README.md
index b0fffb41e..383fe1a62 100644
--- a/livekit-plugins/livekit-plugins-google/README.md
+++ b/livekit-plugins/livekit-plugins-google/README.md
@@ -11,3 +11,8 @@ pip install livekit-plugins-google
 ## Pre-requisites

 For credentials, you'll need a Google Cloud account and obtain the correct credentials. Credentials can be passed directly or via Application Default Credentials as specified in [How Application Default Credentials works](https://cloud.google.com/docs/authentication/application-default-credentials).
+
+To use the STT and TTS APIs, you'll need to enable the respective services for your Google Cloud project.
+
+- Cloud Speech-to-Text API
+- Cloud Text-to-Speech API

From c7efb63ff4f83da3fd5d0fde8207410ad977696b Mon Sep 17 00:00:00 2001
From: Wills Manley <48636156+willsmanley@users.noreply.github.com>
Date: Tue, 17 Dec 2024 17:59:21 -0600
Subject: [PATCH 07/46] added cached_token_details to multimodalllmmetrics (#1248)

---
 livekit-agents/livekit/agents/metrics/base.py | 6 ++++++
 .../livekit/plugins/openai/realtime/realtime_model.py | 4 ++++
 2 files changed, 10 insertions(+)

diff --git a/livekit-agents/livekit/agents/metrics/base.py b/livekit-agents/livekit/agents/metrics/base.py
index 78d09e4f2..d524b02b8 100644
--- a/livekit-agents/livekit/agents/metrics/base.py
+++ b/livekit-agents/livekit/agents/metrics/base.py
@@ -108,11 +108,17 @@ class MultimodalLLMError(Error):

 @dataclass
 class MultimodalLLMMetrics(LLMMetrics):
+    @dataclass
+    class CachedTokenDetails:
+        text_tokens: int
+        audio_tokens: int
+
     @dataclass
     class InputTokenDetails:
         cached_tokens: int
         text_tokens: int
         audio_tokens: int
+        cached_tokens_details: MultimodalLLMMetrics.CachedTokenDetails

     @dataclass
     class OutputTokenDetails:
diff --git a/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/realtime/realtime_model.py b/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/realtime/realtime_model.py
index 06e6930a8..c7d2a5d5f 100644
--- a/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/realtime/realtime_model.py
+++ b/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/realtime/realtime_model.py
@@ -1568,6 +1568,10 @@ def _handle_response_done(self, response_done: api_proto.ServerEvent.ResponseDon
                 audio_tokens=usage.get("input_token_details", {}).get(
                     "audio_tokens", 0
                 ),
+                cached_tokens_details=MultimodalLLMMetrics.CachedTokenDetails(
+                    text_tokens=usage.get("input_token_details", {}).get("cached_tokens_details", {}).get("text_tokens", 0),
+                    audio_tokens=usage.get("input_token_details", {}).get("cached_tokens_details", {}).get("audio_tokens", 0),
+                ),
             ),
             output_token_details=MultimodalLLMMetrics.OutputTokenDetails(
                 text_tokens=usage.get("output_token_details", {}).get("text_tokens", 0),
From 4bce8dbd6628e7d30f50750287e6ad81a5a84af3 Mon Sep 17 00:00:00 2001
From: David Zhao
Date: Tue, 17 Dec 2024 17:01:52 -0800
Subject: [PATCH 08/46] updated default realtime model to gpt-4o-realtime-preview-2024-12-17 (#1250)

---
 .changeset/thin-carpets-thank.md | 5 +++
 .github/workflows/tests.yml | 2 +-
 .../function_calling_weather.py | 34 +++++++++++--------
 .../plugins/openai/realtime/api_proto.py | 16 +++++++++
 .../plugins/openai/realtime/realtime_model.py | 20 ++++++-----
 5 files changed, 54 insertions(+), 23 deletions(-)
 create mode 100644 .changeset/thin-carpets-thank.md

diff --git a/.changeset/thin-carpets-thank.md b/.changeset/thin-carpets-thank.md
new file mode 100644
index 000000000..809ac6fa5
--- /dev/null
+++ b/.changeset/thin-carpets-thank.md
@@ -0,0 +1,5 @@
+---
+"livekit-plugins-openai": patch
+---
+
+update default realtime model to gpt-4o-realtime-preview-2024-12-17
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 2da4754b0..d2a26cbf2 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -117,7 +117,7 @@

           case "${{ matrix.test_group }}" in
             base)
-              test_files="test_aio.py test_tokenizer.py test_vad.py test_ipc.py test_tts_fallback.py test_stt_fallback.py test_message_change.py test_decoder.py test_build_func_desc.py test_create_func.py"
+              test_files="test_aio.py test_tokenizer.py test_vad.py test_ipc.py test_tts_fallback.py test_stt_fallback.py test_message_change.py test_build_func_desc.py test_create_func.py"
               ;;
             llm)
               test_files="test_llm.py"
diff --git a/examples/voice-pipeline-agent/function_calling_weather.py b/examples/voice-pipeline-agent/function_calling_weather.py
index 4e1784ad2..e8add68d0 100644
--- a/examples/voice-pipeline-agent/function_calling_weather.py
+++ b/examples/voice-pipeline-agent/function_calling_weather.py
@@ -1,5 +1,6 @@
 import logging
 import random
+import urllib
 from typing import Annotated

 import aiohttp
@@ -34,14 +35,11 @@ async def get_weather(
     ],
 ):
     """Called when the user asks about the weather. This function will return the weather for the given location."""
-
-    # Example of a filler message while waiting for the function call to complete.
-    # NOTE: This message illustrates how the agent can engage users by using the `say()` method
-    # while awaiting the completion of the function call. To create a more dynamic and engaging
-    # interaction, consider varying the responses based on context or user input.
+    # When a function call is running, there are a couple of options to inform the user
+    # that it might take a while:
+    # Option 1: you can use .say to send a filler message immediately after the call is triggered
+    # Option 2: you can prompt the agent to return a text response when it's making a function call
     call_ctx = AgentCallContext.get_current()
-    # message = f"Let me check the weather in {location} for you."
-    message = f"Here is the weather in {location}: "
     filler_messages = [
         "Let me check the weather in {location} for you.",
         "Let me see what the weather is like in {location} right now.",
@@ -54,22 +52,25 @@ async def get_weather(
     # of the chat context of the function call for answer synthesis
     speech_handle = await call_ctx.agent.say(message, add_to_chat_ctx=True)  # noqa: F841

-    # To wait for the speech to finish
-    # await speech_handle.join()
-
-    logger.info(f"getting weather for {location}")
-    url = f"https://wttr.in/{location}?format=%C+%t"
+    logger.info(f"getting weather for {location}")
+    url = f"https://wttr.in/{urllib.parse.quote(location)}?format=%C+%t"
+    weather_data = ""
     async with aiohttp.ClientSession() as session:
         async with session.get(url) as response:
             if response.status == 200:
-                weather_data = await response.text()
                 # response from the function call is returned to the LLM
-                return f"The weather in {location} is {weather_data}."
+                weather_data = (
+                    f"The weather in {location} is {await response.text()}."
+                )
             else:
                 raise Exception(
                     f"Failed to get weather data, status code: {response.status}"
                 )

+    # To wait for the speech to finish before giving results of the function call
+    await speech_handle.join()
+    return weather_data


 def prewarm_process(proc: JobProcess):
     # preload silero VAD in memory to speed up session start
@@ -82,7 +83,11 @@ async def entrypoint(ctx: JobContext):
     initial_chat_ctx = llm.ChatContext().append(
         text=(
             "You are a weather assistant created by LiveKit. Your interface with users will be voice. "
-            "You will provide weather information for a given location."
+            "You will provide weather information for a given location. "
+            # when using option 1, you can suppress the agent's own filler response with the prompt
+            "do not say anything while waiting for the function call to complete."
+            # uncomment this to use option 2
+            # "when performing function calls, let user know that you are checking the weather."
         ),
         role="system",
     )
@@ -95,6 +100,7 @@
         fnc_ctx=fnc_ctx,
         chat_ctx=initial_chat_ctx,
    )
+
     # Start the assistant. This will automatically publish a microphone track and listen to the participant.
     agent.start(ctx.room, participant)
     await agent.say(
diff --git a/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/realtime/api_proto.py b/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/realtime/api_proto.py
index 506add5ef..2bf9778d3 100644
--- a/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/realtime/api_proto.py
+++ b/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/realtime/api_proto.py
@@ -27,6 +27,16 @@ class FunctionToolChoice(TypedDict):
     "in_progress", "completed", "incomplete", "cancelled", "failed"
 ]

+# https://platform.openai.com/docs/models/gp#gpt-4o-realtime
+OpenAIModel = Literal[
+    "gpt-4o-realtime-preview",
+    "gpt-4o-realtime-preview-2024-10-01",
+    "gpt-4o-realtime-preview-2024-12-17",
+    "gpt-4o-mini-realtime-preview",
+    "gpt-4o-mini-realtime-preview-2024-12-17",
+]
+DefaultOpenAIModel = "gpt-4o-realtime-preview"
+

 class TextContent(TypedDict):
     type: Literal["text"]
@@ -145,6 +155,12 @@ class InputTokenDetails(TypedDict):
     cached_tokens: int
     text_tokens: int
     audio_tokens: int
+    cached_tokens_details: CachedTokenDetails
+
+
+class CachedTokenDetails(TypedDict):
+    text_tokens: int
+    audio_tokens: int


 class OutputTokenDetails(TypedDict):
diff --git a/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/realtime/realtime_model.py b/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/realtime/realtime_model.py
index c7d2a5d5f..83b2cbfa6 100644
--- a/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/realtime/realtime_model.py
+++ b/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/realtime/realtime_model.py
@@ -152,7 +152,7 @@ class RealtimeError:

 @dataclass
 class _ModelOptions:
-    model: str | None
+    model: api_proto.OpenAIModel | str
     modalities: list[api_proto.Modality]
     instructions: str
     voice: api_proto.Voice
     input_audio_format: api_proto.AudioFormat
@@ -182,6 +182,7 @@ class _ContentPtr(TypedDict):
     prefix_padding_ms=300,
     silence_duration_ms=500,
 )
+
 DEFAULT_INPUT_AUDIO_TRANSCRIPTION = InputTranscriptionOptions(model="whisper-1")
@@ -192,7 +193,7 @@
         *,
         instructions: str = "",
         modalities: list[api_proto.Modality] = ["text", "audio"],
-        model: str = "gpt-4o-realtime-preview-2024-10-01",
+        model: api_proto.OpenAIModel | str = api_proto.DefaultOpenAIModel,
         voice: api_proto.Voice = "alloy",
         input_audio_format: api_proto.AudioFormat = "pcm16",
         output_audio_format: api_proto.AudioFormat = "pcm16",
@@ -235,7 +236,7 @@
         *,
         instructions: str = "",
         modalities: list[api_proto.Modality] = ["text", "audio"],
-        model: str | None = "gpt-4o-realtime-preview-2024-10-01",
+        model: api_proto.OpenAIModel | str = api_proto.DefaultOpenAIModel,
         voice: api_proto.Voice = "alloy",
         input_audio_format: api_proto.AudioFormat = "pcm16",
         output_audio_format: api_proto.AudioFormat = "pcm16",
@@ -1548,6 +1549,7 @@ def _handle_response_done(self, response_done: api_proto.ServerEvent.ResponseDon
         duration = time.time() - response._created_timestamp

         usage = response.usage or {}  # type: ignore
+        input_token_details = usage.get("input_token_details", {})
         metrics = MultimodalLLMMetrics(
             timestamp=response._created_timestamp,
             request_id=response.id,
@@ -1561,16 +1563,18 @@ def _handle_response_done(self, response_done: api_proto.ServerEvent.ResponseDon
             tokens_per_second=usage.get("output_tokens", 0) / duration,
             error=metrics_error,
             input_token_details=MultimodalLLMMetrics.InputTokenDetails(
-                cached_tokens=usage.get("input_token_details", {}).get(
-                    "cached_tokens", 0
-                ),
+                cached_tokens=input_token_details.get("cached_tokens", 0),
                 text_tokens=usage.get("input_token_details", {}).get("text_tokens", 0),
                 audio_tokens=usage.get("input_token_details", {}).get(
                     "audio_tokens", 0
                 ),
                 cached_tokens_details=MultimodalLLMMetrics.CachedTokenDetails(
-                    text_tokens=usage.get("input_token_details", {}).get("cached_tokens_details", {}).get("text_tokens", 0),
-                    audio_tokens=usage.get("input_token_details", {}).get("cached_tokens_details", {}).get("audio_tokens", 0),
+                    text_tokens=input_token_details.get(
+                        "cached_tokens_details", {}
+                    ).get("text_tokens", 0),
+                    audio_tokens=input_token_details.get(
+                        "cached_tokens_details", {}
+                    ).get("audio_tokens", 0),
                 ),
             ),
             output_token_details=MultimodalLLMMetrics.OutputTokenDetails(

From e4c3454acca1e0494a5eee04ba6f7425f685d406 Mon Sep 17 00:00:00 2001
From: Long Chen
Date: Wed, 18 Dec 2024 15:08:10 +0800
Subject: [PATCH 09/46] fix: filter out empty message for set chat ctx in realtime model (#1245)

---
 .changeset/grumpy-dancers-develop.md | 5 +++++
 .../plugins/openai/realtime/realtime_model.py | 12 +++++++-----
 2 files changed, 12 insertions(+), 5 deletions(-)
 create mode 100644 .changeset/grumpy-dancers-develop.md

diff --git a/.changeset/grumpy-dancers-develop.md b/.changeset/grumpy-dancers-develop.md
new file mode 100644
index 000000000..c5563f597
--- /dev/null
+++ b/.changeset/grumpy-dancers-develop.md
@@ -0,0 +1,5 @@
+---
+"livekit-plugins-openai": patch
+---
+
+filter out empty message for set chat ctx in realtime model
diff --git a/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/realtime/realtime_model.py b/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/realtime/realtime_model.py
index 83b2cbfa6..c99294d1a 100644
--- a/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/realtime/realtime_model.py
+++ b/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/realtime/realtime_model.py
@@ -507,10 +507,6 @@ def create(
             message_content = message.content
             tool_call_id = message.tool_call_id
-            if not tool_call_id and message_content is None:
-                # not a function call while the message content is None
-                fut.set_result(False)
-                return fut
             event: api_proto.ClientEvent.ConversationItemCreate | None = None
             if tool_call_id:
                 if message.role == "tool":
@@ -952,8 +948,14 @@ async def set_chat_ctx(self, new_ctx: llm.ChatContext) -> None:
         """
         original_ctx = self._remote_conversation_items.to_chat_context()

+        # filter out messages that are not function calls and whose content is None
+        filtered_messages = [
+            msg
+            for msg in new_ctx.messages
+            if msg.tool_call_id or msg.content is not None
+        ]
         changes = utils._compute_changes(
-            original_ctx.messages, new_ctx.messages, key_fnc=lambda x: x.id
+            original_ctx.messages, filtered_messages, key_fnc=lambda x: x.id
         )
         logger.debug(
             "sync chat context",

From 4d7a04530b3337d95a7883af254984105c86fb2a Mon Sep 17 00:00:00 2001
From: Long Chen
Date: Wed, 18 Dec 2024 18:29:36 +0800
Subject: [PATCH 10/46] fix: add session_updated event for realtime model (#1253)

---
 .changeset/real-squids-warn.md | 5 ++
 .../plugins/openai/realtime/__init__.py | 2 +
 .../plugins/openai/realtime/realtime_model.py | 72 +++++++++++++++----
 3 files changed, 67 insertions(+), 12 deletions(-)
 create mode 100644 .changeset/real-squids-warn.md

diff --git a/.changeset/real-squids-warn.md b/.changeset/real-squids-warn.md
new file mode 100644
index 000000000..43c5d096d
--- /dev/null
+++ b/.changeset/real-squids-warn.md
@@ -0,0 +1,5 @@
+---
+"livekit-plugins-openai": patch
+---
+
+add session_updated event for RealtimeSession
diff --git a/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/realtime/__init__.py b/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/realtime/__init__.py
index ac9b866d6..471deef37 100644
--- a/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/realtime/__init__.py
+++ b/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/realtime/__init__.py
@@ -11,6 +11,7 @@
     RealtimeOutput,
     RealtimeResponse,
     RealtimeSession,
+    RealtimeSessionOptions,
     RealtimeToolCall,
     ServerVadOptions,
 )
@@ -25,6 +26,7 @@
     "RealtimeSession",
     "RealtimeModel",
     "RealtimeError",
+    "RealtimeSessionOptions",
    "ServerVadOptions",
     "InputTranscriptionOptions",
     "ConversationItemCreated",
diff --git a/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/realtime/realtime_model.py b/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/realtime/realtime_model.py
index c99294d1a..04bf14ac5 100644
--- a/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/realtime/realtime_model.py
+++ b/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/realtime/realtime_model.py
@@ -21,6 +21,7 @@

 EventTypes = Literal[
     "start_session",
+    "session_updated",
     "error",
     "input_speech_started",
     "input_speech_stopped",
@@ -151,18 +152,22 @@ class RealtimeError:

 @dataclass
-class _ModelOptions:
+class RealtimeSessionOptions:
     model: api_proto.OpenAIModel | str
     modalities: list[api_proto.Modality]
     instructions: str
     voice: api_proto.Voice
     input_audio_format: api_proto.AudioFormat
     output_audio_format: api_proto.AudioFormat
-    input_audio_transcription: InputTranscriptionOptions
-    turn_detection: ServerVadOptions
+    input_audio_transcription: InputTranscriptionOptions | None
+    turn_detection: ServerVadOptions | None
     tool_choice: api_proto.ToolChoice
     temperature: float
     max_response_output_tokens: int | Literal["inf"]
+
+
+@dataclass
+class _ModelOptions(RealtimeSessionOptions):
     api_key: str | None
     base_url: str
     entra_token: str | None
@@ -897,12 +902,19 @@ def session_update(
                 function_data["type"] = "function"
                 tools.append(function_data)

-        server_vad_opts: api_proto.ServerVad = {
-            "type": "server_vad",
-            "threshold": self._opts.turn_detection.threshold,
-            "prefix_padding_ms": self._opts.turn_detection.prefix_padding_ms,
-            "silence_duration_ms": self._opts.turn_detection.silence_duration_ms,
-        }
+        server_vad_opts: api_proto.ServerVad | None = None
+        if self._opts.turn_detection is not None:
+            server_vad_opts = {
+                "type": "server_vad",
+                "threshold": self._opts.turn_detection.threshold,
+                "prefix_padding_ms": self._opts.turn_detection.prefix_padding_ms,
+                "silence_duration_ms": self._opts.turn_detection.silence_duration_ms,
+            }
+        input_audio_transcription_opts: api_proto.InputAudioTranscription | None = None
+        if self._opts.input_audio_transcription is not None:
+            input_audio_transcription_opts = {
+                "model": self._opts.input_audio_transcription.model,
+            }

         session_data: api_proto.ClientEvent.SessionUpdateData = {
             "modalities": self._opts.modalities,
             "instructions": self._opts.instructions,
             "voice": self._opts.voice,
             "input_audio_format": self._opts.input_audio_format,
             "output_audio_format": self._opts.output_audio_format,
             "input_audio_transcription": {
                 "model":
self._opts.input_audio_transcription.model, - }, + "input_audio_transcription": input_audio_transcription_opts, "turn_detection": server_vad_opts, "tools": tools, "tool_choice": self._opts.tool_choice, @@ -1105,6 +1115,8 @@ async def _recv_task(): if event == "session.created": self._handle_session_created(data) + if event == "session.updated": + self._handle_session_updated(data) elif event == "error": self._handle_error(data) elif event == "input_audio_buffer.speech_started": @@ -1173,6 +1185,42 @@ def _handle_session_created( ): self._session_id = session_created["session"]["id"] + def _handle_session_updated( + self, session_updated: api_proto.ServerEvent.SessionUpdated + ): + session = session_updated["session"] + if session["turn_detection"] is None: + turn_detection = None + else: + turn_detection = ServerVadOptions( + threshold=session["turn_detection"]["threshold"], + prefix_padding_ms=session["turn_detection"]["prefix_padding_ms"], + silence_duration_ms=session["turn_detection"]["silence_duration_ms"], + ) + if session["input_audio_transcription"] is None: + input_audio_transcription = None + else: + input_audio_transcription = InputTranscriptionOptions( + model=session["input_audio_transcription"]["model"], + ) + + self.emit( + "session_updated", + RealtimeSessionOptions( + model=session["model"], + modalities=session["modalities"], + instructions=session["instructions"], + voice=session["voice"], + input_audio_format=session["input_audio_format"], + output_audio_format=session["output_audio_format"], + input_audio_transcription=input_audio_transcription, + turn_detection=turn_detection, + tool_choice=session["tool_choice"], + temperature=session["temperature"], + max_response_output_tokens=session["max_response_output_tokens"], + ), + ) + def _handle_error(self, error: api_proto.ServerEvent.Error): logger.error( "OpenAI S2S error %s", From 7e8c08986bc4966d421532c3022d26e23b2b4445 Mon Sep 17 00:00:00 2001 From: Ben Cherry Date: Wed, 18 Dec 2024 09:57:07 -0800 Subject: [PATCH 11/46] Add JPEG quality param to image encoder (#1249) --- .changeset/gorgeous-days-retire.md | 5 +++ .../livekit/agents/llm/chat_context.py | 19 ++++++++-- .../livekit/agents/utils/images/image.py | 37 +++++++++++++++---- 3 files changed, 51 insertions(+), 10 deletions(-) create mode 100644 .changeset/gorgeous-days-retire.md diff --git a/.changeset/gorgeous-days-retire.md b/.changeset/gorgeous-days-retire.md new file mode 100644 index 000000000..fa28e85a8 --- /dev/null +++ b/.changeset/gorgeous-days-retire.md @@ -0,0 +1,5 @@ +--- +"livekit-agents": patch +--- + +Add JPEG quality param to image encoder diff --git a/livekit-agents/livekit/agents/llm/chat_context.py b/livekit-agents/livekit/agents/llm/chat_context.py index 07e36d6c0..ccde86bba 100644 --- a/livekit-agents/livekit/agents/llm/chat_context.py +++ b/livekit-agents/livekit/agents/llm/chat_context.py @@ -32,13 +32,26 @@ class ChatImage: You may need to consult your LLM provider's documentation on supported URL types. 
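The `_handle_session_updated` hook added above re-emits the server's acknowledged configuration as a `RealtimeSessionOptions` snapshot, and the constructor now accepts the typed `OpenAIModel` literals. A minimal consumer sketch, assuming a `RealtimeSession` obtained elsewhere in your agent (the handler body and printed fields are illustrative only):

```python
from livekit.plugins.openai import realtime

# any OpenAIModel literal (or a plain string) is accepted for `model`
model = realtime.RealtimeModel(model="gpt-4o-mini-realtime-preview")


def attach_listeners(session: realtime.RealtimeSession) -> None:
    @session.on("session_updated")
    def _on_session_updated(opts: realtime.RealtimeSessionOptions) -> None:
        # turn_detection comes back as None when server VAD is disabled
        print("voice:", opts.voice, "turn detection:", opts.turn_detection)
```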
```python - # With a VideoFrame, which will be automatically converted to a data URL internally + # Pass a VideoFrame directly, which will be automatically converted to a JPEG data URL internally async for event in rtc.VideoStream(video_track): chat_image = ChatImage(image=event.frame) # this instance is now available for your ChatContext - # With a data URL - chat_image = ChatImage(image=f"data:image/jpeg;base64,{base64_encoded_image}") + # Encode your VideoFrame yourself for more control, and pass the result as a data URL (see EncodeOptions for more details) + from livekit.agents.utils.images import encode, EncodeOptions, ResizeOptions + + image_bytes = encode( + event.frame, + EncodeOptions( + format="PNG", + resize_options=ResizeOptions( + width=512, height=512, strategy="scale_aspect_fit" + ), + ), + ) + chat_image = ChatImage( + image=f"data:image/png;base64,{base64.b64encode(image_bytes).decode('utf-8')}" + ) # With an external URL chat_image = ChatImage(image="https://example.com/image.jpg") diff --git a/livekit-agents/livekit/agents/utils/images/image.py b/livekit-agents/livekit/agents/utils/images/image.py index bcc0a5b5f..dd9aac739 100644 --- a/livekit-agents/livekit/agents/utils/images/image.py +++ b/livekit-agents/livekit/agents/utils/images/image.py @@ -25,26 +25,42 @@ @dataclass class EncodeOptions: + """Options for encoding rtc.VideoFrame to portable image formats.""" + format: Literal["JPEG", "PNG"] = "JPEG" + """The format to encode the image.""" + resize_options: Optional["ResizeOptions"] = None + """Options for resizing the image.""" + + quality: Optional[int] = 75 + """Image compression quality, 0-100. Only applies to JPEG.""" @dataclass class ResizeOptions: + """Options for resizing rtc.VideoFrame as part of encoding to a portable image format.""" + width: int + """The desired resize width (in pixels)""" + height: int + """The desired height to resize the image to.""" + strategy: Literal[ - # Fit the image into the provided dimensions, with letterboxing "center_aspect_fit", - # Fill the provided dimensions, with cropping "center_aspect_cover", - # Fit the image into the provided dimensions, preserving its original aspect ratio "scale_aspect_fit", - # Fill the provided dimensions, preserving its original aspect ratio (image will be larger than the provided dimensions) "scale_aspect_cover", - # Precisely resize the image to the provided dimensions "skew", ] + """The strategy to use when resizing the image: + - center_aspect_fit: Fit the image into the provided dimensions, with letterboxing + - center_aspect_cover: Fill the provided dimensions, with cropping + - scale_aspect_fit: Fit the image into the provided dimensions, preserving its original aspect ratio + - scale_aspect_cover: Fill the provided dimensions, preserving its original aspect ratio (image will be larger than the provided dimensions) + - skew: Precisely resize the image to the provided dimensions + """ def import_pil(): @@ -57,12 +73,19 @@ def import_pil(): ) -def encode(frame: rtc.VideoFrame, options: EncodeOptions): +def encode(frame: rtc.VideoFrame, options: EncodeOptions) -> bytes: + """Encode a rtc.VideoFrame to a portable image format (JPEG or PNG). + + See EncodeOptions for more details.
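A usage sketch for the new `quality` parameter (the `frame` variable and the values chosen are illustrative): lowering `quality` shrinks JPEG output at some cost in fidelity, and PNG ignores it entirely.

```python
from livekit.agents.utils.images import EncodeOptions, ResizeOptions, encode

# assumes `frame` is an rtc.VideoFrame, e.g. taken from an rtc.VideoStream
jpeg_bytes = encode(
    frame,
    EncodeOptions(
        format="JPEG",
        quality=50,  # 0-100; applies to JPEG only
        resize_options=ResizeOptions(
            width=1024, height=1024, strategy="scale_aspect_fit"
        ),
    ),
)
```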
+ """ import_pil() img = _image_from_frame(frame) resized = _resize_image(img, options) buffer = io.BytesIO() - resized.save(buffer, options.format) + kwargs = {} + if options.format == "JPEG" and options.quality is not None: + kwargs["quality"] = options.quality + resized.save(buffer, options.format, **kwargs) buffer.seek(0) return buffer.read() From e32278b9c31f62f56225382ad636b508f5948d52 Mon Sep 17 00:00:00 2001 From: jerad fields Date: Thu, 19 Dec 2024 00:21:28 -0600 Subject: [PATCH 12/46] use plain onnxruntime for turn detector, remove pytorch (#1257) --- .changeset/yellow-kings-hear.md | 5 +++ .../livekit/plugins/turn_detector/__init__.py | 7 ++-- .../livekit/plugins/turn_detector/eou.py | 36 ++++++++++++------- .../livekit-plugins-turn-detector/setup.py | 5 ++- 4 files changed, 33 insertions(+), 20 deletions(-) create mode 100644 .changeset/yellow-kings-hear.md diff --git a/.changeset/yellow-kings-hear.md b/.changeset/yellow-kings-hear.md new file mode 100644 index 000000000..582956a37 --- /dev/null +++ b/.changeset/yellow-kings-hear.md @@ -0,0 +1,5 @@ +--- +"livekit-plugins-turn-detector": patch +--- + +use onnxruntime for turn detection and remove pytorch dependency diff --git a/livekit-plugins/livekit-plugins-turn-detector/livekit/plugins/turn_detector/__init__.py b/livekit-plugins/livekit-plugins-turn-detector/livekit/plugins/turn_detector/__init__.py index 32692361a..54d7a90af 100644 --- a/livekit-plugins/livekit-plugins-turn-detector/livekit/plugins/turn_detector/__init__.py +++ b/livekit-plugins/livekit-plugins-turn-detector/livekit/plugins/turn_detector/__init__.py @@ -27,15 +27,12 @@ def __init__(self): super().__init__(__name__, __version__, __package__, logger) def download_files(self) -> None: - from optimum.onnxruntime import ORTModelForCausalLM from transformers import AutoTokenizer - from .eou import HG_MODEL + from .eou import HG_MODEL, ONNX_FILENAME, _download_from_hf_hub - ORTModelForCausalLM.from_pretrained( - HG_MODEL, use_cache=False, use_io_binding=False - ) AutoTokenizer.from_pretrained(HG_MODEL) + _download_from_hf_hub(HG_MODEL, ONNX_FILENAME) Plugin.register_plugin(EOUPlugin()) diff --git a/livekit-plugins/livekit-plugins-turn-detector/livekit/plugins/turn_detector/eou.py b/livekit-plugins/livekit-plugins-turn-detector/livekit/plugins/turn_detector/eou.py index afbc09415..acb915ab5 100644 --- a/livekit-plugins/livekit-plugins-turn-detector/livekit/plugins/turn_detector/eou.py +++ b/livekit-plugins/livekit-plugins-turn-detector/livekit/plugins/turn_detector/eou.py @@ -13,10 +13,18 @@ from .log import logger HG_MODEL = "livekit/turn-detector" +ONNX_FILENAME = "model_quantized.onnx" PUNCS = string.punctuation.replace("'", "") MAX_HISTORY = 4 +def _download_from_hf_hub(repo_id, filename, **kwargs): + from huggingface_hub import hf_hub_download + + local_path = hf_hub_download(repo_id=repo_id, filename=filename, **kwargs) + return local_path + + def _softmax(logits: np.ndarray) -> np.ndarray: exp_logits = np.exp(logits - np.max(logits)) return exp_logits / np.sum(exp_logits) @@ -55,17 +63,18 @@ def _format_chat_ctx(self, chat_ctx: dict): return text def initialize(self) -> None: + import onnxruntime as ort from huggingface_hub import errors - from optimum.onnxruntime import ORTModelForCausalLM from transformers import AutoTokenizer try: - self._model = ORTModelForCausalLM.from_pretrained( - HG_MODEL, - local_files_only=True, - use_io_binding=False, - use_cache=False, + local_path_onnx = _download_from_hf_hub( + HG_MODEL, ONNX_FILENAME, local_files_only=True + ) 
+ self._session = ort.InferenceSession( + local_path_onnx, providers=["CPUExecutionProvider"] ) + self._tokenizer = AutoTokenizer.from_pretrained( HG_MODEL, local_files_only=True ) @@ -94,13 +103,17 @@ def run(self, data: bytes) -> bytes | None: inputs = self._tokenizer( text, add_special_tokens=False, - return_tensors="pt", + return_tensors="np", ) - outputs = self._model(**inputs) - logits = outputs.logits[0, -1, :].detach().numpy() - output_probs = _softmax(logits) - eou_probability = output_probs[self._eou_index] + input_dict = {"input_ids": inputs["input_ids"]} + + # Run inference + outputs = self._session.run(["logits"], input_dict) + + logits = outputs[0][0, -1, :] + probs = _softmax(logits) + eou_probability = probs[self._eou_index] end_time = time.perf_counter() @@ -112,7 +125,6 @@ def run(self, data: bytes) -> bytes | None: "duration": round(end_time - start_time, 3), }, ) - return json.dumps({"eou_probability": float(eou_probability)}).encode() diff --git a/livekit-plugins/livekit-plugins-turn-detector/setup.py b/livekit-plugins/livekit-plugins-turn-detector/setup.py index b26b8e536..7b9b4b192 100644 --- a/livekit-plugins/livekit-plugins-turn-detector/setup.py +++ b/livekit-plugins/livekit-plugins-turn-detector/setup.py @@ -51,10 +51,9 @@ python_requires=">=3.9.0", install_requires=[ "livekit-agents>=0.11", - "transformers>=4.46", + "transformers>=4.47.1", "numpy>=1.26", - "torch>=2.5.1", - "optimum[onnxruntime]>=1.23.3", + "onnxruntime>=1.18", ], package_data={"livekit.plugins.turn_detector": ["py.typed"]}, project_urls={ From c57b4ccfb8fe65f146949771ca068cb56a380b28 Mon Sep 17 00:00:00 2001 From: Jayesh Parmar <60539217+jayeshp19@users.noreply.github.com> Date: Fri, 20 Dec 2024 13:42:45 +0530 Subject: [PATCH 13/46] Move `create_ai_function_info` to function_context.py (#1260) --- .changeset/clever-lies-explode.md | 7 ++ livekit-agents/livekit/agents/llm/__init__.py | 2 + .../livekit/agents/llm/function_context.py | 94 ++++++++++++++++++ .../livekit/plugins/anthropic/llm.py | 95 +----------------- .../livekit/plugins/assemblyai/stt.py | 2 + .../livekit/plugins/openai/_oai_api.py | 96 +------------------ .../livekit/plugins/openai/llm.py | 9 +- .../plugins/openai/realtime/realtime_model.py | 5 +- 8 files changed, 116 insertions(+), 194 deletions(-) create mode 100644 .changeset/clever-lies-explode.md diff --git a/.changeset/clever-lies-explode.md b/.changeset/clever-lies-explode.md new file mode 100644 index 000000000..1bf7ea69d --- /dev/null +++ b/.changeset/clever-lies-explode.md @@ -0,0 +1,7 @@ +--- +"livekit-plugins-anthropic": patch +"livekit-plugins-openai": patch +"livekit-agents": patch +--- + +Moved create_ai_function_info to function_context.py for better reusability and reduce repetation diff --git a/livekit-agents/livekit/agents/llm/__init__.py b/livekit-agents/livekit/agents/llm/__init__.py index acc5b0ce6..d3a06f520 100644 --- a/livekit-agents/livekit/agents/llm/__init__.py +++ b/livekit-agents/livekit/agents/llm/__init__.py @@ -15,6 +15,7 @@ FunctionContext, FunctionInfo, TypeInfo, + _create_ai_function_info, ai_callable, ) from .llm import ( @@ -54,4 +55,5 @@ "FallbackAdapter", "AvailabilityChangedEvent", "ToolChoice", + "_create_ai_function_info", ] diff --git a/livekit-agents/livekit/agents/llm/function_context.py b/livekit-agents/livekit/agents/llm/function_context.py index aa4df9842..4470492fe 100644 --- a/livekit-agents/livekit/agents/llm/function_context.py +++ b/livekit-agents/livekit/agents/llm/function_context.py @@ -18,6 +18,7 @@ import enum import 
functools import inspect +import json import types import typing from dataclasses import dataclass @@ -303,3 +304,96 @@ def _is_optional_type(typ) -> Tuple[bool, Any]: return True, non_none_args[0] return False, None + + +def _create_ai_function_info( + fnc_ctx: FunctionContext, + tool_call_id: str, + fnc_name: str, + raw_arguments: str, # JSON string +) -> FunctionCallInfo: + if fnc_name not in fnc_ctx.ai_functions: + raise ValueError(f"AI function {fnc_name} not found") + + parsed_arguments: dict[str, Any] = {} + try: + if raw_arguments: # ignore empty string + parsed_arguments = json.loads(raw_arguments) + except json.JSONDecodeError: + raise ValueError( + f"AI function {fnc_name} received invalid JSON arguments - {raw_arguments}" + ) + + fnc_info = fnc_ctx.ai_functions[fnc_name] + + # Ensure all necessary arguments are present and of the correct type. + sanitized_arguments: dict[str, Any] = {} + for arg_info in fnc_info.arguments.values(): + if arg_info.name not in parsed_arguments: + if arg_info.default is inspect.Parameter.empty: + raise ValueError( + f"AI function {fnc_name} missing required argument {arg_info.name}" + ) + continue + + arg_value = parsed_arguments[arg_info.name] + is_optional, inner_th = _is_optional_type(arg_info.type) + + if typing.get_origin(inner_th) is not None: + if not isinstance(arg_value, list): + raise ValueError( + f"AI function {fnc_name} argument {arg_info.name} should be a list" + ) + + inner_type = typing.get_args(inner_th)[0] + sanitized_value = [ + _sanitize_primitive( + value=v, + expected_type=inner_type, + choices=arg_info.choices, + ) + for v in arg_value + ] + else: + sanitized_value = _sanitize_primitive( + value=arg_value, + expected_type=inner_th, + choices=arg_info.choices, + ) + + sanitized_arguments[arg_info.name] = sanitized_value + + return FunctionCallInfo( + tool_call_id=tool_call_id, + raw_arguments=raw_arguments, + function_info=fnc_info, + arguments=sanitized_arguments, + ) + + +def _sanitize_primitive( + *, value: Any, expected_type: type, choices: tuple | None +) -> Any: + if expected_type is str: + if not isinstance(value, str): + raise ValueError(f"expected str, got {type(value)}") + elif expected_type in (int, float): + if not isinstance(value, (int, float)): + raise ValueError(f"expected number, got {type(value)}") + + if expected_type is int: + if value % 1 != 0: + raise ValueError("expected int, got float") + + value = int(value) + elif expected_type is float: + value = float(value) + + elif expected_type is bool: + if not isinstance(value, bool): + raise ValueError(f"expected bool, got {type(value)}") + + if choices and value not in choices: + raise ValueError(f"invalid value {value}, not in {choices}") + + return value diff --git a/livekit-plugins/livekit-plugins-anthropic/livekit/plugins/anthropic/llm.py b/livekit-plugins/livekit-plugins-anthropic/livekit/plugins/anthropic/llm.py index 9678c9381..69b468d23 100644 --- a/livekit-plugins/livekit-plugins-anthropic/livekit/plugins/anthropic/llm.py +++ b/livekit-plugins/livekit-plugins-anthropic/livekit/plugins/anthropic/llm.py @@ -24,7 +24,6 @@ Awaitable, List, Literal, - Tuple, Union, cast, get_args, @@ -41,7 +40,10 @@ utils, ) from livekit.agents.llm import ToolChoice -from livekit.agents.llm.function_context import _is_optional_type +from livekit.agents.llm.function_context import ( + _create_ai_function_info, + _is_optional_type, +) from livekit.agents.types import DEFAULT_API_CONNECT_OPTIONS, APIConnectOptions import anthropic @@ -487,67 +489,6 @@ def 
_build_anthropic_image_content( ) -def _create_ai_function_info( - fnc_ctx: llm.function_context.FunctionContext, - tool_call_id: str, - fnc_name: str, - raw_arguments: str, # JSON string -) -> llm.function_context.FunctionCallInfo: - if fnc_name not in fnc_ctx.ai_functions: - raise ValueError(f"AI function {fnc_name} not found") - - parsed_arguments: dict[str, Any] = {} - try: - if raw_arguments: # ignore empty string - parsed_arguments = json.loads(raw_arguments) - except json.JSONDecodeError: - raise ValueError( - f"AI function {fnc_name} received invalid JSON arguments - {raw_arguments}" - ) - - fnc_info = fnc_ctx.ai_functions[fnc_name] - - # Ensure all necessary arguments are present and of the correct type. - sanitized_arguments: dict[str, Any] = {} - for arg_info in fnc_info.arguments.values(): - if arg_info.name not in parsed_arguments: - if arg_info.default is inspect.Parameter.empty: - raise ValueError( - f"AI function {fnc_name} missing required argument {arg_info.name}" - ) - continue - - arg_value = parsed_arguments[arg_info.name] - is_optional, inner_th = _is_optional_type(arg_info.type) - - if get_origin(inner_th) is not None: - if not isinstance(arg_value, list): - raise ValueError( - f"AI function {fnc_name} argument {arg_info.name} should be a list" - ) - - inner_type = get_args(inner_th)[0] - sanitized_value = [ - _sanitize_primitive( - value=v, expected_type=inner_type, choices=arg_info.choices - ) - for v in arg_value - ] - else: - sanitized_value = _sanitize_primitive( - value=arg_value, expected_type=inner_th, choices=arg_info.choices - ) - - sanitized_arguments[arg_info.name] = sanitized_value - - return llm.function_context.FunctionCallInfo( - tool_call_id=tool_call_id, - raw_arguments=raw_arguments, - function_info=fnc_info, - arguments=sanitized_arguments, - ) - - def _build_function_description( fnc_info: llm.function_context.FunctionInfo, ) -> anthropic.types.ToolParam: @@ -598,31 +539,3 @@ def type2str(t: type) -> str: "description": fnc_info.description, "input_schema": input_schema, } - - -def _sanitize_primitive( - *, value: Any, expected_type: type, choices: Tuple[Any] | None -) -> Any: - if expected_type is str: - if not isinstance(value, str): - raise ValueError(f"expected str, got {type(value)}") - elif expected_type in (int, float): - if not isinstance(value, (int, float)): - raise ValueError(f"expected number, got {type(value)}") - - if expected_type is int: - if value % 1 != 0: - raise ValueError("expected int, got float") - - value = int(value) - elif expected_type is float: - value = float(value) - - elif expected_type is bool: - if not isinstance(value, bool): - raise ValueError(f"expected bool, got {type(value)}") - - if choices and value not in choices: - raise ValueError(f"invalid value {value}, not in {choices}") - - return value diff --git a/livekit-plugins/livekit-plugins-assemblyai/livekit/plugins/assemblyai/stt.py b/livekit-plugins/livekit-plugins-assemblyai/livekit/plugins/assemblyai/stt.py index a87eaf542..acef65b6a 100644 --- a/livekit-plugins/livekit-plugins-assemblyai/livekit/plugins/assemblyai/stt.py +++ b/livekit-plugins/livekit-plugins-assemblyai/livekit/plugins/assemblyai/stt.py @@ -289,6 +289,8 @@ async def recv_task(ws: aiohttp.ClientWebSocketResponse): except Exception: logger.exception("failed to process AssemblyAI message") + ws: aiohttp.ClientWebSocketResponse | None = None + while True: try: ws = await self._connect_ws() diff --git a/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/_oai_api.py 
b/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/_oai_api.py index 8bf05a19f..8dbc3a33e 100644 --- a/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/_oai_api.py +++ b/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/_oai_api.py @@ -15,79 +15,13 @@ from __future__ import annotations import inspect -import json import typing from typing import Any from livekit.agents.llm import function_context, llm from livekit.agents.llm.function_context import _is_optional_type -__all__ = ["build_oai_function_description", "create_ai_function_info"] - - -def create_ai_function_info( - fnc_ctx: function_context.FunctionContext, - tool_call_id: str, - fnc_name: str, - raw_arguments: str, # JSON string -) -> function_context.FunctionCallInfo: - if fnc_name not in fnc_ctx.ai_functions: - raise ValueError(f"AI function {fnc_name} not found") - - parsed_arguments: dict[str, Any] = {} - try: - if raw_arguments: # ignore empty string - parsed_arguments = json.loads(raw_arguments) - except json.JSONDecodeError: - raise ValueError( - f"AI function {fnc_name} received invalid JSON arguments - {raw_arguments}" - ) - - fnc_info = fnc_ctx.ai_functions[fnc_name] - - # Ensure all necessary arguments are present and of the correct type. - sanitized_arguments: dict[str, Any] = {} - for arg_info in fnc_info.arguments.values(): - if arg_info.name not in parsed_arguments: - if arg_info.default is inspect.Parameter.empty: - raise ValueError( - f"AI function {fnc_name} missing required argument {arg_info.name}" - ) - continue - - arg_value = parsed_arguments[arg_info.name] - is_optional, inner_th = _is_optional_type(arg_info.type) - - if typing.get_origin(inner_th) is not None: - if not isinstance(arg_value, list): - raise ValueError( - f"AI function {fnc_name} argument {arg_info.name} should be a list" - ) - - inner_type = typing.get_args(inner_th)[0] - sanitized_value = [ - _sanitize_primitive( - value=v, - expected_type=inner_type, - choices=arg_info.choices, - ) - for v in arg_value - ] - else: - sanitized_value = _sanitize_primitive( - value=arg_value, - expected_type=inner_th, - choices=arg_info.choices, - ) - - sanitized_arguments[arg_info.name] = sanitized_value - - return function_context.FunctionCallInfo( - tool_call_id=tool_call_id, - raw_arguments=raw_arguments, - function_info=fnc_info, - arguments=sanitized_arguments, - ) +__all__ = ["build_oai_function_description"] def build_oai_function_description( @@ -156,31 +90,3 @@ def type2str(t: type) -> str: }, }, } - - -def _sanitize_primitive( - *, value: Any, expected_type: type, choices: tuple | None -) -> Any: - if expected_type is str: - if not isinstance(value, str): - raise ValueError(f"expected str, got {type(value)}") - elif expected_type in (int, float): - if not isinstance(value, (int, float)): - raise ValueError(f"expected number, got {type(value)}") - - if expected_type is int: - if value % 1 != 0: - raise ValueError("expected int, got float") - - value = int(value) - elif expected_type is float: - value = float(value) - - elif expected_type is bool: - if not isinstance(value, bool): - raise ValueError(f"expected bool, got {type(value)}") - - if choices and value not in choices: - raise ValueError(f"invalid value {value}, not in {choices}") - - return value diff --git a/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/llm.py b/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/llm.py index 7dfbaff24..bcff2cfa9 100644 --- a/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/llm.py 
+++ b/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/llm.py @@ -29,17 +29,14 @@ APITimeoutError, llm, ) -from livekit.agents.llm import ToolChoice +from livekit.agents.llm import ToolChoice, _create_ai_function_info from livekit.agents.types import DEFAULT_API_CONNECT_OPTIONS, APIConnectOptions import openai from openai.types.chat import ChatCompletionChunk, ChatCompletionMessageParam from openai.types.chat.chat_completion_chunk import Choice -from ._oai_api import ( - build_oai_function_description, - create_ai_function_info, -) +from ._oai_api import build_oai_function_description from .log import logger from .models import ( CerebrasChatModels, @@ -840,7 +837,7 @@ def _try_build_function(self, id: str, choice: Choice) -> llm.ChatChunk | None: ) return None - fnc_info = create_ai_function_info( + fnc_info = _create_ai_function_info( self._fnc_ctx, self._tool_call_id, self._fnc_name, self._fnc_raw_arguments ) diff --git a/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/realtime/realtime_model.py b/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/realtime/realtime_model.py index 04bf14ac5..26bc2649b 100644 --- a/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/realtime/realtime_model.py +++ b/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/realtime/realtime_model.py @@ -12,10 +12,11 @@ import aiohttp from livekit import rtc from livekit.agents import llm, utils +from livekit.agents.llm.function_context import _create_ai_function_info from livekit.agents.metrics import MultimodalLLMError, MultimodalLLMMetrics from typing_extensions import TypedDict -from .._oai_api import build_oai_function_description, create_ai_function_info +from .._oai_api import build_oai_function_description from . 
import api_proto, remote_items from .log import logger @@ -1521,7 +1522,7 @@ def _handle_response_output_item_done( item = response_output_done["item"] assert item["type"] == "function_call" - fnc_call_info = create_ai_function_info( + fnc_call_info = _create_ai_function_info( self._fnc_ctx, item["call_id"], item["name"], From 7941263f4919b46646becef71ee00bd42831440b Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Fri, 20 Dec 2024 12:33:35 -0600 Subject: [PATCH 14/46] Version Packages (#1218) Co-authored-by: github-actions[bot] --- .changeset/clever-lies-explode.md | 7 ----- .changeset/empty-sheep-pump.md | 6 ----- .changeset/famous-points-tickle.md | 5 ---- .changeset/fix-imgui-setup.md | 5 ---- .changeset/gorgeous-days-retire.md | 5 ---- .changeset/great-lizards-pump.md | 9 ------- .changeset/grumpy-dancers-develop.md | 5 ---- .changeset/loud-onions-invent.md | 5 ---- .changeset/nervous-years-sell.md | 7 ----- .changeset/real-squids-warn.md | 5 ---- .changeset/strange-snakes-hug.md | 5 ---- .changeset/thin-carpets-thank.md | 5 ---- .changeset/tiny-papayas-film.md | 9 ------- .changeset/twenty-dragons-shave.md | 5 ---- .changeset/warm-pillows-grow.md | 5 ---- .changeset/yellow-kings-hear.md | 5 ---- .../participant-entrypoint/requirements.txt | 2 +- examples/simple-color/requirements.txt | 2 +- examples/speech-to-text/requirements.txt | 4 +-- examples/text-to-speech/requirements.txt | 4 +-- .../voice-pipeline-agent/requirements.txt | 4 +-- livekit-agents/CHANGELOG.md | 20 ++++++++++++++ livekit-agents/livekit/agents/version.py | 2 +- livekit-agents/package.json | 2 +- .../livekit-plugins-anthropic/CHANGELOG.md | 16 ++++++++++++ .../livekit/plugins/anthropic/version.py | 2 +- .../livekit-plugins-anthropic/package.json | 2 +- .../livekit-plugins-azure/CHANGELOG.md | 6 +++++ .../livekit/plugins/azure/version.py | 2 +- .../livekit-plugins-azure/package.json | 2 +- .../livekit-plugins-browser/CHANGELOG.md | 6 +++++ .../livekit/plugins/browser/version.py | 2 +- .../livekit-plugins-browser/package.json | 2 +- .../livekit-plugins-deepgram/CHANGELOG.md | 8 ++++++ .../livekit/plugins/deepgram/version.py | 2 +- .../livekit-plugins-deepgram/package.json | 2 +- .../livekit-plugins-openai/CHANGELOG.md | 26 +++++++++++++++++++ .../livekit/plugins/openai/version.py | 2 +- .../livekit-plugins-openai/package.json | 2 +- .../CHANGELOG.md | 8 ++++++ .../livekit/plugins/turn_detector/version.py | 2 +- .../package.json | 2 +- 42 files changed, 112 insertions(+), 115 deletions(-) delete mode 100644 .changeset/clever-lies-explode.md delete mode 100644 .changeset/empty-sheep-pump.md delete mode 100644 .changeset/famous-points-tickle.md delete mode 100644 .changeset/fix-imgui-setup.md delete mode 100644 .changeset/gorgeous-days-retire.md delete mode 100644 .changeset/great-lizards-pump.md delete mode 100644 .changeset/grumpy-dancers-develop.md delete mode 100644 .changeset/loud-onions-invent.md delete mode 100644 .changeset/nervous-years-sell.md delete mode 100644 .changeset/real-squids-warn.md delete mode 100644 .changeset/strange-snakes-hug.md delete mode 100644 .changeset/thin-carpets-thank.md delete mode 100644 .changeset/tiny-papayas-film.md delete mode 100644 .changeset/twenty-dragons-shave.md delete mode 100644 .changeset/warm-pillows-grow.md delete mode 100644 .changeset/yellow-kings-hear.md diff --git a/.changeset/clever-lies-explode.md b/.changeset/clever-lies-explode.md deleted file mode 100644 index 1bf7ea69d..000000000 --- 
a/.changeset/clever-lies-explode.md +++ /dev/null @@ -1,7 +0,0 @@ ---- -"livekit-plugins-anthropic": patch -"livekit-plugins-openai": patch -"livekit-agents": patch ---- - -Moved create_ai_function_info to function_context.py for better reusability and reduce repetation diff --git a/.changeset/empty-sheep-pump.md b/.changeset/empty-sheep-pump.md deleted file mode 100644 index 06c854c20..000000000 --- a/.changeset/empty-sheep-pump.md +++ /dev/null @@ -1,6 +0,0 @@ ---- -"livekit-plugins-deepgram": patch -"livekit-agents": patch ---- - -added streaming audio decoder for compressed audio. diff --git a/.changeset/famous-points-tickle.md b/.changeset/famous-points-tickle.md deleted file mode 100644 index 48df9b431..000000000 --- a/.changeset/famous-points-tickle.md +++ /dev/null @@ -1,5 +0,0 @@ ---- -"livekit-plugins-openai": patch ---- - -add on_duplicate option for multimodal agent response create diff --git a/.changeset/fix-imgui-setup.md b/.changeset/fix-imgui-setup.md deleted file mode 100644 index a6e52168e..000000000 --- a/.changeset/fix-imgui-setup.md +++ /dev/null @@ -1,5 +0,0 @@ ---- -"livekit-plugins-browser": patch ---- - -fix: fix `imgui` setup diff --git a/.changeset/gorgeous-days-retire.md b/.changeset/gorgeous-days-retire.md deleted file mode 100644 index fa28e85a8..000000000 --- a/.changeset/gorgeous-days-retire.md +++ /dev/null @@ -1,5 +0,0 @@ ---- -"livekit-agents": patch ---- - -Add JPEG quality param to image encoder diff --git a/.changeset/great-lizards-pump.md b/.changeset/great-lizards-pump.md deleted file mode 100644 index a9542b8be..000000000 --- a/.changeset/great-lizards-pump.md +++ /dev/null @@ -1,9 +0,0 @@ ---- -"livekit-agents": patch -"livekit-plugins-anthropic": patch -"livekit-plugins-openai": patch ---- - -Add support for OpenAI's "detail" parameter to ChatImage - -Add support for data URLs on ChatImage in the Anthropic plugin. 
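The changeset retired just above landed OpenAI's "detail" knob on `ChatImage` (#1213). A short sketch, assuming the knob surfaces as the `inference_detail` field; the field name, URL, and value here are illustrative rather than confirmed by this patch:

```python
from livekit.agents import llm

# "high" asks OpenAI for full-resolution image analysis; "low"/"auto" also exist
image = llm.ChatImage(
    image="https://example.com/photo.jpg",
    inference_detail="high",  # assumed field name from #1213
)

chat_ctx = llm.ChatContext().append(
    role="user", text="What is shown here?", images=[image]
)
```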
diff --git a/.changeset/grumpy-dancers-develop.md b/.changeset/grumpy-dancers-develop.md deleted file mode 100644 index c5563f597..000000000 --- a/.changeset/grumpy-dancers-develop.md +++ /dev/null @@ -1,5 +0,0 @@ ---- -"livekit-plugins-openai": patch ---- - -filter out empty message for set chat ctx in realtime model diff --git a/.changeset/loud-onions-invent.md b/.changeset/loud-onions-invent.md deleted file mode 100644 index dcedf95b4..000000000 --- a/.changeset/loud-onions-invent.md +++ /dev/null @@ -1,5 +0,0 @@ ---- -"livekit-plugins-turn-detector": patch ---- - -use quantized onnx version of turn detector model diff --git a/.changeset/nervous-years-sell.md b/.changeset/nervous-years-sell.md deleted file mode 100644 index a7829fe92..000000000 --- a/.changeset/nervous-years-sell.md +++ /dev/null @@ -1,7 +0,0 @@ ---- -"livekit-plugins-anthropic": patch -"livekit-plugins-openai": patch -"livekit-agents": patch ---- - -fix: correctly parse function argument types diff --git a/.changeset/real-squids-warn.md b/.changeset/real-squids-warn.md deleted file mode 100644 index 43c5d096d..000000000 --- a/.changeset/real-squids-warn.md +++ /dev/null @@ -1,5 +0,0 @@ ---- -"livekit-plugins-openai": patch ---- - -add session_updated event for RealtimeSession diff --git a/.changeset/strange-snakes-hug.md b/.changeset/strange-snakes-hug.md deleted file mode 100644 index 1753e0133..000000000 --- a/.changeset/strange-snakes-hug.md +++ /dev/null @@ -1,5 +0,0 @@ ---- -"livekit-plugins-openai": patch ---- - -added llama 3.3 70b to model definitions diff --git a/.changeset/thin-carpets-thank.md b/.changeset/thin-carpets-thank.md deleted file mode 100644 index 809ac6fa5..000000000 --- a/.changeset/thin-carpets-thank.md +++ /dev/null @@ -1,5 +0,0 @@ ---- -"livekit-plugins-openai": patch ---- - -update default realtime model to gpt-4o-realtime-preview-2024-12-17 diff --git a/.changeset/tiny-papayas-film.md b/.changeset/tiny-papayas-film.md deleted file mode 100644 index 07ccea04c..000000000 --- a/.changeset/tiny-papayas-film.md +++ /dev/null @@ -1,9 +0,0 @@ ---- -"livekit-agents": patch -"livekit-plugins-anthropic": patch -"livekit-plugins-openai": patch ---- - -Fix center_aspect_fit bug, add scale_aspect_fit and scale_aspect_fill resizing options. - -Make scale_aspect_fit the new default resizing option for video frames. 
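Also retired here are the function-argument parsing fixes that feed the consolidated `_create_ai_function_info` helper (#1221, #1260). A minimal sketch of that helper, assuming a `FunctionContext` with one registered function; the function body, arguments, and tool-call id are illustrative:

```python
from livekit.agents import llm

fnc_ctx = llm.FunctionContext()


@fnc_ctx.ai_callable(description="Get the weather for a location")
def get_weather(location: str) -> str:
    return f"sunny in {location}"


# raw_arguments is the JSON string a provider returns for a tool call;
# the helper parses it, validates argument types, and applies defaults
call_info = llm._create_ai_function_info(
    fnc_ctx, "call_1", "get_weather", '{"location": "Tokyo"}'
)
assert call_info.arguments == {"location": "Tokyo"}
```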
diff --git a/.changeset/twenty-dragons-shave.md b/.changeset/twenty-dragons-shave.md deleted file mode 100644 index ceaa8890c..000000000 --- a/.changeset/twenty-dragons-shave.md +++ /dev/null @@ -1,5 +0,0 @@ ---- -"livekit-plugins-azure": patch ---- - -fix azure stt language autodetection diff --git a/.changeset/warm-pillows-grow.md b/.changeset/warm-pillows-grow.md deleted file mode 100644 index f0f29092a..000000000 --- a/.changeset/warm-pillows-grow.md +++ /dev/null @@ -1,5 +0,0 @@ ---- -"livekit-plugins-deepgram": patch ---- - -Support Deepgram TTS diff --git a/.changeset/yellow-kings-hear.md b/.changeset/yellow-kings-hear.md deleted file mode 100644 index 582956a37..000000000 --- a/.changeset/yellow-kings-hear.md +++ /dev/null @@ -1,5 +0,0 @@ ---- -"livekit-plugins-turn-detector": patch ---- - -use onnxruntime for turn detection and remove pytorch dependency diff --git a/examples/participant-entrypoint/requirements.txt b/examples/participant-entrypoint/requirements.txt index 5616d9626..53a52b16a 100644 --- a/examples/participant-entrypoint/requirements.txt +++ b/examples/participant-entrypoint/requirements.txt @@ -1,2 +1,2 @@ -livekit-agents>=0.12.2 +livekit-agents>=0.12.3 python-dotenv~=1.0 diff --git a/examples/simple-color/requirements.txt b/examples/simple-color/requirements.txt index 5616d9626..53a52b16a 100644 --- a/examples/simple-color/requirements.txt +++ b/examples/simple-color/requirements.txt @@ -1,2 +1,2 @@ -livekit-agents>=0.12.2 +livekit-agents>=0.12.3 python-dotenv~=1.0 diff --git a/examples/speech-to-text/requirements.txt b/examples/speech-to-text/requirements.txt index e74eccacc..53ee39eb8 100644 --- a/examples/speech-to-text/requirements.txt +++ b/examples/speech-to-text/requirements.txt @@ -1,3 +1,3 @@ -livekit-agents>=0.12.2 -livekit-plugins-deepgram>=0.6.14 +livekit-agents>=0.12.3 +livekit-plugins-deepgram>=0.6.15 python-dotenv~=1.0 diff --git a/examples/text-to-speech/requirements.txt b/examples/text-to-speech/requirements.txt index 8e983ef04..e5e0d8ddd 100644 --- a/examples/text-to-speech/requirements.txt +++ b/examples/text-to-speech/requirements.txt @@ -1,5 +1,5 @@ -livekit-agents>=0.12.2 -livekit-plugins-openai>=0.10.10 +livekit-agents>=0.12.3 +livekit-plugins-openai>=0.10.11 livekit-plugins-cartesia>=0.4.5 livekit-plugins-elevenlabs>=0.7.9 python-dotenv~=1.0 diff --git a/examples/voice-pipeline-agent/requirements.txt b/examples/voice-pipeline-agent/requirements.txt index a4cdff1ef..c8942df19 100644 --- a/examples/voice-pipeline-agent/requirements.txt +++ b/examples/voice-pipeline-agent/requirements.txt @@ -1,5 +1,5 @@ -livekit-agents>=0.12.2 -livekit-plugins-deepgram>=0.6.14 +livekit-agents>=0.12.3 +livekit-plugins-deepgram>=0.6.15 livekit-plugins-google>=0.8.1 livekit-plugins-openai[vertex]>=0.10.10 livekit-plugins-silero>=0.7.4 diff --git a/livekit-agents/CHANGELOG.md b/livekit-agents/CHANGELOG.md index 8a65f0234..83a2959c1 100644 --- a/livekit-agents/CHANGELOG.md +++ b/livekit-agents/CHANGELOG.md @@ -1,5 +1,25 @@ # livekit-agents +## 0.12.3 + +### Patch Changes + +- Moved create_ai_function_info to function_context.py for better reusability and reduced repetition - [#1260](https://github.com/livekit/agents/pull/1260) ([@jayeshp19](https://github.com/jayeshp19)) + +- added streaming audio decoder for compressed audio.
- [#1236](https://github.com/livekit/agents/pull/1236) ([@davidzhao](https://github.com/davidzhao)) + +- Add JPEG quality param to image encoder - [#1249](https://github.com/livekit/agents/pull/1249) ([@bcherry](https://github.com/bcherry)) + +- Add support for OpenAI's "detail" parameter to ChatImage - [#1213](https://github.com/livekit/agents/pull/1213) ([@bcherry](https://github.com/bcherry)) + + Add support for data URLs on ChatImage in the Anthropic plugin. + +- fix: correctly parse function argument types - [#1221](https://github.com/livekit/agents/pull/1221) ([@jayeshp19](https://github.com/jayeshp19)) + +- Fix center_aspect_fit bug, add scale_aspect_fit and scale_aspect_fill resizing options. - [#1222](https://github.com/livekit/agents/pull/1222) ([@bcherry](https://github.com/bcherry)) + + Make scale_aspect_fit the new default resizing option for video frames. + ## 0.12.2 ### Patch Changes diff --git a/livekit-agents/livekit/agents/version.py b/livekit-agents/livekit/agents/version.py index 769b5d67e..55829dea7 100644 --- a/livekit-agents/livekit/agents/version.py +++ b/livekit-agents/livekit/agents/version.py @@ -12,4 +12,4 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "0.12.2" +__version__ = "0.12.3" diff --git a/livekit-agents/package.json b/livekit-agents/package.json index 172ad6196..c23feb751 100644 --- a/livekit-agents/package.json +++ b/livekit-agents/package.json @@ -1,5 +1,5 @@ { "name": "livekit-agents", "private": true, - "version": "0.12.2" + "version": "0.12.3" } diff --git a/livekit-plugins/livekit-plugins-anthropic/CHANGELOG.md b/livekit-plugins/livekit-plugins-anthropic/CHANGELOG.md index ab0944df0..f540e9641 100644 --- a/livekit-plugins/livekit-plugins-anthropic/CHANGELOG.md +++ b/livekit-plugins/livekit-plugins-anthropic/CHANGELOG.md @@ -1,5 +1,21 @@ # livekit-plugins-anthropic +## 0.2.8 + +### Patch Changes + +- Moved create_ai_function_info to function_context.py for better reusability and reduced repetition - [#1260](https://github.com/livekit/agents/pull/1260) ([@jayeshp19](https://github.com/jayeshp19)) + +- Add support for OpenAI's "detail" parameter to ChatImage - [#1213](https://github.com/livekit/agents/pull/1213) ([@bcherry](https://github.com/bcherry)) + + Add support for data URLs on ChatImage in the Anthropic plugin. + +- fix: correctly parse function argument types - [#1221](https://github.com/livekit/agents/pull/1221) ([@jayeshp19](https://github.com/jayeshp19)) + +- Fix center_aspect_fit bug, add scale_aspect_fit and scale_aspect_fill resizing options. - [#1222](https://github.com/livekit/agents/pull/1222) ([@bcherry](https://github.com/bcherry)) + + Make scale_aspect_fit the new default resizing option for video frames. + ## 0.2.7 ### Patch Changes diff --git a/livekit-plugins/livekit-plugins-anthropic/livekit/plugins/anthropic/version.py b/livekit-plugins/livekit-plugins-anthropic/livekit/plugins/anthropic/version.py index c75e497a4..e558b382c 100644 --- a/livekit-plugins/livekit-plugins-anthropic/livekit/plugins/anthropic/version.py +++ b/livekit-plugins/livekit-plugins-anthropic/livekit/plugins/anthropic/version.py @@ -12,4 +12,4 @@ # See the License for the specific language governing permissions and # limitations under the License.
-__version__ = "0.2.7" +__version__ = "0.2.8" diff --git a/livekit-plugins/livekit-plugins-anthropic/package.json b/livekit-plugins/livekit-plugins-anthropic/package.json index a4f8b5235..ad2ba63a2 100644 --- a/livekit-plugins/livekit-plugins-anthropic/package.json +++ b/livekit-plugins/livekit-plugins-anthropic/package.json @@ -1,5 +1,5 @@ { "name": "livekit-plugins-anthropic", "private": true, - "version": "0.2.7" + "version": "0.2.8" } diff --git a/livekit-plugins/livekit-plugins-azure/CHANGELOG.md b/livekit-plugins/livekit-plugins-azure/CHANGELOG.md index 9a5897906..5d4ab532b 100644 --- a/livekit-plugins/livekit-plugins-azure/CHANGELOG.md +++ b/livekit-plugins/livekit-plugins-azure/CHANGELOG.md @@ -1,5 +1,11 @@ # livekit-plugins-azure +## 0.5.1 + +### Patch Changes + +- fix azure stt language autodetection - [#1246](https://github.com/livekit/agents/pull/1246) ([@davidzhao](https://github.com/davidzhao)) + ## 0.5.0 ### Minor Changes diff --git a/livekit-plugins/livekit-plugins-azure/livekit/plugins/azure/version.py b/livekit-plugins/livekit-plugins-azure/livekit/plugins/azure/version.py index 63a2bd75e..79283902f 100644 --- a/livekit-plugins/livekit-plugins-azure/livekit/plugins/azure/version.py +++ b/livekit-plugins/livekit-plugins-azure/livekit/plugins/azure/version.py @@ -12,4 +12,4 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "0.5.0" +__version__ = "0.5.1" diff --git a/livekit-plugins/livekit-plugins-azure/package.json b/livekit-plugins/livekit-plugins-azure/package.json index dc0b821de..cdd81c035 100644 --- a/livekit-plugins/livekit-plugins-azure/package.json +++ b/livekit-plugins/livekit-plugins-azure/package.json @@ -1,5 +1,5 @@ { "name": "livekit-plugins-azure", "private": true, - "version": "0.5.0" + "version": "0.5.1" } diff --git a/livekit-plugins/livekit-plugins-browser/CHANGELOG.md b/livekit-plugins/livekit-plugins-browser/CHANGELOG.md index e13c5455f..498a259c3 100644 --- a/livekit-plugins/livekit-plugins-browser/CHANGELOG.md +++ b/livekit-plugins/livekit-plugins-browser/CHANGELOG.md @@ -1,5 +1,11 @@ # livekit-plugins-browser +## 0.0.5 + +### Patch Changes + +- fix: fix `imgui` setup - [#1226](https://github.com/livekit/agents/pull/1226) ([@mbukeRepo](https://github.com/mbukeRepo)) + ## 0.0.4 ### Patch Changes diff --git a/livekit-plugins/livekit-plugins-browser/livekit/plugins/browser/version.py b/livekit-plugins/livekit-plugins-browser/livekit/plugins/browser/version.py index 1308acf66..0f8366140 100644 --- a/livekit-plugins/livekit-plugins-browser/livekit/plugins/browser/version.py +++ b/livekit-plugins/livekit-plugins-browser/livekit/plugins/browser/version.py @@ -12,4 +12,4 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-__version__ = "0.0.4" +__version__ = "0.0.5" diff --git a/livekit-plugins/livekit-plugins-browser/package.json b/livekit-plugins/livekit-plugins-browser/package.json index 5340f768c..f28e403c5 100644 --- a/livekit-plugins/livekit-plugins-browser/package.json +++ b/livekit-plugins/livekit-plugins-browser/package.json @@ -1,5 +1,5 @@ { "name": "livekit-plugins-browser", "private": true, - "version": "0.0.4" + "version": "0.0.5" } diff --git a/livekit-plugins/livekit-plugins-deepgram/CHANGELOG.md b/livekit-plugins/livekit-plugins-deepgram/CHANGELOG.md index 6836c4522..9c624c19f 100644 --- a/livekit-plugins/livekit-plugins-deepgram/CHANGELOG.md +++ b/livekit-plugins/livekit-plugins-deepgram/CHANGELOG.md @@ -1,5 +1,13 @@ # livekit-plugins-deepgram +## 0.6.15 + +### Patch Changes + +- added streaming audio decoder for compressed audio. - [#1236](https://github.com/livekit/agents/pull/1236) ([@davidzhao](https://github.com/davidzhao)) + +- Support Deepgram TTS - [#1201](https://github.com/livekit/agents/pull/1201) ([@jayeshp19](https://github.com/jayeshp19)) + ## 0.6.14 ### Patch Changes diff --git a/livekit-plugins/livekit-plugins-deepgram/livekit/plugins/deepgram/version.py b/livekit-plugins/livekit-plugins-deepgram/livekit/plugins/deepgram/version.py index 63f6f8624..c83922d4e 100644 --- a/livekit-plugins/livekit-plugins-deepgram/livekit/plugins/deepgram/version.py +++ b/livekit-plugins/livekit-plugins-deepgram/livekit/plugins/deepgram/version.py @@ -12,4 +12,4 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "0.6.14" +__version__ = "0.6.15" diff --git a/livekit-plugins/livekit-plugins-deepgram/package.json b/livekit-plugins/livekit-plugins-deepgram/package.json index 1259f3ad3..65cf7a26a 100644 --- a/livekit-plugins/livekit-plugins-deepgram/package.json +++ b/livekit-plugins/livekit-plugins-deepgram/package.json @@ -1,5 +1,5 @@ { "name": "livekit-plugins-deepgram", "private": true, - "version": "0.6.14" + "version": "0.6.15" } diff --git a/livekit-plugins/livekit-plugins-openai/CHANGELOG.md b/livekit-plugins/livekit-plugins-openai/CHANGELOG.md index 3d2783b7c..d9f42cc0c 100644 --- a/livekit-plugins/livekit-plugins-openai/CHANGELOG.md +++ b/livekit-plugins/livekit-plugins-openai/CHANGELOG.md @@ -1,5 +1,31 @@ # livekit-plugins-openai +## 0.10.11 + +### Patch Changes + +- Moved create_ai_function_info to function_context.py for better reusability and reduce repetation - [#1260](https://github.com/livekit/agents/pull/1260) ([@jayeshp19](https://github.com/jayeshp19)) + +- add on_duplicate option for multimodal agent response create - [#1204](https://github.com/livekit/agents/pull/1204) ([@longcw](https://github.com/longcw)) + +- Add support for OpenAI's "detail" parameter to ChatImage - [#1213](https://github.com/livekit/agents/pull/1213) ([@bcherry](https://github.com/bcherry)) + + Add support for data URLs on ChatImage in the Anthropic plugin. 
+ +- filter out empty message for set chat ctx in realtime model - [#1245](https://github.com/livekit/agents/pull/1245) ([@longcw](https://github.com/longcw)) + +- fix: correctly parse function argument types - [#1221](https://github.com/livekit/agents/pull/1221) ([@jayeshp19](https://github.com/jayeshp19)) + +- add session_updated event for RealtimeSession - [#1253](https://github.com/livekit/agents/pull/1253) ([@longcw](https://github.com/longcw)) + +- added llama 3.3 70b to model definitions - [#1233](https://github.com/livekit/agents/pull/1233) ([@davidzhao](https://github.com/davidzhao)) + +- update default realtime model to gpt-4o-realtime-preview-2024-12-17 - [#1250](https://github.com/livekit/agents/pull/1250) ([@davidzhao](https://github.com/davidzhao)) + +- Fix center_aspect_fit bug, add scale_aspect_fit and scale_aspect_fill resizing options. - [#1222](https://github.com/livekit/agents/pull/1222) ([@bcherry](https://github.com/bcherry)) + + Make scale_aspect_fit the new default resizing option for video frames. + ## 0.10.10 ### Patch Changes diff --git a/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/version.py b/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/version.py index 9a14e871f..613650a21 100644 --- a/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/version.py +++ b/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/version.py @@ -12,4 +12,4 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "0.10.10" +__version__ = "0.10.11" diff --git a/livekit-plugins/livekit-plugins-openai/package.json b/livekit-plugins/livekit-plugins-openai/package.json index b9238338f..a5087740b 100644 --- a/livekit-plugins/livekit-plugins-openai/package.json +++ b/livekit-plugins/livekit-plugins-openai/package.json @@ -1,5 +1,5 @@ { "name": "livekit-plugins-openai", "private": true, - "version": "0.10.10" + "version": "0.10.11" } diff --git a/livekit-plugins/livekit-plugins-turn-detector/CHANGELOG.md b/livekit-plugins/livekit-plugins-turn-detector/CHANGELOG.md index 201e0f662..0bc8544b5 100644 --- a/livekit-plugins/livekit-plugins-turn-detector/CHANGELOG.md +++ b/livekit-plugins/livekit-plugins-turn-detector/CHANGELOG.md @@ -1,5 +1,13 @@ # livekit-plugins-eou +## 0.3.3 + +### Patch Changes + +- use quantized onnx version of turn detector model - [#1231](https://github.com/livekit/agents/pull/1231) ([@jeradf](https://github.com/jeradf)) + +- use onnxruntime for turn detection and remove pytorch dependency - [#1257](https://github.com/livekit/agents/pull/1257) ([@jeradf](https://github.com/jeradf)) + ## 0.3.2 ### Patch Changes diff --git a/livekit-plugins/livekit-plugins-turn-detector/livekit/plugins/turn_detector/version.py b/livekit-plugins/livekit-plugins-turn-detector/livekit/plugins/turn_detector/version.py index adb9a59d4..6b8f1ef90 100644 --- a/livekit-plugins/livekit-plugins-turn-detector/livekit/plugins/turn_detector/version.py +++ b/livekit-plugins/livekit-plugins-turn-detector/livekit/plugins/turn_detector/version.py @@ -12,4 +12,4 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-__version__ = "0.3.2" +__version__ = "0.3.3" diff --git a/livekit-plugins/livekit-plugins-turn-detector/package.json b/livekit-plugins/livekit-plugins-turn-detector/package.json index 6e6bfea47..acd5d4300 100644 --- a/livekit-plugins/livekit-plugins-turn-detector/package.json +++ b/livekit-plugins/livekit-plugins-turn-detector/package.json @@ -1,5 +1,5 @@ { "name": "livekit-plugins-turn-detector", "private": true, - "version": "0.3.2" + "version": "0.3.3" } From 0ccc4b02a7c812fabcbd149c9ca7291eb0dfe14a Mon Sep 17 00:00:00 2001 From: David Zhao Date: Fri, 20 Dec 2024 23:40:54 -0800 Subject: [PATCH 15/46] Revert to upload-artifacts@v3, update dependency versions (#1267) Co-authored-by: github-actions <41898282+github-actions[bot]@users.noreply.github.com> --- .changeset/pre.json | 25 +++++++++++++++++++ .github/workflows/build-package.yml | 4 +-- .github/workflows/publish-package.yml | 2 +- .../livekit-plugins-anthropic/setup.py | 2 +- .../livekit-plugins-assemblyai/setup.py | 2 +- .../livekit-plugins-azure/setup.py | 2 +- .../livekit-plugins-browser/setup.py | 2 +- .../livekit-plugins-cartesia/setup.py | 2 +- .../livekit-plugins-clova/setup.py | 2 +- .../livekit-plugins-deepgram/setup.py | 2 +- .../livekit-plugins-elevenlabs/setup.py | 2 +- livekit-plugins/livekit-plugins-fal/setup.py | 2 +- .../livekit-plugins-google/setup.py | 2 +- .../livekit-plugins-llama-index/setup.py | 2 +- .../livekit-plugins-openai/setup.py | 2 +- .../livekit-plugins-playht/setup.py | 2 +- livekit-plugins/livekit-plugins-rag/setup.py | 2 +- .../livekit-plugins-silero/setup.py | 2 +- .../livekit-plugins-turn-detector/setup.py | 2 +- 19 files changed, 44 insertions(+), 19 deletions(-) create mode 100644 .changeset/pre.json diff --git a/.changeset/pre.json b/.changeset/pre.json new file mode 100644 index 000000000..1bfc38cea --- /dev/null +++ b/.changeset/pre.json @@ -0,0 +1,25 @@ +{ + "mode": "pre", + "tag": "dev", + "initialVersions": { + "livekit-agents": "0.12.3", + "livekit-plugins-anthropic": "0.2.8", + "livekit-plugins-assemblyai": "0.2.1", + "livekit-plugins-azure": "0.5.1", + "livekit-plugins-browser": "0.0.5", + "livekit-plugins-cartesia": "0.4.5", + "livekit-plugins-deepgram": "0.6.15", + "livekit-plugins-elevenlabs": "0.7.9", + "livekit-plugins-fal": "0.2.2", + "livekit-plugins-google": "0.8.1", + "livekit-plugins-llama-index": "0.2.2", + "livekit-plugins-minimal": "0.2.1", + "livekit-plugins-nltk": "0.7.3", + "livekit-plugins-openai": "0.10.11", + "livekit-plugins-playht": "1.0.3", + "livekit-plugins-rag": "0.2.3", + "livekit-plugins-silero": "0.7.4", + "livekit-plugins-turn-detector": "0.3.3" + }, + "changesets": [] +} diff --git a/.github/workflows/build-package.yml b/.github/workflows/build-package.yml index f0f721f72..7593c01be 100644 --- a/.github/workflows/build-package.yml +++ b/.github/workflows/build-package.yml @@ -47,7 +47,7 @@ jobs: run: python -m build - name: Upload distribution package - uses: actions/upload-artifact@v4 + uses: actions/upload-artifact@v3 with: name: ${{ inputs.artifact_name }} path: "${{ startsWith(inputs.package, 'livekit-plugin') && 'livekit-plugins/' || '' }}${{ inputs.package }}/dist/" @@ -82,7 +82,7 @@ jobs: CIBW_BUILD_VERBOSITY: 3 - name: Upload distribution package - uses: actions/upload-artifact@v4 + uses: actions/upload-artifact@v3 with: name: ${{ inputs.artifact_name }} path: livekit-plugins/livekit-plugins-browser/dist/ diff --git a/.github/workflows/publish-package.yml b/.github/workflows/publish-package.yml index 669f37d68..05feeb366 100644 --- 
a/.github/workflows/publish-package.yml +++ b/.github/workflows/publish-package.yml @@ -96,7 +96,7 @@ jobs: steps: - name: Download all the dists - uses: actions/download-artifact@v4 + uses: actions/download-artifact@v3 with: name: python-package-distributions path: dist/ diff --git a/livekit-plugins/livekit-plugins-anthropic/setup.py b/livekit-plugins/livekit-plugins-anthropic/setup.py index 5a21aeb5c..4d9c3a1ba 100644 --- a/livekit-plugins/livekit-plugins-anthropic/setup.py +++ b/livekit-plugins/livekit-plugins-anthropic/setup.py @@ -49,7 +49,7 @@ license="Apache-2.0", packages=setuptools.find_namespace_packages(include=["livekit.*"]), python_requires=">=3.9.0", - install_requires=["livekit-agents>=0.11", "anthropic>=0.34"], + install_requires=["livekit-agents>=0.12.3", "anthropic>=0.34"], package_data={"livekit.plugins.anthropic": ["py.typed"]}, project_urls={ "Documentation": "https://docs.livekit.io", diff --git a/livekit-plugins/livekit-plugins-assemblyai/setup.py b/livekit-plugins/livekit-plugins-assemblyai/setup.py index 8cd008a0c..edd7e5494 100644 --- a/livekit-plugins/livekit-plugins-assemblyai/setup.py +++ b/livekit-plugins/livekit-plugins-assemblyai/setup.py @@ -48,7 +48,7 @@ packages=setuptools.find_namespace_packages(include=["livekit.*"]), python_requires=">=3.9.0", install_requires=[ - "livekit-agents>=0.11", + "livekit-agents>=0.12.3", ], package_data={}, project_urls={ diff --git a/livekit-plugins/livekit-plugins-azure/setup.py b/livekit-plugins/livekit-plugins-azure/setup.py index 288de7187..e854fc492 100644 --- a/livekit-plugins/livekit-plugins-azure/setup.py +++ b/livekit-plugins/livekit-plugins-azure/setup.py @@ -46,7 +46,7 @@ packages=setuptools.find_namespace_packages(include=["livekit.*"]), python_requires=">=3.9.0", install_requires=[ - "livekit-agents>=0.11", + "livekit-agents>=0.12.3", "azure-cognitiveservices-speech>=1.41.0", ], package_data={}, diff --git a/livekit-plugins/livekit-plugins-browser/setup.py b/livekit-plugins/livekit-plugins-browser/setup.py index 8eafd27d8..088259ebf 100644 --- a/livekit-plugins/livekit-plugins-browser/setup.py +++ b/livekit-plugins/livekit-plugins-browser/setup.py @@ -113,7 +113,7 @@ def build_extension(self, ext: CMakeExtension) -> None: cmdclass={"build_ext": CMakeBuild}, packages=setuptools.find_namespace_packages(include=["livekit.*"]), python_requires=">=3.9.0", - install_requires=["livekit-agents>=0.11"], + install_requires=["livekit-agents>=0.12.3"], package_data={ "livekit.plugins.browser": ["py.typed"], "livekit.plugins.browser.resources": ["**", "lkcef_app.app"], diff --git a/livekit-plugins/livekit-plugins-cartesia/setup.py b/livekit-plugins/livekit-plugins-cartesia/setup.py index e4ce007f9..8044f23c6 100644 --- a/livekit-plugins/livekit-plugins-cartesia/setup.py +++ b/livekit-plugins/livekit-plugins-cartesia/setup.py @@ -47,7 +47,7 @@ license="Apache-2.0", packages=setuptools.find_namespace_packages(include=["livekit.*"]), python_requires=">=3.9.0", - install_requires=["livekit-agents>=0.11"], + install_requires=["livekit-agents>=0.12.3"], project_urls={ "Documentation": "https://docs.livekit.io", "Website": "https://livekit.io/", diff --git a/livekit-plugins/livekit-plugins-clova/setup.py b/livekit-plugins/livekit-plugins-clova/setup.py index 254fd1cba..08abcf970 100644 --- a/livekit-plugins/livekit-plugins-clova/setup.py +++ b/livekit-plugins/livekit-plugins-clova/setup.py @@ -47,7 +47,7 @@ license="Apache-2.0", packages=setuptools.find_namespace_packages(include=["livekit.*"]), python_requires=">=3.9.0", - 
install_requires=["livekit-agents>=0.11", "pydub~=0.25.1"], + install_requires=["livekit-agents>=0.12.3", "pydub~=0.25.1"], project_urls={ "Documentation": "https://docs.livekit.io", "Website": "https://livekit.io/", diff --git a/livekit-plugins/livekit-plugins-deepgram/setup.py b/livekit-plugins/livekit-plugins-deepgram/setup.py index 8a583611d..b9316b839 100644 --- a/livekit-plugins/livekit-plugins-deepgram/setup.py +++ b/livekit-plugins/livekit-plugins-deepgram/setup.py @@ -47,7 +47,7 @@ license="Apache-2.0", packages=setuptools.find_namespace_packages(include=["livekit.*"]), python_requires=">=3.9.0", - install_requires=["livekit-agents>=0.12.2", "numpy>=1.26"], + install_requires=["livekit-agents>=0.12.3", "numpy>=1.26"], package_data={"livekit.plugins.deepgram": ["py.typed"]}, project_urls={ "Documentation": "https://docs.livekit.io", diff --git a/livekit-plugins/livekit-plugins-elevenlabs/setup.py b/livekit-plugins/livekit-plugins-elevenlabs/setup.py index ba5400e84..829739fe2 100644 --- a/livekit-plugins/livekit-plugins-elevenlabs/setup.py +++ b/livekit-plugins/livekit-plugins-elevenlabs/setup.py @@ -49,7 +49,7 @@ license="Apache-2.0", packages=setuptools.find_namespace_packages(include=["livekit.*"]), python_requires=">=3.9.0", - install_requires=["livekit-agents[codecs]>=0.11"], + install_requires=["livekit-agents[codecs]>=0.12.3"], package_data={"livekit.plugins.elevenlabs": ["py.typed"]}, project_urls={ "Documentation": "https://docs.livekit.io", diff --git a/livekit-plugins/livekit-plugins-fal/setup.py b/livekit-plugins/livekit-plugins-fal/setup.py index 014251d0c..760607daf 100644 --- a/livekit-plugins/livekit-plugins-fal/setup.py +++ b/livekit-plugins/livekit-plugins-fal/setup.py @@ -47,7 +47,7 @@ license="Apache-2.0", packages=setuptools.find_namespace_packages(include=["livekit.*"]), python_requires=">=3.9.0", - install_requires=["livekit-agents>=0.11", "fal_client"], + install_requires=["livekit-agents>=0.12.3", "fal_client"], package_data={"livekit.plugins.fal": ["py.typed"]}, project_urls={ "Documentation": "https://docs.livekit.io", diff --git a/livekit-plugins/livekit-plugins-google/setup.py b/livekit-plugins/livekit-plugins-google/setup.py index b6e72949b..87646895f 100644 --- a/livekit-plugins/livekit-plugins-google/setup.py +++ b/livekit-plugins/livekit-plugins-google/setup.py @@ -51,7 +51,7 @@ "google-auth >= 2, < 3", "google-cloud-speech >= 2, < 3", "google-cloud-texttospeech >= 2, < 3", - "livekit-agents>=0.11", + "livekit-agents>=0.12.3", ], package_data={"livekit.plugins.google": ["py.typed"]}, project_urls={ diff --git a/livekit-plugins/livekit-plugins-llama-index/setup.py b/livekit-plugins/livekit-plugins-llama-index/setup.py index 98b0babab..acc39333d 100644 --- a/livekit-plugins/livekit-plugins-llama-index/setup.py +++ b/livekit-plugins/livekit-plugins-llama-index/setup.py @@ -49,7 +49,7 @@ license="Apache-2.0", packages=setuptools.find_namespace_packages(include=["livekit.*"]), python_requires=">=3.9.0", - install_requires=["livekit-agents>=0.11"], + install_requires=["livekit-agents>=0.12.3"], package_data={"livekit.plugins.llama_index": ["py.typed"]}, project_urls={ "Documentation": "https://docs.livekit.io", diff --git a/livekit-plugins/livekit-plugins-openai/setup.py b/livekit-plugins/livekit-plugins-openai/setup.py index a7b6cdf19..eb9d6d0fe 100644 --- a/livekit-plugins/livekit-plugins-openai/setup.py +++ b/livekit-plugins/livekit-plugins-openai/setup.py @@ -48,7 +48,7 @@ packages=setuptools.find_namespace_packages(include=["livekit.*"]), 
python_requires=">=3.9.0", install_requires=[ - "livekit-agents[codecs, images]>=0.11", + "livekit-agents[codecs, images]>=0.12.3", "openai>=1.50", ], extras_require={ diff --git a/livekit-plugins/livekit-plugins-playht/setup.py b/livekit-plugins/livekit-plugins-playht/setup.py index ea5c7bf77..eb41a5b89 100644 --- a/livekit-plugins/livekit-plugins-playht/setup.py +++ b/livekit-plugins/livekit-plugins-playht/setup.py @@ -32,7 +32,7 @@ packages=setuptools.find_namespace_packages(include=["livekit.*"]), python_requires=">=3.9.0", install_requires=[ - "livekit-agents[codecs]>=0.11", + "livekit-agents[codecs]>=0.12.3", "pyht", "aiohttp", "livekit", diff --git a/livekit-plugins/livekit-plugins-rag/setup.py b/livekit-plugins/livekit-plugins-rag/setup.py index 55c8223a8..00ae59c86 100644 --- a/livekit-plugins/livekit-plugins-rag/setup.py +++ b/livekit-plugins/livekit-plugins-rag/setup.py @@ -47,7 +47,7 @@ license="Apache-2.0", packages=setuptools.find_namespace_packages(include=["livekit.*"]), python_requires=">=3.9.0", - install_requires=["livekit-agents>=0.11", "annoy>=1.17"], + install_requires=["livekit-agents>=0.12.3", "annoy>=1.17"], package_data={"livekit.plugins.rag": ["py.typed"]}, project_urls={ "Documentation": "https://docs.livekit.io", diff --git a/livekit-plugins/livekit-plugins-silero/setup.py b/livekit-plugins/livekit-plugins-silero/setup.py index c5202db9c..52bc41ba2 100644 --- a/livekit-plugins/livekit-plugins-silero/setup.py +++ b/livekit-plugins/livekit-plugins-silero/setup.py @@ -47,7 +47,7 @@ license="Apache-2.0", packages=setuptools.find_namespace_packages(include=["livekit.*"]), python_requires=">=3.9.0", - install_requires=["livekit-agents>=0.11", "onnxruntime>=1.18", "numpy>=1.26"], + install_requires=["livekit-agents>=0.12.3", "onnxruntime>=1.18", "numpy>=1.26"], package_data={ "livekit.plugins.silero.resources": ["silero_vad.onnx"], "livekit.plugins.silero": ["py.typed"], diff --git a/livekit-plugins/livekit-plugins-turn-detector/setup.py b/livekit-plugins/livekit-plugins-turn-detector/setup.py index 7b9b4b192..f53e82135 100644 --- a/livekit-plugins/livekit-plugins-turn-detector/setup.py +++ b/livekit-plugins/livekit-plugins-turn-detector/setup.py @@ -50,7 +50,7 @@ packages=setuptools.find_namespace_packages(include=["livekit.*"]), python_requires=">=3.9.0", install_requires=[ - "livekit-agents>=0.11", + "livekit-agents>=0.12.3", "transformers>=4.47.1", "numpy>=1.26", "onnxruntime>=1.18", From af777bef0586f53b26bffe1c7834b405cf77429c Mon Sep 17 00:00:00 2001 From: github-actions <41898282+github-actions[bot]@users.noreply.github.com> Date: Sat, 21 Dec 2024 09:03:42 +0000 Subject: [PATCH 16/46] Exit pre release mode --- .changeset/pre.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.changeset/pre.json b/.changeset/pre.json index 1bfc38cea..c3a216b74 100644 --- a/.changeset/pre.json +++ b/.changeset/pre.json @@ -1,5 +1,5 @@ { - "mode": "pre", + "mode": "exit", "tag": "dev", "initialVersions": { "livekit-agents": "0.12.3", From 50d0a716da00ff4b9b8b82ccc0560653bbd3c974 Mon Sep 17 00:00:00 2001 From: lukasIO Date: Sat, 21 Dec 2024 17:06:59 +0100 Subject: [PATCH 17/46] Update to v4 versions with multiple artifact download (#1268) --- .github/workflows/build-package.yml | 4 ++-- .github/workflows/publish-package.yml | 11 ++++++----- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/.github/workflows/build-package.yml b/.github/workflows/build-package.yml index 7593c01be..f0f721f72 100644 --- a/.github/workflows/build-package.yml +++ 
b/.github/workflows/build-package.yml @@ -47,7 +47,7 @@ jobs: run: python -m build - name: Upload distribution package - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: name: ${{ inputs.artifact_name }} path: "${{ startsWith(inputs.package, 'livekit-plugin') && 'livekit-plugins/' || '' }}${{ inputs.package }}/dist/" @@ -82,7 +82,7 @@ jobs: CIBW_BUILD_VERBOSITY: 3 - name: Upload distribution package - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: name: ${{ inputs.artifact_name }} path: livekit-plugins/livekit-plugins-browser/dist/ diff --git a/.github/workflows/publish-package.yml b/.github/workflows/publish-package.yml index 05feeb366..6724f50aa 100644 --- a/.github/workflows/publish-package.yml +++ b/.github/workflows/publish-package.yml @@ -27,7 +27,7 @@ jobs: submodules: true lfs: true env: - GITHUB_TOKEN: ${{ secrets.CHANGESETS_PUSH_PAT }} + GITHUB_TOKEN: ${{ secrets.CHANGESETS_PUSH_DEPLOY_KEY }} - uses: pnpm/action-setup@v4 - name: Use Node.js 20 @@ -84,7 +84,7 @@ jobs: uses: livekit/agents/.github/workflows/build-package.yml@main with: package: ${{ matrix.package.name }} - artifact_name: python-package-distributions + artifact_name: python-package-dist-${{matrix.package.name}} publish: needs: @@ -96,10 +96,11 @@ jobs: steps: - name: Download all the dists - uses: actions/download-artifact@v3 + uses: actions/download-artifact@v4 with: - name: python-package-distributions - path: dist/ + path: dist + pattern: python-package-dist-* + merge-multiple: true - name: Publish package uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29 From dec87af9c8b65ae9b4d85c003bb8a4c73f877954 Mon Sep 17 00:00:00 2001 From: aoife cassidy Date: Sat, 21 Dec 2024 18:20:09 +0200 Subject: [PATCH 18/46] ci: use ssh key and remove references to GITHUB_TOKEN (#1269) --- .github/workflows/publish-package.yml | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/.github/workflows/publish-package.yml b/.github/workflows/publish-package.yml index 6724f50aa..5e5200417 100644 --- a/.github/workflows/publish-package.yml +++ b/.github/workflows/publish-package.yml @@ -26,8 +26,7 @@ jobs: with: submodules: true lfs: true - env: - GITHUB_TOKEN: ${{ secrets.CHANGESETS_PUSH_DEPLOY_KEY }} + ssh-key: ${{ secrets.CHANGESETS_PUSH_DEPLOY_KEY }} - uses: pnpm/action-setup@v4 - name: Use Node.js 20 @@ -50,8 +49,6 @@ jobs: set +e pnpm changeset pre ${{ github.ref == 'refs/heads/main' && 'exit' || 'enter dev' }} echo "exitcode=$?" 
>> $GITHUB_OUTPUT - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - name: Add changes if: ${{ steps.release_mode.outputs.exitcode == '0' }} @@ -67,8 +64,6 @@ jobs: with: version: pnpm ci:version publish: pnpm ci:publish - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - name: debug packages run: echo "${{ steps.changesets.outputs.publishedPackages }}" From a4f0bdb51cf5bb2a8ddbf372bc8ed034c5df0676 Mon Sep 17 00:00:00 2001 From: Long Chen Date: Sun, 22 Dec 2024 16:05:46 +0800 Subject: [PATCH 19/46] fix: avoid duplicated chat ctx for function call with messages (#1254) --- .changeset/curvy-knives-promise.md | 5 +++++ .../voice-pipeline-agent/function_calling_weather.py | 4 ++-- .../livekit/agents/pipeline/pipeline_agent.py | 11 ++++++++++- .../livekit/agents/pipeline/speech_handle.py | 8 ++++++++ 4 files changed, 25 insertions(+), 3 deletions(-) create mode 100644 .changeset/curvy-knives-promise.md diff --git a/.changeset/curvy-knives-promise.md b/.changeset/curvy-knives-promise.md new file mode 100644 index 000000000..a4d79c4ca --- /dev/null +++ b/.changeset/curvy-knives-promise.md @@ -0,0 +1,5 @@ +--- +"livekit-agents": patch +--- + +avoid duplicated chat ctx for function calls with messages diff --git a/examples/voice-pipeline-agent/function_calling_weather.py b/examples/voice-pipeline-agent/function_calling_weather.py index e8add68d0..7f1ba5fa5 100644 --- a/examples/voice-pipeline-agent/function_calling_weather.py +++ b/examples/voice-pipeline-agent/function_calling_weather.py @@ -67,8 +67,8 @@ async def get_weather( f"Failed to get weather data, status code: {response.status}" ) - # To wait for the speech to finish before giving results of the function call - await speech_handle.join() + # (optional) To wait for the speech to finish before giving results of the function call + # await speech_handle.join() return weather_data diff --git a/livekit-agents/livekit/agents/pipeline/pipeline_agent.py b/livekit-agents/livekit/agents/pipeline/pipeline_agent.py index a08291ea4..5493583b7 100644 --- a/livekit-agents/livekit/agents/pipeline/pipeline_agent.py +++ b/livekit-agents/livekit/agents/pipeline/pipeline_agent.py @@ -801,12 +801,20 @@ def _commit_user_question_if_needed() -> None: speech_handle.source.function_calls ) + message_id_committed: str | None = None if ( collected_text and speech_handle.add_to_chat_ctx and (not user_question or speech_handle.user_committed) ): if speech_handle.extra_tools_messages: + msgs = self._chat_ctx.messages + if msgs and msgs[-1].id == speech_handle.fnc_text_message_id: + # remove text message alongside function calls if it's the last in the ctx + msgs.pop() + elif speech_handle.extra_tools_messages[0].tool_calls: + # remove the content of the tool call message + speech_handle.extra_tools_messages[0].content = "" self._chat_ctx.messages.extend(speech_handle.extra_tools_messages) if interrupted: @@ -814,7 +822,7 @@ def _commit_user_question_if_needed() -> None: msg = ChatMessage.create(text=collected_text, role="assistant") self._chat_ctx.messages.append(msg) - + message_id_committed = msg.id speech_handle.mark_speech_committed() if interrupted: @@ -914,6 +922,7 @@ async def _execute_function_calls() -> None: add_to_chat_ctx=speech_handle.add_to_chat_ctx, extra_tools_messages=extra_tools_messages, fnc_nested_depth=speech_handle.fnc_nested_depth + 1, + fnc_text_message_id=message_id_committed, ) # synthesize the tool speech with the chat ctx from llm_stream diff --git a/livekit-agents/livekit/agents/pipeline/speech_handle.py 
b/livekit-agents/livekit/agents/pipeline/speech_handle.py index d1c64b5c9..d36eb7aee 100644 --- a/livekit-agents/livekit/agents/pipeline/speech_handle.py +++ b/livekit-agents/livekit/agents/pipeline/speech_handle.py @@ -19,6 +19,7 @@ def __init__( user_question: str, fnc_nested_depth: int = 0, extra_tools_messages: list[ChatMessage] | None = None, + fnc_text_message_id: str | None = None, ) -> None: self._id = id self._allow_interruptions = allow_interruptions @@ -41,6 +42,7 @@ def __init__( # nested speech handle and function calls self._fnc_nested_depth = fnc_nested_depth self._fnc_extra_tools_messages: list[ChatMessage] | None = extra_tools_messages + self._fnc_text_message_id: str | None = fnc_text_message_id self._nested_speech_handles: list[SpeechHandle] = [] self._nested_speech_changed = asyncio.Event() @@ -82,6 +84,7 @@ def create_tool_speech( add_to_chat_ctx: bool, fnc_nested_depth: int, extra_tools_messages: list[ChatMessage], + fnc_text_message_id: str | None = None, ) -> SpeechHandle: return SpeechHandle( id=utils.shortuuid(), @@ -91,6 +94,7 @@ def create_tool_speech( user_question="", fnc_nested_depth=fnc_nested_depth, extra_tools_messages=extra_tools_messages, + fnc_text_message_id=fnc_text_message_id, ) async def wait_for_initialization(self) -> None: @@ -200,6 +204,10 @@ def fnc_nested_depth(self) -> int: def extra_tools_messages(self) -> list[ChatMessage] | None: return self._fnc_extra_tools_messages + @property + def fnc_text_message_id(self) -> str | None: + return self._fnc_text_message_id + def add_nested_speech(self, speech_handle: SpeechHandle) -> None: self._nested_speech_handles.append(speech_handle) self._nested_speech_changed.set() From 49f14dd71dc1708e4f99e8ea12124f213b84dbf4 Mon Sep 17 00:00:00 2001 From: lukasIO Date: Sun, 22 Dec 2024 12:18:16 +0100 Subject: [PATCH 20/46] ci: re-add GITHUB_TOKEN to publish workflow (#1272) --- .github/workflows/publish-package.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/publish-package.yml b/.github/workflows/publish-package.yml index 5e5200417..61692429e 100644 --- a/.github/workflows/publish-package.yml +++ b/.github/workflows/publish-package.yml @@ -49,6 +49,8 @@ jobs: set +e pnpm changeset pre ${{ github.ref == 'refs/heads/main' && 'exit' || 'enter dev' }} echo "exitcode=$?" 
>> $GITHUB_OUTPUT + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - name: Add changes if: ${{ steps.release_mode.outputs.exitcode == '0' }} @@ -64,6 +66,8 @@ jobs: with: version: pnpm ci:version publish: pnpm ci:publish + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - name: debug packages run: echo "${{ steps.changesets.outputs.publishedPackages }}" From b6542abe5f42f5a49bc845301b57e91f049b7cd8 Mon Sep 17 00:00:00 2001 From: Mike McLaughlin <23640224+mike-r-mclaughlin@users.noreply.github.com> Date: Sun, 22 Dec 2024 07:54:44 -0600 Subject: [PATCH 21/46] examples: updated, tested, and added @dsa's Hive moderation agent (#1263) --- examples/hive-moderation-agent/README.md | 41 +++++ examples/hive-moderation-agent/agent.py | 163 ++++++++++++++++++ .../hive_data_classes.py | 95 ++++++++++ .../hive-moderation-agent/requirements.txt | 5 + 4 files changed, 304 insertions(+) create mode 100644 examples/hive-moderation-agent/README.md create mode 100644 examples/hive-moderation-agent/agent.py create mode 100644 examples/hive-moderation-agent/hive_data_classes.py create mode 100644 examples/hive-moderation-agent/requirements.txt diff --git a/examples/hive-moderation-agent/README.md b/examples/hive-moderation-agent/README.md new file mode 100644 index 000000000..8f48218bb --- /dev/null +++ b/examples/hive-moderation-agent/README.md @@ -0,0 +1,41 @@ +# LiveKit realtime moderation agent using Hive + +This is an agent that performs visual moderation of every participant's video in a room. It does this moderation using the Visual Content Moderation model from [Hive](https://thehive.ai) [[docs](https://docs.thehive.ai/docs/visual-content-moderation#visual-content-moderation)]. + +## Prerequisites + +Before running this agent, you'll need: + +1. A LiveKit Cloud project (or a self-hosted LiveKit server). +2. An API key from Hive to access the above mentioned model. + +## Configuration + +Currently, this agent is configured entirely from the `agent.py` source code and the environment. + +### Environment Variables + +| configuration | description | example value | +|---------------|-------------|---------------| +| `LIVEKIT_URL` | Your LiveKit URL | `wss://test-abc123de.livekit.cloud` | +| `LIVEKIT_API_KEY` | Your LiveKit API key | | +| `LIVEKIT_API_SECRET` | Your LiveKit API secret | | +| `HIVE_API_KEY` | The API key from Hive to access the `Visual Content Moderation` model | `abc1deFgHIjK23KLMNOp45QrsTuv6wx8` | + +### Code + +| configuration | description | example value | +|---------------|-------------|---------------| +| `MOD_FRAME_INTERVAL` | Minimum number of seconds to wait between frames | 5.0 | +| `HIVE_HEADERS` | The headers to send with every request to the Hive API | `{}` | +| `CONFIDENCE_THRESHOLD` | The minimum score Hive's moderation class must meet before it is considered a problem | 0.9 | + +## Running + +Run this code like you would any other [LiveKit agent](https://docs.livekit.io/agents/build/anatomy/#starting-the-worker): + +``` +python3 agent.py start +``` + +Once running, the agent will join all new LiveKit rooms by default and begin moderation. 
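+
+The agent reads all of its configuration from the environment. Since `agent.py` calls
+`load_dotenv()`, a `.env` file in the working directory also works; a minimal sketch
+(all values below are placeholders):
+
+```
+LIVEKIT_URL=wss://test-abc123de.livekit.cloud
+LIVEKIT_API_KEY=<your LiveKit API key>
+LIVEKIT_API_SECRET=<your LiveKit API secret>
+HIVE_API_KEY=<your Hive API key>
+```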
diff --git a/examples/hive-moderation-agent/agent.py b/examples/hive-moderation-agent/agent.py
new file mode 100644
index 000000000..bf0b23b07
--- /dev/null
+++ b/examples/hive-moderation-agent/agent.py
@@ -0,0 +1,163 @@
+"""
+LiveKit agent that connects to a room and performs visual moderation on the video
+of all participants using the Visual Content Moderation model from Hive
+(https://docs.thehive.ai/docs/visual-content-moderation#visual-content-moderation).
+
+The agent periodically sends a frame from the participant's video to Hive's API
+for a moderation check. If the results of that check show a confidence score
+of 0.9 or higher for any of the positive classes, it logs the result and adds a
+message to the room's chat. This can easily be extended to take additional
+actions like removing a participant or ending a livestream, etc.
+"""
+
+import asyncio
+import logging
+import os
+import time
+from io import BytesIO
+
+import aiohttp
+from dotenv import load_dotenv
+from hive_data_classes import HiveResponse, from_dict
+from livekit import agents, rtc
+from PIL import Image
+
+load_dotenv()
+
+MOD_FRAME_INTERVAL = 5.0  # check 1 frame every 5 seconds
+"""
+How often to check a frame (in seconds)
+"""
+
+HIVE_HEADERS = {
+    "Authorization": f"Token {os.getenv('HIVE_API_KEY')}",
+    "accept": "application/json",
+}
+"""
+The default headers included with every request to thehive.ai
+"""
+
+CONFIDENCE_THRESHOLD = 0.9
+"""
+The threshold level for scores returned by thehive.ai. See details in this doc:
+https://docs.thehive.ai/docs/visual-content-moderation#choosing-thresholds-for-visual-moderation
+"""
+
+
+logger = logging.getLogger("hive-moderation-agent")
+logger.setLevel(logging.INFO)
+
+
+async def request_fnc(req: agents.JobRequest):
+    """
+    The request handler for the agent. We use this to set the name of the
+    agent that is displayed to users
+    """
+    # accept the job request and name the agent participant so users know what this is
+    await req.accept(
+        name="Moderator",
+        identity="hive-moderator",
+    )
+
+
+async def entrypoint(ctx: agents.JobContext):
+    """
+    The entrypoint of the agent. This is called every time the moderator
+    agent joins a room.
+    """
+
+    # connect to the room and automatically subscribe to all participants' video
+    await ctx.connect(auto_subscribe=agents.AutoSubscribe.VIDEO_ONLY)
+    chat = rtc.ChatManager(ctx.room)
+
+    @ctx.room.on("track_subscribed")
+    def on_track_subscribed(
+        track: rtc.Track,
+        _publication: rtc.TrackPublication,
+        participant: rtc.RemoteParticipant,
+    ):
+        """
+        Event handler for video tracks. We automatically subscribe to all video
+        tracks when a participant joins the room. This event is triggered
+        once we have completed subscription to that video track.
+        This creates a background task to process frames from each track
+        """
+        asyncio.create_task(process_track(participant, track))
+
+    async def process_track(participant: rtc.RemoteParticipant, track: rtc.VideoTrack):
+        """
+        This function is running in a background task once for each video track
+        (i.e., once for each participant). It handles processing a frame
+        from the video once every MOD_FRAME_INTERVAL seconds.
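+        Frames that arrive between checks are read from the stream but not processed.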
+ """ + + video_stream = rtc.VideoStream(track) + last_processed_time = 0 + async for frame in video_stream: + current_time = time.time() + if (current_time - last_processed_time) >= MOD_FRAME_INTERVAL: + last_processed_time = current_time + await check_frame(participant, frame) + + async def check_frame(participant: rtc.RemoteParticipant, frame: rtc.VideoFrame): + """ + Uses thehive.ai API to check the frame for any classifications we care about + """ + + # get the current frame and convert to png format + argb_frame = frame.frame.convert(rtc.VideoBufferType.RGBA) + image = Image.frombytes( + "RGBA", (argb_frame.width, argb_frame.height), argb_frame.data + ) + buffer = BytesIO() + image.save(buffer, format="PNG") + buffer.seek(0) # reset buffer position to beginning after writing + + data = aiohttp.FormData() + data.add_field("image", buffer, filename="image.png", content_type="image/png") + + # submit the image to Hive + logger.info("submitting image to hive") + async with aiohttp.ClientSession() as session: + async with session.post( + "https://api.thehive.ai/api/v2/task/sync", + headers=HIVE_HEADERS, + data=data, + ) as response: + response.raise_for_status() + response_dict = await response.json() + hive_response: HiveResponse = from_dict(HiveResponse, response_dict) + if ( + hive_response.code == 200 + and len(hive_response.status) > 0 + and len(hive_response.status[0].response.output) > 0 + ): + results = hive_response.status[0].response.output[0].classes + # filter to anything with a confidence score > threshold + for mod_class in results: + if mod_class.class_[0:4] == "yes_": + # TODO: should also include "general_nsfw" class + if mod_class.score >= CONFIDENCE_THRESHOLD: + class_name = mod_class.class_[4:] + message = ( + 'FOUND %s for participant "%s" (confidence score: %0.3f)' + % ( + class_name, + participant.identity, + mod_class.score, + ) + ) + logger.info(message) + await chat.send_message(message) + + await ctx.wait_for_participant() + await chat.send_message( + "I'm a moderation agent," + "I will detect and notify you of all inappropriate material in your video stream" + ) + + +if __name__ == "__main__": + logging.basicConfig(level=logging.INFO) + + agents.cli.run_app(agents.WorkerOptions(entrypoint, request_fnc=request_fnc)) diff --git a/examples/hive-moderation-agent/hive_data_classes.py b/examples/hive-moderation-agent/hive_data_classes.py new file mode 100644 index 000000000..a1773435d --- /dev/null +++ b/examples/hive-moderation-agent/hive_data_classes.py @@ -0,0 +1,95 @@ +from dataclasses import dataclass, is_dataclass +from typing import List, get_type_hints + + +def from_dict(cls, data): + if is_dataclass(cls) and isinstance(data, dict): + # Get type hints for all fields in the dataclass + field_types = get_type_hints(cls) + # Special handling for reserved words like 'class' + reserved_word_mappings = {"class": "class_"} # Map 'class' to 'class_' + processed_data = {} + for key, value in data.items(): + # Check if the key is a reserved word and map it accordingly + field_name = reserved_word_mappings.get(key, key) + # Only include keys that have corresponding fields in the dataclass + if field_name in field_types: + field_type = field_types[field_name] + # Determine if the field_type is itself a dataclass + if is_dataclass(field_type): + processed_value = from_dict(field_type, value) + elif hasattr(field_type, "__origin__") and issubclass( + field_type.__origin__, List + ): + # Handle List fields, assuming all elements are of the same type + item_type = 
+                    processed_value = [from_dict(item_type, item) for item in value]
+                else:
+                    processed_value = value
+                processed_data[field_name] = processed_value
+        return cls(**processed_data)
+    elif isinstance(data, list):
+        # This assumes that the function was called with a list type as `cls`,
+        # which might not work as expected without context on the list's element type.
+        # A better approach might be needed for handling lists of dataclasses.
+        return [
+            from_dict(cls.__args__[0], item) if hasattr(cls, "__args__") else item
+            for item in data
+        ]
+    else:
+        return data
+
+
+@dataclass
+class Status:
+    code: str
+    message: str
+
+
+@dataclass
+class ModInput:
+    id: str
+    charge: float
+    config_tag: str
+    config_version: float
+    created_on: str
+    model: str
+    model_type: str
+    model_version: float
+    project_id: int
+    user_id: int
+
+
+@dataclass
+class ModClass:
+    class_: str
+    score: float
+
+
+@dataclass
+class ModOutput:
+    time: int
+    classes: List[ModClass]
+
+
+@dataclass
+class Response:
+    input: ModInput
+    output: List[ModOutput]
+
+
+@dataclass
+class ModResponse:
+    status: Status
+    response: Response
+
+
+@dataclass
+class HiveResponse:
+    id: str
+    code: int
+    project_id: int
+    user_id: int
+    created_on: str
+    status: List[ModResponse]
+    from_cache: bool
diff --git a/examples/hive-moderation-agent/requirements.txt b/examples/hive-moderation-agent/requirements.txt
new file mode 100644
index 000000000..517a8283f
--- /dev/null
+++ b/examples/hive-moderation-agent/requirements.txt
@@ -0,0 +1,5 @@
+livekit
+livekit-agents
+python-dotenv
+Pillow
+aiohttp
\ No newline at end of file
From a76c21becb45305bfdd94c8347369091d40abe21 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Th=C3=A9o=20Monnom?=
Date: Mon, 23 Dec 2024 00:58:43 +0100
Subject: [PATCH 22/46] fix unknown `metadata` & `store` fields on OpenAI-like API (#1276)

---
 .changeset/lazy-dragons-give.md                          | 5 +++++
 .../livekit-plugins-openai/livekit/plugins/openai/llm.py | 9 +++++++--
 2 files changed, 12 insertions(+), 2 deletions(-)
 create mode 100644 .changeset/lazy-dragons-give.md

diff --git a/.changeset/lazy-dragons-give.md b/.changeset/lazy-dragons-give.md
new file mode 100644
index 000000000..6eb6a3db5
--- /dev/null
+++ b/.changeset/lazy-dragons-give.md
@@ -0,0 +1,5 @@
+---
+"livekit-plugins-openai": patch
+---
+
+fix unknown `metadata` & `store` fields on OpenAI-like API
diff --git a/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/llm.py b/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/llm.py
index bcff2cfa9..6f7cbccb1 100644
--- a/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/llm.py
+++ b/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/llm.py
@@ -730,6 +730,13 @@ async def _run(self) -> None:
         else:
             opts["tool_choice"] = self._tool_choice

+        if self._llm._opts.metadata is not None:
+            # some OpenAI-like APIs don't support having a `metadata` field.
(Even None) + opts["metadata"] = self._llm._opts.metadata + + if self._llm._opts.store is not None: + opts["store"] = self._llm._opts.store + user = self._user or openai.NOT_GIVEN messages = _build_oai_context(self._chat_ctx, id(self)) stream = await self._client.chat.completions.create( @@ -740,8 +747,6 @@ async def _run(self) -> None: stream_options={"include_usage": True}, stream=True, user=user, - store=self._llm._opts.store, - metadata=self._llm._opts.metadata, **opts, ) From 0f685455411b093140294b97e1f9dc153a3e9baa Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Sun, 22 Dec 2024 18:33:51 -0600 Subject: [PATCH 23/46] Version Packages (#1273) Co-authored-by: github-actions[bot] --- .changeset/curvy-knives-promise.md | 5 ---- .changeset/lazy-dragons-give.md | 5 ---- .changeset/pre.json | 25 ------------------- .../participant-entrypoint/requirements.txt | 2 +- examples/simple-color/requirements.txt | 2 +- examples/speech-to-text/requirements.txt | 2 +- examples/text-to-speech/requirements.txt | 4 +-- .../voice-pipeline-agent/requirements.txt | 2 +- livekit-agents/CHANGELOG.md | 6 +++++ livekit-agents/livekit/agents/version.py | 2 +- livekit-agents/package.json | 2 +- .../livekit-plugins-openai/CHANGELOG.md | 6 +++++ .../livekit/plugins/openai/version.py | 2 +- .../livekit-plugins-openai/package.json | 2 +- 14 files changed, 22 insertions(+), 45 deletions(-) delete mode 100644 .changeset/curvy-knives-promise.md delete mode 100644 .changeset/lazy-dragons-give.md delete mode 100644 .changeset/pre.json diff --git a/.changeset/curvy-knives-promise.md b/.changeset/curvy-knives-promise.md deleted file mode 100644 index a4d79c4ca..000000000 --- a/.changeset/curvy-knives-promise.md +++ /dev/null @@ -1,5 +0,0 @@ ---- -"livekit-agents": patch ---- - -avoid duplicated chat ctx for function calls with messages diff --git a/.changeset/lazy-dragons-give.md b/.changeset/lazy-dragons-give.md deleted file mode 100644 index 6eb6a3db5..000000000 --- a/.changeset/lazy-dragons-give.md +++ /dev/null @@ -1,5 +0,0 @@ ---- -"livekit-plugins-openai": patch ---- - -fix unknown `metadata` & `store` fields on OpenAI-like API diff --git a/.changeset/pre.json b/.changeset/pre.json deleted file mode 100644 index c3a216b74..000000000 --- a/.changeset/pre.json +++ /dev/null @@ -1,25 +0,0 @@ -{ - "mode": "exit", - "tag": "dev", - "initialVersions": { - "livekit-agents": "0.12.3", - "livekit-plugins-anthropic": "0.2.8", - "livekit-plugins-assemblyai": "0.2.1", - "livekit-plugins-azure": "0.5.1", - "livekit-plugins-browser": "0.0.5", - "livekit-plugins-cartesia": "0.4.5", - "livekit-plugins-deepgram": "0.6.15", - "livekit-plugins-elevenlabs": "0.7.9", - "livekit-plugins-fal": "0.2.2", - "livekit-plugins-google": "0.8.1", - "livekit-plugins-llama-index": "0.2.2", - "livekit-plugins-minimal": "0.2.1", - "livekit-plugins-nltk": "0.7.3", - "livekit-plugins-openai": "0.10.11", - "livekit-plugins-playht": "1.0.3", - "livekit-plugins-rag": "0.2.3", - "livekit-plugins-silero": "0.7.4", - "livekit-plugins-turn-detector": "0.3.3" - }, - "changesets": [] -} diff --git a/examples/participant-entrypoint/requirements.txt b/examples/participant-entrypoint/requirements.txt index 53a52b16a..5e6395561 100644 --- a/examples/participant-entrypoint/requirements.txt +++ b/examples/participant-entrypoint/requirements.txt @@ -1,2 +1,2 @@ -livekit-agents>=0.12.3 +livekit-agents>=0.12.4 python-dotenv~=1.0 diff --git a/examples/simple-color/requirements.txt 
b/examples/simple-color/requirements.txt index 53a52b16a..5e6395561 100644 --- a/examples/simple-color/requirements.txt +++ b/examples/simple-color/requirements.txt @@ -1,2 +1,2 @@ -livekit-agents>=0.12.3 +livekit-agents>=0.12.4 python-dotenv~=1.0 diff --git a/examples/speech-to-text/requirements.txt b/examples/speech-to-text/requirements.txt index 53ee39eb8..0a18a4bb6 100644 --- a/examples/speech-to-text/requirements.txt +++ b/examples/speech-to-text/requirements.txt @@ -1,3 +1,3 @@ -livekit-agents>=0.12.3 +livekit-agents>=0.12.4 livekit-plugins-deepgram>=0.6.15 python-dotenv~=1.0 diff --git a/examples/text-to-speech/requirements.txt b/examples/text-to-speech/requirements.txt index e5e0d8ddd..6a534b331 100644 --- a/examples/text-to-speech/requirements.txt +++ b/examples/text-to-speech/requirements.txt @@ -1,5 +1,5 @@ -livekit-agents>=0.12.3 -livekit-plugins-openai>=0.10.11 +livekit-agents>=0.12.4 +livekit-plugins-openai>=0.10.12 livekit-plugins-cartesia>=0.4.5 livekit-plugins-elevenlabs>=0.7.9 python-dotenv~=1.0 diff --git a/examples/voice-pipeline-agent/requirements.txt b/examples/voice-pipeline-agent/requirements.txt index c8942df19..77975fb53 100644 --- a/examples/voice-pipeline-agent/requirements.txt +++ b/examples/voice-pipeline-agent/requirements.txt @@ -1,4 +1,4 @@ -livekit-agents>=0.12.3 +livekit-agents>=0.12.4 livekit-plugins-deepgram>=0.6.15 livekit-plugins-google>=0.8.1 livekit-plugins-openai[vertex]>=0.10.10 diff --git a/livekit-agents/CHANGELOG.md b/livekit-agents/CHANGELOG.md index 83a2959c1..5bd84faf9 100644 --- a/livekit-agents/CHANGELOG.md +++ b/livekit-agents/CHANGELOG.md @@ -1,5 +1,11 @@ # livekit-agents +## 0.12.4 + +### Patch Changes + +- avoid duplicated chat ctx for function calls with messages - [#1254](https://github.com/livekit/agents/pull/1254) ([@longcw](https://github.com/longcw)) + ## 0.12.3 ### Patch Changes diff --git a/livekit-agents/livekit/agents/version.py b/livekit-agents/livekit/agents/version.py index 55829dea7..ee001ea03 100644 --- a/livekit-agents/livekit/agents/version.py +++ b/livekit-agents/livekit/agents/version.py @@ -12,4 +12,4 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-__version__ = "0.12.3" +__version__ = "0.12.4" diff --git a/livekit-agents/package.json b/livekit-agents/package.json index c23feb751..212896f2b 100644 --- a/livekit-agents/package.json +++ b/livekit-agents/package.json @@ -1,5 +1,5 @@ { "name": "livekit-agents", "private": true, - "version": "0.12.3" + "version": "0.12.4" } diff --git a/livekit-plugins/livekit-plugins-openai/CHANGELOG.md b/livekit-plugins/livekit-plugins-openai/CHANGELOG.md index d9f42cc0c..02ff2f06f 100644 --- a/livekit-plugins/livekit-plugins-openai/CHANGELOG.md +++ b/livekit-plugins/livekit-plugins-openai/CHANGELOG.md @@ -1,5 +1,11 @@ # livekit-plugins-openai +## 0.10.12 + +### Patch Changes + +- fix unknown `metadata` & `store` fields on OpenAI-like API - [#1276](https://github.com/livekit/agents/pull/1276) ([@theomonnom](https://github.com/theomonnom)) + ## 0.10.11 ### Patch Changes diff --git a/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/version.py b/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/version.py index 613650a21..16e535380 100644 --- a/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/version.py +++ b/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/version.py @@ -12,4 +12,4 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "0.10.11" +__version__ = "0.10.12" diff --git a/livekit-plugins/livekit-plugins-openai/package.json b/livekit-plugins/livekit-plugins-openai/package.json index a5087740b..bfe2370d0 100644 --- a/livekit-plugins/livekit-plugins-openai/package.json +++ b/livekit-plugins/livekit-plugins-openai/package.json @@ -1,5 +1,5 @@ { "name": "livekit-plugins-openai", "private": true, - "version": "0.10.11" + "version": "0.10.12" } From 4b7230330cbad47f31efee562bd64438225be405 Mon Sep 17 00:00:00 2001 From: Long Chen Date: Mon, 23 Dec 2024 10:53:33 +0800 Subject: [PATCH 24/46] fix: check fnc_text_message_id it not None (#1271) --- .../livekit/agents/pipeline/pipeline_agent.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/livekit-agents/livekit/agents/pipeline/pipeline_agent.py b/livekit-agents/livekit/agents/pipeline/pipeline_agent.py index 5493583b7..65d7e83f8 100644 --- a/livekit-agents/livekit/agents/pipeline/pipeline_agent.py +++ b/livekit-agents/livekit/agents/pipeline/pipeline_agent.py @@ -808,13 +808,15 @@ def _commit_user_question_if_needed() -> None: and (not user_question or speech_handle.user_committed) ): if speech_handle.extra_tools_messages: - msgs = self._chat_ctx.messages - if msgs and msgs[-1].id == speech_handle.fnc_text_message_id: - # remove text message alongside function calls if it's the last in the ctx - msgs.pop() - elif speech_handle.extra_tools_messages[0].tool_calls: - # remove the content of the tool call message - speech_handle.extra_tools_messages[0].content = "" + if speech_handle.fnc_text_message_id is not None: + # there is a message alongside the function calls + msgs = self._chat_ctx.messages + if msgs and msgs[-1].id == speech_handle.fnc_text_message_id: + # replace it with the tool call message if it's the last in the ctx + msgs.pop() + elif speech_handle.extra_tools_messages[0].tool_calls: + # remove the content of the tool call message + speech_handle.extra_tools_messages[0].content = "" self._chat_ctx.messages.extend(speech_handle.extra_tools_messages) if interrupted: From f0175c4e15091db931b2a926d6f607dbf3994246 Mon Sep 17 00:00:00 2001 From: Long Chen Date: Mon, 23 Dec 2024 10:54:22 +0800 
Subject: [PATCH 25/46] fix: set USE_DOCSTRING as default for ai_callable (#1266) --- .changeset/nasty-rings-wave.md | 5 ++++ .../livekit/agents/llm/function_context.py | 14 +++++------ tests/test_create_func.py | 23 +++++++++++++++++-- 3 files changed, 32 insertions(+), 10 deletions(-) create mode 100644 .changeset/nasty-rings-wave.md diff --git a/.changeset/nasty-rings-wave.md b/.changeset/nasty-rings-wave.md new file mode 100644 index 000000000..cbbcb7979 --- /dev/null +++ b/.changeset/nasty-rings-wave.md @@ -0,0 +1,5 @@ +--- +"livekit-agents": patch +--- + +set USE_DOCSTRING as default for ai_callable diff --git a/livekit-agents/livekit/agents/llm/function_context.py b/livekit-agents/livekit/agents/llm/function_context.py index 4470492fe..59604fc8d 100644 --- a/livekit-agents/livekit/agents/llm/function_context.py +++ b/livekit-agents/livekit/agents/llm/function_context.py @@ -105,7 +105,7 @@ class CalledFunction: def ai_callable( *, name: str | None = None, - description: str | _UseDocMarker | None = None, + description: str | _UseDocMarker = USE_DOCSTRING, auto_retry: bool = False, ) -> Callable: def deco(f): @@ -127,7 +127,7 @@ def ai_callable( self, *, name: str | None = None, - description: str | _UseDocMarker | None = None, + description: str | _UseDocMarker = USE_DOCSTRING, auto_retry: bool = True, ) -> Callable: def deco(f): @@ -243,19 +243,17 @@ def _extract_types(annotation: type) -> tuple[type, TypeInfo | None]: def _set_metadata( f: Callable, name: str | None = None, - desc: str | _UseDocMarker | None = None, + desc: str | _UseDocMarker = USE_DOCSTRING, auto_retry: bool = False, ) -> None: - if desc is None: - desc = "" - if isinstance(desc, _UseDocMarker): - desc = inspect.getdoc(f) - if desc is None: + docstring = inspect.getdoc(f) + if docstring is None: raise ValueError( f"missing docstring for function {f.__name__}, " "use explicit description or provide docstring" ) + desc = docstring metadata = _AIFncMetadata( name=name or f.__name__, description=desc, auto_retry=auto_retry diff --git a/tests/test_create_func.py b/tests/test_create_func.py index 97583fb36..a81d31d93 100644 --- a/tests/test_create_func.py +++ b/tests/test_create_func.py @@ -43,11 +43,15 @@ def test_fn( def test_func_duplicate(): class TestFunctionContext(llm.FunctionContext): - @llm.ai_callable(name="duplicate_function") + @llm.ai_callable( + name="duplicate_function", description="A simple test function" + ) def fn1(self): pass - @llm.ai_callable(name="duplicate_function") + @llm.ai_callable( + name="duplicate_function", description="A simple test function" + ) def fn2(self): pass @@ -57,6 +61,21 @@ def fn2(self): TestFunctionContext() +def test_func_with_docstring(): + class TestFunctionContext(llm.FunctionContext): + @llm.ai_callable() + def test_fn(self): + """A simple test function""" + pass + + fnc_ctx = TestFunctionContext() + assert ( + "test_fn" in fnc_ctx.ai_functions + ), "Function should be registered in ai_functions" + + assert fnc_ctx.ai_functions["test_fn"].description == "A simple test function" + + def test_func_with_optional_parameter(): class TestFunctionContext(llm.FunctionContext): @llm.ai_callable( From 12047cda44b9d43e06a49388ca39b1c780ff9768 Mon Sep 17 00:00:00 2001 From: David Zhao Date: Sun, 22 Dec 2024 21:55:44 -0800 Subject: [PATCH 26/46] Add jinja2 dependency to turn detector (#1277) --- .changeset/dirty-mails-reflect.md | 5 +++++ .changeset/four-rockets-accept.md | 5 +++++ .../livekit/agents/pipeline/pipeline_agent.py | 13 ++++++++----- 
.../livekit-plugins-turn-detector/setup.py | 1 + 4 files changed, 19 insertions(+), 5 deletions(-) create mode 100644 .changeset/dirty-mails-reflect.md create mode 100644 .changeset/four-rockets-accept.md diff --git a/.changeset/dirty-mails-reflect.md b/.changeset/dirty-mails-reflect.md new file mode 100644 index 000000000..34eedc25a --- /dev/null +++ b/.changeset/dirty-mails-reflect.md @@ -0,0 +1,5 @@ +--- +"livekit-agents": patch +--- + +make max_endpoint_delay configurable diff --git a/.changeset/four-rockets-accept.md b/.changeset/four-rockets-accept.md new file mode 100644 index 000000000..a200e141d --- /dev/null +++ b/.changeset/four-rockets-accept.md @@ -0,0 +1,5 @@ +--- +"livekit-plugins-turn-detector": patch +--- + +add jinja2 dependency to turn detector diff --git a/livekit-agents/livekit/agents/pipeline/pipeline_agent.py b/livekit-agents/livekit/agents/pipeline/pipeline_agent.py index 65d7e83f8..3b9f8e83b 100644 --- a/livekit-agents/livekit/agents/pipeline/pipeline_agent.py +++ b/livekit-agents/livekit/agents/pipeline/pipeline_agent.py @@ -130,6 +130,7 @@ class _ImplOptions: int_speech_duration: float int_min_words: int min_endpointing_delay: float + max_endpointing_delay: float max_nested_fnc_calls: int preemptive_synthesis: bool before_llm_cb: BeforeLLMCallback @@ -190,6 +191,7 @@ def __init__( interrupt_speech_duration: float = 0.5, interrupt_min_words: int = 0, min_endpointing_delay: float = 0.5, + max_endpointing_delay: float = 6.0, max_nested_fnc_calls: int = 1, preemptive_synthesis: bool = False, transcription: AgentTranscriptionOptions = AgentTranscriptionOptions(), @@ -247,6 +249,7 @@ def __init__( int_speech_duration=interrupt_speech_duration, int_min_words=interrupt_min_words, min_endpointing_delay=min_endpointing_delay, + max_endpointing_delay=max_endpointing_delay, max_nested_fnc_calls=max_nested_fnc_calls, preemptive_synthesis=preemptive_synthesis, transcription=transcription, @@ -293,7 +296,8 @@ def __init__( self._deferred_validation = _DeferredReplyValidation( self._validate_reply_if_possible, - self._opts.min_endpointing_delay, + min_endpointing_delay=self._opts.min_endpointing_delay, + max_endpointing_delay=self._opts.max_endpointing_delay, turn_detector=self._turn_detector, agent=self, ) @@ -1120,15 +1124,13 @@ class _DeferredReplyValidation: PUNCTUATION = ".!?" 
PUNCTUATION_REDUCE_FACTOR = 0.75 - # Long delay to use when the model thinks the user is still speaking - UNLIKELY_ENDPOINT_DELAY = 6 - FINAL_TRANSCRIPT_TIMEOUT = 5 def __init__( self, validate_fnc: Callable[[], None], min_endpointing_delay: float, + max_endpointing_delay: float, turn_detector: _TurnDetector | None, agent: VoicePipelineAgent, ) -> None: @@ -1144,6 +1146,7 @@ def __init__( self._agent = agent self._end_of_speech_delay = min_endpointing_delay + self._max_endpointing_delay = max_endpointing_delay @property def validating(self) -> bool: @@ -1237,7 +1240,7 @@ async def _run_task(chat_ctx: ChatContext, delay: float) -> None: unlikely_threshold = self._turn_detector.unlikely_threshold() elasped = time.perf_counter() - start_time if eot_prob < unlikely_threshold: - delay = self.UNLIKELY_ENDPOINT_DELAY + delay = self._max_endpointing_delay delay = max(0, delay - elasped) await asyncio.sleep(delay) diff --git a/livekit-plugins/livekit-plugins-turn-detector/setup.py b/livekit-plugins/livekit-plugins-turn-detector/setup.py index f53e82135..1585ed0cf 100644 --- a/livekit-plugins/livekit-plugins-turn-detector/setup.py +++ b/livekit-plugins/livekit-plugins-turn-detector/setup.py @@ -54,6 +54,7 @@ "transformers>=4.47.1", "numpy>=1.26", "onnxruntime>=1.18", + "jinja2", ], package_data={"livekit.plugins.turn_detector": ["py.typed"]}, project_urls={ From 8afc3955e9e1158199bcd3eafc29c5d350ca65db Mon Sep 17 00:00:00 2001 From: Tina Nguyen <72938484+tinalenguyen@users.noreply.github.com> Date: Mon, 23 Dec 2024 01:35:02 -0500 Subject: [PATCH 27/46] added ConversationPersistor() to document events/transcriptions in external file (#1209) --- examples/conversation_persistor.py | 213 +++++++++++++++++++++++++++++ 1 file changed, 213 insertions(+) create mode 100644 examples/conversation_persistor.py diff --git a/examples/conversation_persistor.py b/examples/conversation_persistor.py new file mode 100644 index 000000000..0d9909b63 --- /dev/null +++ b/examples/conversation_persistor.py @@ -0,0 +1,213 @@ +import asyncio +import logging +from dataclasses import dataclass +from datetime import datetime +from typing import Union + +import aiofiles +from dotenv import load_dotenv +from livekit.agents import ( + AutoSubscribe, + JobContext, + WorkerOptions, + cli, + multimodal, + utils, +) +from livekit.agents.llm import ChatMessage +from livekit.agents.multimodal.multimodal_agent import EventTypes +from livekit.plugins import openai + + +@dataclass +class EventLog: + eventname: str | None + """name of recorded event""" + time: str = datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f")[:-3] + """time the event is recorded""" + + +@dataclass +class TranscriptionLog: + role: str | None + """role of the speaker""" + transcription: str | None + """transcription of speech""" + time: str = datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f")[:-3] + """time the event is recorded""" + + +class ConversationPersistor(utils.EventEmitter[EventTypes]): + def __init__( + self, + *, + model: multimodal.MultimodalAgent | None, + log: str | None, + transcriptions_only: bool = False, + ): + """ + Initializes a ConversationPersistor instance which records the events and transcriptions of a MultimodalAgent. 
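+        Logged events are queued and appended to the file asynchronously by a background task.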
+
+        Args:
+            model (multimodal.MultimodalAgent): an instance of a MultimodalAgent
+            log (str): name of the external file to record events in
+            transcriptions_only (bool): a boolean variable to determine if only transcriptions will be recorded, False by default
+            user_transcriptions (list): list of user transcriptions
+            agent_transcriptions (list): list of agent transcriptions
+            events (list): list of all events
+            log_q (asyncio.Queue): a queue of EventLog and TranscriptionLog
+
+        """
+        super().__init__()
+
+        self._model = model
+        self._log = log
+        self._transcriptions_only = transcriptions_only
+
+        self._user_transcriptions = []
+        self._agent_transcriptions = []
+        self._events = []
+
+        self._log_q = asyncio.Queue[Union[EventLog, TranscriptionLog, None]]()
+
+    @property
+    def log(self) -> str | None:
+        return self._log
+
+    @property
+    def model(self) -> multimodal.MultimodalAgent | None:
+        return self._model
+
+    @property
+    def user_transcriptions(self) -> list:
+        return self._user_transcriptions
+
+    @property
+    def agent_transcriptions(self) -> list:
+        return self._agent_transcriptions
+
+    @property
+    def events(self) -> list:
+        return self._events
+
+    @log.setter
+    def log(self, newlog: str | None) -> None:
+        self._log = newlog
+
+    async def _main_atask(self) -> None:
+        # Writes to file asynchronously
+        while True:
+            log = await self._log_q.get()
+
+            if log is None:
+                break
+
+            async with aiofiles.open(self._log, "a") as file:
+                if type(log) is EventLog and not self._transcriptions_only:
+                    self._events.append(log)
+                    await file.write("\n" + log.time + " " + log.eventname)
+
+                if type(log) is TranscriptionLog:
+                    if log.role == "user":
+                        self._user_transcriptions.append(log)
+                    else:
+                        self._agent_transcriptions.append(log)
+
+                    await file.write(
+                        "\n" + log.time + " " + log.role + " " + log.transcription
+                    )
+
+    async def aclose(self) -> None:
+        # Exits
+        self._log_q.put_nowait(None)
+        await self._main_task
+
+    def start(self) -> None:
+        # Listens for emitted MultimodalAgent events
+        self._main_task = asyncio.create_task(self._main_atask())
+
+        @self._model.on("user_started_speaking")
+        def _user_started_speaking():
+            event = EventLog(eventname="user_started_speaking")
+            self._log_q.put_nowait(event)
+
+        @self._model.on("user_stopped_speaking")
+        def _user_stopped_speaking():
+            event = EventLog(eventname="user_stopped_speaking")
+            self._log_q.put_nowait(event)
+
+        @self._model.on("agent_started_speaking")
+        def _agent_started_speaking():
+            event = EventLog(eventname="agent_started_speaking")
+            self._log_q.put_nowait(event)
+
+        @self._model.on("agent_stopped_speaking")
+        def _agent_stopped_speaking():
+            transcription = TranscriptionLog(
+                role="agent",
+                transcription=(self._model._playing_handle._tr_fwd.played_text)[1:],
+            )
+            self._log_q.put_nowait(transcription)
+
+            event = EventLog(eventname="agent_stopped_speaking")
+            self._log_q.put_nowait(event)
+
+        @self._model.on("user_speech_committed")
+        def _user_speech_committed(user_msg: ChatMessage):
+            transcription = TranscriptionLog(
+                role="user", transcription=user_msg.content
+            )
+            self._log_q.put_nowait(transcription)
+
+            event = EventLog(eventname="user_speech_committed")
+            self._log_q.put_nowait(event)
+
+        @self._model.on("agent_speech_committed")
+        def _agent_speech_committed():
+            event = EventLog(eventname="agent_speech_committed")
+            self._log_q.put_nowait(event)
+
+        @self._model.on("agent_speech_interrupted")
+        def _agent_speech_interrupted():
+            event = EventLog(eventname="agent_speech_interrupted")
+            self._log_q.put_nowait(event)
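+
+        # the two handlers below record the MultimodalAgent's function-calling
+        # activity: "function_calls_collected" fires when the model gathers tool
+        # calls, "function_calls_finished" once they have completed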
@self._model.on("function_calls_collected") + def _function_calls_collected(): + event = EventLog(eventname="function_calls_collected") + self._log_q.put_nowait(event) + + @self._model.on("function_calls_finished") + def _function_calls_finished(): + event = EventLog(eventname="function_calls_finished") + self._log_q.put_nowait(event) + + +load_dotenv() + +logger = logging.getLogger("my-worker") +logger.setLevel(logging.INFO) + + +async def entrypoint(ctx: JobContext): + agent = multimodal.MultimodalAgent( + model=openai.realtime.RealtimeModel( + voice="alloy", + temperature=0.8, + instructions="You are a helpful assistant.", + turn_detection=openai.realtime.ServerVadOptions( + threshold=0.6, prefix_padding_ms=200, silence_duration_ms=500 + ), + ), + ) + + cp = ConversationPersistor(model=agent, log="log.txt") + cp.start() + + await ctx.connect(auto_subscribe=AutoSubscribe.AUDIO_ONLY) + participant = await ctx.wait_for_participant() + agent.start(ctx.room, participant) + + +if __name__ == "__main__": + cli.run_app(WorkerOptions(entrypoint_fnc=entrypoint)) From 797253de7cff9ffae223b7d38006698cee17f51d Mon Sep 17 00:00:00 2001 From: Juan Mugica Gonzalez <47819159+jmugicagonz@users.noreply.github.com> Date: Mon, 23 Dec 2024 07:36:47 +0100 Subject: [PATCH 28/46] Substitute google error for warning (#1280) --- .../livekit-plugins-openai/livekit/plugins/openai/llm.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/llm.py b/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/llm.py index 6f7cbccb1..8e3dda787 100644 --- a/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/llm.py +++ b/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/llm.py @@ -220,8 +220,8 @@ def with_vertex( location = location _gac = os.environ.get("GOOGLE_APPLICATION_CREDENTIALS") if _gac is None: - raise ValueError( - "`GOOGLE_APPLICATION_CREDENTIALS` environment variable is not set. please set it to the path of the service account key file." + logger.warning( + "`GOOGLE_APPLICATION_CREDENTIALS` environment variable is not set. please set it to the path of the service account key file. Otherwise, use any of the other Google Cloud auth methods." 
) try: From 42b0e683e08fbc6624b38dfab2cb947357f4638c Mon Sep 17 00:00:00 2001 From: David Zhao Date: Mon, 23 Dec 2024 00:19:41 -0800 Subject: [PATCH 29/46] fix: do not log process warning when process not found (#1281) --- .changeset/six-wasps-pay.md | 5 +++++ livekit-agents/livekit/agents/ipc/supervised_proc.py | 5 +++++ 2 files changed, 10 insertions(+) create mode 100644 .changeset/six-wasps-pay.md diff --git a/.changeset/six-wasps-pay.md b/.changeset/six-wasps-pay.md new file mode 100644 index 000000000..03ccb0dd8 --- /dev/null +++ b/.changeset/six-wasps-pay.md @@ -0,0 +1,5 @@ +--- +"livekit-agents": patch +--- + +fix: do not log process warning when process not found diff --git a/livekit-agents/livekit/agents/ipc/supervised_proc.py b/livekit-agents/livekit/agents/ipc/supervised_proc.py index e93f46a9e..e56119876 100644 --- a/livekit-agents/livekit/agents/ipc/supervised_proc.py +++ b/livekit-agents/livekit/agents/ipc/supervised_proc.py @@ -378,11 +378,16 @@ async def _memory_monitor_task(self) -> None: ) except (psutil.NoSuchProcess, psutil.AccessDenied) as e: + if self._closing or self._kill_sent: + return + logger.warning( "Failed to get memory info for process", extra=self.logging_extra(), exc_info=e, ) + # don't bother rechecking if we cannot get process info + return except Exception: if self._closing or self._kill_sent: return From c7881f3776faa2dc4cea0bda4fd832173c00ac17 Mon Sep 17 00:00:00 2001 From: Juan Mugica Gonzalez <47819159+jmugicagonz@users.noreply.github.com> Date: Mon, 23 Dec 2024 09:21:17 +0100 Subject: [PATCH 30/46] fix context when functions have been called (#1279) Co-authored-by: David Zhao --- .changeset/witty-fishes-stare.md | 5 +++++ livekit-agents/livekit/agents/pipeline/pipeline_agent.py | 5 +++++ 2 files changed, 10 insertions(+) create mode 100644 .changeset/witty-fishes-stare.md diff --git a/.changeset/witty-fishes-stare.md b/.changeset/witty-fishes-stare.md new file mode 100644 index 000000000..4f82113d7 --- /dev/null +++ b/.changeset/witty-fishes-stare.md @@ -0,0 +1,5 @@ +--- +"livekit-agents": patch +--- + +fix context when functions have been called diff --git a/livekit-agents/livekit/agents/pipeline/pipeline_agent.py b/livekit-agents/livekit/agents/pipeline/pipeline_agent.py index 3b9f8e83b..7b5c28e79 100644 --- a/livekit-agents/livekit/agents/pipeline/pipeline_agent.py +++ b/livekit-agents/livekit/agents/pipeline/pipeline_agent.py @@ -702,6 +702,11 @@ async def _synthesize_answer_task( not playing_speech.user_question or playing_speech.user_committed ) and not playing_speech.speech_committed: # the speech is playing but not committed yet, add it to the chat context for this new reply synthesis + # First add the previous function call message if any + if playing_speech.extra_tools_messages: + copied_ctx.messages.extend(playing_speech.extra_tools_messages) + + # Then add the previous assistant message copied_ctx.messages.append( ChatMessage.create( text=playing_speech.synthesis_handle.tts_forwarder.played_text, From 799a53d519cf0ab79e7ad00751d2aa69f7a311a4 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Mon, 23 Dec 2024 12:58:11 -0600 Subject: [PATCH 31/46] Version Packages (#1278) Co-authored-by: github-actions[bot] --- .changeset/dirty-mails-reflect.md | 5 ----- .changeset/four-rockets-accept.md | 5 ----- .changeset/nasty-rings-wave.md | 5 ----- .changeset/six-wasps-pay.md | 5 ----- .changeset/witty-fishes-stare.md | 5 ----- examples/participant-entrypoint/requirements.txt | 2 +- 
examples/simple-color/requirements.txt | 2 +- examples/speech-to-text/requirements.txt | 2 +- examples/text-to-speech/requirements.txt | 2 +- examples/voice-pipeline-agent/requirements.txt | 2 +- livekit-agents/CHANGELOG.md | 12 ++++++++++++ livekit-agents/livekit/agents/version.py | 2 +- livekit-agents/package.json | 2 +- .../livekit-plugins-turn-detector/CHANGELOG.md | 6 ++++++ .../livekit/plugins/turn_detector/version.py | 2 +- .../livekit-plugins-turn-detector/package.json | 2 +- 16 files changed, 27 insertions(+), 34 deletions(-) delete mode 100644 .changeset/dirty-mails-reflect.md delete mode 100644 .changeset/four-rockets-accept.md delete mode 100644 .changeset/nasty-rings-wave.md delete mode 100644 .changeset/six-wasps-pay.md delete mode 100644 .changeset/witty-fishes-stare.md diff --git a/.changeset/dirty-mails-reflect.md b/.changeset/dirty-mails-reflect.md deleted file mode 100644 index 34eedc25a..000000000 --- a/.changeset/dirty-mails-reflect.md +++ /dev/null @@ -1,5 +0,0 @@ ---- -"livekit-agents": patch ---- - -make max_endpoint_delay configurable diff --git a/.changeset/four-rockets-accept.md b/.changeset/four-rockets-accept.md deleted file mode 100644 index a200e141d..000000000 --- a/.changeset/four-rockets-accept.md +++ /dev/null @@ -1,5 +0,0 @@ ---- -"livekit-plugins-turn-detector": patch ---- - -add jinja2 dependency to turn detector diff --git a/.changeset/nasty-rings-wave.md b/.changeset/nasty-rings-wave.md deleted file mode 100644 index cbbcb7979..000000000 --- a/.changeset/nasty-rings-wave.md +++ /dev/null @@ -1,5 +0,0 @@ ---- -"livekit-agents": patch ---- - -set USE_DOCSTRING as default for ai_callable diff --git a/.changeset/six-wasps-pay.md b/.changeset/six-wasps-pay.md deleted file mode 100644 index 03ccb0dd8..000000000 --- a/.changeset/six-wasps-pay.md +++ /dev/null @@ -1,5 +0,0 @@ ---- -"livekit-agents": patch ---- - -fix: do not log process warning when process not found diff --git a/.changeset/witty-fishes-stare.md b/.changeset/witty-fishes-stare.md deleted file mode 100644 index 4f82113d7..000000000 --- a/.changeset/witty-fishes-stare.md +++ /dev/null @@ -1,5 +0,0 @@ ---- -"livekit-agents": patch ---- - -fix context when functions have been called diff --git a/examples/participant-entrypoint/requirements.txt b/examples/participant-entrypoint/requirements.txt index 5e6395561..a92be36b8 100644 --- a/examples/participant-entrypoint/requirements.txt +++ b/examples/participant-entrypoint/requirements.txt @@ -1,2 +1,2 @@ -livekit-agents>=0.12.4 +livekit-agents>=0.12.5 python-dotenv~=1.0 diff --git a/examples/simple-color/requirements.txt b/examples/simple-color/requirements.txt index 5e6395561..a92be36b8 100644 --- a/examples/simple-color/requirements.txt +++ b/examples/simple-color/requirements.txt @@ -1,2 +1,2 @@ -livekit-agents>=0.12.4 +livekit-agents>=0.12.5 python-dotenv~=1.0 diff --git a/examples/speech-to-text/requirements.txt b/examples/speech-to-text/requirements.txt index 0a18a4bb6..e58a682b3 100644 --- a/examples/speech-to-text/requirements.txt +++ b/examples/speech-to-text/requirements.txt @@ -1,3 +1,3 @@ -livekit-agents>=0.12.4 +livekit-agents>=0.12.5 livekit-plugins-deepgram>=0.6.15 python-dotenv~=1.0 diff --git a/examples/text-to-speech/requirements.txt b/examples/text-to-speech/requirements.txt index 6a534b331..f025ab277 100644 --- a/examples/text-to-speech/requirements.txt +++ b/examples/text-to-speech/requirements.txt @@ -1,4 +1,4 @@ -livekit-agents>=0.12.4 +livekit-agents>=0.12.5 livekit-plugins-openai>=0.10.12 livekit-plugins-cartesia>=0.4.5 
livekit-plugins-elevenlabs>=0.7.9 diff --git a/examples/voice-pipeline-agent/requirements.txt b/examples/voice-pipeline-agent/requirements.txt index 77975fb53..481cb0136 100644 --- a/examples/voice-pipeline-agent/requirements.txt +++ b/examples/voice-pipeline-agent/requirements.txt @@ -1,4 +1,4 @@ -livekit-agents>=0.12.4 +livekit-agents>=0.12.5 livekit-plugins-deepgram>=0.6.15 livekit-plugins-google>=0.8.1 livekit-plugins-openai[vertex]>=0.10.10 diff --git a/livekit-agents/CHANGELOG.md b/livekit-agents/CHANGELOG.md index 5bd84faf9..b04f10f1d 100644 --- a/livekit-agents/CHANGELOG.md +++ b/livekit-agents/CHANGELOG.md @@ -1,5 +1,17 @@ # livekit-agents +## 0.12.5 + +### Patch Changes + +- make max_endpoint_delay configurable - [#1277](https://github.com/livekit/agents/pull/1277) ([@davidzhao](https://github.com/davidzhao)) + +- set USE_DOCSTRING as default for ai_callable - [#1266](https://github.com/livekit/agents/pull/1266) ([@longcw](https://github.com/longcw)) + +- fix: do not log process warning when process not found - [#1281](https://github.com/livekit/agents/pull/1281) ([@davidzhao](https://github.com/davidzhao)) + +- fix context when functions have been called - [#1279](https://github.com/livekit/agents/pull/1279) ([@jmugicagonz](https://github.com/jmugicagonz)) + ## 0.12.4 ### Patch Changes diff --git a/livekit-agents/livekit/agents/version.py b/livekit-agents/livekit/agents/version.py index ee001ea03..93e989e31 100644 --- a/livekit-agents/livekit/agents/version.py +++ b/livekit-agents/livekit/agents/version.py @@ -12,4 +12,4 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "0.12.4" +__version__ = "0.12.5" diff --git a/livekit-agents/package.json b/livekit-agents/package.json index 212896f2b..4986b2889 100644 --- a/livekit-agents/package.json +++ b/livekit-agents/package.json @@ -1,5 +1,5 @@ { "name": "livekit-agents", "private": true, - "version": "0.12.4" + "version": "0.12.5" } diff --git a/livekit-plugins/livekit-plugins-turn-detector/CHANGELOG.md b/livekit-plugins/livekit-plugins-turn-detector/CHANGELOG.md index 0bc8544b5..2d38bf347 100644 --- a/livekit-plugins/livekit-plugins-turn-detector/CHANGELOG.md +++ b/livekit-plugins/livekit-plugins-turn-detector/CHANGELOG.md @@ -1,5 +1,11 @@ # livekit-plugins-eou +## 0.3.4 + +### Patch Changes + +- add jinja2 dependency to turn detector - [#1277](https://github.com/livekit/agents/pull/1277) ([@davidzhao](https://github.com/davidzhao)) + ## 0.3.3 ### Patch Changes diff --git a/livekit-plugins/livekit-plugins-turn-detector/livekit/plugins/turn_detector/version.py b/livekit-plugins/livekit-plugins-turn-detector/livekit/plugins/turn_detector/version.py index 6b8f1ef90..bcfe9b179 100644 --- a/livekit-plugins/livekit-plugins-turn-detector/livekit/plugins/turn_detector/version.py +++ b/livekit-plugins/livekit-plugins-turn-detector/livekit/plugins/turn_detector/version.py @@ -12,4 +12,4 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-__version__ = "0.3.3" +__version__ = "0.3.4" diff --git a/livekit-plugins/livekit-plugins-turn-detector/package.json b/livekit-plugins/livekit-plugins-turn-detector/package.json index acd5d4300..82d16bb89 100644 --- a/livekit-plugins/livekit-plugins-turn-detector/package.json +++ b/livekit-plugins/livekit-plugins-turn-detector/package.json @@ -1,5 +1,5 @@ { "name": "livekit-plugins-turn-detector", "private": true, - "version": "0.3.3" + "version": "0.3.4" } From 37bbfccb0166b174c3cb399497f6b7465f97311b Mon Sep 17 00:00:00 2001 From: aoife cassidy Date: Mon, 23 Dec 2024 23:16:31 +0200 Subject: [PATCH 32/46] fix(turn-detector): fix int32/64 errors on Windows (#1285) --- .changeset/tricky-spiders-change.md | 5 +++++ .../livekit/plugins/turn_detector/eou.py | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) create mode 100644 .changeset/tricky-spiders-change.md diff --git a/.changeset/tricky-spiders-change.md b/.changeset/tricky-spiders-change.md new file mode 100644 index 000000000..a017624fc --- /dev/null +++ b/.changeset/tricky-spiders-change.md @@ -0,0 +1,5 @@ +--- +"livekit-plugins-turn-detector": patch +--- + +fix int32/64 errors on Windows diff --git a/livekit-plugins/livekit-plugins-turn-detector/livekit/plugins/turn_detector/eou.py b/livekit-plugins/livekit-plugins-turn-detector/livekit/plugins/turn_detector/eou.py index acb915ab5..8c8090946 100644 --- a/livekit-plugins/livekit-plugins-turn-detector/livekit/plugins/turn_detector/eou.py +++ b/livekit-plugins/livekit-plugins-turn-detector/livekit/plugins/turn_detector/eou.py @@ -106,7 +106,7 @@ def run(self, data: bytes) -> bytes | None: return_tensors="np", ) - input_dict = {"input_ids": inputs["input_ids"]} + input_dict = {"input_ids": np.array(inputs["input_ids"], dtype=np.int64)} # Run inference outputs = self._session.run(["logits"], input_dict) From c89960882146897049a393db0de8903bea0d54fc Mon Sep 17 00:00:00 2001 From: David Zhao Date: Tue, 24 Dec 2024 10:34:21 -0800 Subject: [PATCH 33/46] improve interruption handling, avoid agent from getting stuck (#1290) --- .changeset/silent-oranges-warn.md | 5 +++++ .../livekit/agents/pipeline/pipeline_agent.py | 21 ++++++++++++++----- 2 files changed, 21 insertions(+), 5 deletions(-) create mode 100644 .changeset/silent-oranges-warn.md diff --git a/.changeset/silent-oranges-warn.md b/.changeset/silent-oranges-warn.md new file mode 100644 index 000000000..e7bcd0189 --- /dev/null +++ b/.changeset/silent-oranges-warn.md @@ -0,0 +1,5 @@ +--- +"livekit-agents": patch +--- + +improve interruption handling, avoid agent from getting stuck diff --git a/livekit-agents/livekit/agents/pipeline/pipeline_agent.py b/livekit-agents/livekit/agents/pipeline/pipeline_agent.py index 7b5c28e79..b2a223bd0 100644 --- a/livekit-agents/livekit/agents/pipeline/pipeline_agent.py +++ b/livekit-agents/livekit/agents/pipeline/pipeline_agent.py @@ -714,6 +714,9 @@ async def _synthesize_answer_task( ) ) + # we want to add this question even if it's empty. during false positive interruptions, + # adding an empty user message gives the LLM context so it could continue from where + # it had been interrupted. 
copied_ctx.messages.append( ChatMessage.create(text=handle.user_question, role="user") ) @@ -1035,7 +1038,7 @@ async def _llm_stream_to_str_generator( def _validate_reply_if_possible(self) -> None: """Check if the new agent speech should be played""" - if self._playing_speech is not None: + if self._playing_speech and not self._playing_speech.interrupted: should_ignore_input = False if not self._playing_speech.allow_interruptions: should_ignore_input = True @@ -1049,19 +1052,24 @@ def _validate_reply_if_possible(self) -> None: "interrupt threshold is not met", extra={"speech_id": self._playing_speech.id}, ) + if should_ignore_input: self._transcribed_text = "" return if self._pending_agent_reply is None: - if self._opts.preemptive_synthesis or not self._transcribed_text: + if self._opts.preemptive_synthesis: return + # as long as we don't have a pending reply, we need to synthesize it + # in order to keep the conversation flowing. + # transcript could be empty at this moment, if the user interrupted the agent + # but did not generate any transcribed text. self._synthesize_agent_reply() assert self._pending_agent_reply is not None - # in some bad timing, we could end up with two pushed agent replies inside the speech queue. + # due to timing, we could end up with two pushed agent replies inside the speech queue. # so make sure we directly interrupt every reply when validating a new one for speech in self._speech_q: if not speech.is_reply: @@ -1072,7 +1080,10 @@ def _validate_reply_if_possible(self) -> None: logger.debug( "validated agent reply", - extra={"speech_id": self._pending_agent_reply.id}, + extra={ + "speech_id": self._pending_agent_reply.id, + "text": self._transcribed_text, + }, ) if self._last_speech_time is not None: @@ -1101,7 +1112,7 @@ def _interrupt_if_possible(self) -> None: def _should_interrupt(self) -> bool: if self._playing_speech is None: - return True + return False if ( not self._playing_speech.allow_interruptions From ee0850937e773f41f337e287d5022b51004bdbc3 Mon Sep 17 00:00:00 2001 From: Juan Mugica Gonzalez <47819159+jmugicagonz@users.noreply.github.com> Date: Tue, 24 Dec 2024 20:30:47 +0100 Subject: [PATCH 34/46] encode boost words (#1284) Co-authored-by: David Zhao --- .changeset/hot-trainers-press.md | 5 +++++ .../livekit/plugins/assemblyai/stt.py | 4 +++- 2 files changed, 8 insertions(+), 1 deletion(-) create mode 100644 .changeset/hot-trainers-press.md diff --git a/.changeset/hot-trainers-press.md b/.changeset/hot-trainers-press.md new file mode 100644 index 000000000..326150914 --- /dev/null +++ b/.changeset/hot-trainers-press.md @@ -0,0 +1,5 @@ +--- +"livekit-plugins-assemblyai": patch +--- + +assemblyai: encode boost words diff --git a/livekit-plugins/livekit-plugins-assemblyai/livekit/plugins/assemblyai/stt.py b/livekit-plugins/livekit-plugins-assemblyai/livekit/plugins/assemblyai/stt.py index acef65b6a..8fc51c774 100644 --- a/livekit-plugins/livekit-plugins-assemblyai/livekit/plugins/assemblyai/stt.py +++ b/livekit-plugins/livekit-plugins-assemblyai/livekit/plugins/assemblyai/stt.py @@ -318,7 +318,9 @@ async def recv_task(ws: aiohttp.ClientWebSocketResponse): async def _connect_ws(self) -> aiohttp.ClientWebSocketResponse: live_config = { "sample_rate": self._opts.sample_rate, - "word_boost": self._opts.word_boost, + "word_boost": json.dumps(self._opts.word_boost) + if self._opts.word_boost is not None + else None, "encoding": self._opts.encoding, "disable_partial_transcripts": self._opts.disable_partial_transcripts, "enable_extra_session_information": 
self._opts.enable_extra_session_information, From ffeee077f31f0c42f9bfd705bd8d45618d321e0c Mon Sep 17 00:00:00 2001 From: David Zhao Date: Tue, 24 Dec 2024 11:36:00 -0800 Subject: [PATCH 35/46] Ensure STT exceptions are being propagated (#1291) Co-authored-by: jayesh --- .changeset/giant-ways-invite.md | 8 ++++++++ livekit-agents/livekit/agents/_exceptions.py | 6 +++--- .../livekit/plugins/assemblyai/stt.py | 14 ++++++++++++-- .../livekit/plugins/azure/stt.py | 5 ++++- .../livekit/plugins/deepgram/stt.py | 10 +++++++++- .../livekit/plugins/google/stt.py | 5 ++++- 6 files changed, 40 insertions(+), 8 deletions(-) create mode 100644 .changeset/giant-ways-invite.md diff --git a/.changeset/giant-ways-invite.md b/.changeset/giant-ways-invite.md new file mode 100644 index 000000000..5644cb581 --- /dev/null +++ b/.changeset/giant-ways-invite.md @@ -0,0 +1,8 @@ +--- +"livekit-plugins-assemblyai": patch +"livekit-plugins-deepgram": patch +"livekit-plugins-google": patch +"livekit-plugins-azure": patch +--- + +fix: Ensure STT exceptions are being propagated diff --git a/livekit-agents/livekit/agents/_exceptions.py b/livekit-agents/livekit/agents/_exceptions.py index 128efacee..a6d987e7d 100644 --- a/livekit-agents/livekit/agents/_exceptions.py +++ b/livekit-agents/livekit/agents/_exceptions.py @@ -48,9 +48,9 @@ def __init__( self, message: str, *, - status_code: int, - request_id: str | None, - body: object | None, + status_code: int = -1, + request_id: str | None = None, + body: object | None = None, ) -> None: super().__init__(message, body=body) diff --git a/livekit-plugins/livekit-plugins-assemblyai/livekit/plugins/assemblyai/stt.py b/livekit-plugins/livekit-plugins-assemblyai/livekit/plugins/assemblyai/stt.py index 8fc51c774..40c359fd8 100644 --- a/livekit-plugins/livekit-plugins-assemblyai/livekit/plugins/assemblyai/stt.py +++ b/livekit-plugins/livekit-plugins-assemblyai/livekit/plugins/assemblyai/stt.py @@ -25,7 +25,13 @@ from urllib.parse import urlencode import aiohttp -from livekit.agents import DEFAULT_API_CONNECT_OPTIONS, APIConnectOptions, stt, utils +from livekit.agents import ( + DEFAULT_API_CONNECT_OPTIONS, + APIConnectOptions, + APIStatusError, + stt, + utils, +) from livekit.agents.stt import SpeechEvent from livekit.agents.utils import AudioBuffer @@ -274,7 +280,7 @@ async def recv_task(ws: aiohttp.ClientWebSocketResponse): if closing_ws: # close is expected, see SpeechStream.aclose return - raise Exception( + raise APIStatusError( "AssemblyAI connection closed unexpectedly", ) # this will trigger a reconnection, see the _run loop @@ -305,6 +311,10 @@ async def recv_task(ws: aiohttp.ClientWebSocketResponse): [asyncio.gather(*tasks), wait_reconnect_task], return_when=asyncio.FIRST_COMPLETED, ) # type: ignore + for task in done: + if task != wait_reconnect_task: + task.result() + if wait_reconnect_task not in done: break diff --git a/livekit-plugins/livekit-plugins-azure/livekit/plugins/azure/stt.py b/livekit-plugins/livekit-plugins-azure/livekit/plugins/azure/stt.py index 309cc9c5c..2bda776fd 100644 --- a/livekit-plugins/livekit-plugins-azure/livekit/plugins/azure/stt.py +++ b/livekit-plugins/livekit-plugins-azure/livekit/plugins/azure/stt.py @@ -199,10 +199,13 @@ async def process_input(): wait_reconnect_task = asyncio.create_task(self._reconnect_event.wait()) try: - await asyncio.wait( + done, _ = await asyncio.wait( [process_input_task, wait_reconnect_task], return_when=asyncio.FIRST_COMPLETED, ) + for task in done: + if task != wait_reconnect_task: + task.result() finally: 
await utils.aio.gracefully_cancel( process_input_task, wait_reconnect_task diff --git a/livekit-plugins/livekit-plugins-deepgram/livekit/plugins/deepgram/stt.py b/livekit-plugins/livekit-plugins-deepgram/livekit/plugins/deepgram/stt.py index 2ae6d74fe..d45966e4e 100644 --- a/livekit-plugins/livekit-plugins-deepgram/livekit/plugins/deepgram/stt.py +++ b/livekit-plugins/livekit-plugins-deepgram/livekit/plugins/deepgram/stt.py @@ -471,7 +471,9 @@ async def recv_task(ws: aiohttp.ClientWebSocketResponse): return # this will trigger a reconnection, see the _run loop - raise Exception("deepgram connection closed unexpectedly") + raise APIStatusError( + message="deepgram connection closed unexpectedly" + ) if msg.type != aiohttp.WSMsgType.TEXT: logger.warning("unexpected deepgram message type %s", msg.type) @@ -498,6 +500,12 @@ async def recv_task(ws: aiohttp.ClientWebSocketResponse): [asyncio.gather(*tasks), wait_reconnect_task], return_when=asyncio.FIRST_COMPLETED, ) # type: ignore + + # propagate exceptions from completed tasks + for task in done: + if task != wait_reconnect_task: + task.result() + if wait_reconnect_task not in done: break diff --git a/livekit-plugins/livekit-plugins-google/livekit/plugins/google/stt.py b/livekit-plugins/livekit-plugins-google/livekit/plugins/google/stt.py index de4ac6251..7fe2a527d 100644 --- a/livekit-plugins/livekit-plugins-google/livekit/plugins/google/stt.py +++ b/livekit-plugins/livekit-plugins-google/livekit/plugins/google/stt.py @@ -438,10 +438,13 @@ async def process_stream(stream): process_stream_task = asyncio.create_task(process_stream(stream)) wait_reconnect_task = asyncio.create_task(self._reconnect_event.wait()) try: - await asyncio.wait( + done, _ = await asyncio.wait( [process_stream_task, wait_reconnect_task], return_when=asyncio.FIRST_COMPLETED, ) + for task in done: + if task != wait_reconnect_task: + task.result() finally: await utils.aio.gracefully_cancel( process_stream_task, wait_reconnect_task From baae79b2ea9ec8300c9f47ea42cf830a66c2e41d Mon Sep 17 00:00:00 2001 From: Jayesh Parmar <60539217+jayeshp19@users.noreply.github.com> Date: Wed, 25 Dec 2024 01:20:54 +0530 Subject: [PATCH 36/46] Support PlayHT/PlayAI TTS (#1174) Co-authored-by: David Zhao --- .changeset/khaki-stingrays-train.md | 5 + .github/workflows/ci.yml | 1 + .github/workflows/tests.yml | 3 + .../livekit/agents/utils/codecs/mp3.py | 14 + livekit-plugins/install_local.sh | 1 + .../CHANGELOG.md | 0 .../livekit-plugins-playai/README.md | 13 + .../livekit/plugins/playai}/__init__.py | 13 +- .../livekit/plugins/playai/log.py | 5 + .../livekit/plugins/playai/models.py | 9 + .../livekit/plugins/playai/py.typed | 0 .../livekit/plugins/playai/tts.py | 296 ++++++++++++++++++ .../livekit/plugins/playai}/version.py | 0 .../livekit-plugins-playai/package.json | 5 + .../pyproject.toml | 0 .../setup.py | 12 +- .../livekit-plugins-playht/README.md | 13 - .../livekit/plugins/playht/log.py | 3 - .../livekit/plugins/playht/models.py | 20 -- .../livekit/plugins/playht/tts.py | 238 -------------- .../livekit-plugins-playht/package.json | 5 - tests/test_tts.py | 12 +- 22 files changed, 372 insertions(+), 296 deletions(-) create mode 100644 .changeset/khaki-stingrays-train.md rename livekit-plugins/{livekit-plugins-playht => livekit-plugins-playai}/CHANGELOG.md (100%) create mode 100644 livekit-plugins/livekit-plugins-playai/README.md rename livekit-plugins/{livekit-plugins-playht/livekit/plugins/playht => livekit-plugins-playai/livekit/plugins/playai}/__init__.py (58%) create mode 100644 
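Patch 35 applies one pattern across the AssemblyAI, Azure, Deepgram, and Google plugins, and it is easy to miss why it is needed: `asyncio.wait()` never raises the awaited tasks' exceptions itself, so each completed task's `.result()` must be called explicitly to re-raise them. A stripped-down sketch of that pattern, assuming a generic worker task and reconnect event:

```python
import asyncio


async def wait_and_propagate(
    work: asyncio.Task, reconnect_event: asyncio.Event
) -> bool:
    """Returns True if a reconnect was requested; re-raises errors from work."""
    wait_reconnect = asyncio.create_task(reconnect_event.wait())
    try:
        done, _ = await asyncio.wait(
            [work, wait_reconnect], return_when=asyncio.FIRST_COMPLETED
        )
        for task in done:
            if task is not wait_reconnect:
                task.result()  # no-op on success, re-raises on failure
        return wait_reconnect in done
    finally:
        wait_reconnect.cancel()
```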
livekit-plugins/livekit-plugins-playai/livekit/plugins/playai/log.py create mode 100644 livekit-plugins/livekit-plugins-playai/livekit/plugins/playai/models.py create mode 100644 livekit-plugins/livekit-plugins-playai/livekit/plugins/playai/py.typed create mode 100644 livekit-plugins/livekit-plugins-playai/livekit/plugins/playai/tts.py rename livekit-plugins/{livekit-plugins-playht/livekit/plugins/playht => livekit-plugins-playai/livekit/plugins/playai}/version.py (100%) create mode 100644 livekit-plugins/livekit-plugins-playai/package.json rename livekit-plugins/{livekit-plugins-playht => livekit-plugins-playai}/pyproject.toml (100%) rename livekit-plugins/{livekit-plugins-playht => livekit-plugins-playai}/setup.py (86%) delete mode 100644 livekit-plugins/livekit-plugins-playht/README.md delete mode 100644 livekit-plugins/livekit-plugins-playht/livekit/plugins/playht/log.py delete mode 100644 livekit-plugins/livekit-plugins-playht/livekit/plugins/playht/models.py delete mode 100644 livekit-plugins/livekit-plugins-playht/livekit/plugins/playht/tts.py delete mode 100644 livekit-plugins/livekit-plugins-playht/package.json diff --git a/.changeset/khaki-stingrays-train.md b/.changeset/khaki-stingrays-train.md new file mode 100644 index 000000000..ca99f9fa7 --- /dev/null +++ b/.changeset/khaki-stingrays-train.md @@ -0,0 +1,5 @@ +--- +"livekit-plugins-playai": patch +--- + +Support PlayAI TTS engine. diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 5f048347d..9eb72c55c 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -80,4 +80,5 @@ jobs: -p livekit.plugins.azure \ -p livekit.plugins.anthropic \ -p livekit.plugins.fal \ + -p livekit.plugins.playai \ -p livekit.plugins.assemblyai diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index d2a26cbf2..25f72cc33 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -77,6 +77,7 @@ jobs: sudo dpkg -i libssl1.1_1.1.1-1ubuntu2.1_arm64.deb sudo dpkg -i libssl-dev_1.1.1-1ubuntu2.1_arm64.deb + - name: Install ffmpeg (macOS) if: ${{ startsWith(matrix.os, 'macos') }} run: brew install ffmpeg @@ -109,6 +110,8 @@ jobs: GROQ_API_KEY: ${{ secrets.GROQ_API_KEY }} ASSEMBLYAI_API_KEY: ${{ secrets.ASSEMBLYAI_API_KEY }} FAL_KEY: ${{ secrets.FAL_KEY }} + PLAYHT_API_KEY: ${{ secrets.PLAYHT_API_KEY }} + PLAYHT_USER_ID: ${{ secrets.PLAYHT_USER_ID }} GOOGLE_APPLICATION_CREDENTIALS: google.json PYTEST_ADDOPTS: "--color=yes" working-directory: tests diff --git a/livekit-agents/livekit/agents/utils/codecs/mp3.py b/livekit-agents/livekit/agents/utils/codecs/mp3.py index 6f3b1aa45..2f2321028 100644 --- a/livekit-agents/livekit/agents/utils/codecs/mp3.py +++ b/livekit-agents/livekit/agents/utils/codecs/mp3.py @@ -39,6 +39,20 @@ def __init__(self): self._codec = av.CodecContext.create("mp3", "r") # noqa def decode_chunk(self, chunk: bytes) -> List[rtc.AudioFrame]: + # Skip ID3v2 header if present + if chunk.startswith(b"ID3"): + # ID3v2 header is 10 bytes long + # The size is encoded in the next 4 bytes (bytes 6-9) + # Each byte only uses 7 bits (most significant bit is always 0) + if len(chunk) >= 10: + size = ( + ((chunk[6] & 0x7F) << 21) + | ((chunk[7] & 0x7F) << 14) + | ((chunk[8] & 0x7F) << 7) + | (chunk[9] & 0x7F) + ) + chunk = chunk[10 + size :] + packets = self._codec.parse(chunk) result: List[rtc.AudioFrame] = [] for packet in packets: diff --git a/livekit-plugins/install_local.sh b/livekit-plugins/install_local.sh index 79ec29f0d..3e6a1cee4 100755 --- a/livekit-plugins/install_local.sh 
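The ID3v2 handling added to `Mp3StreamDecoder.decode_chunk` relies on the tag's size bytes being "synchsafe": each of the four bytes contributes only its low 7 bits, hence the 21/14/7/0 shifts. A worked example of the same computation, independent of the plugin code:

```python
def id3v2_payload(chunk: bytes) -> bytes:
    """Strips a leading ID3v2 tag, mirroring the mp3.py change."""
    if not chunk.startswith(b"ID3") or len(chunk) < 10:
        return chunk
    size = (
        ((chunk[6] & 0x7F) << 21)
        | ((chunk[7] & 0x7F) << 14)
        | ((chunk[8] & 0x7F) << 7)
        | (chunk[9] & 0x7F)
    )
    return chunk[10 + size :]


# a tag declaring a 257-byte body: 257 = (2 << 7) | 1, stored synchsafe
# as the four bytes 0x00 0x00 0x02 0x01
header = b"ID3\x04\x00\x00" + bytes([0x00, 0x00, 0x02, 0x01])
data = header + bytes(257) + b"\xff\xfbAUDIO"
assert id3v2_payload(data) == b"\xff\xfbAUDIO"
```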
+++ b/livekit-plugins/install_local.sh @@ -17,5 +17,6 @@ pip install \ "${SCRIPT_DIR}/livekit-plugins-nltk" \ "${SCRIPT_DIR}/livekit-plugins-openai" \ "${SCRIPT_DIR}/livekit-plugins-rag" \ + "${SCRIPT_DIR}/livekit-plugins-playai" \ "${SCRIPT_DIR}/livekit-plugins-silero" \ "${SCRIPT_DIR}/livekit-plugins-turn-detector" diff --git a/livekit-plugins/livekit-plugins-playht/CHANGELOG.md b/livekit-plugins/livekit-plugins-playai/CHANGELOG.md similarity index 100% rename from livekit-plugins/livekit-plugins-playht/CHANGELOG.md rename to livekit-plugins/livekit-plugins-playai/CHANGELOG.md diff --git a/livekit-plugins/livekit-plugins-playai/README.md b/livekit-plugins/livekit-plugins-playai/README.md new file mode 100644 index 000000000..5561dbe66 --- /dev/null +++ b/livekit-plugins/livekit-plugins-playai/README.md @@ -0,0 +1,13 @@ +# LiveKit Plugins PlayAI/PlayHT + +Agent Framework plugin for voice synthesis with [PlayAI](https://play.ai/) API. + +## Installation + +```bash +pip install livekit-plugins-playai +``` + +## Pre-requisites + +You'll need USER ID and API Secret KEY from PlayHT. It can be set as an environment variable: `PLAYHT_USER_ID`, `PLAYHT_API_KEY` get it from [here](https://play.ht/studio/api-access) diff --git a/livekit-plugins/livekit-plugins-playht/livekit/plugins/playht/__init__.py b/livekit-plugins/livekit-plugins-playai/livekit/plugins/playai/__init__.py similarity index 58% rename from livekit-plugins/livekit-plugins-playht/livekit/plugins/playht/__init__.py rename to livekit-plugins/livekit-plugins-playai/livekit/plugins/playai/__init__.py index 82229c316..033d9363e 100644 --- a/livekit-plugins/livekit-plugins-playht/livekit/plugins/playht/__init__.py +++ b/livekit-plugins/livekit-plugins-playai/livekit/plugins/playai/__init__.py @@ -1,27 +1,20 @@ -from .models import TTSEngines -from .tts import DEFAULT_VOICE, TTS, Voice +from .tts import TTS from .version import __version__ __all__ = [ "TTS", - "Voice", - "DEFAULT_VOICE", - "TTSEngines", "__version__", ] from livekit.agents import Plugin -class PlayHTPlugin(Plugin): +class PlayAIPlugin(Plugin): def __init__(self) -> None: super().__init__(__name__, __version__, __package__) - def download_files(self) -> None: - self.download_files(self) - -Plugin.register_plugin(PlayHTPlugin()) +Plugin.register_plugin(PlayAIPlugin()) # Cleanup docs of unexported modules _module = dir() diff --git a/livekit-plugins/livekit-plugins-playai/livekit/plugins/playai/log.py b/livekit-plugins/livekit-plugins-playai/livekit/plugins/playai/log.py new file mode 100644 index 000000000..decd14a99 --- /dev/null +++ b/livekit-plugins/livekit-plugins-playai/livekit/plugins/playai/log.py @@ -0,0 +1,5 @@ +import logging + +logger = logging.getLogger("livekit.plugins.playai") +# suppress verbose websocket logs +logging.getLogger("websockets.client").setLevel(logging.INFO) diff --git a/livekit-plugins/livekit-plugins-playai/livekit/plugins/playai/models.py b/livekit-plugins/livekit-plugins-playai/livekit/plugins/playai/models.py new file mode 100644 index 000000000..1dc6dfce8 --- /dev/null +++ b/livekit-plugins/livekit-plugins-playai/livekit/plugins/playai/models.py @@ -0,0 +1,9 @@ +from typing import Literal + +from pyht.client import Format # type: ignore + +TTSModel = Literal["Play3.0-mini-ws", "PlayDialog-ws", "Play3.0-mini", "PlayDialog"] +FORMAT = Literal["mp3"] +format_mapping = { + "mp3": Format.FORMAT_MP3, +} diff --git a/livekit-plugins/livekit-plugins-playai/livekit/plugins/playai/py.typed 
b/livekit-plugins/livekit-plugins-playai/livekit/plugins/playai/py.typed new file mode 100644 index 000000000..e69de29bb diff --git a/livekit-plugins/livekit-plugins-playai/livekit/plugins/playai/tts.py b/livekit-plugins/livekit-plugins-playai/livekit/plugins/playai/tts.py new file mode 100644 index 000000000..464f3f418 --- /dev/null +++ b/livekit-plugins/livekit-plugins-playai/livekit/plugins/playai/tts.py @@ -0,0 +1,296 @@ +from __future__ import annotations + +import asyncio +import os +import weakref +from dataclasses import dataclass, fields + +from livekit import rtc +from livekit.agents import ( + DEFAULT_API_CONNECT_OPTIONS, + APIConnectionError, + APIConnectOptions, + tokenize, + tts, + utils, +) +from pyht import AsyncClient as PlayHTAsyncClient # type: ignore +from pyht.client import Format, Language, TTSOptions # type: ignore + +from .log import logger +from .models import TTSModel + +NUM_CHANNELS = 1 + + +@dataclass +class _Options: + model: TTSModel | str + tts_options: TTSOptions + word_tokenizer: tokenize.WordTokenizer + + +class TTS(tts.TTS): + def __init__( + self, + *, + api_key: str | None = None, + user_id: str | None = None, + voice: str = "s3://voice-cloning-zero-shot/d9ff78ba-d016-47f6-b0ef-dd630f59414e/female-cs/manifest.json", + language: str = "english", + sample_rate: int = 24000, + model: TTSModel | str = "Play3.0-mini-ws", + word_tokenizer: tokenize.WordTokenizer = tokenize.basic.WordTokenizer( + ignore_punctuation=False + ), + **kwargs, + ) -> None: + """ + Initialize the PlayAI TTS engine. + + Args: + api_key (str): PlayAI API key. + user_id (str): PlayAI user ID. + voice (str): Voice manifest URL. + model (TTSModel): TTS model, defaults to "Play3.0-mini-ws". + language (str): language, defaults to "english". + sample_rate (int): sample rate (Hz), A number greater than or equal to 8000, and must be less than or equal to 48000 + word_tokenizer (tokenize.WordTokenizer): Tokenizer for processing text. Defaults to basic WordTokenizer. + **kwargs: Additional options. + """ + + super().__init__( + capabilities=tts.TTSCapabilities( + streaming=False, + ), + sample_rate=sample_rate, + num_channels=1, + ) + + api_key = api_key or os.environ.get("PLAYHT_API_KEY") + user_id = user_id or os.environ.get("PLAYHT_USER_ID") + + if not api_key or not user_id: + raise ValueError( + "PlayHT API key and user ID are required. Set environment variables PLAYHT_API_KEY and PLAYHT_USER_ID or pass them explicitly." + ) + _validate_kwargs(kwargs) + self._config = TTSOptions( + voice=voice, + format=Format.FORMAT_MP3, # Default format for now + sample_rate=sample_rate, + language=Language(language), + **kwargs, + ) + + self._opts = _Options( + model=model, + tts_options=self._config, + word_tokenizer=word_tokenizer, + ) + + # Initialize client + self._client = PlayHTAsyncClient( + user_id=user_id, + api_key=api_key, + ) + self._streams = weakref.WeakSet[SynthesizeStream]() + + def update_options( + self, + *, + voice: str | None = None, + model: TTSModel | str | None = None, + language: str | None = None, + **kwargs, + ) -> None: + """ + Update the TTS options. 
+ """ + updates = {} + if voice is not None: + updates["voice"] = voice + if language is not None: + updates["language"] = Language(language) + tts_kwargs = {k: v for k, v in kwargs.items()} + + self._config = _update_options(self._config, **updates, **tts_kwargs) + + if model is not None: + self._opts.model = model + + for stream in self._streams: + stream._config = _update_options(stream._config, **updates, **tts_kwargs) + if model is not None: + stream._opts.model = model + + def synthesize( + self, + text: str, + *, + conn_options: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS, + ) -> "ChunkedStream": + return ChunkedStream( + tts=self, + input_text=text, + conn_options=conn_options, + opts=self._opts, + ) + + def stream( + self, *, conn_options: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS + ) -> "SynthesizeStream": + stream = SynthesizeStream( + tts=self, + conn_options=conn_options, + opts=self._opts, + ) + self._streams.add(stream) + return stream + + +class ChunkedStream(tts.ChunkedStream): + def __init__( + self, + *, + tts: TTS, + input_text: str, + conn_options: APIConnectOptions, + opts: _Options, + ) -> None: + super().__init__(tts=tts, input_text=input_text, conn_options=conn_options) + self._client = tts._client + self._opts = opts + self._config = self._opts.tts_options + self._mp3_decoder = utils.codecs.Mp3StreamDecoder() + + async def _run(self) -> None: + request_id = utils.shortuuid() + bstream = utils.audio.AudioByteStream( + sample_rate=self._config.sample_rate, num_channels=NUM_CHANNELS + ) + + try: + async for chunk in self._client.tts( + text=self._input_text, + options=self._config, + voice_engine=self._opts.model, + streaming=True, + ): + for frame in self._mp3_decoder.decode_chunk(chunk): + for frame in bstream.write(frame.data.tobytes()): + self._event_ch.send_nowait( + tts.SynthesizedAudio( + request_id=request_id, + frame=frame, + ) + ) + for frame in bstream.flush(): + self._event_ch.send_nowait( + tts.SynthesizedAudio(request_id=request_id, frame=frame) + ) + except Exception as e: + raise APIConnectionError() from e + + +class SynthesizeStream(tts.SynthesizeStream): + def __init__( + self, + *, + tts: TTS, + conn_options: APIConnectOptions, + opts: _Options, + ): + super().__init__(tts=tts, conn_options=conn_options) + self._client = tts._client + self._opts = opts + self._config = self._opts.tts_options + self._segments_ch = utils.aio.Chan[tokenize.WordStream]() + self._mp3_decoder = utils.codecs.Mp3StreamDecoder() + + async def _run(self) -> None: + request_id = utils.shortuuid() + segment_id = utils.shortuuid() + bstream = utils.audio.AudioByteStream( + sample_rate=self._config.sample_rate, + num_channels=NUM_CHANNELS, + ) + last_frame: rtc.AudioFrame | None = None + + def _send_last_frame(*, segment_id: str, is_final: bool) -> None: + nonlocal last_frame + if last_frame is not None: + self._event_ch.send_nowait( + tts.SynthesizedAudio( + request_id=request_id, + segment_id=segment_id, + frame=last_frame, + is_final=is_final, + ) + ) + last_frame = None + + input_task = asyncio.create_task(self._tokenize_input()) + try: + text_stream = await self._create_text_stream() + async for chunk in self._client.stream_tts_input( + text_stream=text_stream, + options=self._config, + voice_engine=self._opts.model, + ): + for frame in self._mp3_decoder.decode_chunk(chunk): + for frame in bstream.write(frame.data.tobytes()): + _send_last_frame(segment_id=segment_id, is_final=False) + last_frame = frame + + for frame in bstream.flush(): + 
_send_last_frame(segment_id=segment_id, is_final=False) + last_frame = frame + _send_last_frame(segment_id=segment_id, is_final=True) + except Exception as e: + raise APIConnectionError() from e + finally: + await utils.aio.gracefully_cancel(input_task) + self._client.close() + + @utils.log_exceptions(logger=logger) + async def _tokenize_input(self): + # Converts incoming text into WordStreams and sends them into _segments_ch + word_stream = None + async for input in self._input_ch: + if isinstance(input, str): + if word_stream is None: + word_stream = self._opts.word_tokenizer.stream() + self._segments_ch.send_nowait(word_stream) + word_stream.push_text(input) + elif isinstance(input, self._FlushSentinel): + if word_stream: + word_stream.end_input() + word_stream = None + self._segments_ch.close() + + @utils.log_exceptions(logger=logger) + async def _create_text_stream(self): + async def text_stream(): + async for word_stream in self._segments_ch: + async for word in word_stream: + yield word.token + + return text_stream() + + +def _update_options(config: TTSOptions, **kwargs) -> TTSOptions: + _validate_kwargs(kwargs) + for k, v in kwargs.items(): + if v is not None: + setattr(config, k, v) + return config + + +def _validate_kwargs(kwargs: dict) -> None: + valid_keys = {field.name for field in fields(TTSOptions)} + invalid_keys = set(kwargs.keys()) - valid_keys + if invalid_keys: + raise ValueError( + f"Invalid parameters: {invalid_keys}. Allowed parameters: {valid_keys}" + ) diff --git a/livekit-plugins/livekit-plugins-playht/livekit/plugins/playht/version.py b/livekit-plugins/livekit-plugins-playai/livekit/plugins/playai/version.py similarity index 100% rename from livekit-plugins/livekit-plugins-playht/livekit/plugins/playht/version.py rename to livekit-plugins/livekit-plugins-playai/livekit/plugins/playai/version.py diff --git a/livekit-plugins/livekit-plugins-playai/package.json b/livekit-plugins/livekit-plugins-playai/package.json new file mode 100644 index 000000000..043890665 --- /dev/null +++ b/livekit-plugins/livekit-plugins-playai/package.json @@ -0,0 +1,5 @@ +{ + "name": "livekit-plugins-playai", + "private": true, + "version": "1.0.3" +} diff --git a/livekit-plugins/livekit-plugins-playht/pyproject.toml b/livekit-plugins/livekit-plugins-playai/pyproject.toml similarity index 100% rename from livekit-plugins/livekit-plugins-playht/pyproject.toml rename to livekit-plugins/livekit-plugins-playai/pyproject.toml diff --git a/livekit-plugins/livekit-plugins-playht/setup.py b/livekit-plugins/livekit-plugins-playai/setup.py similarity index 86% rename from livekit-plugins/livekit-plugins-playht/setup.py rename to livekit-plugins/livekit-plugins-playai/setup.py index eb41a5b89..76c2d2ba5 100644 --- a/livekit-plugins/livekit-plugins-playht/setup.py +++ b/livekit-plugins/livekit-plugins-playai/setup.py @@ -6,14 +6,14 @@ here = pathlib.Path(__file__).parent.resolve() about = {} -with open(os.path.join(here, "livekit", "plugins", "playht", "version.py"), "r") as f: +with open(os.path.join(here, "livekit", "plugins", "playai", "version.py"), "r") as f: exec(f.read(), about) setuptools.setup( - name="livekit-plugins-playht", + name="livekit-plugins-playai", version=about["__version__"], - description="Agent Framework plugin for voice synthesis with PlayHT's API.", + description="Agent Framework plugin for voice synthesis with PlayAI's API.", long_description=(here / "README.md").read_text(encoding="utf-8"), long_description_content_type="text/markdown", 
url="https://github.com/livekit/agents", @@ -27,17 +27,17 @@ "Programming Language :: Python :: 3.12", "Programming Language :: Python :: 3 :: Only", ], - keywords=["webrtc", "realtime", "audio", "livekit", "playHT"], + keywords=["webrtc", "realtime", "audio", "livekit", "playHT", "playAI"], license="Apache-2.0", packages=setuptools.find_namespace_packages(include=["livekit.*"]), python_requires=">=3.9.0", install_requires=[ "livekit-agents[codecs]>=0.12.3", - "pyht", + "pyht>=0.1.10", "aiohttp", "livekit", ], - package_data={"livekit.plugins.playht": ["py.typed"]}, + package_data={"livekit.plugins.playai": ["py.typed"]}, project_urls={ "Documentation": "https://docs.livekit.io", "Website": "https://livekit.io/", diff --git a/livekit-plugins/livekit-plugins-playht/README.md b/livekit-plugins/livekit-plugins-playht/README.md deleted file mode 100644 index 53badc144..000000000 --- a/livekit-plugins/livekit-plugins-playht/README.md +++ /dev/null @@ -1,13 +0,0 @@ -# LiveKit Plugins PlayHT - -Agent Framework plugin for voice synthesis with [PlayHT](https://play.ht/) API. - -## Installation - -```bash -pip install livekit-plugins-playht -``` - -## Pre-requisites - -You'll need USER ID and API Secret KEY from PlayHT. It can be set as an environment variable: `PLAYHT_USER_ID`, `PLAYHT_API_KEY` \ No newline at end of file diff --git a/livekit-plugins/livekit-plugins-playht/livekit/plugins/playht/log.py b/livekit-plugins/livekit-plugins-playht/livekit/plugins/playht/log.py deleted file mode 100644 index 18a81836e..000000000 --- a/livekit-plugins/livekit-plugins-playht/livekit/plugins/playht/log.py +++ /dev/null @@ -1,3 +0,0 @@ -import logging - -logger = logging.getLogger("livekit.custom_tts_plugins.playht") diff --git a/livekit-plugins/livekit-plugins-playht/livekit/plugins/playht/models.py b/livekit-plugins/livekit-plugins-playht/livekit/plugins/playht/models.py deleted file mode 100644 index 6ffe63a5b..000000000 --- a/livekit-plugins/livekit-plugins-playht/livekit/plugins/playht/models.py +++ /dev/null @@ -1,20 +0,0 @@ -from typing import Literal - -TTSEngines = Literal[ - "PlayHT2.0", - "PlayHT1.0", - "PlayHT2.0-turbo", - "Play3.0-mini", -] - -TTSEncoding = Literal[ - "mp3_22050_32", - "mp3_44100_32", - "mp3_44100_64", - "mp3_44100_96", - "mp3_44100_128", - "mp3_44100_192", - "pcm_16000", - "pcm_22050", - "pcm_44100", -] diff --git a/livekit-plugins/livekit-plugins-playht/livekit/plugins/playht/tts.py b/livekit-plugins/livekit-plugins-playht/livekit/plugins/playht/tts.py deleted file mode 100644 index 982565da7..000000000 --- a/livekit-plugins/livekit-plugins-playht/livekit/plugins/playht/tts.py +++ /dev/null @@ -1,238 +0,0 @@ -from __future__ import annotations - -import asyncio -import os -from dataclasses import dataclass -from typing import Any, List, Literal - -import aiohttp -from livekit.agents import ( - DEFAULT_API_CONNECT_OPTIONS, - APIConnectionError, - APIConnectOptions, - APIStatusError, - APITimeoutError, - tts, - utils, -) - -from .log import logger -from .models import TTSEncoding, TTSEngines - -_Encoding = Literal["mp3", "pcm"] - - -def _sample_rate_from_format(output_format: TTSEncoding) -> int: - split = output_format.split("_") - return int(split[1]) - - -def _encoding_from_format(output_format: TTSEncoding) -> _Encoding: - if output_format.startswith("mp3"): - return "mp3" - elif output_format.startswith("pcm"): - return "pcm" - elif output_format.startswith("wav"): - return "pcm" - - raise ValueError(f"Unknown format: {output_format}") - - -@dataclass -class Voice: - id: 
str - name: str - voice_engine: TTSEngines - - -DEFAULT_VOICE = Voice( - id="s3://peregrine-voices/mel22/manifest.json", - name="Will", - voice_engine="Play3.0-mini", -) - -ACCEPT_HEADER = { - "mp3": "audio/mpeg", - "wav": "audio/wav", - "ogg": "audio/ogg", - "flac": "audio/flac", - "mulaw": "audio/basic", # commonly used for mulaw -} - - -API_BASE_URL_V2 = "https://api.play.ht/api/v2" -AUTHORIZATION_HEADER = "AUTHORIZATION" -USERID_HEADER = "X-USER-ID" -PLAYHT_TTS_CHANNELS = 1 - -_TTSEncoding = Literal["mp3", "wav", "ogg", "flac", "mulaw"] - - -@dataclass -class _TTSOptions: - api_key: str - user_id: str - voice: Voice - base_url: str - sample_rate: int - encoding: _TTSEncoding - - -class TTS(tts.TTS): - def __init__( - self, - *, - voice: Voice = DEFAULT_VOICE, - api_key: str | None = None, - user_id: str | None = None, - base_url: str | None = None, - sample_rate: int = 24000, - encoding: _TTSEncoding = "wav", - http_session: aiohttp.ClientSession | None = None, - ) -> None: - super().__init__( - capabilities=tts.TTSCapabilities( - streaming=False, - ), - sample_rate=sample_rate, - num_channels=PLAYHT_TTS_CHANNELS, - ) - api_key = api_key or os.environ.get("PLAYHT_API_KEY") - if not api_key: - raise ValueError("PLAYHT_API_KEY must be set") - - user_id = user_id or os.environ.get("PLAYHT_USER_ID") - if not user_id: - raise ValueError("PLAYHT_USER_ID mus be set") - - self._opts = _TTSOptions( - voice=voice, - user_id=user_id, - api_key=api_key, - base_url=base_url or API_BASE_URL_V2, - sample_rate=sample_rate, - encoding=encoding, - ) - self._session = http_session - - def _ensure_session(self) -> aiohttp.ClientSession: - if not self._session: - self._session = utils.http_context.http_session() - - return self._session - - async def list_voices(self) -> List[Voice]: - async with self._ensure_session().get( - f"{self._opts.base_url}/voices", - headers={ - "accept": "application/json", - AUTHORIZATION_HEADER: self._opts.api_key, - USERID_HEADER: self._opts.user_id, - }, - ) as resp: - return _dict_to_voices_list(await resp.json()) - - def synthesize( - self, - text: str, - *, - conn_options: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS, - ) -> "ChunkedStream": - return ChunkedStream( - tts=self, - input_text=text, - conn_options=conn_options, - opts=self._opts, - session=self._ensure_session(), - ) - - -class ChunkedStream(tts.ChunkedStream): - """Synthesize using the chunked api endpoint""" - - def __init__( - self, - tts: TTS, - input_text: str, - opts: _TTSOptions, - conn_options: APIConnectOptions, - session: aiohttp.ClientSession, - ) -> None: - super().__init__(tts=tts, input_text=input_text, conn_options=conn_options) - self._opts, self._session = opts, session - - async def _run(self) -> None: - stream = utils.audio.AudioByteStream( - sample_rate=self._opts.sample_rate, num_channels=1 - ) - self._mp3_decoder = utils.codecs.Mp3StreamDecoder() - request_id = utils.shortuuid() - url = f"{API_BASE_URL_V2}/tts/stream" - headers = { - "accept": ACCEPT_HEADER[self._opts.encoding], - "content-type": "application/json", - AUTHORIZATION_HEADER: self._opts.api_key, - USERID_HEADER: self._opts.user_id, - } - json_data = { - "text": self._input_text, - "output_format": self._opts.encoding, - "sample_rate": self._opts.sample_rate, - "voice": self._opts.voice.id, - } - try: - async with self._session.post( - url=url, headers=headers, json=json_data - ) as resp: - if not resp.content_type.startswith("audio/"): - content = await resp.text() - logger.error("playHT returned non-audio data: %s", 
content) - return - - encoding = _encoding_from_format(self._opts.encoding) - if encoding == "mp3": - async for bytes_data, _ in resp.content.iter_chunks(): - for frame in self._mp3_decoder.decode_chunk(bytes_data): - self._event_ch.send_nowait( - tts.SynthesizedAudio( - request_id=request_id, - frame=frame, - ) - ) - else: - async for bytes_data, _ in resp.content.iter_chunks(): - for frame in stream.write(bytes_data): - self._event_ch.send_nowait( - tts.SynthesizedAudio( - request_id=request_id, - frame=frame, - ) - ) - - for frame in stream.flush(): - self._event_ch.send_nowait( - tts.SynthesizedAudio(request_id=request_id, frame=frame) - ) - - except asyncio.TimeoutError as e: - raise APITimeoutError() from e - except aiohttp.ClientResponseError as e: - raise APIStatusError( - message=e.message, - status_code=e.status, - request_id=None, - body=None, - ) from e - except Exception as e: - raise APIConnectionError() from e - - -def _dict_to_voices_list(data: dict[str, Any]): - voices: List[Voice] = [] - for voice in data["text"]: - voices.append( - Voice( - id=voice["id"], name=voice["name"], voice_engine=voice["voice_engine"] - ) - ) - return voices diff --git a/livekit-plugins/livekit-plugins-playht/package.json b/livekit-plugins/livekit-plugins-playht/package.json deleted file mode 100644 index fee72ee05..000000000 --- a/livekit-plugins/livekit-plugins-playht/package.json +++ /dev/null @@ -1,5 +0,0 @@ -{ - "name": "livekit-plugins-playht", - "private": true, - "version": "1.0.3" -} \ No newline at end of file diff --git a/tests/test_tts.py b/tests/test_tts.py index b0fbd8034..91f8035b5 100644 --- a/tests/test_tts.py +++ b/tests/test_tts.py @@ -10,7 +10,15 @@ from livekit import agents from livekit.agents import APIConnectionError, tokenize, tts from livekit.agents.utils import AudioBuffer, merge_frames -from livekit.plugins import azure, cartesia, deepgram, elevenlabs, google, openai +from livekit.plugins import ( + azure, + cartesia, + deepgram, + elevenlabs, + google, + openai, + playai, +) from .conftest import TEST_CONNECT_OPTIONS from .fake_tts import FakeTTS @@ -44,6 +52,7 @@ async def _assert_valid_synthesized_audio( pytest.param(lambda: azure.TTS(), id="azure"), pytest.param(lambda: cartesia.TTS(), id="cartesia"), pytest.param(lambda: deepgram.TTS(), id="deepgram"), + pytest.param(lambda: playai.TTS(), id="playai"), ] @@ -89,6 +98,7 @@ async def test_synthesize(tts_factory): id="azure.stream", ), pytest.param(lambda: deepgram.TTS(), id="deepgram"), + pytest.param(lambda: playai.TTS(), id="playai"), ] From 32383939025453616d23c82ccc8c725ee87a9870 Mon Sep 17 00:00:00 2001 From: David Zhao Date: Wed, 25 Dec 2024 13:55:16 -0800 Subject: [PATCH 37/46] improved handling of LLM errors, do not retry if already began (#1298) --- .changeset/gorgeous-sheep-grow.md | 7 +++++ livekit-agents/livekit/agents/_exceptions.py | 31 ++++++++++++++----- livekit-agents/livekit/agents/llm/llm.py | 2 +- .../livekit/plugins/anthropic/llm.py | 6 ++-- .../livekit/plugins/cartesia/tts.py | 5 ++- .../livekit/plugins/deepgram/tts.py | 10 ++++-- .../livekit/plugins/elevenlabs/tts.py | 5 +-- .../livekit/plugins/openai/llm.py | 6 ++-- 8 files changed, 54 insertions(+), 18 deletions(-) create mode 100644 .changeset/gorgeous-sheep-grow.md diff --git a/.changeset/gorgeous-sheep-grow.md b/.changeset/gorgeous-sheep-grow.md new file mode 100644 index 000000000..5bdc7cc5f --- /dev/null +++ b/.changeset/gorgeous-sheep-grow.md @@ -0,0 +1,7 @@ +--- +"livekit-plugins-anthropic": patch +"livekit-plugins-openai": patch 
+"livekit-agents": patch +--- + +improved handling of LLM errors, do not retry if already began diff --git a/livekit-agents/livekit/agents/_exceptions.py b/livekit-agents/livekit/agents/_exceptions.py index a6d987e7d..74a1ab3c1 100644 --- a/livekit-agents/livekit/agents/_exceptions.py +++ b/livekit-agents/livekit/agents/_exceptions.py @@ -23,16 +23,22 @@ class APIError(Exception): body: object | None """The API response body, if available. - + If the API returned a valid json, the body will contains the decodede result. """ - def __init__(self, message: str, *, body: object | None) -> None: + retryable: bool = False + """Whether the error can be retried.""" + + def __init__( + self, message: str, *, body: object | None, retryable: bool = True + ) -> None: super().__init__(message) self.message = message self.body = body + self.retryable = retryable class APIStatusError(APIError): @@ -51,8 +57,15 @@ def __init__( status_code: int = -1, request_id: str | None = None, body: object | None = None, + retryable: bool | None = None, ) -> None: - super().__init__(message, body=body) + if retryable is None: + retryable = True + # 4xx errors are not retryable + if status_code >= 400 and status_code < 500: + retryable = False + + super().__init__(message, body=body, retryable=retryable) self.status_code = status_code self.request_id = request_id @@ -61,12 +74,16 @@ def __init__( class APIConnectionError(APIError): """Raised when an API request failed due to a connection error.""" - def __init__(self, message: str = "Connection error.") -> None: - super().__init__(message, body=None) + def __init__( + self, message: str = "Connection error.", *, retryable: bool = True + ) -> None: + super().__init__(message, body=None, retryable=retryable) class APITimeoutError(APIConnectionError): """Raised when an API request timed out.""" - def __init__(self, message: str = "Request timed out.") -> None: - super().__init__(message) + def __init__( + self, message: str = "Request timed out.", *, retryable: bool = True + ) -> None: + super().__init__(message, retryable=retryable) diff --git a/livekit-agents/livekit/agents/llm/llm.py b/livekit-agents/livekit/agents/llm/llm.py index 351fcc9b1..099e3139c 100644 --- a/livekit-agents/livekit/agents/llm/llm.py +++ b/livekit-agents/livekit/agents/llm/llm.py @@ -148,7 +148,7 @@ async def _main_task(self) -> None: try: return await self._run() except APIError as e: - if self._conn_options.max_retry == 0: + if self._conn_options.max_retry == 0 or not e.retryable: raise elif i == self._conn_options.max_retry: raise APIConnectionError( diff --git a/livekit-plugins/livekit-plugins-anthropic/livekit/plugins/anthropic/llm.py b/livekit-plugins/livekit-plugins-anthropic/livekit/plugins/anthropic/llm.py index 69b468d23..3af490211 100644 --- a/livekit-plugins/livekit-plugins-anthropic/livekit/plugins/anthropic/llm.py +++ b/livekit-plugins/livekit-plugins-anthropic/livekit/plugins/anthropic/llm.py @@ -206,6 +206,7 @@ def __init__( self._output_tokens = 0 async def _run(self) -> None: + retryable = True try: if not self._anthropic_stream: self._anthropic_stream = await self._awaitable_anthropic_stream @@ -215,6 +216,7 @@ async def _run(self) -> None: chat_chunk = self._parse_event(event) if chat_chunk is not None: self._event_ch.send_nowait(chat_chunk) + retryable = False self._event_ch.send_nowait( llm.ChatChunk( @@ -227,7 +229,7 @@ async def _run(self) -> None: ) ) except anthropic.APITimeoutError: - raise APITimeoutError() + raise APITimeoutError(retryable=retryable) except 
anthropic.APIStatusError as e: raise APIStatusError( e.message, @@ -236,7 +238,7 @@ async def _run(self) -> None: body=e.body, ) except Exception as e: - raise APIConnectionError() from e + raise APIConnectionError(retryable=retryable) from e def _parse_event( self, event: anthropic.types.RawMessageStreamEvent diff --git a/livekit-plugins/livekit-plugins-cartesia/livekit/plugins/cartesia/tts.py b/livekit-plugins/livekit-plugins-cartesia/livekit/plugins/cartesia/tts.py index dd76473c7..eae3a0679 100644 --- a/livekit-plugins/livekit-plugins-cartesia/livekit/plugins/cartesia/tts.py +++ b/livekit-plugins/livekit-plugins-cartesia/livekit/plugins/cartesia/tts.py @@ -312,7 +312,10 @@ def _send_last_frame(*, segment_id: str, is_final: bool) -> None: aiohttp.WSMsgType.CLOSE, aiohttp.WSMsgType.CLOSING, ): - raise Exception("Cartesia connection closed unexpectedly") + raise APIStatusError( + "Cartesia connection closed unexpectedly", + request_id=request_id, + ) if msg.type != aiohttp.WSMsgType.TEXT: logger.warning("unexpected Cartesia message type %s", msg.type) diff --git a/livekit-plugins/livekit-plugins-deepgram/livekit/plugins/deepgram/tts.py b/livekit-plugins/livekit-plugins-deepgram/livekit/plugins/deepgram/tts.py index 56d7405a7..401c26be7 100644 --- a/livekit-plugins/livekit-plugins-deepgram/livekit/plugins/deepgram/tts.py +++ b/livekit-plugins/livekit-plugins-deepgram/livekit/plugins/deepgram/tts.py @@ -314,8 +314,9 @@ def _send_last_frame(*, segment_id: str, is_final: bool) -> None: aiohttp.WSMsgType.CLOSING, ): if not closing_ws: - raise Exception( - "Deepgram websocket connection closed unexpectedly" + raise APIStatusError( + "Deepgram websocket connection closed unexpectedly", + request_id=request_id, ) return @@ -393,7 +394,10 @@ async def _connection_timeout(): raise APITimeoutError() from e except aiohttp.ClientResponseError as e: raise APIStatusError( - message=e.message, status_code=e.status, request_id=None, body=None + message=e.message, + status_code=e.status, + request_id=request_id, + body=None, ) from e except Exception as e: raise APIConnectionError() from e diff --git a/livekit-plugins/livekit-plugins-elevenlabs/livekit/plugins/elevenlabs/tts.py b/livekit-plugins/livekit-plugins-elevenlabs/livekit/plugins/elevenlabs/tts.py index 0c5490707..948d42758 100644 --- a/livekit-plugins/livekit-plugins-elevenlabs/livekit/plugins/elevenlabs/tts.py +++ b/livekit-plugins/livekit-plugins-elevenlabs/livekit/plugins/elevenlabs/tts.py @@ -469,8 +469,9 @@ def _send_last_frame(*, segment_id: str, is_final: bool) -> None: aiohttp.WSMsgType.CLOSING, ): if not eos_sent: - raise Exception( - "11labs connection closed unexpectedly, not all tokens have been consumed" + raise APIStatusError( + "11labs connection closed unexpectedly, not all tokens have been consumed", + request_id=request_id, ) return diff --git a/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/llm.py b/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/llm.py index 8e3dda787..37526dd4b 100644 --- a/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/llm.py +++ b/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/llm.py @@ -706,6 +706,7 @@ async def _run(self) -> None: self._fnc_name: str | None = None self._fnc_raw_arguments: str | None = None self._tool_index: int | None = None + retryable = True try: opts: dict[str, Any] = dict() @@ -755,6 +756,7 @@ async def _run(self) -> None: for choice in chunk.choices: chat_chunk = self._parse_choice(chunk.id, choice) if chat_chunk is not None: 
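The thread running through patch 37 generalizes beyond any one provider: a stream keeps a `retryable` flag that flips to `False` once any chunk has been delivered downstream, because retrying after partial output would duplicate tokens or audio, and 4xx status codes are likewise marked non-retryable. A provider-agnostic sketch (this `APIError` is simplified relative to `_exceptions.py`):

```python
from typing import AsyncIterator


class APIError(Exception):
    def __init__(self, message: str, *, retryable: bool = True) -> None:
        super().__init__(message)
        self.retryable = retryable


async def relay(chunks: AsyncIterator[str]) -> AsyncIterator[str]:
    retryable = True
    try:
        async for chunk in chunks:
            yield chunk
            retryable = False  # output observed downstream; a retry would duplicate it
    except Exception as e:
        raise APIError("stream failed", retryable=retryable) from e
```

A retry loop can then consult `e.retryable` before re-attempting, which is exactly what `LLMStream._main_task` does after this change.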
+ retryable = False self._event_ch.send_nowait(chat_chunk) if chunk.usage is not None: @@ -771,7 +773,7 @@ async def _run(self) -> None: ) except openai.APITimeoutError: - raise APITimeoutError() + raise APITimeoutError(retryable=retryable) except openai.APIStatusError as e: raise APIStatusError( e.message, @@ -780,7 +782,7 @@ async def _run(self) -> None: body=e.body, ) except Exception as e: - raise APIConnectionError() from e + raise APIConnectionError(retryable=retryable) from e def _parse_choice(self, id: str, choice: Choice) -> llm.ChatChunk | None: delta = choice.delta From 66152a49649e234f1914f3cd7cb96b1a872b6a7f Mon Sep 17 00:00:00 2001 From: Sahil Suman <34382211+sahilsuman933@users.noreply.github.com> Date: Sat, 28 Dec 2024 11:52:45 +0530 Subject: [PATCH 38/46] Broken Link (#1300) --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index c9ffa9b78..11664a68f 100644 --- a/README.md +++ b/README.md @@ -122,7 +122,7 @@ Documentation on the framework and how to use it can be found [here](https://doc | Voice agent using the new OpenAI Realtime API | [demo](https://playground.livekit.io) | [code](https://github.com/livekit-examples/realtime-playground) | | Super fast voice agent using Cerebras hosted Llama 3.1 | [demo](https://cerebras.vercel.app) | [code](https://github.com/dsa/fast-voice-assistant/) | | Voice agent using Cartesia's Sonic model | [demo](https://cartesia-assistant.vercel.app/) | [code](https://github.com/livekit-examples/cartesia-voice-agent) | -| Agent that looks up the current weather via function call | N/A | [code](https://github.com/livekit-examples/cartesia-voice-agent) | +| Agent that looks up the current weather via function call | N/A | [code](https://github.com/livekit/agents/blob/main/examples/voice-pipeline-agent/function_calling_weather.py) | | Voice Agent using Gemini 2.0 Flash | N/A | [code](https://github.com/livekit-examples/voice-pipeline-agent/gemini_voice_agent.py) | | Voice agent with custom turn-detection model | N/A | [code](https://github.com/livekit/agents/blob/main/examples/voice-pipeline-agent/turn_detector.py) | | Voice agent that performs a RAG-based lookup | N/A | [code](https://github.com/livekit/agents/tree/main/examples/voice-pipeline-agent/simple-rag) | From 83dc84ec25e1464d9a0cb72ef7ad7157bbb4c53b Mon Sep 17 00:00:00 2001 From: Hamdan <96612374+s-hamdananwar@users.noreply.github.com> Date: Sat, 28 Dec 2024 04:50:18 -0800 Subject: [PATCH 39/46] expose worker_id in jobcontext (#1307) --- .changeset/eight-lemons-hear.md | 5 +++++ livekit-agents/livekit/agents/cli/proto.py | 2 ++ livekit-agents/livekit/agents/ipc/proto.py | 2 ++ livekit-agents/livekit/agents/job.py | 6 ++++++ livekit-agents/livekit/agents/worker.py | 2 ++ tests/test_ipc.py | 1 + 6 files changed, 18 insertions(+) create mode 100644 .changeset/eight-lemons-hear.md diff --git a/.changeset/eight-lemons-hear.md b/.changeset/eight-lemons-hear.md new file mode 100644 index 000000000..38a3f1b1b --- /dev/null +++ b/.changeset/eight-lemons-hear.md @@ -0,0 +1,5 @@ +--- +"livekit-agents": patch +--- + +expose worker_id in jobcontext diff --git a/livekit-agents/livekit/agents/cli/proto.py b/livekit-agents/livekit/agents/cli/proto.py index f7753c579..761690783 100644 --- a/livekit-agents/livekit/agents/cli/proto.py +++ b/livekit-agents/livekit/agents/cli/proto.py @@ -52,6 +52,7 @@ def write(self, b: io.BytesIO) -> None: channel.write_string(b, accept_args.metadata) channel.write_string(b, running_job.url) channel.write_string(b, 
running_job.token) + channel.write_string(b, running_job.worker_id) channel.write_int(b, self.reload_count) @@ -69,6 +70,7 @@ def read(self, b: io.BytesIO) -> None: job=job, url=channel.read_string(b), token=channel.read_string(b), + worker_id=channel.read_string(b), ) ) diff --git a/livekit-agents/livekit/agents/ipc/proto.py b/livekit-agents/livekit/agents/ipc/proto.py index c878b4f23..509964b55 100644 --- a/livekit-agents/livekit/agents/ipc/proto.py +++ b/livekit-agents/livekit/agents/ipc/proto.py @@ -90,6 +90,7 @@ def write(self, b: io.BytesIO) -> None: channel.write_string(b, accept_args.metadata) channel.write_string(b, self.running_job.url) channel.write_string(b, self.running_job.token) + channel.write_string(b, self.running_job.worker_id) def read(self, b: io.BytesIO) -> None: job = agent.Job() @@ -103,6 +104,7 @@ def read(self, b: io.BytesIO) -> None: job=job, url=channel.read_string(b), token=channel.read_string(b), + worker_id=channel.read_string(b), ) diff --git a/livekit-agents/livekit/agents/job.py b/livekit-agents/livekit/agents/job.py index 64be850f0..b54f8358c 100644 --- a/livekit-agents/livekit/agents/job.py +++ b/livekit-agents/livekit/agents/job.py @@ -68,6 +68,7 @@ class RunningJobInfo: job: agent.Job url: str token: str + worker_id: str DEFAULT_PARTICIPANT_KINDS: list[rtc.ParticipantKind.ValueType] = [ @@ -123,6 +124,11 @@ def job(self) -> agent.Job: """Returns the current job that the worker is executing.""" return self._info.job + @property + def worker_id(self) -> str: + """Returns the id of the worker.""" + return self._info.worker_id + @property def room(self) -> rtc.Room: """The Room object is the main interface that the worker should interact with. diff --git a/livekit-agents/livekit/agents/worker.py b/livekit-agents/livekit/agents/worker.py index 4708a34d3..54ad75470 100644 --- a/livekit-agents/livekit/agents/worker.py +++ b/livekit-agents/livekit/agents/worker.py @@ -621,6 +621,7 @@ async def _reload_jobs(self, jobs: list[RunningJobInfo]) -> None: job=aj.job, url=url, token=jwt.encode(decoded, self._opts.api_secret, algorithm="HS256"), + worker_id=aj.worker_id, ) await self._proc_pool.launch_job(running_info) @@ -692,6 +693,7 @@ async def _on_accept(args: JobAcceptArguments) -> None: job=msg.job, url=job_assign.url or self._opts.ws_url, token=job_assign.token, + worker_id=self._id, ) await self._proc_pool.launch_job(running_info) diff --git a/tests/test_ipc.py b/tests/test_ipc.py index d964c9f55..4e1fd4fe7 100644 --- a/tests/test_ipc.py +++ b/tests/test_ipc.py @@ -114,6 +114,7 @@ def _generate_fake_job() -> job.RunningJobInfo: url="fake_url", token="fake_token", accept_arguments=job.JobAcceptArguments(name="", identity="", metadata=""), + worker_id="fake_id", ) From 97d9bce766602d815ad305a077b651b6062b5a51 Mon Sep 17 00:00:00 2001 From: Long Chen Date: Sat, 28 Dec 2024 20:54:04 +0800 Subject: [PATCH 40/46] fix: add manual interrupt for pipeline agent (#1294) --- .changeset/slow-walls-bake.md | 5 ++ .../livekit/agents/pipeline/pipeline_agent.py | 49 ++++++++++++++++--- .../livekit/agents/pipeline/speech_handle.py | 19 ++++--- 3 files changed, 60 insertions(+), 13 deletions(-) create mode 100644 .changeset/slow-walls-bake.md diff --git a/.changeset/slow-walls-bake.md b/.changeset/slow-walls-bake.md new file mode 100644 index 000000000..11df23f75 --- /dev/null +++ b/.changeset/slow-walls-bake.md @@ -0,0 +1,5 @@ +--- +"livekit-agents": patch +--- + +add manual interrupt method for pipeline agent diff --git 
a/livekit-agents/livekit/agents/pipeline/pipeline_agent.py b/livekit-agents/livekit/agents/pipeline/pipeline_agent.py index b2a223bd0..2c7dc1363 100644 --- a/livekit-agents/livekit/agents/pipeline/pipeline_agent.py +++ b/livekit-agents/livekit/agents/pipeline/pipeline_agent.py @@ -473,7 +473,7 @@ async def say( synthesis_handle = self._synthesize_agent_speech(new_handle.id, source) new_handle.initialize(source=source, synthesis_handle=synthesis_handle) - if self._playing_speech and not self._playing_speech.nested_speech_finished: + if self._playing_speech and not self._playing_speech.nested_speech_done: self._playing_speech.add_nested_speech(new_handle) else: self._add_speech_for_playout(new_handle) @@ -497,6 +497,23 @@ async def say( return new_handle + def interrupt(self, interrupt_all: bool = True) -> None: + """Interrupt the current speech + + Args: + interrupt_all: Whether to interrupt all pending speech + """ + if interrupt_all: + # interrupt all pending speech + if self._pending_agent_reply is not None: + self._pending_agent_reply.cancel(cancel_nested=True) + for speech in self._speech_q: + speech.cancel(cancel_nested=True) + + # interrupt the playing speech + if self._playing_speech is not None: + self._playing_speech.cancel() + def _update_state(self, state: AgentState, delay: float = 0.0): """Set the current state of the agent""" @@ -956,19 +973,31 @@ async def _execute_function_calls() -> None: self.emit("function_calls_finished", called_fncs) _CallContextVar.reset(tk) + if not is_using_tools: + speech_handle._set_done() + return + fnc_task = asyncio.create_task(_execute_function_calls()) - while not speech_handle.nested_speech_finished: - event_wait_task = asyncio.create_task( + while not speech_handle.nested_speech_done: + nesting_changed = asyncio.create_task( speech_handle.nested_speech_changed.wait() ) + nesting_done_fut: asyncio.Future = speech_handle._nested_speech_done_fut await asyncio.wait( - [event_wait_task, fnc_task], return_when=asyncio.FIRST_COMPLETED + [nesting_changed, fnc_task, nesting_done_fut], + return_when=asyncio.FIRST_COMPLETED, ) - if not event_wait_task.done(): - event_wait_task.cancel() + if not nesting_changed.done(): + nesting_changed.cancel() while speech_handle.nested_speech_handles: speech = speech_handle.nested_speech_handles[0] + if speech_handle.nested_speech_done: + # in case tool speech is added after nested speech done + speech.cancel(cancel_nested=True) + speech_handle.nested_speech_handles.pop(0) + continue + self._playing_speech = speech await self._play_speech(speech) speech_handle.nested_speech_handles.pop(0) @@ -977,7 +1006,13 @@ async def _execute_function_calls() -> None: speech_handle.nested_speech_changed.clear() # break if the function calls task is done if fnc_task.done(): - speech_handle.mark_nested_speech_finished() + speech_handle.mark_nested_speech_done() + + if not fnc_task.done(): + logger.debug( + "cancelling function calls task", extra={"speech_id": speech_handle.id} + ) + fnc_task.cancel() # mark the speech as done speech_handle._set_done() diff --git a/livekit-agents/livekit/agents/pipeline/speech_handle.py b/livekit-agents/livekit/agents/pipeline/speech_handle.py index d36eb7aee..cd1f39dec 100644 --- a/livekit-agents/livekit/agents/pipeline/speech_handle.py +++ b/livekit-agents/livekit/agents/pipeline/speech_handle.py @@ -46,7 +46,7 @@ def __init__( self._nested_speech_handles: list[SpeechHandle] = [] self._nested_speech_changed = asyncio.Event() - self._nested_speech_finished = False + self._nested_speech_done_fut = 
asyncio.Future[None]() @staticmethod def create_assistant_reply( @@ -190,12 +190,17 @@ def interrupt(self) -> None: raise RuntimeError("interruptions are not allowed") self.cancel() - def cancel(self) -> None: + def cancel(self, cancel_nested: bool = False) -> None: self._init_fut.cancel() if self._synthesis_handle is not None: self._synthesis_handle.interrupt() + if cancel_nested: + for speech in self._nested_speech_handles: + speech.cancel(cancel_nested=True) + self.mark_nested_speech_done() + @property def fnc_nested_depth(self) -> int: return self._fnc_nested_depth @@ -221,8 +226,10 @@ def nested_speech_changed(self) -> asyncio.Event: return self._nested_speech_changed @property - def nested_speech_finished(self) -> bool: - return self._nested_speech_finished + def nested_speech_done(self) -> bool: + return self._nested_speech_done_fut.done() - def mark_nested_speech_finished(self) -> None: - self._nested_speech_finished = True + def mark_nested_speech_done(self) -> None: + if self._nested_speech_done_fut.done(): + return + self._nested_speech_done_fut.set_result(None) From bd36bc989d819f5da003a0a34fc9f570e100583c Mon Sep 17 00:00:00 2001 From: martin-purplefish Date: Sat, 28 Dec 2024 22:27:13 -0500 Subject: [PATCH 41/46] Do not pass function context if at max depth (#1306) --- .changeset/khaki-candles-rest.md | 5 +++++ .../livekit/agents/pipeline/pipeline_agent.py | 19 ++++++++++++++++++- 2 files changed, 23 insertions(+), 1 deletion(-) create mode 100644 .changeset/khaki-candles-rest.md diff --git a/.changeset/khaki-candles-rest.md b/.changeset/khaki-candles-rest.md new file mode 100644 index 000000000..91afec21c --- /dev/null +++ b/.changeset/khaki-candles-rest.md @@ -0,0 +1,5 @@ +--- +"livekit-agents": patch +--- + +Do not pass function context if at max depth diff --git a/livekit-agents/livekit/agents/pipeline/pipeline_agent.py b/livekit-agents/livekit/agents/pipeline/pipeline_agent.py index 2c7dc1363..872bd5d4b 100644 --- a/livekit-agents/livekit/agents/pipeline/pipeline_agent.py +++ b/livekit-agents/livekit/agents/pipeline/pipeline_agent.py @@ -960,7 +960,24 @@ async def _execute_function_calls() -> None: chat_ctx = call_ctx.chat_ctx.copy() chat_ctx.messages.extend(extra_tools_messages) chat_ctx.messages.extend(call_ctx.extra_chat_messages) - answer_llm_stream = self._llm.chat(chat_ctx=chat_ctx, fnc_ctx=self.fnc_ctx) + fnc_ctx = self.fnc_ctx + if ( + fnc_ctx + and new_speech_handle.fnc_nested_depth + >= self._opts.max_nested_fnc_calls + ): + logger.warning( + "max function calls nested depth reached, not propagating fnc ctx", + extra={ + "speech_id": speech_handle.id, + "fnc_nested_depth": speech_handle.fnc_nested_depth, + }, + ) + fnc_ctx = None + answer_llm_stream = self._llm.chat( + chat_ctx=chat_ctx, + fnc_ctx=fnc_ctx, + ) synthesis_handle = self._synthesize_agent_speech( new_speech_handle.id, answer_llm_stream From b7f289560260598a4aeb1915d3bcba28a8013245 Mon Sep 17 00:00:00 2001 From: Jayesh Parmar <60539217+jayeshp19@users.noreply.github.com> Date: Mon, 30 Dec 2024 21:09:48 +0530 Subject: [PATCH 42/46] Support Gemini Live API (#1240) --- .changeset/thirty-coats-tie.md | 7 + examples/multimodal_agent/gemini_agent.py | 68 +++ .../openai_agent.py} | 0 livekit-agents/livekit/agents/cli/log.py | 1 + .../livekit/agents/multimodal/__init__.py | 14 +- .../agents/multimodal/multimodal_agent.py | 129 ++++-- .../livekit/plugins/google/__init__.py | 4 +- .../livekit/plugins/google/beta/__init__.py | 3 + .../plugins/google/beta/realtime/__init__.py | 15 + 
.../plugins/google/beta/realtime/api_proto.py | 79 ++++ .../google/beta/realtime/realtime_api.py | 424 ++++++++++++++++++ .../livekit-plugins-google/setup.py | 1 + .../plugins/openai/realtime/__init__.py | 4 - .../plugins/openai/realtime/realtime_model.py | 35 +- 14 files changed, 741 insertions(+), 43 deletions(-) create mode 100644 .changeset/thirty-coats-tie.md create mode 100644 examples/multimodal_agent/gemini_agent.py rename examples/{multimodal_agent.py => multimodal_agent/openai_agent.py} (100%) create mode 100644 livekit-plugins/livekit-plugins-google/livekit/plugins/google/beta/__init__.py create mode 100644 livekit-plugins/livekit-plugins-google/livekit/plugins/google/beta/realtime/__init__.py create mode 100644 livekit-plugins/livekit-plugins-google/livekit/plugins/google/beta/realtime/api_proto.py create mode 100644 livekit-plugins/livekit-plugins-google/livekit/plugins/google/beta/realtime/realtime_api.py diff --git a/.changeset/thirty-coats-tie.md b/.changeset/thirty-coats-tie.md new file mode 100644 index 000000000..f0c6a9e67 --- /dev/null +++ b/.changeset/thirty-coats-tie.md @@ -0,0 +1,7 @@ +--- +"livekit-plugins-google": minor +"livekit-plugins-openai": patch +"livekit-agents": patch +--- + +make multimodal class generic and support gemini live api diff --git a/examples/multimodal_agent/gemini_agent.py b/examples/multimodal_agent/gemini_agent.py new file mode 100644 index 000000000..81a474609 --- /dev/null +++ b/examples/multimodal_agent/gemini_agent.py @@ -0,0 +1,68 @@ +from __future__ import annotations + +import logging +from typing import Annotated + +import aiohttp +from dotenv import load_dotenv +from livekit.agents import ( + AutoSubscribe, + JobContext, + WorkerOptions, + WorkerType, + cli, + llm, + multimodal, +) +from livekit.plugins import google + +load_dotenv() + +logger = logging.getLogger("my-worker") +logger.setLevel(logging.INFO) + + +async def entrypoint(ctx: JobContext): + logger.info("starting entrypoint") + + fnc_ctx = llm.FunctionContext() + + @fnc_ctx.ai_callable() + async def get_weather( + location: Annotated[ + str, llm.TypeInfo(description="The location to get the weather for") + ], + ): + """Called when the user asks about the weather. This function will return the weather for the given location.""" + logger.info(f"getting weather for {location}") + url = f"https://wttr.in/{location}?format=%C+%t" + async with aiohttp.ClientSession() as session: + async with session.get(url) as response: + if response.status == 200: + weather_data = await response.text() + # response from the function call is returned to the LLM + return f"The weather in {location} is {weather_data}."
+ else: + raise Exception( + f"Failed to get weather data, status code: {response.status}" + ) + + await ctx.connect(auto_subscribe=AutoSubscribe.AUDIO_ONLY) + participant = await ctx.wait_for_participant() + + chat_ctx = llm.ChatContext() + + agent = multimodal.MultimodalAgent( + model=google.beta.realtime.RealtimeModel( + voice="Charon", + temperature=0.8, + instructions="You are a helpful assistant", + ), + fnc_ctx=fnc_ctx, + chat_ctx=chat_ctx, + ) + agent.start(ctx.room, participant) + + +if __name__ == "__main__": + cli.run_app(WorkerOptions(entrypoint_fnc=entrypoint, worker_type=WorkerType.ROOM)) diff --git a/examples/multimodal_agent.py b/examples/multimodal_agent/openai_agent.py similarity index 100% rename from examples/multimodal_agent.py rename to examples/multimodal_agent/openai_agent.py diff --git a/livekit-agents/livekit/agents/cli/log.py b/livekit-agents/livekit/agents/cli/log.py index dc16bfdfa..c4b5e5e52 100644 --- a/livekit-agents/livekit/agents/cli/log.py +++ b/livekit-agents/livekit/agents/cli/log.py @@ -18,6 +18,7 @@ "openai", "watchfiles", "anthropic", + "websockets.client", ] diff --git a/livekit-agents/livekit/agents/multimodal/__init__.py b/livekit-agents/livekit/agents/multimodal/__init__.py index d165c082a..f741e168a 100644 --- a/livekit-agents/livekit/agents/multimodal/__init__.py +++ b/livekit-agents/livekit/agents/multimodal/__init__.py @@ -1,3 +1,13 @@ -from .multimodal_agent import AgentTranscriptionOptions, MultimodalAgent +from .multimodal_agent import ( + AgentTranscriptionOptions, + MultimodalAgent, + _RealtimeAPI, + _RealtimeAPISession, +) -__all__ = ["MultimodalAgent", "AgentTranscriptionOptions"] +__all__ = [ + "MultimodalAgent", + "AgentTranscriptionOptions", + "_RealtimeAPI", + "_RealtimeAPISession", +] diff --git a/livekit-agents/livekit/agents/multimodal/multimodal_agent.py b/livekit-agents/livekit/agents/multimodal/multimodal_agent.py index ee3a2d992..f02bb2e64 100644 --- a/livekit-agents/livekit/agents/multimodal/multimodal_agent.py +++ b/livekit-agents/livekit/agents/multimodal/multimodal_agent.py @@ -2,7 +2,17 @@ import asyncio from dataclasses import dataclass -from typing import Callable, Literal, Protocol +from typing import ( + Any, + AsyncIterable, + Callable, + Literal, + Optional, + Protocol, + TypeVar, + Union, + overload, +) import aiohttp from livekit import rtc @@ -28,6 +38,76 @@ ] +class _InputTranscriptionProto(Protocol): + item_id: str + """id of the item""" + transcript: str + """transcript of the input audio""" + + +class _ContentProto(Protocol): + response_id: str + item_id: str + output_index: int + content_index: int + text: str + audio: list[rtc.AudioFrame] + text_stream: AsyncIterable[str] + audio_stream: AsyncIterable[rtc.AudioFrame] + content_type: Literal["text", "audio"] + + +class _CapabilitiesProto(Protocol): + supports_truncate: bool + + +class _RealtimeAPI(Protocol): + """Realtime API protocol""" + + @property + def capabilities(self) -> _CapabilitiesProto: ... + def session( + self, + *, + chat_ctx: llm.ChatContext | None = None, + fnc_ctx: llm.FunctionContext | None = None, + ) -> _RealtimeAPISession: + """ + Create a new realtime session with the given chat and function contexts. + """ + pass + + +T = TypeVar("T", bound=Callable[..., Any]) + + +class _RealtimeAPISession(Protocol): + async def set_chat_ctx(self, ctx: llm.ChatContext) -> None: ... + @overload + def on(self, event: str, callback: None = None) -> Callable[[T], T]: ... + @overload + def on(self, event: str, callback: T) -> T: ... 
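+    # The two overloads above let `on` be used either as a decorator when
+    # only an event name is given, or as a plain call when a callback is
+    # passed. An illustrative sketch (the event name is one of the session's
+    # emitted events, e.g. those handled in MultimodalAgent below):
+    #
+    #     @session.on("input_speech_started")
+    #     def _on_speech_started(): ...
+    #
+    #     session.on("input_speech_started", _on_speech_started)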
+ def on( + self, event: str, callback: Optional[T] = None + ) -> Union[T, Callable[[T], T]]: ... + + def _push_audio(self, frame: rtc.AudioFrame) -> None: ... + @property + def fnc_ctx(self) -> llm.FunctionContext | None: ... + @fnc_ctx.setter + def fnc_ctx(self, value: llm.FunctionContext | None) -> None: ... + def chat_ctx_copy(self) -> llm.ChatContext: ... + def _recover_from_text_response(self, item_id: str) -> None: ... + def _update_conversation_item_content( + self, + item_id: str, + content: llm.ChatContent | list[llm.ChatContent] | None = None, + ) -> None: ... + def _truncate_conversation_item( + self, item_id: str, content_index: int, audio_end_ms: int + ) -> None: ... + + @dataclass(frozen=True) class AgentTranscriptionOptions: user_transcription: bool = True @@ -50,9 +130,6 @@ class AgentTranscriptionOptions: representing the hyphenated parts of the word.""" -class S2SModel(Protocol): ... - - @dataclass(frozen=True) class _ImplOptions: transcription: AgentTranscriptionOptions @@ -62,7 +139,7 @@ class MultimodalAgent(utils.EventEmitter[EventTypes]): def __init__( self, *, - model: S2SModel, + model: _RealtimeAPI, vad: vad.VAD | None = None, chat_ctx: llm.ChatContext | None = None, fnc_ctx: llm.FunctionContext | None = None, @@ -73,7 +150,7 @@ def __init__( """Create a new MultimodalAgent. Args: - model: S2SModel instance. + model: RealtimeAPI instance. vad: Voice Activity Detection (VAD) instance. chat_ctx: Chat context for the assistant. fnc_ctx: Function context for the assistant. @@ -89,10 +166,6 @@ def __init__( super().__init__() self._loop = loop or asyncio.get_event_loop() - from livekit.plugins.openai import realtime - - assert isinstance(model, realtime.RealtimeModel) - self._model = model self._vad = vad self._chat_ctx = chat_ctx @@ -177,13 +250,8 @@ async def _init_and_start(): # Schedule the initialization and start task asyncio.create_task(_init_and_start()) - from livekit.plugins.openai import realtime - @self._session.on("response_content_added") - def _on_content_added(message: realtime.RealtimeContent): - if message.content_type == "text": - return - + def _on_content_added(message: _ContentProto): tr_fwd = transcription.TTSSegmentsForwarder( room=self._room, participant=self._room.local_participant, @@ -202,7 +270,7 @@ def _on_content_added(message: realtime.RealtimeContent): ) @self._session.on("response_content_done") - def _response_content_done(message: realtime.RealtimeContent): + def _response_content_done(message: _ContentProto): if message.content_type == "text": if self._text_response_retries >= self._max_text_response_retries: raise RuntimeError( @@ -236,9 +304,7 @@ def _input_speech_committed(): ) @self._session.on("input_speech_transcription_completed") - def _input_speech_transcription_completed( - ev: realtime.InputTranscriptionCompleted, - ): + def _input_speech_transcription_completed(ev: _InputTranscriptionProto): self._stt_forwarder.update( stt.SpeechEvent( type=stt.SpeechEventType.FINAL_TRANSCRIPT, @@ -248,6 +314,7 @@ def _input_speech_transcription_completed( user_msg = ChatMessage.create( text=ev.transcript, role="user", id=ev.item_id ) + self._session._update_conversation_item_content( ev.item_id, user_msg.content ) @@ -265,11 +332,14 @@ def _input_speech_started(): if self._playing_handle is not None and not self._playing_handle.done(): self._playing_handle.interrupt() - self._session.conversation.item.truncate( - item_id=self._playing_handle.item_id, - content_index=self._playing_handle.content_index, - 
audio_end_ms=int(self._playing_handle.audio_samples / 24000 * 1000), - ) + if self._model.capabilities.supports_truncate: + self._session._truncate_conversation_item( + item_id=self._playing_handle.item_id, + content_index=self._playing_handle.content_index, + audio_end_ms=int( + self._playing_handle.audio_samples / 24000 * 1000 + ), + ) @self._session.on("input_speech_stopped") def _input_speech_stopped(): @@ -330,9 +400,10 @@ def _on_playout_stopped(interrupted: bool) -> None: role="assistant", id=self._playing_handle.item_id, ) - self._session._update_conversation_item_content( - self._playing_handle.item_id, msg.content - ) + if self._model.capabilities.supports_truncate: + self._session._update_conversation_item_content( + self._playing_handle.item_id, msg.content + ) if interrupted: self.emit("agent_speech_interrupted", msg) @@ -366,7 +437,7 @@ def _on_playout_stopped(interrupted: bool) -> None: ) async for frame in self._input_audio_ch: for f in bstream.write(frame.data.tobytes()): - self._session.input_audio_buffer.append(f) + self._session._push_audio(f) def _on_participant_connected(self, participant: rtc.RemoteParticipant): if self._linked_participant is None: diff --git a/livekit-plugins/livekit-plugins-google/livekit/plugins/google/__init__.py b/livekit-plugins/livekit-plugins-google/livekit/plugins/google/__init__.py index ca754bd30..88e163634 100644 --- a/livekit-plugins/livekit-plugins-google/livekit/plugins/google/__init__.py +++ b/livekit-plugins/livekit-plugins-google/livekit/plugins/google/__init__.py @@ -12,12 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. +from . import beta from .stt import STT, SpeechStream from .tts import TTS from .version import __version__ -__all__ = ["STT", "TTS", "SpeechStream", "__version__"] - +__all__ = ["STT", "TTS", "SpeechStream", "__version__", "beta"] from livekit.agents import Plugin from .log import logger diff --git a/livekit-plugins/livekit-plugins-google/livekit/plugins/google/beta/__init__.py b/livekit-plugins/livekit-plugins-google/livekit/plugins/google/beta/__init__.py new file mode 100644 index 000000000..89cb122c8 --- /dev/null +++ b/livekit-plugins/livekit-plugins-google/livekit/plugins/google/beta/__init__.py @@ -0,0 +1,3 @@ +from . 
import realtime + +__all__ = ["realtime"] diff --git a/livekit-plugins/livekit-plugins-google/livekit/plugins/google/beta/realtime/__init__.py b/livekit-plugins/livekit-plugins-google/livekit/plugins/google/beta/realtime/__init__.py new file mode 100644 index 000000000..e95a86917 --- /dev/null +++ b/livekit-plugins/livekit-plugins-google/livekit/plugins/google/beta/realtime/__init__.py @@ -0,0 +1,15 @@ +from .api_proto import ( + ClientEvents, + LiveAPIModels, + ResponseModality, + Voice, +) +from .realtime_api import RealtimeModel + +__all__ = [ + "RealtimeModel", + "ClientEvents", + "LiveAPIModels", + "ResponseModality", + "Voice", +] diff --git a/livekit-plugins/livekit-plugins-google/livekit/plugins/google/beta/realtime/api_proto.py b/livekit-plugins/livekit-plugins-google/livekit/plugins/google/beta/realtime/api_proto.py new file mode 100644 index 000000000..c02fb3859 --- /dev/null +++ b/livekit-plugins/livekit-plugins-google/livekit/plugins/google/beta/realtime/api_proto.py @@ -0,0 +1,79 @@ +from __future__ import annotations + +import inspect +from typing import Any, Dict, List, Literal, Sequence, Union + +from google.genai import types # type: ignore + +LiveAPIModels = Literal["gemini-2.0-flash-exp"] + +Voice = Literal["Puck", "Charon", "Kore", "Fenrir", "Aoede"] +ResponseModality = Literal["AUDIO", "TEXT"] + + +ClientEvents = Union[ + types.ContentListUnion, + types.ContentListUnionDict, + types.LiveClientContentOrDict, + types.LiveClientRealtimeInput, + types.LiveClientRealtimeInputOrDict, + types.LiveClientToolResponseOrDict, + types.FunctionResponseOrDict, + Sequence[types.FunctionResponseOrDict], +] + + +JSON_SCHEMA_TYPE_MAP = { + str: "string", + int: "integer", + float: "number", + bool: "boolean", + dict: "object", + list: "array", +} + + +def _build_parameters(arguments: Dict[str, Any]) -> types.SchemaDict: + properties: Dict[str, types.SchemaDict] = {} + required: List[str] = [] + + for arg_name, arg_info in arguments.items(): + py_type = arg_info.type + if py_type not in JSON_SCHEMA_TYPE_MAP: + raise ValueError(f"Unsupported type: {py_type}") + + prop: types.SchemaDict = { + "type": JSON_SCHEMA_TYPE_MAP[py_type], + "description": arg_info.description, + } + + if arg_info.choices: + prop["enum"] = arg_info.choices + + properties[arg_name] = prop + + if arg_info.default is inspect.Parameter.empty: + required.append(arg_name) + + parameters: types.SchemaDict = {"type": "object", "properties": properties} + + if required: + parameters["required"] = required + + return parameters + + +def _build_tools(fnc_ctx: Any) -> List[types.FunctionDeclarationDict]: + function_declarations: List[types.FunctionDeclarationDict] = [] + for fnc_info in fnc_ctx.ai_functions.values(): + parameters = _build_parameters(fnc_info.arguments) + + func_decl: types.FunctionDeclarationDict = { + "name": fnc_info.name, + "description": fnc_info.description, + "parameters": parameters, + } + + function_declarations.append(func_decl) + + return function_declarations diff --git a/livekit-plugins/livekit-plugins-google/livekit/plugins/google/beta/realtime/realtime_api.py b/livekit-plugins/livekit-plugins-google/livekit/plugins/google/beta/realtime/realtime_api.py new file mode 100644 index 000000000..40bb0d7a1 --- /dev/null +++ b/livekit-plugins/livekit-plugins-google/livekit/plugins/google/beta/realtime/realtime_api.py @@ -0,0 +1,424 @@ +from __future__ import annotations + +import asyncio +import base64 +import json +import os +from dataclasses import dataclass +from typing import AsyncIterable, Literal + 
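+# This module bridges LiveKit audio to the Gemini Live API: frames pushed
+# through GeminiRealtimeSession._push_audio() are sent over the live
+# connection as base64-encoded PCM, and streamed responses are surfaced
+# as GeminiContent via the "response_content_added" and
+# "response_content_done" events consumed by MultimodalAgent.
+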
+from livekit import rtc +from livekit.agents import llm, utils +from livekit.agents.llm.function_context import _create_ai_function_info + +from google import genai  # type: ignore +from google.genai.types import (  # type: ignore + FunctionResponse, + GenerationConfigDict, + LiveClientToolResponse, + LiveConnectConfigDict, + PrebuiltVoiceConfig, + SpeechConfig, + VoiceConfig, +) + +from ...log import logger +from .api_proto import ( + ClientEvents, + LiveAPIModels, + ResponseModality, + Voice, + _build_tools, +) + +EventTypes = Literal[ + "start_session", + "input_speech_started", + "response_content_added", + "response_content_done", + "function_calls_collected", + "function_calls_finished", + "function_calls_cancelled", +] + + +@dataclass +class GeminiContent: + response_id: str + item_id: str + output_index: int + content_index: int + text: str + audio: list[rtc.AudioFrame] + text_stream: AsyncIterable[str] + audio_stream: AsyncIterable[rtc.AudioFrame] + content_type: Literal["text", "audio"] + + +@dataclass +class Capabilities: + supports_truncate: bool + + +@dataclass +class ModelOptions: + model: LiveAPIModels | str + api_key: str | None + voice: Voice | str + response_modalities: ResponseModality + vertexai: bool + project: str | None + location: str | None + candidate_count: int + temperature: float | None + max_output_tokens: int | None + top_p: float | None + top_k: int | None + presence_penalty: float | None + frequency_penalty: float | None + instructions: str + + +class RealtimeModel: + def __init__( + self, + *, + instructions: str = "", + model: LiveAPIModels | str = "gemini-2.0-flash-exp", + api_key: str | None = None, + voice: Voice | str = "Puck", + modalities: ResponseModality = "AUDIO", + vertexai: bool = False, + project: str | None = None, + location: str | None = None, + candidate_count: int = 1, + temperature: float | None = None, + max_output_tokens: int | None = None, + top_p: float | None = None, + top_k: int | None = None, + presence_penalty: float | None = None, + frequency_penalty: float | None = None, + loop: asyncio.AbstractEventLoop | None = None, + ): + """ + Initializes a RealtimeModel instance for interacting with Google's Realtime API. + + Args: + instructions (str, optional): Initial system instructions for the model. Defaults to "". + api_key (str or None, optional): Google API key. If None, will attempt to read it from the environment variable GOOGLE_API_KEY. Not required when using VertexAI. + modalities (ResponseModality, optional): Response modality to use, either "AUDIO" or "TEXT". Defaults to "AUDIO". + model (str or None, optional): The name of the model to use. Defaults to "gemini-2.0-flash-exp". + voice (api_proto.Voice, optional): Voice setting for audio outputs. Defaults to "Puck". + temperature (float, optional): Sampling temperature for response generation. Defaults to None. + vertexai (bool, optional): Whether to use VertexAI for the API. Defaults to False. + project (str or None, optional): The project to use for the API (VertexAI only). Defaults to None. + location (str or None, optional): The location to use for the API (VertexAI only). Defaults to None. + candidate_count (int, optional): The number of candidate responses to generate. Defaults to 1.
+ top_p (float, optional): The top-p value for response generation + top_k (int, optional): The top-k value for response generation + presence_penalty (float, optional): The presence penalty for response generation + frequency_penalty (float, optional): The frequency penalty for response generation + loop (asyncio.AbstractEventLoop or None, optional): Event loop to use for async operations. If None, the current event loop is used. + + Raises: + ValueError: If the API key is not provided and cannot be found in environment variables. + """ + super().__init__() + self._capabilities = Capabilities( + supports_truncate=False, + ) + self._model = model + self._loop = loop or asyncio.get_event_loop() + self._api_key = api_key or os.environ.get("GOOGLE_API_KEY") + self._vertexai = vertexai + self._project_id = project or os.environ.get("GOOGLE_PROJECT") + self._location = location or os.environ.get("GOOGLE_LOCATION") + if self._api_key is None and not self._vertexai: + raise ValueError("GOOGLE_API_KEY is not set") + + self._rt_sessions: list[GeminiRealtimeSession] = [] + self._opts = ModelOptions( + model=model, + api_key=api_key, + voice=voice, + response_modalities=modalities, + vertexai=vertexai, + project=project, + location=location, + candidate_count=candidate_count, + temperature=temperature, + max_output_tokens=max_output_tokens, + top_p=top_p, + top_k=top_k, + presence_penalty=presence_penalty, + frequency_penalty=frequency_penalty, + instructions=instructions, + ) + + @property + def sessions(self) -> list[GeminiRealtimeSession]: + return self._rt_sessions + + @property + def capabilities(self) -> Capabilities: + return self._capabilities + + def session( + self, + *, + chat_ctx: llm.ChatContext | None = None, + fnc_ctx: llm.FunctionContext | None = None, + ) -> GeminiRealtimeSession: + session = GeminiRealtimeSession( + opts=self._opts, + chat_ctx=chat_ctx or llm.ChatContext(), + fnc_ctx=fnc_ctx, + loop=self._loop, + ) + self._rt_sessions.append(session) + + return session + + async def aclose(self) -> None: + for session in self._rt_sessions: + await session.aclose() + + +class GeminiRealtimeSession(utils.EventEmitter[EventTypes]): + def __init__( + self, + *, + opts: ModelOptions, + chat_ctx: llm.ChatContext, + fnc_ctx: llm.FunctionContext | None, + loop: asyncio.AbstractEventLoop, + ): + """ + Initializes a GeminiRealtimeSession instance for interacting with Google's Realtime API. + + Args: + opts (ModelOptions): The model options for the session. + chat_ctx (llm.ChatContext): The chat context for the session. + fnc_ctx (llm.FunctionContext or None): The function context for the session. + loop (asyncio.AbstractEventLoop): The event loop for the session. 
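+
+        Note:
+            Sessions are normally obtained from RealtimeModel.session()
+            rather than constructed directly. A minimal sketch, assuming a
+            configured model:
+
+                model = RealtimeModel(voice="Puck", instructions="...")
+                session = model.session(chat_ctx=llm.ChatContext())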
+ """ + super().__init__() + self._loop = loop + self._opts = opts + self._chat_ctx = chat_ctx + self._fnc_ctx = fnc_ctx + self._fnc_tasks = utils.aio.TaskSet() + + tools = [] + if self._fnc_ctx is not None: + functions = _build_tools(self._fnc_ctx) + tools.append({"function_declarations": functions}) + + self._config = LiveConnectConfigDict( + model=self._opts.model, + response_modalities=self._opts.response_modalities, + generation_config=GenerationConfigDict( + candidate_count=self._opts.candidate_count, + temperature=self._opts.temperature, + max_output_tokens=self._opts.max_output_tokens, + top_p=self._opts.top_p, + top_k=self._opts.top_k, + presence_penalty=self._opts.presence_penalty, + frequency_penalty=self._opts.frequency_penalty, + ), + system_instruction=self._opts.instructions, + speech_config=SpeechConfig( + voice_config=VoiceConfig( + prebuilt_voice_config=PrebuiltVoiceConfig( + voice_name=self._opts.voice + ) + ) + ), + tools=tools, + ) + self._client = genai.Client( + http_options={"api_version": "v1alpha"}, + api_key=self._opts.api_key, + vertexai=self._opts.vertexai, + project=self._opts.project, + location=self._opts.location, + ) + self._main_atask = asyncio.create_task( + self._main_task(), name="gemini-realtime-session" + ) + # dummy task to wait for the session to be initialized # TODO: sync chat ctx + self._init_sync_task = asyncio.create_task( + asyncio.sleep(0), name="gemini-realtime-session-init" + ) + self._send_ch = utils.aio.Chan[ClientEvents]() + self._active_response_id = None + + async def aclose(self) -> None: + if self._send_ch.closed: + return + + self._send_ch.close() + await self._main_atask + + @property + def fnc_ctx(self) -> llm.FunctionContext | None: + return self._fnc_ctx + + @fnc_ctx.setter + def fnc_ctx(self, value: llm.FunctionContext | None) -> None: + self._fnc_ctx = value + + def _push_audio(self, frame: rtc.AudioFrame) -> None: + data = base64.b64encode(frame.data).decode("utf-8") + self._queue_msg({"mime_type": "audio/pcm", "data": data}) + + def _queue_msg(self, msg: dict) -> None: + self._send_ch.send_nowait(msg) + + def chat_ctx_copy(self) -> llm.ChatContext: + return self._chat_ctx.copy() + + async def set_chat_ctx(self, ctx: llm.ChatContext) -> None: + self._chat_ctx = ctx.copy() + + @utils.log_exceptions(logger=logger) + async def _main_task(self): + @utils.log_exceptions(logger=logger) + async def _send_task(): + async for msg in self._send_ch: + await self._session.send(msg) + + await self._session.send(".", end_of_turn=True) + + @utils.log_exceptions(logger=logger) + async def _recv_task(): + while True: + async for response in self._session.receive(): + if self._active_response_id is None: + self._active_response_id = utils.shortuuid() + text_stream = utils.aio.Chan[str]() + audio_stream = utils.aio.Chan[rtc.AudioFrame]() + content = GeminiContent( + response_id=self._active_response_id, + item_id=self._active_response_id, + output_index=0, + content_index=0, + text="", + audio=[], + text_stream=text_stream, + audio_stream=audio_stream, + content_type=self._opts.response_modalities, + ) + self.emit("response_content_added", content) + + server_content = response.server_content + if server_content: + model_turn = server_content.model_turn + if model_turn: + for part in model_turn.parts: + if part.text: + content.text_stream.send_nowait(part.text) + if part.inline_data: + frame = rtc.AudioFrame( + data=part.inline_data.data, + sample_rate=24000, + num_channels=1, + samples_per_channel=len(part.inline_data.data) + // 2, + ) + 
content.audio_stream.send_nowait(frame) + + if server_content.interrupted or server_content.turn_complete: + for stream in (content.text_stream, content.audio_stream): + if isinstance(stream, utils.aio.Chan): + stream.close() + + if server_content.interrupted: + self.emit("input_speech_started") + elif server_content.turn_complete: + self.emit("response_content_done", content) + + self._active_response_id = None + + if response.tool_call: + if self._fnc_ctx is None: + raise ValueError("Function context is not set") + fnc_calls = [] + for fnc_call in response.tool_call.function_calls: + fnc_call_info = _create_ai_function_info( + self._fnc_ctx, + fnc_call.id, + fnc_call.name, + json.dumps(fnc_call.args), + ) + fnc_calls.append(fnc_call_info) + + self.emit("function_calls_collected", fnc_calls) + + for fnc_call_info in fnc_calls: + self._fnc_tasks.create_task( + self._run_fnc_task(fnc_call_info, content.item_id) + ) + + # Handle function call cancellations + if response.tool_call_cancellation: + logger.warning( + "function call cancelled", + extra={ + "function_call_ids": response.tool_call_cancellation.function_call_ids, + }, + ) + self.emit( + "function_calls_cancelled", + response.tool_call_cancellation.function_call_ids, + ) + + async with self._client.aio.live.connect( + model=self._opts.model, config=self._config + ) as session: + self._session = session + tasks = [ + asyncio.create_task(_send_task(), name="gemini-realtime-send"), + asyncio.create_task(_recv_task(), name="gemini-realtime-recv"), + ] + + try: + await asyncio.gather(*tasks) + finally: + await utils.aio.gracefully_cancel(*tasks) + await self._session.close() + + @utils.log_exceptions(logger=logger) + async def _run_fnc_task(self, fnc_call_info: llm.FunctionCallInfo, item_id: str): + logger.debug( + "executing ai function", + extra={ + "function": fnc_call_info.function_info.name, + }, + ) + + called_fnc = fnc_call_info.execute() + try: + await called_fnc.task + except Exception as e: + logger.exception( + "error executing ai function", + extra={ + "function": fnc_call_info.function_info.name, + }, + exc_info=e, + ) + tool_call = llm.ChatMessage.create_tool_from_called_function(called_fnc) + if tool_call.content is not None: + tool_response = LiveClientToolResponse( + function_responses=[ + FunctionResponse( + name=tool_call.name, + id=tool_call.tool_call_id, + response={"result": tool_call.content}, + ) + ] + ) + await self._session.send(tool_response) + + self.emit("function_calls_finished", [called_fnc]) diff --git a/livekit-plugins/livekit-plugins-google/setup.py b/livekit-plugins/livekit-plugins-google/setup.py index 87646895f..0db8addce 100644 --- a/livekit-plugins/livekit-plugins-google/setup.py +++ b/livekit-plugins/livekit-plugins-google/setup.py @@ -51,6 +51,7 @@ "google-auth >= 2, < 3", "google-cloud-speech >= 2, < 3", "google-cloud-texttospeech >= 2, < 3", + "google-genai >= 0.3.0", "livekit-agents>=0.12.3", ], package_data={"livekit.plugins.google": ["py.typed"]}, diff --git a/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/realtime/__init__.py b/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/realtime/__init__.py index 471deef37..fbb453609 100644 --- a/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/realtime/__init__.py +++ b/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/realtime/__init__.py @@ -2,8 +2,6 @@ from .realtime_model import ( DEFAULT_INPUT_AUDIO_TRANSCRIPTION, DEFAULT_SERVER_VAD_OPTIONS, - InputTranscriptionCompleted, - 
InputTranscriptionFailed, InputTranscriptionOptions, RealtimeContent, RealtimeError, @@ -17,8 +15,6 @@ ) __all__ = [ - "InputTranscriptionCompleted", - "InputTranscriptionFailed", "RealtimeContent", "RealtimeOutput", "RealtimeResponse", diff --git a/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/realtime/realtime_model.py b/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/realtime/realtime_model.py index 26bc2649b..10d7abc1f 100644 --- a/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/realtime/realtime_model.py +++ b/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/realtime/realtime_model.py @@ -4,6 +4,7 @@ import base64 import os import time +import weakref from copy import deepcopy from dataclasses import dataclass from typing import AsyncIterable, Literal, Optional, Union, cast, overload @@ -105,8 +106,11 @@ class RealtimeToolCall: """id of the tool call""" -# TODO(theomonnom): add the content type directly inside RealtimeContent? -# text/audio/transcript? +@dataclass +class Capabilities: + supports_truncate: bool + + @dataclass class RealtimeContent: response_id: str @@ -284,6 +288,9 @@ def __init__( ValueError: If the API key is not provided and cannot be found in environment variables. """ super().__init__() + self._capabilities = Capabilities( + supports_truncate=True, + ) self._base_url = base_url is_azure = ( @@ -322,7 +329,7 @@ def __init__( ) self._loop = loop or asyncio.get_event_loop() - self._rt_sessions: list[RealtimeSession] = [] + self._rt_sessions = weakref.WeakSet[RealtimeSession]() self._http_session = http_session @classmethod @@ -427,9 +434,13 @@ def _ensure_session(self) -> aiohttp.ClientSession: return self._http_session @property - def sessions(self) -> list[RealtimeSession]: + def sessions(self) -> weakref.WeakSet[RealtimeSession]: return self._rt_sessions + @property + def capabilities(self) -> Capabilities: + return self._capabilities + def session( self, *, @@ -475,7 +486,7 @@ def session( http_session=self._ensure_session(), loop=self._loop, ) - self._rt_sessions.append(new_session) + self._rt_sessions.add(new_session) return new_session async def aclose(self) -> None: @@ -854,6 +865,9 @@ def conversation(self) -> Conversation: def input_audio_buffer(self) -> InputAudioBuffer: return RealtimeSession.InputAudioBuffer(self) + def _push_audio(self, frame: rtc.AudioFrame) -> None: + self.input_audio_buffer.append(frame) + @property def response(self) -> Response: return RealtimeSession.Response(self) @@ -1023,6 +1037,15 @@ def _recover_from_text_response(self, item_id: str | None = None) -> None: self.conversation.item.create(self._create_empty_user_audio_message(1.0)) self.response.create(on_duplicate="keep_both") + def _truncate_conversation_item( + self, item_id: str, content_index: int, audio_end_ms: int + ) -> None: + self.conversation.item.truncate( + item_id=item_id, + content_index=content_index, + audio_end_ms=audio_end_ms, + ) + def _update_conversation_item_content( self, item_id: str, content: llm.ChatContent | list[llm.ChatContent] | None ) -> None: @@ -1662,7 +1685,7 @@ async def _run_fnc_task(self, fnc_call_info: llm.FunctionCallInfo, item_id: str) "function": fnc_call_info.function_info.name, }, ) - if called_fnc.result is not None: + if tool_call.content is not None: create_fut = self.conversation.item.create( tool_call, previous_item_id=item_id, From bcbe7dd0f8ffdb55b2741d1b17637f1016a71030 Mon Sep 17 00:00:00 2001 From: Long Chen Date: Tue, 31 Dec 2024 11:56:51 +0800 Subject: [PATCH 43/46] 
avoid duplicate say in function call example (#1317) --- .../function_calling_weather.py | 31 ++++++++++++------- 1 file changed, 19 insertions(+), 12 deletions(-) diff --git a/examples/voice-pipeline-agent/function_calling_weather.py b/examples/voice-pipeline-agent/function_calling_weather.py index 7f1ba5fa5..88358e419 100644 --- a/examples/voice-pipeline-agent/function_calling_weather.py +++ b/examples/voice-pipeline-agent/function_calling_weather.py @@ -39,18 +39,25 @@ async def get_weather( # that it might take awhile: # Option 1: you can use .say filler message immediately after the call is triggered # Option 2: you can prompt the agent to return a text response when it's making a function call - call_ctx = AgentCallContext.get_current() - filler_messages = [ - "Let me check the weather in {location} for you.", - "Let me see what the weather is like in {location} right now.", - # LLM will complete this sentence if it is added to the end of the chat context - "The current weather in {location} is ", - ] - message = random.choice(filler_messages).format(location=location) - - # NOTE: set add_to_chat_ctx=True will add the message to the end - # of the chat context of the function call for answer synthesis - speech_handle = await call_ctx.agent.say(message, add_to_chat_ctx=True) # noqa: F841 + agent = AgentCallContext.get_current().agent + + if ( + not agent.chat_ctx.messages + or agent.chat_ctx.messages[-1].role != "assistant" + ): + # skip if assistant already said something + filler_messages = [ + "Let me check the weather in {location} for you.", + "Let me see what the weather is like in {location} right now.", + # LLM will complete this sentence if it is added to the end of the chat context + "The current weather in {location} is ", + ] + message = random.choice(filler_messages).format(location=location) + logger.info(f"saying filler message: {message}") + + # NOTE: set add_to_chat_ctx=True will add the message to the end + # of the chat context of the function call for answer synthesis + speech_handle = await agent.say(message, add_to_chat_ctx=True) # noqa: F841 logger.info(f"getting weather for {location}") url = f"https://wttr.in/{urllib.parse.quote(location)}?format=%C+%t" From aedbb82a5130142c760a482e264a44f135ae5f65 Mon Sep 17 00:00:00 2001 From: David Zhao Date: Mon, 30 Dec 2024 21:12:46 -0800 Subject: [PATCH 44/46] avoid warnings when function depth matches limit (#1316) --- .changeset/quiet-dots-fly.md | 5 +++++ .../function_calling_weather.py | 7 ++++++- .../livekit/agents/pipeline/pipeline_agent.py | 15 ++++++++------- 3 files changed, 19 insertions(+), 8 deletions(-) create mode 100644 .changeset/quiet-dots-fly.md diff --git a/.changeset/quiet-dots-fly.md b/.changeset/quiet-dots-fly.md new file mode 100644 index 000000000..3f7208c3e --- /dev/null +++ b/.changeset/quiet-dots-fly.md @@ -0,0 +1,5 @@ +--- +"livekit-agents": patch +--- + +avoid warnings when function depth matches limit diff --git a/examples/voice-pipeline-agent/function_calling_weather.py b/examples/voice-pipeline-agent/function_calling_weather.py index 88358e419..f39705f17 100644 --- a/examples/voice-pipeline-agent/function_calling_weather.py +++ b/examples/voice-pipeline-agent/function_calling_weather.py @@ -1,5 +1,6 @@ import logging import random +import re import urllib from typing import Annotated @@ -35,6 +36,9 @@ async def get_weather( ], ): """Called when the user asks about the weather. 
This function will return the weather for the given location.""" + # Clean the location string of special characters + location = re.sub(r"[^a-zA-Z0-9]+", " ", location).strip() + # When a function call is running, there are a couple of options to inform the user # that it might take awhile: # Option 1: you can use .say filler message immediately after the call is triggered @@ -69,6 +73,7 @@ async def get_weather( weather_data = ( f"The weather in {location} is {await response.text()}." ) + logger.info(f"weather data: {weather_data}") else: raise Exception( f"Failed to get weather data, status code: {response.status}" @@ -92,7 +97,7 @@ async def entrypoint(ctx: JobContext): "You are a weather assistant created by LiveKit. Your interface with users will be voice. " "You will provide weather information for a given location. " # when using option 1, you can suppress from the agent with prompt - "do not say anything while waiting for the function call to complete." + "do not return any text while calling the function." # uncomment this to use option 2 # "when performing function calls, let user know that you are checking the weather." ), diff --git a/livekit-agents/livekit/agents/pipeline/pipeline_agent.py b/livekit-agents/livekit/agents/pipeline/pipeline_agent.py index 872bd5d4b..e6f65e772 100644 --- a/livekit-agents/livekit/agents/pipeline/pipeline_agent.py +++ b/livekit-agents/livekit/agents/pipeline/pipeline_agent.py @@ -966,13 +966,14 @@ async def _execute_function_calls() -> None: and new_speech_handle.fnc_nested_depth >= self._opts.max_nested_fnc_calls ): - logger.warning( - "max function calls nested depth reached, not propagating fnc ctx", - extra={ - "speech_id": speech_handle.id, - "fnc_nested_depth": speech_handle.fnc_nested_depth, - }, - ) + if len(fnc_ctx.ai_functions) > 1: + logger.info( + "max function calls nested depth reached, dropping function context. 
increase max_nested_fnc_calls to enable additional nesting.", + extra={ + "speech_id": speech_handle.id, + "fnc_nested_depth": speech_handle.fnc_nested_depth, + }, + ) fnc_ctx = None answer_llm_stream = self._llm.chat( chat_ctx=chat_ctx, From 924b79e0a0305be5fa3c77cfbd8fcab63bfd7de7 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Mon, 30 Dec 2024 23:15:36 -0600 Subject: [PATCH 45/46] Version Packages (#1286) Co-authored-by: github-actions[bot] --- .changeset/eight-lemons-hear.md | 5 ----- .changeset/giant-ways-invite.md | 8 -------- .changeset/gorgeous-sheep-grow.md | 7 ------- .changeset/hot-trainers-press.md | 5 ----- .changeset/khaki-candles-rest.md | 5 ----- .changeset/khaki-stingrays-train.md | 5 ----- .changeset/quiet-dots-fly.md | 5 ----- .changeset/silent-oranges-warn.md | 5 ----- .changeset/slow-walls-bake.md | 5 ----- .changeset/thirty-coats-tie.md | 7 ------- .changeset/tricky-spiders-change.md | 5 ----- .../participant-entrypoint/requirements.txt | 2 +- examples/simple-color/requirements.txt | 2 +- examples/speech-to-text/requirements.txt | 4 ++-- examples/text-to-speech/requirements.txt | 4 ++-- examples/voice-pipeline-agent/requirements.txt | 6 +++--- livekit-agents/CHANGELOG.md | 18 ++++++++++++++++++ livekit-agents/livekit/agents/version.py | 2 +- livekit-agents/package.json | 2 +- .../livekit-plugins-anthropic/CHANGELOG.md | 6 ++++++ .../livekit/plugins/anthropic/version.py | 2 +- .../livekit-plugins-anthropic/package.json | 2 +- .../livekit-plugins-assemblyai/CHANGELOG.md | 8 ++++++++ .../livekit/plugins/assemblyai/version.py | 2 +- .../livekit-plugins-assemblyai/package.json | 2 +- .../livekit-plugins-azure/CHANGELOG.md | 6 ++++++ .../livekit/plugins/azure/version.py | 2 +- .../livekit-plugins-azure/package.json | 2 +- .../livekit-plugins-deepgram/CHANGELOG.md | 6 ++++++ .../livekit/plugins/deepgram/version.py | 2 +- .../livekit-plugins-deepgram/package.json | 2 +- .../livekit-plugins-google/CHANGELOG.md | 10 ++++++++++ .../livekit/plugins/google/version.py | 2 +- .../livekit-plugins-google/package.json | 2 +- .../livekit-plugins-openai/CHANGELOG.md | 8 ++++++++ .../livekit/plugins/openai/version.py | 2 +- .../livekit-plugins-openai/package.json | 2 +- .../livekit-plugins-playai/CHANGELOG.md | 6 ++++++ .../livekit/plugins/playai/version.py | 2 +- .../livekit-plugins-playai/package.json | 2 +- .../livekit-plugins-turn-detector/CHANGELOG.md | 6 ++++++ .../livekit/plugins/turn_detector/version.py | 2 +- .../livekit-plugins-turn-detector/package.json | 2 +- 43 files changed, 101 insertions(+), 89 deletions(-) delete mode 100644 .changeset/eight-lemons-hear.md delete mode 100644 .changeset/giant-ways-invite.md delete mode 100644 .changeset/gorgeous-sheep-grow.md delete mode 100644 .changeset/hot-trainers-press.md delete mode 100644 .changeset/khaki-candles-rest.md delete mode 100644 .changeset/khaki-stingrays-train.md delete mode 100644 .changeset/quiet-dots-fly.md delete mode 100644 .changeset/silent-oranges-warn.md delete mode 100644 .changeset/slow-walls-bake.md delete mode 100644 .changeset/thirty-coats-tie.md delete mode 100644 .changeset/tricky-spiders-change.md diff --git a/.changeset/eight-lemons-hear.md b/.changeset/eight-lemons-hear.md deleted file mode 100644 index 38a3f1b1b..000000000 --- a/.changeset/eight-lemons-hear.md +++ /dev/null @@ -1,5 +0,0 @@ ---- -"livekit-agents": patch ---- - -expose worker_id in jobcontext diff --git a/.changeset/giant-ways-invite.md b/.changeset/giant-ways-invite.md 
deleted file mode 100644 index 5644cb581..000000000 --- a/.changeset/giant-ways-invite.md +++ /dev/null @@ -1,8 +0,0 @@ ---- -"livekit-plugins-assemblyai": patch -"livekit-plugins-deepgram": patch -"livekit-plugins-google": patch -"livekit-plugins-azure": patch ---- - -fix: Ensure STT exceptions are being propagated diff --git a/.changeset/gorgeous-sheep-grow.md b/.changeset/gorgeous-sheep-grow.md deleted file mode 100644 index 5bdc7cc5f..000000000 --- a/.changeset/gorgeous-sheep-grow.md +++ /dev/null @@ -1,7 +0,0 @@ ---- -"livekit-plugins-anthropic": patch -"livekit-plugins-openai": patch -"livekit-agents": patch ---- - -improved handling of LLM errors, do not retry if already began diff --git a/.changeset/hot-trainers-press.md b/.changeset/hot-trainers-press.md deleted file mode 100644 index 326150914..000000000 --- a/.changeset/hot-trainers-press.md +++ /dev/null @@ -1,5 +0,0 @@ ---- -"livekit-plugins-assemblyai": patch ---- - -assemblyai: encode boost words diff --git a/.changeset/khaki-candles-rest.md b/.changeset/khaki-candles-rest.md deleted file mode 100644 index 91afec21c..000000000 --- a/.changeset/khaki-candles-rest.md +++ /dev/null @@ -1,5 +0,0 @@ ---- -"livekit-agents": patch ---- - -Do not pass function context if at max depth diff --git a/.changeset/khaki-stingrays-train.md b/.changeset/khaki-stingrays-train.md deleted file mode 100644 index ca99f9fa7..000000000 --- a/.changeset/khaki-stingrays-train.md +++ /dev/null @@ -1,5 +0,0 @@ ---- -"livekit-plugins-playai": patch ---- - -Support PlayAI TTS engine. diff --git a/.changeset/quiet-dots-fly.md b/.changeset/quiet-dots-fly.md deleted file mode 100644 index 3f7208c3e..000000000 --- a/.changeset/quiet-dots-fly.md +++ /dev/null @@ -1,5 +0,0 @@ ---- -"livekit-agents": patch ---- - -avoid warnings when function depth matches limit diff --git a/.changeset/silent-oranges-warn.md b/.changeset/silent-oranges-warn.md deleted file mode 100644 index e7bcd0189..000000000 --- a/.changeset/silent-oranges-warn.md +++ /dev/null @@ -1,5 +0,0 @@ ---- -"livekit-agents": patch ---- - -improve interruption handling, avoid agent from getting stuck diff --git a/.changeset/slow-walls-bake.md b/.changeset/slow-walls-bake.md deleted file mode 100644 index 11df23f75..000000000 --- a/.changeset/slow-walls-bake.md +++ /dev/null @@ -1,5 +0,0 @@ ---- -"livekit-agents": patch ---- - -add manual interrupt method for pipeline agent diff --git a/.changeset/thirty-coats-tie.md b/.changeset/thirty-coats-tie.md deleted file mode 100644 index f0c6a9e67..000000000 --- a/.changeset/thirty-coats-tie.md +++ /dev/null @@ -1,7 +0,0 @@ ---- -"livekit-plugins-google": minor -"livekit-plugins-openai": patch -"livekit-agents": patch ---- - -make multimodal class generic and support gemini live api diff --git a/.changeset/tricky-spiders-change.md b/.changeset/tricky-spiders-change.md deleted file mode 100644 index a017624fc..000000000 --- a/.changeset/tricky-spiders-change.md +++ /dev/null @@ -1,5 +0,0 @@ ---- -"livekit-plugins-turn-detector": patch ---- - -fix int32/64 errors on Windows diff --git a/examples/participant-entrypoint/requirements.txt b/examples/participant-entrypoint/requirements.txt index a92be36b8..77c8959d1 100644 --- a/examples/participant-entrypoint/requirements.txt +++ b/examples/participant-entrypoint/requirements.txt @@ -1,2 +1,2 @@ -livekit-agents>=0.12.5 +livekit-agents>=0.12.6 python-dotenv~=1.0 diff --git a/examples/simple-color/requirements.txt b/examples/simple-color/requirements.txt index a92be36b8..77c8959d1 100644 --- 
a/examples/simple-color/requirements.txt
+++ b/examples/simple-color/requirements.txt
@@ -1,2 +1,2 @@
-livekit-agents>=0.12.5
+livekit-agents>=0.12.6
 python-dotenv~=1.0
diff --git a/examples/speech-to-text/requirements.txt b/examples/speech-to-text/requirements.txt
index e58a682b3..b9f8e9fb0 100644
--- a/examples/speech-to-text/requirements.txt
+++ b/examples/speech-to-text/requirements.txt
@@ -1,3 +1,3 @@
-livekit-agents>=0.12.5
-livekit-plugins-deepgram>=0.6.15
+livekit-agents>=0.12.6
+livekit-plugins-deepgram>=0.6.16
 python-dotenv~=1.0
diff --git a/examples/text-to-speech/requirements.txt b/examples/text-to-speech/requirements.txt
index f025ab277..f03f7fa49 100644
--- a/examples/text-to-speech/requirements.txt
+++ b/examples/text-to-speech/requirements.txt
@@ -1,5 +1,5 @@
-livekit-agents>=0.12.5
-livekit-plugins-openai>=0.10.12
+livekit-agents>=0.12.6
+livekit-plugins-openai>=0.10.13
 livekit-plugins-cartesia>=0.4.5
 livekit-plugins-elevenlabs>=0.7.9
 python-dotenv~=1.0
diff --git a/examples/voice-pipeline-agent/requirements.txt b/examples/voice-pipeline-agent/requirements.txt
index 481cb0136..cf97c8314 100644
--- a/examples/voice-pipeline-agent/requirements.txt
+++ b/examples/voice-pipeline-agent/requirements.txt
@@ -1,6 +1,6 @@
-livekit-agents>=0.12.5
-livekit-plugins-deepgram>=0.6.15
-livekit-plugins-google>=0.8.1
+livekit-agents>=0.12.6
+livekit-plugins-deepgram>=0.6.16
+livekit-plugins-google>=0.9.0
 livekit-plugins-openai[vertex]>=0.10.10
 livekit-plugins-silero>=0.7.4
 livekit-plugins-rag>=0.2.3
diff --git a/livekit-agents/CHANGELOG.md b/livekit-agents/CHANGELOG.md
index b04f10f1d..d9c3770d4 100644
--- a/livekit-agents/CHANGELOG.md
+++ b/livekit-agents/CHANGELOG.md
@@ -1,5 +1,23 @@
 # livekit-agents
 
+## 0.12.6
+
+### Patch Changes
+
+- expose worker_id in jobcontext - [#1307](https://github.com/livekit/agents/pull/1307) ([@s-hamdananwar](https://github.com/s-hamdananwar))
+
+- improved handling of LLM errors, do not retry if already began - [#1298](https://github.com/livekit/agents/pull/1298) ([@davidzhao](https://github.com/davidzhao))
+
+- Do not pass function context if at max depth - [#1306](https://github.com/livekit/agents/pull/1306) ([@martin-purplefish](https://github.com/martin-purplefish))
+
+- avoid warnings when function depth matches limit - [#1316](https://github.com/livekit/agents/pull/1316) ([@davidzhao](https://github.com/davidzhao))
+
+- improve interruption handling, avoid agent from getting stuck - [#1290](https://github.com/livekit/agents/pull/1290) ([@davidzhao](https://github.com/davidzhao))
+
+- add manual interrupt method for pipeline agent - [#1294](https://github.com/livekit/agents/pull/1294) ([@longcw](https://github.com/longcw))
+
+- make multimodal class generic and support gemini live api - [#1240](https://github.com/livekit/agents/pull/1240) ([@jayeshp19](https://github.com/jayeshp19))
+
 ## 0.12.5
 
 ### Patch Changes
diff --git a/livekit-agents/livekit/agents/version.py b/livekit-agents/livekit/agents/version.py
index 93e989e31..0696f486e 100644
--- a/livekit-agents/livekit/agents/version.py
+++ b/livekit-agents/livekit/agents/version.py
@@ -12,4 +12,4 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-__version__ = "0.12.5"
+__version__ = "0.12.6"
diff --git a/livekit-agents/package.json b/livekit-agents/package.json
index 4986b2889..c321ac852 100644
--- a/livekit-agents/package.json
+++ b/livekit-agents/package.json
@@ -1,5 +1,5 @@
 {
   "name": "livekit-agents",
   "private": true,
-  "version": "0.12.5"
+  "version": "0.12.6"
 }
diff --git a/livekit-plugins/livekit-plugins-anthropic/CHANGELOG.md b/livekit-plugins/livekit-plugins-anthropic/CHANGELOG.md
index f540e9641..3b75922f3 100644
--- a/livekit-plugins/livekit-plugins-anthropic/CHANGELOG.md
+++ b/livekit-plugins/livekit-plugins-anthropic/CHANGELOG.md
@@ -1,5 +1,11 @@
 # livekit-plugins-anthropic
 
+## 0.2.9
+
+### Patch Changes
+
+- improved handling of LLM errors, do not retry if already began - [#1298](https://github.com/livekit/agents/pull/1298) ([@davidzhao](https://github.com/davidzhao))
+
 ## 0.2.8
 
 ### Patch Changes
diff --git a/livekit-plugins/livekit-plugins-anthropic/livekit/plugins/anthropic/version.py b/livekit-plugins/livekit-plugins-anthropic/livekit/plugins/anthropic/version.py
index e558b382c..bd4a8d004 100644
--- a/livekit-plugins/livekit-plugins-anthropic/livekit/plugins/anthropic/version.py
+++ b/livekit-plugins/livekit-plugins-anthropic/livekit/plugins/anthropic/version.py
@@ -12,4 +12,4 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-__version__ = "0.2.8"
+__version__ = "0.2.9"
diff --git a/livekit-plugins/livekit-plugins-anthropic/package.json b/livekit-plugins/livekit-plugins-anthropic/package.json
index ad2ba63a2..eb8866886 100644
--- a/livekit-plugins/livekit-plugins-anthropic/package.json
+++ b/livekit-plugins/livekit-plugins-anthropic/package.json
@@ -1,5 +1,5 @@
 {
   "name": "livekit-plugins-anthropic",
   "private": true,
-  "version": "0.2.8"
+  "version": "0.2.9"
 }
diff --git a/livekit-plugins/livekit-plugins-assemblyai/CHANGELOG.md b/livekit-plugins/livekit-plugins-assemblyai/CHANGELOG.md
index 5a5f68908..71d63e941 100644
--- a/livekit-plugins/livekit-plugins-assemblyai/CHANGELOG.md
+++ b/livekit-plugins/livekit-plugins-assemblyai/CHANGELOG.md
@@ -1,5 +1,13 @@
 # livekit-plugins-assemblyai
 
+## 0.2.2
+
+### Patch Changes
+
+- fix: Ensure STT exceptions are being propagated - [#1291](https://github.com/livekit/agents/pull/1291) ([@davidzhao](https://github.com/davidzhao))
+
+- assemblyai: encode boost words - [#1284](https://github.com/livekit/agents/pull/1284) ([@jmugicagonz](https://github.com/jmugicagonz))
+
 ## 0.2.1
 
 ### Patch Changes
diff --git a/livekit-plugins/livekit-plugins-assemblyai/livekit/plugins/assemblyai/version.py b/livekit-plugins/livekit-plugins-assemblyai/livekit/plugins/assemblyai/version.py
index 875ee5214..2985d9da1 100644
--- a/livekit-plugins/livekit-plugins-assemblyai/livekit/plugins/assemblyai/version.py
+++ b/livekit-plugins/livekit-plugins-assemblyai/livekit/plugins/assemblyai/version.py
@@ -12,4 +12,4 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-__version__ = "0.2.1"
+__version__ = "0.2.2"
diff --git a/livekit-plugins/livekit-plugins-assemblyai/package.json b/livekit-plugins/livekit-plugins-assemblyai/package.json
index 992070917..8b0962663 100644
--- a/livekit-plugins/livekit-plugins-assemblyai/package.json
+++ b/livekit-plugins/livekit-plugins-assemblyai/package.json
@@ -1,5 +1,5 @@
 {
   "name": "livekit-plugins-assemblyai",
   "private": true,
-  "version": "0.2.1"
+  "version": "0.2.2"
 }
diff --git a/livekit-plugins/livekit-plugins-azure/CHANGELOG.md b/livekit-plugins/livekit-plugins-azure/CHANGELOG.md
index 5d4ab532b..414181cbd 100644
--- a/livekit-plugins/livekit-plugins-azure/CHANGELOG.md
+++ b/livekit-plugins/livekit-plugins-azure/CHANGELOG.md
@@ -1,5 +1,11 @@
 # livekit-plugins-azure
 
+## 0.5.2
+
+### Patch Changes
+
+- fix: Ensure STT exceptions are being propagated - [#1291](https://github.com/livekit/agents/pull/1291) ([@davidzhao](https://github.com/davidzhao))
+
 ## 0.5.1
 
 ### Patch Changes
diff --git a/livekit-plugins/livekit-plugins-azure/livekit/plugins/azure/version.py b/livekit-plugins/livekit-plugins-azure/livekit/plugins/azure/version.py
index 79283902f..ec65e487a 100644
--- a/livekit-plugins/livekit-plugins-azure/livekit/plugins/azure/version.py
+++ b/livekit-plugins/livekit-plugins-azure/livekit/plugins/azure/version.py
@@ -12,4 +12,4 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-__version__ = "0.5.1"
+__version__ = "0.5.2"
diff --git a/livekit-plugins/livekit-plugins-azure/package.json b/livekit-plugins/livekit-plugins-azure/package.json
index cdd81c035..45561032c 100644
--- a/livekit-plugins/livekit-plugins-azure/package.json
+++ b/livekit-plugins/livekit-plugins-azure/package.json
@@ -1,5 +1,5 @@
 {
   "name": "livekit-plugins-azure",
   "private": true,
-  "version": "0.5.1"
+  "version": "0.5.2"
 }
diff --git a/livekit-plugins/livekit-plugins-deepgram/CHANGELOG.md b/livekit-plugins/livekit-plugins-deepgram/CHANGELOG.md
index 9c624c19f..617d61f38 100644
--- a/livekit-plugins/livekit-plugins-deepgram/CHANGELOG.md
+++ b/livekit-plugins/livekit-plugins-deepgram/CHANGELOG.md
@@ -1,5 +1,11 @@
 # livekit-plugins-deepgram
 
+## 0.6.16
+
+### Patch Changes
+
+- fix: Ensure STT exceptions are being propagated - [#1291](https://github.com/livekit/agents/pull/1291) ([@davidzhao](https://github.com/davidzhao))
+
 ## 0.6.15
 
 ### Patch Changes
diff --git a/livekit-plugins/livekit-plugins-deepgram/livekit/plugins/deepgram/version.py b/livekit-plugins/livekit-plugins-deepgram/livekit/plugins/deepgram/version.py
index c83922d4e..e1df9b637 100644
--- a/livekit-plugins/livekit-plugins-deepgram/livekit/plugins/deepgram/version.py
+++ b/livekit-plugins/livekit-plugins-deepgram/livekit/plugins/deepgram/version.py
@@ -12,4 +12,4 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-__version__ = "0.6.15"
+__version__ = "0.6.16"
diff --git a/livekit-plugins/livekit-plugins-deepgram/package.json b/livekit-plugins/livekit-plugins-deepgram/package.json
index 65cf7a26a..3a0a81159 100644
--- a/livekit-plugins/livekit-plugins-deepgram/package.json
+++ b/livekit-plugins/livekit-plugins-deepgram/package.json
@@ -1,5 +1,5 @@
 {
   "name": "livekit-plugins-deepgram",
   "private": true,
-  "version": "0.6.15"
+  "version": "0.6.16"
 }
diff --git a/livekit-plugins/livekit-plugins-google/CHANGELOG.md b/livekit-plugins/livekit-plugins-google/CHANGELOG.md
index 82ccd17ff..8867829ea 100644
--- a/livekit-plugins/livekit-plugins-google/CHANGELOG.md
+++ b/livekit-plugins/livekit-plugins-google/CHANGELOG.md
@@ -1,5 +1,15 @@
 # livekit-plugins-google
 
+## 0.9.0
+
+### Minor Changes
+
+- make multimodal class generic and support gemini live api - [#1240](https://github.com/livekit/agents/pull/1240) ([@jayeshp19](https://github.com/jayeshp19))
+
+### Patch Changes
+
+- fix: Ensure STT exceptions are being propagated - [#1291](https://github.com/livekit/agents/pull/1291) ([@davidzhao](https://github.com/davidzhao))
+
 ## 0.8.1
 
 ### Patch Changes
diff --git a/livekit-plugins/livekit-plugins-google/livekit/plugins/google/version.py b/livekit-plugins/livekit-plugins-google/livekit/plugins/google/version.py
index eb38535e3..654ad56ec 100644
--- a/livekit-plugins/livekit-plugins-google/livekit/plugins/google/version.py
+++ b/livekit-plugins/livekit-plugins-google/livekit/plugins/google/version.py
@@ -12,4 +12,4 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-__version__ = "0.8.1"
+__version__ = "0.9.0"
diff --git a/livekit-plugins/livekit-plugins-google/package.json b/livekit-plugins/livekit-plugins-google/package.json
index c79ee66c4..17bc59ac6 100644
--- a/livekit-plugins/livekit-plugins-google/package.json
+++ b/livekit-plugins/livekit-plugins-google/package.json
@@ -1,5 +1,5 @@
 {
   "name": "livekit-plugins-google",
   "private": true,
-  "version": "0.8.1"
+  "version": "0.9.0"
 }
diff --git a/livekit-plugins/livekit-plugins-openai/CHANGELOG.md b/livekit-plugins/livekit-plugins-openai/CHANGELOG.md
index 02ff2f06f..1e363b412 100644
--- a/livekit-plugins/livekit-plugins-openai/CHANGELOG.md
+++ b/livekit-plugins/livekit-plugins-openai/CHANGELOG.md
@@ -1,5 +1,13 @@
 # livekit-plugins-openai
 
+## 0.10.13
+
+### Patch Changes
+
+- improved handling of LLM errors, do not retry if already began - [#1298](https://github.com/livekit/agents/pull/1298) ([@davidzhao](https://github.com/davidzhao))
+
+- make multimodal class generic and support gemini live api - [#1240](https://github.com/livekit/agents/pull/1240) ([@jayeshp19](https://github.com/jayeshp19))
+
 ## 0.10.12
 
 ### Patch Changes
diff --git a/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/version.py b/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/version.py
index 16e535380..c1fcb43b8 100644
--- a/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/version.py
+++ b/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/version.py
@@ -12,4 +12,4 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-__version__ = "0.10.12"
+__version__ = "0.10.13"
diff --git a/livekit-plugins/livekit-plugins-openai/package.json b/livekit-plugins/livekit-plugins-openai/package.json
index bfe2370d0..e23704cba 100644
--- a/livekit-plugins/livekit-plugins-openai/package.json
+++ b/livekit-plugins/livekit-plugins-openai/package.json
@@ -1,5 +1,5 @@
 {
   "name": "livekit-plugins-openai",
   "private": true,
-  "version": "0.10.12"
+  "version": "0.10.13"
 }
diff --git a/livekit-plugins/livekit-plugins-playai/CHANGELOG.md b/livekit-plugins/livekit-plugins-playai/CHANGELOG.md
index 84c891ab4..8fd61d2cf 100644
--- a/livekit-plugins/livekit-plugins-playai/CHANGELOG.md
+++ b/livekit-plugins/livekit-plugins-playai/CHANGELOG.md
@@ -1,5 +1,11 @@
 # livekit-plugins-playai
 
+## 1.0.4
+
+### Patch Changes
+
+- Support PlayAI TTS engine. - [#1174](https://github.com/livekit/agents/pull/1174) ([@jayeshp19](https://github.com/jayeshp19))
+
 ## 1.0.3
 
 ### Patch Changes
diff --git a/livekit-plugins/livekit-plugins-playai/livekit/plugins/playai/version.py b/livekit-plugins/livekit-plugins-playai/livekit/plugins/playai/version.py
index 976498ab9..92192eed4 100644
--- a/livekit-plugins/livekit-plugins-playai/livekit/plugins/playai/version.py
+++ b/livekit-plugins/livekit-plugins-playai/livekit/plugins/playai/version.py
@@ -1 +1 @@
-__version__ = "1.0.3"
+__version__ = "1.0.4"
diff --git a/livekit-plugins/livekit-plugins-playai/package.json b/livekit-plugins/livekit-plugins-playai/package.json
index 043890665..a4879d16b 100644
--- a/livekit-plugins/livekit-plugins-playai/package.json
+++ b/livekit-plugins/livekit-plugins-playai/package.json
@@ -1,5 +1,5 @@
 {
   "name": "livekit-plugins-playai",
   "private": true,
-  "version": "1.0.3"
+  "version": "1.0.4"
 }
diff --git a/livekit-plugins/livekit-plugins-turn-detector/CHANGELOG.md b/livekit-plugins/livekit-plugins-turn-detector/CHANGELOG.md
index 2d38bf347..46a9a7fe5 100644
--- a/livekit-plugins/livekit-plugins-turn-detector/CHANGELOG.md
+++ b/livekit-plugins/livekit-plugins-turn-detector/CHANGELOG.md
@@ -1,5 +1,11 @@
 # livekit-plugins-eou
 
+## 0.3.5
+
+### Patch Changes
+
+- fix int32/64 errors on Windows - [#1285](https://github.com/livekit/agents/pull/1285) ([@nbsp](https://github.com/nbsp))
+
 ## 0.3.4
 
 ### Patch Changes
diff --git a/livekit-plugins/livekit-plugins-turn-detector/livekit/plugins/turn_detector/version.py b/livekit-plugins/livekit-plugins-turn-detector/livekit/plugins/turn_detector/version.py
index bcfe9b179..4be9d79b7 100644
--- a/livekit-plugins/livekit-plugins-turn-detector/livekit/plugins/turn_detector/version.py
+++ b/livekit-plugins/livekit-plugins-turn-detector/livekit/plugins/turn_detector/version.py
@@ -12,4 +12,4 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-__version__ = "0.3.4"
+__version__ = "0.3.5"
diff --git a/livekit-plugins/livekit-plugins-turn-detector/package.json b/livekit-plugins/livekit-plugins-turn-detector/package.json
index 82d16bb89..264da83bf 100644
--- a/livekit-plugins/livekit-plugins-turn-detector/package.json
+++ b/livekit-plugins/livekit-plugins-turn-detector/package.json
@@ -1,5 +1,5 @@
 {
   "name": "livekit-plugins-turn-detector",
   "private": true,
-  "version": "0.3.4"
+  "version": "0.3.5"
 }

From 1ab8d88749c9e42ddb952741e4fb2a65bb6645e8 Mon Sep 17 00:00:00 2001
From: David Zhao
Date: Tue, 31 Dec 2024 00:08:28 -0800
Subject: [PATCH 46/46] rename `multimodal_agent` directory for consistency (#1318)

---
 examples/{multimodal_agent => multimodal-agent}/gemini_agent.py | 0
 examples/{multimodal_agent => multimodal-agent}/openai_agent.py | 0
 2 files changed, 0 insertions(+), 0 deletions(-)
 rename examples/{multimodal_agent => multimodal-agent}/gemini_agent.py (100%)
 rename examples/{multimodal_agent => multimodal-agent}/openai_agent.py (100%)

diff --git a/examples/multimodal_agent/gemini_agent.py b/examples/multimodal-agent/gemini_agent.py
similarity index 100%
rename from examples/multimodal_agent/gemini_agent.py
rename to examples/multimodal-agent/gemini_agent.py
diff --git a/examples/multimodal_agent/openai_agent.py b/examples/multimodal-agent/openai_agent.py
similarity index 100%
rename from examples/multimodal_agent/openai_agent.py
rename to examples/multimodal-agent/openai_agent.py