Merge branch 'main' into theo/more-metrics-errors
theomonnom committed Oct 28, 2024
2 parents 259818a + ca3e47a commit 44c519a
Showing 12 changed files with 101 additions and 25 deletions.
5 changes: 5 additions & 0 deletions .changeset/eleven-items-love.md
@@ -0,0 +1,5 @@
---
"livekit-plugins-openai": patch
---

Groq integration with Whisper-compatible STT endpoints
15 changes: 15 additions & 0 deletions .changeset/hot-toys-wash.md
@@ -0,0 +1,15 @@
---
"livekit-plugins-elevenlabs": patch
"livekit-plugins-anthropic": patch
"livekit-plugins-cartesia": patch
"livekit-plugins-deepgram": patch
"livekit-plugins-browser": patch
"livekit-plugins-google": patch
"livekit-plugins-openai": patch
"livekit-plugins-playht": patch
"livekit-plugins-silero": patch
"livekit-plugins-azure": patch
"livekit-agents": patch
---

pipelineagent: expose timing metrics & api errors wip
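The new timing metrics are delivered through the agent's "metrics_collected" event (see the emit call in pipeline_agent.py below). A minimal consumer sketch, not part of this commit, assuming an already-constructed VoicePipelineAgent instance named `agent`:

    def on_metrics_collected(metrics: dict) -> None:
        # Field names come from PipelineEOUMetrics in livekit/agents/pipeline/metrics.py in this diff.
        if metrics.get("type") == "eou_metrics":
            print(
                f"turn {metrics['sequence_id']}: "
                f"end_of_utterance_delay={metrics['end_of_utterance_delay']:.3f}s, "
                f"transcription_delay={metrics['transcription_delay']:.3f}s"
            )

    agent.on("metrics_collected", on_metrics_collected)  # `agent` is assumed to exist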
5 changes: 5 additions & 0 deletions .changeset/serious-worms-tap.md
@@ -0,0 +1,5 @@
---
"livekit-agents": patch
---

Fix stack dump on room shutdown
5 changes: 5 additions & 0 deletions livekit-agents/livekit/agents/ipc/job_main.py
@@ -7,6 +7,7 @@
import pickle
import queue
import socket
import sys
import threading
from dataclasses import dataclass
from typing import Any, Callable, Optional
@@ -47,6 +48,10 @@ def _forward_logs(self):

def emit(self, record: logging.LogRecord) -> None:
try:
# Check if Python is shutting down
if sys.is_finalizing():
return

# from https://github.com/python/cpython/blob/91b7f2e7f6593acefda4fa860250dd87d6f849bf/Lib/logging/handlers.py#L1453
msg = self.format(record)
record = copy.copy(record)
16 changes: 13 additions & 3 deletions livekit-agents/livekit/agents/pipeline/metrics.py
@@ -25,21 +25,31 @@ class PipelineSTTMetrics(STTMetrics, TypedDict):
class PipelineEOUMetrics(TypedDict):
type: Literal["eou_metrics"]
sequence_id: str
"""Unique identifier shared across different metrics to combine related STT, LLM, and TTS metrics."""

timestamp: float
duration: float
"""Timestamp of when the event was recorded."""

end_of_utterance_delay: float
"""Amount of time between the end of speech from VAD and the decision to end the user's turn."""

transcription_delay: float
"""time it took to get the transcript after the end of the user's speech
could be 0 if the transcript was already available"""
"""Time taken to obtain the transcript after the end of the user's speech.
May be 0 if the transcript was already available.
"""


class PipelineLLMMetrics(LLMMetrics, TypedDict):
type: Literal["llm_metrics"]
sequence_id: str
"""Unique identifier shared across different metrics to combine related STT, LLM, and TTS metrics."""


class PipelineTTSMetrics(TTSMetrics, TypedDict):
type: Literal["tts_metrics"]
sequence_id: str
"""Unique identifier shared across different metrics to combine related STT, LLM, and TTS metrics."""


class PipelineVADMetrics(VADMetrics, TypedDict):
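Every payload above carries the same sequence_id for one agent turn, so related STT, LLM, TTS, and EOU metrics can be grouped after the fact. A minimal sketch, not part of this commit, assuming the metrics arrive as the plain dicts defined above:

    from collections import defaultdict

    turns: dict[str, list[dict]] = defaultdict(list)

    def group_by_turn(metrics: dict) -> None:
        # Collect all metrics that share a sequence_id, i.e. belong to one agent turn.
        seq = metrics.get("sequence_id")
        if seq is not None:
            turns[seq].append(metrics)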
2 changes: 1 addition & 1 deletion livekit-agents/livekit/agents/pipeline/pipeline_agent.py
@@ -935,7 +935,7 @@ def _validate_reply_if_possible(self) -> None:
"type": "eou_metrics",
"timestamp": time.time(),
"sequence_id": self._pending_agent_reply.id,
"duration": time_since_last_speech,
"end_of_utterance_delay": time_since_last_speech,
"transcription_delay": transcription_delay,
}
self.emit("metrics_collected", eou_metrics)
2 changes: 1 addition & 1 deletion livekit-agents/livekit/agents/stt/stt.py
@@ -13,7 +13,7 @@


class STTMetrics(TypedDict):
request_id: str # can be empty when just emitting usage (in streaming mode)
request_id: str # can be empty when just emitting usage (in streaming mode)
timestamp: float
duration: float
label: str
12 changes: 6 additions & 6 deletions livekit-agents/livekit/agents/vad.py
@@ -14,7 +14,7 @@

class VADMetrics(TypedDict):
timestamp: float
inference_duration_avg: float
inference_duration_total: float
inference_count: int
label: str

@@ -114,24 +114,24 @@ async def _main_task(self) -> None: ...
async def _metrics_monitor_task(self, event_aiter: AsyncIterable[VADEvent]) -> None:
"""Task used to collect metrics"""

inference_duration_sum = 0.0
inference_duration_total = 0.0
inference_count = 0

async for ev in event_aiter:
if ev.type == VADEventType.INFERENCE_DONE:
inference_duration_sum += ev.inference_duration
inference_duration_total += ev.inference_duration
inference_count += 1

if inference_count >= 1 / self._vad.capabilities.update_interval:
vad_metrics: VADMetrics = {
"timestamp": time.time(),
"inference_duration_avg": inference_duration_sum
/ inference_count,
"inference_duration_total": inference_duration_total,
"inference_count": inference_count,
"label": self._vad._label,
}
self._vad.emit("metrics_collected", vad_metrics)
inference_duration_sum = 0.0

inference_duration_total = 0.0
inference_count = 0

def push_frame(self, frame: rtc.AudioFrame) -> None:
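Since VADMetrics now reports a running total instead of a pre-computed average, consumers that still want the mean can derive it from the two fields emitted above. A minimal sketch, assuming a VADMetrics dict named `m`:

    def avg_inference_duration(m: dict) -> float:
        # Per-inference average, guarded against a zero count.
        return m["inference_duration_total"] / max(m["inference_count"], 1)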
@@ -473,7 +473,7 @@ void DevRenderer::Run() {
}

// Render the browser tex
ImGui::Image((void*)(intptr_t)data.texture_id,
ImGui::Image((ImTextureID)(intptr_t)data.texture_id,
ImVec2((float)data.view_width, (float)data.view_height));
}
ImGui::End();
@@ -70,6 +70,10 @@
"gemma2-9b-it",
]

GroqAudioModels = Literal[
"whisper-large-v3", "distil-whisper-large-v3-en", "whisper-large-v3-turbo"
]

DeepSeekChatModels = Literal[
"deepseek-coder",
"deepseek-chat",
@@ -16,6 +16,7 @@

import dataclasses
import io
import os
import wave
from dataclasses import dataclass

@@ -26,14 +27,14 @@

import openai

from .models import WhisperModels
from .models import GroqAudioModels, WhisperModels


@dataclass
class _STTOptions:
language: str
detect_language: bool
model: WhisperModels
model: WhisperModels | str


class STT(stt.STT):
@@ -42,7 +43,7 @@ def __init__(
*,
language: str = "en",
detect_language: bool = False,
model: WhisperModels = "whisper-1",
model: WhisperModels | str = "whisper-1",
base_url: str | None = None,
api_key: str | None = None,
client: openai.AsyncClient | None = None,
@@ -80,6 +81,38 @@ def __init__(
),
)

@staticmethod
def with_groq(
*,
model: GroqAudioModels | str = "whisper-large-v3-turbo",
api_key: str | None = None,
base_url: str | None = "https://api.groq.com/openai/v1",
client: openai.AsyncClient | None = None,
language: str = "en",
detect_language: bool = False,
) -> STT:
"""
Create a new instance of Groq STT.
``api_key`` must be set to your Groq API key, either using the argument or by setting
the ``GROQ_API_KEY`` environmental variable.
"""

# Use environment variable if API key is not provided
api_key = api_key or os.environ.get("GROQ_API_KEY")
if api_key is None:
raise ValueError("Groq API key is required")

# Instantiate and return a configured STT instance
return STT(
model=model,
api_key=api_key,
base_url=base_url,
client=client,
language=language,
detect_language=detect_language,
)

def _sanitize_options(self, *, language: str | None = None) -> _STTOptions:
config = dataclasses.replace(self._opts)
config.language = language or config.language
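A minimal usage sketch for the new Groq constructor shown above, not part of this commit; it assumes GROQ_API_KEY is exported and uses the usual livekit-plugins import path:

    import os

    from livekit.plugins import openai

    assert os.environ.get("GROQ_API_KEY"), "GROQ_API_KEY must be set"

    # Defaults mirror the diff above: Groq's OpenAI-compatible base_url and whisper-large-v3-turbo.
    groq_stt = openai.STT.with_groq(model="whisper-large-v3-turbo", language="en")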
19 changes: 9 additions & 10 deletions tests/test_llm.py
@@ -1,7 +1,6 @@
from __future__ import annotations

import asyncio
import uuid
from enum import Enum
from typing import Annotated, Callable, Optional

@@ -90,15 +89,15 @@ def test_hashable_typeinfo():

LLMS: list[Callable[[], llm.LLM]] = [
lambda: openai.LLM(),
lambda: openai.beta.AssistantLLM(
assistant_opts=openai.beta.AssistantOptions(
create_options=openai.beta.AssistantCreateOptions(
name=f"test-{uuid.uuid4()}",
instructions="You are a basic assistant",
model="gpt-4o",
)
)
),
# lambda: openai.beta.AssistantLLM(
# assistant_opts=openai.beta.AssistantOptions(
# create_options=openai.beta.AssistantCreateOptions(
# name=f"test-{uuid.uuid4()}",
# instructions="You are a basic assistant",
# model="gpt-4o",
# )
# )
# ),
# anthropic.LLM(),
]

