Merge branch 'livekit:main' into main
tinalenguyen authored Dec 31, 2024
2 parents 10c17ab + 1ab8d88 commit b718710
Showing 135 changed files with 2,880 additions and 811 deletions.
5 changes: 0 additions & 5 deletions .changeset/famous-points-tickle.md

This file was deleted.

9 changes: 0 additions & 9 deletions .changeset/great-lizards-pump.md

This file was deleted.

5 changes: 0 additions & 5 deletions .changeset/strange-snakes-hug.md

This file was deleted.

9 changes: 0 additions & 9 deletions .changeset/tiny-papayas-film.md

This file was deleted.

5 changes: 0 additions & 5 deletions .changeset/warm-pillows-grow.md

This file was deleted.

1 change: 1 addition & 0 deletions .github/workflows/ci.yml
@@ -80,4 +80,5 @@ jobs:
-p livekit.plugins.azure \
-p livekit.plugins.anthropic \
-p livekit.plugins.fal \
-p livekit.plugins.playai \
-p livekit.plugins.assemblyai
10 changes: 5 additions & 5 deletions .github/workflows/publish-package.yml
@@ -26,8 +26,7 @@ jobs:
with:
submodules: true
lfs: true
env:
GITHUB_TOKEN: ${{ secrets.CHANGESETS_PUSH_PAT }}
ssh-key: ${{ secrets.CHANGESETS_PUSH_DEPLOY_KEY }}

- uses: pnpm/action-setup@v4
- name: Use Node.js 20
@@ -84,7 +83,7 @@ jobs:
uses: livekit/agents/.github/workflows/build-package.yml@main
with:
package: ${{ matrix.package.name }}
artifact_name: python-package-distributions
artifact_name: python-package-dist-${{matrix.package.name}}

publish:
needs:
@@ -98,8 +97,9 @@ jobs:
- name: Download all the dists
uses: actions/download-artifact@v4
with:
name: python-package-distributions
path: dist/
path: dist
pattern: python-package-dist-*
merge-multiple: true

- name: Publish package
uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29
35 changes: 12 additions & 23 deletions .github/workflows/tests.yml
@@ -18,15 +18,15 @@ jobs:
strategy:
fail-fast: false
matrix:
os:
[
macos-14-large,
os: [
# disabled Intel Macs due to pytorch 2.3+ not supporting it
# macos-14-large,
macos-14,
windows-2019,
ubuntu-20.04,
namespace-profile-default-arm64,
]
python_version: ["3.12"]
python_version: ["3.9", "3.12"]
test_group: ["base"]
include:
# Include llm, stt, and tts tests only on Ubuntu 20.04 with Python 3.9
@@ -60,11 +60,8 @@ jobs:
${{ runner.os }}-cache
- uses: actions/setup-python@v5
# brew will install python as part of ffmpeg install on MacOS
# installing system Python could cause a conflict with `Could not symlink bin/idle3`
if: ${{ matrix.os != 'macos-14-large' }}
with:
python-version: "3.12"
python-version: ${{ matrix.python_version }}
cache: "pip"

- name: Install ffmpeg (Linux)
@@ -80,6 +77,7 @@
sudo dpkg -i libssl1.1_1.1.1-1ubuntu2.1_arm64.deb
sudo dpkg -i libssl-dev_1.1.1-1ubuntu2.1_arm64.deb
- name: Install ffmpeg (macOS)
if: ${{ startsWith(matrix.os, 'macos') }}
run: brew install ffmpeg
@@ -91,20 +89,9 @@ jobs:
- name: Install packages
shell: bash
run: |
pip3 install pytest pytest-asyncio pytest-timeout './livekit-agents[codecs]' psutil
pip3 install -r ./tests/test-requirements.txt
pip3 install ./livekit-agents \
./livekit-plugins/livekit-plugins-openai \
./livekit-plugins/livekit-plugins-deepgram \
./livekit-plugins/livekit-plugins-google \
./livekit-plugins/livekit-plugins-nltk \
./livekit-plugins/livekit-plugins-silero \
./livekit-plugins/livekit-plugins-elevenlabs \
./livekit-plugins/livekit-plugins-cartesia \
./livekit-plugins/livekit-plugins-azure \
./livekit-plugins/livekit-plugins-anthropic \
./livekit-plugins/livekit-plugins-assemblyai \
./livekit-plugins/livekit-plugins-fal
pip install pytest pytest-asyncio pytest-timeout './livekit-agents[codecs]' psutil
pip install -r ./tests/test-requirements.txt
./livekit-plugins/install_local.sh
- name: Run tests
shell: bash
@@ -123,6 +110,8 @@ jobs:
GROQ_API_KEY: ${{ secrets.GROQ_API_KEY }}
ASSEMBLYAI_API_KEY: ${{ secrets.ASSEMBLYAI_API_KEY }}
FAL_KEY: ${{ secrets.FAL_KEY }}
PLAYHT_API_KEY: ${{ secrets.PLAYHT_API_KEY }}
PLAYHT_USER_ID: ${{ secrets.PLAYHT_USER_ID }}
GOOGLE_APPLICATION_CREDENTIALS: google.json
PYTEST_ADDOPTS: "--color=yes"
working-directory: tests
@@ -131,7 +120,7 @@ jobs:
case "${{ matrix.test_group }}" in
base)
test_files="test_aio.py test_tokenizer.py test_vad.py test_ipc.py test_tts_fallback.py test_stt_fallback.py test_message_change.py"
test_files="test_aio.py test_tokenizer.py test_vad.py test_ipc.py test_tts_fallback.py test_stt_fallback.py test_message_change.py test_build_func_desc.py test_create_func.py"
;;
llm)
test_files="test_llm.py"
2 changes: 1 addition & 1 deletion README.md
@@ -122,7 +122,7 @@ Documentation on the framework and how to use it can be found [here](https://doc
| Voice agent using the new OpenAI Realtime API | [demo](https://playground.livekit.io) | [code](https://github.com/livekit-examples/realtime-playground) |
| Super fast voice agent using Cerebras hosted Llama 3.1 | [demo](https://cerebras.vercel.app) | [code](https://github.com/dsa/fast-voice-assistant/) |
| Voice agent using Cartesia's Sonic model | [demo](https://cartesia-assistant.vercel.app/) | [code](https://github.com/livekit-examples/cartesia-voice-agent) |
| Agent that looks up the current weather via function call | N/A | [code](https://github.com/livekit-examples/cartesia-voice-agent) |
| Agent that looks up the current weather via function call | N/A | [code](https://github.com/livekit/agents/blob/main/examples/voice-pipeline-agent/function_calling_weather.py) |
| Voice Agent using Gemini 2.0 Flash | N/A | [code](https://github.com/livekit-examples/voice-pipeline-agent/gemini_voice_agent.py) |
| Voice agent with custom turn-detection model | N/A | [code](https://github.com/livekit/agents/blob/main/examples/voice-pipeline-agent/turn_detector.py) |
| Voice agent that performs a RAG-based lookup | N/A | [code](https://github.com/livekit/agents/tree/main/examples/voice-pipeline-agent/simple-rag) |
41 changes: 41 additions & 0 deletions examples/hive-moderation-agent/README.md
@@ -0,0 +1,41 @@
# LiveKit realtime moderation agent using Hive

This agent performs visual moderation of every participant's video in a room. Moderation is done with the Visual Content Moderation model from [Hive](https://thehive.ai) [[docs](https://docs.thehive.ai/docs/visual-content-moderation#visual-content-moderation)].

## Prerequisites

Before running this agent, you'll need:

1. A LiveKit Cloud project (or a self-hosted LiveKit server).
2. An API key from Hive to access the above-mentioned model.

## Configuration

Currently, this agent is configured entirely through constants in the `agent.py` source code and through environment variables.

### Environment Variables

| configuration | description | example value |
|---------------|-------------|---------------|
| `LIVEKIT_URL` | Your LiveKit URL | `wss://test-abc123de.livekit.cloud` |
| `LIVEKIT_API_KEY` | Your LiveKit API key | |
| `LIVEKIT_API_SECRET` | Your LiveKit API secret | |
| `HIVE_API_KEY` | The API key from Hive to access the `Visual Content Moderation` model | `abc1deFgHIjK23KLMNOp45QrsTuv6wx8` |
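
For local development, these can live in a `.env` file next to `agent.py` (the agent calls `load_dotenv()` on startup). The values below are placeholders:

```
LIVEKIT_URL=wss://test-abc123de.livekit.cloud
LIVEKIT_API_KEY=<your LiveKit API key>
LIVEKIT_API_SECRET=<your LiveKit API secret>
HIVE_API_KEY=<your Hive API key>
```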

### Code

| configuration | description | example value |
|---------------|-------------|---------------|
| `MOD_FRAME_INTERVAL` | Minimum number of seconds to wait between frames | 5.0 |
| `HIVE_HEADERS` | The headers to send with every request to the Hive API | `{}` |
| `CONFIDENCE_THRESHOLD` | The minimum score Hive's moderation class must meet before it is considered a problem | 0.9 |
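
These are plain module-level constants near the top of `agent.py`, so tuning them is an edit-and-restart change. The snippet below mirrors how they appear in the example source (comments added here for orientation):

```
# top of agent.py -- tune moderation behavior here
MOD_FRAME_INTERVAL = 5.0      # seconds to wait between moderated frames
HIVE_HEADERS = {
    "Authorization": f"Token {os.getenv('HIVE_API_KEY')}",
    "accept": "application/json",
}
CONFIDENCE_THRESHOLD = 0.9    # minimum Hive score treated as a positive detection
```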

## Running

Run this code like you would any other [LiveKit agent](https://docs.livekit.io/agents/build/anatomy/#starting-the-worker):

```
python3 agent.py start
```

Once running, the agent will join all new LiveKit rooms by default and begin moderation.
163 changes: 163 additions & 0 deletions examples/hive-moderation-agent/agent.py
@@ -0,0 +1,163 @@
"""
LiveKit agent that connects to a room and performs visual moderation on the video
of all participants using the Visual Content Moderation model from Hive
(https://docs.thehive.ai/docs/visual-content-moderation#visual-content-moderation).
The agent periodically sends a frame from the participant's video to Hive's API
for a moderation check. If the results of that check show a confidence score
of 0.9 or higher for any of the positive classes, it logs the result and adds a
message to the room's chat. This can easily be extended to take additional
actions like removing a participant or ending a livestream, etc.
"""

import asyncio
import logging
import os
import time
from io import BytesIO

import aiohttp
from dotenv import load_dotenv
from hive_data_classes import HiveResponse, from_dict
from livekit import agents, rtc
from PIL import Image

load_dotenv()

MOD_FRAME_INTERVAL = 5.0 # check 1 frame every 5 seconds
"""
How often to check a frame (in seconds)
"""

HIVE_HEADERS = {
"Authorization": f"Token {os.getenv('HIVE_API_KEY')}",
"accept": "application/json",
}
"""
The default headers included with every request to thehive.ai
"""

CONFIDENCE_THRESHOLD = 0.9
"""
The threshold level for scores returned by thehive.ai. See details in this doc:
https://docs.thehive.ai/docs/visual-content-moderation#choosing-thresholds-for-visual-moderation
"""


logger = logging.getLogger("hive-moderation-agent")
logger.setLevel(logging.INFO)


async def request_fnc(req: agents.JobRequest):
"""
The request handler for the agent. We use this to set the name of the
agent that is displayed to users
"""
# accept the job request and name the agent participant so users know what this is
await req.accept(
name="Moderator",
identity="hive-moderator",
)


async def entrypoint(ctx: agents.JobContext):
"""
The entrypoint of the agent. This is called every time the moderator
agent joins a room.
"""

# connect to the room and automatically subscribe to all participants' video
await ctx.connect(auto_subscribe=agents.AutoSubscribe.VIDEO_ONLY)
chat = rtc.ChatManager(ctx.room)

@ctx.room.on("track_subscribed")
def on_track_subscribed(
track: rtc.Track,
_publication: rtc.TrackPublication,
participant: rtc.RemoteParticipant,
):
"""
Event handler for video tracks. We automatically subscribe to all video
tracks when a participant joins the room. This event is triggered
once we have completed subscription to that video track.
This creates a background task to process frames from each track.
"""
asyncio.create_task(process_track(participant, track))

async def process_track(participant: rtc.RemoteParticipant, track: rtc.VideoTrack):
"""
This function is running in a background task once for each video track
(i.e., once for each participant). It handles processing a frame
from the video once every MOD_FRAME_INTERVAL seconds.
"""

video_stream = rtc.VideoStream(track)
last_processed_time = 0
async for frame in video_stream:
current_time = time.time()
if (current_time - last_processed_time) >= MOD_FRAME_INTERVAL:
last_processed_time = current_time
await check_frame(participant, frame)

async def check_frame(participant: rtc.RemoteParticipant, frame: rtc.VideoFrame):
"""
Uses thehive.ai API to check the frame for any classifications we care about
"""

# get the current frame and convert to png format
argb_frame = frame.frame.convert(rtc.VideoBufferType.RGBA)
image = Image.frombytes(
"RGBA", (argb_frame.width, argb_frame.height), argb_frame.data
)
buffer = BytesIO()
image.save(buffer, format="PNG")
buffer.seek(0) # reset buffer position to beginning after writing

data = aiohttp.FormData()
data.add_field("image", buffer, filename="image.png", content_type="image/png")

# submit the image to Hive
logger.info("submitting image to hive")
async with aiohttp.ClientSession() as session:
async with session.post(
"https://api.thehive.ai/api/v2/task/sync",
headers=HIVE_HEADERS,
data=data,
) as response:
response.raise_for_status()
response_dict = await response.json()
hive_response: HiveResponse = from_dict(HiveResponse, response_dict)
if (
hive_response.code == 200
and len(hive_response.status) > 0
and len(hive_response.status[0].response.output) > 0
):
results = hive_response.status[0].response.output[0].classes
# filter to anything with a confidence score > threshold
for mod_class in results:
if mod_class.class_[0:4] == "yes_":
# TODO: should also include "general_nsfw" class
if mod_class.score >= CONFIDENCE_THRESHOLD:
class_name = mod_class.class_[4:]
message = (
'FOUND %s for participant "%s" (confidence score: %0.3f)'
% (
class_name,
participant.identity,
mod_class.score,
)
)
logger.info(message)
await chat.send_message(message)

await ctx.wait_for_participant()
await chat.send_message(
"I'm a moderation agent,"
"I will detect and notify you of all inappropriate material in your video stream"
)


if __name__ == "__main__":
logging.basicConfig(level=logging.INFO)

agents.cli.run_app(agents.WorkerOptions(entrypoint, request_fnc=request_fnc))
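
The `hive_data_classes` module imported at the top of `agent.py` is not included in this diff. Purely as a reading aid, the sketch below shows one plausible shape for it, inferred from how `agent.py` accesses the parsed response (`code`, `status[0].response.output[0].classes`, `class_`, `score`); the class and helper definitions are assumptions, not the actual module.

```
# hypothetical hive_data_classes.py, inferred from usage in agent.py
from dataclasses import dataclass, fields, is_dataclass
from typing import List


@dataclass
class ModClass:
    class_: str   # e.g. "yes_nsfw"; Hive's "class" key renamed to avoid the Python keyword
    score: float


@dataclass
class ModOutput:
    classes: List[ModClass]


@dataclass
class ModResponse:
    output: List[ModOutput]


@dataclass
class Status:
    response: ModResponse


@dataclass
class HiveResponse:
    code: int
    status: List[Status]


def from_dict(cls, data):
    """Recursively build a dataclass instance from a plain dict (simplified)."""
    if not is_dataclass(cls) or not isinstance(data, dict):
        return data
    kwargs = {}
    for f in fields(cls):
        key = "class" if f.name == "class_" else f.name
        value = data.get(key)
        item_type = getattr(f.type, "__args__", [None])[0]
        if isinstance(value, list) and item_type is not None:
            value = [from_dict(item_type, v) for v in value]
        else:
            value = from_dict(f.type, value)
        kwargs[f.name] = value
    return cls(**kwargs)
```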