From 3c1921e400c954fe79ce7d332e06313ea4f396c3 Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Thu, 9 Jan 2025 15:59:54 -0500 Subject: [PATCH] add hf cache caching for GHA (#2247) * add hf cache caching for GHA * use modal volume to cache hf data * make sure to update the cache as we add new fixtures in conftest --- .github/workflows/tests.yml | 36 ++++++++++++++++++++++++++++++++++++ cicd/Dockerfile.jinja | 1 + cicd/multigpu.py | 8 ++++++++ cicd/tests.py | 8 ++++++++ 4 files changed, 53 insertions(+) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 39622e3905..6af794b168 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -61,6 +61,15 @@ jobs: - name: Check out repository code uses: actions/checkout@v4 + - name: Restore HF cache + id: hf-cache-restore + uses: actions/cache/restore@v4 + with: + path: | + /home/runner/.cache/huggingface/hub/datasets--* + /home/runner/.cache/huggingface/hub/models--* + key: ${{ runner.os }}-hf-hub-cache-${{ hashFiles('**/conftest.py') }} + - name: Setup Python uses: actions/setup-python@v5 with: @@ -101,6 +110,15 @@ jobs: run: | find "$(pip cache dir)/http-v2" -type f -mtime +14 -exec rm {} \; + - name: Save HF cache + id: hf-cache + uses: actions/cache/save@v4 + with: + path: | + /home/runner/.cache/huggingface/hub/datasets--* + /home/runner/.cache/huggingface/hub/models--* + key: ${{ steps.hf-cache-restore.outputs.cache-primary-key }} + pytest-sdist: name: PyTest from Source Dist runs-on: ubuntu-latest @@ -116,6 +134,15 @@ jobs: - name: Check out repository code uses: actions/checkout@v4 + - name: Restore HF cache + id: hf-cache-restore + uses: actions/cache/restore@v4 + with: + path: | + /home/runner/.cache/huggingface/hub/datasets--* + /home/runner/.cache/huggingface/hub/models--* + key: ${{ runner.os }}-hf-hub-cache-${{ hashFiles('**/conftest.py') }} + - name: Setup Python uses: actions/setup-python@v5 with: @@ -157,6 +184,15 @@ jobs: run: | find "$(pip cache dir)/http-v2" -type f -mtime +14 -exec rm {} \; + - name: Save HF cache + id: hf-cache + uses: actions/cache/save@v4 + with: + path: | + /home/runner/.cache/huggingface/hub/datasets--* + /home/runner/.cache/huggingface/hub/models--* + key: ${{ steps.hf-cache-restore.outputs.cache-primary-key }} + docker-e2e-tests-1st: if: ${{ ! contains(github.event.commits[0].message, '[skip e2e]') && github.repository_owner == 'axolotl-ai-cloud' }} # this job needs to be run on self-hosted GPU runners... diff --git a/cicd/Dockerfile.jinja b/cicd/Dockerfile.jinja index ed64664166..641bd90b6a 100644 --- a/cicd/Dockerfile.jinja +++ b/cicd/Dockerfile.jinja @@ -8,6 +8,7 @@ ENV PYTORCH_VERSION="{{ PYTORCH_VERSION }}" ENV GITHUB_REF="{{ GITHUB_REF }}" ENV GITHUB_SHA="{{ GITHUB_SHA }}" ENV NIGHTLY_BUILD="{{ NIGHTLY_BUILD }}" +ENV HF_HOME="{{ HF_HOME }}" RUN apt-get update && \ apt-get install -y --allow-change-held-packages vim curl nano libnccl2 libnccl-dev diff --git a/cicd/multigpu.py b/cicd/multigpu.py index 0ea4c8cc11..f9bad386a3 100644 --- a/cicd/multigpu.py +++ b/cicd/multigpu.py @@ -28,6 +28,7 @@ "CUDA": os.environ.get("CUDA", "121"), "GITHUB_REF": os.environ.get("GITHUB_REF", "refs/heads/main"), "GITHUB_SHA": os.environ.get("GITHUB_SHA", ""), + "HF_HOME": "/workspace/data/huggingface-cache/hub", } dockerfile_contents = df_template.render(**df_args) @@ -48,6 +49,12 @@ app = App("Axolotl CI/CD", secrets=[]) +hf_cache_volume = modal.Volume.from_name( + "axolotl-ci-hf-hub-cache", create_if_missing=True +) +VOLUME_CONFIG = { + "/workspace/data/huggingface-cache/hub": hf_cache_volume, +} N_GPUS = int(os.environ.get("N_GPUS", 2)) GPU_CONFIG = modal.gpu.H100(count=N_GPUS) @@ -67,6 +74,7 @@ def run_cmd(cmd: str, run_folder: str): timeout=60 * 60, cpu=8.0, memory=131072 * N_GPUS, + volumes=VOLUME_CONFIG, ) def cicd_pytest(): run_cmd("./cicd/multigpu.sh", "/workspace/axolotl") diff --git a/cicd/tests.py b/cicd/tests.py index f3dbaef105..d7ae5b5e8c 100644 --- a/cicd/tests.py +++ b/cicd/tests.py @@ -29,6 +29,7 @@ "GITHUB_REF": os.environ.get("GITHUB_REF", "refs/heads/main"), "GITHUB_SHA": os.environ.get("GITHUB_SHA", ""), "NIGHTLY_BUILD": os.environ.get("NIGHTLY_BUILD", ""), + "HF_HOME": "/workspace/data/huggingface-cache/hub", } dockerfile_contents = df_template.render(**df_args) @@ -50,6 +51,12 @@ app = App("Axolotl CI/CD", secrets=[]) +hf_cache_volume = modal.Volume.from_name( + "axolotl-ci-hf-hub-cache", create_if_missing=True +) +VOLUME_CONFIG = { + "/workspace/data/huggingface-cache/hub": hf_cache_volume, +} N_GPUS = int(os.environ.get("N_GPUS", 1)) GPU_CONFIG = modal.gpu.A10G(count=N_GPUS) @@ -69,6 +76,7 @@ def run_cmd(cmd: str, run_folder: str): timeout=60 * 60, cpu=8.0, memory=131072, + volumes=VOLUME_CONFIG, ) def cicd_pytest(): run_cmd("./cicd/cicd.sh", "/workspace/axolotl")