From 3c1921e400c954fe79ce7d332e06313ea4f396c3 Mon Sep 17 00:00:00 2001
From: Wing Lian <wing.lian@gmail.com>
Date: Thu, 9 Jan 2025 15:59:54 -0500
Subject: [PATCH] add HF Hub cache for GHA (#2247)

* add HF Hub cache for GHA

* use a Modal volume to cache HF data

* key the GHA cache on conftest.py so it is refreshed as we add new fixtures
---
 .github/workflows/tests.yml | 36 ++++++++++++++++++++++++++++++++++++
 cicd/Dockerfile.jinja       |  1 +
 cicd/multigpu.py            |  8 ++++++++
 cicd/tests.py               |  8 ++++++++
 4 files changed, 53 insertions(+)

diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 39622e3905..6af794b168 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -61,6 +61,15 @@ jobs:
       - name: Check out repository code
         uses: actions/checkout@v4
 
+      - name: Restore HF cache
+        id: hf-cache-restore
+        uses: actions/cache/restore@v4
+        with:
+          path: |
+            /home/runner/.cache/huggingface/hub/datasets--*
+            /home/runner/.cache/huggingface/hub/models--*
+          key: ${{ runner.os }}-hf-hub-cache-${{ hashFiles('**/conftest.py') }}
+
       - name: Setup Python
         uses: actions/setup-python@v5
         with:
@@ -101,6 +110,15 @@ jobs:
         run: |
           find "$(pip cache dir)/http-v2" -type f -mtime +14 -exec rm {} \;
 
+      - name: Save HF cache
+        id: hf-cache
+        uses: actions/cache/save@v4
+        with:
+          path: |
+            /home/runner/.cache/huggingface/hub/datasets--*
+            /home/runner/.cache/huggingface/hub/models--*
+          key: ${{ steps.hf-cache-restore.outputs.cache-primary-key }}
+
   pytest-sdist:
     name: PyTest from Source Dist
     runs-on: ubuntu-latest
@@ -116,6 +134,15 @@ jobs:
       - name: Check out repository code
         uses: actions/checkout@v4
 
+      - name: Restore HF cache
+        id: hf-cache-restore
+        uses: actions/cache/restore@v4
+        with:
+          path: |
+            /home/runner/.cache/huggingface/hub/datasets--*
+            /home/runner/.cache/huggingface/hub/models--*
+          key: ${{ runner.os }}-hf-hub-cache-${{ hashFiles('**/conftest.py') }}
+
       - name: Setup Python
         uses: actions/setup-python@v5
         with:
@@ -157,6 +184,15 @@ jobs:
         run: |
           find "$(pip cache dir)/http-v2" -type f -mtime +14 -exec rm {} \;
 
+      - name: Save HF cache
+        id: hf-cache
+        uses: actions/cache/save@v4
+        with:
+          path: |
+            /home/runner/.cache/huggingface/hub/datasets--*
+            /home/runner/.cache/huggingface/hub/models--*
+          key: ${{ steps.hf-cache-restore.outputs.cache-primary-key }}
+
   docker-e2e-tests-1st:
     if: ${{ ! contains(github.event.commits[0].message, '[skip e2e]') && github.repository_owner == 'axolotl-ai-cloud' }}
     # this job needs to be run on self-hosted GPU runners...
diff --git a/cicd/Dockerfile.jinja b/cicd/Dockerfile.jinja
index ed64664166..641bd90b6a 100644
--- a/cicd/Dockerfile.jinja
+++ b/cicd/Dockerfile.jinja
@@ -8,6 +8,7 @@ ENV PYTORCH_VERSION="{{ PYTORCH_VERSION }}"
 ENV GITHUB_REF="{{ GITHUB_REF }}"
 ENV GITHUB_SHA="{{ GITHUB_SHA }}"
 ENV NIGHTLY_BUILD="{{ NIGHTLY_BUILD }}"
+ENV HF_HOME="{{ HF_HOME }}"
 
 RUN apt-get update && \
     apt-get install -y --allow-change-held-packages vim curl nano libnccl2 libnccl-dev
diff --git a/cicd/multigpu.py b/cicd/multigpu.py
index 0ea4c8cc11..f9bad386a3 100644
--- a/cicd/multigpu.py
+++ b/cicd/multigpu.py
@@ -28,6 +28,7 @@
     "CUDA": os.environ.get("CUDA", "121"),
     "GITHUB_REF": os.environ.get("GITHUB_REF", "refs/heads/main"),
     "GITHUB_SHA": os.environ.get("GITHUB_SHA", ""),
+    "HF_HOME": "/workspace/data/huggingface-cache/hub",
 }
 
 dockerfile_contents = df_template.render(**df_args)
@@ -48,6 +49,12 @@
 
 app = App("Axolotl CI/CD", secrets=[])
 
+hf_cache_volume = modal.Volume.from_name(
+    "axolotl-ci-hf-hub-cache", create_if_missing=True
+)
+VOLUME_CONFIG = {
+    "/workspace/data/huggingface-cache/hub": hf_cache_volume,
+}
 
 N_GPUS = int(os.environ.get("N_GPUS", 2))
 GPU_CONFIG = modal.gpu.H100(count=N_GPUS)
@@ -67,6 +74,7 @@ def run_cmd(cmd: str, run_folder: str):
     timeout=60 * 60,
     cpu=8.0,
     memory=131072 * N_GPUS,
+    volumes=VOLUME_CONFIG,
 )
 def cicd_pytest():
     run_cmd("./cicd/multigpu.sh", "/workspace/axolotl")
diff --git a/cicd/tests.py b/cicd/tests.py
index f3dbaef105..d7ae5b5e8c 100644
--- a/cicd/tests.py
+++ b/cicd/tests.py
@@ -29,6 +29,7 @@
     "GITHUB_REF": os.environ.get("GITHUB_REF", "refs/heads/main"),
     "GITHUB_SHA": os.environ.get("GITHUB_SHA", ""),
     "NIGHTLY_BUILD": os.environ.get("NIGHTLY_BUILD", ""),
+    "HF_HOME": "/workspace/data/huggingface-cache/hub",
 }
 
 dockerfile_contents = df_template.render(**df_args)
@@ -50,6 +51,12 @@
 
 app = App("Axolotl CI/CD", secrets=[])
 
+hf_cache_volume = modal.Volume.from_name(
+    "axolotl-ci-hf-hub-cache", create_if_missing=True
+)
+VOLUME_CONFIG = {
+    "/workspace/data/huggingface-cache/hub": hf_cache_volume,
+}
 
 N_GPUS = int(os.environ.get("N_GPUS", 1))
 GPU_CONFIG = modal.gpu.A10G(count=N_GPUS)
@@ -69,6 +76,7 @@ def run_cmd(cmd: str, run_folder: str):
     timeout=60 * 60,
     cpu=8.0,
     memory=131072,
+    volumes=VOLUME_CONFIG,
 )
 def cicd_pytest():
     run_cmd("./cicd/cicd.sh", "/workspace/axolotl")