[Feature] Add daily test case (#864)

* add daily test case
* Update pr-run-test.yml
* Update daily-run-test.yml
* Update daily-run-test.yml
* Update pr-run-test.yml

---------

Co-authored-by: zhulin1 <[email protected]>
open-compass · Feb 2, 2024 · 0919b08 · 0919b08
1 parent 4c87e77
commit 0919b08
Show file tree

Hide file tree

Showing 4 changed files with 193 additions and 13 deletions.
diff --git a/.github/scripts/oc_score_assert.py b/.github/scripts/oc_score_assert.py
@@ -0,0 +1,93 @@
+import csv
+import os
+
+import pytest
+import yaml
+
+# Directory that is searched recursively for the result CSV of the daily run.
+output_path = 'regression_result_daily'
+
+# Models and datasets whose scores are checked; every (model, dataset) pair
+# becomes one parametrized test case in TestChat.
+model_list = ['internlm-7b-hf', 'internlm-chat-7b-hf']
+dataset_list = ['ARC-c', 'chid-dev', 'chid-test', 'openai_humaneval']
+
+
+@pytest.fixture()
+def baseline_scores(request):
+    config_path = os.path.join(request.config.rootdir,
+                               '.github/scripts/oc_score_baseline.yaml')
+    with open(config_path) as f:
+        config = yaml.load(f.read(), Loader=yaml.SafeLoader)
+    return config
+
+
+@pytest.fixture()
+def result_scores():
+    file = find_csv_files(output_path)
+    if file is None:
+        return None
+    return read_csv_file(file)
+
+
+@pytest.mark.usefixtures('result_scores')
+@pytest.mark.usefixtures('baseline_scores')
+class TestChat:
+    """Test cases for chat model."""
+
+    @pytest.mark.parametrize('model, dataset', [(p1, p2) for p1 in model_list
+                                                for p2 in dataset_list])
+    def test_demo_default(self, baseline_scores, result_scores, model,
+                          dataset):
+        base_score = baseline_scores.get(model).get(dataset)
+        result_score = result_scores.get(model).get(dataset)
+        assert_score(result_score, base_score)
+
+
+def assert_score(score, baseline):
+    if score is None or score == '-':
+        assert False, 'value is none'
+    if float(score) < (baseline * 1.03) and float(score) > (baseline * 0.97):
+        print(score + ' between ' + str(baseline * 0.97) + ' and ' +
+              str(baseline * 1.03))
+        assert True
+    else:
+        assert False, score + ' not between ' + str(
+            baseline * 0.97) + ' and ' + str(baseline * 1.03)
+
+
+def find_csv_files(directory):
+    csv_files = []
+    for root, dirs, files in os.walk(directory):
+        for file in files:
+            if file.endswith('.csv'):
+                csv_files.append(os.path.join(root, file))
+    if len(csv_files) > 1:
+        raise 'have more than 1 result file, please check the result manually'
+    if len(csv_files) == 0:
+        return None
+    return csv_files[0]
+
+
+def read_csv_file(file_path):
+    with open(file_path, 'r') as csvfile:
+        reader = csv.DictReader(csvfile)
+        filtered_data = []
+
+        for row in reader:
+            filtered_row = {
+                k: v
+                for k, v in row.items()
+                if k not in ['version', 'metric', 'mode']
+            }
+            filtered_data.append(filtered_row)
+
+    result = {}
+    for data in filtered_data:
+        dataset = data.get('dataset')
+        for key in data.keys():
+            if key == 'dataset':
+                continue
+            else:
+                if key in result.keys():
+                    result.get(key)[dataset] = data.get(key)
+                else:
+                    result[key] = {dataset: data.get(key)}
+    return result
diff --git a/.github/scripts/oc_score_baseline.yaml b/.github/scripts/oc_score_baseline.yaml
@@ -0,0 +1,11 @@
+# Baseline scores per model and dataset, loaded by oc_score_assert.py.
+# A measured score passes only when it lies strictly within 3% of the
+# value listed here.
+internlm-7b-hf:
+    ARC-c: 36.27
+    chid-dev: 81.68
+    chid-test: 83.67
+    openai_humaneval: 10.37
+
+internlm-chat-7b-hf:
+    ARC-c: 36.95
+    chid-dev: 71.78
+    chid-test: 76.87
+    openai_humaneval: 21.34
diff --git a/.github/workflows/daily-run-test.yml b/.github/workflows/daily-run-test.yml
@@ -0,0 +1,74 @@
+# Daily regression workflow: evaluates a fixed set of models and datasets
+# and checks the scores with .github/scripts/oc_score_assert.py.
+name: daily_run_test
+
+# Triggered manually or once a day at 16:56 (GitHub Actions evaluates cron
+# schedules in UTC).
+on:
+  workflow_dispatch:
+  schedule:
+    - cron:  '56 16 * * *'
+
+# At most one run per workflow and ref; a newer run cancels the one in
+# progress.
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
+env:
+  # Conda environment created at the start of the job and removed at the end.
+  CONDA_ENV: opencompass_regression
+  # Passed to pip as --cache-dir.
+  PIP_CACHE_PATH: /cpfs01/user/qa-llm-cicd/.cache/pip
+  # Directory holding the `data` folder that is copied into the workspace.
+  USERSPACE_PREFIX: /cpfs01/user/qa-llm-cicd
+  # Model weights directory symlinked to ~/.cache/huggingface/hub.
+  HF_CACHE_PATH: /cpfs01/shared/public/public_hdd/llmeval/model_weights/hf_hub
+
+jobs:
+  # Builds a fresh conda env, runs the evaluation and asserts the scores.
+  daily_run_test:
+    runs-on: self-hosted
+    environment: 'prod'
+    timeout-minutes: 240 #4hours
+    steps:
+      - name: Clone repository
+        uses: actions/checkout@v2
+      - name: Prepare - create conda env and install torch
+        run: |
+          eval "$(conda shell.bash hook)"
+          conda create -y --name ${{env.CONDA_ENV}} python=3.10
+          conda activate ${{env.CONDA_ENV}}
+          pip install torch torchvision torchaudio --cache-dir ${{env.PIP_CACHE_PATH}} --index-url https://download.pytorch.org/whl/cu118
+          conda info --envs
+      - name: Prepare - Pip install code
+        run: |
+          eval "$(conda shell.bash hook)"
+          conda activate ${{env.CONDA_ENV}}
+          pip install -e . --cache-dir ${{env.PIP_CACHE_PATH}}
+          pip install human_eval transformers==4.33.0 --cache-dir ${{env.PIP_CACHE_PATH}}
+          conda info --envs
+      - name: Prepare - prepare data and hf model
+        run: |
+          cp -r ${{env.USERSPACE_PREFIX}}/data .
+          rm -rf ~/.cache/huggingface/hub -f && mkdir ~/.cache -p && mkdir ~/.cache/huggingface -p
+          ln -s ${{env.HF_CACHE_PATH}} ~/.cache/huggingface/hub
+          # Every `run` step starts its own shell, so a plain `export` would be
+          # lost after this step; $GITHUB_ENV makes the flags visible to the
+          # following steps.
+          echo "HF_DATASETS_OFFLINE=1" >> "$GITHUB_ENV"
+          echo "TRANSFORMERS_OFFLINE=1" >> "$GITHUB_ENV"
+      - name:  Run test
+        run: |
+          eval "$(conda shell.bash hook)"
+          conda activate ${{env.CONDA_ENV}}
+          conda info --envs
+          rm -rf regression_result_daily
+          export from_tf=TRUE
+          python3 run.py --models hf_internlm_chat_7b hf_internlm_7b --datasets FewCLUE_chid_ppl humaneval_gen ARC_c_ppl --work-dir regression_result_daily --debug
+      - name:  Get result
+        run: |
+          # Run inside the conda env so pytest sees the packages installed
+          # there (the assert script imports yaml).
+          eval "$(conda shell.bash hook)"
+          conda activate ${{env.CONDA_ENV}}
+          pip install pytest --cache-dir ${{env.PIP_CACHE_PATH}}
+          pytest -s -v --color=yes .github/scripts/oc_score_assert.py
+      - name:  Remove Conda Env
+        # Clean up even when an earlier step failed.
+        if: always()
+        run: |
+          eval "$(conda shell.bash hook)"
+          conda env remove --name ${{env.CONDA_ENV}}
+          conda info --envs
+
+  # Posts a Feishu message when the test job failed on develop or main.
+  notify_to_feishu:
+    if: ${{ always() && !cancelled() && contains(needs.*.result, 'failure') && (github.ref_name == 'develop' || github.ref_name == 'main') }}
+    needs: [daily_run_test]
+    environment: 'prod'
+    timeout-minutes: 5
+    runs-on: self-hosted
+    steps:
+      - name: notify
+        run: |
+          curl -X POST -H "Content-Type: application/json" -d '{"msg_type":"post","content":{"post":{"zh_cn":{"title":"Opencompass- daily test failed","content":[[{"tag":"text","text":"branch: ${{github.ref_name}}, run action: ${{github.workflow}} failed. "},{"tag":"a","text":"Please click here for details ","href":"https://github.com/'${{ github.repository }}'/actions/runs/'${GITHUB_RUN_ID}'"},{"tag":"at","user_id":"'${{ secrets.USER_ID }}'"}]]}}}}'  ${{ secrets.WEBHOOK_URL }}
diff --git a/.github/workflows/pr-run-test.yml b/.github/workflows/pr-run-test.yml
@@ -18,28 +18,30 @@ concurrency:
   cancel-in-progress: true
 
 env:
-  CONDA_ENV: opencompass_regression_daily
+  CONDA_ENV: opencompass_base
+  USERSPACE_PREFIX: /cpfs01/user/qa-llm-cicd
+  HF_CACHE_PATH: /cpfs01/shared/public/public_hdd/llmeval/model_weights/hf_hub
 
 jobs:
   pr_run_test:
     runs-on: self-hosted
     environment: 'prod'
-    timeout-minutes: 20
+    timeout-minutes: 30
     steps:
       - name: Clone repository
         uses: actions/checkout@v2
-      - name: Prepare - create conda env and install code
+      - name: Prepare - Install opencompass
         run: |
           eval "$(conda shell.bash hook)"
-          conda create --name ${{env.CONDA_ENV}} --clone opencompass_base --offline
           conda activate ${{env.CONDA_ENV}}
-          python3 -m pip install -e .
+          python3 -m pip uninstall opencompass -y
+          python3 -m pip install -e . --cache-dir ${{env.USERSPACE_PREFIX}}/.cache/pip
           conda info --envs
       - name: Prepare - prepare data and hf model
         run: |
-          cp -r /cpfs01/user/qa-llm-cicd/data .
+          cp -r ${{env.USERSPACE_PREFIX}}/data .
           rm -rf ~/.cache/huggingface/hub -f && mkdir ~/.cache -p && mkdir ~/.cache/huggingface -p
-          ln -s /cpfs01/shared/public/public_hdd/llmeval/model_weights/hf_hub ~/.cache/huggingface/hub
+          ln -s ${{env.HF_CACHE_PATH}} ~/.cache/huggingface/hub
           export HF_DATASETS_OFFLINE=1; export TRANSFORMERS_OFFLINE=1;
       - name:  Run test
         run: |
@@ -49,21 +51,21 @@ jobs:
           rm -rf regression_result
           python3 run.py --models hf_internlm_chat_7b --datasets siqa_gen --work-dir regression_result --debug
       - name:  Get result
-        if: always()
         run: |
           score=$(sed -n '$p' regression_result/*/summary/*.csv | awk -F ',' '{print $NF}')
-          if (( ${score%.*} >= 70 && ${score%.*} <= 80 )); then
-             echo "score is $score between 70 and 80"
+          if (( ${score%.*} >= 70 && ${score%.*} <= 75 )); then
+             echo "score is $score between 70 and 75"
           else
-             echo "score is $score not between 70 and 80"
+             echo "score is $score not between 70 and 75"
              exit 1
           fi
           rm -rf regression_result
-      - name:  Remove Conda Env
+      - name:  Uninstall opencompass
         if: always()
         run: |
           eval "$(conda shell.bash hook)"
-          conda env remove --name ${{env.CONDA_ENV}}
+          conda activate ${{env.CONDA_ENV}}
+          python3 -m pip uninstall opencompass -y
           conda info --envs
 
   notify_to_feishu: