diff --git a/.github/scripts/oc_score_assert.py b/.github/scripts/oc_score_assert.py
new file mode 100644
index 000000000..528970947
--- /dev/null
+++ b/.github/scripts/oc_score_assert.py
@@ -0,0 +1,98 @@
+import csv
+import os
+
+import pytest
+import yaml
+
+output_path = 'regression_result_daily'
+
+model_list = ['internlm-7b-hf', 'internlm-chat-7b-hf']
+dataset_list = ['ARC-c', 'chid-dev', 'chid-test', 'openai_humaneval']
+
+
+@pytest.fixture()
+def baseline_scores(request):
+    """Load the baseline scores from ``oc_score_baseline.yaml``."""
+    config_path = os.path.join(request.config.rootdir,
+                               '.github/scripts/oc_score_baseline.yaml')
+    with open(config_path) as f:
+        config = yaml.load(f.read(), Loader=yaml.SafeLoader)
+    return config
+
+
+@pytest.fixture()
+def result_scores():
+    """Parse the result csv under ``output_path``; None if there is none."""
+    file = find_csv_files(output_path)
+    if file is None:
+        return None
+    return read_csv_file(file)
+
+
+@pytest.mark.usefixtures('result_scores')
+@pytest.mark.usefixtures('baseline_scores')
+class TestChat:
+    """Test cases for chat model."""
+
+    @pytest.mark.parametrize('model, dataset', [(p1, p2) for p1 in model_list
+                                                for p2 in dataset_list])
+    def test_demo_default(self, baseline_scores, result_scores, model,
+                          dataset):
+        # result_scores is None when no csv was produced; fail with a clear
+        # message instead of an AttributeError on None.
+        assert result_scores is not None, 'no result csv in ' + output_path
+        base_score = baseline_scores.get(model).get(dataset)
+        result_score = result_scores.get(model, {}).get(dataset)
+        assert_score(result_score, base_score)
+
+
+def assert_score(score, baseline):
+    """Assert that ``score`` lies strictly within +/-3% of ``baseline``.
+
+    ``score`` is the raw csv cell; None or '-' means no value was produced.
+    """
+    if score is None or score == '-':
+        assert False, 'value is none'
+    lower, upper = baseline * 0.97, baseline * 1.03
+    bounds = str(lower) + ' and ' + str(upper)
+    if lower < float(score) < upper:
+        print(str(score) + ' between ' + bounds)
+    else:
+        assert False, str(score) + ' not between ' + bounds
+
+
+def find_csv_files(directory):
+    """Return the path of the only csv file under ``directory``.
+
+    Returns None when no csv file exists and raises RuntimeError when more
+    than one is found, since the result to check would be ambiguous.
+    """
+    csv_files = []
+    for root, _, files in os.walk(directory):
+        for file in files:
+            if file.endswith('.csv'):
+                csv_files.append(os.path.join(root, file))
+    if len(csv_files) > 1:
+        # Raising a bare string is a TypeError in Python 3.
+        raise RuntimeError(
+            'have more than 1 result file, please check the result manually')
+    if not csv_files:
+        return None
+    return csv_files[0]
+
+
+def read_csv_file(file_path):
+    """Read the summary csv into a ``{column: {dataset: value}}`` dict.
+
+    The 'version', 'metric' and 'mode' columns are dropped, the 'dataset'
+    column supplies the inner keys, and values stay as raw csv strings.
+    """
+    result = {}
+    with open(file_path, 'r', newline='') as csvfile:
+        for row in csv.DictReader(csvfile):
+            dataset = row.get('dataset')
+            for key, value in row.items():
+                if key in ('dataset', 'version', 'metric', 'mode'):
+                    continue
+                result.setdefault(key, {})[dataset] = value
+    return result
diff --git a/.github/scripts/oc_score_baseline.yaml b/.github/scripts/oc_score_baseline.yaml
new file mode 100644
index 000000000..e80d2df9a
--- /dev/null
+++ b/.github/scripts/oc_score_baseline.yaml
@@ -0,0 +1,11 @@
+internlm-7b-hf:
+  ARC-c: 36.27
+  chid-dev: 81.68
+  chid-test: 83.67
+  openai_humaneval: 10.37
+
+internlm-chat-7b-hf:
+  ARC-c: 36.95
+  chid-dev: 71.78
+  chid-test: 76.87
+  openai_humaneval: 21.34
diff --git a/.github/workflows/daily-run-test.yml b/.github/workflows/daily-run-test.yml
new file mode 100644
index 000000000..232852afd
--- /dev/null
+++ b/.github/workflows/daily-run-test.yml
@@ -0,0 +1,74 @@
+name: daily_run_test
+
+on:
+  workflow_dispatch:
+  schedule:
+    - cron: '56 16 * * *'
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
+env:
+  CONDA_ENV: opencompass_regression
+  PIP_CACHE_PATH: /cpfs01/user/qa-llm-cicd/.cache/pip
+  USERSPACE_PREFIX: /cpfs01/user/qa-llm-cicd
+  HF_CACHE_PATH: /cpfs01/shared/public/public_hdd/llmeval/model_weights/hf_hub
+
+jobs:
+  daily_run_test:
+    runs-on: self-hosted
+    environment: 'prod'
+    timeout-minutes: 240 #4hours
+    steps:
+      - name: Clone repository
+        uses: actions/checkout@v2
+      - name: Prepare - create conda env and install torch
+        run: |
+          eval "$(conda shell.bash hook)"
+          conda create -y --name ${{env.CONDA_ENV}} python=3.10
+          conda activate ${{env.CONDA_ENV}}
+          pip install torch torchvision torchaudio --cache-dir ${{env.PIP_CACHE_PATH}} --index-url https://download.pytorch.org/whl/cu118
+          conda info --envs
+      - name: Prepare - Pip install code
+        run: |
+          eval "$(conda shell.bash hook)"
+          conda activate ${{env.CONDA_ENV}}
+          pip install -e . --cache-dir ${{env.PIP_CACHE_PATH}}
+          pip install human_eval transformers==4.33.0 --cache-dir ${{env.PIP_CACHE_PATH}}
+          conda info --envs
+      - name: Prepare - prepare data and hf model
+        run: |
+          cp -r ${{env.USERSPACE_PREFIX}}/data .
+          rm -rf ~/.cache/huggingface/hub -f && mkdir ~/.cache -p && mkdir ~/.cache/huggingface -p
+          ln -s ${{env.HF_CACHE_PATH}} ~/.cache/huggingface/hub
+          export HF_DATASETS_OFFLINE=1; export TRANSFORMERS_OFFLINE=1;
+      - name: Run test
+        run: |
+          eval "$(conda shell.bash hook)"
+          conda activate ${{env.CONDA_ENV}}
+          conda info --envs
+          rm -rf regression_result_daily
+          export from_tf=TRUE
+          python3 run.py --models hf_internlm_chat_7b hf_internlm_7b --datasets FewCLUE_chid_ppl humaneval_gen ARC_c_ppl --work-dir regression_result_daily --debug
+      - name: Get result
+        run: |
+          pip install pytest --cache-dir ${{env.PIP_CACHE_PATH}}
+          pytest -s -v --color=yes .github/scripts/oc_score_assert.py
+      - name: Remove Conda Env
+        if: always()
+        run: |
+          eval "$(conda shell.bash hook)"
+          conda env remove --name ${{env.CONDA_ENV}}
+          conda info --envs
+
+  notify_to_feishu:
+    if: ${{ always() && !cancelled() && contains(needs.*.result, 'failure') && (github.ref_name == 'develop' || github.ref_name == 'main') }}
+    needs: [daily_run_test]
+    environment: 'prod'
+    timeout-minutes: 5
+    runs-on: self-hosted
+    steps:
+      - name: notify
+        run: |
+          curl -X POST -H "Content-Type: application/json" -d '{"msg_type":"post","content":{"post":{"zh_cn":{"title":"Opencompass- pr test failed","content":[[{"tag":"text","text":"branch: ${{github.ref_name}}, run action: ${{github.workflow}} failed. "},{"tag":"a","text":"Please click here for details ","href":"https://github.com/'${{ github.repository }}'/actions/runs/'${GITHUB_RUN_ID}'"},{"tag":"at","user_id":"'${{ secrets.USER_ID }}'"}]]}}}}' ${{ secrets.WEBHOOK_URL }}
diff --git a/.github/workflows/pr-run-test.yml b/.github/workflows/pr-run-test.yml
index 9df2ab802..7ada69450 100644
--- a/.github/workflows/pr-run-test.yml
+++ b/.github/workflows/pr-run-test.yml
@@ -18,28 +18,30 @@ concurrency:
   cancel-in-progress: true
 
 env:
-  CONDA_ENV: opencompass_regression_daily
+  CONDA_ENV: opencompass_base
+  USERSPACE_PREFIX: /cpfs01/user/qa-llm-cicd
+  HF_CACHE_PATH: /cpfs01/shared/public/public_hdd/llmeval/model_weights/hf_hub
 
 jobs:
   pr_run_test:
     runs-on: self-hosted
     environment: 'prod'
-    timeout-minutes: 20
+    timeout-minutes: 30
     steps:
       - name: Clone repository
         uses: actions/checkout@v2
-      - name: Prepare - create conda env and install code
+      - name: Prepare - Install opencompass
         run: |
           eval "$(conda shell.bash hook)"
-          conda create --name ${{env.CONDA_ENV}} --clone opencompass_base --offline
           conda activate ${{env.CONDA_ENV}}
-          python3 -m pip install -e .
+          python3 -m pip uninstall opencompass -y
+          python3 -m pip install -e . --cache-dir ${{env.USERSPACE_PREFIX}}/.cache/pip
           conda info --envs
       - name: Prepare - prepare data and hf model
         run: |
-          cp -r /cpfs01/user/qa-llm-cicd/data .
+          cp -r ${{env.USERSPACE_PREFIX}}/data .
           rm -rf ~/.cache/huggingface/hub -f && mkdir ~/.cache -p && mkdir ~/.cache/huggingface -p
-          ln -s /cpfs01/shared/public/public_hdd/llmeval/model_weights/hf_hub ~/.cache/huggingface/hub
+          ln -s ${{env.HF_CACHE_PATH}} ~/.cache/huggingface/hub
           export HF_DATASETS_OFFLINE=1; export TRANSFORMERS_OFFLINE=1;
       - name: Run test
         run: |
@@ -49,21 +51,21 @@ jobs:
           rm -rf regression_result
           python3 run.py --models hf_internlm_chat_7b --datasets siqa_gen --work-dir regression_result --debug
       - name: Get result
-        if: always()
         run: |
           score=$(sed -n '$p' regression_result/*/summary/*.csv | awk -F ',' '{print $NF}')
-          if (( ${score%.*} >= 70 && ${score%.*} <= 80 )); then
-            echo "score is $score between 70 and 80"
+          if (( ${score%.*} >= 70 && ${score%.*} <= 75 )); then
+            echo "score is $score between 70 and 75"
           else
-            echo "score is $score not between 70 and 80"
+            echo "score is $score not between 70 and 75"
             exit 1
           fi
           rm -rf regression_result
-      - name: Remove Conda Env
+      - name: Uninstall opencompass
         if: always()
         run: |
           eval "$(conda shell.bash hook)"
-          conda env remove --name ${{env.CONDA_ENV}}
+          conda activate ${{env.CONDA_ENV}}
+          python3 -m pip uninstall opencompass -y
           conda info --envs
 
   notify_to_feishu: