forked from open-compass/opencompass
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[Feature] Add daily test case (open-compass#864)
* add daily test case * Update pr-run-test.yml * Update daily-run-test.yml * Update daily-run-test.yml * Update pr-run-test.yml --------- Co-authored-by: zhulin1 <[email protected]>
- Loading branch information
1 parent
d0ac8d0
commit 166265c
Showing
4 changed files
with
193 additions
and
13 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,93 @@ | ||
import csv | ||
import os | ||
|
||
import pytest | ||
import yaml | ||
|
||
# Directory that is searched (recursively) for the result CSV file.
output_path = 'regression_result_daily'

# Every (model, dataset) pair below becomes one parametrized test case.
model_list = [
    'internlm-7b-hf',
    'internlm-chat-7b-hf',
]
dataset_list = [
    'ARC-c',
    'chid-dev',
    'chid-test',
    'openai_humaneval',
]
|
||
|
||
@pytest.fixture()
def baseline_scores(request):
    """Load the baseline score file and return its parsed content.

    The YAML file is looked up relative to the pytest root directory and is
    parsed with the safe loader.
    """
    baseline_file = os.path.join(request.config.rootdir,
                                 '.github/scripts/oc_score_baseline.yaml')
    with open(baseline_file) as stream:
        raw_text = stream.read()
    return yaml.load(raw_text, Loader=yaml.SafeLoader)
|
||
|
||
@pytest.fixture()
def result_scores():
    """Return the parsed result CSV found under ``output_path``.

    Yields ``None`` as the fixture value when no CSV file was found.
    """
    csv_path = find_csv_files(output_path)
    return None if csv_path is None else read_csv_file(csv_path)
|
||
|
||
@pytest.mark.usefixtures('result_scores')
@pytest.mark.usefixtures('baseline_scores')
class TestChat:
    """Test cases for chat model."""

    @pytest.mark.parametrize('model, dataset', [(p1, p2) for p1 in model_list
                                                for p2 in dataset_list])
    def test_demo_default(self, baseline_scores, result_scores, model,
                          dataset):
        """Compare one model/dataset result score against its baseline.

        Args:
            baseline_scores: Mapping ``model -> dataset -> score`` loaded
                from the baseline file.
            result_scores: Mapping ``model -> dataset -> score`` parsed from
                the result CSV, or ``None`` when no CSV was found.
            model: Name of the model under test.
            dataset: Name of the dataset under test.
        """
        # Fail with an explicit message instead of an AttributeError on
        # ``None.get`` when the run produced no result file at all.
        assert result_scores is not None, 'no result csv file was found'

        model_baseline = (baseline_scores or {}).get(model) or {}
        base_score = model_baseline.get(dataset)
        assert base_score is not None, (
            f'no baseline score configured for {model} / {dataset}')

        # A model column missing from the CSV yields ``None`` here, which
        # assert_score reports as a missing value.
        model_result = result_scores.get(model) or {}
        result_score = model_result.get(dataset)
        assert_score(result_score, base_score)
|
||
|
||
def assert_score(score, baseline):
    """Assert that ``score`` lies strictly within 3% of ``baseline``.

    Args:
        score: Measured score, either a number or a numeric string. ``None``
            and ``'-'`` are treated as a missing value.
        baseline: Expected score as a number.

    Raises:
        AssertionError: If the score is missing, or if it is not inside the
            open interval ``(baseline * 0.97, baseline * 1.03)``.
    """
    if score is None or score == '-':
        raise AssertionError('value is none')

    lower = baseline * 0.97
    upper = baseline * 1.03
    # Format via f-strings so a numeric ``score`` does not break the message
    # construction (string concatenation only worked for str scores).
    if lower < float(score) < upper:
        print(f'{score} between {lower} and {upper}')
        return
    raise AssertionError(f'{score} not between {lower} and {upper}')
|
||
|
||
def find_csv_files(directory):
    """Locate the single ``.csv`` file anywhere below ``directory``.

    Args:
        directory: Root directory that is walked recursively.

    Returns:
        The path of the CSV file, or ``None`` when there is none (including
        when ``directory`` does not exist, since ``os.walk`` then yields
        nothing).

    Raises:
        RuntimeError: If more than one CSV file is found.
    """
    csv_files = [
        os.path.join(root, name)
        for root, _, names in os.walk(directory)
        for name in names
        if name.endswith('.csv')
    ]
    if len(csv_files) > 1:
        # Raising a bare string is itself a TypeError in Python 3; raise a
        # real exception so the intended message reaches the user.
        raise RuntimeError(
            'have more than 1 result file, please check the result manually')
    return csv_files[0] if csv_files else None
|
||
|
||
def read_csv_file(file_path):
    """Parse a result CSV into a ``column -> dataset -> value`` mapping.

    Each row is keyed by its ``dataset`` column. The ``version``, ``metric``
    and ``mode`` columns are dropped; every remaining column becomes a
    top-level key of the result.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        A dict of dicts: ``result[column][dataset]`` is the raw cell text.
    """
    ignored_columns = ('version', 'metric', 'mode')
    result = {}
    # newline='' is what the csv module requires of file objects so that
    # newlines embedded in quoted fields are preserved as written.
    with open(file_path, 'r', newline='') as csvfile:
        for row in csv.DictReader(csvfile):
            dataset = row.get('dataset')
            for column, value in row.items():
                if column == 'dataset' or column in ignored_columns:
                    continue
                result.setdefault(column, {})[dataset] = value
    return result
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
# Baseline scores, keyed by model name and then by dataset name.
internlm-7b-hf:
    ARC-c: 36.27
    chid-dev: 81.68
    chid-test: 83.67
    openai_humaneval: 10.37

internlm-chat-7b-hf:
    ARC-c: 36.95
    chid-dev: 71.78
    chid-test: 76.87
    openai_humaneval: 21.34
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,74 @@ | ||
# Daily regression workflow: runs an evaluation on a self-hosted runner,
# checks the resulting scores with pytest, and posts a notification when the
# job fails on the develop or main branch.
name: daily_run_test

on:
  workflow_dispatch:
  schedule:
    # GitHub evaluates cron schedules in UTC: every day at 16:56 UTC.
    - cron: '56 16 * * *'

concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

env:
  CONDA_ENV: opencompass_regression
  PIP_CACHE_PATH: /cpfs01/user/qa-llm-cicd/.cache/pip
  USERSPACE_PREFIX: /cpfs01/user/qa-llm-cicd
  HF_CACHE_PATH: /cpfs01/shared/public/public_hdd/llmeval/model_weights/hf_hub

jobs:
  daily_run_test:
    runs-on: self-hosted
    environment: 'prod'
    timeout-minutes: 240 #4hours
    steps:
      - name: Clone repository
        uses: actions/checkout@v2
      - name: Prepare - create conda env and install torch
        run: |
          eval "$(conda shell.bash hook)"
          conda create -y --name ${{env.CONDA_ENV}} python=3.10
          conda activate ${{env.CONDA_ENV}}
          pip install torch torchvision torchaudio --cache-dir ${{env.PIP_CACHE_PATH}} --index-url https://download.pytorch.org/whl/cu118
          conda info --envs
      - name: Prepare - Pip install code
        run: |
          eval "$(conda shell.bash hook)"
          conda activate ${{env.CONDA_ENV}}
          pip install -e . --cache-dir ${{env.PIP_CACHE_PATH}}
          pip install human_eval transformers==4.33.0 --cache-dir ${{env.PIP_CACHE_PATH}}
          conda info --envs
      - name: Prepare - prepare data and hf model
        run: |
          cp -r ${{env.USERSPACE_PREFIX}}/data .
          # Replace the local hub cache with a link to the shared weights.
          rm -rf ~/.cache/huggingface/hub && mkdir -p ~/.cache/huggingface
          ln -s ${{env.HF_CACHE_PATH}} ~/.cache/huggingface/hub
          # A plain `export` only lasts for this step's shell. Writing to
          # GITHUB_ENV makes the offline flags visible to the later steps.
          echo "HF_DATASETS_OFFLINE=1" >> "$GITHUB_ENV"
          echo "TRANSFORMERS_OFFLINE=1" >> "$GITHUB_ENV"
      - name: Run test
        run: |
          eval "$(conda shell.bash hook)"
          conda activate ${{env.CONDA_ENV}}
          conda info --envs
          # Start from a clean work dir so only one result CSV can exist.
          rm -rf regression_result_daily
          export from_tf=TRUE
          python3 run.py --models hf_internlm_chat_7b hf_internlm_7b --datasets FewCLUE_chid_ppl humaneval_gen ARC_c_ppl --work-dir regression_result_daily --debug
      - name: Get result
        run: |
          pip install pytest --cache-dir ${{env.PIP_CACHE_PATH}}
          pytest -s -v --color=yes .github/scripts/oc_score_assert.py
      - name: Remove Conda Env
        # Always clean up the environment, even when an earlier step failed.
        if: always()
        run: |
          eval "$(conda shell.bash hook)"
          conda env remove --name ${{env.CONDA_ENV}}
          conda info --envs
  notify_to_feishu:
    # Only notify for failed (not cancelled) runs on develop or main.
    if: ${{ always() && !cancelled() && contains(needs.*.result, 'failure') && (github.ref_name == 'develop' || github.ref_name == 'main') }}
    needs: [daily_run_test]
    environment: 'prod'
    timeout-minutes: 5
    runs-on: self-hosted
    steps:
      - name: notify
        run: |
          curl -X POST -H "Content-Type: application/json" -d '{"msg_type":"post","content":{"post":{"zh_cn":{"title":"Opencompass- daily test failed","content":[[{"tag":"text","text":"branch: ${{github.ref_name}}, run action: ${{github.workflow}} failed. "},{"tag":"a","text":"Please click here for details ","href":"https://github.com/'${{ github.repository }}'/actions/runs/'${GITHUB_RUN_ID}'"},{"tag":"at","user_id":"'${{ secrets.USER_ID }}'"}]]}}}}' ${{ secrets.WEBHOOK_URL }}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters