add daily test case #864

Merged: 5 commits, Feb 2, 2024
93 changes: 93 additions & 0 deletions .github/scripts/oc_score_assert.py
@@ -0,0 +1,93 @@
import csv
import os

import pytest
import yaml

output_path = 'regression_result_daily'

model_list = ['internlm-7b-hf', 'internlm-chat-7b-hf']
dataset_list = ['ARC-c', 'chid-dev', 'chid-test', 'openai_humaneval']


@pytest.fixture()
def baseline_scores(request):
    config_path = os.path.join(request.config.rootdir,
                               '.github/scripts/oc_score_baseline.yaml')
    with open(config_path) as f:
        config = yaml.load(f.read(), Loader=yaml.SafeLoader)
    return config


@pytest.fixture()
def result_scores():
    file = find_csv_files(output_path)
    if file is None:
        return None
    return read_csv_file(file)


@pytest.mark.usefixtures('result_scores')
@pytest.mark.usefixtures('baseline_scores')
class TestChat:
    """Test cases for chat model."""

    @pytest.mark.parametrize('model, dataset', [(p1, p2) for p1 in model_list
                                                for p2 in dataset_list])
    def test_demo_default(self, baseline_scores, result_scores, model,
                          dataset):
        base_score = baseline_scores.get(model).get(dataset)
        result_score = result_scores.get(model).get(dataset)
        assert_score(result_score, base_score)


def assert_score(score, baseline):
    if score is None or score == '-':
        assert False, 'value is none'
    if float(score) < (baseline * 1.03) and float(score) > (baseline * 0.97):
        print(score + ' between ' + str(baseline * 0.97) + ' and ' +
              str(baseline * 1.03))
        assert True
    else:
        assert False, score + ' not between ' + str(
            baseline * 0.97) + ' and ' + str(baseline * 1.03)


def find_csv_files(directory):
    csv_files = []
    for root, dirs, files in os.walk(directory):
        for file in files:
            if file.endswith('.csv'):
                csv_files.append(os.path.join(root, file))
    if len(csv_files) > 1:
        # more than one summary file is ambiguous, so fail loudly
        raise RuntimeError(
            'have more than 1 result file, please check the result manually')
    if len(csv_files) == 0:
        return None
    return csv_files[0]


def read_csv_file(file_path):
    with open(file_path, 'r') as csvfile:
        reader = csv.DictReader(csvfile)
        filtered_data = []

        for row in reader:
            filtered_row = {
                k: v
                for k, v in row.items()
                if k not in ['version', 'metric', 'mode']
            }
            filtered_data.append(filtered_row)

    result = {}
    for data in filtered_data:
        dataset = data.get('dataset')
        for key in data.keys():
            if key == 'dataset':
                continue
            else:
                if key in result.keys():
                    result.get(key)[dataset] = data.get(key)
                else:
                    result[key] = {dataset: data.get(key)}
    return result
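
A minimal sketch of the shape read_csv_file expects and produces (illustration only, not part of the PR; the summary CSV column names are assumptions inferred from the keys filtered out above):

import csv
import io

# Hypothetical OpenCompass summary CSV: metadata columns plus one column per model.
sample = io.StringIO(
    'dataset,version,metric,mode,internlm-7b-hf,internlm-chat-7b-hf\n'
    'ARC-c,-,accuracy,ppl,36.27,36.95\n')

# Equivalent pivot to read_csv_file: drop the metadata columns and key the
# remaining model columns as {model: {dataset: score}}.
result = {}
for row in csv.DictReader(sample):
    for key, value in row.items():
        if key not in ('dataset', 'version', 'metric', 'mode'):
            result.setdefault(key, {})[row['dataset']] = value

print(result)
# {'internlm-7b-hf': {'ARC-c': '36.27'}, 'internlm-chat-7b-hf': {'ARC-c': '36.95'}}

This nested mapping is what the result_scores fixture hands to TestChat.test_demo_default.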
11 changes: 11 additions & 0 deletions .github/scripts/oc_score_baseline.yaml
@@ -0,0 +1,11 @@
internlm-7b-hf:
  ARC-c: 36.27
  chid-dev: 81.68
  chid-test: 83.67
  openai_humaneval: 10.37

internlm-chat-7b-hf:
  ARC-c: 36.95
  chid-dev: 71.78
  chid-test: 76.87
  openai_humaneval: 21.34
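
As a worked example of how these baselines feed assert_score above: with the ±3% tolerance, the internlm-7b-hf ARC-c baseline of 36.27 accepts a daily score strictly between roughly 35.18 (36.27 × 0.97) and 37.36 (36.27 × 1.03); a score outside that band, or a missing '-' entry, fails the pytest run.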
74 changes: 74 additions & 0 deletions .github/workflows/daily-run-test.yml
@@ -0,0 +1,74 @@
name: daily_run_test

on:
  workflow_dispatch:
  schedule:
    - cron: '56 16 * * *'

concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

env:
  CONDA_ENV: opencompass_regression
  PIP_CACHE_PATH: /cpfs01/user/qa-llm-cicd/.cache/pip
  USERSPACE_PREFIX: /cpfs01/user/qa-llm-cicd
  HF_CACHE_PATH: /cpfs01/shared/public/public_hdd/llmeval/model_weights/hf_hub

jobs:
  daily_run_test:
    runs-on: self-hosted
    environment: 'prod'
    timeout-minutes: 240 #4hours
    steps:
      - name: Clone repository
        uses: actions/checkout@v2
      - name: Prepare - create conda env and install torch
        run: |
          eval "$(conda shell.bash hook)"
          conda create -y --name ${{env.CONDA_ENV}} python=3.10
          conda activate ${{env.CONDA_ENV}}
          pip install torch torchvision torchaudio --cache-dir ${{env.PIP_CACHE_PATH}} --index-url https://download.pytorch.org/whl/cu118
          conda info --envs
      - name: Prepare - Pip install code
        run: |
          eval "$(conda shell.bash hook)"
          conda activate ${{env.CONDA_ENV}}
          pip install -e . --cache-dir ${{env.PIP_CACHE_PATH}}
          pip install human_eval transformers==4.33.0 --cache-dir ${{env.PIP_CACHE_PATH}}
          conda info --envs
      - name: Prepare - prepare data and hf model
        run: |
          cp -r ${{env.USERSPACE_PREFIX}}/data .
          rm -rf ~/.cache/huggingface/hub -f && mkdir ~/.cache -p && mkdir ~/.cache/huggingface -p
          ln -s ${{env.HF_CACHE_PATH}} ~/.cache/huggingface/hub
          export HF_DATASETS_OFFLINE=1; export TRANSFORMERS_OFFLINE=1;
      - name: Run test
        run: |
          eval "$(conda shell.bash hook)"
          conda activate ${{env.CONDA_ENV}}
          conda info --envs
          rm -rf regression_result_daily
          export from_tf=TRUE
          python3 run.py --models hf_internlm_chat_7b hf_internlm_7b --datasets FewCLUE_chid_ppl humaneval_gen ARC_c_ppl --work-dir regression_result_daily --debug
      - name: Get result
        run: |
          pip install pytest --cache-dir ${{env.PIP_CACHE_PATH}}
          pytest -s -v --color=yes .github/scripts/oc_score_assert.py
      - name: Remove Conda Env
        if: always()
        run: |
          eval "$(conda shell.bash hook)"
          conda env remove --name ${{env.CONDA_ENV}}
          conda info --envs

  notify_to_feishu:
    if: ${{ always() && !cancelled() && contains(needs.*.result, 'failure') && (github.ref_name == 'develop' || github.ref_name == 'main') }}
    needs: [daily_run_test]
    environment: 'prod'
    timeout-minutes: 5
    runs-on: self-hosted
    steps:
      - name: notify
        run: |
          curl -X POST -H "Content-Type: application/json" -d '{"msg_type":"post","content":{"post":{"zh_cn":{"title":"Opencompass- pr test failed","content":[[{"tag":"text","text":"branch: ${{github.ref_name}}, run action: ${{github.workflow}} failed. "},{"tag":"a","text":"Please click here for details ","href":"https://github.com/'${{ github.repository }}'/actions/runs/'${GITHUB_RUN_ID}'"},{"tag":"at","user_id":"'${{ secrets.USER_ID }}'"}]]}}}}' ${{ secrets.WEBHOOK_URL }}
28 changes: 15 additions & 13 deletions .github/workflows/pr-run-test.yml
@@ -18,28 +18,30 @@ concurrency:
   cancel-in-progress: true
 
 env:
-  CONDA_ENV: opencompass_regression_daily
+  CONDA_ENV: opencompass_base
+  USERSPACE_PREFIX: /cpfs01/user/qa-llm-cicd
+  HF_CACHE_PATH: /cpfs01/shared/public/public_hdd/llmeval/model_weights/hf_hub
 
 jobs:
   pr_run_test:
     runs-on: self-hosted
     environment: 'prod'
-    timeout-minutes: 20
+    timeout-minutes: 30
     steps:
       - name: Clone repository
         uses: actions/checkout@v2
-      - name: Prepare - create conda env and install code
+      - name: Prepare - Install opencompass
         run: |
           eval "$(conda shell.bash hook)"
-          conda create --name ${{env.CONDA_ENV}} --clone opencompass_base --offline
           conda activate ${{env.CONDA_ENV}}
-          python3 -m pip install -e .
+          python3 -m pip uninstall opencompass -y
+          python3 -m pip install -e . --cache-dir ${{env.USERSPACE_PREFIX}}/.cache/pip
           conda info --envs
       - name: Prepare - prepare data and hf model
         run: |
-          cp -r /cpfs01/user/qa-llm-cicd/data .
+          cp -r ${{env.USERSPACE_PREFIX}}/data .
           rm -rf ~/.cache/huggingface/hub -f && mkdir ~/.cache -p && mkdir ~/.cache/huggingface -p
-          ln -s /cpfs01/shared/public/public_hdd/llmeval/model_weights/hf_hub ~/.cache/huggingface/hub
+          ln -s ${{env.HF_CACHE_PATH}} ~/.cache/huggingface/hub
           export HF_DATASETS_OFFLINE=1; export TRANSFORMERS_OFFLINE=1;
       - name: Run test
         run: |
@@ -49,21 +49,21 @@ jobs:
           rm -rf regression_result
           python3 run.py --models hf_internlm_chat_7b --datasets siqa_gen --work-dir regression_result --debug
       - name: Get result
-        if: always()
         run: |
           score=$(sed -n '$p' regression_result/*/summary/*.csv | awk -F ',' '{print $NF}')
-          if (( ${score%.*} >= 70 && ${score%.*} <= 80 )); then
-            echo "score is $score between 70 and 80"
+          if (( ${score%.*} >= 70 && ${score%.*} <= 75 )); then
+            echo "score is $score between 70 and 75"
           else
-            echo "score is $score not between 70 and 80"
+            echo "score is $score not between 70 and 75"
             exit 1
           fi
           rm -rf regression_result
-      - name: Remove Conda Env
+      - name: Uninstall opencompass
         if: always()
         run: |
           eval "$(conda shell.bash hook)"
-          conda env remove --name ${{env.CONDA_ENV}}
+          conda activate ${{env.CONDA_ENV}}
+          python3 -m pip uninstall opencompass -y
           conda info --envs
 
   notify_to_feishu: