support alpacaeval (#809)
* support alpacaeval_v1

* Update opencompass/summarizers/subjective/__init__.py

Co-authored-by: Songyang Zhang <[email protected]>

* Update opencompass/summarizers/subjective/alpacaeval_v1.py

Co-authored-by: Songyang Zhang <[email protected]>

* fix conflict

* support alpacaeval v2

* support alpacav2

---------

Co-authored-by: Songyang Zhang <[email protected]>
2 people authored and BunnyRunnerX committed Feb 4, 2024
1 parent a2d701a commit 80caa6f
Showing 5 changed files with 456 additions and 0 deletions.
98 changes: 98 additions & 0 deletions configs/datasets/subjective/alpaca_eval/alpacav1_judgeby_gpt4.py
@@ -0,0 +1,98 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import SubjectiveCmpDataset
from mmengine.config import read_base

subjective_reader_cfg = dict(
    input_columns=['question'],
    output_column='judge',
)

subjective_all_sets = [
    "alpaca_eval",
]


subjective_datasets = []

gpt4_prompt = """
I want you to create a leaderboard of different of large-language models. To do so, I will give you the instructions (prompts) given to the models, and the responses of two models. Please rank the models based on which responses would be preferred by humans. All inputs and outputs should be python dictionaries.
Here is the prompt:
{
"instruction": "{question}"
}
Here are the outputs of the models:
[
{
"model": "model_1",
"answer": "{prediction}"
},
{
"model": "model_2",
"answer": "{prediction2}"
}
]
Now please rank the models by the quality of their answers, so that the model with rank 1 has the best output. Then return a list of the model names and ranks, i.e., produce the following output:
[
{"model": <model-name>, "rank": <model-rank>},
{"model": <model-name>, "rank": <model-rank>}
]
Your response must be a valid Python dictionary and should contain nothing else because we will directly execute it in Python. Please provide the ranking that the majority of humans would give.
"""


for _name in subjective_all_sets:
    subjective_infer_cfg = dict(
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(round=[
                dict(
                    role='HUMAN',
                    prompt="{question}"
                ),
            ]),
        ),
        retriever=dict(type=ZeroRetriever),
        inferencer=dict(type=GenInferencer, max_out_len=4096),
    )

    subjective_eval_cfg = dict(
        evaluator=dict(
            type=LMEvaluator,
            infer_order='random',
            prompt_template=dict(
                type=PromptTemplate,
                template=dict(
                    begin=[
                        dict(
                            role='SYSTEM',
                            fallback_role='HUMAN',
                            prompt="You are a helpful assistant, that ranks models by the quality of their answers.")
                    ],
                    round=[
                        dict(
                            role='HUMAN',
                            prompt=gpt4_prompt
                        ),
                    ]),
            ),
        ),
        pred_role="BOT",
    )

    subjective_datasets.append(
        dict(
            abbr=f"{_name}",
            type=SubjectiveCmpDataset,
            path="./data/subjective/",
            name=_name,
            reader_cfg=subjective_reader_cfg,
            infer_cfg=subjective_infer_cfg,
            eval_cfg=subjective_eval_cfg
        ))
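
For reference, the v1 template above asks the GPT-4 judge to rank two answers per question; the final judge prompt is produced by substituting {question}, {prediction}, and {prediction2} into gpt4_prompt. Below is a minimal, illustrative sketch of that substitution only: the sample data is made up and the fill logic is an assumption, not OpenCompass's internal implementation.

# Illustrative only: assembling the v1 judge prompt for one comparison.
# gpt4_prompt is the ranking template defined in the config above; the
# sample answers are invented for demonstration.
sample = {
    'question': 'Explain the difference between a list and a tuple in Python.',
    'prediction': 'A list is mutable; you can append or remove items.',      # model_1
    'prediction2': 'A tuple is immutable and can be used as a dict key.',    # model_2
}

judge_prompt = gpt4_prompt
for key, value in sample.items():
    # str.format() would trip over the literal JSON braces in the template,
    # so the three placeholders are filled with plain replace().
    judge_prompt = judge_prompt.replace('{' + key + '}', value)

print(judge_prompt)  # the full ranking request sent to the GPT-4 judge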
100 changes: 100 additions & 0 deletions configs/datasets/subjective/alpaca_eval/alpacav2_judgeby_gpt4.py
@@ -0,0 +1,100 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import SubjectiveCmpDataset
from mmengine.config import read_base

subjective_reader_cfg = dict(
    input_columns=['question'],
    output_column='judge',
)

subjective_all_sets = [
    "alpaca_eval",
]


subjective_datasets = []

gpt4_prompt = """
I require a leaderboard for various large language models. I'll provide you with prompts given to these models and their corresponding outputs. Your task is to assess these responses, and select the model that produces the best output from a human perspective.
## Instruction
{
"instruction": "{question}",
}
## Model Outputs
Here are the unordered outputs from the models. Each output is associated with a specific model, identified by a unique model identifier.
{
{
"model_identifier": "m",
"output": "{prediction}"
},
{
"model_identifier": "M",
"output": "{prediction2}"
}
}
## Task
Evaluate the models based on the quality and relevance of their outputs, and select the model that generated the best output. Answer by providing the model identifier of the best model. We will use your output as the name of the best model, so make sure your output only contains one of the following model identifiers and nothing else (no quotes, no spaces, no new lines, ...): m or M.
## Best Model Identifier
"""


for _name in subjective_all_sets:
    subjective_infer_cfg = dict(
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(round=[
                dict(
                    role='HUMAN',
                    prompt="{question}"
                ),
            ]),
        ),
        retriever=dict(type=ZeroRetriever),
        inferencer=dict(type=GenInferencer, max_out_len=4096),
    )

    subjective_eval_cfg = dict(
        evaluator=dict(
            type=LMEvaluator,
            infer_order='random',
            prompt_template=dict(
                type=PromptTemplate,
                template=dict(
                    begin=[
                        dict(
                            role='SYSTEM',
                            fallback_role='HUMAN',
                            prompt="You are a highly efficient assistant, who evaluates and selects the best large language model (LLMs) based on the quality of their responses to a given instruction. This process will be used to create a leaderboard reflecting the most accurate and human-preferred answers.")
                    ],
                    round=[
                        dict(
                            role='HUMAN',
                            prompt=gpt4_prompt
                        ),
                    ]),
            ),
        ),
        pred_role="BOT",
    )

    subjective_datasets.append(
        dict(
            abbr=f"{_name}",
            type=SubjectiveCmpDataset,
            path="./data/subjective/",
            name=_name,
            reader_cfg=subjective_reader_cfg,
            infer_cfg=subjective_infer_cfg,
            eval_cfg=subjective_eval_cfg
        ))
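
Unlike v1, the v2 template above asks the judge for a single model identifier ("m" or "M") rather than a ranking, where "m" labels {prediction} and "M" labels {prediction2}. The following is a hypothetical aggregation sketch, not the actual AlpacaSummarizer logic; with infer_order='random', the real pipeline must also undo the random swap of the two answers before counting.

# Hypothetical sketch, NOT the actual AlpacaSummarizer: turning v2 judge
# replies into a win rate for the model behind the 'm' identifier.
judge_outputs = ['m', 'M', 'm', 'm', 'M', 'm']  # one identifier per comparison

wins_m = sum(1 for out in judge_outputs if out.strip() == 'm')
win_rate_m = wins_m / len(judge_outputs)
print(f"win rate of the 'm' model: {win_rate_m:.1%}")  # 66.7%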
82 changes: 82 additions & 0 deletions configs/eval_subjective_alpacaeval.py
@@ -0,0 +1,82 @@
from mmengine.config import read_base
with read_base():
    from .models.qwen.hf_qwen_7b_chat import models as hf_qwen_7b_chat
    from .models.qwen.hf_qwen_14b_chat import models as hf_qwen_14b_chat
    from .models.chatglm.hf_chatglm3_6b import models as hf_chatglm3_6b
    from .models.baichuan.hf_baichuan2_7b_chat import models as hf_baichuan2_7b
    from .models.hf_internlm.hf_internlm_chat_7b import models as hf_internlm_chat_7b
    from .models.hf_internlm.hf_internlm_chat_20b import models as hf_internlm_chat_20b
    from .datasets.subjective.alpaca_eval.alpacav1_judgeby_gpt4 import subjective_datasets as alpacav1
    from .datasets.subjective.alpaca_eval.alpacav2_judgeby_gpt4 import subjective_datasets as alpacav2

datasets = [*alpacav2]

from opencompass.models import HuggingFaceCausalLM, HuggingFace, HuggingFaceChatGLM3
from opencompass.models.openai_api import OpenAI, OpenAIAllesAPIN
from opencompass.partitioners import NaivePartitioner, SizePartitioner
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
from opencompass.partitioners.sub_size import SubjectiveSizePartitioner
from opencompass.runners import LocalRunner
from opencompass.runners import SlurmSequentialRunner
from opencompass.tasks import OpenICLInferTask
from opencompass.tasks.subjective_eval import SubjectiveEvalTask
from opencompass.summarizers import AlpacaSummarizer

models = [*hf_qwen_7b_chat, *hf_chatglm3_6b]

api_meta_template = dict(
    round=[
        dict(role='HUMAN', api_role='HUMAN'),
        dict(role='BOT', api_role='BOT', generate=True)
    ],
    reserved_roles=[
        dict(role='SYSTEM', api_role='SYSTEM'),
    ],
)

infer = dict(
    partitioner=dict(type=NaivePartitioner),
    runner=dict(
        type=SlurmSequentialRunner,
        partition='llmeval',
        quotatype='auto',
        max_num_workers=256,
        task=dict(type=OpenICLInferTask)),
)

judge_model = dict(
    abbr='GPT4-Turbo',
    type=OpenAI, path='gpt-4-1106-preview',
    key='',  # If left empty, the key is read from $OPENAI_API_KEY; you can also hard-code it here
    meta_template=api_meta_template,
    query_per_second=1,
    max_out_len=1024,
    max_seq_len=4096,
    batch_size=2,
    retry=20,
    temperature=0,
)

eval = dict(
    partitioner=dict(
        type=SubjectiveSizePartitioner,
        max_task_size=1000,
        mode='m2n',
        base_models=[*hf_chatglm3_6b],
        compare_models=[*hf_qwen_7b_chat],
    ),
    runner=dict(
        type=SlurmSequentialRunner,
        partition='llmeval',
        quotatype='auto',
        max_num_workers=256,
        task=dict(
            type=SubjectiveEvalTask,
            judge_cfg=judge_model
        )),
)

work_dir = 'outputs/alpaca/'

summarizer = dict(
    type=AlpacaSummarizer, judge_type='v2'
)
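
Because judge_model leaves key='' and relies on $OPENAI_API_KEY, a quick environment check before launching can save a failed run. This is an optional convenience sketch, not part of the commit; the launch command assumes OpenCompass's usual python run.py entry point.

# Optional pre-flight check (not part of this commit): the judge config above
# expects the OPENAI_API_KEY environment variable when key='' is left empty.
import os

if not os.environ.get('OPENAI_API_KEY'):
    raise SystemExit(
        'Set OPENAI_API_KEY (or fill in judge_model["key"]) before running:\n'
        '    python run.py configs/eval_subjective_alpacaeval.py'
    )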
1 change: 1 addition & 0 deletions opencompass/summarizers/subjective/__init__.py
@@ -1,5 +1,6 @@
# flake8: noqa: F401, E501
from .alignmentbench import AlignmentBenchSummarizer
from .alpacaeval import AlpacaSummarizer
from .compass_arena import CompassArenaSummarizer
from .corev2 import Corev2Summarizer
from .creationbench import CreationBenchSummarizer