diff --git a/configs/datasets/needlebench/needlebench.py b/configs/datasets/needlebench/needlebench.py index 291a3dee4..09b978dd6 100644 --- a/configs/datasets/needlebench/needlebench.py +++ b/configs/datasets/needlebench/needlebench.py @@ -6,5 +6,6 @@ from .needlebench_32k.needlebench import needlebench_datasets as needlebench_datasets_32k from .needlebench_128k.needlebench import needlebench_datasets as needlebench_datasets_128k from .needlebench_200k.needlebench import needlebench_datasets as needlebench_datasets_200k + from .needlebench_1000k.needlebench import needlebench_datasets as needlebench_datasets_1000k needlebench_datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), []) diff --git a/configs/datasets/needlebench/needlebench_1000k/needlebench.py b/configs/datasets/needlebench/needlebench_1000k/needlebench.py new file mode 100644 index 000000000..b73abb1ff --- /dev/null +++ b/configs/datasets/needlebench/needlebench_1000k/needlebench.py @@ -0,0 +1,18 @@ +from mmengine.config import read_base + +with read_base(): + from .needlebench_multi_reasoning import needlebench_datasets_2needle_en as needlebench_multi_2needle_en_datasets + from .needlebench_multi_reasoning import needlebench_datasets_3needle_en as needlebench_multi_3needle_en_datasets + from .needlebench_multi_reasoning import needlebench_datasets_4needle_en as needlebench_multi_4needle_en_datasets + from .needlebench_multi_reasoning import needlebench_datasets_5needle_en as needlebench_multi_5needle_en_datasets + from .needlebench_multi_reasoning import needlebench_datasets_2needle_zh as needlebench_multi_2needle_zh_datasets + from .needlebench_multi_reasoning import needlebench_datasets_3needle_zh as needlebench_multi_3needle_zh_datasets + from .needlebench_multi_reasoning import needlebench_datasets_4needle_zh as needlebench_multi_4needle_zh_datasets + from .needlebench_multi_reasoning import needlebench_datasets_5needle_zh as needlebench_multi_5needle_zh_datasets + + from .needlebench_single import needlebench_datasets_en as needlebench_origin_en_datasets + from .needlebench_single import needlebench_datasets_zh as needlebench_origin_zh_datasets + from .needlebench_multi_retrieval import needlebench_datasets_en as needlebench_parallel_en_datasets + from .needlebench_multi_retrieval import needlebench_datasets_zh as needlebench_parallel_zh_datasets + +needlebench_datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), []) diff --git a/configs/datasets/needlebench/needlebench_1000k/needlebench_multi_reasoning.py b/configs/datasets/needlebench/needlebench_1000k/needlebench_multi_reasoning.py new file mode 100644 index 000000000..80402bdd4 --- /dev/null +++ b/configs/datasets/needlebench/needlebench_1000k/needlebench_multi_reasoning.py @@ -0,0 +1,286 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets.needlebench.multi import NeedleBenchMultiDataset +from opencompass.datasets.needlebench.multi import NeedleBenchMultiEvaluator +from opencompass.datasets.needlebench.origin import needlebench_postprocess +from opencompass.datasets.needlebench.origin import needlebench_dataset_postprocess +import math + + +def logistic(x, L=100, x0=50, k=0.1): + return round(L / (1 + math.exp(-k * (x - x0))), 3) + + +def generate_linear_space(start, end, num): + if num == 1: + return [start] + elif num < 1: + raise ValueError("num must 
be at least 1.") + step = (end - start) / (num - 1) + return [start + step * i for i in range(num)] + + +def generate_depth_percents(intervals, interval_type): + if interval_type == 'linear': + return generate_linear_space(0, 100, intervals) + elif interval_type == 'sigmoid': + linear_space = generate_linear_space(0, 100, intervals) + return [logistic(x) for x in linear_space] + else: + raise ValueError('Unsupported interval type') + + +needlebench_reader_cfg = dict(input_columns=['prompt'], output_column='answer') + +needlebench_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt='{prompt}'), + dict(role='BOT', prompt='{answer}\n'), + ] + ) + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer)) + +needlebench_eval_cfg = dict( + evaluator=dict(type=NeedleBenchMultiEvaluator), + pred_postprocessor=dict(type=needlebench_postprocess), + dataset_postprocessor=dict(type=needlebench_dataset_postprocess), + pred_role='BOT') + +context_lengths = [20000, 160000, 300000, 440000, 580000, 720000, 860000, 1000000] +depths_list = [0, 10, 21, 31, 42, 52, 63, 73, 84, 94, 100] + +# ----------English Version---------- +base_path = './data/needlebench' +file_list = ['PaulGrahamEssays.jsonl'] + +needle_file_name = 'multi_needle_reasoning_en.json' +diff = 10 +num_needles = 2 +needlebench_datasets_2needle_en = [] +language = 'English' + +for original_context_length in context_lengths: + for depth_percent in depths_list: + dataset_dict = { + 'abbr': f'Length{original_context_length}' + f'Depth{int(depth_percent)}_{num_needles}needle_en_1000k', + 'type': NeedleBenchMultiDataset, + 'path': base_path, + 'length': original_context_length, + 'depth': int(depth_percent), + 'tokenizer_model': 'gpt-4', + 'file_list': file_list, + 'num_repeats_per_file': 10, + 'length_buffer': 600, + 'guide': True, + 'language': language, + 'needle_file_name': needle_file_name, + 'num_needles': num_needles, + 'diff': diff, + 'reader_cfg': needlebench_reader_cfg, + 'infer_cfg': needlebench_infer_cfg, + 'eval_cfg': needlebench_eval_cfg + } + needlebench_datasets_2needle_en.append(dataset_dict) + +num_needles = 3 +needlebench_datasets_3needle_en = [] + +for original_context_length in context_lengths: + for depth_percent in depths_list: + dataset_dict = { + 'abbr': f'Length{original_context_length}' + f'Depth{int(depth_percent)}_{num_needles}needle_en_1000k', + 'type': NeedleBenchMultiDataset, + 'path': base_path, + 'length': original_context_length, + 'depth': int(depth_percent), + 'tokenizer_model': 'gpt-4', + 'file_list': file_list, + 'num_repeats_per_file': 10, + 'length_buffer': 600, + 'guide': True, + 'language': language, + 'needle_file_name': needle_file_name, + 'num_needles': num_needles, + 'diff': diff, + 'reader_cfg': needlebench_reader_cfg, + 'infer_cfg': needlebench_infer_cfg, + 'eval_cfg': needlebench_eval_cfg + } + needlebench_datasets_3needle_en.append(dataset_dict) + +num_needles = 4 +needlebench_datasets_4needle_en = [] + +for original_context_length in context_lengths: + for depth_percent in depths_list: + dataset_dict = { + 'abbr': f'Length{original_context_length}' + f'Depth{int(depth_percent)}_{num_needles}needle_en_1000k', + 'type': NeedleBenchMultiDataset, + 'path': base_path, + 'length': original_context_length, + 'depth': int(depth_percent), + 'tokenizer_model': 'gpt-4', + 'file_list': file_list, + 'num_repeats_per_file': 10, + 'length_buffer': 600, + 'guide': True, + 'language': language, + 'needle_file_name': 
needle_file_name, + 'num_needles': num_needles, + 'diff': diff, + 'reader_cfg': needlebench_reader_cfg, + 'infer_cfg': needlebench_infer_cfg, + 'eval_cfg': needlebench_eval_cfg + } + needlebench_datasets_4needle_en.append(dataset_dict) + +num_needles = 5 +needlebench_datasets_5needle_en = [] + +for original_context_length in context_lengths: + for depth_percent in depths_list: + dataset_dict = { + 'abbr': f'Length{original_context_length}' + f'Depth{int(depth_percent)}_{num_needles}needle_en_1000k', + 'type': NeedleBenchMultiDataset, + 'path': base_path, + 'length': original_context_length, + 'depth': int(depth_percent), + 'tokenizer_model': 'gpt-4', + 'file_list': file_list, + 'num_repeats_per_file': 10, + 'length_buffer': 600, + 'guide': True, + 'language': language, + 'needle_file_name': needle_file_name, + 'num_needles': num_needles, + 'diff': diff, + 'reader_cfg': needlebench_reader_cfg, + 'infer_cfg': needlebench_infer_cfg, + 'eval_cfg': needlebench_eval_cfg + } + needlebench_datasets_5needle_en.append(dataset_dict) + +# ----------Chinese Version---------- +base_path = './data/needlebench' +file_list = ['zh_finance.jsonl'] + +needle_file_name = 'multi_needle_reasoning_zh.json' +diff = 10 +num_needles = 2 +needlebench_datasets_2needle_zh = [] +language = 'Chinese' + +for original_context_length in context_lengths: + for depth_percent in depths_list: + dataset_dict = { + 'abbr': f'Length{original_context_length}' + f'Depth{int(depth_percent)}_{num_needles}needle_zh_1000k', + 'type': NeedleBenchMultiDataset, + 'path': base_path, + 'length': original_context_length, + 'depth': int(depth_percent), + 'tokenizer_model': 'gpt-4', + 'file_list': file_list, + 'num_repeats_per_file': 10, + 'length_buffer': 200, + 'guide': True, + 'language': language, + 'needle_file_name': needle_file_name, + 'num_needles': num_needles, + 'diff': diff, + 'reader_cfg': needlebench_reader_cfg, + 'infer_cfg': needlebench_infer_cfg, + 'eval_cfg': needlebench_eval_cfg + } + needlebench_datasets_2needle_zh.append(dataset_dict) + +num_needles = 3 +needlebench_datasets_3needle_zh = [] + +for original_context_length in context_lengths: + for depth_percent in depths_list: + dataset_dict = { + 'abbr': f'Length{original_context_length}' + f'Depth{int(depth_percent)}_{num_needles}needle_zh_1000k', + 'type': NeedleBenchMultiDataset, + 'path': base_path, + 'length': original_context_length, + 'depth': int(depth_percent), + 'tokenizer_model': 'gpt-4', + 'file_list': file_list, + 'num_repeats_per_file': 10, + 'length_buffer': 200, + 'guide': True, + 'language': language, + 'needle_file_name': needle_file_name, + 'num_needles': num_needles, + 'diff': diff, + 'reader_cfg': needlebench_reader_cfg, + 'infer_cfg': needlebench_infer_cfg, + 'eval_cfg': needlebench_eval_cfg + } + needlebench_datasets_3needle_zh.append(dataset_dict) + +num_needles = 4 +needlebench_datasets_4needle_zh = [] + +for original_context_length in context_lengths: + for depth_percent in depths_list: + dataset_dict = { + 'abbr': f'Length{original_context_length}' + f'Depth{int(depth_percent)}_{num_needles}needle_zh_1000k', + 'type': NeedleBenchMultiDataset, + 'path': base_path, + 'length': original_context_length, + 'depth': int(depth_percent), + 'tokenizer_model': 'gpt-4', + 'file_list': file_list, + 'num_repeats_per_file': 10, + 'length_buffer': 200, + 'guide': True, + 'language': language, + 'needle_file_name': needle_file_name, + 'num_needles': num_needles, + 'diff': diff, + 'reader_cfg': needlebench_reader_cfg, + 'infer_cfg': needlebench_infer_cfg, + 
'eval_cfg': needlebench_eval_cfg + } + needlebench_datasets_4needle_zh.append(dataset_dict) + +num_needles = 5 +needlebench_datasets_5needle_zh = [] + +for original_context_length in context_lengths: + for depth_percent in depths_list: + dataset_dict = { + 'abbr': f'Length{original_context_length}' + f'Depth{int(depth_percent)}_{num_needles}needle_zh_1000k', + 'type': NeedleBenchMultiDataset, + 'path': base_path, + 'length': original_context_length, + 'depth': int(depth_percent), + 'tokenizer_model': 'gpt-4', + 'file_list': file_list, + 'num_repeats_per_file': 10, + 'length_buffer': 200, + 'guide': True, + 'language': language, + 'needle_file_name': needle_file_name, + 'num_needles': num_needles, + 'diff': diff, + 'reader_cfg': needlebench_reader_cfg, + 'infer_cfg': needlebench_infer_cfg, + 'eval_cfg': needlebench_eval_cfg + } + needlebench_datasets_5needle_zh.append(dataset_dict) diff --git a/configs/datasets/needlebench/needlebench_1000k/needlebench_multi_retrieval.py b/configs/datasets/needlebench/needlebench_1000k/needlebench_multi_retrieval.py new file mode 100644 index 000000000..993e4f7c6 --- /dev/null +++ b/configs/datasets/needlebench/needlebench_1000k/needlebench_multi_retrieval.py @@ -0,0 +1,108 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets.needlebench.parallel import NeedleBenchParallelDataset +from opencompass.datasets.needlebench.parallel import NeedleBenchParallelEvaluator +from opencompass.datasets.needlebench.origin import needlebench_postprocess +from opencompass.datasets.needlebench.origin import needlebench_dataset_postprocess +import math + + +def logistic(x, L=100, x0=50, k=0.1): + return round(L / (1 + math.exp(-k * (x - x0))), 3) + + +def generate_linear_space(start, end, num): + if num == 1: + return [start] + elif num < 1: + raise ValueError("num must be at least 1.") + step = (end - start) / (num - 1) + return [start + step * i for i in range(num)] + + +def generate_depth_percents(intervals, interval_type): + if interval_type == 'linear': + return generate_linear_space(0, 100, intervals) + elif interval_type == 'sigmoid': + linear_space = generate_linear_space(0, 100, intervals) + return [logistic(x) for x in linear_space] + else: + raise ValueError('Unsupported interval type') + + +needlebench_reader_cfg = dict(input_columns=['prompt'], output_column='answer') + +needlebench_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt='{prompt}'), + dict(role='BOT', prompt='{answer}\n'), + ] + ) + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer)) + +needlebench_eval_cfg = dict( + evaluator=dict(type=NeedleBenchParallelEvaluator), + pred_postprocessor=dict(type=needlebench_postprocess), + dataset_postprocessor=dict(type=needlebench_dataset_postprocess), + pred_role='BOT') + +context_lengths = list([20000, 160000, 300000, 440000, 580000, 720000, 860000, 1000000]) +document_depth_percent_intervals = 20 +document_depth_percent_interval_type = "linear" + +base_path = './data/needlebench' +file_list = ['PaulGrahamEssays.jsonl'] +needlebench_datasets_en = [] +needle_file_name = 'needles.jsonl' +depths = [0, 10, 21, 31, 42, 52, 63, 73, 84, 94, 100] + +for original_context_length in context_lengths: + dataset_dict = { + 'abbr': f'Length{original_context_length}' + f'_parallel_en_1000k', + 'type': 
NeedleBenchParallelDataset, + 'path': base_path, + 'needle_file_name': needle_file_name, + 'length': original_context_length, + 'depths': depths, + 'tokenizer_model': 'gpt-4', + 'file_list': file_list, + 'num_repeats_per_file': 25, + 'length_buffer': 3000, + 'guide': True, + 'language': 'English', + 'reader_cfg': needlebench_reader_cfg, + 'infer_cfg': needlebench_infer_cfg, + 'eval_cfg': needlebench_eval_cfg + } + needlebench_datasets_en.append(dataset_dict) + +file_list = ['zh_finance.jsonl'] +needlebench_datasets_zh = [] + +for original_context_length in context_lengths: + dataset_dict = { + 'abbr': f'Length{original_context_length}' + f'_parallel_zh_1000k', + 'type': NeedleBenchParallelDataset, + 'path': base_path, + 'needle_file_name': needle_file_name, + 'length': original_context_length, + 'depths': depths, + 'tokenizer_model': 'gpt-4', + 'file_list': file_list, + 'num_repeats_per_file': 25, + 'length_buffer': 200, + 'guide': True, + 'language': 'Chinese', + 'reader_cfg': needlebench_reader_cfg, + 'infer_cfg': needlebench_infer_cfg, + 'eval_cfg': needlebench_eval_cfg + } + needlebench_datasets_zh.append(dataset_dict) diff --git a/configs/datasets/needlebench/needlebench_1000k/needlebench_single.py b/configs/datasets/needlebench/needlebench_1000k/needlebench_single.py new file mode 100644 index 000000000..5a41275e6 --- /dev/null +++ b/configs/datasets/needlebench/needlebench_1000k/needlebench_single.py @@ -0,0 +1,109 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets.needlebench.origin import NeedleBenchOriginDataset +from opencompass.datasets.needlebench.origin import NeedleBenchOriginEvaluator +from opencompass.datasets.needlebench.origin import needlebench_postprocess +from opencompass.datasets.needlebench.origin import needlebench_dataset_postprocess +import math + + +def logistic(x, L=100, x0=50, k=0.1): + return round(L / (1 + math.exp(-k * (x - x0))), 3) + + +def generate_linear_space(start, end, num): + if num == 1: + return [start] + elif num < 1: + raise ValueError("num must be at least 1.") + step = (end - start) / (num - 1) + return [start + step * i for i in range(num)] + + +def generate_depth_percents(intervals, interval_type): + if interval_type == 'linear': + return generate_linear_space(0, 100, intervals) + elif interval_type == 'sigmoid': + linear_space = generate_linear_space(0, 100, intervals) + return [logistic(x) for x in linear_space] + else: + raise ValueError('Unsupported interval type') + + +needlebench_reader_cfg = dict(input_columns=['prompt'], output_column='answer') + +needlebench_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt='{prompt}'), + dict(role='BOT', prompt='{answer}\n'), + ] + ) + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer)) + +needlebench_eval_cfg = dict( + evaluator=dict(type=NeedleBenchOriginEvaluator), + pred_postprocessor=dict(type=needlebench_postprocess), + dataset_postprocessor=dict(type=needlebench_dataset_postprocess), + pred_role='BOT') + +context_lengths = [20000, 160000, 300000, 440000, 580000, 720000, 860000, 1000000] +depths_list = [0, 10, 21, 31, 42, 52, 63, 73, 84, 94, 100] + +base_path = './data/needlebench' +file_list = ['PaulGrahamEssays.jsonl'] +needlebench_datasets_en = [] +needle_file_name = 'needles.jsonl' + +for original_context_length in 
context_lengths: + for depth_percent in depths_list: + dataset_dict = { + 'abbr': f'Length{original_context_length}' + f'Depth{int(depth_percent)}_origin_en_1000k', + 'type': NeedleBenchOriginDataset, + 'path': base_path, + 'length': original_context_length, + 'depth': int(depth_percent), + 'tokenizer_model': 'gpt-4', + 'file_list': file_list, + 'num_repeats_per_file': 10, + 'length_buffer': 600, + 'guide': True, + 'language': 'English', + 'needle_file_name': needle_file_name, + 'reader_cfg': needlebench_reader_cfg, + 'infer_cfg': needlebench_infer_cfg, + 'eval_cfg': needlebench_eval_cfg + } + needlebench_datasets_en.append(dataset_dict) + +file_list = ['zh_finance.jsonl'] +needlebench_datasets_zh = [] +needle_file_name = 'needles.jsonl' + +for original_context_length in context_lengths: + for depth_percent in depths_list: + dataset_dict = { + 'abbr': f'Length{original_context_length}' + f'Depth{int(depth_percent)}_origin_zh_1000k', + 'type': NeedleBenchOriginDataset, + 'path': base_path, + 'length': original_context_length, + 'depth': int(depth_percent), + 'tokenizer_model': 'gpt-4', + 'file_list': file_list, + 'num_repeats_per_file': 10, + 'length_buffer': 200, + 'guide': True, + 'language': 'Chinese', + 'needle_file_name': needle_file_name, + 'reader_cfg': needlebench_reader_cfg, + 'infer_cfg': needlebench_infer_cfg, + 'eval_cfg': needlebench_eval_cfg + } + needlebench_datasets_zh.append(dataset_dict) diff --git a/configs/datasets/needlebench/readme.md b/configs/datasets/needlebench/readme.md index ea0821325..763c1ef58 100644 --- a/configs/datasets/needlebench/readme.md +++ b/configs/datasets/needlebench/readme.md @@ -16,6 +16,7 @@ configs/datasets/needlebench/ ├── needlebench_32k ├── needlebench_128k ├── needlebench_200k +├── needlebench_1000k ├── needlebench.py ├── readme.md └── readme_zh-CN.md diff --git a/configs/datasets/needlebench/readme_zh-CN.md b/configs/datasets/needlebench/readme_zh-CN.md index d8a95cc9d..b27160251 100644 --- a/configs/datasets/needlebench/readme_zh-CN.md +++ b/configs/datasets/needlebench/readme_zh-CN.md @@ -16,6 +16,7 @@ configs/datasets/needlebench/ ├── needlebench_32k ├── needlebench_128k ├── needlebench_200k +├── needlebench_1000k ├── needlebench.py ├── readme.md └── readme_zh-CN.md diff --git a/configs/models/hf_internlm/lmdeploy_internlm2_chat_20b.py b/configs/models/hf_internlm/lmdeploy_internlm2_chat_20b.py new file mode 100644 index 000000000..77e9c12f6 --- /dev/null +++ b/configs/models/hf_internlm/lmdeploy_internlm2_chat_20b.py @@ -0,0 +1,33 @@ +from opencompass.models.turbomind import TurboMindModel + + +_meta_template = dict( + round=[ + dict(role='HUMAN', begin='<|im_start|>user\n', end='<|im_end|>\n'), + dict(role='BOT', begin='<|im_start|>assistant\n', end='<|im_end|>\n', + generate=True), + ], + eos_token_id=92542 +) + +models = [ + dict( + type=TurboMindModel, + abbr='internlm2-chat-20b-turbomind', + path="internlm/internlm2-chat-20b", + meta_template=_meta_template, + engine_config=dict(session_len=210000, + max_batch_size=8, + rope_scaling_factor=3.0, + model_name="internlm2-chat-20b", + tp=2), + gen_config=dict(top_k=1, top_p=0.8, + temperature=1.0, + max_new_tokens=2000,), + max_out_len=2000, + max_seq_len=210000, + batch_size=1, + concurrency=8, + run_cfg=dict(num_gpus=2, num_procs=1), + ) +] diff --git a/configs/models/hf_internlm/lmdeploy_internlm2_chat_7b.py b/configs/models/hf_internlm/lmdeploy_internlm2_chat_7b.py new file mode 100644 index 000000000..6f5e7f919 --- /dev/null +++ 
b/configs/models/hf_internlm/lmdeploy_internlm2_chat_7b.py @@ -0,0 +1,32 @@ +from opencompass.models.turbomind import TurboMindModel + + +_meta_template = dict( + round=[ + dict(role='HUMAN', begin='<|im_start|>user\n', end='<|im_end|>\n'), + dict(role='BOT', begin='<|im_start|>assistant\n', end='<|im_end|>\n', + generate=True), + ], + eos_token_id=92542 +) + +models = [ + dict( + type=TurboMindModel, + abbr='internlm2-chat-7b-turbomind', + path="internlm/internlm2-chat-7b", + meta_template=_meta_template, + engine_config=dict(session_len=210000, + max_batch_size=8, + rope_scaling_factor=2.0, + model_name="internlm2-chat-7b"), + gen_config=dict(top_k=1, top_p=0.8, + temperature=1.0, + max_new_tokens=2000), + max_out_len=2000, + max_seq_len=210000, + batch_size=8, + concurrency=8, + run_cfg=dict(num_gpus=1, num_procs=1), + ) +] diff --git a/configs/models/qwen/vllm_qwen_14b_chat.py b/configs/models/qwen/vllm_qwen_14b_chat.py new file mode 100644 index 000000000..589aa5e05 --- /dev/null +++ b/configs/models/qwen/vllm_qwen_14b_chat.py @@ -0,0 +1,25 @@ +from opencompass.models import VLLM + + +_meta_template = dict( + round=[ + dict(role="HUMAN", begin='\n<|im_start|>user\n', end='<|im_end|>'), + dict(role="BOT", begin="\n<|im_start|>assistant\n", end='<|im_end|>', generate=True), + ], +) + +models = [ + dict( + type=VLLM, + abbr='qwen-14b-chat-vllm', + path="Qwen/Qwen-14B-Chat", + model_kwargs=dict(tensor_parallel_size=4), + meta_template=_meta_template, + max_out_len=100, + max_seq_len=2048, + batch_size=32, + generation_kwargs=dict(temperature=0), + end_str='<|im_end|>', + run_cfg=dict(num_gpus=4, num_procs=1), + ) +] diff --git a/configs/summarizers/needlebench.py b/configs/summarizers/needlebench.py index f31d54c27..27a71702f 100644 --- a/configs/summarizers/needlebench.py +++ b/configs/summarizers/needlebench.py @@ -539,8 +539,114 @@ ], summary_groups=needlebench_summary_groups, ) -context_lengths_8k = list(range(5000, 9000, 1000)) +# ----------NeedleBench-1000k-summarizer---------- + +context_lengths_1000k = list([20000, 160000, 300000, 440000, 580000, 720000, 860000, 1000000]) +# Initialize the lists +_needlebench_1000k_2needle_en = [] +_needlebench_1000k_3needle_en = [] +_needlebench_1000k_4needle_en = [] +_needlebench_1000k_5needle_en = [] +_needlebench_1000k_2needle_zh = [] +_needlebench_1000k_3needle_zh = [] +_needlebench_1000k_4needle_zh = [] +_needlebench_1000k_5needle_zh = [] +_needlebench_1000k_origin_en = [] +_needlebench_1000k_origin_zh = [] + +# Fill the lists using nested loops +for original_context_length in context_lengths_1000k: + for depth_percent in depths_list_sparse: + _needlebench_1000k_2needle_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_2needle_en_1000k') + _needlebench_1000k_3needle_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_3needle_en_1000k') + _needlebench_1000k_4needle_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_4needle_en_1000k') + _needlebench_1000k_5needle_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_5needle_en_1000k') + _needlebench_1000k_2needle_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_2needle_zh_1000k') + _needlebench_1000k_3needle_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_3needle_zh_1000k') + _needlebench_1000k_4needle_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_4needle_zh_1000k') + 
_needlebench_1000k_5needle_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_5needle_zh_1000k') + + _needlebench_1000k_origin_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_origin_en_1000k') + _needlebench_1000k_origin_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_origin_zh_1000k') + +# Concatenate the multi-needle and origin lists +_needlebench_1000k_multi_needle_en = _needlebench_1000k_2needle_en + _needlebench_1000k_3needle_en + _needlebench_1000k_4needle_en + _needlebench_1000k_5needle_en +_needlebench_1000k_multi_needle_zh = _needlebench_1000k_2needle_zh + _needlebench_1000k_3needle_zh + _needlebench_1000k_4needle_zh + _needlebench_1000k_5needle_zh +_needlebench_1000k_origin = _needlebench_1000k_origin_en + _needlebench_1000k_origin_zh +_needlebench_1000k_multi_needle = _needlebench_1000k_multi_needle_en + _needlebench_1000k_multi_needle_zh + +# Repeating the same process for parallel (assuming it's similar to origin_en) +_needlebench_1000k_parallel_en = [] +_needlebench_1000k_parallel_zh = [] +for original_context_length in context_lengths_1000k: + _needlebench_1000k_parallel_en.append(f'Length{original_context_length}_parallel_en_1000k') +for original_context_length in context_lengths_1000k: + _needlebench_1000k_parallel_zh.append(f'Length{original_context_length}_parallel_zh_1000k') +_needlebench_1000k_parallel = _needlebench_1000k_parallel_en + _needlebench_1000k_parallel_zh + +needlebench_summary_groups = [ + {'name': 'original_version', 'subsets': _needlebench_1000k_origin}, + {'name': 'original_version_zh', 'subsets': _needlebench_1000k_origin_zh}, + {'name': 'original_version_en', 'subsets': _needlebench_1000k_origin_en}, + + {'name': 'multi_needle_en', 'subsets': _needlebench_1000k_multi_needle_en}, + {'name': 'multi_needle2_en', 'subsets': _needlebench_1000k_2needle_en}, + {'name': 'multi_needle3_en', 'subsets': _needlebench_1000k_3needle_en}, + {'name': 'multi_needle4_en', 'subsets': _needlebench_1000k_4needle_en}, + {'name': 'multi_needle5_en', 'subsets': _needlebench_1000k_5needle_en}, + + {'name': 'multi_needle_zh', 'subsets': _needlebench_1000k_multi_needle_zh}, + {'name': 'multi_needle2_zh', 'subsets': _needlebench_1000k_2needle_zh}, + {'name': 'multi_needle3_zh', 'subsets': _needlebench_1000k_3needle_zh}, + {'name': 'multi_needle4_zh', 'subsets': _needlebench_1000k_4needle_zh}, + {'name': 'multi_needle5_zh', 'subsets': _needlebench_1000k_5needle_zh}, + + {'name': 'multi_needle', 'subsets': _needlebench_1000k_multi_needle}, + + {'name': 'parallel_version', 'subsets': _needlebench_1000k_parallel}, + {'name': 'parallel_version_zh', 'subsets': _needlebench_1000k_parallel_zh}, + {'name': 'parallel_version_en', 'subsets': _needlebench_1000k_parallel_en}, + + {'name': 'overall', + 'subsets': [['original_version', 'naive_average'], + ['multi_needle', 'naive_average'], + ['parallel_version', 'average_score']], + 'weights': {'original_version': 0.4, + 'multi_needle': 0.3, + 'parallel_version': 0.3}}, +] +needlebench_1000k_summarizer = dict( + type=NeedleBenchSummarizer, + dataset_abbrs=[ + 'overall', + '--------- NeedleBench-1000k Single-Needle ---------', # category + 'original_version', + 'original_version_zh', + 'original_version_en', + '--------- NeedleBench-1000k Parallel-Needles ---------', # category + 'parallel_version', + 'parallel_version_zh', + 'parallel_version_en', + '--------- NeedleBench-1000k Multi-Needles ---------', # category + 'multi_needle', + 'multi_needle_en', + 'multi_needle_zh', + 
'multi_needle2_en', + 'multi_needle3_en', + 'multi_needle4_en', + 'multi_needle5_en', + 'multi_needle2_zh', + 'multi_needle3_zh', + 'multi_needle4_zh', + 'multi_needle5_zh', + + # *_needlebench_1000k_origin, *_needlebench_1000k_multi_needle, *_needlebench_1000k_parallel, + ], + summary_groups=needlebench_summary_groups, +) + +context_lengths_8k = list(range(5000, 9000, 1000)) # Repeating the same process for parallel (assuming it's similar to origin_en) _needlebench_8k_parallel_en_batch1 = [] _needlebench_8k_parallel_en_batch5 = [] diff --git a/opencompass/datasets/needlebench/multi.py b/opencompass/datasets/needlebench/multi.py index 3a2d11575..6f1c48996 100644 --- a/opencompass/datasets/needlebench/multi.py +++ b/opencompass/datasets/needlebench/multi.py @@ -48,6 +48,7 @@ def load( needle_file_name: str, num_needles: int, diff: int, + position: str = 'End', ): data = {'prompt': [], 'answer': []} tokenizer = tiktoken.encoding_for_model(tokenizer_model) @@ -109,19 +110,42 @@ def _generate_prompt(context, retrieval_question): retrieval_question) if language == 'Chinese': - prompt = ('你是一个善于回答用户问题的智能AI助手\n' - '请保持你的回答简洁清楚。不要说和下面文档中的无关的话' - ',或重复你的回答\n' - f'用户现在给你的文档是{context}\n\n' - f'现在请问:{retrieval_question}') + if position == 'End': + prompt = ('你是一个善于回答用户问题的智能AI助手\n' + '请保持你的回答简洁清楚。不要说和下面文档中的无关的话' + ',或重复你的回答\n' + f'用户现在给你的文档是{context}\n\n' + f'现在请问:{retrieval_question}') + elif position == 'Start': + prompt = ('你是一个善于回答用户问题的智能AI助手\n' + '请保持你的回答简洁清楚。不要说和下面文档中的无关的话' + ',或重复你的回答\n' + f'现在请问:{retrieval_question}', + f'用户现在给你的文档是{context}\n\n') + else: + raise ValueError('Unsupported position. ' + 'Position must be "End" or "Start".') elif language == 'English': - prompt = ('You are an intelligent AI assistant skilled in ' - 'answering user questions.\n' - 'Please keep your answers concise and clear. Do not' - ' talk about irrelevant topics or repeat your ' - 'answers.\n' - f'The document given to you by the user is {context}' - f'\n\nNow, the question is: {retrieval_question}') + if position == 'End': + prompt = ('You are an intelligent AI assistant skilled in ' + 'answering user questions.\n' + 'Please keep your answers concise and clear. Do ' + 'not talk about irrelevant topics or repeat ' + 'your answers.\nThe document ' + f'given to you by the user is {context}\n\n' + f'Now, the question is: {retrieval_question}') + elif position == 'Start': + prompt = ('You are an intelligent AI assistant skilled in ' + 'answering user questions.\n' + 'Please keep your answers concise and clear. Do ' + 'not talk about irrelevant topics or repeat ' + 'your answers.\n' + f'Now, the question is: {retrieval_question}' + 'The document given to you by the user' + f' is {context}\n\n') + else: + raise ValueError('Unsupported position. 
' + 'Position must be "End" or "Start".') else: raise ValueError(f"Language '{language}' is not supported.") diff --git a/opencompass/datasets/needlebench/origin.py b/opencompass/datasets/needlebench/origin.py index 561de1ba8..2848dc3a6 100644 --- a/opencompass/datasets/needlebench/origin.py +++ b/opencompass/datasets/needlebench/origin.py @@ -45,6 +45,7 @@ def load( guide: bool, language: str, needle_file_name: str, + position: str = 'End', ): data = {'prompt': [], 'answer': []} tokenizer = tiktoken.encoding_for_model(tokenizer_model) @@ -85,19 +86,42 @@ def _generate_prompt(context, retrieval_question): retrieval_question) if language == 'Chinese': - prompt = ('你是一个善于回答用户问题的智能AI助手\n' - '请保持你的回答简洁清楚。不要说和下面文档中的无关的话' - ',或重复你的回答\n' - f'用户现在给你的文档是{context}\n\n' - f'现在请问:{retrieval_question}') + if position == 'End': + prompt = ('你是一个善于回答用户问题的智能AI助手\n' + '请保持你的回答简洁清楚。不要说和下面文档中的无关的话' + ',或重复你的回答\n' + f'用户现在给你的文档是{context}\n\n' + f'现在请问:{retrieval_question}') + elif position == 'Start': + prompt = ('你是一个善于回答用户问题的智能AI助手\n' + '请保持你的回答简洁清楚。不要说和下面文档中的无关的话' + ',或重复你的回答\n' + f'现在请问:{retrieval_question}', + f'用户现在给你的文档是{context}\n\n') + else: + raise ValueError('Unsupported position. ' + 'Position must be "End" or "Start".') elif language == 'English': - prompt = ('You are an intelligent AI assistant skilled in ' - 'answering user questions.\n' - 'Please keep your answers concise and clear. Do not' - ' talk about irrelevant topics or repeat your ' - 'answers.\n' - f'The document given to you by the user is {context}' - f'\n\nNow, the question is: {retrieval_question}') + if position == 'End': + prompt = ('You are an intelligent AI assistant skilled in ' + 'answering user questions.\n' + 'Please keep your answers concise and clear. Do ' + 'not talk about irrelevant topics or repeat ' + 'your answers.\nThe document ' + f'given to you by the user is {context}\n\n' + f'Now, the question is: {retrieval_question}') + elif position == 'Start': + prompt = ('You are an intelligent AI assistant skilled in ' + 'answering user questions.\n' + 'Please keep your answers concise and clear. Do ' + 'not talk about irrelevant topics or repeat ' + 'your answers.\n' + f'Now, the question is: {retrieval_question}' + 'The document given to you by the user' + f' is {context}\n\n') + else: + raise ValueError('Unsupported position. 
' + 'Position must be "End" or "Start".') else: raise ValueError(f"Language '{language}' is not supported.") diff --git a/opencompass/datasets/needlebench/parallel.py b/opencompass/datasets/needlebench/parallel.py index 6133a3ca4..a17073d1d 100644 --- a/opencompass/datasets/needlebench/parallel.py +++ b/opencompass/datasets/needlebench/parallel.py @@ -67,6 +67,7 @@ def load( length_buffer: int, guide: bool, language: str, + position: str = 'End', ): data = {'prompt': [], 'answer': []} tokenizer = tiktoken.encoding_for_model(tokenizer_model) @@ -134,20 +135,44 @@ def _generate_prompt(context, retrieval_question): retrieval_question) if language == 'Chinese': - prompt = ('你是一个善于回答用户问题的智能AI助手\n' - '请保持你的回答简洁清楚。不要说和下面文档中的无关的话' - ',或重复你的回答\n请先仔细阅读下面的文档再依次回答' - f'最后提出的问题\n用户现在给你的文档是{context}\n\n' - f'现在请问:{retrieval_question}\n') + if position == 'End': + prompt = ('你是一个善于回答用户问题的智能AI助手\n' + '请保持你的回答简洁清楚。不要说和下面文档中的无关的话' + ',或重复你的回答\n请先仔细阅读下面的文档再依次回答' + f'最后提出的问题\n用户现在给你的文档是{context}\n\n' + f'现在请问:{retrieval_question}\n') + elif position == 'Start': + prompt = ('你是一个善于回答用户问题的智能AI助手\n' + '请保持你的回答简洁清楚。不要说和下面文档中的无关的话' + ',或重复你的回答\n请先仔细阅读下面的文档再依次回答' + f'最后提出的问题\n现在请问:{retrieval_question}\n\n' + f'用户现在给你的文档是{context}\n') + else: + raise ValueError('Unsupported position. ' + 'Position must be "End" or "Start".') + elif language == 'English': - prompt = ( - 'You are an intelligent AI assistant skilled in ' - 'answering user questions.\n' - 'Please keep your answers concise and clear. Do not' - ' talk about irrelevant topics or repeat your ' - 'answers.\n' - f'The document given to you by the user is {context}' - f'\n\nNow, the questions are: {retrieval_question}\n') + if position == 'End': + prompt = ( + 'You are an intelligent AI assistant skilled in ' + 'answering user questions.\n' + 'Please keep your answers concise and clear. Do not' + ' talk about irrelevant topics or repeat your ' + 'answers.\n' + f'The document given to you by the user is {context}' + f'\n\nNow, the questions are: {retrieval_question}\n') + elif position == 'Start': + prompt = ( + 'You are an intelligent AI assistant skilled in ' + 'answering user questions.\n' + 'Please keep your answers concise and clear. Do not' + ' talk about irrelevant topics or repeat your ' + 'answers.\n' + f'\nNow, the questions are: {retrieval_question}\n\n' + f'The document given to you by the user is {context}') + else: + raise ValueError('Unsupported position. ' + 'Position must be "End" or "Start".') else: raise ValueError(f"Language '{language}' is not supported.")
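Taken together, the new files follow the existing NeedleBench layout, so a 1000k run can be assembled the same way as the 200k one. The sketch below is illustrative only and not part of this patch: the file name eval_needlebench_1000k.py, the import aliases, and the choice of the 20B TurboMind model are assumptions. It simply wires together the dataset list exported by configs/datasets/needlebench/needlebench_1000k/needlebench.py, the model list added under configs/models/hf_internlm/, and the needlebench_1000k_summarizer defined in configs/summarizers/needlebench.py.

# Hypothetical top-level config, e.g. configs/eval_needlebench_1000k.py (assumed name, not in this patch).
from mmengine.config import read_base

with read_base():
    # Pieces added or extended by this change.
    from .datasets.needlebench.needlebench_1000k.needlebench import needlebench_datasets
    from .models.hf_internlm.lmdeploy_internlm2_chat_20b import models as internlm2_chat_20b
    from .summarizers.needlebench import needlebench_1000k_summarizer as summarizer

datasets = needlebench_datasets
models = internlm2_chat_20b
# Runner/work_dir settings are left to the usual OpenCompass defaults.

With such a config in place, the evaluation would typically be launched through OpenCompass's run.py entry point, e.g. python run.py configs/eval_needlebench_1000k.py.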
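The loader changes in multi.py, origin.py and parallel.py add an optional position argument (default 'End') that controls whether the retrieval question comes after or before the long document; any other value raises ValueError. None of the 1000k configs above set it, so they keep the previous behaviour. Because the other keys in these dataset dicts are forwarded to load(), a config that wanted the question up front could presumably pass 'position' the same way. The entry below is an illustration only: the '_qfirst' abbr is invented, and it reuses the module-level needlebench_reader_cfg / needlebench_infer_cfg / needlebench_eval_cfg defined in needlebench_single.py above.

# Illustrative dataset entry assuming the surrounding definitions from needlebench_single.py.
from opencompass.datasets.needlebench.origin import NeedleBenchOriginDataset

question_first_dataset = {
    'abbr': 'Length20000Depth0_origin_en_1000k_qfirst',  # hypothetical abbr for this sketch
    'type': NeedleBenchOriginDataset,
    'path': './data/needlebench',
    'length': 20000,
    'depth': 0,
    'tokenizer_model': 'gpt-4',
    'file_list': ['PaulGrahamEssays.jsonl'],
    'num_repeats_per_file': 10,
    'length_buffer': 600,
    'guide': True,
    'language': 'English',
    'needle_file_name': 'needles.jsonl',
    'position': 'Start',  # new optional argument: ask the question before the document
    'reader_cfg': needlebench_reader_cfg,
    'infer_cfg': needlebench_infer_cfg,
    'eval_cfg': needlebench_eval_cfg,
}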