From cd6d3e09241d41183d510bf5ccdf01fca2be8780 Mon Sep 17 00:00:00 2001
From: Alexandre Marques
Date: Fri, 16 Aug 2024 22:39:51 +0000
Subject: [PATCH 1/6] Add support for vLLM models

---
 mix_eval/models/__init__.py |  2 ++
 mix_eval/models/vllm.py     | 59 +++++++++++++++++++++++++++++++++++++
 2 files changed, 61 insertions(+)
 create mode 100644 mix_eval/models/vllm.py

diff --git a/mix_eval/models/__init__.py b/mix_eval/models/__init__.py
index 9fa5737..e4f0b9e 100644
--- a/mix_eval/models/__init__.py
+++ b/mix_eval/models/__init__.py
@@ -10,6 +10,8 @@
     "llama_3_70b": "Llama_3_70B",
     "llama_3_70b_instruct": "Llama_3_70B_Instruct",
 
+    "llama_3_8b_instruct_vllm": "Llama_3_8B_Instruct_vLLM",
+
     "qwen_15_4b": "Qwen_15_4B",
     "qwen_15_7b": "Qwen_15_7B",
     "qwen_15_32b": "Qwen_15_32B",
diff --git a/mix_eval/models/vllm.py b/mix_eval/models/vllm.py
new file mode 100644
index 0000000..e10e05b
--- /dev/null
+++ b/mix_eval/models/vllm.py
@@ -0,0 +1,59 @@
+from .base import ChatModel
+from vllm import LLM, SamplingParams
+import torch
+import json
+
+class ChatModelVLLM(ChatModel):
+
+    def build_model(self):
+        num_gpus = torch.cuda.device_count()
+        return LLM(model=self.model_name, tensor_parallel_size=num_gpus, enable_chunked_prefill=True, distributed_executor_backend="ray")
+
+    def get_closeended_responses(self, batch, response_file):
+        sampling_params = SamplingParams(max_tokens=self.closeended_max_new_tokens, **self.gen_kwargs)
+        formated_prompts = [d['raw_inputs']['formated_input'] for d in batch]
+        inputs = [self.apply_chat_template(self.get_messages(prompt)) for prompt in formated_prompts]
+
+        outputs = self.model.generate(inputs, sampling_params)
+        responses = [output.outputs[0].text for output in outputs]
+
+        with open(response_file, "a") as f:
+            for raw_dict, response in zip(batch, responses):
+                raw_dict = raw_dict['raw_inputs']
+                raw_dict['response'] = response
+                f.write(json.dumps(raw_dict) + "\n")
+
+    def get_openended_responses(self, batch, response_file):
+        sampling_params = SamplingParams(max_tokens=self.openended_max_new_tokens, **self.gen_kwargs)
+
+        messages_batch = [
+            [
+                self.SYSTEM_MESSAGE.copy(),
+            ] if self.SYSTEM_MESSAGE is not None else []
+            for _ in batch
+        ]
+        turns_batch = [d['raw_inputs']['turns'] for d in batch]
+        turn_num = len(turns_batch[0])
+        for turns in turns_batch:
+            assert len(turns) == turn_num, "All dialogues should have the same number of turns."
+
+        responses_all = []
+        for i in range(turn_num):
+            for turns, messages in zip(turns_batch, messages_batch):
+                messages.append(self.USER_MESSAGE_TEMPLATE(turns[i]))
+            inputs = [self.apply_chat_template(messages) for messages in messages_batch]
+
+            outputs = self.model.generate(inputs, sampling_params)
+            responses = [output.outputs[0].text for output in outputs]
+
+            responses_all.append(responses)
+            for response, messages in zip(responses, messages_batch):
+                messages.append(self.ASSISTANT_MESSAGE_TEMPLATE(response))
+
+        responses_all = list(zip(*responses_all))
+
+        with open(response_file, "a") as f:
+            for raw_dict, response in zip(batch, responses_all):
+                raw_dict = raw_dict['raw_inputs']
+                raw_dict['response'] = response
+                f.write(json.dumps(raw_dict) + "\n")
\ No newline at end of file
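Reviewer note: the batched generation flow that ChatModelVLLM builds on can be exercised on its own. The sketch below is illustrative and not part of the patch; the model id, prompts, sampling values, and single-GPU setting are assumptions, and it presumes a checkpoint whose tokenizer ships a chat template.

    # Minimal sketch of the vLLM flow used by ChatModelVLLM (illustrative values).
    from transformers import AutoTokenizer
    from vllm import LLM, SamplingParams

    model_id = "meta-llama/Meta-Llama-3-8B-Instruct"   # example checkpoint
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    llm = LLM(model=model_id, tensor_parallel_size=1, enable_chunked_prefill=True)

    # Render chat messages to prompt strings, as apply_chat_template does above.
    messages = [[{"role": "user", "content": q}]
                for q in ["What is 2 + 2?", "Name a prime number."]]
    prompts = [tokenizer.apply_chat_template(m, tokenize=False, add_generation_prompt=True)
               for m in messages]

    sampling_params = SamplingParams(max_tokens=64, temperature=0.6, top_p=0.9)
    outputs = llm.generate(prompts, sampling_params)   # one RequestOutput per prompt, in order
    responses = [o.outputs[0].text for o in outputs]

As in get_closeended_responses above, the whole batch is generated in a single call and the first candidate of each RequestOutput is taken as the response.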
From 7c1e875ddf22c34966260ed2b6ae908a0e54119d Mon Sep 17 00:00:00 2001
From: Alexandre Marques
Date: Sat, 17 Aug 2024 18:33:30 +0000
Subject: [PATCH 2/6] Avoid instantiating model more than once

---
 mix_eval/evaluate.py | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/mix_eval/evaluate.py b/mix_eval/evaluate.py
index bed8cd0..364ecbb 100644
--- a/mix_eval/evaluate.py
+++ b/mix_eval/evaluate.py
@@ -152,7 +152,7 @@ def parse_args():
     return parser.parse_args()
 
 
-def _eval(args):
+def _eval(args, model=None):
     print(f"\n\nStart to evaluate {args.model_name}'s {args.split} split. \n\n")
     time_elapsed = 0
     start_time = time.time()
@@ -192,7 +192,8 @@ def _eval(args):
                 "lines as recorded in cached metadadta. Please check the response file. "
                 "You might consider delete the response and metadata file to start from scratch.")
 
-    model = mix_eval.api.registry.get_model(args.model_name)(args)
+    if model is None:
+        model = mix_eval.api.registry.get_model(args.model_name)(args)
     eval_dataset = get_eval_dataset(args)
     dataloader = DataLoader(
         eval_dataset,
@@ -235,18 +236,19 @@ def _eval(args):
     print(f"Finished evaluating {args.model_name}'s {args.split} split. "
           f"Used {round(time_elapsed / 60, 2)} minutes.")
 
+    return model
 
 def eval(args):
     if args.benchmark == "mixeval":
         args.split = "close_freeform"
-        _eval(args)
+        model = _eval(args)
         args.split = "close_multichoice"
-        _eval(args)
+        _eval(args, model)
     elif args.benchmark == "mixeval_hard":
         args.split = "close_freeform_hard"
-        _eval(args)
+        model = _eval(args)
        args.split = "close_multichoice_hard"
-        _eval(args)
+        _eval(args, model)
     else:
         raise ValueError(f"Benchmark {args.benchmark} not supported.")

From fce6367faadbff4ca2fac10b89b9a8a570310ab8 Mon Sep 17 00:00:00 2001
From: Alexandre Marques
Date: Mon, 19 Aug 2024 18:47:44 +0000
Subject: [PATCH 3/6] Add support for CPU offloading

---
 mix_eval/evaluate.py    | 7 +++++++
 mix_eval/models/vllm.py | 7 ++++++-
 2 files changed, 13 insertions(+), 1 deletion(-)

diff --git a/mix_eval/evaluate.py b/mix_eval/evaluate.py
index 364ecbb..6fca606 100644
--- a/mix_eval/evaluate.py
+++ b/mix_eval/evaluate.py
@@ -76,6 +76,13 @@
         "Set this properly will allocate more memory for activations, "
         "so you can use longer context lengths or larger batch sizes."
     )
+    parser.add_argument(
+        "--cpu_offload_gb",
+        type=int,
+        default=None,
+        help="Amount of memory (in GB) to offload to CPU for loading the weights. "
+             "Only valid with vLLM models."
+    )
     parser.add_argument(
         "--api_parallel_num",
         type=int,
diff --git a/mix_eval/models/vllm.py b/mix_eval/models/vllm.py
index e10e05b..8d65d8d 100644
--- a/mix_eval/models/vllm.py
+++ b/mix_eval/models/vllm.py
@@ -2,12 +2,17 @@
 from vllm import LLM, SamplingParams
 import torch
 import json
+import re
 
 class ChatModelVLLM(ChatModel):
 
     def build_model(self):
         num_gpus = torch.cuda.device_count()
-        return LLM(model=self.model_name, tensor_parallel_size=num_gpus, enable_chunked_prefill=True, distributed_executor_backend="ray")
+
+        if self.args.cpu_offload_gb:
+            return LLM(model=self.model_name, tensor_parallel_size=num_gpus, enable_chunked_prefill=True, distributed_executor_backend="ray", cpu_offload_gb=self.args.cpu_offload_gb)
+        else:
+            return LLM(model=self.model_name, tensor_parallel_size=num_gpus, enable_chunked_prefill=True, distributed_executor_backend="ray")
 
     def get_closeended_responses(self, batch, response_file):
         sampling_params = SamplingParams(max_tokens=self.closeended_max_new_tokens, **self.gen_kwargs)
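Reviewer note on the new flag: in recent vLLM releases, cpu_offload_gb keeps roughly that many GiB of model weights in CPU RAM per GPU and streams them in during the forward pass, trading throughput for a virtually larger GPU memory. Below is a small sketch of how the flag reaches the engine; the 8 GB value and model id are examples only.

    # Sketch: how --cpu_offload_gb flows into the vLLM engine (illustrative values).
    from argparse import Namespace
    import torch
    from vllm import LLM

    args = Namespace(cpu_offload_gb=8)      # as if `--cpu_offload_gb 8` were passed
    num_gpus = torch.cuda.device_count()

    llm_kwargs = dict(
        model="meta-llama/Meta-Llama-3-8B-Instruct",
        tensor_parallel_size=num_gpus,
        enable_chunked_prefill=True,
    )
    if args.cpu_offload_gb:                 # same guard as build_model()
        llm_kwargs["cpu_offload_gb"] = args.cpu_offload_gb
    llm = LLM(**llm_kwargs)

Building the keyword dict once, as above, would also let build_model() avoid repeating the long LLM(...) call in both branches.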
From fea58d275988af38043421c0e0392659abbadde7 Mon Sep 17 00:00:00 2001
From: Alexandre Marques
Date: Mon, 19 Aug 2024 19:01:23 +0000
Subject: [PATCH 4/6] Add llama_3_8b_instruct_vllm

---
 mix_eval/models/llama_3_8b_instruct_vllm.py | 58 +++++++++++++++++++++
 1 file changed, 58 insertions(+)
 create mode 100644 mix_eval/models/llama_3_8b_instruct_vllm.py

diff --git a/mix_eval/models/llama_3_8b_instruct_vllm.py b/mix_eval/models/llama_3_8b_instruct_vllm.py
new file mode 100644
index 0000000..9d7b6cd
--- /dev/null
+++ b/mix_eval/models/llama_3_8b_instruct_vllm.py
@@ -0,0 +1,58 @@
+from dotenv import load_dotenv
+import os
+
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+from mix_eval.models.vllm import ChatModelVLLM
+from mix_eval.api.registry import register_model
+from mix_eval.utils.common_utils import get_gpu_memory
+
+@register_model("llama_3_8b_instruct_vllm")
+class Llama_3_8B_Instruct_vLLM(ChatModelVLLM):
+    def __init__(self, args):
+        super().__init__(args)
+        self.model_name = "meta-llama/Meta-Llama-3-8B-Instruct"
+
+        self.SYSTEM_MESSAGE = {"role": "system", "content": "You are a pirate chatbot who always responds in pirate speak!"} # set to None if no system message
+        self.USER_MESSAGE_TEMPLATE = lambda x: {"role": "user", "content": x}
+        self.ASSISTANT_MESSAGE_TEMPLATE = lambda x: {"role": "assistant", "content": x}
+
+        self.model_dtype = torch.bfloat16
+
+        load_dotenv()
+        self.hf_token = os.getenv('_FADKLFHAKH_')
+        self.model = self.build_model()
+        self.model_max_len = 8192
+        self.tokenizer = self.build_tokenizer()
+        self.tokenizer.pad_token = self.tokenizer.eos_token
+        self.max_input_length_closeend = min(
+            self.model_max_len,
+            self.max_input_length
+        ) - self.closeended_max_new_tokens
+        self.max_input_length_openend = min(
+            self.model_max_len,
+            self.max_input_length
+        ) - self.openended_max_new_tokens
+
+
+        terminators = [
+            self.tokenizer.eos_token_id,
+            self.tokenizer.convert_tokens_to_ids("<|eot_id|>")
+        ]
+
+        self.gen_kwargs = {
+            'temperature': 0.6,
+            'top_p': 0.9,
+            'stop_token_ids': terminators,
+        }
+
+    def build_tokenizer(self):
+        tokenizer = AutoTokenizer.from_pretrained(
+            self.model_name,
+            model_max_length=self.model_max_len,
+            padding_side=self.padding_side,
+            use_fast=self.use_fast_tokenizer,
+            trust_remote_code=self.trust_remote_code,
+            token=self.hf_token,)
+        return tokenizer
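Reviewer note: adding another chat model on the vLLM backend follows the same two steps as this patch: register the class and add its key to the model-name mapping in mix_eval/models/__init__.py (see patch 1). The sketch below is hypothetical and not part of this series; the qwen_2_7b_instruct_vllm key, model id, context length, and gen_kwargs are illustrative, and it uses a plain AutoTokenizer where patch 4 overrides build_tokenizer to pass a Hugging Face token for gated Llama weights.

    # Hypothetical example (not part of this series): a second chat model on the
    # vLLM backend. Registry key, model id, and settings below are illustrative.
    import torch
    from transformers import AutoTokenizer

    from mix_eval.api.registry import register_model
    from mix_eval.models.vllm import ChatModelVLLM


    @register_model("qwen_2_7b_instruct_vllm")  # also add the key to mix_eval/models/__init__.py
    class Qwen_2_7B_Instruct_vLLM(ChatModelVLLM):
        def __init__(self, args):
            super().__init__(args)
            self.model_name = "Qwen/Qwen2-7B-Instruct"

            self.SYSTEM_MESSAGE = None  # or a {"role": "system", "content": ...} dict
            self.USER_MESSAGE_TEMPLATE = lambda x: {"role": "user", "content": x}
            self.ASSISTANT_MESSAGE_TEMPLATE = lambda x: {"role": "assistant", "content": x}

            self.model_dtype = torch.bfloat16
            self.model = self.build_model()
            self.model_max_len = 8192
            self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
            self.tokenizer.pad_token = self.tokenizer.eos_token

            self.max_input_length_closeend = min(self.model_max_len, self.max_input_length) \
                - self.closeended_max_new_tokens
            self.max_input_length_openend = min(self.model_max_len, self.max_input_length) \
                - self.openended_max_new_tokens

            self.gen_kwargs = {"temperature": 0.7, "top_p": 0.8}

Note that ChatModelVLLM passes gen_kwargs straight into SamplingParams, so only vLLM sampling arguments belong in that dict.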
From 888e407303eb9665588b1c01906157e4d35af0b0 Mon Sep 17 00:00:00 2001
From: Alexandre Marques
Date: Mon, 19 Aug 2024 19:09:30 +0000
Subject: [PATCH 5/6] Make sure model is only instantiated once

---
 mix_eval/evaluate.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/mix_eval/evaluate.py b/mix_eval/evaluate.py
index 6fca606..0b93be0 100644
--- a/mix_eval/evaluate.py
+++ b/mix_eval/evaluate.py
@@ -246,14 +246,15 @@ def _eval(args, model=None):
     return model
 
 def eval(args):
+    model = None
     if args.benchmark == "mixeval":
         args.split = "close_freeform"
-        model = _eval(args)
+        model = _eval(args, model)
         args.split = "close_multichoice"
         _eval(args, model)
     elif args.benchmark == "mixeval_hard":
         args.split = "close_freeform_hard"
-        model = _eval(args)
+        model = _eval(args, model)
         args.split = "close_multichoice_hard"
         _eval(args, model)
     else:
         raise ValueError(f"Benchmark {args.benchmark} not supported.")

From d270aa990561645b618c50e13691423dc37342ca Mon Sep 17 00:00:00 2001
From: Alexandre Marques
Date: Mon, 19 Aug 2024 20:40:04 +0000
Subject: [PATCH 6/6] Add support for base models with vLLM backend

---
 mix_eval/models/vllm.py | 50 +++++++++++++++++++++++++++++++++++++----
 1 file changed, 46 insertions(+), 4 deletions(-)

diff --git a/mix_eval/models/vllm.py b/mix_eval/models/vllm.py
index 8d65d8d..ffbd93b 100644
--- a/mix_eval/models/vllm.py
+++ b/mix_eval/models/vllm.py
@@ -1,11 +1,9 @@
-from .base import ChatModel
+from .base import ChatModel, BaseModel
+from mix_eval.prompts.evaluation_prompts import FIVE_SHOT_PREFIX_FREEFORM, FIVE_SHOT_PREFIX_MULTIPLECHOICE  # assumed import path for the few-shot prefixes used in base.py
 from vllm import LLM, SamplingParams
 import torch
 import json
-import re
 
 class ChatModelVLLM(ChatModel):
-
     def build_model(self):
         num_gpus = torch.cuda.device_count()
 
@@ -61,4 +59,48 @@ def get_openended_responses(self, batch, response_file):
             for raw_dict, response in zip(batch, responses_all):
                 raw_dict = raw_dict['raw_inputs']
                 raw_dict['response'] = response
-                f.write(json.dumps(raw_dict) + "\n")
\ No newline at end of file
+                f.write(json.dumps(raw_dict) + "\n")
+
+class BaseModelVLLM(BaseModel):
+    def build_model(self):
+        num_gpus = torch.cuda.device_count()
+
+        if self.args.cpu_offload_gb:
+            return LLM(model=self.model_name, tensor_parallel_size=num_gpus, enable_chunked_prefill=True, distributed_executor_backend="ray", cpu_offload_gb=self.args.cpu_offload_gb)
+        else:
+            return LLM(model=self.model_name, tensor_parallel_size=num_gpus, enable_chunked_prefill=True, distributed_executor_backend="ray")
+
+    def get_closeended_responses(self, batch, response_file):
+        sampling_params = SamplingParams(max_tokens=self.closeended_max_new_tokens, **self.gen_kwargs)
+        formated_prompts = [d['raw_inputs']['formated_input'] for d in batch]
+
+        # add few-shot prompts
+        if self.args.split == 'close_multichoice' or self.args.split == 'close_multichoice_hard':
+            formated_prompts = [
+                FIVE_SHOT_PREFIX_MULTIPLECHOICE + prompt + '\n'
+                for prompt in formated_prompts
+            ]
+        elif self.args.split == 'close_freeform' or self.args.split == 'close_freeform_hard':
+            formated_prompts = [
+                FIVE_SHOT_PREFIX_FREEFORM + prompt + '\n'
+                for prompt in formated_prompts]
+        else:
+            raise ValueError(f"Split {self.args.split} not supported in "
+                             f"{self.__class__.__name__}: get_closeended_responses()")
+
+        for _fp, _b in zip(formated_prompts, batch):
+            _b['raw_inputs']['formated_input'] = _fp
+
+        outputs = self.model.generate(formated_prompts, sampling_params)
+        responses = [output.outputs[0].text for output in outputs]
+
+        with open(response_file, "a") as f:
+            for raw_dict, response in zip(batch, responses):
+                raw_dict = raw_dict['raw_inputs']
+                raw_dict['response'] = response
+                f.write(json.dumps(raw_dict) + "\n")
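Closing note: patch 6 adds the backend class, but the series does not register a concrete completion-style model on top of it. Such a registration would mirror the chat variant from patch 4. Everything below (the llama_3_8b_vllm key, model id, context length, and sampling settings) is a hypothetical sketch, not part of the series, and it assumes BaseModel provides max_input_length and closeended_max_new_tokens the same way ChatModel does.

    # Hypothetical example (not part of this series): a base model registered on
    # top of BaseModelVLLM. Key, model id, and sampling settings are illustrative.
    import torch
    from transformers import AutoTokenizer

    from mix_eval.api.registry import register_model
    from mix_eval.models.vllm import BaseModelVLLM


    @register_model("llama_3_8b_vllm")  # also add the key to mix_eval/models/__init__.py
    class Llama_3_8B_vLLM(BaseModelVLLM):
        def __init__(self, args):
            super().__init__(args)
            self.model_name = "meta-llama/Meta-Llama-3-8B"

            self.model_dtype = torch.bfloat16
            self.model = self.build_model()
            self.model_max_len = 8192
            self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
            self.tokenizer.pad_token = self.tokenizer.eos_token

            self.max_input_length_closeend = min(self.model_max_len, self.max_input_length) \
                - self.closeended_max_new_tokens

            # Passed straight into SamplingParams; stopping at a blank line keeps the
            # five-shot completion from running on (illustrative choice).
            self.gen_kwargs = {"temperature": 0.0, "stop": ["\n\n"]}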