From 7c65521e78c4106f80f4632a885d581efee3c8d5 Mon Sep 17 00:00:00 2001
From: Gabriele Oliaro
Date: Mon, 6 Nov 2023 14:58:20 +0000
Subject: [PATCH] update scripts

---
 tests/peft/fine_tune.sh   |  4 +--
 tests/peft/hf_finetune.py |  2 +-
 tests/peft/hf_serve.py    | 55 +++++++++++++++++++++++++--------------
 3 files changed, 39 insertions(+), 22 deletions(-)

diff --git a/tests/peft/fine_tune.sh b/tests/peft/fine_tune.sh
index dbcdb849fa..eddb6139d0 100755
--- a/tests/peft/fine_tune.sh
+++ b/tests/peft/fine_tune.sh
@@ -7,8 +7,8 @@ cd "${BASH_SOURCE[0]%/*}"
 python hf_finetune.py --model-name decapoda-research/llama-7b-hf --lora-target-modules down_proj --use-full-precision --publish-peft-with-id goliaro/llama-7b-lora-full
 python hf_finetune.py --model-name decapoda-research/llama-7b-hf --lora-target-modules down_proj --publish-peft-with-id goliaro/llama-7b-lora-half
 
-python hf_finetune.py --model-name JackFram/llama-160m-base --lora-target-modules down_proj --use-full-precision --publish-peft-with-id goliaro/llama-160m-lora-full
-python hf_finetune.py --model-name JackFram/llama-160m-base --lora-target-modules down_proj --publish-peft-with-id goliaro/llama-160m-lora-half
+python hf_finetune.py --model-name JackFram/llama-160m --lora-target-modules down_proj --use-full-precision --publish-peft-with-id goliaro/llama-160m-lora-full
+python hf_finetune.py --model-name JackFram/llama-160m --lora-target-modules down_proj --publish-peft-with-id goliaro/llama-160m-lora-half
 
 python hf_finetune.py --model-name meta-llama/Llama-2-7b-hf --lora-target-modules down_proj --use-full-precision --publish-peft-with-id goliaro/llama-2-7b-lora-full
 python hf_finetune.py --model-name meta-llama/Llama-2-7b-hf --lora-target-modules down_proj --publish-peft-with-id goliaro/llama-2-7b-lora-half
diff --git a/tests/peft/hf_finetune.py b/tests/peft/hf_finetune.py
index d702d23038..cf157a8913 100644
--- a/tests/peft/hf_finetune.py
+++ b/tests/peft/hf_finetune.py
@@ -29,7 +29,7 @@ def print_trainable_parameters(model):
 
 def main():
     parser = argparse.ArgumentParser()
-    parser.add_argument("--model-name", type=str, default="decapoda-research/llama-7b-hf")
+    parser.add_argument("--model-name", type=str, default="meta-llama/Llama-2-7b-hf")
     parser.add_argument("--lora-rank", type=int, default=16)
     parser.add_argument("--lora-alpha", type=int, default=32)
     parser.add_argument("--lora-target-modules", type=str, default="down_proj", help="Comma-separated list of layers from the base model to target")
diff --git a/tests/peft/hf_serve.py b/tests/peft/hf_serve.py
index 6f3753906f..efade301da 100644
--- a/tests/peft/hf_serve.py
+++ b/tests/peft/hf_serve.py
@@ -2,51 +2,68 @@ import torch
 import os, sys
 from peft import PeftModel, PeftConfig
-from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig, LlamaTokenizer
+from transformers import (
+    AutoModelForCausalLM,
+    AutoTokenizer,
+    AutoConfig,
+    LlamaTokenizer,
+    GenerationConfig,
+)
+
 
 def main():
     parser = argparse.ArgumentParser()
     parser.add_argument("--peft-model-id", type=str, default="./finetuned-llama")
-    parser.add_argument("--use-full-precision", action="store_true", help="Use full precision")
+    parser.add_argument(
+        "--use-full-precision", action="store_true", help="Use full precision"
+    )
     parser.add_argument("--max-new-tokens", type=int, default=50)
+    parser.add_argument("--do-sample", action="store_true", help="Use sampling")
     args = parser.parse_args()
     peft_model_id = args.peft_model_id
-    #peft_model_id = "goliaro/llama-7b-lora-half"
-    use_full_precision=args.use_full_precision
+    use_full_precision = args.use_full_precision
     max_new_tokens = args.max_new_tokens
 
     # Change working dir to folder storing this script
     abspath = os.path.abspath(__file__)
     dname = os.path.dirname(abspath)
     os.chdir(dname)
-    
+
     config = PeftConfig.from_pretrained(peft_model_id)
     model = AutoModelForCausalLM.from_pretrained(
-        config.base_model_name_or_path,
-        return_dict=True,
-        #load_in_8bit=True,
-        torch_dtype = torch.float32 if use_full_precision else torch.float16,
-        device_map='auto',
+        config.base_model_name_or_path,
+        return_dict=True,
+        # load_in_8bit=True,
+        torch_dtype=torch.float32 if use_full_precision else torch.float16,
+        device_map="auto",
+    )
+    hf_config = AutoConfig.from_pretrained(
+        config.base_model_name_or_path, trust_remote_code=True
     )
-    hf_config = AutoConfig.from_pretrained(config.base_model_name_or_path, trust_remote_code=True)
     hf_arch = getattr(hf_config, "architectures")[0]
     if hf_arch == "LLaMAForCausalLM" or hf_arch == "LlamaForCausalLM":
         tokenizer = LlamaTokenizer.from_pretrained(
-            config.base_model_name_or_path, use_fast=True,
-            torch_dtype = torch.float32 if use_full_precision else torch.float16,
+            config.base_model_name_or_path,
+            use_fast=True,
+            torch_dtype=torch.float32 if use_full_precision else torch.float16,
         )
     else:
         tokenizer = AutoTokenizer.from_pretrained(
-            config.base_model_name_or_path,
-            torch_dtype = torch.float32 if use_full_precision else torch.float16,
+            config.base_model_name_or_path,
+            torch_dtype=torch.float32 if use_full_precision else torch.float16,
        )
-    
+    # Generation config
+    generation_config = GenerationConfig.from_pretrained(config.base_model_name_or_path)
+    generation_config.do_sample = args.do_sample
     # Load the Lora model
     model = PeftModel.from_pretrained(model, peft_model_id)
 
-    batch = tokenizer("Two things are infinite: ", return_tensors='pt')
+    batch = tokenizer("Two things are infinite: ", return_tensors="pt")
     with torch.cuda.amp.autocast():
-        output_tokens = model.generate(**batch, max_new_tokens=max_new_tokens)
-        print('\n\n', tokenizer.decode(output_tokens[0], skip_special_tokens=True))
+        output_tokens = model.generate(
+            **batch, max_new_tokens=max_new_tokens, generation_config=generation_config
+        )
+        print("\n\n", tokenizer.decode(output_tokens[0], skip_special_tokens=False))
+
 
 if __name__ == "__main__":
     main()
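
Usage sketch (not part of the patch): assuming the updated scripts above are in place, the new --do-sample flag added to hf_serve.py can be exercised against one of the adapters that fine_tune.sh publishes; the adapter ids and flags below are taken directly from those two scripts.

    # Serve the full-precision adapter published by fine_tune.sh, with sampling enabled
    python hf_serve.py --peft-model-id goliaro/llama-160m-lora-full --use-full-precision --do-sample --max-new-tokens 50

    # Half-precision variant (default when --use-full-precision is omitted)
    python hf_serve.py --peft-model-id goliaro/llama-160m-lora-half --do-sample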