From 7c65521e78c4106f80f4632a885d581efee3c8d5 Mon Sep 17 00:00:00 2001
From: Gabriele Oliaro
Date: Mon, 6 Nov 2023 14:58:20 +0000
Subject: [PATCH] update scripts

---
 tests/peft/fine_tune.sh   |  4 +--
 tests/peft/hf_finetune.py |  2 +-
 tests/peft/hf_serve.py    | 55 +++++++++++++++++++++++++--------------
 3 files changed, 39 insertions(+), 22 deletions(-)

diff --git a/tests/peft/fine_tune.sh b/tests/peft/fine_tune.sh
index dbcdb849fa..eddb6139d0 100755
--- a/tests/peft/fine_tune.sh
+++ b/tests/peft/fine_tune.sh
@@ -7,8 +7,8 @@ cd "${BASH_SOURCE[0]%/*}"
 python hf_finetune.py --model-name decapoda-research/llama-7b-hf --lora-target-modules down_proj --use-full-precision --publish-peft-with-id goliaro/llama-7b-lora-full
 python hf_finetune.py --model-name decapoda-research/llama-7b-hf --lora-target-modules down_proj --publish-peft-with-id goliaro/llama-7b-lora-half
 
-python hf_finetune.py --model-name JackFram/llama-160m-base --lora-target-modules down_proj --use-full-precision --publish-peft-with-id goliaro/llama-160m-lora-full
-python hf_finetune.py --model-name JackFram/llama-160m-base --lora-target-modules down_proj --publish-peft-with-id goliaro/llama-160m-lora-half
+python hf_finetune.py --model-name JackFram/llama-160m --lora-target-modules down_proj --use-full-precision --publish-peft-with-id goliaro/llama-160m-lora-full
+python hf_finetune.py --model-name JackFram/llama-160m --lora-target-modules down_proj --publish-peft-with-id goliaro/llama-160m-lora-half
 
 python hf_finetune.py --model-name meta-llama/Llama-2-7b-hf --lora-target-modules down_proj --use-full-precision --publish-peft-with-id goliaro/llama-2-7b-lora-full
 python hf_finetune.py --model-name meta-llama/Llama-2-7b-hf --lora-target-modules down_proj --publish-peft-with-id goliaro/llama-2-7b-lora-half
diff --git a/tests/peft/hf_finetune.py b/tests/peft/hf_finetune.py
index d702d23038..cf157a8913 100644
--- a/tests/peft/hf_finetune.py
+++ b/tests/peft/hf_finetune.py
@@ -29,7 +29,7 @@ def print_trainable_parameters(model):
 
 def main():
     parser = argparse.ArgumentParser()
-    parser.add_argument("--model-name", type=str, default="decapoda-research/llama-7b-hf")
+    parser.add_argument("--model-name", type=str, default="meta-llama/Llama-2-7b-hf")
     parser.add_argument("--lora-rank", type=int, default=16)
     parser.add_argument("--lora-alpha", type=int, default=32)
     parser.add_argument("--lora-target-modules", type=str, default="down_proj", help="Comma-separated list of layers from the base model to target")
diff --git a/tests/peft/hf_serve.py b/tests/peft/hf_serve.py
index 6f3753906f..efade301da 100644
--- a/tests/peft/hf_serve.py
+++ b/tests/peft/hf_serve.py
@@ -2,51 +2,68 @@ import torch
 import os, sys
 from peft import PeftModel, PeftConfig
-from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig, LlamaTokenizer
+from transformers import (
+    AutoModelForCausalLM,
+    AutoTokenizer,
+    AutoConfig,
+    LlamaTokenizer,
+    GenerationConfig,
+)
+
 
 def main():
     parser = argparse.ArgumentParser()
     parser.add_argument("--peft-model-id", type=str, default="./finetuned-llama")
-    parser.add_argument("--use-full-precision", action="store_true", help="Use full precision")
+    parser.add_argument(
+        "--use-full-precision", action="store_true", help="Use full precision"
+    )
     parser.add_argument("--max-new-tokens", type=int, default=50)
+    parser.add_argument("--do-sample", action="store_true", help="Use sampling")
     args = parser.parse_args()
     peft_model_id = args.peft_model_id
-    #peft_model_id = "goliaro/llama-7b-lora-half"
-    use_full_precision=args.use_full_precision
+    use_full_precision = args.use_full_precision
     max_new_tokens = args.max_new_tokens
 
     # Change working dir to folder storing this script
     abspath = os.path.abspath(__file__)
     dname = os.path.dirname(abspath)
     os.chdir(dname)
-    
+
     config = PeftConfig.from_pretrained(peft_model_id)
     model = AutoModelForCausalLM.from_pretrained(
-        config.base_model_name_or_path,
-        return_dict=True,
-        #load_in_8bit=True,
-        torch_dtype = torch.float32 if use_full_precision else torch.float16,
-        device_map='auto',
+        config.base_model_name_or_path,
+        return_dict=True,
+        # load_in_8bit=True,
+        torch_dtype=torch.float32 if use_full_precision else torch.float16,
+        device_map="auto",
+    )
+    hf_config = AutoConfig.from_pretrained(
+        config.base_model_name_or_path, trust_remote_code=True
     )
-    hf_config = AutoConfig.from_pretrained(config.base_model_name_or_path, trust_remote_code=True)
     hf_arch = getattr(hf_config, "architectures")[0]
     if hf_arch == "LLaMAForCausalLM" or hf_arch == "LlamaForCausalLM":
         tokenizer = LlamaTokenizer.from_pretrained(
-            config.base_model_name_or_path, use_fast=True,
-            torch_dtype = torch.float32 if use_full_precision else torch.float16,
+            config.base_model_name_or_path,
+            use_fast=True,
+            torch_dtype=torch.float32 if use_full_precision else torch.float16,
         )
     else:
         tokenizer = AutoTokenizer.from_pretrained(
-            config.base_model_name_or_path,
-            torch_dtype = torch.float32 if use_full_precision else torch.float16,
+            config.base_model_name_or_path,
+            torch_dtype=torch.float32 if use_full_precision else torch.float16,
        )
-    
+    # Generation config
+    generation_config = GenerationConfig.from_pretrained(config.base_model_name_or_path)
+    generation_config.do_sample = args.do_sample
     # Load the Lora model
     model = PeftModel.from_pretrained(model, peft_model_id)
 
-    batch = tokenizer("Two things are infinite: ", return_tensors='pt')
+    batch = tokenizer("Two things are infinite: ", return_tensors="pt")
     with torch.cuda.amp.autocast():
-        output_tokens = model.generate(**batch, max_new_tokens=max_new_tokens)
-        print('\n\n', tokenizer.decode(output_tokens[0], skip_special_tokens=True))
+        output_tokens = model.generate(
+            **batch, max_new_tokens=max_new_tokens, generation_config=generation_config
+        )
+        print("\n\n", tokenizer.decode(output_tokens[0], skip_special_tokens=False))
+
 
 if __name__ == "__main__":
     main()
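
Usage sketch (not part of the patch): assuming the updated scripts above are in place, the new --do-sample flag added to hf_serve.py can be exercised against one of the adapters that fine_tune.sh publishes; the adapter ids and flags below are taken directly from those two scripts.

    # Serve the full-precision adapter published by fine_tune.sh, with sampling enabled
    python hf_serve.py --peft-model-id goliaro/llama-160m-lora-full --use-full-precision --do-sample --max-new-tokens 50

    # Half-precision variant (default when --use-full-precision is omitted)
    python hf_serve.py --peft-model-id goliaro/llama-160m-lora-half --do-sample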