LogicKor-v2 update initial commit.

- add PROMPT_STRATEGY
- change judge_template
- change evaluation template
instructkr · Jun 28, 2024 · 5f08eb7 · 5f08eb7
1 parent c34d5a8
commit 5f08eb7
Show file tree

Hide file tree

Showing 41 changed files with 282 additions and 1,530 deletions.
diff --git a/evaluator.py b/evaluator.py
@@ -0,0 +1,142 @@
+from typing import Dict, Union
+import argparse
+import re
+import json
+import time
+from datetime import datetime
+from threading import Lock
+from concurrent.futures import ThreadPoolExecutor
+from pathlib import Path
+import pandas as pd
+from openai import OpenAI
+
+# Constants
+# Timestamp taken once at import time, formatted as YYYYmmdd_HHMMSS.
+# NOTE(review): not referenced anywhere else in the visible code.
+TIME_START = datetime.now().strftime("%Y%m%d_%H%M%S")
+# Shared lock; process_item holds it while appending a result line to an output file.
+LOCK = Lock()
+
+def get_args(argv=None):
+    """Parse the evaluator's command-line options.
+
+    Args:
+        argv: Optional list of argument strings. When None (the default),
+            argparse falls back to ``sys.argv[1:]``, so plain ``get_args()``
+            behaves exactly as a normal CLI entry point.
+
+    Returns:
+        argparse.Namespace with ``model_output_dir``, ``openai_api_key``,
+        ``judge_model`` and ``threads``.
+    """
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-o', '--model-output-dir', help='Model Output Directory', required=True)
+    parser.add_argument('-k', '--openai-api-key', help='OpenAI API Key', required=True)
+    parser.add_argument('-j', '--judge-model', help='Judge Model', default='gpt-4-1106-preview')
+    parser.add_argument('-t', '--threads', help='Thread count', default=42, type=int)
+    return parser.parse_args(argv)
+
+def create_azure_client(api_key: str):
+    """Build the API client used to query the judge model.
+
+    Despite the name, this constructs a plain ``OpenAI`` client from the
+    given key; no Azure-specific configuration is applied here.
+    """
+    client = OpenAI(api_key=api_key)
+    return client
+
+def load_judge_template(path: Union[str, Path] = 'judge_template.jsonl') -> pd.DataFrame:
+    """Load the judge prompt templates from a JSON Lines file.
+
+    Args:
+        path: Location of the template file. Defaults to
+            ``judge_template.jsonl`` resolved against the current working
+            directory, which is what the evaluator has always read.
+
+    Returns:
+        One DataFrame row per line of the file. create_answers reads the
+        ``system_prompt`` column of row 0 (single-turn) and row 1 (multi-turn).
+    """
+    return pd.read_json(path, lines=True)
+
+def _build_judge_prompt(model_output, is_multi_turn: bool) -> str:
+    """Assemble the user message shown to the judge for one benchmark item."""
+    model_questions = model_output['questions']
+    model_outputs = model_output['outputs']
+    model_references = model_output['references']
+
+    prompt = (
+        f"아래의 내용을 주어진 평가 기준들을 충실히 반영하여 평가해라. 특히 모델 답변이 언어 요구사항을 준수하는지 반드시 확인해야 한다.\n\n"
+        f"**Question**\n{model_questions[0]}"
+    )
+
+    # A reference answer is optional; it is only included when present and non-empty.
+    if model_references and model_references[0]:
+        prompt += f"\n\n**Additional Reference**\n{model_references[0]}"
+
+    prompt += f"\n\n**Model's Response**\n{model_outputs[0]}"
+
+    if is_multi_turn:
+        prompt += f"\n\n**Follow-up Question.**\n{model_questions[1]}"
+        if model_references and model_references[1]:
+            prompt += f"\n\n**Additional Reference**\n{model_references[1]}"
+        prompt += f"\n\n**Model's Response**\n{model_outputs[1]}"
+
+    prompt += "\n\n[[대화 종료. 평가 시작.]]"
+    return prompt
+
+def _parse_judge_response(content: str) -> Dict[str, Union[str, float]]:
+    """Extract the judge's commentary and numeric score from its reply text."""
+    # Asterisks are stripped so markdown emphasis around the labels does not defeat the patterns.
+    cleaned = content.replace("*", '')
+
+    judge_message_match = re.search(r"평가:(.*?)점수:", cleaned, re.DOTALL)
+    judge_message = judge_message_match.group(1).strip() if judge_message_match else "No judge message found"
+
+    judge_score_match = re.search(r"점수:\s*(\d+(\.\d+)?)", cleaned)
+    if not judge_score_match:
+        raise ValueError("No score found in response")
+
+    return {
+        'judge_message': judge_message,
+        'judge_score': float(judge_score_match.group(1))
+    }
+
+def create_answers(client, model_output, judge_model, df_judge_template, is_multi_turn: bool = False, i=0) -> Dict[str, Union[str, float]]:
+    """Ask the judge model to grade one item and return its verdict.
+
+    Args:
+        client: Object exposing ``chat.completions.create`` (the OpenAI client).
+        model_output: Mapping with ``questions``, ``outputs`` and ``references``
+            entries; index 0 is the first turn, index 1 the follow-up turn.
+        judge_model: Name of the judge model passed to the API.
+        df_judge_template: Templates frame; row 0 supplies the single-turn
+            system prompt, row 1 the multi-turn one.
+        is_multi_turn: When True, the follow-up question/response are appended
+            to the prompt and the multi-turn system prompt is used.
+        i: Number of failed attempts already made (retry counter start).
+
+    Returns:
+        ``{'judge_message': str, 'judge_score': float}``. If the request or
+        the parsing keeps failing once the counter exceeds 3, a placeholder
+        verdict with score 0.0 is returned instead of raising.
+    """
+    prompt = _build_judge_prompt(model_output, is_multi_turn)
+    system_prompt = df_judge_template.iloc[1 if is_multi_turn else 0]['system_prompt']
+
+    attempt = i
+    while True:
+        try:
+            response = client.chat.completions.create(
+                model=judge_model,
+                temperature=0.0,
+                n=1,
+                messages=[
+                    {"role": "system", "content": system_prompt},
+                    {"role": "user", "content": prompt}
+                ]
+            )
+            return _parse_judge_response(response.choices[0].message.content)
+
+        except Exception as e:
+            # TODO: errors are not yet handled differently depending on their type; needs updating.
+            if attempt > 3:
+                # Give up immediately: sleeping before aborting would only waste 20 seconds.
+                print("Error.", e)
+                print("Impossible prompt, aborting..!")
+                return {
+                    'judge_message': "Impossible to judge due to repetition.",
+                    'judge_score': 0.0
+                }
+
+            print("Error. Retrying after 20 sec", e)
+            time.sleep(20)
+            attempt += 1
+
+def process_item(client, row, judge_model, df_judge_template, output_file):
+    """Judge one model-output row and append the result to ``output_file``.
+
+    The row is graded twice via create_answers (first turn only, then both
+    turns). Both verdicts are stored on the row under ``query_single`` and
+    ``query_multi``, and the row is written as one JSON line.
+    """
+    single_turn_verdict = create_answers(client, row, judge_model, df_judge_template)
+    multi_turn_verdict = create_answers(client, row, judge_model, df_judge_template, is_multi_turn=True)
+
+    row['query_single'], row['query_multi'] = single_turn_verdict, multi_turn_verdict
+    record = row.to_dict()
+
+    # Appends from different worker threads are serialized through the shared lock.
+    with LOCK, output_file.open('a', encoding='utf-8-sig') as f:
+        f.write(json.dumps(record, ensure_ascii=False) + '\n')
+
+def process_file(client, file_path: Path, output_dir: Path, judge_model, df_judge_template, threads: int, input_dir: Union[Path, None] = None):
+    """Judge every row of one generated-answers file using a thread pool.
+
+    Args:
+        client: API client handed through to process_item.
+        file_path: JSON Lines file containing the model outputs to judge.
+        output_dir: Root directory for the evaluated results.
+        judge_model: Name of the judge model.
+        df_judge_template: Judge templates frame handed through to process_item.
+        threads: Maximum number of worker threads.
+        input_dir: Root that ``file_path`` lives under; the result keeps the
+            same relative path below ``output_dir``. When omitted, the file is
+            written directly into ``output_dir`` under its own name.
+    """
+    print(f"- 현재 Processing : {file_path}")
+    df_model_outputs = pd.read_json(file_path, lines=True)
+
+    # The parsed CLI namespace is local to main(), so the input root has to be passed in explicitly.
+    base_dir = Path(input_dir) if input_dir is not None else file_path.parent
+    output_file = output_dir / file_path.relative_to(base_dir)
+    output_file.parent.mkdir(parents=True, exist_ok=True)
+
+    with ThreadPoolExecutor(max_workers=threads) as executor:
+        futures = [
+            executor.submit(process_item, client, row, judge_model, df_judge_template, output_file)
+            for _, row in df_model_outputs.iterrows()
+        ]
+
+    # All futures are finished once the executor context exits. Report worker
+    # failures instead of dropping them silently; one bad row does not abort the run.
+    for future in futures:
+        error = future.exception()
+        if error is not None:
+            print(f"Error while processing an item of {file_path}: {error!r}")
+
+def is_hidden(filepath: Path) -> bool:
+    """Return True when any component of ``filepath`` is a dot-file or dot-directory.
+
+    The special components ``.`` and ``..`` are navigation, not hidden
+    entries, so a path such as ``../generated/model.jsonl`` is not hidden.
+    """
+    return any(part.startswith('.') and part not in ('.', '..') for part in filepath.parts)
+
+def main():
+    """Judge every not-yet-evaluated ``*.jsonl`` file below the model output directory."""
+    args = get_args()
+    client = create_azure_client(args.openai_api_key)
+    df_judge_template = load_judge_template()
+
+    input_dir = Path(args.model_output_dir)
+    output_dir = Path('./evaluated')
+
+    # Filter out hidden files. Only the part below input_dir is inspected, so a
+    # hidden ancestor of the input directory itself does not exclude everything.
+    json_files = [file for file in input_dir.rglob('*.jsonl') if not is_hidden(file.relative_to(input_dir))]
+
+    for file_path in json_files:
+        output_file_path = output_dir / file_path.relative_to(input_dir)
+        # NOTE(review): a partially written result from an interrupted run also counts as done here.
+        if output_file_path.exists():
+            print(f"이미 평가 완료.. : {file_path}")
+            continue
+        process_file(client, file_path, output_dir, args.judge_model, df_judge_template, args.threads, input_dir=input_dir)
+        time.sleep(20) # ratelimit!
+
+if __name__ == "__main__":
+    main()
diff --git a/generator.py b/generator.py
@@ -1,69 +1,153 @@
 import argparse
-import os
-
 import pandas as pd
-from vllm import LLM, SamplingParams
+import os
 
+# Use aphrodite-engine or vLLM
+try:
+    from aphrodite import LLM, SamplingParams
+    print("- Using aphrodite-engine")
+
+except ImportError:
+    from vllm import LLM, SamplingParams
+    print("- Using vLLM")
+
+# Few-shot prefixes keyed by strategy name. Each value is a list of chat
+# messages that the generation loop prepends to every benchmark question
+# before applying the chat template; one output file is written per key.
+PROMPT_STRATEGY = {
+    # Chain-of-thought: a system instruction asking for step-by-step reasoning
+    # followed by the final answer, plus one worked user/assistant example.
+    "cot-1-shot": [
+        {"role": "system", "content": "문제를 해결하는 과정을 단계별로 명확하게 설명한 다음, 최종 답변을 제시해 주세요."},
+        {"role": "user", "content": "한 도시에서 매년 나무를 심고 있습니다. 첫해에 100그루를 심었고, 매년 전년도에 심은 나무의 수보다 20%씩 더 많은 나무를 심고 있습니다. 2년 후에는 총 몇 그루의 나무를 심게 될까요?"},
+        {"role": "assistant", "content": """이 문제를 해결하기 위해 단계별로 접근해 보겠습니다.
+
+1단계: 주어진 정보 정리
+- 첫해에 100그루를 심음
+- 매년 전년도보다 20% 더 많은 나무를 심음
+- 2년 후의 총 나무 수를 구해야 함
+
+2단계: 두 번째 해에 심은 나무 수 계산
+- 첫해보다 20% 증가
+- 100 + (100의 20%) = 100 + 20 = 120그루
+
+3단계: 세 번째 해에 심은 나무 수 계산
+- 두 번째 해보다 20% 증가
+- 120 + (120의 20%) = 120 + 24 = 144그루
+
+4단계: 3년 동안 심은 총 나무 수 계산
+- 첫해: 100그루
+- 둘째 해: 120그루
+- 셋째 해: 144그루
+- 총합: 100 + 120 + 144 = 364그루
+
+따라서, 2년 후(즉, 3년째)에는 총 364그루의 나무를 심게 됩니다."""}
+    ],
+
+    # Plain one-shot: a single user/assistant example with no system message.
+    "1-shot": [
+        {"role": "user", "content": "Git의 rebase와 merge의 차이점을 설명하고, 각각 어떤 상황에서 사용하는 것이 좋은지 예를 들어 설명해주세요."},
+        {"role": "assistant", "content": """Git에서 `rebase`와 `merge`는 두 개의 브랜치를 통합하는 데 사용되는 두 가지 주요 명령어입니다. 이들의 차이점과 사용 상황을 설명하겠습니다.
+
+### Rebase
+
+#### 개념
+- `rebase`는 하나의 브랜치의 커밋을 다른 브랜치의 끝에 다시 적용하는 것입니다. 즉, 커밋 히스토리를 새로 만든다고 할 수 있습니다.
+- 결과적으로 커밋 히스토리가 깔끔하게 정리됩니다.
+
+#### 사용 상황
+- **깔끔한 히스토리가 필요할 때**: 브랜치 히스토리가 병합 없이 일직선으로 이어지도록 하고 싶을 때 유용합니다.
+- **작업 중인 브랜치를 최신 상태로 유지하고 싶을 때**: 예를 들어, `feature` 브랜치에서 작업 중인데 `main` 브랜치에서 업데이트가 있을 경우, `feature` 브랜치를 `rebase`하여 최신 상태로 유지할 수 있습니다.
+
+#### 예제
+1. `feature` 브랜치에서 작업 중입니다.
+2. `main` 브랜치에 새로운 커밋이 추가되었습니다.
+3. `feature` 브랜치에서 `git rebase main`을 실행합니다.
+4. `feature` 브랜치의 커밋들이 `main` 브랜치의 최신 커밋들 뒤에 재배치됩니다.
+
+```bash
+git checkout feature
+git rebase main
+```
+
+### Merge
+
+#### 개념
+- `merge`는 두 개의 브랜치를 합치는 방법으로, 두 브랜치의 히스토리를 유지하며 새로운 병합 커밋을 생성합니다.
+- `merge`는 기존의 히스토리를 그대로 보존합니다.
+
+#### 사용 상황
+- **히스토리를 유지하고 싶을 때**: 각 브랜치의 작업 기록을 그대로 보존하면서 병합할 때 사용합니다.
+- **협업 중 충돌을 명확히 해결하고 기록을 남기고 싶을 때**: 여러 사람이 같은 저장소에서 작업할 때, `merge`는 충돌 해결 과정과 그 기록을 명확히 남길 수 있습니다.
+
+#### 예제
+1. `feature` 브랜치에서 작업을 완료했습니다.
+2. `main` 브랜치에 병합하고 싶습니다.
+3. `main` 브랜치로 체크아웃한 후 `feature` 브랜치를 병합합니다.
+
+```bash
+git checkout main
+git merge feature
+```
+
+### 요약
+
+- **Rebase**는 히스토리를 깔끔하게 정리하는 데 유리하며, 주로 개인 작업이나 `feature` 브랜치를 최신 상태로 유지할 때 사용됩니다.
+- **Merge**는 두 브랜치의 작업 히스토리를 보존하면서 병합하며, 협업 과정에서 충돌 해결과 기록을 명확히 남기는 데 유리합니다.
+
+각 방법의 장단점과 사용 상황을 고려하여 적절히 선택하는 것이 중요합니다."""}
+],
+    # Zero-shot: no prefix messages at all.
+    "default": [],
+}
 
 parser = argparse.ArgumentParser()
-parser.add_argument("--gpu_devices", help=" : CUDA_VISIBLE_DEVICES", default="0")
-parser.add_argument("--model", help=" : Model to evaluate", default="yanolja/EEVE-Korean-Instruct-2.8B-v1.0")
-parser.add_argument("--template", help=" : Template File Location", default="./templates/template-EEVE.json")
-parser.add_argument("--model_len", help=" : Maximum Model Length", default=4096, type=int)
+parser.add_argument('-g' ,'--gpu_devices', help=' : CUDA_VISIBLE_DEVICES', default='0')
+parser.add_argument('-m', '--model', help=' : Model to evaluate', default='yanolja/EEVE-Korean-Instruct-2.8B-v1.0')
+parser.add_argument('-ml', '--model_len', help=' : Maximum Model Length', default=4096, type=int)
 args = parser.parse_args()
 
-os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu_devices
-gpu_counts = len(args.gpu_devices.split(","))
+print(f"Args - {args}")
 
-df_config = pd.read_json(args.template, typ="series")
-SINGLE_TURN_TEMPLATE = df_config.iloc[0]
-DOUBLE_TURN_TEMPLATE = df_config.iloc[1]
+os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu_devices
+gpu_counts = len(args.gpu_devices.split(','))
 
 llm = LLM(
-    model=args.model, tensor_parallel_size=gpu_counts, max_model_len=int(args.model_len), gpu_memory_utilization=0.95
-)
+    model=args.model,
+    tensor_parallel_size=gpu_counts,
+    max_model_len=args.model_len,
+    gpu_memory_utilization=0.8,
+    trust_remote_code=True # !
+    )
+
 sampling_params = SamplingParams(
     temperature=0,
-    top_p=1,
-    top_k=-1,
-    early_stopping=True,
-    best_of=4,
-    use_beam_search=True,
-    skip_special_tokens=False,
+    skip_special_tokens=True,
     max_tokens=args.model_len,
-    stop=["<|endoftext|>", "</s>", "<|im_end|>", "[INST]", "[/INST]", "<end_of_turn>", "<start_of_turn>"],
-)
-
-df_questions = pd.read_json("questions.jsonl", orient="records", encoding="utf-8-sig", lines=True)
-
-
-def format_single_turn_question(question):
-    return SINGLE_TURN_TEMPLATE.format(question[0])
-
-
-single_turn_questions = df_questions["questions"].map(format_single_turn_question)
-single_turn_outputs = [
-    output.outputs[0].text.strip() for output in llm.generate(single_turn_questions, sampling_params)
-]
-
-
-def format_double_turn_question(question, single_turn_output):
-    return DOUBLE_TURN_TEMPLATE.format(question[0], single_turn_output, question[1])
-
-
-multi_turn_questions = df_questions[["questions", "id"]].apply(
-    lambda x: format_double_turn_question(x["questions"], single_turn_outputs[x["id"] - 1]), axis=1
-)  # bad code ig?
-
-multi_turn_outputs = [output.outputs[0].text.strip() for output in llm.generate(multi_turn_questions, sampling_params)]
-
-df_output = pd.DataFrame(
-    {
-        "id": df_questions["id"],
-        "category": df_questions["category"],
-        "questions": df_questions["questions"],
-        "outputs": list(zip(single_turn_outputs, multi_turn_outputs)),
-        "references": df_questions["references"],
-    }
-)
-df_output.to_json(f'{str(args.model).replace("/", "_")}.jsonl', orient="records", lines=True, force_ascii=False)
+    stop=[
+        '<|endoftext|>',
+        '[INST]', '[/INST]',
+        '<|im_end|>',
+        '<|end|>'
+        ]
+    )
+
+df_questions = pd.read_json(
+    'questions.jsonl',
+    orient='records',
+    encoding="utf-8-sig",
+    lines=True
+    )
+
+# One output directory per model, one JSONL file per prompt strategy.
+output_dir = os.path.join("./generated", args.model)
+os.makedirs(output_dir, exist_ok=True)
+
+# The tokenizer is the same for every strategy, so look it up once.
+tokenizer = llm.llm_engine.tokenizer.tokenizer
+
+for strategy_name, prompts in PROMPT_STRATEGY.items():
+    def format_single_turn_question(question):
+        """Render the strategy prefix plus the first question as a generation prompt."""
+        messages = prompts + [{"role": "user", "content": question[0]}]
+        return tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+
+    single_turn_questions = df_questions['questions'].map(format_single_turn_question)
+    print(single_turn_questions.iloc[0])
+    single_turn_outputs = [output.outputs[0].text.strip() for output in llm.generate(single_turn_questions, sampling_params)]
+
+    def format_double_turn_question(question, single_turn_output):
+        """Render prefix, first question, the model's first answer and the follow-up question."""
+        messages = prompts + [
+            {"role": "user", "content": question[0]},
+            {"role": "assistant", "content": single_turn_output},
+            {"role": "user", "content": question[1]},
+        ]
+        return tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+
+    # Pair each question with its first-turn output by position. This is the same
+    # pairing the output frame uses, and unlike an `id - 1` lookup it does not
+    # require ids to be exactly 1..N in file order.
+    multi_turn_questions = pd.Series(
+        [
+            format_double_turn_question(question, single_turn_output)
+            for question, single_turn_output in zip(df_questions['questions'], single_turn_outputs)
+        ],
+        index=df_questions.index,
+    )
+    multi_turn_outputs = [output.outputs[0].text.strip() for output in llm.generate(multi_turn_questions, sampling_params)]
+
+    df_output = pd.DataFrame({
+        'id': df_questions['id'],
+        'category': df_questions['category'],
+        'questions': df_questions['questions'],
+        'outputs': list(zip(single_turn_outputs, multi_turn_outputs)),
+        "references": df_questions['references'],
+    })
+    df_output.to_json(os.path.join(output_dir, f'{strategy_name}.jsonl'), orient='records', lines=True, force_ascii=False)