From df0f3ea143ad0c7b9b9afb4d70018bb633444f86 Mon Sep 17 00:00:00 2001 From: Jeffrey Ip Date: Thu, 2 Jan 2025 14:01:51 +0800 Subject: [PATCH] reformat --- deepeval/benchmarks/arc/arc.py | 39 ++++++++----- deepeval/benchmarks/bbq/bbq.py | 33 +++++++---- .../big_bench_hard/big_bench_hard.py | 55 ++++++++++++------ deepeval/benchmarks/bool_q/bool_q.py | 37 +++++++----- deepeval/benchmarks/drop/drop.py | 46 ++++++++++----- deepeval/benchmarks/gsm8k/gsm8k.py | 39 ++++++++----- deepeval/benchmarks/hellaswag/hellaswag.py | 47 ++++++++++----- deepeval/benchmarks/human_eval/human_eval.py | 22 ++++--- deepeval/benchmarks/lambada/lambada.py | 32 ++++++----- deepeval/benchmarks/logi_qa/logi_qa.py | 50 ++++++++++------ deepeval/benchmarks/math_qa/math_qa.py | 51 ++++++++++++----- deepeval/benchmarks/mmlu/mmlu.py | 49 +++++++++++----- deepeval/benchmarks/schema.py | 3 +- deepeval/benchmarks/squad/squad.py | 31 +++++----- .../benchmarks/truthful_qa/truthful_qa.py | 57 +++++++++++++------ deepeval/benchmarks/winogrande/winogrande.py | 31 ++++++---- tests/test_benchmarks.py | 4 +- 17 files changed, 415 insertions(+), 211 deletions(-) diff --git a/deepeval/benchmarks/arc/arc.py b/deepeval/benchmarks/arc/arc.py index 4287b8bc..59d055d3 100644 --- a/deepeval/benchmarks/arc/arc.py +++ b/deepeval/benchmarks/arc/arc.py @@ -42,11 +42,12 @@ def __init__( self.overall_score: Optional[float] = None self.verbose_mode = verbose_mode if not confinement_instructions: - self.confinement_instructions = "Output 'A', 'B', 'C', or 'D'. Full answer not needed." + self.confinement_instructions = ( + "Output 'A', 'B', 'C', or 'D'. Full answer not needed." + ) else: self.confinement_instructions = confinement_instructions - def evaluate(self, model: DeepEvalBaseLLM) -> Dict: with capture_benchmark_run("ARC", self.n_problems): overall_correct_predictions = 0 @@ -57,15 +58,23 @@ def evaluate(self, model: DeepEvalBaseLLM) -> Dict: goldens: List[Golden] = self.load_benchmark_dataset(self.mode)[ : self.n_problems ] - for idx, golden in enumerate(tqdm( - goldens, desc=f"Processing {self.n_problems} problems" - )): + for idx, golden in enumerate( + tqdm(goldens, desc=f"Processing {self.n_problems} problems") + ): prediction, score = self.predict(model, golden).values() if score: overall_correct_predictions += 1 - predictions_row.append((golden.input, prediction, golden.expected_output, score)) + predictions_row.append( + (golden.input, prediction, golden.expected_output, score) + ) if self.verbose_mode: - self.print_verbose_logs(idx, golden.input, golden.expected_output, prediction, score) + self.print_verbose_logs( + idx, + golden.input, + golden.expected_output, + prediction, + score, + ) # Calculate overall accuracy overall_accuracy = ( @@ -74,7 +83,8 @@ def evaluate(self, model: DeepEvalBaseLLM) -> Dict: print(f"Overall ARC Accuracy: {overall_accuracy}") self.predictions = pd.DataFrame( - predictions_row, columns=["Input", "Prediction", "Expected Output", "Correct"] + predictions_row, + columns=["Input", "Prediction", "Expected Output", "Correct"], ) self.overall_score = overall_accuracy @@ -130,18 +140,18 @@ def load_benchmark_dataset(self, mode: ARCMode) -> List[Golden]: golden = Golden(input=input, expected_output=expected_output) goldens.append(golden) return goldens - + def print_verbose_logs( self, idx: int, - input: str, + input: str, expected_output: str, - prediction: str, - score: int + prediction: str, + score: int, ) -> str: steps = [ f"Input:\n{input}", - f"Score: {score}\nPrediction: {prediction}\nExpected Output: 
{expected_output}" + f"Score: {score}\nPrediction: {prediction}\nExpected Output: {expected_output}", ] verbose_logs = "" for i in range(len(steps) - 1): @@ -159,6 +169,5 @@ def print_verbose_logs( print(verbose_logs + f"\n \n{steps[-1]}") print("") print("=" * 70) - - return verbose_logs + return verbose_logs diff --git a/deepeval/benchmarks/bbq/bbq.py b/deepeval/benchmarks/bbq/bbq.py index 8b616330..dc46bd59 100644 --- a/deepeval/benchmarks/bbq/bbq.py +++ b/deepeval/benchmarks/bbq/bbq.py @@ -33,7 +33,9 @@ def __init__( self.overall_score: Optional[float] = None self.verbose_mode: bool = verbose_mode if not confinement_instructions: - self.confinement_instructions = "Output only 'A', 'B', or 'C. Full answer not needed." + self.confinement_instructions = ( + "Output only 'A', 'B', or 'C. Full answer not needed." + ) else: self.confinement_instructions = confinement_instructions @@ -56,9 +58,9 @@ def evaluate(self, model: DeepEvalBaseLLM) -> Dict: overall_total_predictions += len(goldens) # Calculate task accuracy - for idx, golden in enumerate(tqdm( - goldens, desc=f"Processing {task.value}" - )): + for idx, golden in enumerate( + tqdm(goldens, desc=f"Processing {task.value}") + ): prediction, score = self.predict(model, golden).values() if score: task_correct_predictions += 1 @@ -73,7 +75,14 @@ def evaluate(self, model: DeepEvalBaseLLM) -> Dict: ) ) if self.verbose_mode: - self.print_verbose_logs(idx, task.value, golden.input, golden.expected_output, prediction, score) + self.print_verbose_logs( + idx, + task.value, + golden.input, + golden.expected_output, + prediction, + score, + ) task_accuracy = ( task_correct_predictions / task_total_predictions @@ -166,19 +175,19 @@ def load_benchmark_dataset(self, task: BBQTask) -> List[Golden]: golden = Golden(input=input, expected_output=expected_output) goldens.append(golden) return goldens - + def print_verbose_logs( self, idx: int, - task_value: str, - input: str, + task_value: str, + input: str, expected_output: str, - prediction: str, - score: int + prediction: str, + score: int, ) -> str: steps = [ f"Input:\n{input}", - f"Score: {score}\nPrediction: {prediction}\nExpected Output: {expected_output}" + f"Score: {score}\nPrediction: {prediction}\nExpected Output: {expected_output}", ] verbose_logs = "" for i in range(len(steps) - 1): @@ -196,5 +205,5 @@ def print_verbose_logs( print(verbose_logs + f"\n \n{steps[-1]}") print("") print("=" * 70) - + return verbose_logs diff --git a/deepeval/benchmarks/big_bench_hard/big_bench_hard.py b/deepeval/benchmarks/big_bench_hard/big_bench_hard.py index 87fb7eb1..7aed3082 100644 --- a/deepeval/benchmarks/big_bench_hard/big_bench_hard.py +++ b/deepeval/benchmarks/big_bench_hard/big_bench_hard.py @@ -43,6 +43,7 @@ BigBenchHardTask.WORD_SORTING: "\n\nOutput only the sequence of words separated by white space. 
Full answer not needed.", } + class BigBenchHard(DeepEvalBaseBenchmark): def __init__( self, @@ -51,7 +52,9 @@ def __init__( enable_cot: bool = True, n_problems_per_task: Optional[int] = None, verbose_mode: bool = False, - confinement_instructions_dict: Optional[Dict[BigBenchHardTask, str]] = None, + confinement_instructions_dict: Optional[ + Dict[BigBenchHardTask, str] + ] = None, **kwargs, ): assert n_shots <= 3, "BBH only supports n_shots <= 3" @@ -115,9 +118,9 @@ def evaluate( ) else: # Calculate task accuracy - for idx, golden in enumerate(tqdm( - goldens, desc=f"Processing {task.value}" - )): + for idx, golden in enumerate( + tqdm(goldens, desc=f"Processing {task.value}") + ): prediction, score = self.predict( model, task, golden ).values() @@ -125,10 +128,23 @@ def evaluate( task_correct_predictions += 1 overall_correct_predictions += 1 predictions_row.append( - (task.value, golden.input, prediction, golden.expected_output, score) + ( + task.value, + golden.input, + prediction, + golden.expected_output, + score, + ) ) if self.verbose_mode: - self.print_verbose_logs(idx, task.value, golden.input, golden.expected_output, prediction, score) + self.print_verbose_logs( + idx, + task.value, + golden.input, + golden.expected_output, + prediction, + score, + ) task_accuracy = ( task_correct_predictions / task_total_predictions @@ -148,7 +164,13 @@ def evaluate( # Columns: 'Task', 'Input', 'Prediction', 'Score' self.predictions = pd.DataFrame( predictions_row, - columns=["Task", "Input", "Prediction", "Expected Output", "Correct"], + columns=[ + "Task", + "Input", + "Prediction", + "Expected Output", + "Correct", + ], ) self.task_scores = pd.DataFrame( scores_row, columns=["Task", "Score"] @@ -237,7 +259,9 @@ def batch_predict( return res def load_benchmark_dataset(self, task: BigBenchHardTask) -> List[Golden]: - dataset_mapping = {task: f"{task.value}_dataset" for task in BigBenchHardTask} + dataset_mapping = { + task: f"{task.value}_dataset" for task in BigBenchHardTask + } dataset_attr = dataset_mapping.get(task) if dataset_attr: if not hasattr(self, dataset_attr): @@ -254,19 +278,19 @@ def load_benchmark_dataset(self, task: BigBenchHardTask) -> List[Golden]: goldens.append(golden) return goldens - + def print_verbose_logs( self, idx: int, - task_value: str, - input: str, + task_value: str, + input: str, expected_output: str, - prediction: str, - score: int + prediction: str, + score: int, ) -> str: steps = [ f"Input:\n{input}", - f"Score: {score}\nPrediction: {prediction}\nExpected Output: {expected_output}" + f"Score: {score}\nPrediction: {prediction}\nExpected Output: {expected_output}", ] verbose_logs = "" for i in range(len(steps) - 1): @@ -284,6 +308,5 @@ def print_verbose_logs( print(verbose_logs + f"\n \n{steps[-1]}") print("") print("=" * 70) - - return verbose_logs + return verbose_logs diff --git a/deepeval/benchmarks/bool_q/bool_q.py b/deepeval/benchmarks/bool_q/bool_q.py index d0382174..5c9eb28e 100644 --- a/deepeval/benchmarks/bool_q/bool_q.py +++ b/deepeval/benchmarks/bool_q/bool_q.py @@ -31,7 +31,9 @@ def __init__( self.overall_score: Optional[float] = None self.verbose_mode = verbose_mode if not confinement_instructions: - self.confinement_instructions = "Make sure to output only 'Yes' or 'No'." + self.confinement_instructions = ( + "Make sure to output only 'Yes' or 'No'." 
+ ) else: self.confinement_instructions = confinement_instructions @@ -43,15 +45,23 @@ def evaluate(self, model: DeepEvalBaseLLM) -> Dict: # Solving each problem goldens = self.load_benchmark_dataset()[: self.n_problems] - for idx, golden in enumerate(tqdm( - goldens, desc=f"Processing {self.n_problems} problems" - )): + for idx, golden in enumerate( + tqdm(goldens, desc=f"Processing {self.n_problems} problems") + ): prediction, score = self.predict(model, golden).values() if score: overall_correct_predictions += 1 - predictions_row.append((golden.input, prediction, golden.expected_output, score)) + predictions_row.append( + (golden.input, prediction, golden.expected_output, score) + ) if self.verbose_mode: - self.print_verbose_logs(idx, golden.input, golden.expected_output, prediction, score) + self.print_verbose_logs( + idx, + golden.input, + golden.expected_output, + prediction, + score, + ) # Calculate overall accuracy overall_accuracy = ( @@ -60,7 +70,8 @@ def evaluate(self, model: DeepEvalBaseLLM) -> Dict: print(f"Overall BoolQ Accuracy: {overall_accuracy}") self.predictions = pd.DataFrame( - predictions_row, columns=["Input", "Prediction", "Expected Output", "Correct"] + predictions_row, + columns=["Input", "Prediction", "Expected Output", "Correct"], ) self.overall_score = overall_accuracy @@ -110,18 +121,18 @@ def load_benchmark_dataset(self) -> List[Golden]: goldens.append(golden) return goldens - + def print_verbose_logs( self, idx: int, - input: str, + input: str, expected_output: str, - prediction: str, - score: int + prediction: str, + score: int, ) -> str: steps = [ f"Input:\n{input}", - f"Score: {score}\nPrediction: {prediction}\nExpected Output: {expected_output}" + f"Score: {score}\nPrediction: {prediction}\nExpected Output: {expected_output}", ] verbose_logs = "" for i in range(len(steps) - 1): @@ -139,5 +150,5 @@ def print_verbose_logs( print(verbose_logs + f"\n \n{steps[-1]}") print("") print("=" * 70) - + return verbose_logs diff --git a/deepeval/benchmarks/drop/drop.py b/deepeval/benchmarks/drop/drop.py index 565cc0af..6db75f65 100644 --- a/deepeval/benchmarks/drop/drop.py +++ b/deepeval/benchmarks/drop/drop.py @@ -86,18 +86,31 @@ def evaluate( (task.value, golden.input, prediction, score) ) else: - for idx, golden in enumerate(tqdm( - goldens, desc=f"Processing {task.value}" - )): + for idx, golden in enumerate( + tqdm(goldens, desc=f"Processing {task.value}") + ): prediction, score = self.predict(model, golden).values() if score: task_correct_predictions += 1 overall_correct_predictions += 1 predictions_row.append( - (task.value, golden.input, prediction, golden.expected_output, score) + ( + task.value, + golden.input, + prediction, + golden.expected_output, + score, + ) ) if self.verbose_mode: - self.print_verbose_logs(idx, task.value, golden.input, golden.expected_output, prediction, score) + self.print_verbose_logs( + idx, + task.value, + golden.input, + golden.expected_output, + prediction, + score, + ) task_accuracy = ( task_correct_predictions / task_total_predictions @@ -117,7 +130,13 @@ def evaluate( # Columns: 'Task', 'Input', 'Prediction', 'Score' self.predictions = pd.DataFrame( predictions_row, - columns=["Task", "Input", "Prediction", "Expected Output", "Correct"], + columns=[ + "Task", + "Input", + "Prediction", + "Expected Output", + "Correct", + ], ) self.task_scores = pd.DataFrame( scores_row, columns=["Task", "Score"] @@ -263,19 +282,19 @@ def load_benchmark_dataset(self, task: DROPTask) -> List[Golden]: goldens.append(golden) return goldens - + 
def print_verbose_logs( self, idx: int, - task_value: str, - input: str, + task_value: str, + input: str, expected_output: str, - prediction: str, - score: int + prediction: str, + score: int, ) -> str: steps = [ f"Input:\n{input}", - f"Score: {score}\nPrediction: {prediction}\nAccepted Expected Output: {expected_output}" + f"Score: {score}\nPrediction: {prediction}\nAccepted Expected Output: {expected_output}", ] verbose_logs = "" for i in range(len(steps) - 1): @@ -293,6 +312,5 @@ def print_verbose_logs( print(verbose_logs + f"\n \n{steps[-1]}") print("") print("=" * 70) - - return verbose_logs + return verbose_logs diff --git a/deepeval/benchmarks/gsm8k/gsm8k.py b/deepeval/benchmarks/gsm8k/gsm8k.py index c2afbb28..e1a344ed 100644 --- a/deepeval/benchmarks/gsm8k/gsm8k.py +++ b/deepeval/benchmarks/gsm8k/gsm8k.py @@ -33,7 +33,9 @@ def __init__( self.overall_score: Optional[float] = None self.verbose_mode = verbose_mode if not confinement_instructions: - self.confinement_instructions = "Make sure to output only the numerical answer." + self.confinement_instructions = ( + "Make sure to output only the numerical answer." + ) else: self.confinement_instructions = confinement_instructions @@ -45,15 +47,23 @@ def evaluate(self, model: DeepEvalBaseLLM) -> Dict: # Solving each problem goldens = self.load_benchmark_dataset()[: self.n_problems] - for idx, golden in enumerate(tqdm( - goldens, desc=f"Processing {self.n_problems} problems" - )): + for idx, golden in enumerate( + tqdm(goldens, desc=f"Processing {self.n_problems} problems") + ): prediction, score = self.predict(model, golden).values() if score: overall_correct_predictions += 1 - predictions_row.append((golden.input, prediction, golden.expected_output, score)) + predictions_row.append( + (golden.input, prediction, golden.expected_output, score) + ) if self.verbose_mode: - self.print_verbose_logs(idx, golden.input, golden.expected_output, prediction, score) + self.print_verbose_logs( + idx, + golden.input, + golden.expected_output, + prediction, + score, + ) # Calculate overall accuracy overall_accuracy = ( @@ -62,7 +72,8 @@ def evaluate(self, model: DeepEvalBaseLLM) -> Dict: print(f"Overall GSM8K Accuracy: {overall_accuracy}") self.predictions = pd.DataFrame( - predictions_row, columns=["Input", "Prediction", "Expected Output", "Correct"] + predictions_row, + columns=["Input", "Prediction", "Expected Output", "Correct"], ) self.overall_score = overall_accuracy @@ -89,7 +100,7 @@ def predict(self, model: DeepEvalBaseLLM, golden: Golden) -> Dict: except TypeError: prompt += f"\n\n{self.confinement_instructions}" prediction = model.generate(prompt) - + # For native models, shouldn't happen but just in case if isinstance(prediction, tuple): prediction = prediction[0] @@ -126,18 +137,18 @@ def load_benchmark_dataset(self) -> List[Golden]: goldens.append(golden) return goldens - + def print_verbose_logs( self, idx: int, - input: str, + input: str, expected_output: str, - prediction: str, - score: int + prediction: str, + score: int, ) -> str: steps = [ f"Input:\n{input}", - f"Score: {score}\nPrediction: {prediction}\nExpected Output: {expected_output}" + f"Score: {score}\nPrediction: {prediction}\nExpected Output: {expected_output}", ] verbose_logs = "" for i in range(len(steps) - 1): @@ -155,5 +166,5 @@ def print_verbose_logs( print(verbose_logs + f"\n \n{steps[-1]}") print("") print("=" * 70) - + return verbose_logs diff --git a/deepeval/benchmarks/hellaswag/hellaswag.py b/deepeval/benchmarks/hellaswag/hellaswag.py index 1bf22573..fbd20332 
100644 --- a/deepeval/benchmarks/hellaswag/hellaswag.py +++ b/deepeval/benchmarks/hellaswag/hellaswag.py @@ -38,7 +38,9 @@ def __init__( self.overall_score: Optional[float] = None self.verbose_mode: bool = verbose_mode if not confinement_instructions: - self.confinement_instructions = "Output 'A', 'B', 'C', or 'D'. Full answer not needed." + self.confinement_instructions = ( + "Output 'A', 'B', 'C', or 'D'. Full answer not needed." + ) else: self.confinement_instructions = confinement_instructions @@ -84,9 +86,9 @@ def evaluate( (task.value, golden.input, prediction, score) ) else: - for idx, golden in enumerate(tqdm( - goldens, desc=f"Processing {task.value}" - )): + for idx, golden in enumerate( + tqdm(goldens, desc=f"Processing {task.value}") + ): prediction, score = self.predict( model, task, golden ).values() @@ -94,10 +96,23 @@ def evaluate( task_correct_predictions += 1 overall_correct_predictions += 1 predictions_row.append( - (task.value, golden.input, prediction, golden.expected_output, score) + ( + task.value, + golden.input, + prediction, + golden.expected_output, + score, + ) ) if self.verbose_mode: - self.print_verbose_logs(idx, task.value, golden.input, golden.expected_output, prediction, score) + self.print_verbose_logs( + idx, + task.value, + golden.input, + golden.expected_output, + prediction, + score, + ) task_accuracy = ( task_correct_predictions / task_total_predictions @@ -117,7 +132,13 @@ def evaluate( # Columns: 'Task', 'Input', 'Prediction', 'Score' self.predictions = pd.DataFrame( predictions_row, - columns=["Task", "Input", "Prediction", "Expected Output", "Correct"], + columns=[ + "Task", + "Input", + "Prediction", + "Expected Output", + "Correct", + ], ) self.task_scores = pd.DataFrame( scores_row, columns=["Task", "Score"] @@ -251,15 +272,15 @@ def load_benchmark_dataset(self, task: HellaSwagTask) -> List[Golden]: def print_verbose_logs( self, idx: int, - task_value: str, - input: str, + task_value: str, + input: str, expected_output: str, - prediction: str, - score: int + prediction: str, + score: int, ) -> str: steps = [ f"Input:\n{input}", - f"Score: {score}\nPrediction: {prediction}\nExpected Output: {expected_output}" + f"Score: {score}\nPrediction: {prediction}\nExpected Output: {expected_output}", ] verbose_logs = "" for i in range(len(steps) - 1): @@ -277,5 +298,5 @@ def print_verbose_logs( print(verbose_logs + f"\n \n{steps[-1]}") print("") print("=" * 70) - + return verbose_logs diff --git a/deepeval/benchmarks/human_eval/human_eval.py b/deepeval/benchmarks/human_eval/human_eval.py index aa038584..08e64094 100644 --- a/deepeval/benchmarks/human_eval/human_eval.py +++ b/deepeval/benchmarks/human_eval/human_eval.py @@ -31,7 +31,7 @@ def __init__( self.predictions: Optional[pd.DataFrame] = None self.task_scores: Optional[pd.DataFrame] = None self.overall_score: Optional[float] = None - self.verbose_mode: bool = False, + self.verbose_mode: bool = (False,) def evaluate(self, model: DeepEvalBaseLLM, k: int) -> Dict: with capture_benchmark_run("HumanEval", len(self.tasks)): @@ -57,7 +57,9 @@ def evaluate(self, model: DeepEvalBaseLLM, k: int) -> Dict: (task.value, golden.input, prediction, score) ) if self.verbose_mode: - self.print_verbose_logs(task.value, golden.input, prediction, score) + self.print_verbose_logs( + task.value, golden.input, prediction, score + ) print( f"HumanEval Task Accuracy (task={task.value}): {task_correct}" ) @@ -124,7 +126,7 @@ def load_benchmark_dataset(self, task: HumanEvalTask) -> List[Golden]: else: dataset = 
load_dataset("openai_humaneval", trust_remote_code=True) self.dataset = dataset - + # Filter tasks test_set = dataset["test"].filter( lambda data: data["entry_point"] == task.value @@ -134,17 +136,13 @@ def load_benchmark_dataset(self, task: HumanEvalTask) -> List[Golden]: input=test_set["prompt"], expected_output=test_set["test"] ) return golden - + def print_verbose_logs( - self, - task_value: str, - input: str, - prediction: str, - score: int + self, task_value: str, input: str, prediction: str, score: int ) -> str: steps = [ f"Input:\n{input}", - f"Score: {score}\nPrediction: {prediction}" + f"Score: {score}\nPrediction: {prediction}", ] verbose_logs = "" for i in range(len(steps) - 1): @@ -162,5 +160,5 @@ def print_verbose_logs( print(verbose_logs + f"\n \n{steps[-1]}") print("") print("=" * 70) - - return verbose_logs \ No newline at end of file + + return verbose_logs diff --git a/deepeval/benchmarks/lambada/lambada.py b/deepeval/benchmarks/lambada/lambada.py index 9ac8bf19..429b8d36 100644 --- a/deepeval/benchmarks/lambada/lambada.py +++ b/deepeval/benchmarks/lambada/lambada.py @@ -31,11 +31,12 @@ def __init__( self.overall_score: Optional[float] = None self.verbose_mode = verbose_mode if not confinement_instructions: - self.confinement_instructions = "Output the target word! Do not include punctuations." + self.confinement_instructions = ( + "Output the target word! Do not include punctuations." + ) else: self.confinement_instructions = confinement_instructions - def evaluate(self, model: DeepEvalBaseLLM) -> Dict: with capture_benchmark_run("LAMBADA", self.n_problems): overall_correct_predictions = 0 @@ -44,9 +45,9 @@ def evaluate(self, model: DeepEvalBaseLLM) -> Dict: # Solving each problem goldens = self.load_benchmark_dataset()[: self.n_problems] - for idx, golden in enumerate(tqdm( - goldens, desc=f"Processing {self.n_problems} problems" - )): + for idx, golden in enumerate( + tqdm(goldens, desc=f"Processing {self.n_problems} problems") + ): prediction, score = self.predict(model, golden).values() if score: overall_correct_predictions += 1 @@ -54,7 +55,13 @@ def evaluate(self, model: DeepEvalBaseLLM) -> Dict: (golden.input, prediction, golden.expected_output, score) ) if self.verbose_mode: - self.print_verbose_logs(idx, golden.input, golden.expected_output, prediction, score) + self.print_verbose_logs( + idx, + golden.input, + golden.expected_output, + prediction, + score, + ) # Calculate overall accuracy overall_accuracy = ( @@ -116,18 +123,18 @@ def load_benchmark_dataset(self) -> List[Golden]: goldens.append(golden) return goldens - + def print_verbose_logs( self, idx: int, - input: str, + input: str, expected_output: str, - prediction: str, - score: int + prediction: str, + score: int, ) -> str: steps = [ f"Input:\n{input}", - f"Score: {score}\nPrediction: {prediction}\nExpected Output: {expected_output}" + f"Score: {score}\nPrediction: {prediction}\nExpected Output: {expected_output}", ] verbose_logs = "" for i in range(len(steps) - 1): @@ -145,6 +152,5 @@ def print_verbose_logs( print(verbose_logs + f"\n \n{steps[-1]}") print("") print("=" * 70) - - return verbose_logs + return verbose_logs diff --git a/deepeval/benchmarks/logi_qa/logi_qa.py b/deepeval/benchmarks/logi_qa/logi_qa.py index faf9d8dc..a5288300 100644 --- a/deepeval/benchmarks/logi_qa/logi_qa.py +++ b/deepeval/benchmarks/logi_qa/logi_qa.py @@ -40,7 +40,9 @@ def __init__( self.overall_score: Optional[float] = None self.verbose_mode: bool = verbose_mode if not confinement_instructions: - 
self.confinement_instructions = "Output 'A', 'B', 'C', or 'D'. Full answer not needed." + self.confinement_instructions = ( + "Output 'A', 'B', 'C', or 'D'. Full answer not needed." + ) else: self.confinement_instructions = confinement_instructions @@ -87,18 +89,31 @@ def evaluate( (task.value, golden.input, prediction, score) ) else: - for idx, golden in enumerate(tqdm( - goldens, desc=f"Processing {task.value}" - )): + for idx, golden in enumerate( + tqdm(goldens, desc=f"Processing {task.value}") + ): prediction, score = self.predict(model, golden).values() if score: task_correct_predictions += 1 overall_correct_predictions += 1 predictions_row.append( - (task.value, golden.input, prediction, golden.expected_output, score) + ( + task.value, + golden.input, + prediction, + golden.expected_output, + score, + ) ) if self.verbose_mode: - self.print_verbose_logs(idx, task.value, golden.input, golden.expected_output, prediction, score) + self.print_verbose_logs( + idx, + task.value, + golden.input, + golden.expected_output, + prediction, + score, + ) task_accuracy = ( task_correct_predictions / task_total_predictions @@ -118,7 +133,13 @@ def evaluate( # Columns: 'Task', 'Input', 'Prediction', 'Score' self.predictions = pd.DataFrame( predictions_row, - columns=["Task", "Input", "Prediction", "Expected Output", "Correct"], + columns=[ + "Task", + "Input", + "Prediction", + "Expected Output", + "Correct", + ], ) self.task_scores = pd.DataFrame( scores_row, columns=["Task", "Score"] @@ -234,15 +255,15 @@ def download_and_load_hf_dataset(self, url): def print_verbose_logs( self, idx: int, - task_value: str, - input: str, + task_value: str, + input: str, expected_output: str, - prediction: str, - score: int + prediction: str, + score: int, ) -> str: steps = [ f"Input:\n{input}", - f"Score: {score}\nPrediction: {prediction}\nExpected Output: {expected_output}" + f"Score: {score}\nPrediction: {prediction}\nExpected Output: {expected_output}", ] verbose_logs = "" for i in range(len(steps) - 1): @@ -260,8 +281,5 @@ def print_verbose_logs( print(verbose_logs + f"\n \n{steps[-1]}") print("") print("=" * 70) - - return verbose_logs - - + return verbose_logs diff --git a/deepeval/benchmarks/math_qa/math_qa.py b/deepeval/benchmarks/math_qa/math_qa.py index bd310e8e..74552889 100644 --- a/deepeval/benchmarks/math_qa/math_qa.py +++ b/deepeval/benchmarks/math_qa/math_qa.py @@ -37,7 +37,9 @@ def __init__( self.overall_score: Optional[float] = None self.verbose_mode = verbose_mode if not confinement_instructions: - self.confinement_instructions = "Output 'a', 'b', 'c', or 'd'. Full answer not needed." + self.confinement_instructions = ( + "Output 'a', 'b', 'c', or 'd'. Full answer not needed." 
+ ) else: self.confinement_instructions = confinement_instructions @@ -84,18 +86,31 @@ def evaluate( (task.value, golden.input, prediction, score) ) else: - for idx, golden in enumerate(tqdm( - goldens, desc=f"Processing {task.value}" - )): + for idx, golden in enumerate( + tqdm(goldens, desc=f"Processing {task.value}") + ): prediction, score = self.predict(model, golden).values() if score: task_correct_predictions += 1 overall_correct_predictions += 1 predictions_row.append( - (task.value, golden.input, prediction, golden.expected_output, score) + ( + task.value, + golden.input, + prediction, + golden.expected_output, + score, + ) ) if self.verbose_mode: - self.print_verbose_logs(idx, task.value, golden.input, golden.expected_output, prediction, score) + self.print_verbose_logs( + idx, + task.value, + golden.input, + golden.expected_output, + prediction, + score, + ) task_accuracy = ( task_correct_predictions / task_total_predictions @@ -115,7 +130,13 @@ def evaluate( # Columns: 'Task', 'Input', 'Prediction', 'Score' self.predictions = pd.DataFrame( predictions_row, - columns=["Task", "Input", "Prediction", "Expected Output", "Correct"], + columns=[ + "Task", + "Input", + "Prediction", + "Expected Output", + "Correct", + ], ) self.task_scores = pd.DataFrame( scores_row, columns=["Task", "Score"] @@ -202,7 +223,7 @@ def load_benchmark_dataset(self, task: MathQATask) -> List[Golden]: else: dataset = load_dataset("allenai/math_qa", trust_remote_code=True) self.dataset = dataset - + # Construct test set test_set = dataset["test"].filter( lambda data: data["category"] == task.value @@ -218,15 +239,15 @@ def load_benchmark_dataset(self, task: MathQATask) -> List[Golden]: def print_verbose_logs( self, idx: int, - task_value: str, - input: str, + task_value: str, + input: str, expected_output: str, - prediction: str, - score: int + prediction: str, + score: int, ) -> str: steps = [ f"Input:\n{input}", - f"Score: {score}\nPrediction: {prediction}\nExpected Output: {expected_output}" + f"Score: {score}\nPrediction: {prediction}\nExpected Output: {expected_output}", ] verbose_logs = "" for i in range(len(steps) - 1): @@ -244,5 +265,5 @@ def print_verbose_logs( print(verbose_logs + f"\n \n{steps[-1]}") print("") print("=" * 70) - - return verbose_logs \ No newline at end of file + + return verbose_logs diff --git a/deepeval/benchmarks/mmlu/mmlu.py b/deepeval/benchmarks/mmlu/mmlu.py index e30d4576..7f75534e 100644 --- a/deepeval/benchmarks/mmlu/mmlu.py +++ b/deepeval/benchmarks/mmlu/mmlu.py @@ -36,7 +36,9 @@ def __init__( self.overall_score: Optional[float] = None self.verbose_mode: bool = verbose_mode if not confinement_instructions: - self.confinement_instructions = "Output 'A', 'B', 'C', or 'D'. Full answer not needed." + self.confinement_instructions = ( + "Output 'A', 'B', 'C', or 'D'. Full answer not needed." 
+ ) else: self.confinement_instructions = confinement_instructions @@ -83,9 +85,9 @@ def evaluate( (task.value, golden.input, prediction, score) ) else: - for idx, golden in enumerate(tqdm( - goldens, desc=f"Processing {task.value}" - )): + for idx, golden in enumerate( + tqdm(goldens, desc=f"Processing {task.value}") + ): prediction, score = self.predict( model, task, golden ).values() @@ -93,10 +95,23 @@ def evaluate( task_correct_predictions += 1 overall_correct_predictions += 1 predictions_row.append( - (task.value, golden.input, prediction, golden.expected_output, score) + ( + task.value, + golden.input, + prediction, + golden.expected_output, + score, + ) ) if self.verbose_mode: - self.print_verbose_logs(idx, task.value, golden.input, golden.expected_output, prediction, score) + self.print_verbose_logs( + idx, + task.value, + golden.input, + golden.expected_output, + prediction, + score, + ) task_accuracy = ( task_correct_predictions / task_total_predictions @@ -116,7 +131,13 @@ def evaluate( # Columns: 'Task', 'Input', 'Prediction', 'Score' self.predictions = pd.DataFrame( predictions_row, - columns=["Task", "Input", "Prediction", "Expected Output", "Correct"], + columns=[ + "Task", + "Input", + "Prediction", + "Expected Output", + "Correct", + ], ) self.task_scores = pd.DataFrame( scores_row, columns=["Task", "Score"] @@ -235,19 +256,19 @@ def load_benchmark_dataset(self, task: MMLUTask) -> List[Golden]: golden = Golden(input=input, expected_output=data["target"]) goldens.append(golden) return goldens - + def print_verbose_logs( self, idx: int, - task_value: str, - input: str, + task_value: str, + input: str, expected_output: str, - prediction: str, - score: int + prediction: str, + score: int, ) -> str: steps = [ f"Input:\n{input}", - f"Score: {score}\nPrediction: {prediction}\nExpected Output: {expected_output}" + f"Score: {score}\nPrediction: {prediction}\nExpected Output: {expected_output}", ] verbose_logs = "" for i in range(len(steps) - 1): @@ -265,5 +286,5 @@ def print_verbose_logs( print(verbose_logs + f"\n \n{steps[-1]}") print("") print("=" * 70) - + return verbose_logs diff --git a/deepeval/benchmarks/schema.py b/deepeval/benchmarks/schema.py index decb8430..080a4d0a 100644 --- a/deepeval/benchmarks/schema.py +++ b/deepeval/benchmarks/schema.py @@ -1,6 +1,7 @@ from pydantic import BaseModel from typing import List, Literal + class MultipleChoiceSchema(BaseModel): answer: Literal["A", "B", "C", "D"] @@ -167,4 +168,4 @@ class BBHMultipleChoice18Schema(BaseModel): "tracking_shuffled_objects_seven_objects": BBHMultipleChoice7Schema, "web_of_lies": AffirmationSchema, "word_sorting": StringSchema, -} \ No newline at end of file +} diff --git a/deepeval/benchmarks/squad/squad.py b/deepeval/benchmarks/squad/squad.py index 4cebc548..cd79f3f2 100644 --- a/deepeval/benchmarks/squad/squad.py +++ b/deepeval/benchmarks/squad/squad.py @@ -45,7 +45,6 @@ def __init__( else: self.confinement_instructions = confinement_instructions - def evaluate(self, model: DeepEvalBaseLLM) -> Dict: with capture_benchmark_run("SQuAD", len(self.tasks)): overall_correct_predictions = 0 @@ -64,9 +63,9 @@ def evaluate(self, model: DeepEvalBaseLLM) -> Dict: task_total_predictions = len(goldens) overall_total_predictions += len(goldens) - for idx, golden in enumerate(tqdm( - goldens, desc=f"Processing {task.value}" - )): + for idx, golden in enumerate( + tqdm(goldens, desc=f"Processing {task.value}") + ): prediction, score = self.predict(model, golden).values() if score: task_correct_predictions += 1 @@ -81,7 
+80,14 @@ def evaluate(self, model: DeepEvalBaseLLM) -> Dict: ) ) if self.verbose_mode: - self.print_verbose_logs(idx, task.value, golden.input, golden.expected_output, prediction, score) + self.print_verbose_logs( + idx, + task.value, + golden.input, + golden.expected_output, + prediction, + score, + ) task_accuracy = ( task_correct_predictions / task_total_predictions @@ -163,19 +169,19 @@ def load_benchmark_dataset(self, task: SQuADTask) -> List[Golden]: golden = Golden(input=input, expected_output=expected_output) goldens.append(golden) return goldens - + def print_verbose_logs( self, idx: int, - task_value: str, - input: str, + task_value: str, + input: str, expected_output: str, - prediction: str, - score: int + prediction: str, + score: int, ) -> str: steps = [ f"Input:\n{input}", - f"Score: {score}\nPrediction: {prediction}\nExpected Output: {expected_output}" + f"Score: {score}\nPrediction: {prediction}\nExpected Output: {expected_output}", ] verbose_logs = "" for i in range(len(steps) - 1): @@ -193,6 +199,5 @@ def print_verbose_logs( print(verbose_logs + f"\n \n{steps[-1]}") print("") print("=" * 70) - - return verbose_logs + return verbose_logs diff --git a/deepeval/benchmarks/truthful_qa/truthful_qa.py b/deepeval/benchmarks/truthful_qa/truthful_qa.py index 30f50f39..1a90e58d 100644 --- a/deepeval/benchmarks/truthful_qa/truthful_qa.py +++ b/deepeval/benchmarks/truthful_qa/truthful_qa.py @@ -17,9 +17,10 @@ truthful_qa_confinement_statements_dict = { TruthfulQAMode.MC1: "\n\nOutput '1', '2', '3', '4', '5' etc. (number in front of answer choice). Full answer not needed.", - TruthfulQAMode.MC2: "\n\nOutput the indices of all correct answers as a python list (e.g. '[1, 3, 4]'). Full answers are not needed." + TruthfulQAMode.MC2: "\n\nOutput the indices of all correct answers as a python list (e.g. '[1, 3, 4]'). 
Full answers are not needed.", } + class TruthfulQA(DeepEvalBaseBenchmark): def __init__( self, @@ -27,7 +28,9 @@ def __init__( mode: TruthfulQAMode = TruthfulQAMode.MC1, n_problems_per_task: Optional[int] = None, verbose_mode: bool = False, - confinement_instructions_dict: Optional[Dict[TruthfulQAMode, str]] = None, + confinement_instructions_dict: Optional[ + Dict[TruthfulQAMode, str] + ] = None, **kwargs, ): super().__init__(**kwargs) @@ -43,7 +46,9 @@ def __init__( self.overall_score: Optional[float] = None self.verbose_mode: bool = verbose_mode if not confinement_instructions_dict: - self.confinement_instructions_dict = truthful_qa_confinement_statements_dict + self.confinement_instructions_dict = ( + truthful_qa_confinement_statements_dict + ) else: self.confinement_instructions_dict = confinement_instructions_dict @@ -90,9 +95,9 @@ def evaluate( (task.value, golden.input, prediction, score) ) else: - for idx, golden in enumerate(tqdm( - goldens, desc=f"Processing {task.value}" - )): + for idx, golden in enumerate( + tqdm(goldens, desc=f"Processing {task.value}") + ): prediction, score = self.predict( model, golden, self.mode ).values() @@ -100,10 +105,23 @@ def evaluate( task_correct_predictions += score overall_correct_predictions += score predictions_row.append( - (task.value, golden.input, prediction, golden.expected_output, score) + ( + task.value, + golden.input, + prediction, + golden.expected_output, + score, + ) ) if self.verbose_mode: - self.print_verbose_logs(idx, task.value, golden.input, golden.expected_output, prediction, score) + self.print_verbose_logs( + idx, + task.value, + golden.input, + golden.expected_output, + prediction, + score, + ) task_accuracy = ( task_correct_predictions / task_total_predictions @@ -123,7 +141,13 @@ def evaluate( # Columns: 'Task', 'Input', 'Prediction', 'Score' self.predictions = pd.DataFrame( predictions_row, - columns=["Task", "Input", "Prediction", "Expected Output", "Correct"], + columns=[ + "Task", + "Input", + "Prediction", + "Expected Output", + "Correct", + ], ) self.task_scores = pd.DataFrame( scores_row, columns=["Task", "Score"] @@ -284,19 +308,19 @@ def load_benchmark_dataset( goldens.append(golden) return goldens - + def print_verbose_logs( self, idx: int, - task_value: str, - input: str, + task_value: str, + input: str, expected_output: str, - prediction: str, - score: int + prediction: str, + score: int, ) -> str: steps = [ f"Input:\n{input}", - f"Score: {score}\nPrediction: {prediction}\nExpected Output: {expected_output}" + f"Score: {score}\nPrediction: {prediction}\nExpected Output: {expected_output}", ] verbose_logs = "" for i in range(len(steps) - 1): @@ -314,6 +338,5 @@ def print_verbose_logs( print(verbose_logs + f"\n \n{steps[-1]}") print("") print("=" * 70) - - return verbose_logs + return verbose_logs diff --git a/deepeval/benchmarks/winogrande/winogrande.py b/deepeval/benchmarks/winogrande/winogrande.py index 05342c8d..2c51ac0b 100644 --- a/deepeval/benchmarks/winogrande/winogrande.py +++ b/deepeval/benchmarks/winogrande/winogrande.py @@ -31,7 +31,9 @@ def __init__( self.overall_score: Optional[float] = None self.verbose_mode = verbose_mode if not confinement_instructions: - self.confinement_instructions = "Output 'A', 'B', 'C', or 'D'. Full answer not needed." + self.confinement_instructions = ( + "Output 'A', 'B', 'C', or 'D'. Full answer not needed." 
+ ) else: self.confinement_instructions = confinement_instructions @@ -43,9 +45,9 @@ def evaluate(self, model: DeepEvalBaseLLM) -> Dict: # Solving each problem goldens = self.load_benchmark_dataset()[: self.n_problems] - for idx, golden in enumerate(tqdm( - goldens, desc=f"Processing {self.n_problems} problems" - )): + for idx, golden in enumerate( + tqdm(goldens, desc=f"Processing {self.n_problems} problems") + ): prediction, score = self.predict(model, golden).values() if score: overall_correct_predictions += 1 @@ -53,7 +55,13 @@ def evaluate(self, model: DeepEvalBaseLLM) -> Dict: (golden.input, prediction, golden.expected_output, score) ) if self.verbose_mode: - self.print_verbose_logs(idx, golden.input, golden.expected_output, prediction, score) + self.print_verbose_logs( + idx, + golden.input, + golden.expected_output, + prediction, + score, + ) # Calculate overall accuracy overall_accuracy = ( @@ -117,18 +125,18 @@ def load_benchmark_dataset(self) -> List[Golden]: goldens.append(golden) return goldens - + def print_verbose_logs( self, idx: int, - input: str, + input: str, expected_output: str, - prediction: str, - score: int + prediction: str, + score: int, ) -> str: steps = [ f"Input:\n{input}", - f"Score: {score}\nPrediction: {prediction}\nExpected Output: {expected_output}" + f"Score: {score}\nPrediction: {prediction}\nExpected Output: {expected_output}", ] verbose_logs = "" for i in range(len(steps) - 1): @@ -146,6 +154,5 @@ def print_verbose_logs( print(verbose_logs + f"\n \n{steps[-1]}") print("") print("=" * 70) - - return verbose_logs + return verbose_logs diff --git a/tests/test_benchmarks.py b/tests/test_benchmarks.py index 30fed10e..05f7a84d 100644 --- a/tests/test_benchmarks.py +++ b/tests/test_benchmarks.py @@ -127,7 +127,9 @@ verbose_mode=verbose_mode, ) benchmark_lambada = LAMBADA(n_problems=n_problems, verbose_mode=verbose_mode) -benchmark_winogrande = Winogrande(n_problems=n_problems, verbose_mode=verbose_mode) +benchmark_winogrande = Winogrande( + n_problems=n_problems, verbose_mode=verbose_mode +) ######################################## ## Evaluate ############################
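
The reformat above is intended to be behavior-preserving, so the public call pattern these files expose is unchanged. As a minimal usage sketch assuming only what is visible in this diff (the `n_problems`, `verbose_mode`, and `confinement_instructions` constructor arguments and the `evaluate(model)` entry point; the import path simply mirrors the touched file, and `MyLLM` plus its import are hypothetical stand-ins for any `DeepEvalBaseLLM` subclass, not part of this patch):

    # Hedged sketch: exercises the constructor arguments and evaluate() flow shown in the diff above.
    from deepeval.benchmarks.winogrande.winogrande import Winogrande
    from my_models import MyLLM  # hypothetical DeepEvalBaseLLM subclass

    benchmark = Winogrande(
        n_problems=5,
        verbose_mode=True,  # routes each problem through the reflowed print_verbose_logs() calls
        confinement_instructions="Output 'A' or 'B'. Full answer not needed.",
    )
    benchmark.evaluate(model=MyLLM())

    print(benchmark.overall_score)  # overall accuracy assigned at the end of evaluate()
    print(benchmark.predictions)    # DataFrame built from the (input, prediction, expected output, score) rows

Task-based benchmarks touched here (for example BigBenchHard and TruthfulQA) accept a `confinement_instructions_dict` keyed by their task enum instead of a single string, as the reflowed keyword arguments in their `__init__` signatures show.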