
Commit: reformat
penguine-ip committed Jan 2, 2025
1 parent 702aae1 commit df0f3ea
Showing 17 changed files with 415 additions and 211 deletions.
39 changes: 24 additions & 15 deletions deepeval/benchmarks/arc/arc.py
@@ -42,11 +42,12 @@ def __init__(
self.overall_score: Optional[float] = None
self.verbose_mode = verbose_mode
if not confinement_instructions:
self.confinement_instructions = "Output 'A', 'B', 'C', or 'D'. Full answer not needed."
self.confinement_instructions = (
"Output 'A', 'B', 'C', or 'D'. Full answer not needed."
)
else:
self.confinement_instructions = confinement_instructions


def evaluate(self, model: DeepEvalBaseLLM) -> Dict:
with capture_benchmark_run("ARC", self.n_problems):
overall_correct_predictions = 0
@@ -57,15 +58,23 @@ def evaluate(self, model: DeepEvalBaseLLM) -> Dict:
goldens: List[Golden] = self.load_benchmark_dataset(self.mode)[
: self.n_problems
]
for idx, golden in enumerate(tqdm(
goldens, desc=f"Processing {self.n_problems} problems"
)):
for idx, golden in enumerate(
tqdm(goldens, desc=f"Processing {self.n_problems} problems")
):
prediction, score = self.predict(model, golden).values()
if score:
overall_correct_predictions += 1
predictions_row.append((golden.input, prediction, golden.expected_output, score))
predictions_row.append(
(golden.input, prediction, golden.expected_output, score)
)
if self.verbose_mode:
self.print_verbose_logs(idx, golden.input, golden.expected_output, prediction, score)
self.print_verbose_logs(
idx,
golden.input,
golden.expected_output,
prediction,
score,
)

# Calculate overall accuracy
overall_accuracy = (
@@ -74,7 +83,8 @@ def evaluate(self, model: DeepEvalBaseLLM) -> Dict:
print(f"Overall ARC Accuracy: {overall_accuracy}")

self.predictions = pd.DataFrame(
predictions_row, columns=["Input", "Prediction", "Expected Output", "Correct"]
predictions_row,
columns=["Input", "Prediction", "Expected Output", "Correct"],
)
self.overall_score = overall_accuracy

@@ -130,18 +140,18 @@ def load_benchmark_dataset(self, mode: ARCMode) -> List[Golden]:
golden = Golden(input=input, expected_output=expected_output)
goldens.append(golden)
return goldens

def print_verbose_logs(
self,
idx: int,
input: str,
input: str,
expected_output: str,
prediction: str,
score: int
prediction: str,
score: int,
) -> str:
steps = [
f"Input:\n{input}",
f"Score: {score}\nPrediction: {prediction}\nExpected Output: {expected_output}"
f"Score: {score}\nPrediction: {prediction}\nExpected Output: {expected_output}",
]
verbose_logs = ""
for i in range(len(steps) - 1):
@@ -159,6 +169,5 @@ def print_verbose_logs(
print(verbose_logs + f"\n \n{steps[-1]}")
print("")
print("=" * 70)

return verbose_logs

return verbose_logs
33 changes: 21 additions & 12 deletions deepeval/benchmarks/bbq/bbq.py
@@ -33,7 +33,9 @@ def __init__(
self.overall_score: Optional[float] = None
self.verbose_mode: bool = verbose_mode
if not confinement_instructions:
self.confinement_instructions = "Output only 'A', 'B', or 'C. Full answer not needed."
self.confinement_instructions = (
"Output only 'A', 'B', or 'C. Full answer not needed."
)
else:
self.confinement_instructions = confinement_instructions

@@ -56,9 +58,9 @@ def evaluate(self, model: DeepEvalBaseLLM) -> Dict:
overall_total_predictions += len(goldens)

# Calculate task accuracy
for idx, golden in enumerate(tqdm(
goldens, desc=f"Processing {task.value}"
)):
for idx, golden in enumerate(
tqdm(goldens, desc=f"Processing {task.value}")
):
prediction, score = self.predict(model, golden).values()
if score:
task_correct_predictions += 1
@@ -73,7 +75,14 @@ def evaluate(self, model: DeepEvalBaseLLM) -> Dict:
)
)
if self.verbose_mode:
self.print_verbose_logs(idx, task.value, golden.input, golden.expected_output, prediction, score)
self.print_verbose_logs(
idx,
task.value,
golden.input,
golden.expected_output,
prediction,
score,
)

task_accuracy = (
task_correct_predictions / task_total_predictions
@@ -166,19 +175,19 @@ def load_benchmark_dataset(self, task: BBQTask) -> List[Golden]:
golden = Golden(input=input, expected_output=expected_output)
goldens.append(golden)
return goldens

def print_verbose_logs(
self,
idx: int,
task_value: str,
input: str,
task_value: str,
input: str,
expected_output: str,
prediction: str,
score: int
prediction: str,
score: int,
) -> str:
steps = [
f"Input:\n{input}",
f"Score: {score}\nPrediction: {prediction}\nExpected Output: {expected_output}"
f"Score: {score}\nPrediction: {prediction}\nExpected Output: {expected_output}",
]
verbose_logs = ""
for i in range(len(steps) - 1):
@@ -196,5 +205,5 @@ def print_verbose_logs(
print(verbose_logs + f"\n \n{steps[-1]}")
print("")
print("=" * 70)

return verbose_logs
55 changes: 39 additions & 16 deletions deepeval/benchmarks/big_bench_hard/big_bench_hard.py
@@ -43,6 +43,7 @@
BigBenchHardTask.WORD_SORTING: "\n\nOutput only the sequence of words separated by white space. Full answer not needed.",
}


class BigBenchHard(DeepEvalBaseBenchmark):
def __init__(
self,
@@ -51,7 +52,9 @@ def __init__(
enable_cot: bool = True,
n_problems_per_task: Optional[int] = None,
verbose_mode: bool = False,
confinement_instructions_dict: Optional[Dict[BigBenchHardTask, str]] = None,
confinement_instructions_dict: Optional[
Dict[BigBenchHardTask, str]
] = None,
**kwargs,
):
assert n_shots <= 3, "BBH only supports n_shots <= 3"
@@ -115,20 +118,33 @@ def evaluate(
)
else:
# Calculate task accuracy
for idx, golden in enumerate(tqdm(
goldens, desc=f"Processing {task.value}"
)):
for idx, golden in enumerate(
tqdm(goldens, desc=f"Processing {task.value}")
):
prediction, score = self.predict(
model, task, golden
).values()
if score:
task_correct_predictions += 1
overall_correct_predictions += 1
predictions_row.append(
(task.value, golden.input, prediction, golden.expected_output, score)
(
task.value,
golden.input,
prediction,
golden.expected_output,
score,
)
)
if self.verbose_mode:
self.print_verbose_logs(idx, task.value, golden.input, golden.expected_output, prediction, score)
self.print_verbose_logs(
idx,
task.value,
golden.input,
golden.expected_output,
prediction,
score,
)

task_accuracy = (
task_correct_predictions / task_total_predictions
@@ -148,7 +164,13 @@ def evaluate(
# Columns: 'Task', 'Input', 'Prediction', 'Score'
self.predictions = pd.DataFrame(
predictions_row,
columns=["Task", "Input", "Prediction", "Expected Output", "Correct"],
columns=[
"Task",
"Input",
"Prediction",
"Expected Output",
"Correct",
],
)
self.task_scores = pd.DataFrame(
scores_row, columns=["Task", "Score"]
@@ -237,7 +259,9 @@ def batch_predict(
return res

def load_benchmark_dataset(self, task: BigBenchHardTask) -> List[Golden]:
dataset_mapping = {task: f"{task.value}_dataset" for task in BigBenchHardTask}
dataset_mapping = {
task: f"{task.value}_dataset" for task in BigBenchHardTask
}
dataset_attr = dataset_mapping.get(task)
if dataset_attr:
if not hasattr(self, dataset_attr):
@@ -254,19 +278,19 @@ def load_benchmark_dataset(self, task: BigBenchHardTask) -> List[Golden]:
goldens.append(golden)

return goldens

def print_verbose_logs(
self,
idx: int,
task_value: str,
input: str,
task_value: str,
input: str,
expected_output: str,
prediction: str,
score: int
prediction: str,
score: int,
) -> str:
steps = [
f"Input:\n{input}",
f"Score: {score}\nPrediction: {prediction}\nExpected Output: {expected_output}"
f"Score: {score}\nPrediction: {prediction}\nExpected Output: {expected_output}",
]
verbose_logs = ""
for i in range(len(steps) - 1):
@@ -284,6 +308,5 @@ def print_verbose_logs(
print(verbose_logs + f"\n \n{steps[-1]}")
print("")
print("=" * 70)

return verbose_logs

return verbose_logs
37 changes: 24 additions & 13 deletions deepeval/benchmarks/bool_q/bool_q.py
@@ -31,7 +31,9 @@ def __init__(
self.overall_score: Optional[float] = None
self.verbose_mode = verbose_mode
if not confinement_instructions:
self.confinement_instructions = "Make sure to output only 'Yes' or 'No'."
self.confinement_instructions = (
"Make sure to output only 'Yes' or 'No'."
)
else:
self.confinement_instructions = confinement_instructions

@@ -43,15 +45,23 @@ def evaluate(self, model: DeepEvalBaseLLM) -> Dict:

# Solving each problem
goldens = self.load_benchmark_dataset()[: self.n_problems]
for idx, golden in enumerate(tqdm(
goldens, desc=f"Processing {self.n_problems} problems"
)):
for idx, golden in enumerate(
tqdm(goldens, desc=f"Processing {self.n_problems} problems")
):
prediction, score = self.predict(model, golden).values()
if score:
overall_correct_predictions += 1
predictions_row.append((golden.input, prediction, golden.expected_output, score))
predictions_row.append(
(golden.input, prediction, golden.expected_output, score)
)
if self.verbose_mode:
self.print_verbose_logs(idx, golden.input, golden.expected_output, prediction, score)
self.print_verbose_logs(
idx,
golden.input,
golden.expected_output,
prediction,
score,
)

# Calculate overall accuracy
overall_accuracy = (
@@ -60,7 +70,8 @@ def evaluate(self, model: DeepEvalBaseLLM) -> Dict:
print(f"Overall BoolQ Accuracy: {overall_accuracy}")

self.predictions = pd.DataFrame(
predictions_row, columns=["Input", "Prediction", "Expected Output", "Correct"]
predictions_row,
columns=["Input", "Prediction", "Expected Output", "Correct"],
)
self.overall_score = overall_accuracy

@@ -110,18 +121,18 @@ def load_benchmark_dataset(self) -> List[Golden]:
goldens.append(golden)

return goldens

def print_verbose_logs(
self,
idx: int,
input: str,
input: str,
expected_output: str,
prediction: str,
score: int
prediction: str,
score: int,
) -> str:
steps = [
f"Input:\n{input}",
f"Score: {score}\nPrediction: {prediction}\nExpected Output: {expected_output}"
f"Score: {score}\nPrediction: {prediction}\nExpected Output: {expected_output}",
]
verbose_logs = ""
for i in range(len(steps) - 1):
@@ -139,5 +150,5 @@ def print_verbose_logs(
print(verbose_logs + f"\n \n{steps[-1]}")
print("")
print("=" * 70)

return verbose_logs
