dataset loading and saving
kritinv committed Nov 22, 2024
1 parent 1793e96 commit fce2748
Showing 2 changed files with 80 additions and 16 deletions.
20 changes: 16 additions & 4 deletions deepeval/dataset/dataset.py
@@ -235,7 +235,9 @@ def get_column_data(df: pd.DataFrame, col_name: str, default=None):
)

df = pd.read_csv(file_path)

# Convert np.nan (default for missing values in pandas) to None for compatibility with Python and Pydantic
df = df.astype(object).where(pd.notna(df), None)

inputs = get_column_data(df, input_col_name)
actual_outputs = get_column_data(df, actual_output_col_name)
expected_outputs = get_column_data(
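Context for this hunk: pandas stores missing CSV cells as np.nan, a float that neither plain-Python None checks nor downstream Pydantic models treat as a null. A minimal sketch of the conversion being added, with illustrative column names:

import io
import pandas as pd

# pandas parses the empty expected_output cell as np.nan (a float)
df = pd.read_csv(io.StringIO("input,expected_output\nWhat is DeepEval?,\n"))
assert pd.isna(df.loc[0, "expected_output"])

# cast to object dtype, then replace every NaN with a proper Python None
df = df.astype(object).where(pd.notna(df), None)
assert df.loc[0, "expected_output"] is None
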
@@ -324,6 +326,7 @@ def add_test_cases_from_json_file(
retrieval_context_key_name: Optional[str] = None,
tools_called_key_name: Optional[str] = None,
expected_tools_key_name: Optional[str] = None,
encoding_type: str = "utf-8"
):
"""
Load test cases from a JSON file.
@@ -349,7 +352,7 @@
The JSON file should be structured as a list of objects, with each object containing the required keys. The method assumes the file format and keys are correctly defined and present.
"""
try:
with open(file_path, "r") as file:
with open(file_path, "r", encoding=encoding_type) as file:
json_list = json.load(file)
except FileNotFoundError:
raise FileNotFoundError(f"The file {file_path} was not found.")
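A usage sketch for the new encoding_type parameter on add_test_cases_from_json_file; the method and parameter names come from this diff, while the file path, key names, and codec are illustrative:

from deepeval.dataset import EvaluationDataset

# hypothetical file holding a list of objects with the required keys,
# per the docstring above, but saved with a UTF-8 byte-order mark
dataset = EvaluationDataset()
dataset.add_test_cases_from_json_file(
    file_path="test_cases.json",
    input_key_name="input",
    actual_output_key_name="actual_output",
    expected_output_key_name="expected_output",
    context_key_name="context",
    encoding_type="utf-8-sig",  # new in this commit; defaults to "utf-8"
)
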
@@ -400,6 +403,7 @@ def add_goldens_from_csv_file(
tools_called_col_delimiter: str = ";",
expected_tools_col_name: Optional[str] = None,
expected_tools_col_delimiter: str = ";",
source_file_col_name: Optional[str] = None,
additional_metadata_col_name: Optional[str] = None,
):
try:
@@ -417,6 +421,8 @@ def get_column_data(df: pd.DataFrame, col_name: str, default=None):
)

df = pd.read_csv(file_path)
# Convert np.nan (default for missing values in pandas) to None for compatibility with Python and Pydantic
df = df.astype(object).where(pd.notna(df), None)

inputs = get_column_data(df, input_col_name)
actual_outputs = get_column_data(df, actual_output_col_name)
@@ -457,6 +463,7 @@ def get_column_data(df: pd.DataFrame, col_name: str, default=None):
df, expected_tools_col_name, default=""
)
]
source_files = get_column_data(df, source_file_col_name)
additional_metadatas = [
ast.literal_eval(metadata) if metadata else None
for metadata in get_column_data(
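A side note on the additional_metadata handling visible above: CSV cells arrive as strings, so dict-like metadata is parsed with ast.literal_eval, and the NaN-to-None conversion added earlier makes the "if metadata" guard skip empty cells. A minimal illustration with made-up values:

import ast

# a metadata cell read from CSV arrives as a string literal
cell = "{'difficulty': 'hard', 'topic': 'billing'}"
metadata = ast.literal_eval(cell) if cell else None
assert metadata == {"difficulty": "hard", "topic": "billing"}

# after the NaN-to-None conversion, empty cells are None and are skipped
empty = None
assert (ast.literal_eval(empty) if empty else None) is None
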
@@ -472,6 +479,7 @@ def get_column_data(df: pd.DataFrame, col_name: str, default=None):
retrieval_context,
tools_called,
expected_tools,
source_file,
additional_metadata,
) in zip(
inputs,
@@ -481,6 +489,7 @@ def get_column_data(df: pd.DataFrame, col_name: str, default=None):
retrieval_contexts,
tools_called,
expected_tools,
source_files,
additional_metadatas,
):
self.goldens.append(
@@ -493,7 +502,7 @@ def get_column_data(df: pd.DataFrame, col_name: str, default=None):
tools_called=tools_called,
expected_tools=expected_tools,
additional_metadata=additional_metadata,
source_file=file_path,
source_file=source_file,
)
)
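Net effect of the change just above: Golden.source_file is now read from a per-row CSV column via the new source_file_col_name argument, rather than being stamped with the CSV's own path. A sketch mirroring the keyword arguments used in the tests below (the file path is illustrative):

from deepeval.dataset import EvaluationDataset

dataset = EvaluationDataset()
dataset.add_goldens_from_csv_file(
    "./goldens/saved.csv",  # hypothetical path
    input_col_name="input",
    actual_output_col_name="actual_output",
    expected_output_col_name="expected_output",
    context_col_name="context",
    context_col_delimiter="|",  # context cells hold "|"-joined lists
    source_file_col_name="source_file",  # new: per-row value, not file_path
)
print(dataset.goldens[0].source_file)  # the document each golden came from
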

@@ -507,9 +516,11 @@ def add_goldens_from_json_file(
retrieval_context_key_name: Optional[str] = None,
tools_called_key_name: Optional[str] = None,
expected_tools_key_name: Optional[str] = None,
source_file_key_name: Optional[str] = None,
encoding_type: str = "utf-8"
):
try:
with open(file_path, "r") as file:
with open(file_path, "r", encoding=encoding_type) as file:
json_list = json.load(file)
except FileNotFoundError:
raise FileNotFoundError(f"The file {file_path} was not found.")
@@ -530,6 +541,7 @@ def add_goldens_from_json_file(
retrieval_context = json_obj.get(retrieval_context_key_name)
tools_called = json_obj.get(tools_called_key_name)
expected_tools = json_obj.get(expected_tools_key_name)
source_file = json_obj.get(source_file_key_name)

self.goldens.append(
Golden(
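The JSON loader gains the matching pair of parameters, source_file_key_name and encoding_type. A sketch using the key names from the tests below, with an illustrative path:

from deepeval.dataset import EvaluationDataset

dataset = EvaluationDataset()
dataset.add_goldens_from_json_file(
    "./goldens/saved.json",  # hypothetical path
    input_key_name="input",
    actual_output_key_name="actual_output",
    expected_output_key_name="expected_output",
    context_key_name="context",
    source_file_key_name="source_file",  # new in this commit
    encoding_type="utf-8",  # new in this commit
)
print(dataset.goldens)
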
76 changes: 64 additions & 12 deletions tests/test_synthesizer.py
@@ -1,18 +1,19 @@
import os
import pytest
import time
from typing import Callable
import asyncio
from deepeval.synthesizer import Synthesizer
from deepeval.dataset import EvaluationDataset
from deepeval.models import OpenAIEmbeddingModel
from deepeval.models.gpt_model_schematic import SchematicGPTModel
import pytest
import time
import os

from deepeval.synthesizer.chunking.context_generator import ContextGenerator
from deepeval.models.gpt_model_schematic import SchematicGPTModel
from deepeval.models import OpenAIEmbeddingModel
from deepeval.dataset import EvaluationDataset
from deepeval.synthesizer import Synthesizer
from deepeval.synthesizer.config import *
from deepeval.synthesizer import (
Evolution,
PromptEvolution,
)
from deepeval.synthesizer.config import *

#########################################################
### Context #############################################
@@ -282,7 +283,7 @@ def test_generate_goldens_from_docs(synthesizer: Synthesizer):
synthesizer_async = Synthesizer(async_mode=True, max_concurrent=9)

# test_generate_goldens_from_docs(synthesizer_sync)
test_generate_goldens_from_docs(synthesizer_async)
# test_generate_goldens_from_docs(synthesizer_async)

#########################################################
### Generate Goldens From Scratch #######################
@@ -343,6 +344,57 @@ def test_generate_generate_goldens_from_scratch(synthesizer: Synthesizer):
# )
# print(dataset.goldens)

test_generate_goldens_from_contexts(synthesizer)
test_generate_goldens_from_docs(synthesizer)
test_generate_generate_goldens_from_scratch(synthesizer)
#########################################################
### Save to JSON/CSV ####################################
#########################################################

def test_save_goldens(synthesizer: Synthesizer, file_type: str):
goldens = synthesizer.generate_goldens_from_docs(
max_goldens_per_context=3,
document_paths=document_paths,
context_construction_config=ContextConstructionConfig(chunk_size=100),
_send_data=False,
)
if file_type == "csv":
synthesizer.save_as("csv", "./goldens")
elif file_type == "json":
synthesizer.save_as("json", "./goldens")

def test_load_goldens(file_name: str):
_, extension = os.path.splitext(file_name)
dataset = EvaluationDataset()
print(extension)
if extension == ".csv":
dataset.add_goldens_from_csv_file(
file_name,
input_col_name="input",
actual_output_col_name="actual_output",
expected_output_col_name="expected_output",
context_col_name="context",
context_col_delimiter="|",
source_file_col_name="source_file"
)
print(dataset.goldens)
elif extension == ".json":
dataset.add_goldens_from_json_file(
file_name,
input_key_name="input",
actual_output_key_name="actual_output",
expected_output_key_name="expected_output",
context_key_name="context",
source_file_key_name="source_file"
)
print(dataset.goldens)

# synthesizer = Synthesizer(async_mode=True)
# test_save_goldens(synthesizer, "json")
# test_load_goldens("./goldens/20241122_153727.csv")
test_load_goldens("./goldens/20241122_154545.json")

#########################################################
### Test Everything #####################################
#########################################################

# test_generate_goldens_from_contexts(synthesizer)
# test_generate_goldens_from_docs(synthesizer)
# test_generate_generate_goldens_from_scratch(synthesizer)
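For reference, the key names passed to test_load_goldens imply the on-disk shape that save_as emits: a list of objects keyed by input, actual_output, expected_output, context, and source_file. A hedged sketch of a matching goldens file, with illustrative values:

import json

goldens = [
    {
        "input": "What does DeepEval do?",
        "actual_output": None,
        "expected_output": "It evaluates LLM applications.",
        "context": ["DeepEval is an open-source LLM evaluation framework."],
        "source_file": "docs/intro.md",  # hypothetical origin document
    }
]
with open("example_goldens.json", "w", encoding="utf-8") as f:
    json.dump(goldens, f, indent=2)
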
