diff --git a/deepeval/dataset/dataset.py b/deepeval/dataset/dataset.py
index a43ac8f8..69d894e1 100644
--- a/deepeval/dataset/dataset.py
+++ b/deepeval/dataset/dataset.py
@@ -235,7 +235,9 @@ def get_column_data(df: pd.DataFrame, col_name: str, default=None):
             )
 
         df = pd.read_csv(file_path)
-
+        # Convert np.nan (default for missing values in pandas) to None for compatibility with Python and Pydantic
+        df = df.astype(object).where(pd.notna(df), None)
+
         inputs = get_column_data(df, input_col_name)
         actual_outputs = get_column_data(df, actual_output_col_name)
         expected_outputs = get_column_data(
@@ -324,6 +326,7 @@ def add_test_cases_from_json_file(
         retrieval_context_key_name: Optional[str] = None,
         tools_called_key_name: Optional[str] = None,
         expected_tools_key_name: Optional[str] = None,
+        encoding_type: str = "utf-8"
     ):
         """
         Load test cases from a JSON file.
@@ -349,7 +352,7 @@ The JSON file should be structured as a list of objects, with each object containing the required keys.
         The method assumes the file format and keys are correctly defined and present.
         """
         try:
-            with open(file_path, "r") as file:
+            with open(file_path, "r", encoding=encoding_type) as file:
                 json_list = json.load(file)
         except FileNotFoundError:
             raise FileNotFoundError(f"The file {file_path} was not found.")
@@ -400,6 +403,7 @@ def add_goldens_from_csv_file(
         tools_called_col_delimiter: str = ";",
         expected_tools_col_name: Optional[str] = None,
         expected_tools_col_delimiter: str = ";",
+        source_file_col_name: Optional[str] = None,
         additional_metadata_col_name: Optional[str] = None,
     ):
         try:
@@ -417,6 +421,8 @@ def get_column_data(df: pd.DataFrame, col_name: str, default=None):
             )
 
         df = pd.read_csv(file_path)
+        # Convert np.nan (default for missing values in pandas) to None for compatibility with Python and Pydantic
+        df = df.astype(object).where(pd.notna(df), None)
 
         inputs = get_column_data(df, input_col_name)
         actual_outputs = get_column_data(df, actual_output_col_name)
@@ -457,6 +463,7 @@ def get_column_data(df: pd.DataFrame, col_name: str, default=None):
                 df, expected_tools_col_name, default=""
             )
         ]
+        source_files = get_column_data(df, source_file_col_name)
         additional_metadatas = [
             ast.literal_eval(metadata) if metadata else None
             for metadata in get_column_data(
@@ -472,6 +479,7 @@ def get_column_data(df: pd.DataFrame, col_name: str, default=None):
             retrieval_context,
             tools_called,
             expected_tools,
+            source_file,
             additional_metadata,
         ) in zip(
             inputs,
@@ -481,6 +489,7 @@ def get_column_data(df: pd.DataFrame, col_name: str, default=None):
             retrieval_contexts,
             tools_called,
             expected_tools,
+            source_files,
             additional_metadatas,
         ):
             self.goldens.append(
@@ -493,7 +502,7 @@ def get_column_data(df: pd.DataFrame, col_name: str, default=None):
                     tools_called=tools_called,
                     expected_tools=expected_tools,
                     additional_metadata=additional_metadata,
-                    source_file=file_path,
+                    source_file=source_file,
                 )
             )
 
@@ -507,9 +516,11 @@ def add_goldens_from_json_file(
         retrieval_context_key_name: Optional[str] = None,
         tools_called_key_name: Optional[str] = None,
         expected_tools_key_name: Optional[str] = None,
+        source_file_key_name: Optional[str] = None,
+        encoding_type: str = "utf-8"
     ):
         try:
-            with open(file_path, "r") as file:
+            with open(file_path, "r", encoding=encoding_type) as file:
                 json_list = json.load(file)
         except FileNotFoundError:
             raise FileNotFoundError(f"The file {file_path} was not found.")
@@ -530,6 +541,7 @@
             retrieval_context = json_obj.get(retrieval_context_key_name)
             tools_called = json_obj.get(tools_called_key_name)
             expected_tools = json_obj.get(expected_tools_key_name)
+            source_file = json_obj.get(source_file_key_name)
 
             self.goldens.append(
                 Golden(
diff --git a/tests/test_synthesizer.py b/tests/test_synthesizer.py
index d5bc1b3b..4b72e6fc 100644
--- a/tests/test_synthesizer.py
+++ b/tests/test_synthesizer.py
@@ -1,18 +1,19 @@
-import os
-import pytest
-import time
 from typing import Callable
 import asyncio
-from deepeval.synthesizer import Synthesizer
-from deepeval.dataset import EvaluationDataset
-from deepeval.models import OpenAIEmbeddingModel
-from deepeval.models.gpt_model_schematic import SchematicGPTModel
+import pytest
+import time
+import os
+
 from deepeval.synthesizer.chunking.context_generator import ContextGenerator
+from deepeval.models.gpt_model_schematic import SchematicGPTModel
+from deepeval.models import OpenAIEmbeddingModel
+from deepeval.dataset import EvaluationDataset
+from deepeval.synthesizer import Synthesizer
+from deepeval.synthesizer.config import *
 from deepeval.synthesizer import (
     Evolution,
     PromptEvolution,
 )
-from deepeval.synthesizer.config import *
 
 #########################################################
 ### Context #############################################
@@ -282,7 +283,7 @@ def test_generate_goldens_from_docs(synthesizer: Synthesizer):
 synthesizer_async = Synthesizer(async_mode=True, max_concurrent=9)
 
 # test_generate_goldens_from_docs(synthesizer_sync)
-test_generate_goldens_from_docs(synthesizer_async)
+# test_generate_goldens_from_docs(synthesizer_async)
 
 #########################################################
 ### Generate Goldens From Scratch #######################
@@ -343,6 +344,57 @@ def test_generate_generate_goldens_from_scratch(synthesizer: Synthesizer):
 #     )
 #     print(dataset.goldens)
 
-test_generate_goldens_from_contexts(synthesizer)
-test_generate_goldens_from_docs(synthesizer)
-test_generate_generate_goldens_from_scratch(synthesizer)
+#########################################################
+### Save to JSON/CSV ####################################
+#########################################################
+
+def test_save_goldens(synthesizer: Synthesizer, file_type: str):
+    goldens = synthesizer.generate_goldens_from_docs(
+        max_goldens_per_context=3,
+        document_paths=document_paths,
+        context_construction_config=ContextConstructionConfig(chunk_size=100),
+        _send_data=False,
+    )
+    if file_type == "csv":
+        synthesizer.save_as("csv", "./goldens")
+    elif file_type == "json":
+        synthesizer.save_as("json", "./goldens")
+
+def test_load_goldens(file_name: str):
+    _, extension = os.path.splitext(file_name)
+    dataset = EvaluationDataset()
+    print(extension)
+    if extension == ".csv":
+        dataset.add_goldens_from_csv_file(
+            file_name,
+            input_col_name="input",
+            actual_output_col_name="actual_output",
+            expected_output_col_name="expected_output",
+            context_col_name="context",
+            context_col_delimiter="|",
+            source_file_col_name="source_file"
+        )
+        print(dataset.goldens)
+    elif extension == ".json":
+        dataset.add_goldens_from_json_file(
+            file_name,
+            input_key_name="input",
+            actual_output_key_name="actual_output",
+            expected_output_key_name="expected_output",
+            context_key_name="context",
+            source_file_key_name="source_file"
+        )
+        print(dataset.goldens)
+
+# synthesizer = Synthesizer(async_mode=True)
+# test_save_goldens(synthesizer, "json")
+# test_load_goldens("./goldens/20241122_153727.csv")
+test_load_goldens("./goldens/20241122_154545.json")
+
+#########################################################
+### Test Everything #####################################
+#########################################################
+
+# test_generate_goldens_from_contexts(synthesizer)
+# test_generate_goldens_from_docs(synthesizer)
+# test_generate_generate_goldens_from_scratch(synthesizer)
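
Note on the NaN handling added to both CSV loaders: pd.read_csv() represents empty cells as np.nan (a float), so an absent expected_output or context would surface as float("nan") rather than None on the loaded goldens; the patch's astype(object).where(...) line swaps those for None, as its comment says, for compatibility with Python and Pydantic. A minimal standalone sketch of that pandas behaviour (not part of the patch; the sample CSV content is made up):

    import io
    import pandas as pd

    # Two rows, with expected_output missing in the first row.
    df = pd.read_csv(io.StringIO("input,expected_output\nhello,\nhi,there\n"))
    print(df["expected_output"].tolist())  # [nan, 'there'] -- missing cell becomes np.nan, a float

    # astype(object) stops pandas from coercing None back to NaN; where() keeps real
    # values and substitutes None wherever pd.notna(df) is False.
    df = df.astype(object).where(pd.notna(df), None)
    print(df["expected_output"].tolist())  # [None, 'there']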
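
For reviewers trying the new arguments, here is a sketch of how the loaders are called after this patch, mirroring test_load_goldens above. The file paths are placeholders (nothing here ships with the repo), and only the source_file_* and encoding_type arguments are new; the other column/key names are the ones the patched test already uses:

    from deepeval.dataset import EvaluationDataset

    dataset = EvaluationDataset()

    # CSV: Golden.source_file now comes from a per-row column instead of the CSV's own path.
    dataset.add_goldens_from_csv_file(
        "./goldens/my_goldens.csv",  # placeholder path
        input_col_name="input",
        context_col_name="context",
        context_col_delimiter="|",
        source_file_col_name="source_file",
    )

    # JSON: encoding_type is passed straight through to open(), defaulting to "utf-8".
    dataset.add_goldens_from_json_file(
        "./goldens/my_goldens.json",  # placeholder path
        input_key_name="input",
        context_key_name="context",
        source_file_key_name="source_file",
        encoding_type="utf-8",
    )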