diff --git a/deepeval/guardrails/types.py b/deepeval/guardrails/types.py
index b919e8b4..dacd6437 100644
--- a/deepeval/guardrails/types.py
+++ b/deepeval/guardrails/types.py
@@ -65,10 +65,7 @@ class Guard(Enum):
     Guard.RELIGION,
 ]
 
-entities_dependent_guards = [
-    Guard.BOLA,
-    Guard.IMITATION
-]
+entities_dependent_guards = [Guard.BOLA, Guard.IMITATION]
 
 purpose_entities_dependent_guards = [
     Guard.PII_API_DB,
diff --git a/deepeval/metrics/__init__.py b/deepeval/metrics/__init__.py
index b29d60b0..7f3b7421 100644
--- a/deepeval/metrics/__init__.py
+++ b/deepeval/metrics/__init__.py
@@ -17,6 +17,7 @@
 from .knowledge_retention.knowledge_retention import KnowledgeRetentionMetric
 from .tool_correctness.tool_correctness import ToolCorrectnessMetric
 from .json_correctness.json_correctness import JsonCorrectnessMetric
+from .prompt_alignment.prompt_alignment import PromptAlignmentMetric
 from .text_to_image.text_to_image import TextToImageMetric
 from .image_editing.image_editing import ImageEditingMetric
 from .conversation_relevancy.conversation_relevancy import (
diff --git a/deepeval/metrics/prompt_alignment/__init__.py b/deepeval/metrics/prompt_alignment/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/deepeval/metrics/prompt_alignment/prompt_alignment.py b/deepeval/metrics/prompt_alignment/prompt_alignment.py
new file mode 100644
index 00000000..8ad6c455
--- /dev/null
+++ b/deepeval/metrics/prompt_alignment/prompt_alignment.py
@@ -0,0 +1,251 @@
+from typing import Optional, List, Union
+
+from deepeval.utils import get_or_create_event_loop, prettify_list
+from deepeval.metrics.utils import (
+    construct_verbose_logs,
+    trimAndLoadJson,
+    check_llm_test_case_params,
+    initialize_model,
+)
+from deepeval.test_case import (
+    LLMTestCase,
+    LLMTestCaseParams,
+    ConversationalTestCase,
+)
+from deepeval.metrics import BaseMetric
+from deepeval.models import DeepEvalBaseLLM
+from deepeval.metrics.prompt_alignment.template import PromptAlignmentTemplate
+from deepeval.metrics.indicator import metric_progress_indicator
+from deepeval.metrics.prompt_alignment.schema import *
+
+required_params: List[LLMTestCaseParams] = [
+    LLMTestCaseParams.INPUT,
+    LLMTestCaseParams.ACTUAL_OUTPUT,
+]
+
+
+class PromptAlignmentMetric(BaseMetric):
+    def __init__(
+        self,
+        prompt_instructions: List[str],
+        threshold: float = 0.5,
+        model: Optional[Union[str, DeepEvalBaseLLM]] = None,
+        include_reason: bool = True,
+        async_mode: bool = True,
+        strict_mode: bool = False,
+        verbose_mode: bool = False,
+    ):
+        if len(prompt_instructions) == 0:
+            raise ValueError("'prompt_instructions' must not be empty.")
+
+        self.prompt_instructions = prompt_instructions
+        self.threshold = 1 if strict_mode else threshold
+        self.model, self.using_native_model = initialize_model(model)
+        self.evaluation_model = self.model.get_model_name()
+        self.include_reason = include_reason
+        self.async_mode = async_mode
+        self.strict_mode = strict_mode
+        self.verbose_mode = verbose_mode
+
+    def measure(
+        self,
+        test_case: Union[LLMTestCase, ConversationalTestCase],
+        _show_indicator: bool = True,
+    ) -> float:
+        if isinstance(test_case, ConversationalTestCase):
+            test_case = test_case.turns[0]
+        check_llm_test_case_params(test_case, required_params, self)
+
+        self.evaluation_cost = 0 if self.using_native_model else None
+        with metric_progress_indicator(self, _show_indicator=_show_indicator):
+            if self.async_mode:
+                loop = get_or_create_event_loop()
+                loop.run_until_complete(
+                    self.a_measure(test_case, _show_indicator=False)
+                )
+            else:
+                self.verdicts: Verdicts = self._generate_verdicts(
+                    test_case.input, test_case.actual_output
+                )
+                self.score = self._calculate_score()
+                self.reason = self._generate_reason(
+                    test_case.input, test_case.actual_output
+                )
+                self.success = self.score >= self.threshold
+                self.verbose_logs = construct_verbose_logs(
+                    self,
+                    steps=[
+                        f"Prompt Instructions:\n{prettify_list(self.prompt_instructions)}",
+                        f"Verdicts:\n{prettify_list(self.verdicts)}",
+                        f"Score: {self.score}\nReason: {self.reason}",
+                    ],
+                )
+
+            return self.score
+
+    async def a_measure(
+        self,
+        test_case: Union[LLMTestCase, ConversationalTestCase],
+        _show_indicator: bool = True,
+    ) -> float:
+        if isinstance(test_case, ConversationalTestCase):
+            test_case = test_case.turns[0]
+        check_llm_test_case_params(test_case, required_params, self)
+
+        self.evaluation_cost = 0 if self.using_native_model else None
+        with metric_progress_indicator(
+            self, async_mode=True, _show_indicator=_show_indicator
+        ):
+            self.verdicts: Verdicts = await self._a_generate_verdicts(
+                test_case.input, test_case.actual_output
+            )
+            self.score = self._calculate_score()
+            self.reason = await self._a_generate_reason(
+                test_case.input, test_case.actual_output
+            )
+            self.success = self.score >= self.threshold
+            self.verbose_logs = construct_verbose_logs(
+                self,
+                steps=[
+                    f"Prompt Instructions:\n{prettify_list(self.prompt_instructions)}",
+                    f"Verdicts:\n{prettify_list(self.verdicts)}",
+                    f"Score: {self.score}\nReason: {self.reason}",
+                ],
+            )
+
+            return self.score
+
+    async def _a_generate_reason(self, input: str, actual_output: str) -> str:
+        if self.include_reason is False:
+            return None
+
+        unalignment_reasons = []
+        for verdict in self.verdicts:
+            if verdict.verdict.strip().lower() == "no":
+                unalignment_reasons.append(verdict.reason)
+
+        prompt = PromptAlignmentTemplate.generate_reason(
+            unalignment_reasons=unalignment_reasons,
+            input=input,
+            actual_output=actual_output,
+            score=format(self.score, ".2f"),
+        )
+        if self.using_native_model:
+            res, cost = await self.model.a_generate(prompt)
+            self.evaluation_cost += cost
+            data = trimAndLoadJson(res, self)
+            return data["reason"]
+        else:
+            try:
+                res: Reason = await self.model.a_generate(
+                    prompt=prompt, schema=Reason
+                )
+                return res.reason
+            except TypeError:
+                res = await self.model.a_generate(prompt)
+                data = trimAndLoadJson(res, self)
+                return data["reason"]
+
+    def _generate_reason(self, input: str, actual_output: str) -> str:
+        if self.include_reason is False:
+            return None
+
+        unalignment_reasons = []
+        for verdict in self.verdicts:
+            if verdict.verdict.strip().lower() == "no":
+                unalignment_reasons.append(verdict.reason)
+
+        prompt = PromptAlignmentTemplate.generate_reason(
+            unalignment_reasons=unalignment_reasons,
+            input=input,
+            actual_output=actual_output,
+            score=format(self.score, ".2f"),
+        )
+        if self.using_native_model:
+            res, cost = self.model.generate(prompt)
+            self.evaluation_cost += cost
+            data = trimAndLoadJson(res, self)
+            return data["reason"]
+        else:
+            try:
+                res: Reason = self.model.generate(prompt=prompt, schema=Reason)
+                return res.reason
+            except TypeError:
+                res = self.model.generate(prompt)
+                data = trimAndLoadJson(res, self)
+                return data["reason"]
+
+    async def _a_generate_verdicts(
+        self, input: str, actual_output: str
+    ) -> Verdicts:
+        prompt = PromptAlignmentTemplate.generate_verdicts(
+            prompt_instructions=self.prompt_instructions,
+            input=input,
+            actual_output=actual_output,
+        )
+        if self.using_native_model:
+            res, cost = await self.model.a_generate(prompt)
+            self.evaluation_cost += cost
+            data = trimAndLoadJson(res, self)
+            return [PromptAlignmentVerdict(**item) for item in data["verdicts"]]
+        else:
+            try:
+                res: Verdicts = await self.model.a_generate(
+                    prompt, schema=Verdicts
+                )
+                return [item for item in res.verdicts]
+            except TypeError:
+                res = await self.model.a_generate(prompt)
+                data = trimAndLoadJson(res, self)
+                return [
+                    PromptAlignmentVerdict(**item) for item in data["verdicts"]
+                ]
+
+    def _generate_verdicts(self, input: str, actual_output: str) -> Verdicts:
+        prompt = PromptAlignmentTemplate.generate_verdicts(
+            prompt_instructions=self.prompt_instructions,
+            input=input,
+            actual_output=actual_output,
+        )
+        if self.using_native_model:
+            res, cost = self.model.generate(prompt)
+            self.evaluation_cost += cost
+            data = trimAndLoadJson(res, self)
+            return [PromptAlignmentVerdict(**item) for item in data["verdicts"]]
+        else:
+            try:
+                res: Verdicts = self.model.generate(prompt, schema=Verdicts)
+                return [item for item in res.verdicts]
+            except TypeError:
+                res = self.model.generate(prompt)
+                data = trimAndLoadJson(res, self)
+                return [
+                    PromptAlignmentVerdict(**item) for item in data["verdicts"]
+                ]
+
+    def _calculate_score(self):
+        number_of_verdicts = len(self.verdicts)
+        if number_of_verdicts == 0:
+            return 1
+
+        alignment_count = 0
+        for verdict in self.verdicts:
+            if verdict.verdict.strip().lower() != "no":
+                alignment_count += 1
+
+        score = alignment_count / number_of_verdicts
+        return 0 if self.strict_mode and score < self.threshold else score
+
+    def is_successful(self) -> bool:
+        if self.error is not None:
+            self.success = False
+        else:
+            try:
+                self.success = self.score >= self.threshold
+            except:
+                self.success = False
+        return self.success
+
+    @property
+    def __name__(self):
+        return "Prompt Alignment"
diff --git a/deepeval/metrics/prompt_alignment/schema.py b/deepeval/metrics/prompt_alignment/schema.py
new file mode 100644
index 00000000..61097e11
--- /dev/null
+++ b/deepeval/metrics/prompt_alignment/schema.py
@@ -0,0 +1,15 @@
+from typing import List, Optional
+from pydantic import BaseModel, Field
+
+
+class PromptAlignmentVerdict(BaseModel):
+    verdict: str
+    reason: Optional[str] = Field(default=None)
+
+
+class Verdicts(BaseModel):
+    verdicts: List[PromptAlignmentVerdict]
+
+
+class Reason(BaseModel):
+    reason: str
diff --git a/deepeval/metrics/prompt_alignment/template.py b/deepeval/metrics/prompt_alignment/template.py
new file mode 100644
index 00000000..80a73501
--- /dev/null
+++ b/deepeval/metrics/prompt_alignment/template.py
@@ -0,0 +1,87 @@
+from typing import List
+
+
+class PromptAlignmentTemplate:
+    @staticmethod
+    def generate_verdicts(
+        prompt_instructions: List[str], input: str, actual_output: str
+    ):
+        return f"""For the provided list of prompt instructions, determine whether each instruction has been followed in the LLM actual output.
+Please generate a list of JSON with two keys: `verdict` and `reason`.
+The 'verdict' key should STRICTLY be either a 'yes' or 'no'. Only answer 'yes' if the actual output COMPLETELY follows the instruction, and 'no' otherwise.
+You should be EXTRA STRICT AND CAREFUL when giving a 'yes'.
+The 'reason' is the reason for the verdict.
+Provide a 'reason' ONLY if the answer is 'no'.
+The provided prompt instructions are the instructions to be followed in the prompt, which you have no access to.
+
+**
+IMPORTANT: Please make sure to only return in JSON format, with the 'verdicts' key mapping to a list of JSON objects.
+Example input: What number is the stars of the sky?
+Example actual output: HEY THERE! I think what you meant is "What is the number of stars in the sky", but unfortunately I don't know the answer to it.
+Example prompt instructions: ["Answer the input in a well-mannered fashion.", "Do not correct user of any grammatical errors.", "Respond in all upper case"]
+Example JSON:
+{{
+    "verdicts": [
+        {{
+            "verdict": "yes"
+        }},
+        {{
+            "verdict": "no",
+            "reason": "The LLM corrected the user when the user used the wrong grammar in asking about the number of stars in the sky."
+        }},
+        {{
+            "verdict": "no",
+            "reason": "The LLM only made 'HEY THERE' uppercase, which does not follow the instruction of making everything uppercase completely."
+        }}
+    ]
+}}
+
+Since you are going to generate a verdict for each instruction, the number of 'verdicts' SHOULD BE STRICTLY EQUAL to the number of prompt instructions.
+**
+
+Prompt Instructions:
+{prompt_instructions}
+
+Input:
+{input}
+
+LLM Actual Output:
+{actual_output}
+
+JSON:
+"""
+
+    @staticmethod
+    def generate_reason(
+        unalignment_reasons: List[str],
+        actual_output: str,
+        input: str,
+        score: int,
+    ):
+        return f"""Given the prompt alignment score, the reasons for unalignment found in the LLM actual output, the actual output, and the input, provide a CONCISE reason for the score. Explain why it is not higher, but also why it is at its current score.
+The unalignments represent prompt instructions that are not followed by the LLM in the actual output.
+If there are no unalignments, just say something positive with an upbeat encouraging tone (but don't overdo it otherwise it gets annoying).
+You don't have to talk about whether the actual output is a good fit for the input; assess ENTIRELY based on the unalignment reasons.
+
+**
+IMPORTANT: Please make sure to only return in JSON format, with the 'reason' key providing the reason.
+Example JSON:
+{{
+    "reason": "The score is <prompt_alignment_score> because <your_reason>."
+}}
+**
+
+Input:
+{input}
+
+LLM Actual Output:
+{actual_output}
+
+Prompt Alignment Score:
+{score}
+
+Reasons for unalignment:
+{unalignment_reasons}
+
+JSON:
+"""
diff --git a/tests/test_everything.py b/tests/test_everything.py
index 62c7ac8d..5a6da5e4 100644
--- a/tests/test_everything.py
+++ b/tests/test_everything.py
@@ -22,6 +22,8 @@
     ConversationRelevancyMetric,
     RoleAdherenceMetric,
    ConversationCompletenessMetric,
+    PromptAlignmentMetric,
+    JsonCorrectnessMetric,
 )
 from deepeval.metrics.ragas import RagasMetric
 from deepeval import assert_test
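
For reviewers, a minimal usage sketch of the new metric, based on the constructor and measure() signatures added in this diff. The instruction list, test case contents, and threshold below are illustrative assumptions rather than part of this PR, and running the snippet assumes an evaluation model is configured (for example, an OpenAI key for the default native model).

    # Hypothetical usage of PromptAlignmentMetric; values are made up for illustration.
    from deepeval import evaluate
    from deepeval.metrics import PromptAlignmentMetric
    from deepeval.test_case import LLMTestCase

    metric = PromptAlignmentMetric(
        prompt_instructions=["Reply in all uppercase", "Do not use emojis"],
        threshold=0.5,
        include_reason=True,
    )
    test_case = LLMTestCase(
        input="What is the capital of France?",
        actual_output="PARIS IS THE CAPITAL OF FRANCE.",
    )

    # Score is the fraction of instructions that received a 'yes' verdict;
    # success means score >= threshold.
    metric.measure(test_case)
    print(metric.score, metric.reason)

    # The metric can also be run through deepeval's evaluate() entry point
    # alongside other metrics.
    evaluate(test_cases=[test_case], metrics=[metric])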