new metric
penguine-ip committed Nov 27, 2024
1 parent 6629210 commit 04997ff
Showing 7 changed files with 357 additions and 4 deletions.
5 changes: 1 addition & 4 deletions deepeval/guardrails/types.py
@@ -65,10 +65,7 @@ class Guard(Enum):
Guard.RELIGION,
]

entities_dependent_guards = [
Guard.BOLA,
Guard.IMITATION
]
entities_dependent_guards = [Guard.BOLA, Guard.IMITATION]

purpose_entities_dependent_guards = [
Guard.PII_API_DB,
1 change: 1 addition & 0 deletions deepeval/metrics/__init__.py
@@ -17,6 +17,7 @@
from .knowledge_retention.knowledge_retention import KnowledgeRetentionMetric
from .tool_correctness.tool_correctness import ToolCorrectnessMetric
from .json_correctness.json_correctness import JsonCorrectnessMetric
from .prompt_alignment.prompt_alignment import PromptAlignmentMetric
from .text_to_image.text_to_image import TextToImageMetric
from .image_editing.image_editing import ImageEditingMetric
from .conversation_relevancy.conversation_relevancy import (
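
With this export in place, the new metric can presumably be constructed straight from deepeval.metrics. A minimal sketch; the prompt instructions below are illustrative placeholders, not taken from the commit:

from deepeval.metrics import PromptAlignmentMetric

# Hypothetical prompt instructions for illustration only.
metric = PromptAlignmentMetric(
    prompt_instructions=[
        "Answer in a well-mannered fashion.",
        "Respond in all upper case.",
    ],
    threshold=0.5,
)
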
Empty file.
251 changes: 251 additions & 0 deletions deepeval/metrics/prompt_alignment/prompt_alignment.py
@@ -0,0 +1,251 @@
from typing import Optional, List, Union

from deepeval.utils import get_or_create_event_loop, prettify_list
from deepeval.metrics.utils import (
construct_verbose_logs,
trimAndLoadJson,
check_llm_test_case_params,
initialize_model,
)
from deepeval.test_case import (
LLMTestCase,
LLMTestCaseParams,
ConversationalTestCase,
)
from deepeval.metrics import BaseMetric
from deepeval.models import DeepEvalBaseLLM
from deepeval.metrics.prompt_alignment.template import PromptAlignmentTemplate
from deepeval.metrics.indicator import metric_progress_indicator
from deepeval.metrics.prompt_alignment.schema import *

required_params: List[LLMTestCaseParams] = [
LLMTestCaseParams.INPUT,
LLMTestCaseParams.ACTUAL_OUTPUT,
]


class PromptAlignmentMetric(BaseMetric):
def __init__(
self,
prompt_instructions: List[str],
threshold: float = 0.5,
model: Optional[Union[str, DeepEvalBaseLLM]] = None,
include_reason: bool = True,
async_mode: bool = True,
strict_mode: bool = False,
verbose_mode: bool = False,
):
if len(prompt_instructions) == 0:
raise ValueError("'prompt_instructions' must not be empty.")

self.prompt_instructions = prompt_instructions
self.threshold = 1 if strict_mode else threshold
self.model, self.using_native_model = initialize_model(model)
self.evaluation_model = self.model.get_model_name()
self.include_reason = include_reason
self.async_mode = async_mode
self.strict_mode = strict_mode
self.verbose_mode = verbose_mode

def measure(
self,
test_case: Union[LLMTestCase, ConversationalTestCase],
_show_indicator: bool = True,
) -> float:
if isinstance(test_case, ConversationalTestCase):
test_case = test_case.turns[0]
check_llm_test_case_params(test_case, required_params, self)

self.evaluation_cost = 0 if self.using_native_model else None
with metric_progress_indicator(self, _show_indicator=_show_indicator):
if self.async_mode:
loop = get_or_create_event_loop()
loop.run_until_complete(
self.a_measure(test_case, _show_indicator=False)
)
else:
                self.verdicts: List[PromptAlignmentVerdict] = self._generate_verdicts(
test_case.input, test_case.actual_output
)
self.score = self._calculate_score()
self.reason = self._generate_reason(
test_case.input, test_case.actual_output
)
self.success = self.score >= self.threshold
self.verbose_logs = construct_verbose_logs(
self,
steps=[
f"Prompt Instructions:\n{prettify_list(self.prompt_instructions)}",
f"Verdicts:\n{prettify_list(self.verdicts)}",
f"Score: {self.score}\nReason: {self.reason}",
],
)

return self.score

async def a_measure(
self,
test_case: Union[LLMTestCase, ConversationalTestCase],
_show_indicator: bool = True,
) -> float:
if isinstance(test_case, ConversationalTestCase):
test_case = test_case.turns[0]
check_llm_test_case_params(test_case, required_params, self)

self.evaluation_cost = 0 if self.using_native_model else None
with metric_progress_indicator(
self, async_mode=True, _show_indicator=_show_indicator
):
            self.verdicts: List[PromptAlignmentVerdict] = await self._a_generate_verdicts(
test_case.input, test_case.actual_output
)
self.score = self._calculate_score()
self.reason = await self._a_generate_reason(
test_case.input, test_case.actual_output
)
self.success = self.score >= self.threshold
self.verbose_logs = construct_verbose_logs(
self,
steps=[
f"Prompt Instructions:\n{prettify_list(self.prompt_instructions)}",
f"Verdicts:\n{prettify_list(self.verdicts)}",
f"Score: {self.score}\nReason: {self.reason}",
],
)

return self.score

async def _a_generate_reason(self, input: str, actual_output: str) -> str:
if self.include_reason is False:
return None

unalignment_reasons = []
for verdict in self.verdicts:
if verdict.verdict.strip().lower() == "no":
unalignment_reasons.append(verdict.reason)

prompt = PromptAlignmentTemplate.generate_reason(
unalignment_reasons=unalignment_reasons,
input=input,
actual_output=actual_output,
score=format(self.score, ".2f"),
)
if self.using_native_model:
res, cost = await self.model.a_generate(prompt)
self.evaluation_cost += cost
data = trimAndLoadJson(res, self)
return data["reason"]
else:
try:
res: Reason = await self.model.a_generate(
prompt=prompt, schema=Reason
)
return res.reason
except TypeError:
res = await self.model.a_generate(prompt)
data = trimAndLoadJson(res, self)
return data["reason"]

def _generate_reason(self, input: str, actual_output: str) -> str:
if self.include_reason is False:
return None

unalignment_reasons = []
for verdict in self.verdicts:
if verdict.verdict.strip().lower() == "no":
unalignment_reasons.append(verdict.reason)

prompt = PromptAlignmentTemplate.generate_reason(
unalignment_reasons=unalignment_reasons,
input=input,
actual_output=actual_output,
score=format(self.score, ".2f"),
)
if self.using_native_model:
res, cost = self.model.generate(prompt)
self.evaluation_cost += cost
data = trimAndLoadJson(res, self)
return data["reason"]
else:
try:
res: Reason = self.model.generate(prompt=prompt, schema=Reason)
return res.reason
except TypeError:
res = self.model.generate(prompt)
data = trimAndLoadJson(res, self)
return data["reason"]

async def _a_generate_verdicts(
self, input: str, actual_output: str
    ) -> List[PromptAlignmentVerdict]:
prompt = PromptAlignmentTemplate.generate_verdicts(
prompt_instructions=self.prompt_instructions,
input=input,
actual_output=actual_output,
)
if self.using_native_model:
res, cost = await self.model.a_generate(prompt)
self.evaluation_cost += cost
data = trimAndLoadJson(res, self)
return [PromptAlignmentVerdict(**item) for item in data["verdicts"]]
else:
try:
res: Verdicts = await self.model.a_generate(
prompt, schema=Verdicts
)
return [item for item in res.verdicts]
except TypeError:
res = await self.model.a_generate(prompt)
data = trimAndLoadJson(res, self)
return [
PromptAlignmentVerdict(**item) for item in data["verdicts"]
]

    def _generate_verdicts(
        self, input: str, actual_output: str
    ) -> List[PromptAlignmentVerdict]:
prompt = PromptAlignmentTemplate.generate_verdicts(
prompt_instructions=self.prompt_instructions,
input=input,
actual_output=actual_output,
)
if self.using_native_model:
res, cost = self.model.generate(prompt)
self.evaluation_cost += cost
data = trimAndLoadJson(res, self)
return [PromptAlignmentVerdict(**item) for item in data["verdicts"]]
else:
try:
res: Verdicts = self.model.generate(prompt, schema=Verdicts)
return [item for item in res.verdicts]
except TypeError:
res = self.model.generate(prompt)
data = trimAndLoadJson(res, self)
return [
PromptAlignmentVerdict(**item) for item in data["verdicts"]
]

def _calculate_score(self):
number_of_verdicts = len(self.verdicts)
if number_of_verdicts == 0:
return 1

alignment_count = 0
for verdict in self.verdicts:
if verdict.verdict.strip().lower() != "no":
alignment_count += 1

score = alignment_count / number_of_verdicts
return 0 if self.strict_mode and score < self.threshold else score

def is_successful(self) -> bool:
if self.error is not None:
self.success = False
else:
try:
self.success = self.score >= self.threshold
except:
self.success = False
return self.success

@property
def __name__(self):
return "Prompt Alignment"
15 changes: 15 additions & 0 deletions deepeval/metrics/prompt_alignment/schema.py
@@ -0,0 +1,15 @@
from typing import List, Optional
from pydantic import BaseModel, Field


class PromptAlignmentVerdict(BaseModel):
verdict: str
reason: Optional[str] = Field(default=None)


class Verdicts(BaseModel):
verdicts: List[PromptAlignmentVerdict]


class Reason(BaseModel):
reason: str
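
For reference, a small sketch of how these schemas validate a judge response; the JSON payload below is made up for illustration:

from deepeval.metrics.prompt_alignment.schema import Verdicts

raw = {
    "verdicts": [
        {"verdict": "yes"},
        {"verdict": "no", "reason": "The output was not in upper case."},
    ]
}
parsed = Verdicts(**raw)
assert parsed.verdicts[0].reason is None  # 'reason' is optional and only expected for 'no' verdicts
assert parsed.verdicts[1].verdict == "no"
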
87 changes: 87 additions & 0 deletions deepeval/metrics/prompt_alignment/template.py
@@ -0,0 +1,87 @@
from typing import List


class PromptAlignmentTemplate:
@staticmethod
def generate_verdicts(
prompt_instructions: List[str], input: str, actual_output: str
):
return f"""For the provided list of prompt instructions, determine whether each instruction has been followed in the LLM actual output.
Please generate a list of JSON objects with two keys: `verdict` and `reason`.
The 'verdict' key should STRICTLY be either 'yes' or 'no'. Only answer 'yes' if the actual output COMPLETELY follows the instruction, and 'no' otherwise.
You should be EXTRA STRICT AND CAREFUL when giving a 'yes'.
The 'reason' is the reason for the verdict.
Provide a 'reason' ONLY if the answer is 'no'.
The provided prompt instructions are the instructions to be followed in the prompt, which you have no access to.
**
IMPORTANT: Please make sure to only return in JSON format, with the 'verdicts' key mapping to a list of JSON objects.
Example input: What number is the stars of the sky?
Example actual output: HEY THERE! I think what you meant is "What is the number of stars in the sky", but unfortunately I don't know the answer to it.
Example prompt instructions: ["Answer the input in a well-mannered fashion.", "Do not correct user of any grammatical errors.", "Respond in all upper case"]
Example JSON:
{{
"verdicts": [
{{
"verdict": "yes"
}},
{{
"verdict": "no",
"reason": "The LLM corrected the user when the user used the wrong grammar in asking about the number of stars in the sky."
}},
{{
"verdict": "no",
"reason": "The LLM only made 'HEY THERE' uppercase, which does not follow the instruction of making everything uppercase completely."
}}
]
}}
Since you are going to generate a verdict for each instruction, the number of 'verdicts' SHOULD BE STRICTLY EQUAL to the number of prompt instructions.
**
Prompt Instructions:
{prompt_instructions}
Input:
{input}
LLM Actual Output:
{actual_output}
JSON:
"""

@staticmethod
def generate_reason(
unalignment_reasons: List[str],
actual_output: str,
input: str,
        score: str,
):
return f"""Given the prompt alignment score, the reaons for unalignment found in the LLM actual output, the actual output, and input, provide a CONCISE reason for the score. Explain why it is not higher, but also why it is at its current score.
The unalignments represent prompt instructions that are not followed by the LLM in the actual output.
If there are no unalignments, just say something positive with an upbeat, encouraging tone (but don't overdo it, otherwise it gets annoying).
You don't have to talk about whether the actual output is a good fit for the input; assess ENTIRELY based on the unalignment reasons.
**
IMPORTANT: Please make sure to only return in JSON format, with the 'reason' key providing the reason.
Example JSON:
{{
"reason": "The score is <prompt_alignment_score> because <your_reason>."
}}
**
Input:
{input}
LLM Actual Output:
{actual_output}
Prompt Alignment Score:
{score}
Reasons for unalignment:
{unalignment_reasons}
JSON:
"""
2 changes: 2 additions & 0 deletions tests/test_everything.py
@@ -22,6 +22,8 @@
ConversationRelevancyMetric,
RoleAdherenceMetric,
ConversationCompletenessMetric,
PromptAlignmentMetric,
JsonCorrectnessMetric,
)
from deepeval.metrics.ragas import RagasMetric
from deepeval import assert_test
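
A hedged sketch of the kind of test this import enables, mirroring the assert_test pattern used in test_everything.py; the threshold and test data are placeholders:

from deepeval import assert_test
from deepeval.metrics import PromptAlignmentMetric
from deepeval.test_case import LLMTestCase

def test_prompt_alignment():
    metric = PromptAlignmentMetric(
        prompt_instructions=["Answer politely."],
        threshold=0.5,
    )
    test_case = LLMTestCase(
        input="Where is Big Ben located?",
        actual_output="Big Ben is in London. Happy to help!",
    )
    assert_test(test_case, [metric])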
