Commit 6096feb

penguine-ip committed Nov 27, 2024
1 parent ed13f6e commit 6096feb
Showing 4 changed files with 124 additions and 43 deletions.
13 changes: 0 additions & 13 deletions c.py

This file was deleted.

143 changes: 116 additions & 27 deletions deepeval/metrics/json_correctness/json_correctness.py
@@ -1,4 +1,4 @@
-from typing import List, Union
+from typing import List, Optional, Union
 import json
 from pydantic import BaseModel, ValidationError

@@ -11,9 +11,16 @@
 from deepeval.metrics.utils import (
     construct_verbose_logs,
     check_llm_test_case_params,
+    initialize_model,
+    trimAndLoadJson,
 )
+from deepeval.models import DeepEvalBaseLLM
 from deepeval.metrics.indicator import metric_progress_indicator
 from deepeval.metrics.json_correctness.template import JsonCorrectnessTemplate
+from deepeval.metrics.json_correctness.schema import Reason
+from deepeval.utils import get_or_create_event_loop

+DEFAULT_CORRERCT_REASON = "The generated Json matches and is syntactically correct to the expected schema."
+
 required_params: List[LLMTestCaseParams] = [
     LLMTestCaseParams.INPUT,
@@ -25,14 +32,18 @@ class JsonCorrectnessMetric(BaseMetric):
     def __init__(
         self,
         expected_schema: BaseModel,
+        model: Optional[Union[str, DeepEvalBaseLLM]] = None,
         threshold: float = 0.5,
+        async_mode: bool = True,
         include_reason: bool = True,
-        strict_mode: bool = False,
+        strict_mode: bool = True,
         verbose_mode: bool = False,
     ):
         self.threshold = 1 if strict_mode else threshold
+        self.model, self.using_native_model = initialize_model(model)
         self.include_reason = include_reason
         self.strict_mode = strict_mode
+        self.async_mode = async_mode
         self.verbose_mode = verbose_mode
         self.expected_schema = expected_schema

@@ -45,52 +56,130 @@ def measure(
             test_case = test_case.turns[0]
         check_llm_test_case_params(test_case, required_params, self)

-        self.evaluation_cost = 0
+        self.evaluation_cost = 0 if self.using_native_model else None
         with metric_progress_indicator(self, _show_indicator=_show_indicator):
+            if self.async_mode:
+                loop = get_or_create_event_loop()
+                loop.run_until_complete(
+                    self.a_measure(test_case, _show_indicator=False)
+                )
+            else:
+                valid_json = True
+                try:
+                    self.expected_schema.model_validate_json(
+                        test_case.actual_output
+                    )
+                except ValidationError as e:
+                    valid_json = False
+
+                self.score = 1 if valid_json else 0
+                self.reason = self.generate_reason(test_case.actual_output)
+                self.success = self.score >= self.threshold
+                self.verbose_logs = construct_verbose_logs(
+                    self,
+                    steps=[
+                        f"LLM outputed Json:\n{test_case.actual_output}",
+                        # f"Expected Json Schema:\n{json.dumps(self.expected_schema.model_json_schema(), indent=4)}",
+                        f"Score: {self.score}\nReason: {self.reason}",
+                    ],
+                )
+
+            return self.score
+
+    async def a_measure(
+        self,
+        test_case: Union[LLMTestCase, ConversationalTestCase],
+        _show_indicator: bool = True,
+    ) -> float:
+        if isinstance(test_case, ConversationalTestCase):
+            test_case = test_case.turns[0]
+        check_llm_test_case_params(test_case, required_params, self)
+
+        self.evaluation_cost = 0 if self.using_native_model else None
+        with metric_progress_indicator(
+            self, async_mode=True, _show_indicator=_show_indicator
+        ):
             valid_json = True
             try:
                 self.expected_schema.model_validate_json(
                     test_case.actual_output
                 )
             except ValidationError as e:
                 valid_json = False
-                if self.include_reason:
-                    self.reason = self.generate_friendly_error_message(e)

             self.score = 1 if valid_json else 0
+            self.reason = await self.a_generate_reason(test_case.actual_output)
             self.success = self.score >= self.threshold
             self.verbose_logs = construct_verbose_logs(
                 self,
                 steps=[
                     f"LLM outputed Json:\n{test_case.actual_output}",
-                    f"Expected Json Schema:\n{json.dumps(self.expected_schema.model_json_schema(), indent=4)}",
+                    # f"Expected Json Schema:\n{json.dumps(self.expected_schema.model_json_schema(), indent=4)}",
                     f"Score: {self.score}\nReason: {self.reason}",
                 ],
             )

             return self.score

-    async def a_measure(
-        self,
-        test_case: Union[LLMTestCase, ConversationalTestCase],
-        _show_indicator: bool = True,
-    ) -> float:
-        return self.measure(test_case, _show_indicator=_show_indicator)
-
-    def generate_friendly_error_message(self, error: ValidationError) -> str:
-        error_messages = []
-        for err in error.errors():
-            # Extract error location, message, and type
-            loc = " -> ".join(map(str, err.get("loc", [])))
-            msg = err.get("msg", "Unknown error")
-            error_type = err.get("type", "Unknown type")
-
-            # Format each error message in a readable way
-            error_message = f"Error in '{loc}': {msg} (Type: {error_type})"
-            error_messages.append(error_message)
-
-        # Join all error messages into a single formatted string
-        return "\n".join(error_messages)
+    async def a_generate_reason(self, actual_output: str) -> str:
+        if self.include_reason is False:
+            return None
+
+        is_valid_json = self.score == 1
+        if is_valid_json:
+            return DEFAULT_CORRERCT_REASON
+
+        prompt: dict = JsonCorrectnessTemplate.generate_reason(
+            actual_output=actual_output,
+            expected_schema=json.dumps(
+                self.expected_schema.model_json_schema(), indent=4
+            ),
+            is_valid_json=is_valid_json,
+        )
+
+        if self.using_native_model:
+            res, cost = await self.model.a_generate(prompt)
+            self.evaluation_cost += cost
+            data = trimAndLoadJson(res, self)
+            return data["reason"]
+        else:
+            try:
+                res: Reason = await self.model.a_generate(prompt, schema=Reason)
+                return res.reason
+            except TypeError:
+                res = await self.model.a_generate(prompt)
+                data = trimAndLoadJson(res, self)
+                return data["reason"]
+
+    def generate_reason(self, actual_output: str) -> str:
+        if self.include_reason is False:
+            return None
+
+        is_valid_json = self.score == 1
+        if is_valid_json:
+            return DEFAULT_CORRERCT_REASON
+
+        prompt: dict = JsonCorrectnessTemplate.generate_reason(
+            actual_output=actual_output,
+            expected_schema=json.dumps(
+                self.expected_schema.model_json_schema(), indent=4
+            ),
+            is_valid_json=is_valid_json,
+        )
+
+        if self.using_native_model:
+            res, cost = self.model.generate(prompt)
+            self.evaluation_cost += cost
+            data = trimAndLoadJson(res, self)
+            return data["reason"]
+        else:
+            try:
+                res: Reason = self.model.generate(prompt, schema=Reason)
+                return res.reason
+            except TypeError:
+                res = self.model.generate(prompt)
+                data = trimAndLoadJson(res, self)
+                return data["reason"]

     def is_successful(self) -> bool:
         if self.error is not None:
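
For context, here is a minimal usage sketch of the metric after this change. It assumes JsonCorrectnessMetric is re-exported from deepeval.metrics and LLMTestCase from deepeval.test_case (neither import path appears in this diff); the Person schema and the "gpt-4o" model name are purely illustrative.

import json
from pydantic import BaseModel
from deepeval.metrics import JsonCorrectnessMetric   # assumed re-export path
from deepeval.test_case import LLMTestCase           # assumed import path


class Person(BaseModel):
    # Hypothetical expected schema for the example
    name: str
    age: int


metric = JsonCorrectnessMetric(
    expected_schema=Person,
    model="gpt-4o",       # new: a model name or DeepEvalBaseLLM, resolved by initialize_model
    include_reason=True,  # reasons are now generated by the evaluation model, not pydantic errors
    async_mode=False,     # new: False takes the synchronous branch of measure()
)

test_case = LLMTestCase(
    input="Return a JSON person object.",
    actual_output=json.dumps({"name": "Ada", "age": "unknown"}),  # "age" violates the schema
)

metric.measure(test_case)
print(metric.score)   # 1 if actual_output validates against expected_schema, else 0
print(metric.reason)  # LLM-generated explanation comparing the output to the schema
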
5 changes: 5 additions & 0 deletions deepeval/metrics/json_correctness/schema.py
@@ -0,0 +1,5 @@
+from pydantic import BaseModel
+
+
+class Reason(BaseModel):
+    reason: str
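
The new Reason model gives the reason prompt a typed return shape. A rough sketch of the pattern the metric follows with it is below; `model` stands in for any DeepEvalBaseLLM-style object, and json.loads is a stand-in for the metric's trimAndLoadJson helper.

import json
from pydantic import BaseModel


class Reason(BaseModel):
    reason: str


def extract_reason(model, prompt: str) -> str:
    try:
        # Custom models that accept a `schema` kwarg return a parsed Reason directly.
        res = model.generate(prompt, schema=Reason)
        return res.reason
    except TypeError:
        # Models without structured-output support return raw text, which is then
        # parsed as JSON (the metric uses trimAndLoadJson for this step).
        res = model.generate(prompt)
        return json.loads(res)["reason"]
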
6 changes: 3 additions & 3 deletions deepeval/metrics/json_correctness/template.py
@@ -4,7 +4,7 @@

 class JsonCorrectnessTemplate:
     @staticmethod
     def generate_reason(
-        generated_json: str, expected_schema: str, is_valid_json: bool
+        actual_output: str, expected_schema: str, is_valid_json: bool
     ):
         return f"""Based on the given generated json, generated by an LLM, and a boolean stating whether it is a valid JSON based on the expected json schema, give a reason why it is OR is not a valid Json.
@@ -15,11 +15,11 @@ def generate_reason(
     "reason": "The generated Json is <is_valid_json> because <your_reason>."
 }}

-If the json is not a valid one, your reason MUST compare `Expected Json Schema` and `Generated Json` in your reason.
+If the json is not a valid one, your reason MUST compare `Expected Json Schema` and `Generated Json` in your reason. Keep it SHORT and CONCISE while being very FACTUAL and ACTIONABLE.
 **

 Generated Json:
-{generated_json}
+{actual_output}

 Expected Json Schema:
 {expected_schema}
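
To see what the renamed actual_output parameter feeds into, the template can be exercised directly. A small sketch, using a made-up Person schema and an invalid output; the import path comes from the diff above.

import json
from pydantic import BaseModel
from deepeval.metrics.json_correctness.template import JsonCorrectnessTemplate


class Person(BaseModel):
    # Hypothetical schema for the example
    name: str
    age: int


prompt = JsonCorrectnessTemplate.generate_reason(
    actual_output='{"name": "Ada", "age": "unknown"}',
    expected_schema=json.dumps(Person.model_json_schema(), indent=4),
    is_valid_json=False,
)
print(prompt)  # full reason prompt, ending with the generated JSON and the expected schema
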
