Commit 6096feb

penguine-ip committed Nov 27, 2024
1 parent ed13f6e commit 6096feb
Showing 4 changed files with 124 additions and 43 deletions.
13 changes: 0 additions & 13 deletions c.py

This file was deleted.

143 changes: 116 additions & 27 deletions deepeval/metrics/json_correctness/json_correctness.py
@@ -1,4 +1,4 @@
-from typing import List, Union
+from typing import List, Optional, Union
 import json
 from pydantic import BaseModel, ValidationError

@@ -11,9 +11,16 @@
 from deepeval.metrics.utils import (
     construct_verbose_logs,
     check_llm_test_case_params,
+    initialize_model,
+    trimAndLoadJson,
 )
+from deepeval.models import DeepEvalBaseLLM
 from deepeval.metrics.indicator import metric_progress_indicator
 from deepeval.metrics.json_correctness.template import JsonCorrectnessTemplate
+from deepeval.metrics.json_correctness.schema import Reason
+from deepeval.utils import get_or_create_event_loop

+DEFAULT_CORRERCT_REASON = "The generated Json matches and is syntactically correct to the expected schema."
+
 required_params: List[LLMTestCaseParams] = [
     LLMTestCaseParams.INPUT,
@@ -25,14 +32,18 @@ class JsonCorrectnessMetric(BaseMetric):
     def __init__(
         self,
         expected_schema: BaseModel,
+        model: Optional[Union[str, DeepEvalBaseLLM]] = None,
         threshold: float = 0.5,
+        async_mode: bool = True,
         include_reason: bool = True,
-        strict_mode: bool = False,
+        strict_mode: bool = True,
         verbose_mode: bool = False,
     ):
         self.threshold = 1 if strict_mode else threshold
+        self.model, self.using_native_model = initialize_model(model)
         self.include_reason = include_reason
         self.strict_mode = strict_mode
+        self.async_mode = async_mode
         self.verbose_mode = verbose_mode
         self.expected_schema = expected_schema

@@ -45,52 +56,130 @@ def measure(
             test_case = test_case.turns[0]
         check_llm_test_case_params(test_case, required_params, self)

-        self.evaluation_cost = 0
+        self.evaluation_cost = 0 if self.using_native_model else None
         with metric_progress_indicator(self, _show_indicator=_show_indicator):
+            if self.async_mode:
+                loop = get_or_create_event_loop()
+                loop.run_until_complete(
+                    self.a_measure(test_case, _show_indicator=False)
+                )
+            else:
+                valid_json = True
+                try:
+                    self.expected_schema.model_validate_json(
+                        test_case.actual_output
+                    )
+                except ValidationError as e:
+                    valid_json = False
+
+                self.score = 1 if valid_json else 0
+                self.reason = self.generate_reason(test_case.actual_output)
+                self.success = self.score >= self.threshold
+                self.verbose_logs = construct_verbose_logs(
+                    self,
+                    steps=[
+                        f"LLM outputed Json:\n{test_case.actual_output}",
+                        # f"Expected Json Schema:\n{json.dumps(self.expected_schema.model_json_schema(), indent=4)}",
+                        f"Score: {self.score}\nReason: {self.reason}",
+                    ],
+                )
+
+            return self.score
+
+    async def a_measure(
+        self,
+        test_case: Union[LLMTestCase, ConversationalTestCase],
+        _show_indicator: bool = True,
+    ) -> float:
+        if isinstance(test_case, ConversationalTestCase):
+            test_case = test_case.turns[0]
+        check_llm_test_case_params(test_case, required_params, self)
+
+        self.evaluation_cost = 0 if self.using_native_model else None
+        with metric_progress_indicator(
+            self, async_mode=True, _show_indicator=_show_indicator
+        ):
             valid_json = True
             try:
                 self.expected_schema.model_validate_json(
                     test_case.actual_output
                 )
             except ValidationError as e:
                 valid_json = False
-                if self.include_reason:
-                    self.reason = self.generate_friendly_error_message(e)

             self.score = 1 if valid_json else 0
+            self.reason = await self.a_generate_reason(test_case.actual_output)
             self.success = self.score >= self.threshold
             self.verbose_logs = construct_verbose_logs(
                 self,
                 steps=[
                     f"LLM outputed Json:\n{test_case.actual_output}",
-                    f"Expected Json Schema:\n{json.dumps(self.expected_schema.model_json_schema(), indent=4)}",
+                    # f"Expected Json Schema:\n{json.dumps(self.expected_schema.model_json_schema(), indent=4)}",
                     f"Score: {self.score}\nReason: {self.reason}",
                 ],
             )

             return self.score

-    async def a_measure(
-        self,
-        test_case: Union[LLMTestCase, ConversationalTestCase],
-        _show_indicator: bool = True,
-    ) -> float:
-        return self.measure(test_case, _show_indicator=_show_indicator)
-
-    def generate_friendly_error_message(self, error: ValidationError) -> str:
-        error_messages = []
-        for err in error.errors():
-            # Extract error location, message, and type
-            loc = " -> ".join(map(str, err.get("loc", [])))
-            msg = err.get("msg", "Unknown error")
-            error_type = err.get("type", "Unknown type")
-
-            # Format each error message in a readable way
-            error_message = f"Error in '{loc}': {msg} (Type: {error_type})"
-            error_messages.append(error_message)
-
-        # Join all error messages into a single formatted string
-        return "\n".join(error_messages)
+    async def a_generate_reason(self, actual_output: str) -> str:
+        if self.include_reason is False:
+            return None
+
+        is_valid_json = self.score == 1
+        if is_valid_json:
+            return DEFAULT_CORRERCT_REASON
+
+        prompt: dict = JsonCorrectnessTemplate.generate_reason(
+            actual_output=actual_output,
+            expected_schema=json.dumps(
+                self.expected_schema.model_json_schema(), indent=4
+            ),
+            is_valid_json=is_valid_json,
+        )
+
+        if self.using_native_model:
+            res, cost = await self.model.a_generate(prompt)
+            self.evaluation_cost += cost
+            data = trimAndLoadJson(res, self)
+            return data["reason"]
+        else:
+            try:
+                res: Reason = await self.model.a_generate(prompt, schema=Reason)
+                return res.reason
+            except TypeError:
+                res = await self.model.a_generate(prompt)
+                data = trimAndLoadJson(res, self)
+                return data["reason"]
+
+    def generate_reason(self, actual_output: str) -> str:
+        if self.include_reason is False:
+            return None
+
+        is_valid_json = self.score == 1
+        if is_valid_json:
+            return DEFAULT_CORRERCT_REASON
+
+        prompt: dict = JsonCorrectnessTemplate.generate_reason(
+            actual_output=actual_output,
+            expected_schema=json.dumps(
+                self.expected_schema.model_json_schema(), indent=4
+            ),
+            is_valid_json=is_valid_json,
+        )
+
+        if self.using_native_model:
+            res, cost = self.model.generate(prompt)
+            self.evaluation_cost += cost
+            data = trimAndLoadJson(res, self)
+            return data["reason"]
+        else:
+            try:
+                res: Reason = self.model.generate(prompt, schema=Reason)
+                return res.reason
+            except TypeError:
+                res = self.model.generate(prompt)
+                data = trimAndLoadJson(res, self)
+                return data["reason"]

     def is_successful(self) -> bool:
         if self.error is not None:
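
For context, here is a minimal usage sketch of the metric after this change. It assumes JsonCorrectnessMetric is re-exported from deepeval.metrics and LLMTestCase from deepeval.test_case (neither import path appears in this diff); the Person schema and the "gpt-4o" model name are purely illustrative.

import json
from pydantic import BaseModel
from deepeval.metrics import JsonCorrectnessMetric   # assumed re-export path
from deepeval.test_case import LLMTestCase           # assumed import path


class Person(BaseModel):
    # Hypothetical expected schema for the example
    name: str
    age: int


metric = JsonCorrectnessMetric(
    expected_schema=Person,
    model="gpt-4o",       # new: a model name or DeepEvalBaseLLM, resolved by initialize_model
    include_reason=True,  # reasons are now generated by the evaluation model, not pydantic errors
    async_mode=False,     # new: False takes the synchronous branch of measure()
)

test_case = LLMTestCase(
    input="Return a JSON person object.",
    actual_output=json.dumps({"name": "Ada", "age": "unknown"}),  # "age" violates the schema
)

metric.measure(test_case)
print(metric.score)   # 1 if actual_output validates against expected_schema, else 0
print(metric.reason)  # LLM-generated explanation comparing the output to the schema
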
5 changes: 5 additions & 0 deletions deepeval/metrics/json_correctness/schema.py
@@ -0,0 +1,5 @@
+from pydantic import BaseModel
+
+
+class Reason(BaseModel):
+    reason: str
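
The new Reason model gives the reason prompt a typed return shape. A rough sketch of the pattern the metric follows with it is below; `model` stands in for any DeepEvalBaseLLM-style object, and json.loads is a stand-in for the metric's trimAndLoadJson helper.

import json
from pydantic import BaseModel


class Reason(BaseModel):
    reason: str


def extract_reason(model, prompt: str) -> str:
    try:
        # Custom models that accept a `schema` kwarg return a parsed Reason directly.
        res = model.generate(prompt, schema=Reason)
        return res.reason
    except TypeError:
        # Models without structured-output support return raw text, which is then
        # parsed as JSON (the metric uses trimAndLoadJson for this step).
        res = model.generate(prompt)
        return json.loads(res)["reason"]
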
6 changes: 3 additions & 3 deletions deepeval/metrics/json_correctness/template.py
@@ -4,7 +4,7 @@

 class JsonCorrectnessTemplate:
     @staticmethod
     def generate_reason(
-        generated_json: str, expected_schema: str, is_valid_json: bool
+        actual_output: str, expected_schema: str, is_valid_json: bool
     ):
         return f"""Based on the given generated json, generated by an LLM, and a boolean stating whether it is a valid JSON based on the expected json schema, give a reason why it is OR is not a valid Json.
@@ -15,11 +15,11 @@ def generate_reason(
     "reason": "The generated Json is <is_valid_json> because <your_reason>."
 }}

-If the json is not a valid one, your reason MUST compare `Expected Json Schema` and `Generated Json` in your reason.
+If the json is not a valid one, your reason MUST compare `Expected Json Schema` and `Generated Json` in your reason. Keep it SHORT and CONCISE while being very FACTUAL and ACTIONABLE.
 **

 Generated Json:
-{generated_json}
+{actual_output}

 Expected Json Schema:
 {expected_schema}
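
To see what the renamed actual_output parameter feeds into, the template can be exercised directly. A small sketch, using a made-up Person schema and an invalid output; the import path comes from the diff above.

import json
from pydantic import BaseModel
from deepeval.metrics.json_correctness.template import JsonCorrectnessTemplate


class Person(BaseModel):
    # Hypothetical schema for the example
    name: str
    age: int


prompt = JsonCorrectnessTemplate.generate_reason(
    actual_output='{"name": "Ada", "age": "unknown"}',
    expected_schema=json.dumps(Person.model_json_schema(), indent=4),
    is_valid_json=False,
)
print(prompt)  # full reason prompt, ending with the generated JSON and the expected schema
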
