new metric
penguine-ip committed Nov 27, 2024
1 parent 6629210 commit 04997ff
Showing 7 changed files with 357 additions and 4 deletions.
5 changes: 1 addition & 4 deletions deepeval/guardrails/types.py
@@ -65,10 +65,7 @@ class Guard(Enum):
Guard.RELIGION,
]

entities_dependent_guards = [
Guard.BOLA,
Guard.IMITATION
]
entities_dependent_guards = [Guard.BOLA, Guard.IMITATION]

purpose_entities_dependent_guards = [
Guard.PII_API_DB,
1 change: 1 addition & 0 deletions deepeval/metrics/__init__.py
@@ -17,6 +17,7 @@
from .knowledge_retention.knowledge_retention import KnowledgeRetentionMetric
from .tool_correctness.tool_correctness import ToolCorrectnessMetric
from .json_correctness.json_correctness import JsonCorrectnessMetric
from .prompt_alignment.prompt_alignment import PromptAlignmentMetric
from .text_to_image.text_to_image import TextToImageMetric
from .image_editing.image_editing import ImageEditingMetric
from .conversation_relevancy.conversation_relevancy import (
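
With this export in place, the new metric can presumably be constructed straight from deepeval.metrics. A minimal sketch; the prompt instructions below are illustrative placeholders, not taken from the commit:

from deepeval.metrics import PromptAlignmentMetric

# Hypothetical prompt instructions for illustration only.
metric = PromptAlignmentMetric(
    prompt_instructions=[
        "Answer in a well-mannered fashion.",
        "Respond in all upper case.",
    ],
    threshold=0.5,
)
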
Empty file.
251 changes: 251 additions & 0 deletions deepeval/metrics/prompt_alignment/prompt_alignment.py
@@ -0,0 +1,251 @@
from typing import Optional, List, Union

from deepeval.utils import get_or_create_event_loop, prettify_list
from deepeval.metrics.utils import (
construct_verbose_logs,
trimAndLoadJson,
check_llm_test_case_params,
initialize_model,
)
from deepeval.test_case import (
LLMTestCase,
LLMTestCaseParams,
ConversationalTestCase,
)
from deepeval.metrics import BaseMetric
from deepeval.models import DeepEvalBaseLLM
from deepeval.metrics.prompt_alignment.template import PromptAlignmentTemplate
from deepeval.metrics.indicator import metric_progress_indicator
from deepeval.metrics.prompt_alignment.schema import *

required_params: List[LLMTestCaseParams] = [
LLMTestCaseParams.INPUT,
LLMTestCaseParams.ACTUAL_OUTPUT,
]


class PromptAlignmentMetric(BaseMetric):
def __init__(
self,
prompt_instructions: List[str],
threshold: float = 0.5,
model: Optional[Union[str, DeepEvalBaseLLM]] = None,
include_reason: bool = True,
async_mode: bool = True,
strict_mode: bool = False,
verbose_mode: bool = False,
):
if len(prompt_instructions) == 0:
raise ValueError("'prompt_instructions' must not be empty.")

self.prompt_instructions = prompt_instructions
self.threshold = 1 if strict_mode else threshold
self.model, self.using_native_model = initialize_model(model)
self.evaluation_model = self.model.get_model_name()
self.include_reason = include_reason
self.async_mode = async_mode
self.strict_mode = strict_mode
self.verbose_mode = verbose_mode

def measure(
self,
test_case: Union[LLMTestCase, ConversationalTestCase],
_show_indicator: bool = True,
) -> float:
if isinstance(test_case, ConversationalTestCase):
test_case = test_case.turns[0]
check_llm_test_case_params(test_case, required_params, self)

self.evaluation_cost = 0 if self.using_native_model else None
with metric_progress_indicator(self, _show_indicator=_show_indicator):
if self.async_mode:
loop = get_or_create_event_loop()
loop.run_until_complete(
self.a_measure(test_case, _show_indicator=False)
)
else:
                self.verdicts: List[PromptAlignmentVerdict] = self._generate_verdicts(
test_case.input, test_case.actual_output
)
self.score = self._calculate_score()
self.reason = self._generate_reason(
test_case.input, test_case.actual_output
)
self.success = self.score >= self.threshold
self.verbose_logs = construct_verbose_logs(
self,
steps=[
f"Prompt Instructions:\n{prettify_list(self.prompt_instructions)}",
f"Verdicts:\n{prettify_list(self.verdicts)}",
f"Score: {self.score}\nReason: {self.reason}",
],
)

return self.score

async def a_measure(
self,
test_case: Union[LLMTestCase, ConversationalTestCase],
_show_indicator: bool = True,
) -> float:
if isinstance(test_case, ConversationalTestCase):
test_case = test_case.turns[0]
check_llm_test_case_params(test_case, required_params, self)

self.evaluation_cost = 0 if self.using_native_model else None
with metric_progress_indicator(
self, async_mode=True, _show_indicator=_show_indicator
):
            self.verdicts: List[PromptAlignmentVerdict] = await self._a_generate_verdicts(
test_case.input, test_case.actual_output
)
self.score = self._calculate_score()
self.reason = await self._a_generate_reason(
test_case.input, test_case.actual_output
)
self.success = self.score >= self.threshold
self.verbose_logs = construct_verbose_logs(
self,
steps=[
f"Prompt Instructions:\n{prettify_list(self.prompt_instructions)}",
f"Verdicts:\n{prettify_list(self.verdicts)}",
f"Score: {self.score}\nReason: {self.reason}",
],
)

return self.score

async def _a_generate_reason(self, input: str, actual_output: str) -> str:
if self.include_reason is False:
return None

unalignment_reasons = []
for verdict in self.verdicts:
if verdict.verdict.strip().lower() == "no":
unalignment_reasons.append(verdict.reason)

prompt = PromptAlignmentTemplate.generate_reason(
unalignment_reasons=unalignment_reasons,
input=input,
actual_output=actual_output,
score=format(self.score, ".2f"),
)
if self.using_native_model:
res, cost = await self.model.a_generate(prompt)
self.evaluation_cost += cost
data = trimAndLoadJson(res, self)
return data["reason"]
else:
try:
res: Reason = await self.model.a_generate(
prompt=prompt, schema=Reason
)
return res.reason
except TypeError:
res = await self.model.a_generate(prompt)
data = trimAndLoadJson(res, self)
return data["reason"]

def _generate_reason(self, input: str, actual_output: str) -> str:
if self.include_reason is False:
return None

unalignment_reasons = []
for verdict in self.verdicts:
if verdict.verdict.strip().lower() == "no":
unalignment_reasons.append(verdict.reason)

prompt = PromptAlignmentTemplate.generate_reason(
unalignment_reasons=unalignment_reasons,
input=input,
actual_output=actual_output,
score=format(self.score, ".2f"),
)
if self.using_native_model:
res, cost = self.model.generate(prompt)
self.evaluation_cost += cost
data = trimAndLoadJson(res, self)
return data["reason"]
else:
try:
res: Reason = self.model.generate(prompt=prompt, schema=Reason)
return res.reason
except TypeError:
res = self.model.generate(prompt)
data = trimAndLoadJson(res, self)
return data["reason"]

async def _a_generate_verdicts(
self, input: str, actual_output: str
    ) -> List[PromptAlignmentVerdict]:
prompt = PromptAlignmentTemplate.generate_verdicts(
prompt_instructions=self.prompt_instructions,
input=input,
actual_output=actual_output,
)
if self.using_native_model:
res, cost = await self.model.a_generate(prompt)
self.evaluation_cost += cost
data = trimAndLoadJson(res, self)
return [PromptAlignmentVerdict(**item) for item in data["verdicts"]]
else:
try:
res: Verdicts = await self.model.a_generate(
prompt, schema=Verdicts
)
return [item for item in res.verdicts]
except TypeError:
res = await self.model.a_generate(prompt)
data = trimAndLoadJson(res, self)
return [
PromptAlignmentVerdict(**item) for item in data["verdicts"]
]

    def _generate_verdicts(
        self, input: str, actual_output: str
    ) -> List[PromptAlignmentVerdict]:
prompt = PromptAlignmentTemplate.generate_verdicts(
prompt_instructions=self.prompt_instructions,
input=input,
actual_output=actual_output,
)
if self.using_native_model:
res, cost = self.model.generate(prompt)
self.evaluation_cost += cost
data = trimAndLoadJson(res, self)
return [PromptAlignmentVerdict(**item) for item in data["verdicts"]]
else:
try:
res: Verdicts = self.model.generate(prompt, schema=Verdicts)
return [item for item in res.verdicts]
except TypeError:
res = self.model.generate(prompt)
data = trimAndLoadJson(res, self)
return [
PromptAlignmentVerdict(**item) for item in data["verdicts"]
]

def _calculate_score(self):
number_of_verdicts = len(self.verdicts)
if number_of_verdicts == 0:
return 1

alignment_count = 0
for verdict in self.verdicts:
if verdict.verdict.strip().lower() != "no":
alignment_count += 1

score = alignment_count / number_of_verdicts
return 0 if self.strict_mode and score < self.threshold else score

def is_successful(self) -> bool:
if self.error is not None:
self.success = False
else:
try:
self.success = self.score >= self.threshold
except:
self.success = False
return self.success

@property
def __name__(self):
return "Prompt Alignment"
15 changes: 15 additions & 0 deletions deepeval/metrics/prompt_alignment/schema.py
@@ -0,0 +1,15 @@
from typing import List, Optional
from pydantic import BaseModel, Field


class PromptAlignmentVerdict(BaseModel):
verdict: str
reason: Optional[str] = Field(default=None)


class Verdicts(BaseModel):
verdicts: List[PromptAlignmentVerdict]


class Reason(BaseModel):
reason: str
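
For reference, a small sketch of how these schemas validate a judge response; the JSON payload below is made up for illustration:

from deepeval.metrics.prompt_alignment.schema import Verdicts

raw = {
    "verdicts": [
        {"verdict": "yes"},
        {"verdict": "no", "reason": "The output was not in upper case."},
    ]
}
parsed = Verdicts(**raw)
assert parsed.verdicts[0].reason is None  # 'reason' is optional and only expected for 'no' verdicts
assert parsed.verdicts[1].verdict == "no"
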
87 changes: 87 additions & 0 deletions deepeval/metrics/prompt_alignment/template.py
@@ -0,0 +1,87 @@
from typing import List


class PromptAlignmentTemplate:
@staticmethod
def generate_verdicts(
prompt_instructions: List[str], input: str, actual_output: str
):
return f"""For the provided list of prompt instructions, determine whether each instruction has been followed in the LLM actual output.
Please generate a list of JSON objects with two keys: `verdict` and `reason`.
The 'verdict' key should STRICTLY be either 'yes' or 'no'. Only answer 'yes' if the actual output COMPLETELY follows the instruction, and 'no' otherwise.
You should be EXTRA STRICT AND CAREFUL when giving a 'yes'.
The 'reason' is the reason for the verdict.
Provide a 'reason' ONLY if the answer is 'no'.
The provided prompt instructions are the instructions to be followed in the prompt, which you have no access to.
**
IMPORTANT: Please make sure to only return in JSON format, with the 'verdicts' key mapping to a list of JSON objects.
Example input: What number is the stars of the sky?
Example actual output: HEY THERE! I think what you meant is "What is the number of stars in the sky", but unfortunately I don't know the answer to it.
Example prompt instructions: ["Answer the input in a well-mannered fashion.", "Do not correct user of any grammatical errors.", "Respond in all upper case"]
Example JSON:
{{
"verdicts": [
{{
"verdict": "yes"
}},
{{
"verdict": "no",
"reason": "The LLM corrected the user when the user used the wrong grammar in asking about the number of stars in the sky."
}},
{{
"verdict": "no",
"reason": "The LLM only made 'HEY THERE' uppercase, which does not follow the instruction of making everything uppercase completely."
}}
]
}}
Since you are going to generate a verdict for each instruction, the number of 'verdicts' SHOULD BE STRICTLY EQUAL to the number of prompt instructions.
**
Prompt Instructions:
{prompt_instructions}
Input:
{input}
LLM Actual Output:
{actual_output}
JSON:
"""

@staticmethod
def generate_reason(
unalignment_reasons: List[str],
actual_output: str,
input: str,
        score: str,
):
return f"""Given the prompt alignment score, the reaons for unalignment found in the LLM actual output, the actual output, and input, provide a CONCISE reason for the score. Explain why it is not higher, but also why it is at its current score.
The unalignments represent prompt instructions that are not followed by the LLM in the actual output.
If there are no unalignments, just say something positive with an upbeat, encouraging tone (but don't overdo it, otherwise it gets annoying).
You don't have to talk about whether the actual output is a good fit for the input; assess ENTIRELY based on the unalignment reasons.
**
IMPORTANT: Please make sure to only return in JSON format, with the 'reason' key providing the reason.
Example JSON:
{{
"reason": "The score is <prompt_alignment_score> because <your_reason>."
}}
**
Input:
{input}
LLM Actual Output:
{actual_output}
Prompt Alignment Score:
{score}
Reasons for unalignment:
{unalignment_reasons}
JSON:
"""
2 changes: 2 additions & 0 deletions tests/test_everything.py
@@ -22,6 +22,8 @@
ConversationRelevancyMetric,
RoleAdherenceMetric,
ConversationCompletenessMetric,
PromptAlignmentMetric,
JsonCorrectnessMetric,
)
from deepeval.metrics.ragas import RagasMetric
from deepeval import assert_test
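
A hedged sketch of the kind of test this import enables, mirroring the assert_test pattern used in test_everything.py; the threshold and test data are placeholders:

from deepeval import assert_test
from deepeval.metrics import PromptAlignmentMetric
from deepeval.test_case import LLMTestCase

def test_prompt_alignment():
    metric = PromptAlignmentMetric(
        prompt_instructions=["Answer politely."],
        threshold=0.5,
    )
    test_case = LLMTestCase(
        input="Where is Big Ben located?",
        actual_output="Big Ben is in London. Happy to help!",
    )
    assert_test(test_case, [metric])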
