Commit 04997ff (1 parent: 6629210)
Showing 7 changed files with 357 additions and 4 deletions.
Empty file.
@@ -0,0 +1,251 @@
from typing import Optional, List, Union

from deepeval.utils import get_or_create_event_loop, prettify_list
from deepeval.metrics.utils import (
    construct_verbose_logs,
    trimAndLoadJson,
    check_llm_test_case_params,
    initialize_model,
)
from deepeval.test_case import (
    LLMTestCase,
    LLMTestCaseParams,
    ConversationalTestCase,
)
from deepeval.metrics import BaseMetric
from deepeval.models import DeepEvalBaseLLM
from deepeval.metrics.prompt_alignment.template import PromptAlignmentTemplate
from deepeval.metrics.indicator import metric_progress_indicator
from deepeval.metrics.prompt_alignment.schema import *

required_params: List[LLMTestCaseParams] = [
    LLMTestCaseParams.INPUT,
    LLMTestCaseParams.ACTUAL_OUTPUT,
]


class PromptAlignmentMetric(BaseMetric):
    def __init__(
        self,
        prompt_instructions: List[str],
        threshold: float = 0.5,
        model: Optional[Union[str, DeepEvalBaseLLM]] = None,
        include_reason: bool = True,
        async_mode: bool = True,
        strict_mode: bool = False,
        verbose_mode: bool = False,
    ):
        if len(prompt_instructions) == 0:
            raise ValueError("'prompt_instructions' must not be empty.")

        self.prompt_instructions = prompt_instructions
        self.threshold = 1 if strict_mode else threshold
        self.model, self.using_native_model = initialize_model(model)
        self.evaluation_model = self.model.get_model_name()
        self.include_reason = include_reason
        self.async_mode = async_mode
        self.strict_mode = strict_mode
        self.verbose_mode = verbose_mode

    def measure(
        self,
        test_case: Union[LLMTestCase, ConversationalTestCase],
        _show_indicator: bool = True,
    ) -> float:
        if isinstance(test_case, ConversationalTestCase):
            test_case = test_case.turns[0]
        check_llm_test_case_params(test_case, required_params, self)

        self.evaluation_cost = 0 if self.using_native_model else None
        with metric_progress_indicator(self, _show_indicator=_show_indicator):
            if self.async_mode:
                loop = get_or_create_event_loop()
                loop.run_until_complete(
                    self.a_measure(test_case, _show_indicator=False)
                )
            else:
                self.verdicts: Verdicts = self._generate_verdicts(
                    test_case.input, test_case.actual_output
                )
                self.score = self._calculate_score()
                self.reason = self._generate_reason(
                    test_case.input, test_case.actual_output
                )
                self.success = self.score >= self.threshold
                self.verbose_logs = construct_verbose_logs(
                    self,
                    steps=[
                        f"Prompt Instructions:\n{prettify_list(self.prompt_instructions)}",
                        f"Verdicts:\n{prettify_list(self.verdicts)}",
                        f"Score: {self.score}\nReason: {self.reason}",
                    ],
                )

            return self.score

    async def a_measure(
        self,
        test_case: Union[LLMTestCase, ConversationalTestCase],
        _show_indicator: bool = True,
    ) -> float:
        if isinstance(test_case, ConversationalTestCase):
            test_case = test_case.turns[0]
        check_llm_test_case_params(test_case, required_params, self)

        self.evaluation_cost = 0 if self.using_native_model else None
        with metric_progress_indicator(
            self, async_mode=True, _show_indicator=_show_indicator
        ):
            self.verdicts: Verdicts = await self._a_generate_verdicts(
                test_case.input, test_case.actual_output
            )
            self.score = self._calculate_score()
            self.reason = await self._a_generate_reason(
                test_case.input, test_case.actual_output
            )
            self.success = self.score >= self.threshold
            self.verbose_logs = construct_verbose_logs(
                self,
                steps=[
                    f"Prompt Instructions:\n{prettify_list(self.prompt_instructions)}",
                    f"Verdicts:\n{prettify_list(self.verdicts)}",
                    f"Score: {self.score}\nReason: {self.reason}",
                ],
            )

            return self.score

    async def _a_generate_reason(self, input: str, actual_output: str) -> str:
        if self.include_reason is False:
            return None

        unalignment_reasons = []
        for verdict in self.verdicts:
            if verdict.verdict.strip().lower() == "no":
                unalignment_reasons.append(verdict.reason)

        prompt = PromptAlignmentTemplate.generate_reason(
            unalignment_reasons=unalignment_reasons,
            input=input,
            actual_output=actual_output,
            score=format(self.score, ".2f"),
        )
        if self.using_native_model:
            res, cost = await self.model.a_generate(prompt)
            self.evaluation_cost += cost
            data = trimAndLoadJson(res, self)
            return data["reason"]
        else:
            try:
                res: Reason = await self.model.a_generate(
                    prompt=prompt, schema=Reason
                )
                return res.reason
            except TypeError:
                res = await self.model.a_generate(prompt)
                data = trimAndLoadJson(res, self)
                return data["reason"]

    def _generate_reason(self, input: str, actual_output: str) -> str:
        if self.include_reason is False:
            return None

        unalignment_reasons = []
        for verdict in self.verdicts:
            if verdict.verdict.strip().lower() == "no":
                unalignment_reasons.append(verdict.reason)

        prompt = PromptAlignmentTemplate.generate_reason(
            unalignment_reasons=unalignment_reasons,
            input=input,
            actual_output=actual_output,
            score=format(self.score, ".2f"),
        )
        if self.using_native_model:
            res, cost = self.model.generate(prompt)
            self.evaluation_cost += cost
            data = trimAndLoadJson(res, self)
            return data["reason"]
        else:
            try:
                res: Reason = self.model.generate(prompt=prompt, schema=Reason)
                return res.reason
            except TypeError:
                res = self.model.generate(prompt)
                data = trimAndLoadJson(res, self)
                return data["reason"]

    async def _a_generate_verdicts(
        self, input: str, actual_output: str
    ) -> Verdicts:
        prompt = PromptAlignmentTemplate.generate_verdicts(
            prompt_instructions=self.prompt_instructions,
            input=input,
            actual_output=actual_output,
        )
        if self.using_native_model:
            res, cost = await self.model.a_generate(prompt)
            self.evaluation_cost += cost
            data = trimAndLoadJson(res, self)
            return [PromptAlignmentVerdict(**item) for item in data["verdicts"]]
        else:
            try:
                res: Verdicts = await self.model.a_generate(
                    prompt, schema=Verdicts
                )
                return [item for item in res.verdicts]
            except TypeError:
                res = await self.model.a_generate(prompt)
                data = trimAndLoadJson(res, self)
                return [
                    PromptAlignmentVerdict(**item) for item in data["verdicts"]
                ]

    def _generate_verdicts(self, input: str, actual_output: str) -> Verdicts:
        prompt = PromptAlignmentTemplate.generate_verdicts(
            prompt_instructions=self.prompt_instructions,
            input=input,
            actual_output=actual_output,
        )
        if self.using_native_model:
            res, cost = self.model.generate(prompt)
            self.evaluation_cost += cost
            data = trimAndLoadJson(res, self)
            return [PromptAlignmentVerdict(**item) for item in data["verdicts"]]
        else:
            try:
                res: Verdicts = self.model.generate(prompt, schema=Verdicts)
                return [item for item in res.verdicts]
            except TypeError:
                res = self.model.generate(prompt)
                data = trimAndLoadJson(res, self)
                return [
                    PromptAlignmentVerdict(**item) for item in data["verdicts"]
                ]

    def _calculate_score(self):
        number_of_verdicts = len(self.verdicts)
        if number_of_verdicts == 0:
            return 1

        alignment_count = 0
        for verdict in self.verdicts:
            if verdict.verdict.strip().lower() != "no":
                alignment_count += 1

        score = alignment_count / number_of_verdicts
        return 0 if self.strict_mode and score < self.threshold else score

    def is_successful(self) -> bool:
        if self.error is not None:
            self.success = False
        else:
            try:
                self.success = self.score >= self.threshold
            except:
                self.success = False
        return self.success

    @property
    def __name__(self):
        return "Prompt Alignment"
@@ -0,0 +1,15 @@
from typing import List, Optional
from pydantic import BaseModel, Field


class PromptAlignmentVerdict(BaseModel):
    verdict: str
    reason: Optional[str] = Field(default=None)


class Verdicts(BaseModel):
    verdicts: List[PromptAlignmentVerdict]


class Reason(BaseModel):
    reason: str
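
These pydantic models are what the judge's JSON gets validated into, either directly via the schema= argument in the metric above or after trimAndLoadJson. A small sketch with made-up verdict data:

# Illustrative only: the shape of data the metric expects back from the judge.
parsed = Verdicts(
    verdicts=[
        PromptAlignmentVerdict(verdict="yes"),
        PromptAlignmentVerdict(
            verdict="no", reason="The output corrected the user's grammar."
        ),
    ]
)
for v in parsed.verdicts:
    print(v.verdict, v.reason)  # reason stays None for aligned verdicts

summary = Reason(reason="The score is 0.50 because one of two instructions was ignored.")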
@@ -0,0 +1,87 @@
from typing import List


class PromptAlignmentTemplate:
    @staticmethod
    def generate_verdicts(
        prompt_instructions: List[str], input: str, actual_output: str
    ):
        return f"""For the provided list of prompt instructions, determine whether each instruction has been followed in the LLM actual output.
Please generate a list of JSON with two keys: `verdict` and `reason`.
The 'verdict' key should STRICTLY be either a 'yes' or 'no'. Only answer 'yes' if the actual output COMPLETELY follows the instruction, and 'no' otherwise.
You should be EXTRA STRICT AND CAREFUL when giving a 'yes'.
The 'reason' is the reason for the verdict.
Provide a 'reason' ONLY if the answer is 'no'.
The provided prompt instructions are the instructions to be followed in the prompt, which you have no access to.
**
IMPORTANT: Please make sure to only return in JSON format, with the 'verdicts' key mapping to a list of JSON objects.
Example input: What number is the stars of the sky?
Example actual output: HEY THERE! I think what you meant is "What is the number of stars in the sky", but unfortunately I don't know the answer to it.
Example prompt instructions: ["Answer the input in a well-mannered fashion.", "Do not correct user of any grammatical errors.", "Respond in all upper case"]
Example JSON:
{{
    "verdicts": [
        {{
            "verdict": "yes"
        }},
        {{
            "verdict": "no",
            "reason": "The LLM corrected the user when the user used the wrong grammar in asking about the number of stars in the sky."
        }},
        {{
            "verdict": "no",
            "reason": "The LLM only made 'HEY THERE' uppercase, which does not follow the instruction of making everything uppercase completely."
        }}
    ]
}}
Since you are going to generate a verdict for each instruction, the number of 'verdicts' SHOULD BE STRICTLY EQUAL to the number of prompt instructions.
**
Prompt Instructions:
{prompt_instructions}
Input:
{input}
LLM Actual Output:
{actual_output}
JSON:
"""

    @staticmethod
    def generate_reason(
        unalignment_reasons: List[str],
        actual_output: str,
        input: str,
        score: int,
    ):
        return f"""Given the prompt alignment score, the reasons for unalignment found in the LLM actual output, the actual output, and the input, provide a CONCISE reason for the score. Explain why it is not higher, but also why it is at its current score.
The unalignments represent prompt instructions that are not followed by the LLM in the actual output.
If there are no unalignments, just say something positive with an upbeat encouraging tone (but don't overdo it, otherwise it gets annoying).
You don't have to talk about whether the actual output is a good fit for the input; assess ENTIRELY based on the unalignment reasons.
**
IMPORTANT: Please make sure to only return in JSON format, with the 'reason' key providing the reason.
Example JSON:
{{
    "reason": "The score is <prompt_alignment_score> because <your_reason>."
}}
**
Input:
{input}
LLM Actual Output:
{actual_output}
Prompt Alignment Score:
{score}
Reasons for unalignment:
{unalignment_reasons}
JSON:
"""