Fix evaluation (PR #1711)

Merged · 22 commits · May 30, 2024
95 changes: 56 additions & 39 deletions agenta-backend/agenta_backend/resources/evaluators/evaluators.py
@@ -4,12 +4,13 @@
"key": "auto_exact_match",
"direct_use": True,
"settings_template": {
"label": "Exact Match Settings",
"description": "Settings for the Exact Match evaluator",
"correct_answer_keys": {
"label": "Correct Answer",
"default": ["correct_answer"],
"type": "array",
"correct_answer_key": {
"label": "Expected Answer Column",
"default": "correct_answer",
"type": "string",
"advanced": True, # Tells the frontend that this setting is advanced and should be hidden by default
"ground_truth_key": True, # Tells the frontend that is the name of the column in the test set that should be shown as a ground truth to the user
"description": "The name of the column in the test data that contains the correct answer",
},
},
"description": "Exact Match evaluator determines if the output exactly matches the specified correct answer, ensuring precise alignment with expected results.",
@@ -18,10 +19,7 @@
"name": "Contains Json",
"key": "auto_contains_json",
"direct_use": True,
"settings_template": {
"label": "Single Model Testing Settings",
"description": "Checks if the JSON output contains the specified JSON structure.",
},
"settings_template": {},
"description": "Contains Json evaluator checks if the output contains the specified JSON structure.",
},
{
@@ -38,10 +36,13 @@
"max": 1,
"required": True,
},
"correct_answer_keys": {
"label": "Correct Answer",
"default": ["correct_answer"],
"type": "array",
"correct_answer_key": {
"label": "Expected Answer Column",
"default": "correct_answer",
"type": "string",
"advanced": True, # Tells the frontend that this setting is advanced and should be hidden by default
"ground_truth_key": True, # Tells the frontend that is the name of the column in the test set that should be shown as a ground truth to the user
"description": "The name of the column in the test data that contains the correct answer",
},
},
"description": "Similarity Match evaluator checks if the generated answer is similar to the expected answer. You need to provide the similarity threshold. It uses the Jaccard similarity to compare the answers.",
@@ -79,10 +80,13 @@
"description": "The name of the field in the JSON output that you wish to evaluate",
"required": True,
},
"correct_answer_keys": {
"label": "Correct Answer",
"default": ["correct_answer"],
"correct_answer_key": {
"label": "Expected Answer Column",
"default": "correct_answer",
"type": "string",
"advanced": True, # Tells the frontend that this setting is advanced and should be hidden by default
"ground_truth_key": True, # Tells the frontend that is the name of the column in the test set that should be shown as a ground truth to the user
"description": "The name of the column in the test data that contains the correct answer",
},
},
"description": "JSON Field Match evaluator compares specific fields within JSON (JavaScript Object Notation) data. This matching can involve finding similarities or correspondences between fields in different JSON objects.",
@@ -95,10 +99,18 @@
"prompt_template": {
"label": "Prompt Template",
"type": "text",
"default": "We have an LLM App that we want to evaluate its outputs. Based on the prompt and the parameters provided below evaluate the output based on the evaluation strategy below:\nEvaluation strategy: 0 to 10 0 is very bad and 10 is very good.\nPrompt: {llm_app_prompt_template}\nInputs: country: {country}\nCorrect Answer:{correct_answer}\nEvaluate this: {variant_output}\n\nAnswer ONLY with one of the given grading or evaluation options.",
"default": "We have an LLM App that we want to evaluate its outputs. Based on the prompt and the parameters provided below evaluate the output based on the evaluation strategy below:\nEvaluation strategy: 0 to 10 0 is very bad and 10 is very good.\nPrompt: {llm_app_prompt_template}\nInputs: country: {country}\nExpected Answer Column:{correct_answer}\nEvaluate this: {variant_output}\n\nAnswer ONLY with one of the given grading or evaluation options.",
"description": "Template for AI critique prompts",
"required": True,
}
},
"correct_answer_key": {
"label": "Expected Answer Column",
"default": "correct_answer",
"type": "string",
"advanced": True, # Tells the frontend that this setting is advanced and should be hidden by default
"ground_truth_key": True, # Tells the frontend that is the name of the column in the test set that should be shown as a ground truth to the user
"description": "The name of the column in the test data that contains the correct answer",
},
},
"description": "AI Critique evaluator sends the generated answer and the correct_answer to an LLM model and uses it to evaluate the correctness of the answer. You need to provide the evaluation prompt (or use the default prompt).",
},
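To make the default prompt template above concrete, here is a hedged sketch of the formatting step; the placeholder names come from the template itself, while the surrounding call into the LLM is elided and the variable names are illustrative.

prompt_template = (
    "Prompt: {llm_app_prompt_template}\n"
    "Inputs: country: {country}\n"
    "Expected Answer Column:{correct_answer}\n"
    "Evaluate this: {variant_output}"
)

# Presumably the evaluator fills the placeholders with the app prompt, the test-set
# inputs, the ground-truth value, and the generated output before querying the LLM.
filled_prompt = prompt_template.format(
    llm_app_prompt_template="What is the capital of {country}?",
    country="France",
    correct_answer="Paris",
    variant_output="Paris",
)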
@@ -128,10 +140,13 @@
"description": "https://your-webhook-url.com",
"required": True,
},
"correct_answer_keys": {
"label": "Correct Answer",
"default": ["correct_answer"],
"correct_answer_key": {
"label": "Expected Answer Column",
"default": "correct_answer",
"type": "string",
"advanced": True, # Tells the frontend that this setting is advanced and should be hidden by default
"ground_truth_key": True, # Tells the frontend that is the name of the column in the test set that should be shown as a ground truth to the user
"description": "The name of the column in the test data that contains the correct answer",
},
},
"description": "Webhook test evaluator sends the generated answer and the correct_answer to a webhook and expects a response indicating the correctness of the answer. You need to provide the URL of the webhook and the response of the webhook must be between 0 and 1.",
@@ -141,13 +156,17 @@
"key": "auto_starts_with",
"direct_use": False,
"settings_template": {
"label": "Single Model Testing Settings",
"description": "Checks if the output starts with the specified prefix.",
"prefix": {"label": "prefix", "type": "string", "required": True},
"prefix": {
"label": "prefix",
"type": "string",
"required": True,
"description": "The string to match at the start of the output.",
},
"case_sensitive": {
"label": "Case Sensitive",
"type": "boolean",
"default": True,
"description": "If the evaluation should be case sensitive.",
},
},
"description": "Starts With evaluator checks if the output starts with a specified prefix, considering case sensitivity based on the settings.",
@@ -157,8 +176,6 @@
"key": "auto_ends_with",
"direct_use": False,
"settings_template": {
"label": "Single Model Testing Settings",
"description": "Checks if the output ends with the specified suffix.",
"case_sensitive": {
"label": "Case Sensitive",
"type": "boolean",
@@ -179,8 +196,6 @@
"key": "auto_contains",
"direct_use": False,
"settings_template": {
"label": "Single Model Testing Settings",
"description": "Checks if the output contains the specified substring.",
"case_sensitive": {
"label": "Case Sensitive",
"type": "boolean",
@@ -201,8 +216,6 @@
"key": "auto_contains_any",
"direct_use": False,
"settings_template": {
"label": "Single Model Testing Settings",
"description": "Checks if the output contains any of the specified substrings.",
"case_sensitive": {
"label": "Case Sensitive",
"type": "boolean",
@@ -223,8 +236,6 @@
"key": "auto_contains_all",
"direct_use": False,
"settings_template": {
"label": "Single Model Testing Settings",
"description": "Checks if the output contains all of the specified substrings.",
"case_sensitive": {
"label": "Case Sensitive",
"type": "boolean",
@@ -245,13 +256,19 @@
"key": "auto_levenshtein_distance",
"direct_use": False,
"settings_template": {
"label": "Levenshtein Distance Settings",
"description": "Evaluates the Levenshtein distance between the output and the correct answer. If a threshold is specified, it checks if the distance is below this threshold and returns a boolean value. If no threshold is specified, it returns the numerical Levenshtein distance.",
"threshold": {"label": "Threshold", "type": "number", "required": False},
"correct_answer_keys": {
"label": "Correct Answer",
"default": ["correct_answer"],
"threshold": {
"label": "Threshold",
"type": "number",
"required": False,
"description": "The maximum allowed Levenshtein distance between the output and the correct answer.",
},
"correct_answer_key": {
"label": "Expected Answer Column",
"default": "correct_answer",
"type": "string",
"advanced": True, # Tells the frontend that this setting is advanced and should be hidden by default
"ground_truth_key": True, # Tells the frontend that is the name of the column in the test set that should be shown as a ground truth to the user
"description": "The name of the column in the test data that contains the correct answer",
},
},
"description": "This evaluator calculates the Levenshtein distance between the output and the correct answer. If a threshold is provided in the settings, it returns a boolean indicating whether the distance is within the threshold. If no threshold is provided, it returns the actual Levenshtein distance as a numerical value.",
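A minimal illustration of the threshold behaviour described for this evaluator; the distance function below is the textbook dynamic-programming version, not necessarily the backend's implementation.

def levenshtein(a: str, b: str) -> int:
    # Classic dynamic-programming edit distance over two strings.
    prev = list(range(len(b) + 1))
    for i, ca in enumerate(a, start=1):
        curr = [i]
        for j, cb in enumerate(b, start=1):
            curr.append(min(prev[j] + 1, curr[j - 1] + 1, prev[j - 1] + (ca != cb)))
        prev = curr
    return prev[-1]

def evaluate(output: str, correct_answer: str, threshold=None):
    distance = levenshtein(output, correct_answer)
    # With a threshold the result is a boolean; without one, the raw distance.
    return distance <= threshold if threshold is not None else distance

assert evaluate("kitten", "sitting") == 3
assert evaluate("kitten", "sitting", threshold=5) is True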
21 changes: 12 additions & 9 deletions agenta-backend/agenta_backend/services/evaluator_manager.py
@@ -141,19 +141,22 @@ async def create_ready_to_use_evaluators(app: AppDB):
Returns:
Nothing. The function works by side effect, modifying the database.
"""
evaluators = get_evaluators()

direct_use_evaluators = [
evaluator for evaluator in evaluators if evaluator.get("direct_use")
evaluator for evaluator in get_evaluators() if evaluator.get("direct_use")
]

for evaluator in direct_use_evaluators:
settings_values = {}
settings_template = evaluator.get("settings_template", {})
if "correct_answer_keys" in settings_template:
settings_values["correct_answer_keys"] = settings_template[
"correct_answer_keys"
].get("default", [])
settings_values = {
setting_name: setting.get("default")
for setting_name, setting in evaluator.get("settings_template", {}).items()
if setting.get("ground_truth_key") is True and setting.get("default", "")
}

for setting_name, default_value in settings_values.items():
assert (
default_value != ""
), f"Default value for ground truth key '{setting_name}' in Evaluator is empty"

await db_manager.create_evaluator_config(
app=app,
organization=app.organization if isCloudEE() else None, # noqa,
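To make the new seeding logic above concrete, here is a self-contained sketch of what the dict comprehension produces for a direct-use evaluator; the sample template mirrors the evaluators.py entries changed in this PR.

sample_evaluator = {
    "key": "auto_exact_match",
    "direct_use": True,
    "settings_template": {
        "correct_answer_key": {
            "label": "Expected Answer Column",
            "default": "correct_answer",
            "type": "string",
            "ground_truth_key": True,
        },
    },
}

# Only settings flagged as ground_truth_key with a non-empty default are seeded.
settings_values = {
    name: setting.get("default")
    for name, setting in sample_evaluator.get("settings_template", {}).items()
    if setting.get("ground_truth_key") is True and setting.get("default", "")
}
assert settings_values == {"correct_answer_key": "correct_answer"}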