Merge pull request #1711 from Agenta-AI/fix-all-columns
Fix evaluation PR
mmabrouk authored May 30, 2024
2 parents ffb3c2d + d285754 commit 2080680
Showing 12 changed files with 1,530 additions and 711 deletions.
95 changes: 56 additions & 39 deletions agenta-backend/agenta_backend/resources/evaluators/evaluators.py
@@ -4,12 +4,13 @@
"key": "auto_exact_match",
"direct_use": True,
"settings_template": {
"label": "Exact Match Settings",
"description": "Settings for the Exact Match evaluator",
"correct_answer_keys": {
"label": "Correct Answer",
"default": ["correct_answer"],
"type": "array",
"correct_answer_key": {
"label": "Expected Answer Column",
"default": "correct_answer",
"type": "string",
"advanced": True, # Tells the frontend that this setting is advanced and should be hidden by default
"ground_truth_key": True, # Tells the frontend that is the name of the column in the test set that should be shown as a ground truth to the user
"description": "The name of the column in the test data that contains the correct answer",
},
},
"description": "Exact Match evaluator determines if the output exactly matches the specified correct answer, ensuring precise alignment with expected results.",
@@ -18,10 +19,7 @@
"name": "Contains Json",
"key": "auto_contains_json",
"direct_use": True,
"settings_template": {
"label": "Single Model Testing Settings",
"description": "Checks if the JSON output contains the specified JSON structure.",
},
"settings_template": {},
"description": "Contains Json evaluator checks if the output contains the specified JSON structure.",
},
{
@@ -38,10 +36,13 @@
"max": 1,
"required": True,
},
"correct_answer_keys": {
"label": "Correct Answer",
"default": ["correct_answer"],
"type": "array",
"correct_answer_key": {
"label": "Expected Answer Column",
"default": "correct_answer",
"type": "string",
"advanced": True, # Tells the frontend that this setting is advanced and should be hidden by default
"ground_truth_key": True, # Tells the frontend that is the name of the column in the test set that should be shown as a ground truth to the user
"description": "The name of the column in the test data that contains the correct answer",
},
},
"description": "Similarity Match evaluator checks if the generated answer is similar to the expected answer. You need to provide the similarity threshold. It uses the Jaccard similarity to compare the answers.",
@@ -79,10 +80,13 @@
"description": "The name of the field in the JSON output that you wish to evaluate",
"required": True,
},
"correct_answer_keys": {
"label": "Correct Answer",
"default": ["correct_answer"],
"correct_answer_key": {
"label": "Expected Answer Column",
"default": "correct_answer",
"type": "string",
"advanced": True, # Tells the frontend that this setting is advanced and should be hidden by default
"ground_truth_key": True, # Tells the frontend that is the name of the column in the test set that should be shown as a ground truth to the user
"description": "The name of the column in the test data that contains the correct answer",
},
},
"description": "JSON Field Match evaluator compares specific fields within JSON (JavaScript Object Notation) data. This matching can involve finding similarities or correspondences between fields in different JSON objects.",
@@ -95,10 +99,18 @@
"prompt_template": {
"label": "Prompt Template",
"type": "text",
"default": "We have an LLM App that we want to evaluate its outputs. Based on the prompt and the parameters provided below evaluate the output based on the evaluation strategy below:\nEvaluation strategy: 0 to 10 0 is very bad and 10 is very good.\nPrompt: {llm_app_prompt_template}\nInputs: country: {country}\nCorrect Answer:{correct_answer}\nEvaluate this: {variant_output}\n\nAnswer ONLY with one of the given grading or evaluation options.",
"default": "We have an LLM App that we want to evaluate its outputs. Based on the prompt and the parameters provided below evaluate the output based on the evaluation strategy below:\nEvaluation strategy: 0 to 10 0 is very bad and 10 is very good.\nPrompt: {llm_app_prompt_template}\nInputs: country: {country}\nExpected Answer Column:{correct_answer}\nEvaluate this: {variant_output}\n\nAnswer ONLY with one of the given grading or evaluation options.",
"description": "Template for AI critique prompts",
"required": True,
}
},
"correct_answer_key": {
"label": "Expected Answer Column",
"default": "correct_answer",
"type": "string",
"advanced": True, # Tells the frontend that this setting is advanced and should be hidden by default
"ground_truth_key": True, # Tells the frontend that is the name of the column in the test set that should be shown as a ground truth to the user
"description": "The name of the column in the test data that contains the correct answer",
},
},
"description": "AI Critique evaluator sends the generated answer and the correct_answer to an LLM model and uses it to evaluate the correctness of the answer. You need to provide the evaluation prompt (or use the default prompt).",
},
@@ -128,10 +140,13 @@
"description": "https://your-webhook-url.com",
"required": True,
},
"correct_answer_keys": {
"label": "Correct Answer",
"default": ["correct_answer"],
"correct_answer_key": {
"label": "Expected Answer Column",
"default": "correct_answer",
"type": "string",
"advanced": True, # Tells the frontend that this setting is advanced and should be hidden by default
"ground_truth_key": True, # Tells the frontend that is the name of the column in the test set that should be shown as a ground truth to the user
"description": "The name of the column in the test data that contains the correct answer",
},
},
"description": "Webhook test evaluator sends the generated answer and the correct_answer to a webhook and expects a response indicating the correctness of the answer. You need to provide the URL of the webhook and the response of the webhook must be between 0 and 1.",
@@ -141,13 +156,17 @@
"key": "auto_starts_with",
"direct_use": False,
"settings_template": {
"label": "Single Model Testing Settings",
"description": "Checks if the output starts with the specified prefix.",
"prefix": {"label": "prefix", "type": "string", "required": True},
"prefix": {
"label": "prefix",
"type": "string",
"required": True,
"description": "The string to match at the start of the output.",
},
"case_sensitive": {
"label": "Case Sensitive",
"type": "boolean",
"default": True,
"description": "If the evaluation should be case sensitive.",
},
},
"description": "Starts With evaluator checks if the output starts with a specified prefix, considering case sensitivity based on the settings.",
@@ -157,8 +176,6 @@
"key": "auto_ends_with",
"direct_use": False,
"settings_template": {
"label": "Single Model Testing Settings",
"description": "Checks if the output ends with the specified suffix.",
"case_sensitive": {
"label": "Case Sensitive",
"type": "boolean",
@@ -179,8 +196,6 @@
"key": "auto_contains",
"direct_use": False,
"settings_template": {
"label": "Single Model Testing Settings",
"description": "Checks if the output contains the specified substring.",
"case_sensitive": {
"label": "Case Sensitive",
"type": "boolean",
@@ -201,8 +216,6 @@
"key": "auto_contains_any",
"direct_use": False,
"settings_template": {
"label": "Single Model Testing Settings",
"description": "Checks if the output contains any of the specified substrings.",
"case_sensitive": {
"label": "Case Sensitive",
"type": "boolean",
@@ -223,8 +236,6 @@
"key": "auto_contains_all",
"direct_use": False,
"settings_template": {
"label": "Single Model Testing Settings",
"description": "Checks if the output contains all of the specified substrings.",
"case_sensitive": {
"label": "Case Sensitive",
"type": "boolean",
@@ -245,13 +256,19 @@
"key": "auto_levenshtein_distance",
"direct_use": False,
"settings_template": {
"label": "Levenshtein Distance Settings",
"description": "Evaluates the Levenshtein distance between the output and the correct answer. If a threshold is specified, it checks if the distance is below this threshold and returns a boolean value. If no threshold is specified, it returns the numerical Levenshtein distance.",
"threshold": {"label": "Threshold", "type": "number", "required": False},
"correct_answer_keys": {
"label": "Correct Answer",
"default": ["correct_answer"],
"threshold": {
"label": "Threshold",
"type": "number",
"required": False,
"description": "The maximum allowed Levenshtein distance between the output and the correct answer.",
},
"correct_answer_key": {
"label": "Expected Answer Column",
"default": "correct_answer",
"type": "string",
"advanced": True, # Tells the frontend that this setting is advanced and should be hidden by default
"ground_truth_key": True, # Tells the frontend that is the name of the column in the test set that should be shown as a ground truth to the user
"description": "The name of the column in the test data that contains the correct answer",
},
},
"description": "This evaluator calculates the Levenshtein distance between the output and the correct answer. If a threshold is provided in the settings, it returns a boolean indicating whether the distance is within the threshold. If no threshold is provided, it returns the actual Levenshtein distance as a numerical value.",
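Across the hunks above, the array-valued correct_answer_keys setting is replaced by a single correct_answer_key string flagged with advanced and ground_truth_key. A short sketch of how a consumer of this schema might resolve the expected answer from a test-set row (the helper name and sample data are assumptions for illustration, not part of this commit):

from typing import Optional

def resolve_correct_answer(data_point: dict, settings_values: dict) -> Optional[str]:
    # The setting stores the *name* of the test-set column holding the ground truth.
    column = settings_values.get("correct_answer_key", "correct_answer")
    return data_point.get(column)

row = {"country": "France", "correct_answer": "Paris"}
settings = {"correct_answer_key": "correct_answer", "similarity_threshold": 0.5}
assert resolve_correct_answer(row, settings) == "Paris"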
21 changes: 12 additions & 9 deletions agenta-backend/agenta_backend/services/evaluator_manager.py
@@ -141,19 +141,22 @@ async def create_ready_to_use_evaluators(app: AppDB):
Returns:
Nothing. The function works by side effect, modifying the database.
"""
evaluators = get_evaluators()

direct_use_evaluators = [
evaluator for evaluator in evaluators if evaluator.get("direct_use")
evaluator for evaluator in get_evaluators() if evaluator.get("direct_use")
]

for evaluator in direct_use_evaluators:
settings_values = {}
settings_template = evaluator.get("settings_template", {})
if "correct_answer_keys" in settings_template:
settings_values["correct_answer_keys"] = settings_template[
"correct_answer_keys"
].get("default", [])
settings_values = {
setting_name: setting.get("default")
for setting_name, setting in evaluator.get("settings_template", {}).items()
if setting.get("ground_truth_key") is True and setting.get("default", "")
}

for setting_name, default_value in settings_values.items():
assert (
default_value != ""
), f"Default value for ground truth key '{setting_name}' in Evaluator is empty"

await db_manager.create_evaluator_config(
app=app,
organization=app.organization if isCloudEE() else None, # noqa,
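The rewritten loop above seeds each direct-use evaluator's configuration with the defaults of any setting flagged as a ground-truth key, then asserts that none of those defaults is empty. Applied to the exact-match template from evaluators.py, the comprehension reduces to a single entry (illustration only):

settings_template = {
    "correct_answer_key": {
        "label": "Expected Answer Column",
        "default": "correct_answer",
        "type": "string",
        "advanced": True,
        "ground_truth_key": True,
    },
}

settings_values = {
    setting_name: setting.get("default")
    for setting_name, setting in settings_template.items()
    if setting.get("ground_truth_key") is True and setting.get("default", "")
}
print(settings_values)  # {'correct_answer_key': 'correct_answer'}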