Merge pull request #1711 from Agenta-AI/fix-all-columns
Fix evaluation PR
mmabrouk authored May 30, 2024
2 parents ffb3c2d + d285754 commit 2080680
Showing 12 changed files with 1,530 additions and 711 deletions.
95 changes: 56 additions & 39 deletions agenta-backend/agenta_backend/resources/evaluators/evaluators.py
@@ -4,12 +4,13 @@
"key": "auto_exact_match",
"direct_use": True,
"settings_template": {
"label": "Exact Match Settings",
"description": "Settings for the Exact Match evaluator",
"correct_answer_keys": {
"label": "Correct Answer",
"default": ["correct_answer"],
"type": "array",
"correct_answer_key": {
"label": "Expected Answer Column",
"default": "correct_answer",
"type": "string",
"advanced": True, # Tells the frontend that this setting is advanced and should be hidden by default
"ground_truth_key": True, # Tells the frontend that is the name of the column in the test set that should be shown as a ground truth to the user
"description": "The name of the column in the test data that contains the correct answer",
},
},
"description": "Exact Match evaluator determines if the output exactly matches the specified correct answer, ensuring precise alignment with expected results.",
@@ -18,10 +19,7 @@
"name": "Contains Json",
"key": "auto_contains_json",
"direct_use": True,
"settings_template": {
"label": "Single Model Testing Settings",
"description": "Checks if the JSON output contains the specified JSON structure.",
},
"settings_template": {},
"description": "Contains Json evaluator checks if the output contains the specified JSON structure.",
},
{
@@ -38,10 +36,13 @@
"max": 1,
"required": True,
},
"correct_answer_keys": {
"label": "Correct Answer",
"default": ["correct_answer"],
"type": "array",
"correct_answer_key": {
"label": "Expected Answer Column",
"default": "correct_answer",
"type": "string",
"advanced": True, # Tells the frontend that this setting is advanced and should be hidden by default
"ground_truth_key": True, # Tells the frontend that is the name of the column in the test set that should be shown as a ground truth to the user
"description": "The name of the column in the test data that contains the correct answer",
},
},
"description": "Similarity Match evaluator checks if the generated answer is similar to the expected answer. You need to provide the similarity threshold. It uses the Jaccard similarity to compare the answers.",
@@ -79,10 +80,13 @@
"description": "The name of the field in the JSON output that you wish to evaluate",
"required": True,
},
"correct_answer_keys": {
"label": "Correct Answer",
"default": ["correct_answer"],
"correct_answer_key": {
"label": "Expected Answer Column",
"default": "correct_answer",
"type": "string",
"advanced": True, # Tells the frontend that this setting is advanced and should be hidden by default
"ground_truth_key": True, # Tells the frontend that is the name of the column in the test set that should be shown as a ground truth to the user
"description": "The name of the column in the test data that contains the correct answer",
},
},
"description": "JSON Field Match evaluator compares specific fields within JSON (JavaScript Object Notation) data. This matching can involve finding similarities or correspondences between fields in different JSON objects.",
@@ -95,10 +99,18 @@
"prompt_template": {
"label": "Prompt Template",
"type": "text",
"default": "We have an LLM App that we want to evaluate its outputs. Based on the prompt and the parameters provided below evaluate the output based on the evaluation strategy below:\nEvaluation strategy: 0 to 10 0 is very bad and 10 is very good.\nPrompt: {llm_app_prompt_template}\nInputs: country: {country}\nCorrect Answer:{correct_answer}\nEvaluate this: {variant_output}\n\nAnswer ONLY with one of the given grading or evaluation options.",
"default": "We have an LLM App that we want to evaluate its outputs. Based on the prompt and the parameters provided below evaluate the output based on the evaluation strategy below:\nEvaluation strategy: 0 to 10 0 is very bad and 10 is very good.\nPrompt: {llm_app_prompt_template}\nInputs: country: {country}\nExpected Answer Column:{correct_answer}\nEvaluate this: {variant_output}\n\nAnswer ONLY with one of the given grading or evaluation options.",
"description": "Template for AI critique prompts",
"required": True,
}
},
"correct_answer_key": {
"label": "Expected Answer Column",
"default": "correct_answer",
"type": "string",
"advanced": True, # Tells the frontend that this setting is advanced and should be hidden by default
"ground_truth_key": True, # Tells the frontend that is the name of the column in the test set that should be shown as a ground truth to the user
"description": "The name of the column in the test data that contains the correct answer",
},
},
"description": "AI Critique evaluator sends the generated answer and the correct_answer to an LLM model and uses it to evaluate the correctness of the answer. You need to provide the evaluation prompt (or use the default prompt).",
},
@@ -128,10 +140,13 @@
"description": "https://your-webhook-url.com",
"required": True,
},
"correct_answer_keys": {
"label": "Correct Answer",
"default": ["correct_answer"],
"correct_answer_key": {
"label": "Expected Answer Column",
"default": "correct_answer",
"type": "string",
"advanced": True, # Tells the frontend that this setting is advanced and should be hidden by default
"ground_truth_key": True, # Tells the frontend that is the name of the column in the test set that should be shown as a ground truth to the user
"description": "The name of the column in the test data that contains the correct answer",
},
},
"description": "Webhook test evaluator sends the generated answer and the correct_answer to a webhook and expects a response indicating the correctness of the answer. You need to provide the URL of the webhook and the response of the webhook must be between 0 and 1.",
@@ -141,13 +156,17 @@
"key": "auto_starts_with",
"direct_use": False,
"settings_template": {
"label": "Single Model Testing Settings",
"description": "Checks if the output starts with the specified prefix.",
"prefix": {"label": "prefix", "type": "string", "required": True},
"prefix": {
"label": "prefix",
"type": "string",
"required": True,
"description": "The string to match at the start of the output.",
},
"case_sensitive": {
"label": "Case Sensitive",
"type": "boolean",
"default": True,
"description": "If the evaluation should be case sensitive.",
},
},
"description": "Starts With evaluator checks if the output starts with a specified prefix, considering case sensitivity based on the settings.",
@@ -157,8 +176,6 @@
"key": "auto_ends_with",
"direct_use": False,
"settings_template": {
"label": "Single Model Testing Settings",
"description": "Checks if the output ends with the specified suffix.",
"case_sensitive": {
"label": "Case Sensitive",
"type": "boolean",
@@ -179,8 +196,6 @@
"key": "auto_contains",
"direct_use": False,
"settings_template": {
"label": "Single Model Testing Settings",
"description": "Checks if the output contains the specified substring.",
"case_sensitive": {
"label": "Case Sensitive",
"type": "boolean",
@@ -201,8 +216,6 @@
"key": "auto_contains_any",
"direct_use": False,
"settings_template": {
"label": "Single Model Testing Settings",
"description": "Checks if the output contains any of the specified substrings.",
"case_sensitive": {
"label": "Case Sensitive",
"type": "boolean",
@@ -223,8 +236,6 @@
"key": "auto_contains_all",
"direct_use": False,
"settings_template": {
"label": "Single Model Testing Settings",
"description": "Checks if the output contains all of the specified substrings.",
"case_sensitive": {
"label": "Case Sensitive",
"type": "boolean",
@@ -245,13 +256,19 @@
"key": "auto_levenshtein_distance",
"direct_use": False,
"settings_template": {
"label": "Levenshtein Distance Settings",
"description": "Evaluates the Levenshtein distance between the output and the correct answer. If a threshold is specified, it checks if the distance is below this threshold and returns a boolean value. If no threshold is specified, it returns the numerical Levenshtein distance.",
"threshold": {"label": "Threshold", "type": "number", "required": False},
"correct_answer_keys": {
"label": "Correct Answer",
"default": ["correct_answer"],
"threshold": {
"label": "Threshold",
"type": "number",
"required": False,
"description": "The maximum allowed Levenshtein distance between the output and the correct answer.",
},
"correct_answer_key": {
"label": "Expected Answer Column",
"default": "correct_answer",
"type": "string",
"advanced": True, # Tells the frontend that this setting is advanced and should be hidden by default
"ground_truth_key": True, # Tells the frontend that is the name of the column in the test set that should be shown as a ground truth to the user
"description": "The name of the column in the test data that contains the correct answer",
},
},
"description": "This evaluator calculates the Levenshtein distance between the output and the correct answer. If a threshold is provided in the settings, it returns a boolean indicating whether the distance is within the threshold. If no threshold is provided, it returns the actual Levenshtein distance as a numerical value.",
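Across the hunks above, the array-valued correct_answer_keys setting is replaced by a single correct_answer_key string flagged with advanced and ground_truth_key. A short sketch of how a consumer of this schema might resolve the expected answer from a test-set row (the helper name and sample data are assumptions for illustration, not part of this commit):

from typing import Optional

def resolve_correct_answer(data_point: dict, settings_values: dict) -> Optional[str]:
    # The setting stores the *name* of the test-set column holding the ground truth.
    column = settings_values.get("correct_answer_key", "correct_answer")
    return data_point.get(column)

row = {"country": "France", "correct_answer": "Paris"}
settings = {"correct_answer_key": "correct_answer", "similarity_threshold": 0.5}
assert resolve_correct_answer(row, settings) == "Paris"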
21 changes: 12 additions & 9 deletions agenta-backend/agenta_backend/services/evaluator_manager.py
@@ -141,19 +141,22 @@ async def create_ready_to_use_evaluators(app: AppDB):
Returns:
Nothing. The function works by side effect, modifying the database.
"""
evaluators = get_evaluators()

direct_use_evaluators = [
evaluator for evaluator in evaluators if evaluator.get("direct_use")
evaluator for evaluator in get_evaluators() if evaluator.get("direct_use")
]

for evaluator in direct_use_evaluators:
settings_values = {}
settings_template = evaluator.get("settings_template", {})
if "correct_answer_keys" in settings_template:
settings_values["correct_answer_keys"] = settings_template[
"correct_answer_keys"
].get("default", [])
settings_values = {
setting_name: setting.get("default")
for setting_name, setting in evaluator.get("settings_template", {}).items()
if setting.get("ground_truth_key") is True and setting.get("default", "")
}

for setting_name, default_value in settings_values.items():
assert (
default_value != ""
), f"Default value for ground truth key '{setting_name}' in Evaluator is empty"

await db_manager.create_evaluator_config(
app=app,
organization=app.organization if isCloudEE() else None, # noqa,
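The rewritten loop above seeds each direct-use evaluator's configuration with the defaults of any setting flagged as a ground-truth key, then asserts that none of those defaults is empty. Applied to the exact-match template from evaluators.py, the comprehension reduces to a single entry (illustration only):

settings_template = {
    "correct_answer_key": {
        "label": "Expected Answer Column",
        "default": "correct_answer",
        "type": "string",
        "advanced": True,
        "ground_truth_key": True,
    },
}

settings_values = {
    setting_name: setting.get("default")
    for setting_name, setting in settings_template.items()
    if setting.get("ground_truth_key") is True and setting.get("default", "")
}
print(settings_values)  # {'correct_answer_key': 'correct_answer'}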