Evaluators can access all columns #1606

Merged
merged 76 commits into main from access-to-all-columns on May 31, 2024
Changes from 5 commits
Commits
7145533
access to all columns
aakrem May 2, 2024
d3d3412
fix correct answer in ai critique
aakrem May 2, 2024
e4a481f
fix tests
aakrem May 2, 2024
6c09e6e
add correct answer to evaluators
aakrem May 3, 2024
2a62d29
get correct answer from evaluator instead of evaluation payload
aakrem May 3, 2024
ae58b0e
access correct answer directly in the evaluators and handle passing m…
aakrem May 3, 2024
d54f20d
add default value for correct_answer and small renaming
aakrem May 3, 2024
c9c212a
adjust schema
aakrem May 9, 2024
be1ee39
add as many correct answer columns as there are in an ES
aakrem May 9, 2024
e18c3a4
adjust correct answer type in frontend
aakrem May 9, 2024
ed2d413
small build fix
aakrem May 9, 2024
2b131d2
fix build
aakrem May 9, 2024
519e99d
handle multiple correct answers
aakrem May 9, 2024
68f626e
revert to single ground truth
aakrem May 9, 2024
ac6bf19
toggle correct answer input visibility
bekossy May 9, 2024
d2e1a36
rename correct_answer to value
aakrem May 9, 2024
07e0e86
migration script
aakrem May 9, 2024
fc30f52
added antd collapse to toggle correct_answer input
bekossy May 9, 2024
7e24866
fix evaluators tests
aakrem May 9, 2024
bab7582
select ground truth to apply diff in eval scenario view
bekossy May 9, 2024
0f67bd3
display only unique correct_answers
aakrem May 9, 2024
c01f6b2
filtered out duplicate keys from correctAnswer array
bekossy May 10, 2024
cb3b08f
added filtercolumns component and improve table headername display
bekossy May 10, 2024
725cf8d
Merge branch 'main' into access-to-all-columns
bekossy May 10, 2024
00529d0
bug fix
bekossy May 10, 2024
398dfe8
bug fix
bekossy May 10, 2024
89faad9
added dropdown diff and cleanup
bekossy May 11, 2024
14e5eb4
made static onClick prop dynamic and improve diff feature
bekossy May 12, 2024
e152c51
added ground truth column to comparison view and improved diff feature
bekossy May 13, 2024
0495376
fixed correct answer output
bekossy May 13, 2024
d879572
added helper to remove correctAnswer prefix and improved dropdown def…
bekossy May 14, 2024
3ebabb3
rename variable
aakrem May 14, 2024
975b01f
Merge pull request #1645 from Agenta-AI/sub-issue/-improve-eval-compa…
aakrem May 14, 2024
fc8ae62
improved diff button text
bekossy May 15, 2024
90f7380
access to all columns
aakrem May 2, 2024
34f2436
Merge branch 'main' into access-to-all-columns
aakrem May 15, 2024
14c8ee2
small refactor for correct answers logic
aakrem May 15, 2024
7424be3
fix errors type
aakrem May 15, 2024
95b82ef
convert correct_answer_keys to list
aakrem May 16, 2024
1d3d9d8
improve type
aakrem May 16, 2024
87c1d81
access to all columns
aakrem May 2, 2024
a706a69
Merge branch 'main' into access-to-all-columns
aakrem May 17, 2024
b5300df
add default correct answer in case its not provided
aakrem May 17, 2024
1002198
advanced settings in a separate component
aakrem May 17, 2024
7624354
bug fix
bekossy May 18, 2024
f4e2d47
Merge pull request #1665 from Agenta-AI/access-to-all-columns-advance…
aakrem May 19, 2024
f610819
fix backend tests
aakrem May 19, 2024
0aa62e1
create direct_use evaluators with default correct answers
aakrem May 19, 2024
27f8a0a
remove not needed code
aakrem May 19, 2024
dc8ed51
Add condition to evaluator card to show action buttons when direct_us…
bekossy May 19, 2024
ffb3c2d
filtered out evaluators when direct_use is true or settings_template …
bekossy May 20, 2024
5811770
Modify the evaluator definition for correct answer key
mmabrouk May 28, 2024
ca309b5
Refactored the evaluator service to use specific correct_answers
mmabrouk May 28, 2024
1f14f0e
Show the advanced settings under a hidden collapse
mmabrouk May 28, 2024
7e11adc
Made the code more secure by removing the global().get which would al…
mmabrouk May 28, 2024
81f9b12
Improved the logic to use a correct_answer as a ground truth column i…
mmabrouk May 28, 2024
20d54ab
rewrote logic for creating ready to use evaluators
mmabrouk May 28, 2024
e6dcfd3
Allow editing ready to use evaluators
mmabrouk May 28, 2024
9faa400
allow the addition of ready to use evaluators
mmabrouk May 28, 2024
df16e88
Fixed evaluators definition
mmabrouk May 28, 2024
87b0c2f
minor fix
mmabrouk May 28, 2024
a0d0420
updated pyproject
mmabrouk May 28, 2024
1a6e4ce
Added auto similarity
mmabrouk May 28, 2024
6b44f5c
formatting
mmabrouk May 28, 2024
3fba85d
updated docker
mmabrouk May 28, 2024
111c2e5
fix levenshtein test
mmabrouk May 28, 2024
7b69bec
t
mmabrouk May 28, 2024
31d7c88
fix the test
mmabrouk May 28, 2024
ea6919b
improved tests
mmabrouk May 28, 2024
f5ca784
remove comment
mmabrouk May 29, 2024
c78856a
improved label
mmabrouk May 29, 2024
7bffe15
fixed correct_answer_key payload
bekossy May 29, 2024
d285754
cleanup
bekossy May 29, 2024
2080680
Merge pull request #1711 from Agenta-AI/fix-all-columns
mmabrouk May 30, 2024
d6132bc
Merge branch 'main' into access-to-all-columns
mmabrouk May 31, 2024
b9dde52
update lock
mmabrouk May 31, 2024
45 changes: 24 additions & 21 deletions agenta-backend/agenta_backend/resources/evaluators/evaluators.py
@@ -2,11 +2,16 @@
{
"name": "Exact Match",
"key": "auto_exact_match",
"direct_use": True,
"direct_use": False,
"settings_template": {
"label": "Exact Match Settings",
"description": "Settings for the Exact Match evaluator",
"correct_answer": {
"label": "Correct Answer",
"type": "string",
},
},
"description": "Exact Match evaluator determines if the output exactly matches the specified correct answer, ensuring precise alignment with expected results.",
},
{
"name": "Contains Json",
@@ -31,7 +36,11 @@
"min": 0,
"max": 1,
"required": True,
}
},
"correct_answer": {
"label": "Correct Answer",
"type": "string",
},
},
"description": "Similarity Match evaluator checks if the generated answer is similar to the expected answer. You need to provide the similarity threshold. It uses the Jaccard similarity to compare the answers.",
},
@@ -67,7 +76,11 @@
"default": "",
"description": "The name of the field in the JSON output that you wish to evaluate",
"required": True,
}
},
"correct_answer": {
"label": "Correct Answer",
"type": "string",
},
},
"description": "JSON Field Match evaluator compares specific fields within JSON (JavaScript Object Notation) data. This matching can involve finding similarities or correspondences between fields in different JSON objects.",
},
@@ -112,27 +125,13 @@
"description": "https://your-webhook-url.com",
"required": True,
},
"correct_answer": {
"label": "Correct Answer",
"type": "string",
},
},
"description": "Webhook test evaluator sends the generated answer and the correct_answer to a webhook and expects a response indicating the correctness of the answer. You need to provide the URL of the webhook and the response of the webhook must be between 0 and 1.",
},
{
"name": "A/B Test",
"key": "human_a_b_testing",
"direct_use": False,
"settings_template": {
"label": "A/B Testing Settings",
"description": "Settings for A/B testing configurations",
},
},
{
"name": "Single Model Test",
"key": "human_single_model_test",
"direct_use": False,
"settings_template": {
"label": "Single Model Testing Settings",
"description": "Settings for single model testing configurations",
},
},
{
"name": "Starts With",
"key": "auto_starts_with",
@@ -245,6 +244,10 @@
"label": "Levenshtein Distance Settings",
"description": "Evaluates the Levenshtein distance between the output and the correct answer. If a threshold is specified, it checks if the distance is below this threshold and returns a boolean value. If no threshold is specified, it returns the numerical Levenshtein distance.",
"threshold": {"label": "Threshold", "type": "number", "required": False},
"correct_answer": {
"label": "Correct Answer",
"type": "string",
},
},
"description": "This evaluator calculates the Levenshtein distance between the output and the correct answer. If a threshold is provided in the settings, it returns a boolean indicating whether the distance is within the threshold. If no threshold is provided, it returns the actual Levenshtein distance as a numerical value.",
},
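
The recurring change in this file is that each auto evaluator which compares against a ground truth now declares a correct_answer entry in its settings_template, so the user can point it at any testset column. A minimal sketch of such an entry after the change, trimmed for illustration rather than copied verbatim from the file:

{
    "name": "Exact Match",
    "key": "auto_exact_match",
    "direct_use": False,
    "settings_template": {
        "label": "Exact Match Settings",
        "description": "Settings for the Exact Match evaluator",
        # New setting: names the testset column that holds the ground truth.
        "correct_answer": {
            "label": "Correct Answer",
            "type": "string",
        },
    },
},
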
6 changes: 0 additions & 6 deletions agenta-backend/agenta_backend/routers/evaluation_router.py
@@ -119,11 +119,6 @@ async def create_evaluation(
return response

evaluations = []
correct_answer_column = (
"correct_answer"
if payload.correct_answer_column is None
else payload.correct_answer_column
)

for variant_id in payload.variant_ids:
evaluation = await evaluation_service.create_new_evaluation(
@@ -141,7 +136,6 @@
evaluation_id=evaluation.id,
rate_limit_config=payload.rate_limit.dict(),
lm_providers_keys=payload.lm_providers_keys,
correct_answer_column=correct_answer_column,
)
evaluations.append(evaluation)

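
With this change the router no longer computes a fallback correct_answer_column and no longer forwards it to the evaluation task; the ground-truth column name travels inside each evaluator configuration instead. A rough sketch of the idea, not actual router code, with placeholder values:

# Before this PR (sketch): one column name was shared by the whole evaluation.
payload_before = {"correct_answer_column": "correct_answer"}  # field removed by this PR
# After this PR (sketch): each evaluator config names its own ground-truth column.
evaluator_settings_after = {"correct_answer": "correct_answer"}
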
68 changes: 43 additions & 25 deletions agenta-backend/agenta_backend/services/evaluators_service.py
@@ -1,7 +1,7 @@
import re
import json
import httpx
from typing import Any, Dict, Tuple
from typing import Any, Dict, Tuple, List

from agenta_backend.services.security import sandbox
from agenta_backend.models.db_models import Error, Result
@@ -18,13 +18,14 @@
def auto_exact_match(
inputs: Dict[str, Any],
output: str,
correct_answer: str,
data_point: Dict[str, Any],
correct_answer_key: str,
app_params: Dict[str, Any],
settings_values: Dict[str, Any],
lm_providers_keys: Dict[str, Any],
) -> Result:
try:
exact_match = True if output == correct_answer else False
exact_match = True if output == data_point[correct_answer_key] else False
result = Result(type="bool", value=exact_match)
return result
except Exception as e:
@@ -40,14 +41,15 @@ def auto_exact_match(
def auto_similarity_match(
inputs: Dict[str, Any],
output: str,
correct_answer: str,
data_point: Dict[str, Any],
correct_answer_key: str,
app_params: Dict[str, Any],
settings_values: Dict[str, Any],
lm_providers_keys: Dict[str, Any],
) -> Result:
try:
set1 = set(output.split())
set2 = set(correct_answer.split())
set2 = set(data_point[correct_answer_key].split())
intersect = set1.intersection(set2)
union = set1.union(set2)

@@ -72,7 +74,8 @@ def auto_similarity_match(
def auto_regex_test(
inputs: Dict[str, Any],
output: str,
correct_answer: str,
data_point: Dict[str, Any],
correct_answer_key: str,
app_params: Dict[str, Any],
settings_values: Dict[str, Any],
lm_providers_keys: Dict[str, Any],
@@ -96,14 +99,17 @@ def auto_regex_test(
def field_match_test(
inputs: Dict[str, Any],
output: str,
correct_answer: str,
data_point: Dict[str, Any],
correct_answer_key: str,
app_params: Dict[str, Any],
settings_values: Dict[str, Any],
lm_providers_keys: Dict[str, Any],
) -> Result:
try:
output_json = json.loads(output)
result = output_json[settings_values["json_field"]] == correct_answer
result = (
output_json[settings_values["json_field"]] == data_point[correct_answer_key]
)
return Result(type="bool", value=result)
except Exception as e:
logging.debug("Field Match Test Failed because of Error: " + str(e))
@@ -113,15 +119,16 @@ def field_match_test(
def auto_webhook_test(
inputs: Dict[str, Any],
output: str,
correct_answer: str,
data_point: Dict[str, Any],
correct_answer_key: str,
app_params: Dict[str, Any],
settings_values: Dict[str, Any],
lm_providers_keys: Dict[str, Any],
) -> Result:
try:
with httpx.Client() as client:
payload = {
"correct_answer": correct_answer,
"correct_answer": data_point[correct_answer_key],
"output": output,
"inputs": inputs,
}
@@ -168,7 +175,8 @@ def auto_webhook_test(
def auto_custom_code_run(
inputs: Dict[str, Any],
output: str,
correct_answer: str,
data_point: Dict[str, Any],
correct_answer_key: str,
app_params: Dict[str, Any],
settings_values: Dict[str, Any],
lm_providers_keys: Dict[str, Any],
@@ -178,7 +186,7 @@
app_params=app_params,
inputs=inputs,
output=output,
correct_answer=correct_answer,
data_point=data_point,
code=settings_values["code"],
)
return Result(type="number", value=result)
@@ -195,7 +203,8 @@ def auto_custom_code_run(
def auto_ai_critique(
inputs: Dict[str, Any],
output: str,
correct_answer: str,
data_point: Dict[str, Any],
correct_answer_key: str,
app_params: Dict[str, Any],
settings_values: Dict[str, Any],
lm_providers_keys: Dict[str, Any],
@@ -206,7 +215,7 @@
Args:
inputs (Dict[str, Any]): Input parameters for the LLM app variant.
output (str): The output of the LLM app variant.
correct_answer (str): Correct answer for evaluation.
correct_answer_key (str): The key name of the correct answer in the datapoint.
app_params (Dict[str, Any]): Application parameters.
settings_values (Dict[str, Any]): Settings for the evaluation.
lm_providers_keys (Dict[str, Any]): Keys for language model providers.
@@ -224,7 +233,7 @@ def auto_ai_critique(
chain_run_args = {
"llm_app_prompt_template": app_params.get("prompt_user", ""),
"variant_output": output,
"correct_answer": correct_answer,
"correct_answer": data_point[correct_answer_key],
}

for key, value in inputs.items():
@@ -252,7 +261,8 @@
def auto_starts_with(
inputs: Dict[str, Any],
output: str,
correct_answer: str,
data_point: Dict[str, Any],
correct_answer_key: str,
app_params: Dict[str, Any],
settings_values: Dict[str, Any],
lm_providers_keys: Dict[str, Any],
@@ -280,7 +290,8 @@ def auto_starts_with(
def auto_ends_with(
inputs: Dict[str, Any],
output: str,
correct_answer: str,
data_point: Dict[str, Any],
correct_answer_key: str,
app_params: Dict[str, Any],
settings_values: Dict[str, Any],
lm_providers_keys: Dict[str, Any],
@@ -306,7 +317,8 @@ def auto_ends_with(
def auto_contains(
inputs: Dict[str, Any],
output: str,
correct_answer: str,
data_point: Dict[str, Any],
correct_answer_key: str,
app_params: Dict[str, Any],
settings_values: Dict[str, Any],
lm_providers_keys: Dict[str, Any],
@@ -332,7 +344,8 @@ def auto_contains(
def auto_contains_any(
inputs: Dict[str, Any],
output: str,
correct_answer: str,
data_point: Dict[str, Any],
correct_answer_key: str,
app_params: Dict[str, Any],
settings_values: Dict[str, Any],
lm_providers_keys: Dict[str, Any],
@@ -363,7 +376,8 @@ def auto_contains_any(
def auto_contains_all(
inputs: Dict[str, Any],
output: str,
correct_answer: str,
data_point: Dict[str, Any],
correct_answer_key: str,
app_params: Dict[str, Any],
settings_values: Dict[str, Any],
lm_providers_keys: Dict[str, Any],
@@ -394,7 +408,8 @@ def auto_contains_all(
def auto_contains_json(
inputs: Dict[str, Any],
output: str,
correct_answer: str,
data_point: Dict[str, Any],
correct_answer_key: str,
app_params: Dict[str, Any],
settings_values: Dict[str, Any],
lm_providers_keys: Dict[str, Any],
@@ -444,13 +459,14 @@ def levenshtein_distance(s1, s2):
def auto_levenshtein_distance(
inputs: Dict[str, Any],
output: str,
correct_answer: str,
data_point: Dict[str, Any],
correct_answer_key: str,
app_params: Dict[str, Any],
settings_values: Dict[str, Any],
lm_providers_keys: Dict[str, Any],
) -> Result:
try:
distance = levenshtein_distance(output, correct_answer)
distance = levenshtein_distance(output, data_point[correct_answer_key])

if "threshold" in settings_values:
threshold = settings_values["threshold"]
@@ -474,7 +490,8 @@ def evaluate(
evaluator_key: str,
inputs: Dict[str, Any],
output: str,
correct_answer: str,
data_point: Dict[str, Any],
correct_answer_key: str,
app_params: Dict[str, Any],
settings_values: Dict[str, Any],
lm_providers_keys: Dict[str, Any],
@@ -486,7 +503,8 @@
return evaluation_function(
inputs,
output,
correct_answer,
data_point,
correct_answer_key,
app_params,
settings_values,
lm_providers_keys,
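
Summing up the service changes: every evaluator function, and the evaluate dispatcher, now receives the whole data_point plus a correct_answer_key naming the ground-truth column, instead of a pre-extracted correct_answer string. A hypothetical call against the new signature, with invented column names and values:

from agenta_backend.services.evaluators_service import evaluate  # module shown in this diff

# The data point is passed whole; the evaluator reads the column named by correct_answer_key.
data_point = {
    "country": "France",        # a regular testset input column (illustrative)
    "correct_answer": "Paris",  # the ground-truth column (illustrative)
}

result = evaluate(
    evaluator_key="auto_exact_match",
    inputs={"country": "France"},
    output="Paris",
    data_point=data_point,
    correct_answer_key="correct_answer",
    app_params={},
    settings_values={},
    lm_providers_keys={},
)
# With the diff above, auto_exact_match compares output to
# data_point["correct_answer"] and returns Result(type="bool", value=True).
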
9 changes: 6 additions & 3 deletions agenta-backend/agenta_backend/tasks/evaluations.py
@@ -71,7 +71,6 @@ def evaluate(
evaluation_id: str,
rate_limit_config: Dict[str, int],
lm_providers_keys: Dict[str, Any],
correct_answer_column: str,
):
"""
Evaluate function that performs the evaluation of an app variant using the provided evaluators and testset.
@@ -214,14 +213,18 @@
continue

# 3. We evaluate
evaluators_results: [EvaluationScenarioResult] = []
evaluators_results: List[EvaluationScenarioResult] = []
for evaluator_config_db in evaluator_config_dbs:
logger.debug(f"Evaluating with evaluator: {evaluator_config_db}")
correct_answer_column = evaluator_config_db.settings_values.get(
"correct_answer"
)
if correct_answer_column in data_point:
result = evaluators_service.evaluate(
evaluator_key=evaluator_config_db.evaluator_key,
output=app_output.result.value,
correct_answer=data_point[correct_answer_column],
data_point=data_point,
correct_answer_key=correct_answer_column,
settings_values=evaluator_config_db.settings_values,
app_params=app_variant_parameters,
inputs=data_point,
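
This loop is what makes the PR title literal: each evaluator configuration resolves its own ground-truth column from its settings, and a scenario is only evaluated when its data point actually contains that column. A simplified sketch under the assumption that configs are plain dicts (the real code reads evaluator_config_db.settings_values; column names and values are invented):

evaluator_configs = [
    {"evaluator_key": "auto_exact_match",
     "settings_values": {"correct_answer": "correct_answer"}},
    {"evaluator_key": "auto_similarity_match",
     "settings_values": {"correct_answer": "reference_answer"}},  # other settings omitted
]
data_point = {"country": "France", "correct_answer": "Paris", "reference_answer": "Paris, France"}

for config in evaluator_configs:
    correct_answer_column = config["settings_values"].get("correct_answer")
    if correct_answer_column in data_point:  # skip scenarios missing the column
        # evaluators_service.evaluate(..., data_point=data_point,
        #                             correct_answer_key=correct_answer_column, ...)
        pass
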