Commit 7731157

Merge branch 'evaluations-in-backend' of https://github.com/Agenta-AI/agenta into evaluations-in-backend

MohammedMaaz committed Jan 2, 2024
2 parents: c76cc3d + 7a32360
Showing 4 changed files with 137 additions and 90 deletions.
@@ -45,7 +45,7 @@ class EvaluationStatusEnum(str, Enum):
     EVALUATION_INITIALIZED = "EVALUATION_INITIALIZED"
     EVALUATION_STARTED = "EVALUATION_STARTED"
     EVALUATION_FINISHED = "EVALUATION_FINISHED"
-    EVALUATION_ERROR = "EVALUATION_ERROR"
+    EVALUATION_FAILED = "EVALUATION_FAILED"


 class EvaluationScenarioStatusEnum(str, Enum):
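The backend now reports failures as EVALUATION_FAILED. Elsewhere in this commit the status is written as a raw string; the minimal sketch below shows how the renamed member behaves, with the enum redeclared locally because its module path is not shown in this diff.

# Sketch only: the enum is redeclared here since its module path does not
# appear in this diff; the values mirror EvaluationStatusEnum above.
from enum import Enum


class EvaluationStatusEnum(str, Enum):
    EVALUATION_INITIALIZED = "EVALUATION_INITIALIZED"
    EVALUATION_STARTED = "EVALUATION_STARTED"
    EVALUATION_FINISHED = "EVALUATION_FINISHED"
    EVALUATION_FAILED = "EVALUATION_FAILED"


# Because the enum subclasses str, the member compares equal to the raw
# string written by the Celery task below, so either form can be stored.
assert EvaluationStatusEnum.EVALUATION_FAILED == "EVALUATION_FAILED"
failed_update = {"status": EvaluationStatusEnum.EVALUATION_FAILED}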
24 changes: 24 additions & 0 deletions agenta-backend/agenta_backend/services/db_manager.py
@@ -1855,3 +1855,27 @@ async def delete_evaluator_config(evaluator_config_id: str) -> bool:
         )  # checking if delete_result is None (has been deleted)
     except Exception as e:
         raise e
+
+
+async def update_evaluation(
+    evaluation_id: str, updates: Dict[str, Any]
+) -> EvaluationDB:
+    """
+    Update an evaluation in the database with the provided id.
+
+    Arguments:
+        evaluation_id (str): The ID of the evaluation to be updated.
+        updates (Dict[str, Any]): The updates to apply to the evaluation.
+
+    Returns:
+        EvaluationDB: The updated evaluation object.
+    """
+    evaluation = await engine.find_one(
+        EvaluationDB, EvaluationDB.id == ObjectId(evaluation_id)
+    )
+
+    for key, value in updates.items():
+        if key in evaluation.__fields__:
+            setattr(evaluation, key, value)
+    await engine.save(evaluation)
+    return evaluation
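A minimal usage sketch for the new helper follows; the import path matches the file above, while the example evaluation id is hypothetical and the call assumes a reachable database.

# Usage sketch for update_evaluation (signature taken from the diff above);
# the evaluation id is hypothetical and must reference an existing document.
import asyncio

from agenta_backend.services.db_manager import update_evaluation


async def mark_failed(evaluation_id: str) -> None:
    # Only keys present on EvaluationDB are applied; unknown keys are skipped.
    await update_evaluation(evaluation_id, {"status": "EVALUATION_FAILED"})


if __name__ == "__main__":
    loop = asyncio.get_event_loop()  # mirrors the loop handling in the task below
    loop.run_until_complete(mark_failed("6593d1c1e1f2a3b4c5d6e7f8"))  # hypothetical id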
199 changes: 111 additions & 88 deletions agenta-backend/agenta_backend/tasks/evaluations.py
@@ -13,6 +13,7 @@
     fetch_testset_by_id,
     create_new_evaluation_scenario,
     fetch_evaluator_config_by_appId,
+    update_evaluation,
     update_evaluation_with_aggregated_results,
 )
 from agenta_backend.models.db_models import (
@@ -32,107 +33,129 @@ def evaluate(
     app_data: dict, new_evaluation_data: dict, evaluation_id: str, testset_id: str
 ):
     loop = asyncio.get_event_loop()
-    app = AppDB(**app_data)
-    evaluation = NewEvaluation(**new_evaluation_data)
+    try:
+        app = AppDB(**app_data)
+        evaluation = NewEvaluation(**new_evaluation_data)

-    testset = loop.run_until_complete(fetch_testset_by_id(testset_id))
-    new_evaluation_db = loop.run_until_complete(fetch_evaluation_by_id(evaluation_id))
-    evaluators_aggregated_data = defaultdict(list)
+        testset = loop.run_until_complete(fetch_testset_by_id(testset_id))
+        new_evaluation_db = loop.run_until_complete(
+            fetch_evaluation_by_id(evaluation_id)
+        )
+        evaluators_aggregated_data = defaultdict(list)

-    variant_id = str(evaluation.variant_ids[0])
+        variant_id = str(evaluation.variant_ids[0])

-    app_variant_db = loop.run_until_complete(fetch_app_variant_by_id(variant_id))
-    deployment = loop.run_until_complete(
-        get_deployment_by_objectid(app_variant_db.base.deployment)
-    )
+        app_variant_db = loop.run_until_complete(fetch_app_variant_by_id(variant_id))
+        deployment = loop.run_until_complete(
+            get_deployment_by_objectid(app_variant_db.base.deployment)
+        )

-    # TODO: remove if abraham's fix is working
-    uri = deployment.uri.replace("http://localhost", "http://host.docker.internal")
+        # TODO: remove if abraham's fix is working
+        uri = deployment.uri.replace("http://localhost", "http://host.docker.internal")

-    for data_point in testset.csvdata:
-        # 1. We prepare the inputs
-        raw_inputs = (
-            app_variant_db.parameters.get("inputs", [])
-            if app_variant_db.parameters
-            else []
-        )
-        inputs = []
-        if raw_inputs:
-            inputs = [
-                EvaluationScenarioInputDB(
-                    name=input_item["name"],
-                    type="text",
-                    value=data_point[input_item["name"]],
-                )
-                for input_item in raw_inputs
-            ]
+        for data_point in testset.csvdata:
+            # 1. We prepare the inputs
+            raw_inputs = (
+                app_variant_db.parameters.get("inputs", [])
+                if app_variant_db.parameters
+                else []
+            )
+            inputs = []
+            if raw_inputs:
+                inputs = [
+                    EvaluationScenarioInputDB(
+                        name=input_item["name"],
+                        type="text",
+                        value=data_point[input_item["name"]],
+                    )
+                    for input_item in raw_inputs
+                ]

-        #!NOTE: do not remove! this will be used in github workflow!
-        backend_environment = os.environ.get("ENVIRONMENT")
-        if backend_environment is not None and backend_environment == "github":
-            uri = f"http://{deployment.container_name}"
-        else:
-            uri = deployment.uri.replace(
-                "http://localhost", "http://host.docker.internal"
-            )
-        # 2. We get the output from the llm app
-        variant_output = llm_apps_service.get_llm_app_output(uri, data_point)
+            #!NOTE: do not remove! this will be used in github workflow!
+            backend_environment = os.environ.get("ENVIRONMENT")
+            if backend_environment is not None and backend_environment == "github":
+                uri = f"http://{deployment.container_name}"
+            else:
+                uri = deployment.uri.replace(
+                    "http://localhost", "http://host.docker.internal"
+                )
+
+            # 2. We get the output from the llm app
+            try:
+                variant_output = llm_apps_service.get_llm_app_output(uri, data_point)
+            except Exception as e:
+                print(f"Error getting variant output: {e}")
+                loop.run_until_complete(
+                    update_evaluation(evaluation_id, {"status": "EVALUATION_FAILED"})
+                )
+                return

-        # 3. We evaluate
-        evaluators_results: [EvaluationScenarioResult] = []
-        for evaluator_config_id in evaluation.evaluators_configs:
-            evaluator_config = loop.run_until_complete(
-                fetch_evaluator_config(evaluator_config_id)
-            )
+            # 3. We evaluate
+            evaluators_results: [EvaluationScenarioResult] = []
+            for evaluator_config_id in evaluation.evaluators_configs:
+                evaluator_config = loop.run_until_complete(
+                    fetch_evaluator_config(evaluator_config_id)
+                )

-            additional_kwargs = (
-                {
-                    "app_params": app_variant_db.config.parameters,
-                    "inputs": data_point,  # TODO: fetch input from config parameters when #1102 has been fixed
-                }
-                if evaluator_config.evaluator_key == "custom_code_run"
-                else {}
-            )
-            result = evaluators_service.evaluate(
-                evaluator_config.evaluator_key,
-                variant_output,
-                data_point["correct_answer"],
-                evaluator_config.settings_values,
-                **additional_kwargs,
-            )
+                additional_kwargs = (
+                    {
+                        "app_params": app_variant_db.config.parameters,
+                        "inputs": data_point,  # TODO: fetch input from config parameters when #1102 has been fixed
+                    }
+                    if evaluator_config.evaluator_key == "custom_code_run"
+                    else {}
+                )
+                result = evaluators_service.evaluate(
+                    evaluator_config.evaluator_key,
+                    variant_output,
+                    data_point["correct_answer"],
+                    evaluator_config.settings_values,
+                    **additional_kwargs,
+                )

-            result_object = EvaluationScenarioResult(
-                evaluator_config=evaluator_config.id,
-                result=result,
-            )
-            evaluators_results.append(result_object)
-            evaluators_aggregated_data[evaluator_config.evaluator_key].append(result)
+                result_object = EvaluationScenarioResult(
+                    evaluator_config=evaluator_config.id,
+                    result=result,
+                )
+                evaluators_results.append(result_object)
+                evaluators_aggregated_data[evaluator_config.evaluator_key].append(
+                    result
+                )

-        # 4. We create a new evaluation scenario
-        evaluation_scenario = loop.run_until_complete(
-            create_new_evaluation_scenario(
-                user=app.user,
-                organization=app.organization,
-                evaluation=new_evaluation_db,
-                variant_id=variant_id,
-                evaluators_configs=new_evaluation_db.evaluators_configs,
-                inputs=inputs,
-                is_pinned=False,
-                note="",
-                correct_answer=data_point["correct_answer"],
-                outputs=[EvaluationScenarioOutputDB(type="text", value=variant_output)],
-                results=evaluators_results,
-            )
-        )
+            # 4. We create a new evaluation scenario
+            evaluation_scenario = loop.run_until_complete(
+                create_new_evaluation_scenario(
+                    user=app.user,
+                    organization=app.organization,
+                    evaluation=new_evaluation_db,
+                    variant_id=variant_id,
+                    evaluators_configs=new_evaluation_db.evaluators_configs,
+                    inputs=inputs,
+                    is_pinned=False,
+                    note="",
+                    correct_answer=data_point["correct_answer"],
+                    outputs=[
+                        EvaluationScenarioOutputDB(type="text", value=variant_output)
+                    ],
+                    results=evaluators_results,
+                )
+            )

-    aggregated_results = loop.run_until_complete(
-        aggregate_evaluator_results(app, evaluators_aggregated_data)
-    )
-    updated_evaluation = loop.run_until_complete(
-        update_evaluation_with_aggregated_results(
-            new_evaluation_db.id, aggregated_results
-        )
-    )
+        aggregated_results = loop.run_until_complete(
+            aggregate_evaluator_results(app, evaluators_aggregated_data)
+        )
+        updated_evaluation = loop.run_until_complete(
+            update_evaluation_with_aggregated_results(
+                new_evaluation_db.id, aggregated_results
+            )
+        )
+    except Exception as e:
+        print(f"An error occurred during evaluation: {e}")
+        loop.run_until_complete(
+            update_evaluation(evaluation_id, {"status": "EVALUATION_FAILED"})
+        )


 async def aggregate_evaluator_results(
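The control flow the task now follows is easier to see in a condensed, self-contained sketch; the two helpers below are hypothetical stand-ins for the real work shown in the diff (fetching the variant, calling the LLM app, running the evaluators, persisting results).

# Condensed sketch of the error-handling shape introduced above; _set_status
# stands in for update_evaluation and _run_all_scenarios for the per-datapoint
# loop, so the example runs on its own.
import asyncio


async def _set_status(evaluation_id: str, status: str) -> None:
    print(f"evaluation {evaluation_id} -> {status}")


def _run_all_scenarios() -> list:
    raise RuntimeError("LLM app unreachable")  # simulate a failing scenario


def evaluate_sketch(evaluation_id: str) -> None:
    loop = asyncio.get_event_loop()
    try:
        results = _run_all_scenarios()
        print(f"aggregated {len(results)} results")  # only reached on success
    except Exception as exc:
        # Any failure marks the evaluation as failed instead of leaving it
        # stuck in a started state.
        print(f"An error occurred during evaluation: {exc}")
        loop.run_until_complete(_set_status(evaluation_id, "EVALUATION_FAILED"))


evaluate_sketch("demo-id")  # prints the failure path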
2 changes: 1 addition & 1 deletion agenta-web/src/lib/Types.ts
@@ -335,7 +335,7 @@ export enum EvaluationStatus {
     INITIALIZED = "EVALUATION_INITIALIZED",
     STARTED = "EVALUATION_STARTED",
     FINISHED = "EVALUATION_FINISHED",
-    ERROR = "EVALUATION_ERROR",
+    ERROR = "EVALUATION_FAILED",
 }

 export interface _Evaluation {