Re-run evaluation #1455

Closed
wants to merge 28 commits
Changes from 25 commits
Commits (28)
886a122
add rerurn evaluation
aakrem Mar 19, 2024
81f3f9c
rerun functionality
MohammedMaaz Mar 20, 2024
6ec1b91
add updating status after rerunning
aakrem Mar 20, 2024
57c5b94
added started at and finished at for evaluation
aakrem Mar 20, 2024
a057ec4
add finished_at
aakrem Mar 20, 2024
9eb06f4
added started_at and finished_at fields to the frontend
MohammedMaaz Mar 21, 2024
f3a2766
add EvaluationParams in db router and service
aakrem Mar 21, 2024
756dd17
Merge branch 'main' into re-run-evaluation
aakrem Mar 21, 2024
49a13b3
remove comments
aakrem Mar 21, 2024
243b172
format
aakrem Mar 21, 2024
4c99599
fixes
aakrem Mar 21, 2024
f10e145
more fixes
aakrem Mar 21, 2024
e4157ca
fix typos and docstrings
aakrem Mar 21, 2024
49bc43b
add corrected answer
aakrem Mar 21, 2024
312d41f
replace created_at with started_at
aakrem Mar 21, 2024
f3521cb
add missing import
aakrem Mar 22, 2024
bc2dda1
add support for data without evaluation params
aakrem Mar 22, 2024
5e56814
fix and improve error message
aakrem Mar 25, 2024
b3eab68
fix format
aakrem Mar 25, 2024
8383b45
move updating an evaluation when rerunning to the evaluations service
aakrem Mar 25, 2024
3720c70
refactor evaluation fetching to use service layer for improved modula…
aakrem Mar 25, 2024
1ee9ff3
update comment
aakrem Mar 25, 2024
99538f3
format
aakrem Mar 25, 2024
8e140df
added fallback to created_at column value in eval results
MohammedMaaz Mar 25, 2024
4d95d54
fix imports
aakrem Mar 25, 2024
bad6c88
move evaluations ids to body
aakrem Mar 28, 2024
77b2007
fix sorting
aakrem Mar 28, 2024
e0c15bf
fix format
aakrem Mar 28, 2024
6 changes: 6 additions & 0 deletions agenta-backend/agenta_backend/models/api/evaluation_model.py
@@ -71,6 +71,8 @@ class Evaluation(BaseModel):
testset_name: Optional[str]
status: Result
aggregated_results: List[AggregatedResult]
started_at: Optional[datetime]
finished_at: Optional[datetime]
created_at: datetime
updated_at: datetime

@@ -249,6 +251,10 @@ class NewEvaluation(BaseModel):
correct_answer_column: Optional[str]


class RerunEvaluation(BaseModel):
lm_providers_keys: Optional[Dict[LMProvidersEnum, str]]


class NewEvaluatorConfig(BaseModel):
app_id: str
name: str
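For reference, a minimal sketch of the request body the new `RerunEvaluation` model accepts (not part of the diff; the provider key name is a placeholder, since the members of `LMProvidersEnum` are not shown in this PR):

```python
# Hypothetical body for the re-run endpoint, validated by RerunEvaluation.
# Only lm_providers_keys travels in the body; everything else is reloaded
# from the persisted evaluation parameters.
rerun_body = {
    "lm_providers_keys": {
        "OPENAI_API_KEY": "sk-...",  # placeholder provider key name
    }
}
```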
2 changes: 2 additions & 0 deletions agenta-backend/agenta_backend/models/converters.py
@@ -143,6 +143,8 @@ async def evaluation_db_to_pydantic(
else str(evaluation_db.testset.name)
),
aggregated_results=aggregated_results,
started_at=evaluation_db.started_at,
finished_at=evaluation_db.finished_at,
created_at=evaluation_db.created_at,
updated_at=evaluation_db.updated_at,
)
3 changes: 3 additions & 0 deletions agenta-backend/agenta_backend/models/db_engine.py
@@ -19,6 +19,7 @@
TestSetDB_ as TestSetDB,
AppVariantDB_ as AppVariantDB,
EvaluationDB_ as EvaluationDB,
EvaluationParamsDB_ as EvaluationParamsDB,
DeploymentDB_ as DeploymentDB,
VariantBaseDB_ as VariantBaseDB,
AppEnvironmentDB_ as AppEnvironmentDB,
@@ -35,6 +36,7 @@
ImageDB,
TestSetDB,
EvaluationDB,
EvaluationParamsDB,
DeploymentDB,
AppVariantDB,
VariantBaseDB,
@@ -65,6 +67,7 @@
AppVariantDB,
DeploymentDB,
EvaluationDB,
EvaluationParamsDB,
VariantBaseDB,
AppEnvironmentDB,
AppEnvironmentRevisionDB,
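For context, a rough sketch of how a document list like this is typically registered (an assumption about what `db_engine.py` does around these hunks; the function name and connection details below are illustrative):

```python
# Illustrative only: a new Beanie document such as EvaluationParamsDB must be
# included in the document_models passed to init_beanie, otherwise its
# "evaluations_params" collection is never set up.
from beanie import init_beanie
from motor.motor_asyncio import AsyncIOMotorClient


async def init_db(mongo_uri: str, db_name: str) -> None:
    client = AsyncIOMotorClient(mongo_uri)
    await init_beanie(
        database=client[db_name],
        document_models=[EvaluationDB, EvaluationParamsDB],  # plus the other models listed above
    )
```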
21 changes: 21 additions & 0 deletions agenta-backend/agenta_backend/models/db_models.py
@@ -257,6 +257,21 @@ class Settings:
name = "human_evaluations_scenarios"


class EvaluationParamsDB(Document):
app: Link[AppDB]
user: Link[UserDB]
testset_id: PydanticObjectId
variants_ids: List[PydanticObjectId]
evaluators_configs: List[PydanticObjectId]
rate_limit_config: dict
correct_answer_column: str
created_at: Optional[datetime] = Field(default=datetime.now())
updated_at: Optional[datetime] = Field(default=datetime.now())

class Settings:
name = "evaluations_params"


class EvaluationDB(Document):
app: Link[AppDB]
user: Link[UserDB]
@@ -266,6 +281,11 @@ class EvaluationDB(Document):
variant_revision: PydanticObjectId
evaluators_configs: List[PydanticObjectId]
aggregated_results: List[AggregatedResult]
rerun_count: int = Field(default=None)
started_at: Optional[datetime] = None
finished_at: Optional[datetime] = None
evaluation_params_id: Optional[PydanticObjectId] = None

created_at: datetime = Field(default=datetime.now())
updated_at: datetime = Field(default=datetime.now())

@@ -284,6 +304,7 @@ class EvaluationScenarioDB(Document):
note: Optional[str]
evaluators_configs: List[PydanticObjectId]
results: List[EvaluationScenarioResult]
rerun_count: int = Field(default=None)
created_at: datetime = Field(default=datetime.now())
updated_at: datetime = Field(default=datetime.now())

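A small sketch of how the new `evaluation_params_id` field ties the two documents together (illustrative helper, not part of the PR; the PR's own lookup lives in `db_manager.fetch_evaluation_params` further down):

```python
# Illustrative: resolve the persisted run parameters for an evaluation, if any.
# Evaluations created before this change carry no evaluation_params_id and
# therefore yield None.
from typing import Optional


async def params_for(evaluation: EvaluationDB) -> Optional[EvaluationParamsDB]:
    if evaluation.evaluation_params_id is None:
        return None
    return await EvaluationParamsDB.get(evaluation.evaluation_params_id)
```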
104 changes: 104 additions & 0 deletions agenta-backend/agenta_backend/routers/evaluation_router.py
@@ -1,5 +1,6 @@
import secrets
import logging
from datetime import datetime
from typing import Any, List

from fastapi.responses import JSONResponse
@@ -15,11 +16,17 @@
NewEvaluation,
DeleteEvaluation,
EvaluationWebhook,
RerunEvaluation,
EvaluationStatusEnum,
)
from agenta_backend.services.evaluator_manager import (
check_ai_critique_inputs,
)

from agenta_backend.models.db_models import (
Result,
)

if isCloudEE():
from agenta_backend.commons.models.db_models import Permission
from agenta_backend.commons.utils.permissions import check_action_access
@@ -125,12 +132,22 @@ async def create_evaluation(
else payload.correct_answer_column
)

evaluation_params = await evaluation_service.create_new_evaluation_params(
app_id=payload.app_id,
evaluator_config_ids=payload.evaluators_configs,
testset_id=payload.testset_id,
variants_ids=payload.variant_ids,
rate_limit_config=payload.rate_limit,
correct_answer_column=correct_answer_column,
)

for variant_id in payload.variant_ids:
evaluation = await evaluation_service.create_new_evaluation(
app_id=payload.app_id,
variant_id=variant_id,
evaluator_config_ids=payload.evaluators_configs,
testset_id=payload.testset_id,
evaluation_params_id=evaluation_params.id,
)

evaluate.delay(
@@ -145,6 +162,15 @@
)
evaluations.append(evaluation)

# In case we want to persist all evaluations' data so that a run can
# be rerun later exactly as the user created it (especially when
# multiple variants were selected), we would also need to update the
# evaluations_params with the evaluation ids, like:
# evaluation_service.update_evaluation_params(
#     evaluations_ids=[evaluation.id for evaluation in evaluations]
# )

return evaluations
except KeyError:
raise HTTPException(
@@ -153,6 +179,84 @@
)


@router.post("/re-run/{evaluation_ids}/", operation_id="re_run_evaluation")
async def re_run_evaluation(
evaluation_ids: str,
app_id: str,
payload: RerunEvaluation,
request: Request,
):
"""Re-runs the evaluations for the given evaluation IDs and increments their rerun count.
Raises:
HTTPException: If the app is not found or the user lacks permissions.
Returns:
HTTP response indicating the operation's outcome.
"""
try:
app = await db_manager.fetch_app_by_id(app_id)
if app is None:
raise HTTPException(status_code=404, detail="App not found")

if isCloudEE():
has_permission = await check_action_access(
user_uid=request.state.user_id,
object=app,
permission=Permission.CREATE_EVALUATION,
)
logger.debug(f"User has permission to create evaluation: {has_permission}")
if not has_permission:
error_msg = "You do not have permission to perform this action. Please contact your organization admin."
logger.error(error_msg)
return JSONResponse(
{"detail": error_msg},
status_code=403,
)

evaluation_ids = evaluation_ids.split(",")

for evaluation_id in evaluation_ids:
evaluation = await evaluation_service.get_evaluation_by_id(evaluation_id)
evaluation_params = await evaluation_service.fetch_evaluation_params(
evaluation.evaluation_params_id
)

if evaluation_params is None:
# because the correct answer column was not persisted, rerunning
# this evaluation with "correct_answer" as a value would result
# in errors. Hence we return an error here.
return JSONResponse(
{
"detail": "This is an old evaluation that cannot be rerun. Please select a newer evaluation!"
},
status_code=400,
)

await evaluation_service.update_on_evaluation_rerun(
evaluation_id=evaluation_id,
evaluation=evaluation,
)

evaluate.delay(
app_id=app_id,
variant_id=str(evaluation.variant),
evaluators_config_ids=[
str(config_id) for config_id in evaluation.evaluators_configs
],
testset_id=str(evaluation.testset.id),
evaluation_id=evaluation_id,
rate_limit_config=evaluation_params.rate_limit_config,
lm_providers_keys=payload.lm_providers_keys,
correct_answer_column=evaluation_params.correct_answer_column,
)

return Response(status_code=status.HTTP_200_OK)
except KeyError:
raise HTTPException(
status_code=400,
detail="columns in the test set should match the names of the inputs in the variant",
)


@router.get("/{evaluation_id}/status/", operation_id="fetch_evaluation_status")
async def fetch_evaluation_status(evaluation_id: str, request: Request):
"""Fetches the status of the evaluation.
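Assuming the router is mounted under an `/evaluations` prefix (the prefix is not shown in this diff), a re-run request would look roughly like the sketch below; the host, ids, and provider key name are placeholders:

```python
# Hypothetical client call for the new re-run endpoint.
import requests

evaluation_ids = ",".join(["<evaluation-id-1>", "<evaluation-id-2>"])  # the route splits this on ","
response = requests.post(
    f"http://localhost/api/evaluations/re-run/{evaluation_ids}/",
    params={"app_id": "<app-id>"},  # app_id is a query parameter, not part of the path
    json={"lm_providers_keys": {"OPENAI_API_KEY": "sk-..."}},  # RerunEvaluation body
)
response.raise_for_status()  # 400 for evaluations without persisted params, 403 without permission
```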
78 changes: 75 additions & 3 deletions agenta-backend/agenta_backend/services/db_manager.py
@@ -31,6 +31,7 @@
TestSetDB_ as TestSetDB,
AppVariantDB_ as AppVariantDB,
EvaluationDB_ as EvaluationDB,
EvaluationParamsDB_ as EvaluationParamsDB,
DeploymentDB_ as DeploymentDB,
VariantBaseDB_ as VariantBaseDB,
AppEnvironmentDB_ as AppEnvironmentDB,
@@ -49,6 +50,7 @@
TestSetDB,
AppVariantDB,
EvaluationDB,
EvaluationParamsDB,
DeploymentDB,
VariantBaseDB,
AppEnvironmentDB,
@@ -1818,10 +1820,14 @@ async def create_new_evaluation(
evaluators_configs: List[str],
organization=None,
workspace=None,
started_at: Optional[datetime] = None,
finished_at: Optional[datetime] = None,
evaluation_params_id: Optional[ObjectId] = None,
) -> EvaluationDB:
"""Create a new evaluation scenario.

Returns:
EvaluationScenarioDB: The created evaluation scenario.
EvaluationDB: The created evaluation.
"""
evaluation = EvaluationDB(
app=app,
Expand All @@ -1832,8 +1838,9 @@ async def create_new_evaluation(
variant_revision=variant_revision,
evaluators_configs=evaluators_configs,
aggregated_results=[],
created_at=datetime.now().isoformat(),
updated_at=datetime.now().isoformat(),
started_at=started_at,
finished_at=finished_at,
evaluation_params_id=evaluation_params_id,
)

if isCloudEE():
Expand All @@ -1849,6 +1856,69 @@ async def create_new_evaluation(
return evaluation


async def create_new_evaluation_params(
app: AppDB,
testset_id: str,
variants_ids: List[str],
evaluators_configs: List[str],
rate_limit_config: dict,
correct_answer_column: str,
user: UserDB,
organization=None,
workspace=None,
) -> EvaluationParamsDB:
"""
Create new evaluation parameters.

Args:
app (AppDB): The app associated with the evaluation parameters.
testset_id (str): The ID of the testset.
variants_ids (List[str]): A list of IDs for the variants.
evaluators_configs (List[str]): A list of evaluator configuration IDs.
rate_limit_config (dict): The rate limit configuration.
user (UserDB): The user associated with the evaluation.
organization: The organization associated with the evaluation, if applicable.
workspace: The workspace associated with the evaluation, if applicable.

Returns:
EvaluationParamsDB: The created evaluation parameters.
"""
evaluation_params = EvaluationParamsDB(
app=app,
user=user,
testset_id=testset_id,
variants_ids=variants_ids,
evaluators_configs=evaluators_configs,
rate_limit_config=rate_limit_config,
correct_answer_column=correct_answer_column,
)

if isCloudEE():
assert (
organization is not None and workspace is not None
), "organization and workspace must be provided together"
evaluation_params.organization = organization
evaluation_params.workspace = workspace

await evaluation_params.create()
return evaluation_params


async def fetch_evaluation_params(evaluation_params_id: str) -> EvaluationParamsDB:
"""
Fetches evaluation parameters by their ID.

Args:
evaluation_params_id (str): The ID of the evaluation parameters to fetch.

Returns:
EvaluationParamsDB: The fetched evaluation parameters.
"""
return await EvaluationParamsDB.find_one(
EvaluationParamsDB.id == ObjectId(evaluation_params_id)
)


async def create_new_evaluation_scenario(
user: UserDB,
evaluation: EvaluationDB,
@@ -1862,6 +1932,7 @@ async def create_new_evaluation_scenario(
results: List[EvaluationScenarioResult],
organization=None,
workspace=None,
rerun_count: Optional[int] = 0,
) -> EvaluationScenarioDB:
"""Create a new evaluation scenario.
Returns:
@@ -1878,6 +1949,7 @@
note=note,
evaluators_configs=evaluators_configs,
results=results,
rerun_count=rerun_count,
)

if isCloudEE():
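A minimal round-trip sketch of the two new helpers (assumed usage, not code from the PR; `app_db` and `user_db` stand for already-fetched AppDB/UserDB documents, and the ids and rate-limit contents are placeholders):

```python
# Inside an async service function:
# create the params document when an evaluation run starts ...
params = await create_new_evaluation_params(
    app=app_db,
    user=user_db,
    testset_id="<testset-id>",
    variants_ids=["<variant-id>"],
    evaluators_configs=["<evaluator-config-id>"],
    rate_limit_config={"batch_size": 10},  # stored as a plain dict; contents are placeholders
    correct_answer_column="correct_answer",
)

# ... and load it again by id when the evaluation is re-run later.
same_params = await fetch_evaluation_params(str(params.id))
assert same_params.correct_answer_column == "correct_answer"
```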