Merge pull request #1168 from Agenta-AI/cleanup-evaluations
Refactor - Cleanup redundant code in evaluations branch
aybruhm authored Jan 8, 2024
2 parents 548b3ce + 03e7efa commit 71c7100
Showing 5 changed files with 6 additions and 138 deletions.
35 changes: 0 additions & 35 deletions agenta-backend/agenta_backend/models/api/evaluation_model.py
@@ -21,25 +21,8 @@ class EvaluatorConfig(BaseModel):
     updated_at: datetime
 
 
-class EvaluationTypeSettings(BaseModel):
-    similarity_threshold: Optional[float]
-    regex_pattern: Optional[str]
-    regex_should_match: Optional[bool]
-    webhook_url: Optional[str]
-    custom_code_evaluation_id: Optional[str]
-    llm_app_prompt_template: Optional[str]
-    evaluation_prompt_template: Optional[str]
-
-
 class EvaluationType(str, Enum):
-    auto_exact_match = "auto_exact_match"
-    auto_similarity_match = "auto_similarity_match"
-    auto_regex_test = "auto_regex_test"
-    auto_webhook_test = "auto_webhook_test"
-    auto_ai_critique = "auto_ai_critique"
     human_a_b_testing = "human_a_b_testing"
-    human_scoring = "human_scoring"
-    custom_code_run = "custom_code_run"
     single_model_test = "single_model_test"
 
 
@@ -63,7 +46,6 @@ class NewHumanEvaluation(BaseModel):
     app_id: str
     variant_ids: List[str]
     evaluation_type: EvaluationType
-    evaluation_type_settings: Optional[EvaluationTypeSettings]
     inputs: List[str]
     testset_id: str
     status: str
@@ -99,7 +81,6 @@ class SimpleEvaluationOutput(BaseModel):
 
 class HumanEvaluationUpdate(BaseModel):
     status: Optional[EvaluationStatusEnum]
-    evaluation_type_settings: Optional[EvaluationTypeSettings]
 
 
 class EvaluationScenarioResult(BaseModel):
@@ -134,7 +115,6 @@ class HumanEvaluation(BaseModel):
     user_id: str
     user_username: str
     evaluation_type: EvaluationType
-    evaluation_type_settings: Optional[EvaluationTypeSettings]
     variant_ids: List[str]
     variant_names: List[str]
     testset_id: str
@@ -179,15 +159,6 @@ class EvaluationScenario(BaseModel):
     results: List[EvaluationScenarioResult]
 
 
-class AICritiqueCreate(BaseModel):
-    correct_answer: str
-    llm_app_prompt_template: Optional[str]
-    inputs: List[EvaluationScenarioInput]
-    outputs: List[EvaluationScenarioOutput]
-    evaluation_prompt_template: Optional[str]
-    open_ai_key: Optional[str]
-
-
 class EvaluationScenarioUpdate(BaseModel):
     vote: Optional[str]
     score: Optional[Any]
@@ -245,12 +216,6 @@ class EvaluationWebhook(BaseModel):
     score: float
 
 
-class EvaluationSettingsTemplate(BaseModel):
-    type: str
-    default: str
-    description: str
-
-
 class LLMRunRateLimit(BaseModel):
     batch_size: int
     max_retries: int
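For a quick read of this file's net effect: the per-type settings model and the legacy automatic-evaluation enum members are gone, leaving the human-evaluation payloads roughly as sketched below. This is a reconstruction from the surviving lines, not a verbatim copy of the file, and it presumes the auto_* evaluators are now configured through the EvaluatorConfig model kept at the top of the file.

```python
from enum import Enum
from typing import List

from pydantic import BaseModel


class EvaluationType(str, Enum):
    # Only the two types still used by human evaluations remain.
    human_a_b_testing = "human_a_b_testing"
    single_model_test = "single_model_test"


class NewHumanEvaluation(BaseModel):
    # evaluation_type_settings was dropped along with EvaluationTypeSettings.
    app_id: str
    variant_ids: List[str]
    evaluation_type: EvaluationType
    inputs: List[str]
    testset_id: str
    status: str
```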
2 changes: 0 additions & 2 deletions agenta-backend/agenta_backend/models/db_engine.py
@@ -19,7 +19,6 @@
     AppVariantDB,
     TemplateDB,
     TestSetDB,
-    CustomEvaluationDB,
     EvaluatorConfigDB,
     HumanEvaluationDB,
     HumanEvaluationScenarioDB,
@@ -47,7 +46,6 @@
     AppVariantDB,
     TemplateDB,
     TestSetDB,
-    CustomEvaluationDB,
     EvaluatorConfigDB,
     HumanEvaluationDB,
     HumanEvaluationScenarioDB,
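db_engine.py holds the list of Beanie document models registered at startup, which is why deleting CustomEvaluationDB from db_models.py also requires removing it from the import and registration lists here. Below is a rough sketch of that registration pattern, assuming a helper named init_db and placeholder connection details; the repo's actual setup differs.

```python
from beanie import init_beanie
from motor.motor_asyncio import AsyncIOMotorClient

# If CustomEvaluationDB were still listed here after its deletion from
# db_models.py, this import would fail with an ImportError at startup.
from agenta_backend.models.db_models import (
    EvaluatorConfigDB,
    HumanEvaluationDB,
)


async def init_db() -> None:
    """Register every Document subclass with Beanie so its collection is mapped."""
    client = AsyncIOMotorClient("mongodb://localhost:27017")  # placeholder URI
    await init_beanie(
        database=client["agenta"],  # placeholder database name
        document_models=[
            EvaluatorConfigDB,
            HumanEvaluationDB,
            # ...plus the other models kept in the diff above
        ],
    )
```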
19 changes: 0 additions & 19 deletions agenta-backend/agenta_backend/models/db_models.py
@@ -195,25 +195,6 @@ class Settings:
         name = "testsets"
 
 
-class CustomEvaluationDB(Document):
-    evaluation_name: str
-    python_code: str
-    app: Link[AppDB]
-    user: Link[UserDB]
-    organization: Link[OrganizationDB]
-    created_at: Optional[datetime] = Field(default=datetime.utcnow())
-    updated_at: Optional[datetime] = Field(default=datetime.utcnow())
-
-    class Settings:
-        name = "custom_evaluations"
-
-
-class EvaluationSettingsTemplate(BaseModel):
-    type: str
-    default: str
-    description: str
-
-
 class EvaluatorConfigDB(Document):
     app: Link[AppDB]
     organization: Link[OrganizationDB]
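The EvaluationSettingsTemplate deleted here duplicated the Pydantic model of the same name removed from evaluation_model.py above. As for CustomEvaluationDB, deleting a Beanie Document only drops the ODM mapping to its collection; existing documents in custom_evaluations are untouched, they simply stop being reachable through the models. The pattern it used is sketched below with stand-in names (not the repo's models), including the default_factory form that avoids the evaluate-once-at-import pitfall of default=datetime.utcnow() seen in the deleted code.

```python
from datetime import datetime
from typing import Optional

from beanie import Document, Link
from pydantic import Field


class ExampleAppDB(Document):
    app_name: str

    class Settings:
        name = "example_apps"  # MongoDB collection this document maps to


class ExampleEvalDB(Document):
    app: Link[ExampleAppDB]  # reference to a document in "example_apps"
    # default_factory is evaluated per document; default=datetime.utcnow()
    # (as in the deleted CustomEvaluationDB) is evaluated once at import time.
    created_at: Optional[datetime] = Field(default_factory=datetime.utcnow)

    class Settings:
        name = "example_evals"
```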
81 changes: 3 additions & 78 deletions agenta-backend/agenta_backend/services/evaluation_service.py
@@ -9,13 +9,11 @@
     EvaluationScenario,
     EvaluationScenarioInput,
     EvaluationType,
-    EvaluationTypeSettings,
     HumanEvaluation,
     HumanEvaluationScenario,
     HumanEvaluationUpdate,
     NewEvaluation,
     EvaluationScenarioUpdate,
-    CreateCustomEvaluation,
     EvaluationStatusEnum,
     NewHumanEvaluation,
 )
@@ -33,7 +31,6 @@
     HumanEvaluationScenarioOutput,
     UserDB,
     AppDB,
-    CustomEvaluationDB,
 )
 
 from beanie import PydanticObjectId as ObjectId
@@ -268,21 +265,6 @@ async def update_human_evaluation_service(
     if update_payload.status is not None:
         updates["status"] = update_payload.status
 
-    if update_payload.evaluation_type_settings is not None:
-        current_settings = evaluation.evaluation_type_settings
-        new_settings = update_payload.evaluation_type_settings
-
-        # Update only the fields that are explicitly set in the payload
-        for field in EvaluationTypeSettings.__annotations__.keys():
-            setattr(
-                current_settings,
-                field,
-                getattr(new_settings, field, None)
-                or getattr(current_settings, field, None),
-            )
-
-        updates["evaluation_type_settings"] = current_settings
-
     # Update the evaluation
     await evaluation.update({"$set": updates})
 
@@ -376,11 +358,6 @@ async def update_human_evaluation_scenario(
     new_eval_set = {}
 
     if updated_data["score"] is not None and evaluation_type in [
-        EvaluationType.auto_exact_match,
-        EvaluationType.auto_similarity_match,
-        EvaluationType.auto_regex_test,
-        EvaluationType.auto_webhook_test,
-        EvaluationType.auto_ai_critique,
         EvaluationType.single_model_test,
     ]:
         new_eval_set["score"] = updated_data["score"]
@@ -389,8 +366,6 @@
         and evaluation_type == EvaluationType.human_a_b_testing
     ):
         new_eval_set["vote"] = updated_data["vote"]
-    elif evaluation_type == EvaluationType.custom_code_run:
-        new_eval_set["correct_answer"] = updated_data["correct_answer"]
 
     if updated_data["outputs"] is not None:
         new_outputs = [
@@ -471,14 +446,7 @@ async def get_evaluation_scenario_score_service(
 
 def _extend_with_evaluation(evaluation_type: EvaluationType):
     evaluation = {}
-    if (
-        evaluation_type == EvaluationType.auto_exact_match
-        or evaluation_type == EvaluationType.auto_similarity_match
-        or evaluation_type == EvaluationType.auto_regex_test
-        or evaluation_type == EvaluationType.auto_webhook_test
-        or evaluation_type == EvaluationType.single_model_test
-        or EvaluationType.auto_ai_critique
-    ):
+    if evaluation_type == EvaluationType.single_model_test:
         evaluation["score"] = ""
 
     if evaluation_type == EvaluationType.human_a_b_testing:
@@ -488,15 +456,8 @@ def _extend_with_evaluation(evaluation_type: EvaluationType):
 
 def _extend_with_correct_answer(evaluation_type: EvaluationType, row: dict):
     correct_answer = {}
-    if (
-        evaluation_type == EvaluationType.auto_exact_match
-        or evaluation_type == EvaluationType.auto_similarity_match
-        or evaluation_type == EvaluationType.auto_regex_test
-        or evaluation_type == EvaluationType.auto_ai_critique
-        or evaluation_type == EvaluationType.auto_webhook_test
-    ):
-        if row["correct_answer"]:
-            correct_answer["correct_answer"] = row["correct_answer"]
+    if row["correct_answer"]:
+        correct_answer["correct_answer"] = row["correct_answer"]
     return correct_answer
 
 
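A side effect of this hunk worth noting: the deleted condition in _extend_with_evaluation ended with a bare `or EvaluationType.auto_ai_critique` (no `evaluation_type ==` comparison), and a bare enum member is truthy, so the old branch effectively ran for every evaluation type. The snippet below is a small, self-contained demonstration using a trimmed stand-in for the enum, not the repo's actual module.

```python
from enum import Enum


class EvaluationType(str, Enum):
    auto_ai_critique = "auto_ai_critique"
    human_a_b_testing = "human_a_b_testing"
    single_model_test = "single_model_test"


evaluation_type = EvaluationType.human_a_b_testing

# Old shape of the condition: the final bare member makes it always truthy.
old_check = (
    evaluation_type == EvaluationType.single_model_test
    or EvaluationType.auto_ai_critique
)
print(bool(old_check))  # True, even though evaluation_type is human_a_b_testing

# New shape: only True for single_model_test.
print(evaluation_type == EvaluationType.single_model_test)  # False
```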
Expand Down Expand Up @@ -634,42 +595,6 @@ async def delete_evaluations(evaluation_ids: List[str], **user_org_data: dict) -
await evaluation.delete()


async def create_custom_code_evaluation(
payload: CreateCustomEvaluation, **user_org_data: dict
) -> str:
"""Save the custom evaluation code in the database.
Args:
payload (CreateCustomEvaluation): the required payload
Returns:
str: the custom evaluation id
"""

# Initialize custom evaluation instance
access = await check_access_to_app(
user_org_data=user_org_data, app_id=payload.app_id
)
if not access:
raise HTTPException(
status_code=403,
detail=f"You do not have access to this app: {payload.app_id}",
)
app = await db_manager.fetch_app_by_id(app_id=payload.app_id)
custom_eval = CustomEvaluationDB(
evaluation_name=payload.evaluation_name,
user=app.user,
organization=app.organization,
app=app,
python_code=payload.python_code,
created_at=datetime.utcnow(),
updated_at=datetime.utcnow(),
)

await custom_eval.create()
return str(custom_eval.id)


async def create_new_human_evaluation(
payload: NewHumanEvaluation, **user_org_data: dict
) -> EvaluationDB:
Expand Down
7 changes: 3 additions & 4 deletions agenta-backend/agenta_backend/services/results_service.py
@@ -1,13 +1,12 @@
 from agenta_backend.models.db_models import (
-    EvaluationScenarioDB,
     EvaluationDB,
     HumanEvaluationDB,
+    EvaluationScenarioDB,
     HumanEvaluationScenarioDB,
 )
 from agenta_backend.services import evaluation_service
 from agenta_backend.services import db_manager
-from agenta_backend.models.api.evaluation_model import EvaluationType
-from bson import ObjectId
 
+from beanie import PydanticObjectId as ObjectId
 
 
 async def fetch_results_for_evaluation(evaluation: HumanEvaluationDB):
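The only non-deletion change here swaps bson.ObjectId for Beanie's PydanticObjectId, aliased to the same name so call sites stay unchanged. PydanticObjectId is a Pydantic-aware ObjectId, so it validates inside Pydantic models and serializes to a string in JSON output, which plain bson.ObjectId does not do out of the box. The illustration below uses assumed, simplified models rather than the repo's real classes.

```python
from beanie import Document, PydanticObjectId as ObjectId
from pydantic import BaseModel


class ScenarioResultDB(Document):  # hypothetical document, for illustration only
    evaluation_id: ObjectId
    vote: str


class ScenarioResultOut(BaseModel):  # hypothetical response model
    # PydanticObjectId validates from a 24-char hex string and is
    # JSON-encoded as a string, unlike bson.ObjectId.
    id: ObjectId
    vote: str


# Query filters accept the same alias, e.g.:
# results = await ScenarioResultDB.find(
#     ScenarioResultDB.evaluation_id == ObjectId("655a1c0c8d5e6f0012345678")
# ).to_list()
```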
