From 03e7efade1c10b64e0ae133b94ab1c61e51806d9 Mon Sep 17 00:00:00 2001
From: Abram
Date: Mon, 8 Jan 2024 11:59:32 +0100
Subject: [PATCH] Cleanup - remove redundant db/api models and code

---
 .../models/api/evaluation_model.py          | 35 --------
 .../agenta_backend/models/db_engine.py      |  2 -
 .../agenta_backend/models/db_models.py      | 19 -----
 .../services/evaluation_service.py          | 81 +------------------
 .../services/results_service.py             |  7 +-
 5 files changed, 6 insertions(+), 138 deletions(-)

diff --git a/agenta-backend/agenta_backend/models/api/evaluation_model.py b/agenta-backend/agenta_backend/models/api/evaluation_model.py
index 04ff3000e7..5e1dc7c6cf 100644
--- a/agenta-backend/agenta_backend/models/api/evaluation_model.py
+++ b/agenta-backend/agenta_backend/models/api/evaluation_model.py
@@ -21,25 +21,8 @@ class EvaluatorConfig(BaseModel):
     updated_at: datetime
 
 
-class EvaluationTypeSettings(BaseModel):
-    similarity_threshold: Optional[float]
-    regex_pattern: Optional[str]
-    regex_should_match: Optional[bool]
-    webhook_url: Optional[str]
-    custom_code_evaluation_id: Optional[str]
-    llm_app_prompt_template: Optional[str]
-    evaluation_prompt_template: Optional[str]
-
-
 class EvaluationType(str, Enum):
-    auto_exact_match = "auto_exact_match"
-    auto_similarity_match = "auto_similarity_match"
-    auto_regex_test = "auto_regex_test"
-    auto_webhook_test = "auto_webhook_test"
-    auto_ai_critique = "auto_ai_critique"
     human_a_b_testing = "human_a_b_testing"
-    human_scoring = "human_scoring"
-    custom_code_run = "custom_code_run"
     single_model_test = "single_model_test"
 
 
@@ -63,7 +46,6 @@ class NewHumanEvaluation(BaseModel):
     app_id: str
     variant_ids: List[str]
     evaluation_type: EvaluationType
-    evaluation_type_settings: Optional[EvaluationTypeSettings]
     inputs: List[str]
     testset_id: str
     status: str
@@ -99,7 +81,6 @@ class SimpleEvaluationOutput(BaseModel):
 
 class HumanEvaluationUpdate(BaseModel):
     status: Optional[EvaluationStatusEnum]
-    evaluation_type_settings: Optional[EvaluationTypeSettings]
 
 
 class EvaluationScenarioResult(BaseModel):
@@ -134,7 +115,6 @@ class HumanEvaluation(BaseModel):
     user_id: str
     user_username: str
     evaluation_type: EvaluationType
-    evaluation_type_settings: Optional[EvaluationTypeSettings]
     variant_ids: List[str]
     variant_names: List[str]
     testset_id: str
@@ -179,15 +159,6 @@ class EvaluationScenario(BaseModel):
     results: List[EvaluationScenarioResult]
 
 
-class AICritiqueCreate(BaseModel):
-    correct_answer: str
-    llm_app_prompt_template: Optional[str]
-    inputs: List[EvaluationScenarioInput]
-    outputs: List[EvaluationScenarioOutput]
-    evaluation_prompt_template: Optional[str]
-    open_ai_key: Optional[str]
-
-
 class EvaluationScenarioUpdate(BaseModel):
     vote: Optional[str]
     score: Optional[Any]
@@ -245,12 +216,6 @@ class EvaluationWebhook(BaseModel):
     score: float
 
 
-class EvaluationSettingsTemplate(BaseModel):
-    type: str
-    default: str
-    description: str
-
-
 class LLMRunRateLimit(BaseModel):
     batch_size: int
     max_retries: int
diff --git a/agenta-backend/agenta_backend/models/db_engine.py b/agenta-backend/agenta_backend/models/db_engine.py
index 820e9f6893..b951be23a0 100644
--- a/agenta-backend/agenta_backend/models/db_engine.py
+++ b/agenta-backend/agenta_backend/models/db_engine.py
@@ -19,7 +19,6 @@
     AppVariantDB,
     TemplateDB,
     TestSetDB,
-    CustomEvaluationDB,
     EvaluatorConfigDB,
     HumanEvaluationDB,
     HumanEvaluationScenarioDB,
@@ -47,7+46,6 @@
     AppVariantDB,
     TemplateDB,
     TestSetDB,
-    CustomEvaluationDB,
     EvaluatorConfigDB,
     HumanEvaluationDB,
     HumanEvaluationScenarioDB,
diff --git a/agenta-backend/agenta_backend/models/db_models.py b/agenta-backend/agenta_backend/models/db_models.py
index f9b5b28d00..ea579f45f9 100644
--- a/agenta-backend/agenta_backend/models/db_models.py
+++ b/agenta-backend/agenta_backend/models/db_models.py
@@ -195,25 +195,6 @@ class Settings:
         name = "testsets"
 
 
-class CustomEvaluationDB(Document):
-    evaluation_name: str
-    python_code: str
-    app: Link[AppDB]
-    user: Link[UserDB]
-    organization: Link[OrganizationDB]
-    created_at: Optional[datetime] = Field(default=datetime.utcnow())
-    updated_at: Optional[datetime] = Field(default=datetime.utcnow())
-
-    class Settings:
-        name = "custom_evaluations"
-
-
-class EvaluationSettingsTemplate(BaseModel):
-    type: str
-    default: str
-    description: str
-
-
 class EvaluatorConfigDB(Document):
     app: Link[AppDB]
     organization: Link[OrganizationDB]
diff --git a/agenta-backend/agenta_backend/services/evaluation_service.py b/agenta-backend/agenta_backend/services/evaluation_service.py
index 5e8a0a84e1..1c6daf7c83 100644
--- a/agenta-backend/agenta_backend/services/evaluation_service.py
+++ b/agenta-backend/agenta_backend/services/evaluation_service.py
@@ -9,13 +9,11 @@
     EvaluationScenario,
     EvaluationScenarioInput,
     EvaluationType,
-    EvaluationTypeSettings,
     HumanEvaluation,
     HumanEvaluationScenario,
     HumanEvaluationUpdate,
     NewEvaluation,
     EvaluationScenarioUpdate,
-    CreateCustomEvaluation,
     EvaluationStatusEnum,
     NewHumanEvaluation,
 )
@@ -33,7 +31,6 @@
     HumanEvaluationScenarioOutput,
     UserDB,
     AppDB,
-    CustomEvaluationDB,
 )
 
 from beanie import PydanticObjectId as ObjectId
@@ -268,21 +265,6 @@ async def update_human_evaluation_service(
     if update_payload.status is not None:
         updates["status"] = update_payload.status
 
-    if update_payload.evaluation_type_settings is not None:
-        current_settings = evaluation.evaluation_type_settings
-        new_settings = update_payload.evaluation_type_settings
-
-        # Update only the fields that are explicitly set in the payload
-        for field in EvaluationTypeSettings.__annotations__.keys():
-            setattr(
-                current_settings,
-                field,
-                getattr(new_settings, field, None)
-                or getattr(current_settings, field, None),
-            )
-
-        updates["evaluation_type_settings"] = current_settings
-
     # Update the evaluation
     await evaluation.update({"$set": updates})
 
@@ -376,11 +358,6 @@ async def update_human_evaluation_scenario(
 
     new_eval_set = {}
     if updated_data["score"] is not None and evaluation_type in [
-        EvaluationType.auto_exact_match,
-        EvaluationType.auto_similarity_match,
-        EvaluationType.auto_regex_test,
-        EvaluationType.auto_webhook_test,
-        EvaluationType.auto_ai_critique,
         EvaluationType.single_model_test,
     ]:
         new_eval_set["score"] = updated_data["score"]
@@ -389,8 +366,6 @@
         and evaluation_type == EvaluationType.human_a_b_testing
     ):
         new_eval_set["vote"] = updated_data["vote"]
-    elif evaluation_type == EvaluationType.custom_code_run:
-        new_eval_set["correct_answer"] = updated_data["correct_answer"]
 
     if updated_data["outputs"] is not None:
         new_outputs = [
@@ -471,14 +446,7 @@ async def get_evaluation_scenario_score_service(
 
 def _extend_with_evaluation(evaluation_type: EvaluationType):
     evaluation = {}
-    if (
-        evaluation_type == EvaluationType.auto_exact_match
-        or evaluation_type == EvaluationType.auto_similarity_match
-        or evaluation_type == EvaluationType.auto_regex_test
-        or evaluation_type == EvaluationType.auto_webhook_test
-        or evaluation_type == EvaluationType.single_model_test
-        or EvaluationType.auto_ai_critique
-    ):
+    if evaluation_type == EvaluationType.single_model_test:
         evaluation["score"] = ""
 
     if evaluation_type == EvaluationType.human_a_b_testing:
@@ -488,15 +456,8 @@ def _extend_with_evaluation(evaluation_type: EvaluationType):
 
 def _extend_with_correct_answer(evaluation_type: EvaluationType, row: dict):
     correct_answer = {}
-    if (
-        evaluation_type == EvaluationType.auto_exact_match
-        or evaluation_type == EvaluationType.auto_similarity_match
-        or evaluation_type == EvaluationType.auto_regex_test
-        or evaluation_type == EvaluationType.auto_ai_critique
-        or evaluation_type == EvaluationType.auto_webhook_test
-    ):
-        if row["correct_answer"]:
-            correct_answer["correct_answer"] = row["correct_answer"]
+    if row["correct_answer"]:
+        correct_answer["correct_answer"] = row["correct_answer"]
 
     return correct_answer
 
@@ -634,42 +595,6 @@ async def delete_evaluations(evaluation_ids: List[str], **user_org_data: dict) -
         await evaluation.delete()
 
 
-async def create_custom_code_evaluation(
-    payload: CreateCustomEvaluation, **user_org_data: dict
-) -> str:
-    """Save the custom evaluation code in the database.
-
-    Args:
-        payload (CreateCustomEvaluation): the required payload
-
-    Returns:
-        str: the custom evaluation id
-    """
-
-    # Initialize custom evaluation instance
-    access = await check_access_to_app(
-        user_org_data=user_org_data, app_id=payload.app_id
-    )
-    if not access:
-        raise HTTPException(
-            status_code=403,
-            detail=f"You do not have access to this app: {payload.app_id}",
-        )
-    app = await db_manager.fetch_app_by_id(app_id=payload.app_id)
-    custom_eval = CustomEvaluationDB(
-        evaluation_name=payload.evaluation_name,
-        user=app.user,
-        organization=app.organization,
-        app=app,
-        python_code=payload.python_code,
-        created_at=datetime.utcnow(),
-        updated_at=datetime.utcnow(),
-    )
-
-    await custom_eval.create()
-    return str(custom_eval.id)
-
-
 async def create_new_human_evaluation(
     payload: NewHumanEvaluation, **user_org_data: dict
 ) -> EvaluationDB:
diff --git a/agenta-backend/agenta_backend/services/results_service.py b/agenta-backend/agenta_backend/services/results_service.py
index dc9d9e8df2..d33a1c419f 100644
--- a/agenta-backend/agenta_backend/services/results_service.py
+++ b/agenta-backend/agenta_backend/services/results_service.py
@@ -1,13 +1,12 @@
 from agenta_backend.models.db_models import (
-    EvaluationScenarioDB,
-    EvaluationDB,
     HumanEvaluationDB,
+    EvaluationScenarioDB,
     HumanEvaluationScenarioDB,
 )
-from agenta_backend.services import evaluation_service
 from agenta_backend.services import db_manager
 from agenta_backend.models.api.evaluation_model import EvaluationType
-from bson import ObjectId
+
+from beanie import PydanticObjectId as ObjectId
 
 
 async def fetch_results_for_evaluation(evaluation: HumanEvaluationDB):
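
Reviewer note (not part of the patch itself): the sketch below shows how the touched helpers and the slimmed-down EvaluationType enum read once the hunks above are applied. It is reconstructed from the diff context only, so anything outside the shown hunks (for example the body of the human_a_b_testing branch in _extend_with_evaluation) is an assumption rather than a quote of the file.

from enum import Enum


class EvaluationType(str, Enum):
    # Only the two human-evaluation types survive this cleanup.
    human_a_b_testing = "human_a_b_testing"
    single_model_test = "single_model_test"


def _extend_with_evaluation(evaluation_type: EvaluationType):
    # Post-patch: the long chain of auto_* checks collapses to a single comparison.
    evaluation = {}
    if evaluation_type == EvaluationType.single_model_test:
        evaluation["score"] = ""
    if evaluation_type == EvaluationType.human_a_b_testing:
        evaluation["vote"] = ""  # assumed body; this line sits outside the hunk context
    return evaluation


def _extend_with_correct_answer(evaluation_type: EvaluationType, row: dict):
    # Post-patch: the per-type guard is gone; the correct answer is copied whenever present.
    # The now-unused evaluation_type parameter stays because the diff leaves the signature untouched.
    correct_answer = {}
    if row["correct_answer"]:
        correct_answer["correct_answer"] = row["correct_answer"]
    return correct_answer

A possible follow-up, if desired, would be to drop the unused evaluation_type parameter from _extend_with_correct_answer now that it no longer branches on it.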