Merge pull request #1168 from Agenta-AI/cleanup-evaluations
Refactor - Cleanup redundant code in evaluations branch
aybruhm authored Jan 8, 2024
2 parents 548b3ce + 03e7efa commit 71c7100
Showing 5 changed files with 6 additions and 138 deletions.
35 changes: 0 additions & 35 deletions agenta-backend/agenta_backend/models/api/evaluation_model.py
@@ -21,25 +21,8 @@ class EvaluatorConfig(BaseModel):
     updated_at: datetime
 
 
-class EvaluationTypeSettings(BaseModel):
-    similarity_threshold: Optional[float]
-    regex_pattern: Optional[str]
-    regex_should_match: Optional[bool]
-    webhook_url: Optional[str]
-    custom_code_evaluation_id: Optional[str]
-    llm_app_prompt_template: Optional[str]
-    evaluation_prompt_template: Optional[str]
-
-
 class EvaluationType(str, Enum):
-    auto_exact_match = "auto_exact_match"
-    auto_similarity_match = "auto_similarity_match"
-    auto_regex_test = "auto_regex_test"
-    auto_webhook_test = "auto_webhook_test"
-    auto_ai_critique = "auto_ai_critique"
     human_a_b_testing = "human_a_b_testing"
-    human_scoring = "human_scoring"
-    custom_code_run = "custom_code_run"
     single_model_test = "single_model_test"
 
 
@@ -63,7 +46,6 @@ class NewHumanEvaluation(BaseModel):
     app_id: str
     variant_ids: List[str]
     evaluation_type: EvaluationType
-    evaluation_type_settings: Optional[EvaluationTypeSettings]
     inputs: List[str]
     testset_id: str
     status: str
@@ -99,7 +81,6 @@ class SimpleEvaluationOutput(BaseModel):
 
 class HumanEvaluationUpdate(BaseModel):
     status: Optional[EvaluationStatusEnum]
-    evaluation_type_settings: Optional[EvaluationTypeSettings]
 
 
 class EvaluationScenarioResult(BaseModel):
@@ -134,7 +115,6 @@ class HumanEvaluation(BaseModel):
     user_id: str
     user_username: str
     evaluation_type: EvaluationType
-    evaluation_type_settings: Optional[EvaluationTypeSettings]
     variant_ids: List[str]
     variant_names: List[str]
     testset_id: str
@@ -179,15 +159,6 @@ class EvaluationScenario(BaseModel):
     results: List[EvaluationScenarioResult]
 
 
-class AICritiqueCreate(BaseModel):
-    correct_answer: str
-    llm_app_prompt_template: Optional[str]
-    inputs: List[EvaluationScenarioInput]
-    outputs: List[EvaluationScenarioOutput]
-    evaluation_prompt_template: Optional[str]
-    open_ai_key: Optional[str]
-
-
 class EvaluationScenarioUpdate(BaseModel):
     vote: Optional[str]
     score: Optional[Any]
@@ -245,12 +216,6 @@ class EvaluationWebhook(BaseModel):
     score: float
 
 
-class EvaluationSettingsTemplate(BaseModel):
-    type: str
-    default: str
-    description: str
-
-
 class LLMRunRateLimit(BaseModel):
     batch_size: int
     max_retries: int
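For a quick read of this file's net effect: the per-type settings model and the legacy automatic-evaluation enum members are gone, leaving the human-evaluation payloads roughly as sketched below. This is a reconstruction from the surviving lines, not a verbatim copy of the file, and it presumes the auto_* evaluators are now configured through the EvaluatorConfig model kept at the top of the file.

```python
from enum import Enum
from typing import List

from pydantic import BaseModel


class EvaluationType(str, Enum):
    # Only the two types still used by human evaluations remain.
    human_a_b_testing = "human_a_b_testing"
    single_model_test = "single_model_test"


class NewHumanEvaluation(BaseModel):
    # evaluation_type_settings was dropped along with EvaluationTypeSettings.
    app_id: str
    variant_ids: List[str]
    evaluation_type: EvaluationType
    inputs: List[str]
    testset_id: str
    status: str
```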
2 changes: 0 additions & 2 deletions agenta-backend/agenta_backend/models/db_engine.py
@@ -19,7 +19,6 @@
     AppVariantDB,
     TemplateDB,
     TestSetDB,
-    CustomEvaluationDB,
     EvaluatorConfigDB,
     HumanEvaluationDB,
     HumanEvaluationScenarioDB,
@@ -47,7 +46,6 @@
     AppVariantDB,
     TemplateDB,
     TestSetDB,
-    CustomEvaluationDB,
     EvaluatorConfigDB,
     HumanEvaluationDB,
     HumanEvaluationScenarioDB,
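db_engine.py holds the list of Beanie document models registered at startup, which is why deleting CustomEvaluationDB from db_models.py also requires removing it from the import and registration lists here. Below is a rough sketch of that registration pattern, assuming a helper named init_db and placeholder connection details; the repo's actual setup differs.

```python
from beanie import init_beanie
from motor.motor_asyncio import AsyncIOMotorClient

# If CustomEvaluationDB were still listed here after its deletion from
# db_models.py, this import would fail with an ImportError at startup.
from agenta_backend.models.db_models import (
    EvaluatorConfigDB,
    HumanEvaluationDB,
)


async def init_db() -> None:
    """Register every Document subclass with Beanie so its collection is mapped."""
    client = AsyncIOMotorClient("mongodb://localhost:27017")  # placeholder URI
    await init_beanie(
        database=client["agenta"],  # placeholder database name
        document_models=[
            EvaluatorConfigDB,
            HumanEvaluationDB,
            # ...plus the other models kept in the diff above
        ],
    )
```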
19 changes: 0 additions & 19 deletions agenta-backend/agenta_backend/models/db_models.py
@@ -195,25 +195,6 @@ class Settings:
         name = "testsets"
 
 
-class CustomEvaluationDB(Document):
-    evaluation_name: str
-    python_code: str
-    app: Link[AppDB]
-    user: Link[UserDB]
-    organization: Link[OrganizationDB]
-    created_at: Optional[datetime] = Field(default=datetime.utcnow())
-    updated_at: Optional[datetime] = Field(default=datetime.utcnow())
-
-    class Settings:
-        name = "custom_evaluations"
-
-
-class EvaluationSettingsTemplate(BaseModel):
-    type: str
-    default: str
-    description: str
-
-
 class EvaluatorConfigDB(Document):
     app: Link[AppDB]
     organization: Link[OrganizationDB]
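The EvaluationSettingsTemplate deleted here duplicated the Pydantic model of the same name removed from evaluation_model.py above. As for CustomEvaluationDB, deleting a Beanie Document only drops the ODM mapping to its collection; existing documents in custom_evaluations are untouched, they simply stop being reachable through the models. The pattern it used is sketched below with stand-in names (not the repo's models), including the default_factory form that avoids the evaluate-once-at-import pitfall of default=datetime.utcnow() seen in the deleted code.

```python
from datetime import datetime
from typing import Optional

from beanie import Document, Link
from pydantic import Field


class ExampleAppDB(Document):
    app_name: str

    class Settings:
        name = "example_apps"  # MongoDB collection this document maps to


class ExampleEvalDB(Document):
    app: Link[ExampleAppDB]  # reference to a document in "example_apps"
    # default_factory is evaluated per document; default=datetime.utcnow()
    # (as in the deleted CustomEvaluationDB) is evaluated once at import time.
    created_at: Optional[datetime] = Field(default_factory=datetime.utcnow)

    class Settings:
        name = "example_evals"
```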
81 changes: 3 additions & 78 deletions agenta-backend/agenta_backend/services/evaluation_service.py
@@ -9,13 +9,11 @@
     EvaluationScenario,
     EvaluationScenarioInput,
     EvaluationType,
-    EvaluationTypeSettings,
     HumanEvaluation,
     HumanEvaluationScenario,
     HumanEvaluationUpdate,
     NewEvaluation,
     EvaluationScenarioUpdate,
-    CreateCustomEvaluation,
     EvaluationStatusEnum,
     NewHumanEvaluation,
 )
@@ -33,7 +31,6 @@
     HumanEvaluationScenarioOutput,
     UserDB,
     AppDB,
-    CustomEvaluationDB,
 )
 
 from beanie import PydanticObjectId as ObjectId
@@ -268,21 +265,6 @@ async def update_human_evaluation_service(
     if update_payload.status is not None:
         updates["status"] = update_payload.status
 
-    if update_payload.evaluation_type_settings is not None:
-        current_settings = evaluation.evaluation_type_settings
-        new_settings = update_payload.evaluation_type_settings
-
-        # Update only the fields that are explicitly set in the payload
-        for field in EvaluationTypeSettings.__annotations__.keys():
-            setattr(
-                current_settings,
-                field,
-                getattr(new_settings, field, None)
-                or getattr(current_settings, field, None),
-            )
-
-        updates["evaluation_type_settings"] = current_settings
-
     # Update the evaluation
     await evaluation.update({"$set": updates})
 
@@ -376,11 +358,6 @@ async def update_human_evaluation_scenario(
     new_eval_set = {}
 
     if updated_data["score"] is not None and evaluation_type in [
-        EvaluationType.auto_exact_match,
-        EvaluationType.auto_similarity_match,
-        EvaluationType.auto_regex_test,
-        EvaluationType.auto_webhook_test,
-        EvaluationType.auto_ai_critique,
         EvaluationType.single_model_test,
     ]:
         new_eval_set["score"] = updated_data["score"]
@@ -389,8 +366,6 @@
         and evaluation_type == EvaluationType.human_a_b_testing
     ):
         new_eval_set["vote"] = updated_data["vote"]
-    elif evaluation_type == EvaluationType.custom_code_run:
-        new_eval_set["correct_answer"] = updated_data["correct_answer"]
 
     if updated_data["outputs"] is not None:
         new_outputs = [
@@ -471,14 +446,7 @@ async def get_evaluation_scenario_score_service(
 
 def _extend_with_evaluation(evaluation_type: EvaluationType):
     evaluation = {}
-    if (
-        evaluation_type == EvaluationType.auto_exact_match
-        or evaluation_type == EvaluationType.auto_similarity_match
-        or evaluation_type == EvaluationType.auto_regex_test
-        or evaluation_type == EvaluationType.auto_webhook_test
-        or evaluation_type == EvaluationType.single_model_test
-        or EvaluationType.auto_ai_critique
-    ):
+    if evaluation_type == EvaluationType.single_model_test:
         evaluation["score"] = ""
 
     if evaluation_type == EvaluationType.human_a_b_testing:
@@ -488,15 +456,8 @@ def _extend_with_evaluation(evaluation_type: EvaluationType):
 
 def _extend_with_correct_answer(evaluation_type: EvaluationType, row: dict):
     correct_answer = {}
-    if (
-        evaluation_type == EvaluationType.auto_exact_match
-        or evaluation_type == EvaluationType.auto_similarity_match
-        or evaluation_type == EvaluationType.auto_regex_test
-        or evaluation_type == EvaluationType.auto_ai_critique
-        or evaluation_type == EvaluationType.auto_webhook_test
-    ):
-        if row["correct_answer"]:
-            correct_answer["correct_answer"] = row["correct_answer"]
+    if row["correct_answer"]:
+        correct_answer["correct_answer"] = row["correct_answer"]
     return correct_answer
 
 
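A side effect of this hunk worth noting: the deleted condition in _extend_with_evaluation ended with a bare `or EvaluationType.auto_ai_critique` (no `evaluation_type ==` comparison), and a bare enum member is truthy, so the old branch effectively ran for every evaluation type. The snippet below is a small, self-contained demonstration using a trimmed stand-in for the enum, not the repo's actual module.

```python
from enum import Enum


class EvaluationType(str, Enum):
    auto_ai_critique = "auto_ai_critique"
    human_a_b_testing = "human_a_b_testing"
    single_model_test = "single_model_test"


evaluation_type = EvaluationType.human_a_b_testing

# Old shape of the condition: the final bare member makes it always truthy.
old_check = (
    evaluation_type == EvaluationType.single_model_test
    or EvaluationType.auto_ai_critique
)
print(bool(old_check))  # True, even though evaluation_type is human_a_b_testing

# New shape: only True for single_model_test.
print(evaluation_type == EvaluationType.single_model_test)  # False
```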
Expand Down Expand Up @@ -634,42 +595,6 @@ async def delete_evaluations(evaluation_ids: List[str], **user_org_data: dict) -
await evaluation.delete()


async def create_custom_code_evaluation(
payload: CreateCustomEvaluation, **user_org_data: dict
) -> str:
"""Save the custom evaluation code in the database.
Args:
payload (CreateCustomEvaluation): the required payload
Returns:
str: the custom evaluation id
"""

# Initialize custom evaluation instance
access = await check_access_to_app(
user_org_data=user_org_data, app_id=payload.app_id
)
if not access:
raise HTTPException(
status_code=403,
detail=f"You do not have access to this app: {payload.app_id}",
)
app = await db_manager.fetch_app_by_id(app_id=payload.app_id)
custom_eval = CustomEvaluationDB(
evaluation_name=payload.evaluation_name,
user=app.user,
organization=app.organization,
app=app,
python_code=payload.python_code,
created_at=datetime.utcnow(),
updated_at=datetime.utcnow(),
)

await custom_eval.create()
return str(custom_eval.id)


async def create_new_human_evaluation(
payload: NewHumanEvaluation, **user_org_data: dict
) -> EvaluationDB:
Expand Down
7 changes: 3 additions & 4 deletions agenta-backend/agenta_backend/services/results_service.py
@@ -1,13 +1,12 @@
 from agenta_backend.models.db_models import (
-    EvaluationScenarioDB,
     EvaluationDB,
     HumanEvaluationDB,
+    EvaluationScenarioDB,
     HumanEvaluationScenarioDB,
 )
 from agenta_backend.services import evaluation_service
 from agenta_backend.services import db_manager
-from agenta_backend.models.api.evaluation_model import EvaluationType
-from bson import ObjectId
 
+from beanie import PydanticObjectId as ObjectId
 
 
 async def fetch_results_for_evaluation(evaluation: HumanEvaluationDB):
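The only non-deletion change here swaps bson.ObjectId for Beanie's PydanticObjectId, aliased to the same name so call sites stay unchanged. PydanticObjectId is a Pydantic-aware ObjectId, so it validates inside Pydantic models and serializes to a string in JSON output, which plain bson.ObjectId does not do out of the box. The illustration below uses assumed, simplified models rather than the repo's real classes.

```python
from beanie import Document, PydanticObjectId as ObjectId
from pydantic import BaseModel


class ScenarioResultDB(Document):  # hypothetical document, for illustration only
    evaluation_id: ObjectId
    vote: str


class ScenarioResultOut(BaseModel):  # hypothetical response model
    # PydanticObjectId validates from a 24-char hex string and is
    # JSON-encoded as a string, unlike bson.ObjectId.
    id: ObjectId
    vote: str


# Query filters accept the same alias, e.g.:
# results = await ScenarioResultDB.find(
#     ScenarioResultDB.evaluation_id == ObjectId("655a1c0c8d5e6f0012345678")
# ).to_list()
```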
