Re-run evaluation #1455

Closed
wants to merge 28 commits
Changes from 25 commits
Commits (28)
886a122
add rerurn evaluation
aakrem Mar 19, 2024
81f3f9c
rerun functionality
MohammedMaaz Mar 20, 2024
6ec1b91
add updating status after rerunning
aakrem Mar 20, 2024
57c5b94
added started at and finished at for evaluation
aakrem Mar 20, 2024
a057ec4
add finished_at
aakrem Mar 20, 2024
9eb06f4
added started_at and finished_at fields to the frontend
MohammedMaaz Mar 21, 2024
f3a2766
add EvaluationParams in db router and service
aakrem Mar 21, 2024
756dd17
Merge branch 'main' into re-run-evaluation
aakrem Mar 21, 2024
49a13b3
remove comments
aakrem Mar 21, 2024
243b172
format
aakrem Mar 21, 2024
4c99599
fixes
aakrem Mar 21, 2024
f10e145
more fixes
aakrem Mar 21, 2024
e4157ca
fix typos and docstrings
aakrem Mar 21, 2024
49bc43b
add corrected answer
aakrem Mar 21, 2024
312d41f
replace created_at with started_at
aakrem Mar 21, 2024
f3521cb
add missing import
aakrem Mar 22, 2024
bc2dda1
add support for data without evaluation params
aakrem Mar 22, 2024
5e56814
fix and improve error message
aakrem Mar 25, 2024
b3eab68
fix format
aakrem Mar 25, 2024
8383b45
move updating an evaluation when rerunning to the evaluations service
aakrem Mar 25, 2024
3720c70
refactor evaluation fetching to use service layer for improved modula…
aakrem Mar 25, 2024
1ee9ff3
update comment
aakrem Mar 25, 2024
99538f3
format
aakrem Mar 25, 2024
8e140df
added fallback to created_at column value in eval results
MohammedMaaz Mar 25, 2024
4d95d54
fix imports
aakrem Mar 25, 2024
bad6c88
move evaluations ids to body
aakrem Mar 28, 2024
77b2007
fix sorting
aakrem Mar 28, 2024
e0c15bf
fix format
aakrem Mar 28, 2024
6 changes: 6 additions & 0 deletions agenta-backend/agenta_backend/models/api/evaluation_model.py
@@ -71,6 +71,8 @@ class Evaluation(BaseModel):
testset_name: Optional[str]
status: Result
aggregated_results: List[AggregatedResult]
started_at: Optional[datetime]
finished_at: Optional[datetime]
created_at: datetime
updated_at: datetime

@@ -249,6 +251,10 @@ class NewEvaluation(BaseModel):
correct_answer_column: Optional[str]


class RerunEvaluation(BaseModel):
lm_providers_keys: Optional[Dict[LMProvidersEnum, str]]


class NewEvaluatorConfig(BaseModel):
app_id: str
name: str
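For reference, a minimal sketch of the request body the new `RerunEvaluation` model accepts (not part of the diff; the provider key name is a placeholder, since the members of `LMProvidersEnum` are not shown in this PR):

```python
# Hypothetical body for the re-run endpoint, validated by RerunEvaluation.
# Only lm_providers_keys travels in the body; everything else is reloaded
# from the persisted evaluation parameters.
rerun_body = {
    "lm_providers_keys": {
        "OPENAI_API_KEY": "sk-...",  # placeholder provider key name
    }
}
```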
2 changes: 2 additions & 0 deletions agenta-backend/agenta_backend/models/converters.py
@@ -143,6 +143,8 @@ async def evaluation_db_to_pydantic(
else str(evaluation_db.testset.name)
),
aggregated_results=aggregated_results,
started_at=evaluation_db.started_at,
finished_at=evaluation_db.finished_at,
created_at=evaluation_db.created_at,
updated_at=evaluation_db.updated_at,
)
3 changes: 3 additions & 0 deletions agenta-backend/agenta_backend/models/db_engine.py
@@ -19,6 +19,7 @@
TestSetDB_ as TestSetDB,
AppVariantDB_ as AppVariantDB,
EvaluationDB_ as EvaluationDB,
EvaluationParamsDB_ as EvaluationParamsDB,
DeploymentDB_ as DeploymentDB,
VariantBaseDB_ as VariantBaseDB,
AppEnvironmentDB_ as AppEnvironmentDB,
@@ -35,6 +36,7 @@
ImageDB,
TestSetDB,
EvaluationDB,
EvaluationParamsDB,
DeploymentDB,
AppVariantDB,
VariantBaseDB,
@@ -65,6 +67,7 @@
AppVariantDB,
DeploymentDB,
EvaluationDB,
EvaluationParamsDB,
VariantBaseDB,
AppEnvironmentDB,
AppEnvironmentRevisionDB,
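For context, a rough sketch of how a document list like this is typically registered (an assumption about what `db_engine.py` does around these hunks; the function name and connection details below are illustrative):

```python
# Illustrative only: a new Beanie document such as EvaluationParamsDB must be
# included in the document_models passed to init_beanie, otherwise its
# "evaluations_params" collection is never set up.
from beanie import init_beanie
from motor.motor_asyncio import AsyncIOMotorClient


async def init_db(mongo_uri: str, db_name: str) -> None:
    client = AsyncIOMotorClient(mongo_uri)
    await init_beanie(
        database=client[db_name],
        document_models=[EvaluationDB, EvaluationParamsDB],  # plus the other models listed above
    )
```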
21 changes: 21 additions & 0 deletions agenta-backend/agenta_backend/models/db_models.py
@@ -257,6 +257,21 @@ class Settings:
name = "human_evaluations_scenarios"


class EvaluationParamsDB(Document):
app: Link[AppDB]
user: Link[UserDB]
testset_id: PydanticObjectId
variants_ids: List[PydanticObjectId]
evaluators_configs: List[PydanticObjectId]
rate_limit_config: dict
correct_answer_column: str
created_at: Optional[datetime] = Field(default=datetime.now())
updated_at: Optional[datetime] = Field(default=datetime.now())

class Settings:
name = "evaluations_params"


class EvaluationDB(Document):
app: Link[AppDB]
user: Link[UserDB]
@@ -266,6 +281,11 @@ class EvaluationDB(Document):
variant_revision: PydanticObjectId
evaluators_configs: List[PydanticObjectId]
aggregated_results: List[AggregatedResult]
rerun_count: int = Field(default=None)
started_at: Optional[datetime] = None
finished_at: Optional[datetime] = None
evaluation_params_id: Optional[PydanticObjectId] = None

created_at: datetime = Field(default=datetime.now())
updated_at: datetime = Field(default=datetime.now())

@@ -284,6 +304,7 @@ class EvaluationScenarioDB(Document):
note: Optional[str]
evaluators_configs: List[PydanticObjectId]
results: List[EvaluationScenarioResult]
rerun_count: int = Field(default=None)
created_at: datetime = Field(default=datetime.now())
updated_at: datetime = Field(default=datetime.now())

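A small sketch of how the new `evaluation_params_id` field ties the two documents together (illustrative helper, not part of the PR; the PR's own lookup lives in `db_manager.fetch_evaluation_params` further down):

```python
# Illustrative: resolve the persisted run parameters for an evaluation, if any.
# Evaluations created before this change carry no evaluation_params_id and
# therefore yield None.
from typing import Optional


async def params_for(evaluation: EvaluationDB) -> Optional[EvaluationParamsDB]:
    if evaluation.evaluation_params_id is None:
        return None
    return await EvaluationParamsDB.get(evaluation.evaluation_params_id)
```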
104 changes: 104 additions & 0 deletions agenta-backend/agenta_backend/routers/evaluation_router.py
@@ -1,5 +1,6 @@
import secrets
import logging
from datetime import datetime
from typing import Any, List

from fastapi.responses import JSONResponse
@@ -15,11 +16,17 @@
NewEvaluation,
DeleteEvaluation,
EvaluationWebhook,
RerunEvaluation,
EvaluationStatusEnum,
)
from agenta_backend.services.evaluator_manager import (
check_ai_critique_inputs,
)

from agenta_backend.models.db_models import (
Result,
)

if isCloudEE():
from agenta_backend.commons.models.db_models import Permission
from agenta_backend.commons.utils.permissions import check_action_access
@@ -125,12 +132,22 @@ async def create_evaluation(
else payload.correct_answer_column
)

evaluation_params = await evaluation_service.create_new_evaluation_params(
app_id=payload.app_id,
evaluator_config_ids=payload.evaluators_configs,
testset_id=payload.testset_id,
variants_ids=payload.variant_ids,
rate_limit_config=payload.rate_limit,
correct_answer_column=correct_answer_column,
)

for variant_id in payload.variant_ids:
evaluation = await evaluation_service.create_new_evaluation(
app_id=payload.app_id,
variant_id=variant_id,
evaluator_config_ids=payload.evaluators_configs,
testset_id=payload.testset_id,
evaluation_params_id=evaluation_params.id,
)

evaluate.delay(
@@ -145,6 +162,15 @@
)
evaluations.append(evaluation)

# In case we want to persist all evaluations' data so that a run can
# be rerun later exactly as the user created it (especially when
# multiple variants were selected), we would also need to update the
# evaluations_params with the evaluation ids, like:
# evaluation_service.update_evaluation_params(
#     evaluations_ids=[evaluation.id for evaluation in evaluations]
# )

return evaluations
except KeyError:
raise HTTPException(
@@ -153,6 +179,84 @@
)


@router.post("/re-run/{evaluation_ids}/", operation_id="re_run_evaluation")
async def re_run_evaluation(
evaluation_ids: str,
app_id: str,
payload: RerunEvaluation,
request: Request,
):
"""Re-runs the evaluations for the given evaluation IDs and increments their rerun count.
Raises:
HTTPException: If the app is not found or the user lacks permissions.
Returns:
HTTP response indicating the operation's outcome.
"""
try:
app = await db_manager.fetch_app_by_id(app_id)
if app is None:
raise HTTPException(status_code=404, detail="App not found")

if isCloudEE():
has_permission = await check_action_access(
user_uid=request.state.user_id,
object=app,
permission=Permission.CREATE_EVALUATION,
)
logger.debug(f"User has permission to create evaluation: {has_permission}")
if not has_permission:
error_msg = "You do not have permission to perform this action. Please contact your organization admin."
logger.error(error_msg)
return JSONResponse(
{"detail": error_msg},
status_code=403,
)

evaluation_ids = evaluation_ids.split(",")

for evaluation_id in evaluation_ids:
evaluation = await evaluation_service.get_evaluation_by_id(evaluation_id)
evaluation_params = await evaluation_service.fetch_evaluation_params(
evaluation.evaluation_params_id
)

if evaluation_params is None:
# because the correct answer column was not persisted, rerunning
# this evaluation with "correct_answer" as a value would result
# in errors. Hence we return an error here.
return JSONResponse(
{
"detail": "This is an old evaluation that cannot be rerun. Please select a newer evaluation!"
},
status_code=400,
)

await evaluation_service.update_on_evaluation_rerun(
evaluation_id=evaluation_id,
evaluation=evaluation,
)

evaluate.delay(
app_id=app_id,
variant_id=str(evaluation.variant),
evaluators_config_ids=[
str(config_id) for config_id in evaluation.evaluators_configs
],
testset_id=str(evaluation.testset.id),
evaluation_id=evaluation_id,
rate_limit_config=evaluation_params.rate_limit_config,
lm_providers_keys=payload.lm_providers_keys,
correct_answer_column=evaluation_params.correct_answer_column,
)

return Response(status_code=status.HTTP_200_OK)
except KeyError:
raise HTTPException(
status_code=400,
detail="columns in the test set should match the names of the inputs in the variant",
)


@router.get("/{evaluation_id}/status/", operation_id="fetch_evaluation_status")
async def fetch_evaluation_status(evaluation_id: str, request: Request):
"""Fetches the status of the evaluation.
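Assuming the router is mounted under an `/evaluations` prefix (the prefix is not shown in this diff), a re-run request would look roughly like the sketch below; the host, ids, and provider key name are placeholders:

```python
# Hypothetical client call for the new re-run endpoint.
import requests

evaluation_ids = ",".join(["<evaluation-id-1>", "<evaluation-id-2>"])  # the route splits this on ","
response = requests.post(
    f"http://localhost/api/evaluations/re-run/{evaluation_ids}/",
    params={"app_id": "<app-id>"},  # app_id is a query parameter, not part of the path
    json={"lm_providers_keys": {"OPENAI_API_KEY": "sk-..."}},  # RerunEvaluation body
)
response.raise_for_status()  # 400 for evaluations without persisted params, 403 without permission
```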
78 changes: 75 additions & 3 deletions agenta-backend/agenta_backend/services/db_manager.py
@@ -31,6 +31,7 @@
TestSetDB_ as TestSetDB,
AppVariantDB_ as AppVariantDB,
EvaluationDB_ as EvaluationDB,
EvaluationParamsDB_ as EvaluationParamsDB,
DeploymentDB_ as DeploymentDB,
VariantBaseDB_ as VariantBaseDB,
AppEnvironmentDB_ as AppEnvironmentDB,
@@ -49,6 +50,7 @@
TestSetDB,
AppVariantDB,
EvaluationDB,
EvaluationParamsDB,
DeploymentDB,
VariantBaseDB,
AppEnvironmentDB,
@@ -1818,10 +1820,14 @@ async def create_new_evaluation(
evaluators_configs: List[str],
organization=None,
workspace=None,
started_at: Optional[datetime] = None,
finished_at: Optional[datetime] = None,
evaluation_params_id: Optional[ObjectId] = None,
) -> EvaluationDB:
"""Create a new evaluation scenario.

Returns:
EvaluationScenarioDB: The created evaluation scenario.
EvaluationDB: The created evaluation.
"""
evaluation = EvaluationDB(
app=app,
Expand All @@ -1832,8 +1838,9 @@ async def create_new_evaluation(
variant_revision=variant_revision,
evaluators_configs=evaluators_configs,
aggregated_results=[],
created_at=datetime.now().isoformat(),
updated_at=datetime.now().isoformat(),
started_at=started_at,
finished_at=finished_at,
evaluation_params_id=evaluation_params_id,
)

if isCloudEE():
Expand All @@ -1849,6 +1856,69 @@ async def create_new_evaluation(
return evaluation


async def create_new_evaluation_params(
app: AppDB,
testset_id: str,
variants_ids: List[str],
evaluators_configs: List[str],
rate_limit_config: dict,
correct_answer_column: str,
user: UserDB,
organization=None,
workspace=None,
) -> EvaluationParamsDB:
"""
Create new evaluation parameters.

Args:
app (AppDB): The app associated with the evaluation parameters.
testset_id (str): The ID of the testset.
variants_ids (List[str]): A list of IDs for the variants.
evaluators_configs (List[str]): A list of evaluator configuration IDs.
rate_limit_config (dict): The rate limit configuration.
user (UserDB): The user associated with the evaluation.
organization: The organization associated with the evaluation, if applicable.
workspace: The workspace associated with the evaluation, if applicable.

Returns:
EvaluationParamsDB: The created evaluation parameters.
"""
evaluation_params = EvaluationParamsDB(
app=app,
user=user,
testset_id=testset_id,
variants_ids=variants_ids,
evaluators_configs=evaluators_configs,
rate_limit_config=rate_limit_config,
correct_answer_column=correct_answer_column,
)

if isCloudEE():
assert (
organization is not None and workspace is not None
), "organization and workspace must be provided together"
evaluation_params.organization = organization
evaluation_params.workspace = workspace

await evaluation_params.create()
return evaluation_params


async def fetch_evaluation_params(evaluation_params_id: str) -> EvaluationParamsDB:
"""
Fetches evaluation parameters by their ID.

Args:
evaluation_params_id (str): The ID of the evaluation parameters to fetch.

Returns:
EvaluationParamsDB: The fetched evaluation parameters.
"""
return await EvaluationParamsDB.find_one(
EvaluationParamsDB.id == ObjectId(evaluation_params_id)
)


async def create_new_evaluation_scenario(
user: UserDB,
evaluation: EvaluationDB,
@@ -1862,6 +1932,7 @@ async def create_new_evaluation_scenario(
results: List[EvaluationScenarioResult],
organization=None,
workspace=None,
rerun_count: Optional[int] = 0,
) -> EvaluationScenarioDB:
"""Create a new evaluation scenario.
Returns:
@@ -1878,6 +1949,7 @@
note=note,
evaluators_configs=evaluators_configs,
results=results,
rerun_count=rerun_count,
)

if isCloudEE():
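A minimal round-trip sketch of the two new helpers (assumed usage, not code from the PR; `app_db` and `user_db` stand for already-fetched AppDB/UserDB documents, and the ids and rate-limit contents are placeholders):

```python
# Inside an async service function:
# create the params document when an evaluation run starts ...
params = await create_new_evaluation_params(
    app=app_db,
    user=user_db,
    testset_id="<testset-id>",
    variants_ids=["<variant-id>"],
    evaluators_configs=["<evaluator-config-id>"],
    rate_limit_config={"batch_size": 10},  # stored as a plain dict; contents are placeholders
    correct_answer_column="correct_answer",
)

# ... and load it again by id when the evaluation is re-run later.
same_params = await fetch_evaluation_params(str(params.id))
assert same_params.correct_answer_column == "correct_answer"
```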