diff --git a/agenta-backend/agenta_backend/models/api/evaluation_model.py b/agenta-backend/agenta_backend/models/api/evaluation_model.py index 4549e85842..2016d49bc2 100644 --- a/agenta-backend/agenta_backend/models/api/evaluation_model.py +++ b/agenta-backend/agenta_backend/models/api/evaluation_model.py @@ -72,6 +72,7 @@ class Evaluation(BaseModel): status: Result aggregated_results: List[AggregatedResult] average_cost: Optional[Result] + total_cost: Optional[Result] average_latency: Optional[Result] created_at: datetime updated_at: datetime diff --git a/agenta-backend/agenta_backend/models/converters.py b/agenta-backend/agenta_backend/models/converters.py index 2e13547ff5..541462c3b0 100644 --- a/agenta-backend/agenta_backend/models/converters.py +++ b/agenta-backend/agenta_backend/models/converters.py @@ -141,6 +141,7 @@ async def evaluation_db_to_pydantic( created_at=evaluation_db.created_at, updated_at=evaluation_db.updated_at, average_cost=evaluation_db.average_cost, + total_cost=evaluation_db.total_cost, average_latency=evaluation_db.average_latency, ) diff --git a/agenta-backend/agenta_backend/models/db_models.py b/agenta-backend/agenta_backend/models/db_models.py index f7dcc49ba9..e67c8b42ce 100644 --- a/agenta-backend/agenta_backend/models/db_models.py +++ b/agenta-backend/agenta_backend/models/db_models.py @@ -269,6 +269,7 @@ class EvaluationDB(Document): evaluators_configs: List[PydanticObjectId] aggregated_results: List[AggregatedResult] average_cost: Optional[Result] = None + total_cost: Optional[Result] = None average_latency: Optional[Result] = None created_at: datetime = Field(default=datetime.now(timezone.utc)) updated_at: datetime = Field(default=datetime.now(timezone.utc)) diff --git a/agenta-backend/agenta_backend/services/aggregation_service.py b/agenta-backend/agenta_backend/services/aggregation_service.py index b459766301..d1120f1b53 100644 --- a/agenta-backend/agenta_backend/services/aggregation_service.py +++ 
def sum_float_from_llm_app_response(
    invocation_results: List[InvokationResult], key: Optional[str]
) -> Result:
    """Sum one attribute across a list of invocation results.

    Args:
        invocation_results: Objects from which the attribute named ``key``
            is read. Items that lack the attribute, or whose value for it
            is ``None``, are ignored.
        key: Name of the attribute to sum (the caller in the evaluation
            task passes ``"cost"``).

    Returns:
        ``Result(type=key, value=<sum>)`` on success. This function never
        raises: if ``key`` is empty, if no item yields a usable value, or if
        anything else fails while summing, it returns a ``Result`` with
        ``type="error"``, ``value=None`` and an ``Error`` carrying the
        exception message and the formatted stack trace.
    """
    try:
        if not key:
            raise ValueError("Key is required to aggregate InvokationResult objects.")

        # A single getattr with a None default replaces the former
        # hasattr + double getattr: missing attributes and explicit None
        # values are both filtered by the same `is not None` check.
        looked_up = (getattr(inv_result, key, None) for inv_result in invocation_results)
        values = [value for value in looked_up if value is not None]

        if not values:
            raise ValueError(f"No valid values found for {key} sum aggregation.")

        return Result(type=key, value=sum(values))
    except Exception as exc:
        # Deliberate catch-all: failures are reported to the caller as an
        # error Result rather than propagated.
        return Result(
            type="error",
            value=None,
            error=Error(message=str(exc), stacktrace=str(traceback.format_exc())),
        )
aggregated_results: typing.List[AggregatedResult] average_cost: typing.Optional[Result] + total_cost: typing.Optional[Result] average_latency: typing.Optional[Result] created_at: dt.datetime updated_at: dt.datetime diff --git a/agenta-web/src/components/pages/evaluations/evaluationResults/EvaluationResults.tsx b/agenta-web/src/components/pages/evaluations/evaluationResults/EvaluationResults.tsx index 8ea9dc84db..bfb7965239 100644 --- a/agenta-web/src/components/pages/evaluations/evaluationResults/EvaluationResults.tsx +++ b/agenta-web/src/components/pages/evaluations/evaluationResults/EvaluationResults.tsx @@ -289,7 +289,7 @@ const EvaluationResults: React.FC = () => { { flex: 1, field: "average_latency", - headerName: "Latency", + headerName: "Avg. Latency", hide: hiddenCols.includes("Latency"), minWidth: 120, ...getFilterParams("number"), @@ -297,12 +297,12 @@ const EvaluationResults: React.FC = () => { }, { flex: 1, - field: "average_cost", - headerName: "Cost", + field: "total_cost", + headerName: "Total Cost", hide: hiddenCols.includes("Cost"), minWidth: 120, ...getFilterParams("number"), - valueGetter: (params) => getTypedValue(params?.data?.average_cost), + valueGetter: (params) => getTypedValue(params?.data?.total_cost), }, { flex: 1, diff --git a/agenta-web/src/lib/Types.ts b/agenta-web/src/lib/Types.ts index 8a981fa524..a5d9bb95f1 100644 --- a/agenta-web/src/lib/Types.ts +++ b/agenta-web/src/lib/Types.ts @@ -409,6 +409,7 @@ export interface _Evaluation { revisions: string[] average_latency?: TypedValue & {error: null | EvaluationError} average_cost?: TypedValue & {error: null | EvaluationError} + total_cost?: TypedValue & {error: null | EvaluationError} variant_revision_ids: string[] } diff --git a/agenta-web/src/services/evaluations/index.ts b/agenta-web/src/services/evaluations/index.ts index 7a2089544f..c672840052 100644 --- a/agenta-web/src/services/evaluations/index.ts +++ b/agenta-web/src/services/evaluations/index.ts @@ -105,6 +105,7 @@ const 
evaluationTransformer = (item: any) => ({ variant_revision_ids: item.variant_revision_ids, variant_ids: item.variant_ids, average_cost: item.average_cost, + total_cost: item.total_cost, average_latency: item.average_latency, }) export const fetchAllEvaluations = async (appId: string) => {