Latency cost in eval #1468

Merged
merged 31 commits on Mar 31, 2024
31 commits
513ecdc
add average cost and latency to evaluation schema
aakrem Mar 28, 2024
f9e8269
add cost and latency to all models and related methods
aakrem Mar 28, 2024
1082d8b
add aggregate method for the llm response latency & cost
aakrem Mar 28, 2024
484f0f5
adjust result from llm response to contain cost and latency
aakrem Mar 28, 2024
20609b9
add average cost and latency to the evaluation
aakrem Mar 28, 2024
43eb333
cost and latency columns
aakrem Mar 28, 2024
55d7c0a
fixes
aakrem Mar 28, 2024
6b5eb60
improve the getTypedValue with new types
aakrem Mar 28, 2024
31882df
formatters utils | evaluators link fixed
MohammedMaaz Mar 18, 2024
87ed91f
use Maaz currency and latency helpers
aakrem Mar 28, 2024
3147577
add latency and cost to models
aakrem Mar 28, 2024
9847da5
fix aggregation
aakrem Mar 28, 2024
c382959
adjust schema
aakrem Mar 28, 2024
f236aed
adjust EvaluationScenarioOutputDB
aakrem Mar 28, 2024
da220cb
fixes
aakrem Mar 28, 2024
361b6d7
format
aakrem Mar 28, 2024
5042b9d
handle null values for cost and latency
aakrem Mar 28, 2024
7c28565
revert change
aakrem Mar 28, 2024
ccda053
fix types
aakrem Mar 28, 2024
5b606e4
add cost and latency in eval scenario
aakrem Mar 28, 2024
1a94867
remove old implementation code
aakrem Mar 29, 2024
52fb907
fixed failing cypress tests
bekossy Mar 29, 2024
9eec347
Merge branch 'latency-cost-in-eval' of https://github.com/Agenta-AI/a…
bekossy Mar 29, 2024
29e0989
handle optional latency and cost in app response
aakrem Mar 29, 2024
f65424a
add latency and cost to comparison view
aakrem Mar 29, 2024
21b42bb
add latency and cost to comparison view
aakrem Mar 29, 2024
d9661af
format
aakrem Mar 29, 2024
cf87d94
another fix
aakrem Mar 29, 2024
82ecf54
fix types
aakrem Mar 29, 2024
9478392
fix formatter
aakrem Mar 29, 2024
e49dfb1
bumped ag-grid version
bekossy Mar 31, 2024
4 changes: 4 additions & 0 deletions agenta-backend/agenta_backend/models/api/evaluation_model.py
@@ -71,6 +71,8 @@ class Evaluation(BaseModel):
testset_name: Optional[str]
status: Result
aggregated_results: List[AggregatedResult]
average_cost: Optional[Result]
average_latency: Optional[Result]
created_at: datetime
updated_at: datetime

@@ -100,6 +102,8 @@ class EvaluationScenarioInput(BaseModel):

class EvaluationScenarioOutput(BaseModel):
result: Result
cost: Optional[float]
latency: Optional[float]


class HumanEvaluationScenarioInput(BaseModel):
2 changes: 2 additions & 0 deletions agenta-backend/agenta_backend/models/converters.py
@@ -145,6 +145,8 @@ async def evaluation_db_to_pydantic(
aggregated_results=aggregated_results,
created_at=evaluation_db.created_at,
updated_at=evaluation_db.updated_at,
average_cost=evaluation_db.average_cost,
average_latency=evaluation_db.average_latency,
)


8 changes: 8 additions & 0 deletions agenta-backend/agenta_backend/models/db_models.py
@@ -193,6 +193,8 @@ class Result(BaseModel):

class InvokationResult(BaseModel):
result: Result
cost: Optional[float] = None
latency: Optional[float] = None


class EvaluationScenarioResult(BaseModel):
@@ -213,6 +215,8 @@ class EvaluationScenarioInputDB(BaseModel):

class EvaluationScenarioOutputDB(BaseModel):
result: Result
cost: Optional[float] = None
latency: Optional[float] = None


class HumanEvaluationScenarioInput(BaseModel):
@@ -266,6 +270,8 @@ class EvaluationDB(Document):
variant_revision: PydanticObjectId
evaluators_configs: List[PydanticObjectId]
aggregated_results: List[AggregatedResult]
average_cost: Optional[Result] = None
average_latency: Optional[Result] = None
created_at: datetime = Field(default=datetime.now())
updated_at: datetime = Field(default=datetime.now())

@@ -284,6 +290,8 @@ class EvaluationScenarioDB(Document):
note: Optional[str]
evaluators_configs: List[PydanticObjectId]
results: List[EvaluationScenarioResult]
latency: Optional[int] = None
cost: Optional[int] = None
created_at: datetime = Field(default=datetime.now())
updated_at: datetime = Field(default=datetime.now())

30 changes: 28 additions & 2 deletions agenta-backend/agenta_backend/services/aggregation_service.py
@@ -1,8 +1,8 @@
import re
import traceback
from typing import List
from typing import List, Optional

from agenta_backend.models.db_models import Result, Error
from agenta_backend.models.db_models import InvokationResult, Result, Error


def aggregate_ai_critique(results: List[Result]) -> Result:
@@ -73,3 +73,29 @@ def aggregate_float(results: List[Result]) -> Result:
value=None,
error=Error(message=str(exc), stacktrace=str(traceback.format_exc())),
)


def aggregate_float_from_llm_app_response(
invocation_results: List[InvokationResult], key: Optional[str]
) -> Result:
try:
if not key:
raise ValueError("Key is required to aggregate InvokationResult objects.")

values = [
getattr(inv_result, key)
for inv_result in invocation_results
if hasattr(inv_result, key) and getattr(inv_result, key) is not None
]

if not values:
raise ValueError(f"No valid values found for {key} aggregation.")

average_value = sum(values) / len(values)
return Result(type=key, value=average_value)
except Exception as exc:
return Result(
type="error",
value=None,
error=Error(message=str(exc), stacktrace=str(traceback.format_exc())),
)
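
A minimal usage sketch of the new aggregator, assuming the agenta_backend package is importable (for example, inside the backend container); the per-call latency and cost values are hypothetical:

from agenta_backend.models.db_models import InvokationResult, Result
from agenta_backend.services.aggregation_service import (
    aggregate_float_from_llm_app_response,
)

# Two hypothetical invocations with per-call latency and cost attached.
invocations = [
    InvokationResult(
        result=Result(type="text", value="answer A", error=None),
        latency=1.2,
        cost=0.0021,
    ),
    InvokationResult(
        result=Result(type="text", value="answer B", error=None),
        latency=0.8,
        cost=0.0017,
    ),
]

# Average a single attribute ("latency" or "cost") across all invocations.
avg_latency = aggregate_float_from_llm_app_response(invocations, "latency")
avg_cost = aggregate_float_from_llm_app_response(invocations, "cost")
# avg_latency -> Result(type="latency", value=1.0)
# avg_cost    -> Result(type="cost", value=0.0019)
# A missing key or no usable values returns a Result of type "error" with the
# exception captured, rather than raising.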
10 changes: 8 additions & 2 deletions agenta-backend/agenta_backend/services/llm_apps_service.py
@@ -86,9 +86,15 @@ async def invoke_app(
url, json=payload, timeout=httpx.Timeout(timeout=5, read=None, write=5)
)
response.raise_for_status()
app_output = response.json()
app_response = response.json()
return InvokationResult(
result=Result(type="text", value=app_output["message"], error=None)
result=Result(
type="text",
value=app_response["message"],
error=None,
),
latency=app_response["latency"],
cost=app_response["cost"],
)

except httpx.HTTPStatusError as e:
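
For reference, a sketch of the app response shape that invoke_app now reads; the message, cost, and latency keys mirror the lookups above, the values are made up, and units are whatever the deployed app reports. As written here the keys are read unconditionally; a later commit in this PR ("handle optional latency and cost in app response") deals with missing values.

from agenta_backend.models.db_models import InvokationResult, Result

# Hypothetical JSON body returned by the invoked LLM app.
app_response = {
    "message": "The capital of France is Paris.",
    "cost": 0.00042,
    "latency": 1.37,
}

# invoke_app wraps that payload roughly like this:
invocation = InvokationResult(
    result=Result(type="text", value=app_response["message"], error=None),
    latency=app_response["latency"],
    cost=app_response["cost"],
)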
25 changes: 23 additions & 2 deletions agenta-backend/agenta_backend/tasks/evaluations.py
@@ -218,9 +218,14 @@ def evaluate(
for evaluator_config_db in evaluator_config_dbs:
logger.debug(f"Evaluating with evaluator: {evaluator_config_db}")
if correct_answer_column in data_point:
output_value = (
app_output.result.value["output"]
if isinstance(app_output.result.value, dict)
else app_output.result.value
)
result = evaluators_service.evaluate(
evaluator_key=evaluator_config_db.evaluator_key,
output=app_output.result.value,
output=output_value,
correct_answer=data_point[correct_answer_column],
settings_values=evaluator_config_db.settings_values,
app_params=app_variant_parameters,
@@ -267,7 +272,9 @@ def evaluate(
correct_answer=correct_answer,
outputs=[
EvaluationScenarioOutputDB(
result=Result(type="text", value=app_output.result.value)
result=Result(type="text", value=app_output.result.value),
latency=app_output.latency,
cost=app_output.cost,
)
],
results=evaluators_results,
@@ -276,6 +283,20 @@ def evaluate(
)
)

# Add average cost and latency
average_latency = aggregation_service.aggregate_float_from_llm_app_response(
app_outputs, "latency"
)
average_cost = aggregation_service.aggregate_float_from_llm_app_response(
app_outputs, "cost"
)
loop.run_until_complete(
update_evaluation(
evaluation_id,
{"average_latency": average_latency, "average_cost": average_cost},
)
)

except Exception as e:
logger.error(f"An error occurred during evaluation: {e}")
traceback.print_exc()
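
The output unwrapping added above accepts either a bare string or a dict carrying an "output" key before handing the value to the evaluators. A small self-contained sketch of that behaviour, with hypothetical payloads:

# Mirrors the unwrapping added in evaluate(): dict responses expose the text
# under an "output" key, plain string responses are used as-is.
def unwrap(value):
    return value["output"] if isinstance(value, dict) else value

plain_value = "Paris"                 # app returned a bare string
dict_value = {"output": "Paris"}      # app returned a structured payload

assert unwrap(plain_value) == "Paris"
assert unwrap(dict_value) == "Paris"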
@@ -246,6 +246,22 @@ const EvaluationResults: React.FC<Props> = () => {
statusMapper(token)[params.data?.status.value as EvaluationStatus].label,
cellRenderer: StatusRenderer,
},
{
flex: 1,
field: "average_latency",
headerName: "Latency",
minWidth: 120,
...getFilterParams("number"),
valueGetter: (params) => getTypedValue(params?.data?.average_latency),
},
{
flex: 1,
field: "average_cost",
headerName: "Cost",
minWidth: 120,
...getFilterParams("number"),
valueGetter: (params) => getTypedValue(params?.data?.average_cost),
},
{
flex: 1,
field: "created_at",
@@ -23,6 +23,7 @@ import {useAtom} from "jotai"
import {evaluatorsAtom} from "@/lib/atoms/evaluation"
import CompareOutputDiff from "@/components/CompareOutputDiff/CompareOutputDiff"
import {useQueryParam} from "@/hooks/useQuery"
import {formatCurrency, formatLatency} from "@/lib/helpers/formatters"

const useStyles = createUseStyles((theme: JSSTheme) => ({
infoRow: {
@@ -139,6 +140,29 @@ const EvaluationScenarios: React.FC<Props> = () => {
},
})
})
colDefs.push({
flex: 1,
minWidth: 120,
headerName: "Cost",
...getFilterParams("text"),
valueGetter: (params) => {
return params.data?.outputs[0].cost == undefined
? "-"
: formatCurrency(params.data.outputs[0].cost)
},
})

colDefs.push({
flex: 1,
minWidth: 120,
headerName: "Latency",
...getFilterParams("text"),
valueGetter: (params) => {
return params.data?.outputs[0].latency == undefined
? "-"
: formatLatency(params.data.outputs[0].latency)
},
})
return colDefs
}, [evalaution, scenarios, showDiff])

6 changes: 5 additions & 1 deletion agenta-web/src/lib/Types.ts
@@ -363,6 +363,8 @@ type ValueTypeOptions =
| "regex"
| "object"
| "error"
| "cost"
| "latency"

//evaluation revamp types
export interface EvaluationSettingsTemplate {
@@ -442,6 +444,8 @@ export interface _Evaluation {
updated_at?: string
duration?: number
revisions: string[]
average_latency?: TypedValue & {error: null | EvaluationError}
average_cost?: TypedValue & {error: null | EvaluationError}
variant_revision_ids: string[]
}

Expand All @@ -451,7 +455,7 @@ export interface _EvaluationScenario {
evaluation: _Evaluation
evaluators_configs: EvaluatorConfig[]
inputs: (TypedValue & {name: string})[]
outputs: {result: TypedValue}[]
outputs: {result: TypedValue; cost?: number; latency?: number}[]
correct_answer?: string
is_pinned?: boolean
note?: string
4 changes: 4 additions & 0 deletions agenta-web/src/lib/helpers/dateTimeHelper.ts
@@ -3,3 +3,7 @@ import dayjs from "dayjs"
export const formatDate = (date: dayjs.ConfigType): string => {
return dayjs(date).format("DD MMM YYYY | h:m a")
}

export const formatDate24 = (date: dayjs.ConfigType, includeSeconds = false): string => {
return dayjs(date).format("DD MMM YY, HH:mm" + (includeSeconds ? ":ss" : ""))
}
19 changes: 14 additions & 5 deletions agenta-web/src/lib/helpers/evaluate.ts
@@ -14,6 +14,7 @@ import AlertPopup from "@/components/AlertPopup/AlertPopup"
import {capitalize, round} from "lodash"
import dayjs from "dayjs"
import {runningStatuses} from "@/components/pages/evaluations/cellRenderers/cellRenderers"
import {formatCurrency, formatLatency} from "./formatters"

export const exportExactEvaluationData = (evaluation: Evaluation, rows: GenericObject[]) => {
const exportRow = rows.map((data, ix) => {
@@ -269,11 +270,19 @@ export function getTypedValue(res?: TypedValue) {

if (value === undefined) return "-"

return type === "number"
? round(Number(value), 2)
: ["boolean", "bool"].includes(type as string)
? capitalize(value?.toString())
: value?.toString()
switch (type) {
case "number":
return round(Number(value), 2)
case "boolean":
case "bool":
return capitalize(value?.toString())
case "cost":
return formatCurrency(Number(value))
case "latency":
return formatLatency(Number(value))
default:
return value?.toString()
}
}

type CellDataType = "number" | "text" | "date"
21 changes: 21 additions & 0 deletions agenta-web/src/lib/helpers/formatters.ts
@@ -0,0 +1,21 @@
const intlNumber = new Intl.NumberFormat("en-US", {
maximumFractionDigits: 2,
})

const intlCurrency = new Intl.NumberFormat("en-US", {
style: "currency",
currency: "USD",
maximumFractionDigits: 4,
})

export const formatNumber = (value = 0) => {
return intlNumber.format(value)
}

export const formatCurrency = (value = 0) => {
return intlCurrency.format(value)
}

export const formatLatency = (value = 0) => {
return `${intlNumber.format(value / 1000)}s`
}
2 changes: 2 additions & 0 deletions agenta-web/src/services/evaluations/index.ts
@@ -104,6 +104,8 @@ const evaluationTransformer = (item: any) => ({
revisions: item.revisions,
variant_revision_ids: item.variant_revision_ids,
variant_ids: item.variant_ids,
average_cost: item.average_cost,
average_latency: item.average_latency,
})
export const fetchAllEvaluations = async (appId: string) => {
const response = await axios.get(`/api/evaluations/`, {params: {app_id: appId}})