
Latency cost in eval #1468

Merged: 31 commits, merged Mar 31, 2024

Commits (31)
513ecdc
add average cost and latency to evaluation schema
aakrem Mar 28, 2024
f9e8269
add cost and latency to all models and related methods
aakrem Mar 28, 2024
1082d8b
add aggregate method for the llm response latency & cost
aakrem Mar 28, 2024
484f0f5
adjust result from llm response to contain cost and latency
aakrem Mar 28, 2024
20609b9
add average cost and latency to the evaluation
aakrem Mar 28, 2024
43eb333
cost and latency columns
aakrem Mar 28, 2024
55d7c0a
fixes
aakrem Mar 28, 2024
6b5eb60
improve the getTypedValue with new types
aakrem Mar 28, 2024
31882df
formatters utils | evaluators link fixed
MohammedMaaz Mar 18, 2024
87ed91f
use Maaz currency and latency helpers
aakrem Mar 28, 2024
3147577
add latency and cost to models
aakrem Mar 28, 2024
9847da5
fix aggregation
aakrem Mar 28, 2024
c382959
adjust schema
aakrem Mar 28, 2024
f236aed
adjust EvaluationScenarioOutputDB
aakrem Mar 28, 2024
da220cb
fixes
aakrem Mar 28, 2024
361b6d7
format
aakrem Mar 28, 2024
5042b9d
handle null values for cost and latency
aakrem Mar 28, 2024
7c28565
revert change
aakrem Mar 28, 2024
ccda053
fix types
aakrem Mar 28, 2024
5b606e4
add cost and latency in eval scenario
aakrem Mar 28, 2024
1a94867
remove old implementation code
aakrem Mar 29, 2024
52fb907
fixed failing cypress tests
bekossy Mar 29, 2024
9eec347
Merge branch 'latency-cost-in-eval' of https://github.com/Agenta-AI/a…
bekossy Mar 29, 2024
29e0989
handle optional latency and cost in app response
aakrem Mar 29, 2024
f65424a
add latency and cost to comparison view
aakrem Mar 29, 2024
21b42bb
add latency and cost to comparison view
aakrem Mar 29, 2024
d9661af
format
aakrem Mar 29, 2024
cf87d94
another fix
aakrem Mar 29, 2024
82ecf54
fix types
aakrem Mar 29, 2024
9478392
fix formatter
aakrem Mar 29, 2024
e49dfb1
bumped ag-grid version
bekossy Mar 31, 2024
Files changed
4 changes: 4 additions & 0 deletions agenta-backend/agenta_backend/models/api/evaluation_model.py
@@ -71,6 +71,8 @@ class Evaluation(BaseModel):
testset_name: Optional[str]
status: Result
aggregated_results: List[AggregatedResult]
average_cost: Optional[Result]
average_latency: Optional[Result]
created_at: datetime
updated_at: datetime

@@ -100,6 +102,8 @@ class EvaluationScenarioInput(BaseModel):

class EvaluationScenarioOutput(BaseModel):
result: Result
cost: Optional[float]
latency: Optional[float]


class HumanEvaluationScenarioInput(BaseModel):
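For orientation, a hypothetical fragment showing how the new optional fields added above could appear once serialized: average_cost and average_latency are Result objects (typed "cost" and "latency", as produced by the aggregation service further down), while each scenario output additionally carries raw cost and latency floats. The values are illustrative, not taken from the PR.

# Hypothetical serialized fragments (illustrative values only); assumes Result
# serializes to {"type", "value", "error"} as used elsewhere in this PR.
evaluation_fragment = {
    "average_cost": {"type": "cost", "value": 0.0021, "error": None},
    "average_latency": {"type": "latency", "value": 1.37, "error": None},
}
scenario_output_fragment = {
    "result": {"type": "text", "value": "The capital of France is Paris.", "error": None},
    "cost": 0.00042,
    "latency": 1.8,
}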
2 changes: 2 additions & 0 deletions agenta-backend/agenta_backend/models/converters.py
@@ -145,6 +145,8 @@ async def evaluation_db_to_pydantic(
aggregated_results=aggregated_results,
created_at=evaluation_db.created_at,
updated_at=evaluation_db.updated_at,
average_cost=evaluation_db.average_cost,
average_latency=evaluation_db.average_latency,
)


8 changes: 8 additions & 0 deletions agenta-backend/agenta_backend/models/db_models.py
@@ -193,6 +193,8 @@ class Result(BaseModel):

class InvokationResult(BaseModel):
result: Result
cost: Optional[float] = None
latency: Optional[float] = None


class EvaluationScenarioResult(BaseModel):
@@ -213,6 +215,8 @@ class EvaluationScenarioInputDB(BaseModel):

class EvaluationScenarioOutputDB(BaseModel):
result: Result
cost: Optional[float] = None
latency: Optional[float] = None


class HumanEvaluationScenarioInput(BaseModel):
@@ -266,6 +270,8 @@ class EvaluationDB(Document):
variant_revision: PydanticObjectId
evaluators_configs: List[PydanticObjectId]
aggregated_results: List[AggregatedResult]
average_cost: Optional[Result] = None
average_latency: Optional[Result] = None
created_at: datetime = Field(default=datetime.now())
updated_at: datetime = Field(default=datetime.now())

@@ -284,6 +290,8 @@ class EvaluationScenarioDB(Document):
note: Optional[str]
evaluators_configs: List[PydanticObjectId]
results: List[EvaluationScenarioResult]
latency: Optional[int] = None
cost: Optional[int] = None
created_at: datetime = Field(default=datetime.now())
updated_at: datetime = Field(default=datetime.now())

30 changes: 28 additions & 2 deletions agenta-backend/agenta_backend/services/aggregation_service.py
@@ -1,8 +1,8 @@
import re
import traceback
from typing import List
from typing import List, Optional

from agenta_backend.models.db_models import Result, Error
from agenta_backend.models.db_models import InvokationResult, Result, Error


def aggregate_ai_critique(results: List[Result]) -> Result:
@@ -73,3 +73,29 @@ def aggregate_float(results: List[Result]) -> Result:
value=None,
error=Error(message=str(exc), stacktrace=str(traceback.format_exc())),
)


def aggregate_float_from_llm_app_response(
invocation_results: List[InvokationResult], key: Optional[str]
) -> Result:
try:
if not key:
raise ValueError("Key is required to aggregate InvokationResult objects.")

values = [
getattr(inv_result, key)
for inv_result in invocation_results
if hasattr(inv_result, key) and getattr(inv_result, key) is not None
]

if not values:
raise ValueError(f"No valid values found for {key} aggregation.")

average_value = sum(values) / len(values)
return Result(type=key, value=average_value)
except Exception as exc:
return Result(
type="error",
value=None,
error=Error(message=str(exc), stacktrace=str(traceback.format_exc())),
)
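A minimal usage sketch of the new aggregator, assuming the models import as shown in this diff: invocation results missing a value for the requested key are skipped, and the returned Result carries the key as its type.

# Minimal sketch (import paths assumed to follow the repo layout shown in this diff).
from agenta_backend.models.db_models import InvokationResult, Result
from agenta_backend.services.aggregation_service import (
    aggregate_float_from_llm_app_response,
)

invocations = [
    InvokationResult(result=Result(type="text", value="a"), latency=1.2, cost=0.003),
    InvokationResult(result=Result(type="text", value="b"), latency=0.8, cost=0.001),
    InvokationResult(result=Result(type="text", value="c")),  # no latency/cost: skipped
]

print(aggregate_float_from_llm_app_response(invocations, "latency").value)  # 1.0
print(aggregate_float_from_llm_app_response(invocations, "cost").value)     # 0.002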
10 changes: 8 additions & 2 deletions agenta-backend/agenta_backend/services/llm_apps_service.py
@@ -86,9 +86,15 @@ async def invoke_app(
url, json=payload, timeout=httpx.Timeout(timeout=5, read=None, write=5)
)
response.raise_for_status()
app_output = response.json()
app_response = response.json()
return InvokationResult(
result=Result(type="text", value=app_output["message"], error=None)
result=Result(
type="text",
value=app_response["message"],
error=None,
),
latency=app_response.get("latency"),
cost=app_response.get("cost"),
)

except httpx.HTTPStatusError as e:
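The new fields are read defensively with .get(), so LLM apps that do not yet report them keep working. A response that does include them might look like the following sketch (values are illustrative, not from the PR).

# Hypothetical LLM app response consumed by invoke_app (illustrative values).
app_response = {
    "message": "The capital of France is Paris.",
    "cost": 0.00042,   # optional; mapped to InvokationResult.cost
    "latency": 1.8,    # optional; mapped to InvokationResult.latency
}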
18 changes: 17 additions & 1 deletion agenta-backend/agenta_backend/tasks/evaluations.py
@@ -267,7 +267,9 @@ def evaluate(
correct_answer=correct_answer,
outputs=[
EvaluationScenarioOutputDB(
result=Result(type="text", value=app_output.result.value)
result=Result(type="text", value=app_output.result.value),
latency=app_output.latency,
cost=app_output.cost,
)
],
results=evaluators_results,
Expand All @@ -276,6 +278,20 @@ def evaluate(
)
)

# Add average cost and latency
average_latency = aggregation_service.aggregate_float_from_llm_app_response(
app_outputs, "latency"
)
average_cost = aggregation_service.aggregate_float_from_llm_app_response(
app_outputs, "cost"
)
loop.run_until_complete(
update_evaluation(
evaluation_id,
{"average_latency": average_latency, "average_cost": average_cost},
)
)

except Exception as e:
logger.error(f"An error occurred during evaluation: {e}")
traceback.print_exc()
4 changes: 2 additions & 2 deletions agenta-web/cypress/e2e/eval.comparison.cy.ts
@@ -32,8 +32,8 @@ describe("Evaluation Comparison Test", function () {
})

it("Should select 2 evaluations, click on the compare button, and successfully navigate to the comparison page", () => {
cy.get("#ag-33-input").check()
cy.get("#ag-39-input").check()
cy.get("div.ag-selection-checkbox input").eq(0).check()
cy.get("div.ag-selection-checkbox input").eq(1).check()
cy.get('[data-cy="evaluation-results-compare-button"]').should("not.be.disabled")
cy.get('[data-cy="evaluation-results-compare-button"]').click()
cy.location("pathname").should("include", "/evaluations/compare")
2 changes: 1 addition & 1 deletion agenta-web/cypress/e2e/eval.evaluations.cy.ts
@@ -33,7 +33,7 @@ describe("Evaluations CRUD Operations Test", function () {

it("Should select evaluation and successfully delete it", () => {
cy.get(".ag-root-wrapper").should("exist")
cy.get("#ag-33-input").check()
cy.get("div.ag-selection-checkbox input").eq(0).check()
cy.get(":nth-child(1) > .ant-btn > .ant-btn-icon > .anticon > svg").click()
cy.get(".ant-modal-confirm-btns > :nth-child(2) > span").click()
})
22 changes: 11 additions & 11 deletions agenta-web/package-lock.json

Some generated files are not rendered by default.

4 changes: 2 additions & 2 deletions agenta-web/package.json
@@ -39,8 +39,8 @@
"@types/react-highlight-words": "^0.16.4",
"@types/react-syntax-highlighter": "^15.5.7",
"@types/uuid": "^9.0.7",
"ag-grid-community": "^31.0.1",
"ag-grid-react": "^31.0.1",
"ag-grid-community": "^31.2.0",
"ag-grid-react": "^31.2.0",
"antd": "^5.4.7",
"autoprefixer": "10.4.14",
"axios": "^1.4.0",
@@ -25,6 +25,7 @@ import AgCustomHeader from "@/components/AgCustomHeader/AgCustomHeader"
import {useAtom} from "jotai"
import {evaluatorsAtom} from "@/lib/atoms/evaluation"
import CompareOutputDiff from "@/components/CompareOutputDiff/CompareOutputDiff"
import {formatCurrency, formatLatency} from "@/lib/helpers/formatters"

const useStyles = createUseStyles((theme: JSSTheme) => ({
table: {
@@ -218,6 +219,50 @@ const EvaluationCompareMode: React.FC<Props> = () => {
})
})

variants.forEach((variant, vi) => {
colDefs.push({
headerComponent: (props: any) => (
<AgCustomHeader {...props}>
<Space direction="vertical">
<span>Latency</span>
<Tag color={colors[vi]}>{variant.variantName}</Tag>
</Space>
</AgCustomHeader>
),
minWidth: 120,
flex: 1,
valueGetter: (params) => {
const latency = params.data?.variants.find(
(item) => item.evaluationId === variant.evaluationId,
)?.output?.latency
return latency === undefined ? "-" : formatLatency(latency)
},
...getFilterParams("text"),
})
})

variants.forEach((variant, vi) => {
colDefs.push({
headerComponent: (props: any) => (
<AgCustomHeader {...props}>
<Space direction="vertical">
<span>Cost</span>
<Tag color={colors[vi]}>{variant.variantName}</Tag>
</Space>
</AgCustomHeader>
),
minWidth: 120,
flex: 1,
valueGetter: (params) => {
const cost = params.data?.variants.find(
(item) => item.evaluationId === variant.evaluationId,
)?.output?.cost
return cost === undefined ? "-" : formatCurrency(cost)
},
...getFilterParams("text"),
})
})

return colDefs
}, [rows, showDiff, evalIds])

@@ -364,3 +409,6 @@ const EvaluationCompareMode: React.FC<Props> = () => {
}

export default EvaluationCompareMode
function formatCost(cost: any) {
throw new Error("Function not implemented.")
}
@@ -246,6 +246,22 @@ const EvaluationResults: React.FC<Props> = () => {
statusMapper(token)[params.data?.status.value as EvaluationStatus].label,
cellRenderer: StatusRenderer,
},
{
flex: 1,
field: "average_latency",
headerName: "Latency",
minWidth: 120,
...getFilterParams("number"),
valueGetter: (params) => getTypedValue(params?.data?.average_latency),
},
{
flex: 1,
field: "average_cost",
headerName: "Cost",
minWidth: 120,
...getFilterParams("number"),
valueGetter: (params) => getTypedValue(params?.data?.average_cost),
},
{
flex: 1,
field: "created_at",
@@ -23,6 +23,7 @@ import {useAtom} from "jotai"
import {evaluatorsAtom} from "@/lib/atoms/evaluation"
import CompareOutputDiff from "@/components/CompareOutputDiff/CompareOutputDiff"
import {useQueryParam} from "@/hooks/useQuery"
import {formatCurrency, formatLatency} from "@/lib/helpers/formatters"

const useStyles = createUseStyles((theme: JSSTheme) => ({
infoRow: {
@@ -139,6 +140,29 @@ const EvaluationScenarios: React.FC<Props> = () => {
},
})
})
colDefs.push({
flex: 1,
minWidth: 120,
headerName: "Cost",
...getFilterParams("text"),
valueGetter: (params) => {
return params.data?.outputs[0].cost == undefined
? "-"
: formatCurrency(params.data.outputs[0].cost)
},
})

colDefs.push({
flex: 1,
minWidth: 120,
headerName: "Latency",
...getFilterParams("text"),
valueGetter: (params) => {
return params.data?.outputs[0].latency == undefined
? "-"
: formatLatency(params.data.outputs[0].latency)
},
})
return colDefs
}, [evalaution, scenarios, showDiff])

8 changes: 6 additions & 2 deletions agenta-web/src/lib/Types.ts
@@ -363,6 +363,8 @@ type ValueTypeOptions =
| "regex"
| "object"
| "error"
| "cost"
| "latency"

//evaluation revamp types
export interface EvaluationSettingsTemplate {
@@ -442,6 +444,8 @@ export interface _Evaluation {
updated_at?: string
duration?: number
revisions: string[]
average_latency?: TypedValue & {error: null | EvaluationError}
average_cost?: TypedValue & {error: null | EvaluationError}
variant_revision_ids: string[]
}

@@ -451,7 +455,7 @@ export interface _EvaluationScenario {
evaluation: _Evaluation
evaluators_configs: EvaluatorConfig[]
inputs: (TypedValue & {name: string})[]
outputs: {result: TypedValue}[]
outputs: {result: TypedValue; cost?: number; latency?: number}[]
correct_answer?: string
is_pinned?: boolean
note?: string
@@ -487,7 +491,7 @@ export type ComparisonResultRow = {
variants: {
variantId: string
variantName: string
output: {result: TypedValue}
output: {result: TypedValue; cost?: number; latency?: number}
evaluationId: string
evaluatorConfigs: {
evaluatorConfig: EvaluatorConfig