Skip to content

Commit

Permalink
Merge pull request #1479 from Agenta-AI/main
Browse files Browse the repository at this point in the history
merge main into template url
  • Loading branch information
aakrem authored Apr 1, 2024
2 parents 4726408 + 9f2ef27 commit dee21f1
Show file tree
Hide file tree
Showing 27 changed files with 311 additions and 142 deletions.
4 changes: 4 additions & 0 deletions agenta-backend/agenta_backend/models/api/evaluation_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,8 @@ class Evaluation(BaseModel):
testset_name: Optional[str]
status: Result
aggregated_results: List[AggregatedResult]
average_cost: Optional[Result]
average_latency: Optional[Result]
created_at: datetime
updated_at: datetime

Expand Down Expand Up @@ -100,6 +102,8 @@ class EvaluationScenarioInput(BaseModel):

class EvaluationScenarioOutput(BaseModel):
    # API-level model for one output of an evaluation scenario.
    # The produced result; this is the only field without an Optional type.
    result: Result
    # Optional cost / latency figures attached to this output.
    # NOTE(review): units are not visible from this model — confirm against
    # whatever populates these fields.
    cost: Optional[float]
    latency: Optional[float]


class HumanEvaluationScenarioInput(BaseModel):
Expand Down
2 changes: 2 additions & 0 deletions agenta-backend/agenta_backend/models/converters.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,8 @@ async def evaluation_db_to_pydantic(
aggregated_results=aggregated_results,
created_at=evaluation_db.created_at,
updated_at=evaluation_db.updated_at,
average_cost=evaluation_db.average_cost,
average_latency=evaluation_db.average_latency,
)


Expand Down
8 changes: 8 additions & 0 deletions agenta-backend/agenta_backend/models/db_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -193,6 +193,8 @@ class Result(BaseModel):

class InvokationResult(BaseModel):
    # Value returned for a single app invocation: the result itself plus
    # optional metrics that accompany it.
    result: Result
    # Both metrics default to None, so they may be omitted when the invoked
    # app's response does not carry them.
    # NOTE(review): units are not stated anywhere in this model — confirm.
    cost: Optional[float] = None
    latency: Optional[float] = None


class EvaluationScenarioResult(BaseModel):
Expand All @@ -213,6 +215,8 @@ class EvaluationScenarioInputDB(BaseModel):

class EvaluationScenarioOutputDB(BaseModel):
    # Stored form of one evaluation-scenario output.
    result: Result
    # Optional metrics kept alongside the result; both default to None so
    # an output can be built without them.
    # NOTE(review): units are not stated in this model — confirm.
    cost: Optional[float] = None
    latency: Optional[float] = None


class HumanEvaluationScenarioInput(BaseModel):
Expand Down Expand Up @@ -266,6 +270,8 @@ class EvaluationDB(Document):
variant_revision: PydanticObjectId
evaluators_configs: List[PydanticObjectId]
aggregated_results: List[AggregatedResult]
average_cost: Optional[Result] = None
average_latency: Optional[Result] = None
created_at: datetime = Field(default=datetime.now())
updated_at: datetime = Field(default=datetime.now())

Expand All @@ -284,6 +290,8 @@ class EvaluationScenarioDB(Document):
note: Optional[str]
evaluators_configs: List[PydanticObjectId]
results: List[EvaluationScenarioResult]
latency: Optional[int] = None
cost: Optional[int] = None
created_at: datetime = Field(default=datetime.now())
updated_at: datetime = Field(default=datetime.now())

Expand Down
30 changes: 28 additions & 2 deletions agenta-backend/agenta_backend/services/aggregation_service.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
import re
import traceback
from typing import List
from typing import List, Optional

from agenta_backend.models.db_models import Result, Error
from agenta_backend.models.db_models import InvokationResult, Result, Error


def aggregate_ai_critique(results: List[Result]) -> Result:
Expand Down Expand Up @@ -73,3 +73,29 @@ def aggregate_float(results: List[Result]) -> Result:
value=None,
error=Error(message=str(exc), stacktrace=str(traceback.format_exc())),
)


def aggregate_float_from_llm_app_response(
    invocation_results: List[InvokationResult], key: Optional[str]
) -> Result:
    """Average one numeric attribute across a list of invocation results.

    Args:
        invocation_results: Objects to read the attribute from. Entries that
            do not have the attribute, or hold ``None`` for it, are ignored.
        key: Name of the attribute to average (e.g. "latency" or "cost").

    Returns:
        A ``Result`` whose ``type`` is ``key`` and whose ``value`` is the
        arithmetic mean of the collected values. When ``key`` is empty, when
        no usable value is found, or when anything else raises, a ``Result``
        of type "error" carrying the message and stack trace is returned
        instead; this function does not propagate exceptions.
    """
    try:
        # Raised inside the ``try`` on purpose: the handler below turns the
        # failure into an error ``Result`` with a stack trace attached.
        if not key:
            raise ValueError("Key is required to aggregate InvokationResult objects.")

        collected = []
        for item in invocation_results:
            # A missing attribute and an explicit None are both skipped.
            candidate = getattr(item, key, None)
            if candidate is not None:
                collected.append(candidate)

        if len(collected) == 0:
            raise ValueError(f"No valid values found for {key} aggregation.")

        return Result(type=key, value=sum(collected) / len(collected))
    except Exception as failure:
        failure_details = Error(
            message=str(failure), stacktrace=str(traceback.format_exc())
        )
        return Result(type="error", value=None, error=failure_details)
10 changes: 8 additions & 2 deletions agenta-backend/agenta_backend/services/llm_apps_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,9 +86,15 @@ async def invoke_app(
url, json=payload, timeout=httpx.Timeout(timeout=5, read=None, write=5)
)
response.raise_for_status()
app_output = response.json()
app_response = response.json()
return InvokationResult(
result=Result(type="text", value=app_output["message"], error=None)
result=Result(
type="text",
value=app_response["message"],
error=None,
),
latency=app_response.get("latency"),
cost=app_response.get("cost"),
)

except httpx.HTTPStatusError as e:
Expand Down
18 changes: 17 additions & 1 deletion agenta-backend/agenta_backend/tasks/evaluations.py
Original file line number Diff line number Diff line change
Expand Up @@ -267,7 +267,9 @@ def evaluate(
correct_answer=correct_answer,
outputs=[
EvaluationScenarioOutputDB(
result=Result(type="text", value=app_output.result.value)
result=Result(type="text", value=app_output.result.value),
latency=app_output.latency,
cost=app_output.cost,
)
],
results=evaluators_results,
Expand All @@ -276,6 +278,20 @@ def evaluate(
)
)

# Add average cost and latency
average_latency = aggregation_service.aggregate_float_from_llm_app_response(
app_outputs, "latency"
)
average_cost = aggregation_service.aggregate_float_from_llm_app_response(
app_outputs, "cost"
)
loop.run_until_complete(
update_evaluation(
evaluation_id,
{"average_latency": average_latency, "average_cost": average_cost},
)
)

except Exception as e:
logger.error(f"An error occurred during evaluation: {e}")
traceback.print_exc()
Expand Down
2 changes: 1 addition & 1 deletion agenta-backend/pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "agenta_backend"
version = "0.12.4"
version = "0.12.6"
description = ""
authors = ["Mahmoud Mabrouk <[email protected]>"]
readme = "README.md"
Expand Down
19 changes: 11 additions & 8 deletions agenta-cli/agenta/cli/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -153,18 +153,21 @@ def init(app_name: str, backend_host: str):
try:
key_prefix = api_key.split(".")[0]
client.validate_api_key(key_prefix=key_prefix)

# Make request to fetch user organizations after api key validation
except Exception as ex:
click.echo(
click.style(
f"Error: Unable to validate API key.\nError: {ex}", fg="red"
)
)
sys.exit(1)
# Make request to fetch user organizations after api key validation
try:
organizations = client.list_organizations()
if len(organizations) >= 1:
user_organizations = organizations
except Exception as ex:
if ex.status_code == 401:
click.echo(click.style("Error: Invalid API key", fg="red"))
sys.exit(1)
else:
click.echo(click.style(f"Error: {ex}", fg="red"))
sys.exit(1)
click.echo(click.style(f"Error: {ex}", fg="red"))
sys.exit(1)

filtered_org = None
if where_question == "On agenta cloud":
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ class AppVariantOutput(pydantic.BaseModel):
variant_name: str
parameters: typing.Optional[typing.Dict[str, typing.Any]]
previous_variant_name: typing.Optional[str]
organization_id: str
organization_id: typing.Optional[str]
user_id: str
base_name: str
base_id: str
Expand Down
2 changes: 1 addition & 1 deletion agenta-cli/pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "agenta"
version = "0.12.4"
version = "0.12.6"
description = "The SDK for agenta is an open-source LLMOps platform."
readme = "README.md"
authors = ["Mahmoud Mabrouk <[email protected]>"]
Expand Down
4 changes: 2 additions & 2 deletions agenta-web/cypress/e2e/eval.comparison.cy.ts
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,8 @@ describe("Evaluation Comparison Test", function () {
})

it("Should select 2 evaluations, click on the compare button, and successfully navigate to the comparison page", () => {
cy.get("#ag-33-input").check()
cy.get("#ag-39-input").check()
cy.get("div.ag-selection-checkbox input").eq(0).check()
cy.get("div.ag-selection-checkbox input").eq(1).check()
cy.get('[data-cy="evaluation-results-compare-button"]').should("not.be.disabled")
cy.get('[data-cy="evaluation-results-compare-button"]').click()
cy.location("pathname").should("include", "/evaluations/compare")
Expand Down
2 changes: 1 addition & 1 deletion agenta-web/cypress/e2e/eval.evaluations.cy.ts
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ describe("Evaluations CRUD Operations Test", function () {

it("Should select evaluation and successfully delete it", () => {
cy.get(".ag-root-wrapper").should("exist")
cy.get("#ag-33-input").check()
cy.get("div.ag-selection-checkbox input").eq(0).check()
cy.get(":nth-child(1) > .ant-btn > .ant-btn-icon > .anticon > svg").click()
cy.get(".ant-modal-confirm-btns > :nth-child(2) > span").click()
})
Expand Down
26 changes: 13 additions & 13 deletions agenta-web/package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 3 additions & 3 deletions agenta-web/package.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "agenta",
"version": "0.12.4",
"version": "0.12.6",
"private": true,
"engines": {
"node": ">=18"
Expand Down Expand Up @@ -39,8 +39,8 @@
"@types/react-highlight-words": "^0.16.4",
"@types/react-syntax-highlighter": "^15.5.7",
"@types/uuid": "^9.0.7",
"ag-grid-community": "^31.0.1",
"ag-grid-react": "^31.0.1",
"ag-grid-community": "^31.2.0",
"ag-grid-react": "^31.2.0",
"antd": "^5.4.7",
"autoprefixer": "10.4.14",
"axios": "^1.4.0",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -190,7 +190,7 @@ const AddToTestSetDrawer: React.FC<Props> = ({params, isChatVariant, ...props})
)

// if cols mismatch (playground cols not a superset of testset cols)
if (missingColsTestset.length) {
if (missingColsTestset.length && missingColsPlayground.length) {
AlertPopup({
type: "error",
title: "Columns mismatch",
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import {Environment, IPromptRevisions, Parameter, Variant} from "@/lib/Types"
import type {CollapseProps} from "antd"
import {Button, Col, Collapse, Row, Space, Tooltip, message} from "antd"
import React, {useContext, useEffect, useState} from "react"
import React, {useEffect, useState} from "react"
import {createUseStyles} from "react-jss"
import {ModelParameters, ObjectParameters, StringParameters} from "./ParametersCards"
import PublishVariantModal from "./PublishVariantModal"
Expand Down
10 changes: 1 addition & 9 deletions agenta-web/src/components/Playground/Views/TestView.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,7 @@ import React, {useContext, useEffect, useRef, useState} from "react"
import {Button, Input, Card, Row, Col, Space, Form, Modal} from "antd"
import {CaretRightOutlined, CloseCircleOutlined, PlusOutlined} from "@ant-design/icons"
import {callVariant, promptRevision} from "@/lib/services/api"
import {
ChatMessage,
ChatRole,
GenericObject,
IPromptVersioning,
Parameter,
Variant,
} from "@/lib/Types"
import {ChatMessage, ChatRole, GenericObject, Parameter, Variant} from "@/lib/Types"
import {batchExecute, randString, removeKeys} from "@/lib/helpers/utils"
import LoadTestsModal from "../LoadTestsModal"
import AddToTestSetDrawer from "../AddToTestSetDrawer/AddToTestSetDrawer"
Expand All @@ -29,7 +22,6 @@ import dayjs from "dayjs"
import relativeTime from "dayjs/plugin/relativeTime"
import duration from "dayjs/plugin/duration"
import {useQueryParam} from "@/hooks/useQuery"
import {dynamicComponent} from "@/lib/helpers/dynamic"

dayjs.extend(relativeTime)
dayjs.extend(duration)
Expand Down
Loading

0 comments on commit dee21f1

Please sign in to comment.