From c823104ecfdba82f7dffebbb26e6976e256b7e4c Mon Sep 17 00:00:00 2001 From: Abram Date: Wed, 3 Jan 2024 08:55:48 +0100 Subject: [PATCH 01/12] Feat - implemented batch invoke rpm rate-limiting logic --- .../models/api/evaluation_model.py | 5 ++ .../services/llm_apps_service.py | 74 ++++++++++++++++--- 2 files changed, 68 insertions(+), 11 deletions(-) diff --git a/agenta-backend/agenta_backend/models/api/evaluation_model.py b/agenta-backend/agenta_backend/models/api/evaluation_model.py index b8716c6d33..73d01183e4 100644 --- a/agenta-backend/agenta_backend/models/api/evaluation_model.py +++ b/agenta-backend/agenta_backend/models/api/evaluation_model.py @@ -66,6 +66,11 @@ class NewHumanEvaluation(BaseModel): status: str +class AppOutput(BaseModel): + output: Any + status: str + + class Evaluation(BaseModel): id: str app_id: str diff --git a/agenta-backend/agenta_backend/services/llm_apps_service.py b/agenta-backend/agenta_backend/services/llm_apps_service.py index d556f526f7..ddcde3f940 100644 --- a/agenta-backend/agenta_backend/services/llm_apps_service.py +++ b/agenta-backend/agenta_backend/services/llm_apps_service.py @@ -1,15 +1,18 @@ -from typing import Any +import asyncio +import logging +from typing import Any, List + +from agenta_backend.models.api.evaluation_model import AppOutput import httpx -import backoff -@backoff.on_exception( - backoff.expo, - (httpx.TimeoutException, httpx.ConnectTimeout, httpx.ConnectError), - max_tries=2, -) -def get_llm_app_output(uri: str, input: Any) -> Any: +# Set logger +logger = logging.getLogger(__name__) +logger.setLevel(logging.DEBUG) + + +async def get_llm_app_output(uri: str, input: Any) -> AppOutput: url = f"{uri}/generate" # TODO: adjust these hardcoded values in this payload @@ -23,9 +26,58 @@ def get_llm_app_output(uri: str, input: Any) -> Any: "inputs": {"country": input}, } - with httpx.Client() as client: - response = client.post( + async with httpx.AsyncClient() as client: + response = await client.post( url, json=payload, timeout=httpx.Timeout(timeout=5, read=None, write=5) ) response.raise_for_status() - return response.json() + return AppOutput(output=response.json(), status="success") + + +async def run_with_retry( + uri: str, input_data: Any, max_retry_count: int, retry_delay: int +) -> AppOutput: + retries = 0 + while retries < max_retry_count: + try: + result = await get_llm_app_output(uri, input_data) + return result + except (httpx.TimeoutException, httpx.ConnectTimeout, httpx.ConnectError) as e: + print(f"Error in evaluation. Retrying in {retry_delay} seconds:", e) + await asyncio.sleep(retry_delay) + retries += 1 + + # If max retries reached, raise the last exception + raise e + + +async def batch_invoke(uri: str, testset_data: List[dict]) -> List[AppOutput]: + batch_size = 10 # Number of evaluations to make in each batch + max_retries = 3 # Maximum number of times to retry a failed evaluation + retry_delay = 3 # Delay before retrying a failed evaluation (in seconds) + delay_between_batches = 5 # Delay between batches (in seconds) + list_of_app_outputs: List[AppOutput] = [] # Outputs after running all batches + + async def run_batch(start_idx: int): + print(f"Preparing {start_idx} batch...") + end_idx = min(start_idx + batch_size, len(testset_data)) + for index in range(start_idx, end_idx): + print(f"Running datapoint(s) in {start_idx} batch...") + try: + batch_output: AppOutput = await run_with_retry(uri, testset_data[index], max_retries, retry_delay) + list_of_app_outputs.append(batch_output) + print(f"Adding outputs to batch {start_idx}") + except Exception as exc: + logger.info( + f"Error processing batch[{start_idx}]:[{end_idx}] ==> {str(exc)}" + ) + + # Schedule the next batch with a delay + next_batch_start_idx = end_idx + if next_batch_start_idx < len(testset_data): + await asyncio.sleep(delay_between_batches) + await run_batch(next_batch_start_idx) + + # Start the first batch + await run_batch(0) + return list_of_app_outputs From 9d4678aa47e8aed75e0ecb02eca1c3e7a1512ce3 Mon Sep 17 00:00:00 2001 From: Abram Date: Wed, 3 Jan 2024 12:58:19 +0100 Subject: [PATCH 02/12] Feat - created llm run rate limit api model --- .../agenta_backend/models/api/evaluation_model.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/agenta-backend/agenta_backend/models/api/evaluation_model.py b/agenta-backend/agenta_backend/models/api/evaluation_model.py index 73d01183e4..6ac326a290 100644 --- a/agenta-backend/agenta_backend/models/api/evaluation_model.py +++ b/agenta-backend/agenta_backend/models/api/evaluation_model.py @@ -235,11 +235,19 @@ class EvaluationSettingsTemplate(BaseModel): description: str +class LLMRunRateLimit(BaseModel): + batch_size: int = Field(default=10) + max_retries: int = Field(default=3) + retry_delay: int = Field(default=3) + delay_between_batches: int = Field(default=5) + + class NewEvaluation(BaseModel): app_id: str variant_ids: List[str] evaluators_configs: List[str] testset_id: str + rate_limit: LLMRunRateLimit class NewEvaluatorConfig(BaseModel): From 9eb28bcec858adbf8ced664056afdac867aa0163 Mon Sep 17 00:00:00 2001 From: Abram Date: Wed, 3 Jan 2024 15:14:32 +0100 Subject: [PATCH 03/12] Feat - created llm run rate limit type --- agenta-web/src/lib/Types.ts | 7 +++++++ agenta-web/src/services/evaluations/index.ts | 2 ++ 2 files changed, 9 insertions(+) diff --git a/agenta-web/src/lib/Types.ts b/agenta-web/src/lib/Types.ts index 2f33533ee0..bf40d6f7cc 100644 --- a/agenta-web/src/lib/Types.ts +++ b/agenta-web/src/lib/Types.ts @@ -50,6 +50,13 @@ export interface PlaygroundTabsItem { closable: boolean } +export interface LLMRunRateLimit { + batch_size: number + max_retries: number + retry_delay: number + delay_between_batches: number +} + export interface Evaluation { id: string createdAt: string diff --git a/agenta-web/src/services/evaluations/index.ts b/agenta-web/src/services/evaluations/index.ts index a2cc6fc55e..b2ee464db0 100644 --- a/agenta-web/src/services/evaluations/index.ts +++ b/agenta-web/src/services/evaluations/index.ts @@ -5,6 +5,7 @@ import { EvaluationStatus, Evaluator, EvaluatorConfig, + LLMRunRateLimit, TypedValue, _Evaluation, _EvaluationScenario, @@ -117,6 +118,7 @@ export type CreateEvaluationData = { testset_id: string variant_ids: string[] evaluators_configs: string[] + rate_limit: LLMRunRateLimit } export const createEvalutaiton = async (appId: string, evaluation: CreateEvaluationData) => { return axios.post(`/api/evaluations/`, {...evaluation, app_id: appId}) From a127ec65c6a729f51b441321eb6db5e302d7a225 Mon Sep 17 00:00:00 2001 From: Abram Date: Wed, 3 Jan 2024 15:16:05 +0100 Subject: [PATCH 04/12] Update - remove default values in llm run rate limit api model --- .../agenta_backend/models/api/evaluation_model.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/agenta-backend/agenta_backend/models/api/evaluation_model.py b/agenta-backend/agenta_backend/models/api/evaluation_model.py index 6ac326a290..04340d02a2 100644 --- a/agenta-backend/agenta_backend/models/api/evaluation_model.py +++ b/agenta-backend/agenta_backend/models/api/evaluation_model.py @@ -236,10 +236,10 @@ class EvaluationSettingsTemplate(BaseModel): class LLMRunRateLimit(BaseModel): - batch_size: int = Field(default=10) - max_retries: int = Field(default=3) - retry_delay: int = Field(default=3) - delay_between_batches: int = Field(default=5) + batch_size: int + max_retries: int + retry_delay: int + delay_between_batches: int class NewEvaluation(BaseModel): From 6f35a30364321189bef74c48cf9ace96efffb06c Mon Sep 17 00:00:00 2001 From: Abram Date: Wed, 3 Jan 2024 15:18:43 +0100 Subject: [PATCH 05/12] Update - modified batch_invoke function --- .../services/llm_apps_service.py | 35 +++++++++++++------ 1 file changed, 25 insertions(+), 10 deletions(-) diff --git a/agenta-backend/agenta_backend/services/llm_apps_service.py b/agenta-backend/agenta_backend/services/llm_apps_service.py index ddcde3f940..8aebc83e46 100644 --- a/agenta-backend/agenta_backend/services/llm_apps_service.py +++ b/agenta-backend/agenta_backend/services/llm_apps_service.py @@ -37,34 +37,49 @@ async def get_llm_app_output(uri: str, input: Any) -> AppOutput: async def run_with_retry( uri: str, input_data: Any, max_retry_count: int, retry_delay: int ) -> AppOutput: + retries = 0 + last_exception = None while retries < max_retry_count: try: result = await get_llm_app_output(uri, input_data) return result except (httpx.TimeoutException, httpx.ConnectTimeout, httpx.ConnectError) as e: + last_exception = e print(f"Error in evaluation. Retrying in {retry_delay} seconds:", e) await asyncio.sleep(retry_delay) retries += 1 - # If max retries reached, raise the last exception - raise e + # If max retries reached, return the last exception + return AppOutput(output=None, status=str(last_exception)) + +async def batch_invoke( + uri: str, testset_data: List[dict], rate_limit_config: dict +) -> List[AppOutput]: + batch_size = rate_limit_config[ + "batch_size" + ] # Number of testset to make in each batch + max_retries = rate_limit_config[ + "max_retries" + ] # Maximum number of times to retry the failed llm call + retry_delay = rate_limit_config[ + "retry_delay" + ] # Delay before retrying the failed llm call (in seconds) + delay_between_batches = rate_limit_config[ + "delay_between_batches" + ] # Delay between batches (in seconds) -async def batch_invoke(uri: str, testset_data: List[dict]) -> List[AppOutput]: - batch_size = 10 # Number of evaluations to make in each batch - max_retries = 3 # Maximum number of times to retry a failed evaluation - retry_delay = 3 # Delay before retrying a failed evaluation (in seconds) - delay_between_batches = 5 # Delay between batches (in seconds) - list_of_app_outputs: List[AppOutput] = [] # Outputs after running all batches + list_of_app_outputs: List[AppOutput] = [] # Outputs after running all batches async def run_batch(start_idx: int): print(f"Preparing {start_idx} batch...") end_idx = min(start_idx + batch_size, len(testset_data)) for index in range(start_idx, end_idx): - print(f"Running datapoint(s) in {start_idx} batch...") try: - batch_output: AppOutput = await run_with_retry(uri, testset_data[index], max_retries, retry_delay) + batch_output: AppOutput = await run_with_retry( + uri, testset_data[index], max_retries, retry_delay + ) list_of_app_outputs.append(batch_output) print(f"Adding outputs to batch {start_idx}") except Exception as exc: From a28be4852d8283751e986d4ad3f311f51a859ad0 Mon Sep 17 00:00:00 2001 From: Abram Date: Wed, 3 Jan 2024 15:19:11 +0100 Subject: [PATCH 06/12] Update - added extra inputs for evaluation run rate limit --- .../evaluationResults/NewEvaluationModal.tsx | 117 +++++++++++++++++- 1 file changed, 113 insertions(+), 4 deletions(-) diff --git a/agenta-web/src/components/pages/evaluations/evaluationResults/NewEvaluationModal.tsx b/agenta-web/src/components/pages/evaluations/evaluationResults/NewEvaluationModal.tsx index 1b34e34c5a..5ea6dcd61c 100644 --- a/agenta-web/src/components/pages/evaluations/evaluationResults/NewEvaluationModal.tsx +++ b/agenta-web/src/components/pages/evaluations/evaluationResults/NewEvaluationModal.tsx @@ -1,10 +1,10 @@ import {useAppId} from "@/hooks/useAppId" -import {JSSTheme, Variant, testset} from "@/lib/Types" +import {JSSTheme, Variant, LLMRunRateLimit, testset} from "@/lib/Types" import {evaluatorConfigsAtom, evaluatorsAtom} from "@/lib/atoms/evaluation" import {fetchTestsets, fetchVariants} from "@/lib/services/api" import {CreateEvaluationData, createEvalutaiton} from "@/services/evaluations" -import {PlusOutlined} from "@ant-design/icons" -import {Divider, Form, Modal, Select, Spin, Tag, Typography} from "antd" +import {PlusOutlined, QuestionCircleOutlined} from "@ant-design/icons" +import {Divider, Form, Modal, Select, Spin, Tag, Typography, InputNumber, Row, Col, Tooltip} from "antd" import dayjs from "dayjs" import {useAtom} from "jotai" import Image from "next/image" @@ -75,9 +75,20 @@ const NewEvaluationModal: React.FC = ({onSuccess, ...props}) => { .finally(() => setFetching(false)) }, [props.open, appId]) + const [rateLimitValues, setRateLimitValues] = useState({ + batch_size: 10, + max_retries: 3, + retry_delay: 3, + delay_between_batches: 5, + }) + const onRateLimitInputChange = (field: keyof LLMRunRateLimit, value: number) => { + setRateLimitValues((prevValues: any) => ({ ...prevValues, [field]: value })); + } + const onSubmit = (values: CreateEvaluationData) => { setSubmitLoading(true) - createEvalutaiton(appId, values) + const EvaluationRateLimit: LLMRunRateLimit = rateLimitValues + createEvalutaiton(appId, {...values, rate_limit: EvaluationRateLimit}) .then(onSuccess) .catch(console.error) .finally(() => setSubmitLoading(false)) @@ -176,6 +187,104 @@ const NewEvaluationModal: React.FC = ({onSuccess, ...props}) => { })} + + + + + + + Batch Size  + + + + + } + name="batch_size" + style={{ marginBottom: '0' }} + rules={[{required: true, message: "This field is required"}]} + > + + onRateLimitInputChange('batch_size', value) + } + style={{ width: '100%' }} + /> + + + + + Max Retries  + + + + + } + name="max_retries" + rules={[{required: true, message: "This field is required"}]} + > + + onRateLimitInputChange('max_retries', value) + } + style={{ width: '100%' }} + /> + + + + + Retry Delay  + + + + + } + name="retry_delay" + rules={[{required: true, message: "This field is required"}]} + > + + onRateLimitInputChange('retry_delay', value) + } + style={{ width: '100%' }} + /> + + + + + Delay Between Batches  + + + + + } + name="delay_between_batches" + rules={[{required: true, message: "This field is required"}]} + > + + onRateLimitInputChange('delay_between_batches', value) + } + style={{ width: '100%' }} + /> + + + + From 73a4653de6844f57fb3621b420704b6b41fefc2c Mon Sep 17 00:00:00 2001 From: Abram Date: Wed, 3 Jan 2024 15:19:58 +0100 Subject: [PATCH 07/12] Update - pass in rate_limit api model dict --- .../routers/evaluation_router.py | 6 +- .../agenta_backend/tasks/evaluations.py | 72 ++++++++++--------- 2 files changed, 44 insertions(+), 34 deletions(-) diff --git a/agenta-backend/agenta_backend/routers/evaluation_router.py b/agenta-backend/agenta_backend/routers/evaluation_router.py index ce38e767f9..b5e0b143d6 100644 --- a/agenta-backend/agenta_backend/routers/evaluation_router.py +++ b/agenta-backend/agenta_backend/routers/evaluation_router.py @@ -96,6 +96,7 @@ async def create_evaluation( "variant_ids": [variant_id], # Only this variant ID "evaluators_configs": payload.evaluators_configs, "testset_id": payload.testset_id, + "rate_limit": payload.rate_limit.dict() } evaluation = await evaluation_service.create_new_evaluation( @@ -105,7 +106,10 @@ async def create_evaluation( ) evaluate.delay( - app_data, new_evaluation_data, evaluation.id, evaluation.testset_id + app_data, + new_evaluation_data, + evaluation.id, + evaluation.testset_id, ) evaluations.append(evaluation) diff --git a/agenta-backend/agenta_backend/tasks/evaluations.py b/agenta-backend/agenta_backend/tasks/evaluations.py index 81a86ca5d0..c6237b85d6 100644 --- a/agenta-backend/agenta_backend/tasks/evaluations.py +++ b/agenta-backend/agenta_backend/tasks/evaluations.py @@ -24,12 +24,15 @@ Result, ) from agenta_backend.services import evaluators_service -from agenta_backend.models.api.evaluation_model import NewEvaluation +from agenta_backend.models.api.evaluation_model import NewEvaluation, AppOutput @shared_task(queue="agenta_backend.tasks.evaluations.evaluate") def evaluate( - app_data: dict, new_evaluation_data: dict, evaluation_id: str, testset_id: str + app_data: dict, + new_evaluation_data: dict, + evaluation_id: str, + testset_id: str, ): loop = asyncio.get_event_loop() app = AppDB(**app_data) @@ -46,11 +49,25 @@ def evaluate( get_deployment_by_objectid(app_variant_db.base.deployment) ) - # TODO: remove if abraham's fix is working - uri = deployment.uri.replace("http://localhost", "http://host.docker.internal") + #!NOTE: do not remove! this will be used in github workflow! + backend_environment = os.environ.get("ENVIRONMENT") + if backend_environment is not None and backend_environment == "github": + uri = f"http://{deployment.container_name}" + else: + uri = deployment.uri.replace("http://localhost", "http://host.docker.internal") + + # 2. We get the output from the llm app + app_outputs: List[AppOutput] = loop.run_until_complete( + llm_apps_service.batch_invoke( + uri, testset.csvdata, evaluation.rate_limit.dict() + ) + ) + for data_point, app_output in zip(testset.csvdata, app_outputs): + if len(testset.csvdata) != len(app_outputs): + # TODO: properly handle error in the case where the length are not the same + break - for data_point in testset.csvdata: - # 1. We prepare the inputs + # 2. We prepare the inputs raw_inputs = ( app_variant_db.parameters.get("inputs", []) if app_variant_db.parameters @@ -67,17 +84,6 @@ def evaluate( for input_item in raw_inputs ] - #!NOTE: do not remove! this will be used in github workflow! - backend_environment = os.environ.get("ENVIRONMENT") - if backend_environment is not None and backend_environment == "github": - uri = f"http://{deployment.container_name}" - else: - uri = deployment.uri.replace( - "http://localhost", "http://host.docker.internal" - ) - # 2. We get the output from the llm app - variant_output = llm_apps_service.get_llm_app_output(uri, data_point) - # 3. We evaluate evaluators_results: [EvaluationScenarioResult] = [] for evaluator_config_id in evaluation.evaluators_configs: @@ -95,7 +101,7 @@ def evaluate( ) result = evaluators_service.evaluate( evaluator_config.evaluator_key, - variant_output, + app_output.output, data_point["correct_answer"], evaluator_config.settings_values, **additional_kwargs, @@ -108,22 +114,22 @@ def evaluate( evaluators_results.append(result_object) evaluators_aggregated_data[evaluator_config.evaluator_key].append(result) - # 4. We create a new evaluation scenario - evaluation_scenario = loop.run_until_complete( - create_new_evaluation_scenario( - user=app.user, - organization=app.organization, - evaluation=new_evaluation_db, - variant_id=variant_id, - evaluators_configs=new_evaluation_db.evaluators_configs, - inputs=inputs, - is_pinned=False, - note="", - correct_answer=data_point["correct_answer"], - outputs=[EvaluationScenarioOutputDB(type="text", value=variant_output)], - results=evaluators_results, - ) + # 4. We create a new evaluation scenario + evaluation_scenario = loop.run_until_complete( + create_new_evaluation_scenario( + user=app.user, + organization=app.organization, + evaluation=new_evaluation_db, + variant_id=variant_id, + evaluators_configs=new_evaluation_db.evaluators_configs, + inputs=inputs, + is_pinned=False, + note="", + correct_answer=data_point["correct_answer"], + outputs=[EvaluationScenarioOutputDB(type="text", value=app_output.output)], + results=evaluators_results, ) + ) aggregated_results = loop.run_until_complete( aggregate_evaluator_results(app, evaluators_aggregated_data) From 38b593a75fdb7a30fd5beafaff3743f4b787be25 Mon Sep 17 00:00:00 2001 From: Abram Date: Wed, 3 Jan 2024 15:21:16 +0100 Subject: [PATCH 08/12] :art: Format - ran black and prettier --- .../models/api/evaluation_model.py | 4 +- .../routers/evaluation_router.py | 2 +- .../services/llm_apps_service.py | 1 - .../modals/CreateAppStatusModal.tsx | 4 +- .../Evaluations/HumanEvaluationResult.tsx | 8 +--- .../evaluationResults/EvaluationResults.tsx | 8 ++-- .../evaluationResults/NewEvaluationModal.tsx | 39 ++++++++++++------- agenta-web/src/lib/services/api.ts | 10 ++--- 8 files changed, 38 insertions(+), 38 deletions(-) diff --git a/agenta-backend/agenta_backend/models/api/evaluation_model.py b/agenta-backend/agenta_backend/models/api/evaluation_model.py index 04340d02a2..1a6fab63b3 100644 --- a/agenta-backend/agenta_backend/models/api/evaluation_model.py +++ b/agenta-backend/agenta_backend/models/api/evaluation_model.py @@ -67,8 +67,8 @@ class NewHumanEvaluation(BaseModel): class AppOutput(BaseModel): - output: Any - status: str + output: Any + status: str class Evaluation(BaseModel): diff --git a/agenta-backend/agenta_backend/routers/evaluation_router.py b/agenta-backend/agenta_backend/routers/evaluation_router.py index b5e0b143d6..5cc4367c9c 100644 --- a/agenta-backend/agenta_backend/routers/evaluation_router.py +++ b/agenta-backend/agenta_backend/routers/evaluation_router.py @@ -96,7 +96,7 @@ async def create_evaluation( "variant_ids": [variant_id], # Only this variant ID "evaluators_configs": payload.evaluators_configs, "testset_id": payload.testset_id, - "rate_limit": payload.rate_limit.dict() + "rate_limit": payload.rate_limit.dict(), } evaluation = await evaluation_service.create_new_evaluation( diff --git a/agenta-backend/agenta_backend/services/llm_apps_service.py b/agenta-backend/agenta_backend/services/llm_apps_service.py index 8aebc83e46..a749cbc418 100644 --- a/agenta-backend/agenta_backend/services/llm_apps_service.py +++ b/agenta-backend/agenta_backend/services/llm_apps_service.py @@ -37,7 +37,6 @@ async def get_llm_app_output(uri: str, input: Any) -> AppOutput: async def run_with_retry( uri: str, input_data: Any, max_retry_count: int, retry_delay: int ) -> AppOutput: - retries = 0 last_exception = None while retries < max_retry_count: diff --git a/agenta-web/src/components/AppSelector/modals/CreateAppStatusModal.tsx b/agenta-web/src/components/AppSelector/modals/CreateAppStatusModal.tsx index 3a7d655407..cb165d4b04 100644 --- a/agenta-web/src/components/AppSelector/modals/CreateAppStatusModal.tsx +++ b/agenta-web/src/components/AppSelector/modals/CreateAppStatusModal.tsx @@ -196,8 +196,8 @@ const CreateAppStatusModal: React.FC> type === "success" ? "success" : type === "error" - ? "danger" - : "secondary" + ? "danger" + : "secondary" } strong={Object.keys(messages)[ix] === "success"} > diff --git a/agenta-web/src/components/Evaluations/HumanEvaluationResult.tsx b/agenta-web/src/components/Evaluations/HumanEvaluationResult.tsx index 80e8905946..192280ace3 100644 --- a/agenta-web/src/components/Evaluations/HumanEvaluationResult.tsx +++ b/agenta-web/src/components/Evaluations/HumanEvaluationResult.tsx @@ -98,15 +98,11 @@ export default function HumanEvaluationResult() { } const fetchEvaluations = async () => { try { - fetchData( - `${getAgentaApiUrl()}/api/human-evaluations/?app_id=${app_id}`, - ) + fetchData(`${getAgentaApiUrl()}/api/human-evaluations/?app_id=${app_id}`) .then((response) => { const fetchPromises = response.map((item: EvaluationResponseType) => { return fetchData( - `${getAgentaApiUrl()}/api/human-evaluations/${ - item.id - }/results/`, + `${getAgentaApiUrl()}/api/human-evaluations/${item.id}/results/`, ) .then((results) => { if (item.evaluation_type === EvaluationType.human_a_b_testing) { diff --git a/agenta-web/src/components/pages/evaluations/evaluationResults/EvaluationResults.tsx b/agenta-web/src/components/pages/evaluations/evaluationResults/EvaluationResults.tsx index dcc5ae6ae4..10a1f937ac 100644 --- a/agenta-web/src/components/pages/evaluations/evaluationResults/EvaluationResults.tsx +++ b/agenta-web/src/components/pages/evaluations/evaluationResults/EvaluationResults.tsx @@ -112,8 +112,8 @@ export function getTypedValue(res?: TypedValue) { return type === "number" ? round(Number(value), 2) : ["boolean", "bool"].includes(type as string) - ? capitalize(value?.toString()) - : value?.toString() + ? capitalize(value?.toString()) + : value?.toString() } export function getFilterParams(type: "number" | "text" | "date") { @@ -144,8 +144,8 @@ export function getFilterParams(type: "number" | "text" | "date") { type === "number" ? "agNumberColumnFilter" : type === "date" - ? "agDateColumnFilter" - : "agTextColumnFilter", + ? "agDateColumnFilter" + : "agTextColumnFilter", cellDataType: type, filterParams, } diff --git a/agenta-web/src/components/pages/evaluations/evaluationResults/NewEvaluationModal.tsx b/agenta-web/src/components/pages/evaluations/evaluationResults/NewEvaluationModal.tsx index 5ea6dcd61c..5aed01d43f 100644 --- a/agenta-web/src/components/pages/evaluations/evaluationResults/NewEvaluationModal.tsx +++ b/agenta-web/src/components/pages/evaluations/evaluationResults/NewEvaluationModal.tsx @@ -4,7 +4,19 @@ import {evaluatorConfigsAtom, evaluatorsAtom} from "@/lib/atoms/evaluation" import {fetchTestsets, fetchVariants} from "@/lib/services/api" import {CreateEvaluationData, createEvalutaiton} from "@/services/evaluations" import {PlusOutlined, QuestionCircleOutlined} from "@ant-design/icons" -import {Divider, Form, Modal, Select, Spin, Tag, Typography, InputNumber, Row, Col, Tooltip} from "antd" +import { + Divider, + Form, + Modal, + Select, + Spin, + Tag, + Typography, + InputNumber, + Row, + Col, + Tooltip, +} from "antd" import dayjs from "dayjs" import {useAtom} from "jotai" import Image from "next/image" @@ -82,7 +94,7 @@ const NewEvaluationModal: React.FC = ({onSuccess, ...props}) => { delay_between_batches: 5, }) const onRateLimitInputChange = (field: keyof LLMRunRateLimit, value: number) => { - setRateLimitValues((prevValues: any) => ({ ...prevValues, [field]: value })); + setRateLimitValues((prevValues: any) => ({...prevValues, [field]: value})) } const onSubmit = (values: CreateEvaluationData) => { @@ -189,10 +201,7 @@ const NewEvaluationModal: React.FC = ({onSuccess, ...props}) => { - + = ({onSuccess, ...props}) => { } name="batch_size" - style={{ marginBottom: '0' }} + style={{marginBottom: "0"}} rules={[{required: true, message: "This field is required"}]} > - onRateLimitInputChange('batch_size', value) + onRateLimitInputChange("batch_size", value) } - style={{ width: '100%' }} + style={{width: "100%"}} /> @@ -233,9 +242,9 @@ const NewEvaluationModal: React.FC = ({onSuccess, ...props}) => { - onRateLimitInputChange('max_retries', value) + onRateLimitInputChange("max_retries", value) } - style={{ width: '100%' }} + style={{width: "100%"}} /> @@ -255,9 +264,9 @@ const NewEvaluationModal: React.FC = ({onSuccess, ...props}) => { - onRateLimitInputChange('retry_delay', value) + onRateLimitInputChange("retry_delay", value) } - style={{ width: '100%' }} + style={{width: "100%"}} /> @@ -277,9 +286,9 @@ const NewEvaluationModal: React.FC = ({onSuccess, ...props}) => { - onRateLimitInputChange('delay_between_batches', value) + onRateLimitInputChange("delay_between_batches", value) } - style={{ width: '100%' }} + style={{width: "100%"}} /> diff --git a/agenta-web/src/lib/services/api.ts b/agenta-web/src/lib/services/api.ts index 0ff9fc5859..0b95afd04a 100644 --- a/agenta-web/src/lib/services/api.ts +++ b/agenta-web/src/lib/services/api.ts @@ -369,13 +369,9 @@ export const createNewEvaluation = async ( status: EvaluationFlow.EVALUATION_INITIALIZED, } - const response = await axios.post( - `${getAgentaApiUrl()}/api/human-evaluations/`, - data, - { - _ignoreError: ignoreAxiosError, - } as any, - ) + const response = await axios.post(`${getAgentaApiUrl()}/api/human-evaluations/`, data, { + _ignoreError: ignoreAxiosError, + } as any) return response.data.id } From 6429d7ffa7b3b05387af74c6c6a8fe915ea97207 Mon Sep 17 00:00:00 2001 From: Abram Date: Wed, 3 Jan 2024 15:54:06 +0100 Subject: [PATCH 09/12] Update - wrapped entire logic in try-except block --- .../agenta_backend/tasks/evaluations.py | 173 +++++++++--------- 1 file changed, 90 insertions(+), 83 deletions(-) diff --git a/agenta-backend/agenta_backend/tasks/evaluations.py b/agenta-backend/agenta_backend/tasks/evaluations.py index a3579ccb99..799de79dc2 100644 --- a/agenta-backend/agenta_backend/tasks/evaluations.py +++ b/agenta-backend/agenta_backend/tasks/evaluations.py @@ -10,6 +10,7 @@ fetch_app_variant_by_id, fetch_evaluator_config, get_deployment_by_objectid, + update_evaluation, fetch_testset_by_id, create_new_evaluation_scenario, fetch_evaluator_config_by_appId, @@ -38,98 +39,104 @@ def evaluate( app = AppDB(**app_data) evaluation = NewEvaluation(**new_evaluation_data) - testset = loop.run_until_complete(fetch_testset_by_id(testset_id)) - new_evaluation_db = loop.run_until_complete(fetch_evaluation_by_id(evaluation_id)) - evaluators_aggregated_data = defaultdict(list) + try: + testset = loop.run_until_complete(fetch_testset_by_id(testset_id)) + new_evaluation_db = loop.run_until_complete(fetch_evaluation_by_id(evaluation_id)) + evaluators_aggregated_data = defaultdict(list) - variant_id = str(evaluation.variant_ids[0]) + variant_id = str(evaluation.variant_ids[0]) - app_variant_db = loop.run_until_complete(fetch_app_variant_by_id(variant_id)) - deployment = loop.run_until_complete( - get_deployment_by_objectid(app_variant_db.base.deployment) - ) - - #!NOTE: do not remove! this will be used in github workflow! - backend_environment = os.environ.get("ENVIRONMENT") - if backend_environment is not None and backend_environment == "github": - uri = f"http://{deployment.container_name}" - else: - uri = deployment.uri.replace("http://localhost", "http://host.docker.internal") - - # 1. We get the output from the llm app - app_outputs: List[AppOutput] = loop.run_until_complete( - llm_apps_service.batch_invoke( - uri, testset.csvdata, evaluation.rate_limit.dict() - ) - ) - for data_point, app_output in zip(testset.csvdata, app_outputs): - if len(testset.csvdata) != len(app_outputs): - # TODO: properly handle error in the case where the length are not the same - break - - # 2. We prepare the inputs - raw_inputs = ( - app_variant_db.parameters.get("inputs", []) - if app_variant_db.parameters - else [] + app_variant_db = loop.run_until_complete(fetch_app_variant_by_id(variant_id)) + deployment = loop.run_until_complete( + get_deployment_by_objectid(app_variant_db.base.deployment) ) - inputs = [] - if raw_inputs: - inputs = [ - EvaluationScenarioInputDB( - name=input_item["name"], - type="text", - value=data_point[input_item["name"]], - ) - for input_item in raw_inputs - ] - - # 3. We evaluate - evaluators_results: [EvaluationScenarioResult] = [] - for evaluator_config_id in evaluation.evaluators_configs: - evaluator_config = loop.run_until_complete( - fetch_evaluator_config(evaluator_config_id) - ) - additional_kwargs = ( - { - "app_params": app_variant_db.config.parameters, - "inputs": data_point, # TODO: fetch input from config parameters when #1102 has been fixed - } - if evaluator_config.evaluator_key == "custom_code_run" - else {} + #!NOTE: do not remove! this will be used in github workflow! + backend_environment = os.environ.get("ENVIRONMENT") + if backend_environment is not None and backend_environment == "github": + uri = f"http://{deployment.container_name}" + else: + uri = deployment.uri.replace("http://localhost", "http://host.docker.internal") + + # 1. We get the output from the llm app + app_outputs: List[AppOutput] = loop.run_until_complete( + llm_apps_service.batch_invoke( + uri, testset.csvdata, evaluation.rate_limit.dict() ) - result = evaluators_service.evaluate( - evaluator_config.evaluator_key, - app_output.output, - data_point["correct_answer"], - evaluator_config.settings_values, - **additional_kwargs, + ) + for data_point, app_output in zip(testset.csvdata, app_outputs): + if len(testset.csvdata) != len(app_outputs): + # TODO: properly handle error in the case where the length are not the same + break + + # 2. We prepare the inputs + raw_inputs = ( + app_variant_db.parameters.get("inputs", []) + if app_variant_db.parameters + else [] ) + inputs = [] + if raw_inputs: + inputs = [ + EvaluationScenarioInputDB( + name=input_item["name"], + type="text", + value=data_point[input_item["name"]], + ) + for input_item in raw_inputs + ] + + # 3. We evaluate + evaluators_results: [EvaluationScenarioResult] = [] + for evaluator_config_id in evaluation.evaluators_configs: + evaluator_config = loop.run_until_complete( + fetch_evaluator_config(evaluator_config_id) + ) + + additional_kwargs = ( + { + "app_params": app_variant_db.config.parameters, + "inputs": data_point, # TODO: fetch input from config parameters when #1102 has been fixed + } + if evaluator_config.evaluator_key == "custom_code_run" + else {} + ) + result = evaluators_service.evaluate( + evaluator_config.evaluator_key, + app_output.output, + data_point["correct_answer"], + evaluator_config.settings_values, + **additional_kwargs, + ) - result_object = EvaluationScenarioResult( - evaluator_config=evaluator_config.id, - result=result, + result_object = EvaluationScenarioResult( + evaluator_config=evaluator_config.id, + result=result, + ) + evaluators_results.append(result_object) + evaluators_aggregated_data[evaluator_config.evaluator_key].append(result) + + # 4. We create a new evaluation scenario + evaluation_scenario = loop.run_until_complete( + create_new_evaluation_scenario( + user=app.user, + organization=app.organization, + evaluation=new_evaluation_db, + variant_id=variant_id, + evaluators_configs=new_evaluation_db.evaluators_configs, + inputs=inputs, + is_pinned=False, + note="", + correct_answer=data_point["correct_answer"], + outputs=[EvaluationScenarioOutputDB(type="text", value=app_output.output)], + results=evaluators_results, ) - evaluators_results.append(result_object) - evaluators_aggregated_data[evaluator_config.evaluator_key].append(result) - - # 4. We create a new evaluation scenario - evaluation_scenario = loop.run_until_complete( - create_new_evaluation_scenario( - user=app.user, - organization=app.organization, - evaluation=new_evaluation_db, - variant_id=variant_id, - evaluators_configs=new_evaluation_db.evaluators_configs, - inputs=inputs, - is_pinned=False, - note="", - correct_answer=data_point["correct_answer"], - outputs=[EvaluationScenarioOutputDB(type="text", value=app_output.output)], - results=evaluators_results, ) - ) + except Exception as e: + print(f"An error occurred during evaluation: {e}") + loop.run_until_complete( + update_evaluation(evaluation_id, {"status": "EVALUATION_FAILED"}) + ) aggregated_results = loop.run_until_complete( aggregate_evaluator_results(app, evaluators_aggregated_data) From 06f5379f0f50a1080f633489a7b27d87fbe56cb5 Mon Sep 17 00:00:00 2001 From: Abram Date: Wed, 3 Jan 2024 15:58:34 +0100 Subject: [PATCH 10/12] Update - include rate_limit data in create_evaluation testcase --- .../agenta_backend/tasks/evaluations.py | 16 ++++++++++++---- .../test_evaluators_router.py | 6 ++++++ 2 files changed, 18 insertions(+), 4 deletions(-) diff --git a/agenta-backend/agenta_backend/tasks/evaluations.py b/agenta-backend/agenta_backend/tasks/evaluations.py index 799de79dc2..28f5ca7c84 100644 --- a/agenta-backend/agenta_backend/tasks/evaluations.py +++ b/agenta-backend/agenta_backend/tasks/evaluations.py @@ -41,7 +41,9 @@ def evaluate( try: testset = loop.run_until_complete(fetch_testset_by_id(testset_id)) - new_evaluation_db = loop.run_until_complete(fetch_evaluation_by_id(evaluation_id)) + new_evaluation_db = loop.run_until_complete( + fetch_evaluation_by_id(evaluation_id) + ) evaluators_aggregated_data = defaultdict(list) variant_id = str(evaluation.variant_ids[0]) @@ -56,7 +58,9 @@ def evaluate( if backend_environment is not None and backend_environment == "github": uri = f"http://{deployment.container_name}" else: - uri = deployment.uri.replace("http://localhost", "http://host.docker.internal") + uri = deployment.uri.replace( + "http://localhost", "http://host.docker.internal" + ) # 1. We get the output from the llm app app_outputs: List[AppOutput] = loop.run_until_complete( @@ -114,7 +118,9 @@ def evaluate( result=result, ) evaluators_results.append(result_object) - evaluators_aggregated_data[evaluator_config.evaluator_key].append(result) + evaluators_aggregated_data[evaluator_config.evaluator_key].append( + result + ) # 4. We create a new evaluation scenario evaluation_scenario = loop.run_until_complete( @@ -128,7 +134,9 @@ def evaluate( is_pinned=False, note="", correct_answer=data_point["correct_answer"], - outputs=[EvaluationScenarioOutputDB(type="text", value=app_output.output)], + outputs=[ + EvaluationScenarioOutputDB(type="text", value=app_output.output) + ], results=evaluators_results, ) ) diff --git a/agenta-backend/agenta_backend/tests/variants_evaluators_router/test_evaluators_router.py b/agenta-backend/agenta_backend/tests/variants_evaluators_router/test_evaluators_router.py index b399e37fd4..c86201d146 100644 --- a/agenta-backend/agenta_backend/tests/variants_evaluators_router/test_evaluators_router.py +++ b/agenta-backend/agenta_backend/tests/variants_evaluators_router/test_evaluators_router.py @@ -161,6 +161,12 @@ async def test_create_evaluation(): "variant_ids": [str(app_variant.id)], "evaluators_configs": [], "testset_id": str(testset.id), + "rate_limit": { + "batch_size": 10, + "max_retries": 3, + "retry_delay": 3, + "delay_between_batches": 5, + }, } # Fetch evaluator configs From fb143e21daa2143012692a7f2e6c2e2445071fd3 Mon Sep 17 00:00:00 2001 From: Abram Date: Wed, 3 Jan 2024 22:11:46 +0100 Subject: [PATCH 11/12] Update - added switch form item to show advanced rate-limit configuration --- .../evaluationResults/NewEvaluationModal.tsx | 196 ++++++++++-------- 1 file changed, 106 insertions(+), 90 deletions(-) diff --git a/agenta-web/src/components/pages/evaluations/evaluationResults/NewEvaluationModal.tsx b/agenta-web/src/components/pages/evaluations/evaluationResults/NewEvaluationModal.tsx index 5aed01d43f..dff2b706c8 100644 --- a/agenta-web/src/components/pages/evaluations/evaluationResults/NewEvaluationModal.tsx +++ b/agenta-web/src/components/pages/evaluations/evaluationResults/NewEvaluationModal.tsx @@ -15,6 +15,7 @@ import { InputNumber, Row, Col, + Switch, Tooltip, } from "antd" import dayjs from "dayjs" @@ -73,6 +74,7 @@ const NewEvaluationModal: React.FC = ({onSuccess, ...props}) => { const [evaluatorConfigs] = useAtom(evaluatorConfigsAtom) const [evaluators] = useAtom(evaluatorsAtom) const [submitLoading, setSubmitLoading] = useState(false) + const [showRateLimitInputs, setShowRateLimitInputs] = useState(false) const [form] = Form.useForm() useEffect(() => { @@ -96,6 +98,10 @@ const NewEvaluationModal: React.FC = ({onSuccess, ...props}) => { const onRateLimitInputChange = (field: keyof LLMRunRateLimit, value: number) => { setRateLimitValues((prevValues: any) => ({...prevValues, [field]: value})) } + const onRateLimitSwitchChange = (checked: boolean) => { + setShowRateLimitInputs(checked) + } + const onSubmit = (values: CreateEvaluationData) => { setSubmitLoading(true) @@ -199,101 +205,111 @@ const NewEvaluationModal: React.FC = ({onSuccess, ...props}) => { })} - + + + - - - - - Batch Size  - - - - - } - name="batch_size" - style={{marginBottom: "0"}} - rules={[{required: true, message: "This field is required"}]} - > - - onRateLimitInputChange("batch_size", value) + {showRateLimitInputs && ( + + + + + + Batch Size  + + + + } - style={{width: "100%"}} - /> - - - - - Max Retries  - - - - - } - name="max_retries" - rules={[{required: true, message: "This field is required"}]} - > - - onRateLimitInputChange("max_retries", value) + name="batch_size" + style={{marginBottom: "0"}} + rules={[{required: true, message: "This field is required"}]} + > + + onRateLimitInputChange("batch_size", value) + } + style={{width: "100%"}} + /> + + + + + Max Retries  + + + + } - style={{width: "100%"}} - /> - - - - - Retry Delay  - - - - - } - name="retry_delay" - rules={[{required: true, message: "This field is required"}]} - > - - onRateLimitInputChange("retry_delay", value) + name="max_retries" + rules={[{required: true, message: "This field is required"}]} + > + + onRateLimitInputChange("max_retries", value) + } + style={{width: "100%"}} + /> + + + + + Retry Delay  + + + + } - style={{width: "100%"}} - /> - - - - - Delay Between Batches  - - - - - } - name="delay_between_batches" - rules={[{required: true, message: "This field is required"}]} - > - - onRateLimitInputChange("delay_between_batches", value) + style={{ marginBottom: '0' }} + name="retry_delay" + rules={[{required: true, message: "This field is required"}]} + > + + onRateLimitInputChange("retry_delay", value) + } + style={{width: "100%"}} + /> + + + + + Delay Between Batches  + + + + } - style={{width: "100%"}} - /> - - - - + name="delay_between_batches" + style={{ marginBottom: '0' }} + rules={[{required: true, message: "This field is required"}]} + > + + onRateLimitInputChange("delay_between_batches", value) + } + style={{width: "100%"}} + /> + + + + + )} From 15846084f24143fd2dd9168fd0f6e107bf9babf5 Mon Sep 17 00:00:00 2001 From: Abram Date: Wed, 3 Jan 2024 22:12:35 +0100 Subject: [PATCH 12/12] :art: Format - ran prettier --- .../evaluationResults/NewEvaluationModal.tsx | 28 +++++++++++++------ 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/agenta-web/src/components/pages/evaluations/evaluationResults/NewEvaluationModal.tsx b/agenta-web/src/components/pages/evaluations/evaluationResults/NewEvaluationModal.tsx index dff2b706c8..e881685aff 100644 --- a/agenta-web/src/components/pages/evaluations/evaluationResults/NewEvaluationModal.tsx +++ b/agenta-web/src/components/pages/evaluations/evaluationResults/NewEvaluationModal.tsx @@ -102,7 +102,6 @@ const NewEvaluationModal: React.FC = ({onSuccess, ...props}) => { setShowRateLimitInputs(checked) } - const onSubmit = (values: CreateEvaluationData) => { setSubmitLoading(true) const EvaluationRateLimit: LLMRunRateLimit = rateLimitValues @@ -207,7 +206,7 @@ const NewEvaluationModal: React.FC = ({onSuccess, ...props}) => { @@ -228,7 +227,9 @@ const NewEvaluationModal: React.FC = ({onSuccess, ...props}) => { } name="batch_size" style={{marginBottom: "0"}} - rules={[{required: true, message: "This field is required"}]} + rules={[ + {required: true, message: "This field is required"}, + ]} > = ({onSuccess, ...props}) => { } name="max_retries" - rules={[{required: true, message: "This field is required"}]} + rules={[ + {required: true, message: "This field is required"}, + ]} > = ({onSuccess, ...props}) => { } - style={{ marginBottom: '0' }} + style={{marginBottom: "0"}} name="retry_delay" - rules={[{required: true, message: "This field is required"}]} + rules={[ + {required: true, message: "This field is required"}, + ]} > = ({onSuccess, ...props}) => { } name="delay_between_batches" - style={{ marginBottom: '0' }} - rules={[{required: true, message: "This field is required"}]} + style={{marginBottom: "0"}} + rules={[ + {required: true, message: "This field is required"}, + ]} > - onRateLimitInputChange("delay_between_batches", value) + onRateLimitInputChange( + "delay_between_batches", + value, + ) } style={{width: "100%"}} />