diff --git a/agenta-backend/Dockerfile b/agenta-backend/Dockerfile
index 0f7eec3b78..0591506007 100644
--- a/agenta-backend/Dockerfile
+++ b/agenta-backend/Dockerfile
@@ -22,6 +22,17 @@ RUN touch /app/agenta_backend/__init__.py
 RUN poetry config virtualenvs.create false \
     && poetry install --no-interaction --no-ansi
 
+RUN apt-get update -y \
+    && apt-get install -y git \
+    && git clone https://github.com/mmabrouk/beanie /tmp/beanie \
+    && python -m venv /beanie_venv \
+    && /beanie_venv/bin/pip install --upgrade pip \
+    && /beanie_venv/bin/pip install /tmp/beanie \
+    && rm -rf /tmp/beanie \
+    && apt-get remove -y git \
+    && apt-get clean \
+    && rm -rf /var/lib/apt/lists/*
+
 # remove dummy module
 RUN rm -r /app/agenta_backend
 EXPOSE 8000
\ No newline at end of file
diff --git a/agenta-backend/agenta_backend/services/deployment_manager.py b/agenta-backend/agenta_backend/services/deployment_manager.py
index a43e302a98..3cc7599766 100644
--- a/agenta-backend/agenta_backend/services/deployment_manager.py
+++ b/agenta-backend/agenta_backend/services/deployment_manager.py
@@ -139,3 +139,12 @@ async def validate_image(image: Image) -> bool:
             f"Image {image.docker_id} with tags {image.tags} not found"
         )
     return True
+
+
+def get_deployment_uri(deployment: DeploymentDB) -> str:
+    #!NOTE: do not remove! this will be used in github workflow!
+    backend_environment = os.environ.get("ENVIRONMENT")
+    if backend_environment is not None and backend_environment == "github":
+        return f"http://{deployment.container_name}"
+    else:
+        return deployment.uri.replace("http://localhost", "http://host.docker.internal")
diff --git a/agenta-backend/agenta_backend/services/llm_apps_service.py b/agenta-backend/agenta_backend/services/llm_apps_service.py
index 59b826c528..5dcacfe03a 100644
--- a/agenta-backend/agenta_backend/services/llm_apps_service.py
+++ b/agenta-backend/agenta_backend/services/llm_apps_service.py
@@ -1,8 +1,8 @@
-import asyncio
 import json
+import asyncio
 import logging
-from typing import Any, Dict, List
 import traceback
+from typing import Any, Dict, List
 
 import httpx
 
@@ -244,6 +244,7 @@ async def get_parameters_from_openapi(uri: str) -> List[Dict]:
 
 async def _get_openai_json_from_uri(uri):
     async with httpx.AsyncClient() as client:
-        resp = await client.get(uri)
+        timeout = httpx.Timeout(timeout=5, read=None, write=5)
+        resp = await client.get(uri, timeout=timeout)
         json_data = json.loads(resp.text)
         return json_data
diff --git a/agenta-backend/agenta_backend/tasks/evaluations.py b/agenta-backend/agenta_backend/tasks/evaluations.py
index cfb130621f..4a4b8d011c 100644
--- a/agenta-backend/agenta_backend/tasks/evaluations.py
+++ b/agenta-backend/agenta_backend/tasks/evaluations.py
@@ -1,12 +1,10 @@
 import asyncio
 import logging
-import os
 import re
 import traceback
-from collections import defaultdict
 from typing import Any, Dict, List
 
-from agenta_backend.models.api.evaluation_model import AppOutput, NewEvaluation
+from agenta_backend.models.api.evaluation_model import AppOutput
 from agenta_backend.models.db_engine import DBEngine
 from agenta_backend.models.db_models import (
     AggregatedResult,
@@ -16,14 +14,17 @@
     EvaluationScenarioResult,
     Result,
 )
-from agenta_backend.services import evaluators_service, llm_apps_service
+from agenta_backend.services import (
+    evaluators_service,
+    llm_apps_service,
+    deployment_manager,
+)
 from agenta_backend.services.db_manager import (
     create_new_evaluation_scenario,
     fetch_app_by_id,
     fetch_app_variant_by_id,
     fetch_evaluation_by_id,
     fetch_evaluator_config,
-    fetch_evaluator_config_by_appId,
     fetch_testset_by_id,
     get_deployment_by_objectid,
     update_evaluation,
@@ -83,7 +84,7 @@ def evaluate(
         deployment_db = loop.run_until_complete(
             get_deployment_by_objectid(app_variant_db.base.deployment)
         )
-        uri = _get_deployment_uri(deployment_db)
+        uri = deployment_manager.get_deployment_uri(deployment_db)
 
         # 2. Initialize vars
         evaluators_aggregated_data = {
@@ -246,19 +247,6 @@ async def aggregate_evaluator_results(
     return aggregated_results
 
 
-def _get_deployment_uri(deployment_db) -> str:
-    #!NOTE: do not remove! this will be used in github workflow!
-    backend_environment = os.environ.get(
-        "ENVIRONMENT"
-    )  # TODO @abram rename the environment variable to something other than environment!!!
-    if backend_environment is not None and backend_environment == "github":
-        return f"http://{deployment_db.container_name}"  # TODO: @abram Remove this from here. Move it to the deployment manager
-    else:
-        return deployment_db.uri.replace(
-            "http://localhost", "http://host.docker.internal"
-        )
-
-
 def get_app_inputs(app_variant_parameters, openapi_parameters) -> List[Dict[str, str]]:
     """
     Get a list of application inputs based on the app variant parameters and openapi parameters.
diff --git a/agenta-backend/agenta_backend/tests/variants_main_router/conftest.py b/agenta-backend/agenta_backend/tests/variants_main_router/conftest.py
index 2401f352b3..3a3d58979a 100644
--- a/agenta-backend/agenta_backend/tests/variants_main_router/conftest.py
+++ b/agenta-backend/agenta_backend/tests/variants_main_router/conftest.py
@@ -322,8 +322,6 @@ def auto_ai_critique_evaluator_config():
         "settings_values": {
             "open_ai_key": OPEN_AI_KEY,
             "temperature": 0.9,
-            "evaluation_prompt_template": "We have an LLM App that we want to evaluate its outputs. Based on the prompt and the parameters provided below evaluate the output based on the evaluation strategy below: Evaluation strategy: 0 to 10 0 is very bad and 10 is very good. Prompt: {llm_app_prompt_template} Inputs: country: {country} Correct Answer:{correct_answer} Evaluate this: {variant_output} Answer ONLY with one of the given grading or evaluation options.",
-            "llm_app_prompt_template": "",
-            "llm_app_inputs": [{"input_name": "country", "input_value": "tunisia"}],
+            "prompt_template": "We have an LLM App that we want to evaluate its outputs. Based on the prompt and the parameters provided below evaluate the output based on the evaluation strategy below: Evaluation strategy: 0 to 10 0 is very bad and 10 is very good. Prompt: {llm_app_prompt_template} Inputs: country: {country} Correct Answer:{correct_answer} Evaluate this: {variant_output} Answer ONLY with one of the given grading or evaluation options.",
         },
     }
diff --git a/agenta-backend/agenta_backend/tests/variants_main_router/test_variant_evaluators_router.py b/agenta-backend/agenta_backend/tests/variants_main_router/test_variant_evaluators_router.py
index d17603e211..5f457d8db0 100644
--- a/agenta-backend/agenta_backend/tests/variants_main_router/test_variant_evaluators_router.py
+++ b/agenta-backend/agenta_backend/tests/variants_main_router/test_variant_evaluators_router.py
@@ -6,7 +6,6 @@
 from agenta_backend.models.api.evaluation_model import EvaluationStatusEnum
 from agenta_backend.models.db_models import (
     AppDB,
-    ConfigDB,
     TestSetDB,
     AppVariantDB,
     EvaluationDB,
@@ -23,6 +22,7 @@
 # Set global variables
 APP_NAME = "evaluation_in_backend"
 ENVIRONMENT = os.environ.get("ENVIRONMENT")
+OPEN_AI_KEY = os.environ.get("OPENAI_API_KEY")
 if ENVIRONMENT == "development":
     BACKEND_API_HOST = "http://host.docker.internal/api"
 elif ENVIRONMENT == "github":
@@ -178,6 +178,7 @@ async def test_create_evaluation():
         "variant_ids": [str(app_variant.id)],
         "evaluators_configs": [],
         "testset_id": str(testset.id),
+        "lm_providers_keys": {"openai": OPEN_AI_KEY},
         "rate_limit": {
             "batch_size": 10,
             "max_retries": 3,
@@ -199,6 +200,9 @@ async def test_create_evaluation():
     # Update payload with list of configs ids
     payload["evaluators_configs"] = list_of_configs_ids
 
+    # Sleep for 10 seconds
+    await asyncio.sleep(10)
+
     # Make request to create evaluation
     response = await test_client.post(
         f"{BACKEND_API_HOST}/evaluations/", json=payload, timeout=timeout
@@ -220,7 +224,7 @@ async def test_fetch_evaluation_status():
 
     # Prepare and start short-polling request
     max_attempts = 10
-    intervals = 3  # seconds
+    intervals = 5  # seconds
     for _ in range(max_attempts):
         response = await test_client.get(
             f"{BACKEND_API_HOST}/evaluations/{str(evaluation.id)}/status/",
@@ -251,7 +255,7 @@ async def test_fetch_evaluation_results():
 
     assert response.status_code == 200
     assert response_data["evaluation_id"] == str(evaluation.id)
-    assert len(response_data["results"]) == 5
+    assert len(response_data["results"]) == 6
 
 
 @pytest.mark.asyncio
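For reference, a minimal runnable sketch of the URI-resolution logic that this patch relocates from tasks/evaluations.py into services/deployment_manager.get_deployment_uri(). It is not part of the diff above; the DeploymentDB stand-in is a hypothetical stub carrying only the two fields the helper reads (the real model is a Beanie document with more fields).

import os
from dataclasses import dataclass


@dataclass
class DeploymentDB:
    """Hypothetical stand-in for the real Beanie document; illustration only."""

    container_name: str
    uri: str


def get_deployment_uri(deployment: DeploymentDB) -> str:
    # Mirrors the logic added in services/deployment_manager.py above:
    # in the GitHub workflow (ENVIRONMENT=github) the backend reaches the LLM app
    # by its container name; otherwise "localhost" is rewritten so the request
    # works from inside the backend's Docker container.
    if os.environ.get("ENVIRONMENT") == "github":
        return f"http://{deployment.container_name}"
    return deployment.uri.replace("http://localhost", "http://host.docker.internal")


if __name__ == "__main__":
    d = DeploymentDB(container_name="app-abc123", uri="http://localhost:32768")
    # With ENVIRONMENT unset, prints http://host.docker.internal:32768
    print(get_deployment_uri(d))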