Enhancement: Resolve failing backend tests #1256

Merged 3 commits on Jan 24, 2024

4 changes: 4 additions & 0 deletions agenta-backend/agenta_backend/services/deployment_manager.py
@@ -139,3 +139,7 @@ async def validate_image(image: Image) -> bool:
f"Image {image.docker_id} with tags {image.tags} not found"
)
return True


def get_deployment_uri(deployment: DeploymentDB) -> str:
return deployment.uri.replace("http://localhost", "http://host.docker.internal")
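Note: the new get_deployment_uri helper only rewrites a locally exposed URI so that code running inside the backend container reaches the app through Docker's host gateway. A minimal usage sketch (the fake deployment object below is illustrative; the helper only reads the .uri attribute):

    from agenta_backend.services import deployment_manager

    class _FakeDeployment:
        # Stand-in for DeploymentDB; get_deployment_uri only reads .uri
        uri = "http://localhost:8001"

    print(deployment_manager.get_deployment_uri(_FakeDeployment()))
    # -> http://host.docker.internal:8001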
9 changes: 6 additions & 3 deletions agenta-backend/agenta_backend/services/llm_apps_service.py
@@ -1,12 +1,13 @@
import asyncio
import json
import asyncio
import logging
from typing import Any, Dict, List
import traceback
from typing import Any, Dict, List

from agenta_backend.models.api.evaluation_model import AppOutput

import httpx

from agenta_backend.models.api.evaluation_model import AppOutput

# Set logger
logger = logging.getLogger(__name__)
@@ -245,5 +246,7 @@ async def get_parameters_from_openapi(uri: str) -> List[Dict]:
async def _get_openai_json_from_uri(uri):
async with httpx.AsyncClient() as client:
resp = await client.get(uri)
timeout = httpx.Timeout(timeout=5, read=None, write=5)
resp = await client.get(uri, timeout=timeout)
json_data = json.loads(resp.text)
return json_data
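Note: httpx.Timeout(timeout=5, read=None, write=5) applies a 5-second limit to connect, write, and pool operations while disabling the read timeout, so a slow-to-respond LLM app can still serve its OpenAPI schema without failing the fetch. A sketch of the equivalent, fully spelled-out configuration (the function name and values below are illustrative):

    import httpx

    # Same effect as the change above, with every timeout category written out explicitly
    timeout = httpx.Timeout(connect=5.0, read=None, write=5.0, pool=5.0)

    async def fetch_openapi(uri: str) -> dict:
        async with httpx.AsyncClient(timeout=timeout) as client:
            resp = await client.get(uri)
            return resp.json()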
25 changes: 7 additions & 18 deletions agenta-backend/agenta_backend/tasks/evaluations.py
@@ -3,10 +3,9 @@
import os
import re
import traceback
from collections import defaultdict
from typing import Any, Dict, List

from agenta_backend.models.api.evaluation_model import AppOutput, NewEvaluation
from agenta_backend.models.api.evaluation_model import AppOutput
from agenta_backend.models.db_engine import DBEngine
from agenta_backend.models.db_models import (
AggregatedResult,
@@ -16,14 +15,17 @@
EvaluationScenarioResult,
Result,
)
from agenta_backend.services import evaluators_service, llm_apps_service
from agenta_backend.services import (
evaluators_service,
llm_apps_service,
deployment_manager,
)
from agenta_backend.services.db_manager import (
create_new_evaluation_scenario,
fetch_app_by_id,
fetch_app_variant_by_id,
fetch_evaluation_by_id,
fetch_evaluator_config,
fetch_evaluator_config_by_appId,
fetch_testset_by_id,
get_deployment_by_objectid,
update_evaluation,
@@ -83,7 +85,7 @@ def evaluate(
deployment_db = loop.run_until_complete(
get_deployment_by_objectid(app_variant_db.base.deployment)
)
uri = _get_deployment_uri(deployment_db)
uri = deployment_manager.get_deployment_uri(deployment_db)

# 2. Initialize vars
evaluators_aggregated_data = {
@@ -246,19 +248,6 @@ async def aggregate_evaluator_results(
return aggregated_results


def _get_deployment_uri(deployment_db) -> str:
#!NOTE: do not remove! this will be used in github workflow!
backend_environment = os.environ.get(
"ENVIRONMENT"
) # TODO @abram rename the environment variable to something other than environment!!!
if backend_environment is not None and backend_environment == "github":
return f"http://{deployment_db.container_name}" # TODO: @abram Remove this from here. Move it to the deployment manager
else:
return deployment_db.uri.replace(
"http://localhost", "http://host.docker.internal"
)
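Note: the removed helper above special-cased ENVIRONMENT=github by targeting the container name directly, whereas the consolidated deployment_manager.get_deployment_uri in this PR only performs the localhost rewrite. Purely as an illustration (not what this PR ships), an environment-aware version of the shared helper could look like:

    import os

    def get_deployment_uri(deployment) -> str:
        # Hypothetical sketch folding the removed GitHub-workflow branch into the shared helper
        if os.environ.get("ENVIRONMENT") == "github":
            return f"http://{deployment.container_name}"
        return deployment.uri.replace("http://localhost", "http://host.docker.internal")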


def get_app_inputs(app_variant_parameters, openapi_parameters) -> List[Dict[str, str]]:
"""
Get a list of application inputs based on the app variant parameters and openapi parameters.
@@ -322,8 +322,6 @@ def auto_ai_critique_evaluator_config():
"settings_values": {
"open_ai_key": OPEN_AI_KEY,
"temperature": 0.9,
"evaluation_prompt_template": "We have an LLM App that we want to evaluate its outputs. Based on the prompt and the parameters provided below evaluate the output based on the evaluation strategy below: Evaluation strategy: 0 to 10 0 is very bad and 10 is very good. Prompt: {llm_app_prompt_template} Inputs: country: {country} Correct Answer:{correct_answer} Evaluate this: {variant_output} Answer ONLY with one of the given grading or evaluation options.",
"llm_app_prompt_template": "",
"llm_app_inputs": [{"input_name": "country", "input_value": "tunisia"}],
"prompt_template": "We have an LLM App that we want to evaluate its outputs. Based on the prompt and the parameters provided below evaluate the output based on the evaluation strategy below: Evaluation strategy: 0 to 10 0 is very bad and 10 is very good. Prompt: {llm_app_prompt_template} Inputs: country: {country} Correct Answer:{correct_answer} Evaluate this: {variant_output} Answer ONLY with one of the given grading or evaluation options.",
},
}
@@ -6,7 +6,6 @@
from agenta_backend.models.api.evaluation_model import EvaluationStatusEnum
from agenta_backend.models.db_models import (
AppDB,
ConfigDB,
TestSetDB,
AppVariantDB,
EvaluationDB,
@@ -23,6 +22,7 @@
# Set global variables
APP_NAME = "evaluation_in_backend"
ENVIRONMENT = os.environ.get("ENVIRONMENT")
OPEN_AI_KEY = os.environ.get("OPENAI_API_KEY")
if ENVIRONMENT == "development":
BACKEND_API_HOST = "http://host.docker.internal/api"
elif ENVIRONMENT == "github":
@@ -178,6 +178,7 @@ async def test_create_evaluation():
"variant_ids": [str(app_variant.id)],
"evaluators_configs": [],
"testset_id": str(testset.id),
"lm_providers_keys": {"openai": OPEN_AI_KEY},
"rate_limit": {
"batch_size": 10,
"max_retries": 3,
@@ -199,6 +200,9 @@
# Update payload with list of configs ids
payload["evaluators_configs"] = list_of_configs_ids

# Sleep for 10 seconds (to allow the llm app container to start completely)
await asyncio.sleep(10)
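Note: the fixed 10-second pause assumes the freshly started LLM app container is serving by then. A hypothetical readiness check (not part of this PR) could poll the app's OpenAPI endpoint instead of sleeping for a fixed interval:

    import asyncio
    import httpx

    async def wait_until_ready(uri: str, attempts: int = 20, delay: float = 1.0) -> None:
        # Hypothetical helper: poll until the app answers instead of sleeping blindly
        async with httpx.AsyncClient() as client:
            for _ in range(attempts):
                try:
                    resp = await client.get(f"{uri}/openapi.json", timeout=5.0)
                    if resp.status_code == 200:
                        return
                except httpx.HTTPError:
                    pass
                await asyncio.sleep(delay)
        raise TimeoutError(f"App at {uri} did not become ready")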

# Make request to create evaluation
response = await test_client.post(
f"{BACKEND_API_HOST}/evaluations/", json=payload, timeout=timeout
@@ -220,7 +224,7 @@ async def test_fetch_evaluation_status():

# Prepare and start short-polling request
max_attempts = 10
intervals = 3 # seconds
intervals = 5 # seconds
for _ in range(max_attempts):
response = await test_client.get(
f"{BACKEND_API_HOST}/evaluations/{str(evaluation.id)}/status/",
@@ -251,7 +255,7 @@ async def test_fetch_evaluation_results():

assert response.status_code == 200
assert response_data["evaluation_id"] == str(evaluation.id)
assert len(response_data["results"]) == 5
assert len(response_data["results"]) == 6


@pytest.mark.asyncio