Custom Code Evaluations #610

Merged (105 commits, Sep 17, 2023)

Commits
d5774bf
Update - added restrictedpython
aybruhm Sep 10, 2023
e2d57bd
Feat - created security module
aybruhm Sep 10, 2023
3cefed1
Feat - implemented execute_code_safely function
aybruhm Sep 10, 2023
7e74d5a
Feat - created custom evaluation db collection
aybruhm Sep 11, 2023
3e0ee16
Feat - created custom evaluation type and store custom evaluation api…
aybruhm Sep 11, 2023
ad4fe0d
Feat - implemented store and execute custom code evaluation logics
aybruhm Sep 11, 2023
f4bd4ea
Feat - implemented function to check if module import is safe to ensu…
aybruhm Sep 11, 2023
05107f1
Cleanup - remove app_name from execute_custom_code_evaluation
aybruhm Sep 11, 2023
182c812
Feat - implemented store and execute custom evaluation routers
aybruhm Sep 11, 2023
7e8e805
Update - added custom_code_run to evaluation type and labels
aybruhm Sep 11, 2023
1e6c7d2
Feat - upload custom_code image
aybruhm Sep 11, 2023
b9c705d
Feat - created store custom evaluation type interface
aybruhm Sep 11, 2023
45e3f0d
Feat - created type interface for store custom evaluation success res…
aybruhm Sep 11, 2023
37ab07f
Feat - implemented save custom code evaluation api logic
aybruhm Sep 11, 2023
db97338
Feat - implemented custom evaluation dropdown component
aybruhm Sep 11, 2023
296b461
Update - added type dropdown component
aybruhm Sep 11, 2023
e97c9f6
Feat - implemented custom python code component
aybruhm Sep 11, 2023
9219fb7
Refactor - renamed component prop interface
aybruhm Sep 11, 2023
998a0af
Feat - created type interface for single custom evaluation
aybruhm Sep 11, 2023
9949093
Feat - implemented axios logic to fetch custom evaluations
aybruhm Sep 11, 2023
7cc58a8
Update - improve security in sandbox environment
aybruhm Sep 11, 2023
acdd892
Cleanup - removed custom evaluation type embedded model and some fiel…
aybruhm Sep 11, 2023
e4e3fa2
Feat - implemented fetch custom evaluations evaluation service
aybruhm Sep 11, 2023
193f619
Feat - implemented list custom evaluations api router
aybruhm Sep 11, 2023
7b9987a
Feat - created custom evaluation output and added new type in evaluat…
aybruhm Sep 11, 2023
7619a9b
Update - modified custom evaluations dropdown component to set custom…
aybruhm Sep 12, 2023
0b79b68
Update - include custom python code and evaluation dropdowns componen…
aybruhm Sep 12, 2023
5670435
Refactor - removed custom_code.png
aybruhm Sep 12, 2023
7e708d7
Feat - created evaluation api model to execute custom evaluation code
aybruhm Sep 12, 2023
3b1a5b6
Feat - implemented custom code run evaluation page
aybruhm Sep 12, 2023
e23e92a
Feat - implemented helper function to include dynamic values
aybruhm Sep 12, 2023
b008773
Update - add condition to save correct_answer for cusutom_code evalua…
aybruhm Sep 12, 2023
459f400
Feat - created type interface for execute custom eval code
aybruhm Sep 12, 2023
dd13b63
Feat - implemented axios logic to execute custom evaluation code
aybruhm Sep 12, 2023
530f72d
Update - added optional field (correct_answer)
aybruhm Sep 12, 2023
3b10006
Feat - implemented fetch average score for custom code run result ser…
aybruhm Sep 12, 2023
9d7a87c
Update - modified fetch_results and execute_custom_evaluation routers
aybruhm Sep 12, 2023
5ba12ea
Cleanup - remove unused code-blocks
aybruhm Sep 12, 2023
06ee96f
Feat - implemented custom code run evaluation table component
aybruhm Sep 12, 2023
e2aa26a
:art: Format - ran format-fix and black
aybruhm Sep 12, 2023
ed19a31
Merge branch 'main' into gh/custom-code-evaluation-in-ui
aybruhm Sep 12, 2023
67972ea
Feat - created create custom evaluation page
aybruhm Sep 12, 2023
5f7c7b3
Update - removed custom python code in evaluation component
aybruhm Sep 12, 2023
a80fdac
Cleanup - formatted custom evaluations dropdown component
aybruhm Sep 12, 2023
772635b
Refactor - renamed saveCutomCodeEvaluation to saveCustomCodeEvaluation
aybruhm Sep 12, 2023
6422f06
Update - added new styles
aybruhm Sep 12, 2023
ba348fc
Update - introduce pre-filled example of an evaluation function and s…
aybruhm Sep 12, 2023
2d9e359
:art: Format - ran format-fix and black
aybruhm Sep 12, 2023
4c2a6e1
Update - added variant_name to type interface ExecuteCustomEvalCode
aybruhm Sep 12, 2023
82aaa8c
Update - added app_params, output to sandbox and allow execute of eva…
aybruhm Sep 12, 2023
d47950b
Update - added output to execute_custom_code_execution service function
aybruhm Sep 12, 2023
4eae667
Update - added app_name, variant_name, and outputs to execute_custom_…
aybruhm Sep 12, 2023
4b3e778
Refactor - modified executeCustomEvaluationCode axios api logic
aybruhm Sep 12, 2023
6c6bb84
Update - added styles for copy btn in custom python code component
aybruhm Sep 12, 2023
70a7a0b
Update - refactor evaluate function and added new args in callCUstomC…
aybruhm Sep 12, 2023
c3e14ed
Update - add custom code evaluation id to evaluation
aybruhm Sep 12, 2023
6075cf1
Update - retrieve evaluations for custom code evals
aybruhm Sep 12, 2023
71736a9
Update -added custom code evalation id to router push
aybruhm Sep 12, 2023
3df9720
Update - added custom_code_evaluation_id and made it optional
aybruhm Sep 12, 2023
25b9715
Update - added btn to copy code example for custom evaluation function
aybruhm Sep 12, 2023
1b0e4b7
Update - created format_outputs helper function
aybruhm Sep 12, 2023
0905050
Update - added correct_answer to execute custom evaluation code api m…
aybruhm Sep 12, 2023
087160c
Update - modified evaluation function example description
aybruhm Sep 12, 2023
4ae2c74
Update - modified fetch_average_score_for_custom_code_run
aybruhm Sep 12, 2023
17235cd
Feat - created update_evaluation_scenario_score logic and added doc s…
aybruhm Sep 12, 2023
ef95da8
Update - include correct_answer to custom eval code params
aybruhm Sep 12, 2023
ded6649
Feat - implemented update evaluation scenario score axios logic
aybruhm Sep 12, 2023
d7f2584
Feat - created evaluation scenario score update api model
aybruhm Sep 12, 2023
d2ae93a
Update - receive put data by payload instead of query
aybruhm Sep 12, 2023
b6a80b6
Update - modified custom code run evaluation table component
aybruhm Sep 12, 2023
175ef80
:art: Format - ran format-fix and black
aybruhm Sep 12, 2023
66fb647
Update - installed packages.json
aybruhm Sep 12, 2023
d4a33f7
Update - integrated ace editor for code input and syntax highlighting
aybruhm Sep 12, 2023
c01e81e
Update - set result and avg_score to 2 decimal places
aybruhm Sep 12, 2023
f9b8929
:art: Format - ran format-fix
aybruhm Sep 12, 2023
253777b
Cleanup - add ? to handle undefined error
aybruhm Sep 13, 2023
3a98f9f
:art: Format - ran format-fix
aybruhm Sep 13, 2023
b05fa01
Merge branch 'main' into gh/custom-code-evaluation-in-ui
aybruhm Sep 13, 2023
7ef2d6a
:art: Format - ran format-fix and black
aybruhm Sep 13, 2023
6d74e30
Cleanup - removed raise exception when no custom evaluations is found
aybruhm Sep 13, 2023
67442cb
Refactor - override error interceptor for get all variant parameters …
aybruhm Sep 14, 2023
ec1452d
Cleanup - removed console log
aybruhm Sep 14, 2023
1160122
Feat - created backend router to get evaluation scenario score and ax…
aybruhm Sep 14, 2023
df09373
Update - round score by 2 decimal
aybruhm Sep 14, 2023
a6e908b
Refactor - removed CustomEvaluationsDropdown component
aybruhm Sep 14, 2023
493d166
Refactor - improve get_evaluation_scenario_score_router
aybruhm Sep 14, 2023
e498683
Refactor - directly include dropdown select of custom evaluations
aybruhm Sep 14, 2023
89b579e
Update - added logic to fetch results of ran evaluation scenarios
aybruhm Sep 14, 2023
b93c371
:art: Format - ran format-fix and black
aybruhm Sep 14, 2023
3c5fcfa
Cleanup - fix type error
aybruhm Sep 14, 2023
5750a80
custom code evaluation: ui enhancements and bug fixes
MohammedMaaz Sep 15, 2023
9dcb7ea
resolve type errors
MohammedMaaz Sep 15, 2023
52dfae1
ran prettier
MohammedMaaz Sep 15, 2023
89924b9
Refactor - renamed store to create
aybruhm Sep 17, 2023
a086a4c
:art: Format - ran black
aybruhm Sep 17, 2023
163297a
Cleanup - removed react-ace and installed monaco-editor
aybruhm Sep 17, 2023
09ae969
Refactor - switch from react-ace to monaco-editor
aybruhm Sep 17, 2023
00b66e5
Feat - created custom evaluation names api model
aybruhm Sep 17, 2023
bfa4beb
Feat - implemented fetch custom evaluation names service
aybruhm Sep 17, 2023
13dc4aa
Feat - implemented evaluation router to get custom evaluation names a…
aybruhm Sep 17, 2023
679e361
Feat - added validation to check if evaluation name (input) exists
aybruhm Sep 17, 2023
5547f4a
:art: Format - ran format-fix
aybruhm Sep 17, 2023
ce00151
Refactor - remove /create from evaluation_router and renamed all pref…
aybruhm Sep 17, 2023
7cb08fd
Refactor - renamed Store prefix to Create
aybruhm Sep 17, 2023
dbffbcc
Cleanup - renamed store custom evaluation success reponse to start wi…
aybruhm Sep 17, 2023
53 changes: 50 additions & 3 deletions agenta-backend/agenta_backend/models/api/evaluation_model.py
@@ -1,7 +1,7 @@
-from pydantic import BaseModel, Field
-from typing import Optional, List, Dict
-from datetime import datetime
 from enum import Enum
+from datetime import datetime
+from pydantic import BaseModel, Field
+from typing import Optional, List, Dict, Any


 class EvaluationTypeSettings(BaseModel):
@@ -19,6 +19,7 @@ class EvaluationType(str, Enum):
     auto_ai_critique = "auto_ai_critique"
     human_a_b_testing = "human_a_b_testing"
     human_scoring = "human_scoring"
+    custom_code_run = "custom_code_run"


 class EvaluationStatusEnum(str, Enum):
@@ -33,6 +34,9 @@ class Evaluation(BaseModel):
     status: str
     evaluation_type: EvaluationType
     evaluation_type_settings: Optional[EvaluationTypeSettings]
+    custom_code_evaluation_id: Optional[
+        str
+    ]  # will be added when running custom code evaluation
     llm_app_prompt_template: Optional[str]
     variants: Optional[List[str]]
     app_name: str
@@ -70,13 +74,21 @@ class EvaluationScenario(BaseModel):
 class EvaluationScenarioUpdate(BaseModel):
     vote: Optional[str]
     score: Optional[str]
+    correct_answer: Optional[str]  # will be used when running custom code evaluation
     outputs: List[EvaluationScenarioOutput]
     evaluation_prompt_template: Optional[str]
     open_ai_key: Optional[str]


+class EvaluationScenarioScoreUpdate(BaseModel):
+    score: float
+
+
 class NewEvaluation(BaseModel):
     evaluation_type: EvaluationType
+    custom_code_evaluation_id: Optional[
+        str
+    ]  # will be added when running custom code evaluation
     evaluation_type_settings: Optional[EvaluationTypeSettings]
     app_name: str
     variants: List[str]
@@ -90,5 +102,40 @@ class DeleteEvaluation(BaseModel):
     evaluations_ids: List[str]


+class CreateCustomEvaluation(BaseModel):
+    evaluation_name: str
+    python_code: str
+    app_name: str
+
+
+class CustomEvaluationOutput(BaseModel):
+    id: str
+    app_name: str
+    evaluation_name: str
+    created_at: datetime
+
+
+class CustomEvaluationDetail(BaseModel):
+    id: str
+    app_name: str
+    evaluation_name: str
+    python_code: str
+    created_at: datetime
+    updated_at: datetime
+
+
+class CustomEvaluationNames(BaseModel):
+    id: str
+    evaluation_name: str
+
+
+class ExecuteCustomEvaluationCode(BaseModel):
+    inputs: List[Dict[str, Any]]
+    app_name: str
+    variant_name: str
+    correct_answer: str
+    outputs: List[Dict[str, Any]]
+
+
 class EvaluationWebhook(BaseModel):
     score: float
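
For orientation, a hedged example of how the new request models might be populated. All values are illustrative, and the exact shape of the `inputs` and `outputs` entries is an assumption inferred from how the router below indexes the formatted outputs by `variant_name`:

# Illustrative only; field values and the shape of the inputs/outputs entries
# are assumptions, not fixed by the models themselves.
from agenta_backend.models.api.evaluation_model import (
    CreateCustomEvaluation,
    ExecuteCustomEvaluationCode,
)

create_payload = CreateCustomEvaluation(
    evaluation_name="exact_match",
    python_code="def evaluate(output, correct_answer):\n"
    "    return float(output == correct_answer)",
    app_name="my_app",
)

execute_payload = ExecuteCustomEvaluationCode(
    inputs=[{"input_name": "country", "input_value": "France"}],
    app_name="my_app",
    variant_name="v1",
    correct_answer="Paris",
    outputs=[{"variant_name": "v1", "variant_output": "Paris"}],
)
print(execute_payload.json())
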
13 changes: 13 additions & 0 deletions agenta-backend/agenta_backend/models/db_models.py
@@ -86,6 +86,7 @@ class EvaluationScenarioOutput(EmbeddedModel):
 class EvaluationDB(Model):
     status: str
     evaluation_type: str
+    custom_code_evaluation_id: Optional[str]
     evaluation_type_settings: EvaluationTypeSettings
     llm_app_prompt_template: str
     variants: List[str]
@@ -115,6 +116,18 @@ class Config:
         collection = "evaluation_scenarios"


+class CustomEvaluationDB(Model):
+    evaluation_name: str
+    python_code: str
+    app_name: str
+    user: UserDB = Reference()
+    created_at: Optional[datetime] = Field(default=datetime.utcnow())
+    updated_at: Optional[datetime] = Field(default=datetime.utcnow())
+
+    class Config:
+        collection = "custom_evaluations"
+
+
 class TestSetDB(Model):
     name: str
     app_name: str
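
One detail worth flagging in `CustomEvaluationDB`: `Field(default=datetime.utcnow())` calls `utcnow()` once, at class-definition time, so documents created later in the same process inherit that stale timestamp. The usual pydantic idiom defers the call per instance; a minimal sketch:

from datetime import datetime

from pydantic import BaseModel, Field


class TimestampedDoc(BaseModel):
    # default_factory is invoked on each instantiation,
    # so every document gets its own creation time.
    created_at: datetime = Field(default_factory=datetime.utcnow)
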
215 changes: 214 additions & 1 deletion agenta-backend/agenta_backend/routers/evaluation_router.py
@@ -1,22 +1,31 @@
 import os
+import random
+from bson import ObjectId
 from datetime import datetime
 from typing import List, Optional
-import random

 from fastapi.responses import JSONResponse
 from fastapi import HTTPException, APIRouter, Body, Depends

+from agenta_backend.services.helpers import format_inputs, format_outputs
 from agenta_backend.models.api.evaluation_model import (
+    CustomEvaluationNames,
     Evaluation,
     EvaluationScenario,
+    CustomEvaluationOutput,
+    CustomEvaluationDetail,
+    EvaluationScenarioScoreUpdate,
     EvaluationScenarioUpdate,
+    ExecuteCustomEvaluationCode,
     NewEvaluation,
     DeleteEvaluation,
     EvaluationType,
+    CreateCustomEvaluation,
     EvaluationUpdate,
     EvaluationWebhook,
 )
 from agenta_backend.services.results_service import (
+    fetch_average_score_for_custom_code_run,
     fetch_results_for_human_a_b_testing_evaluation,
     fetch_results_for_auto_exact_match_evaluation,
     fetch_results_for_auto_similarity_match_evaluation,
@@ -26,10 +35,17 @@
 )
 from agenta_backend.services.evaluation_service import (
     UpdateEvaluationScenarioError,
+    fetch_custom_evaluation_names,
+    fetch_custom_evaluations,
+    fetch_custom_evaluation_detail,
+    get_evaluation_scenario_score,
     update_evaluation_scenario,
+    update_evaluation_scenario_score,
     update_evaluation,
     create_new_evaluation,
     create_new_evaluation_scenario,
+    create_custom_code_evaluation,
+    execute_custom_code_evaluation,
 )
 from agenta_backend.services.db_manager import engine, query, get_user_object
 from agenta_backend.models.db_models import EvaluationDB, EvaluationScenarioDB
@@ -213,6 +229,60 @@ async def update_evaluation_scenario_router(
         raise HTTPException(status_code=500, detail=str(e)) from e


+@router.get("/evaluation_scenario/{evaluation_scenario_id}/score")
+async def get_evaluation_scenario_score_router(
+    evaluation_scenario_id: str,
+    stoken_session: SessionContainer = Depends(verify_session()),
+):
+    """Get the score of an evaluation scenario.
+
+    Args:
+        evaluation_scenario_id (str): the id of the evaluation scenario
+        stoken_session (SessionContainer, optional): session token.
+            Defaults to Depends(verify_session()).
+
+    Returns:
+        dict: the evaluation scenario score
+    """
+
+    # Get user and organization id
+    kwargs: dict = await get_user_and_org_id(stoken_session)
+    scenario_score = await get_evaluation_scenario_score(
+        evaluation_scenario_id, **kwargs
+    )
+    return scenario_score
+
+
+@router.put("/evaluation_scenario/{evaluation_scenario_id}/score")
+async def update_evaluation_scenario_score_router(
+    evaluation_scenario_id: str,
+    payload: EvaluationScenarioScoreUpdate,
+    stoken_session: SessionContainer = Depends(verify_session()),
+):
+    """Updates an evaluation scenario's score.
+
+    Args:
+        evaluation_scenario_id (str): the evaluation scenario to update
+        payload (EvaluationScenarioScoreUpdate): the new score value
+
+    Raises:
+        HTTPException: server error if the evaluation update went wrong
+    """
+
+    try:
+        # Get user and organization id
+        kwargs: dict = await get_user_and_org_id(stoken_session)
+        return await update_evaluation_scenario_score(
+            evaluation_scenario_id, payload.score, **kwargs
+        )
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=str(e)) from e
+
+
 @router.get("/", response_model=List[Evaluation])
 async def fetch_list_evaluations(
     app_name: Optional[str] = None,
@@ -238,6 +308,7 @@ async def fetch_list_evaluations(
             id=str(evaluation.id),
             status=evaluation.status,
             evaluation_type=evaluation.evaluation_type,
+            custom_code_evaluation_id=evaluation.custom_code_evaluation_id,
             evaluation_type_settings=evaluation.evaluation_type_settings,
             llm_app_prompt_template=evaluation.llm_app_prompt_template,
             variants=evaluation.variants,
@@ -275,6 +346,7 @@ async def fetch_evaluation(
         id=str(evaluation.id),
         status=evaluation.status,
         evaluation_type=evaluation.evaluation_type,
+        custom_code_evaluation_id=evaluation.custom_code_evaluation_id,
         evaluation_type_settings=evaluation.evaluation_type_settings,
         llm_app_prompt_template=evaluation.llm_app_prompt_template,
         variants=evaluation.variants,
@@ -386,6 +458,147 @@ async def fetch_results(
         results = await fetch_results_for_auto_ai_critique(evaluation_id)
         return {"results_data": results}

+    elif evaluation.evaluation_type == EvaluationType.custom_code_run:
+        results = await fetch_average_score_for_custom_code_run(evaluation_id)
+        return {"avg_score": results}
+
+
+@router.post("/custom_evaluation/")
+async def create_custom_evaluation(
+    custom_evaluation_payload: CreateCustomEvaluation,
+    stoken_session: SessionContainer = Depends(verify_session()),
+):
+    """Create an evaluation with custom python code.
+
+    Args:
+        custom_evaluation_payload (CreateCustomEvaluation): the required payload
+    """
+
+    # Get user and organization id
+    kwargs: dict = await get_user_and_org_id(stoken_session)
+
+    # create custom evaluation in database
+    evaluation_id = await create_custom_code_evaluation(
+        custom_evaluation_payload, **kwargs
+    )
+
+    return JSONResponse(
+        {
+            "status": "success",
+            "message": "Evaluation created successfully.",
+            "evaluation_id": evaluation_id,
+        },
+        status_code=200,
+    )
+
+
+@router.get(
+    "/custom_evaluation/list/{app_name}",
+    response_model=List[CustomEvaluationOutput],
+)
+async def list_custom_evaluations(
+    app_name: str,
+    stoken_session: SessionContainer = Depends(verify_session()),
+):
+    """List the custom code evaluations for a given app.
+
+    Args:
+        app_name (str): the name of the app
+
+    Returns:
+        List[CustomEvaluationOutput]: a list of custom evaluations
+    """
+
+    # Get user and organization id
+    kwargs: dict = await get_user_and_org_id(stoken_session)
+
+    # Fetch custom evaluations from database
+    evaluations = await fetch_custom_evaluations(app_name, **kwargs)
+    return evaluations
+
+
+@router.get(
+    "/custom_evaluation/{id}",
+    response_model=CustomEvaluationDetail,
+)
+async def get_custom_evaluation(
+    id: str,
+    stoken_session: SessionContainer = Depends(verify_session()),
+):
+    """Get the details of a custom code evaluation.
+
+    Args:
+        id (str): the id of the custom evaluation
+
+    Returns:
+        CustomEvaluationDetail: detail of the custom evaluation
+    """
+
+    # Get user and organization id
+    kwargs: dict = await get_user_and_org_id(stoken_session)
+
+    # Fetch custom evaluation detail from database
+    evaluation = await fetch_custom_evaluation_detail(id, **kwargs)
+    return evaluation
+
+
+@router.get(
+    "/custom_evaluation/{app_name}/names/",
+    response_model=List[CustomEvaluationNames],
+)
+async def get_custom_evaluation_names(
+    app_name: str, stoken_session: SessionContainer = Depends(verify_session())
+):
+    """Get the names of the custom evaluations for a given app.
+
+    Args:
+        app_name (str): the name of the app the evaluations belong to
+
+    Returns:
+        List[CustomEvaluationNames]: the list of custom evaluation names
+    """
+    # Get user and organization id
+    kwargs: dict = await get_user_and_org_id(stoken_session)
+
+    custom_eval_names = await fetch_custom_evaluation_names(app_name, **kwargs)
+    return custom_eval_names
+
+
+@router.post(
+    "/custom_evaluation/execute/{evaluation_id}/",
+)
+async def execute_custom_evaluation(
+    evaluation_id: str,
+    payload: ExecuteCustomEvaluationCode,
+    stoken_session: SessionContainer = Depends(verify_session()),
+):
+    """Execute a custom evaluation's code.
+
+    Args:
+        evaluation_id (str): the custom evaluation id
+        payload (ExecuteCustomEvaluationCode): the required payload
+
+    Returns:
+        float: the result of the custom evaluation code
+    """
+
+    # Get user and organization id
+    kwargs: dict = await get_user_and_org_id(stoken_session)
+
+    # Execute custom code evaluation
+    formatted_inputs = format_inputs(payload.inputs)
+    formatted_outputs = format_outputs(payload.outputs)
+    result = await execute_custom_code_evaluation(
+        evaluation_id,
+        payload.app_name,
+        formatted_outputs[payload.variant_name],  # gets the output of the app variant
+        payload.correct_answer,
+        payload.variant_name,
+        formatted_inputs,
+        **kwargs,
+    )
+    return result
+
+
 @router.post("/webhook_example_fake", response_model=EvaluationWebhook)
 async def webhook_example_fake():
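
Taken together, the new routes support the following flow. This is a hedged sketch: the mount prefix, the session cookie, and the payload field shapes are assumptions not fixed by this diff, while the route paths come from the router above:

# Hedged usage sketch; BASE and the session cookie are assumptions, the paths
# are taken from the router above. Field shapes mirror the illustrative
# payloads shown after the evaluation_model.py diff.
import httpx

BASE = "http://localhost/api/evaluations"  # assumed mount prefix
COOKIES = {"sAccessToken": "..."}  # supertokens session cookie, elided

with httpx.Client(cookies=COOKIES) as client:
    # 1. Store a custom evaluation for an app.
    created = client.post(
        f"{BASE}/custom_evaluation/",
        json={
            "evaluation_name": "exact_match",
            "python_code": "def evaluate(output, correct_answer):\n"
            "    return float(output == correct_answer)",
            "app_name": "my_app",
        },
    ).json()

    # 2. Discover what is stored.
    listed = client.get(f"{BASE}/custom_evaluation/list/my_app").json()
    names = client.get(f"{BASE}/custom_evaluation/my_app/names/").json()

    # 3. Execute the stored code against one scenario's output.
    score = client.post(
        f"{BASE}/custom_evaluation/execute/{created['evaluation_id']}/",
        json={
            "inputs": [{"input_name": "country", "input_value": "France"}],
            "app_name": "my_app",
            "variant_name": "v1",
            "correct_answer": "Paris",
            "outputs": [{"variant_name": "v1", "variant_output": "Paris"}],
        },
    ).json()

    # 4. Persist the score on the evaluation scenario (id elided).
    client.put(
        f"{BASE}/evaluation_scenario/<scenario_id>/score",
        json={"score": score},
    )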