Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Evaluation in backend #1036

Closed
wants to merge 11 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -56,3 +56,4 @@ agenta-web/cypress/screenshots/
agenta-web/cypress/videos/
.nextjs_cache/

rabbitmq_data/
23 changes: 23 additions & 0 deletions agenta-backend/agenta_backend/celery_config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
import os
from kombu import Exchange, Queue

# Use environment variables with default values as fallback
BROKER_URL = os.getenv('CELERY_BROKER_URL')
CELERY_RESULT_BACKEND = os.getenv('CELERY_RESULT_BACKEND')
CELERY_TASK_SERIALIZER = 'json'
CELERY_ACCEPT_CONTENT = ['json']
CELERY_RESULT_SERIALIZER = 'json'
CELERY_TIMEZONE = 'UTC'

# TODO: Can we improve this to be more dynamic?
Copy link
Member

@aybruhm aybruhm Dec 11, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, we can. To automatically discover tasks, you can use the autodiscover_tasks function from celery and specify the path to the modules containing the tasks. This function will automatically discover tasks by inspecting the installed apps and modules, and you won't have to manually define the queues and exchanges for each task.

celery_app = Celery("your_app_name")

# Set the broker URL and other configurations
....

# Autodiscover tasks
celery_app.autodiscover_tasks(["agenta_backend.tasks.evaluations"])

CELERY_QUEUES = (
Queue('agenta_backend.tasks.evaluations.auto_exact_match',
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't see the advantage of having multiple tasks one per each eval. See comment in GoogleDoc. I think it would make us rerun the variant many times (+ no advantage)

Exchange('agenta_backend.tasks.evaluations.auto_exact_match'),
routing_key='agenta_backend.tasks.evaluations.auto_exact_match'),
Queue('agenta_backend.tasks.evaluations.auto_similarity_match',
Exchange('agenta_backend.tasks.evaluations.auto_similarity_match'),
routing_key='agenta_backend.tasks.evaluations.auto_similarity_match'),
Queue('agenta_backend.tasks.evaluations.auto_regex_test',
Exchange('agenta_backend.tasks.evaluations.auto_regex_test'),
routing_key='agenta_backend.tasks.evaluations.auto_regex_test'),
)
4 changes: 4 additions & 0 deletions agenta-backend/agenta_backend/main.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
import os
from celery import Celery
from contextlib import asynccontextmanager
from agenta_backend import celery_config

from agenta_backend.config import settings
from agenta_backend.routers import (
Expand Down Expand Up @@ -32,6 +34,8 @@
"http://0.0.0.0:3001",
]

celery_app = Celery('evaluation_app')
celery_app.config_from_object(celery_config)

@asynccontextmanager
async def lifespan(application: FastAPI, cache=True):
Expand Down
8 changes: 8 additions & 0 deletions agenta-backend/agenta_backend/models/api/evaluation_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,7 @@ class EvaluationScenario(BaseModel):
note: Optional[str]



class AICritiqueCreate(BaseModel):
correct_answer: str
llm_app_prompt_template: Optional[str]
Expand Down Expand Up @@ -118,6 +119,13 @@ class NewEvaluation(BaseModel):
status: str


class NewBulkEvaluation(BaseModel):
app_id: str
variant_ids: List[str]
evaluation_type: List[EvaluationType]
testset_id: str


class DeleteEvaluation(BaseModel):
evaluations_ids: List[str]

Expand Down
36 changes: 36 additions & 0 deletions agenta-backend/agenta_backend/models/db_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -212,6 +212,23 @@ class EvaluationScenarioOutput(EmbeddedModel):
variant_id: str
variant_output: str

# TODO: This should be removed and replaced with EvaluationDB
# Keppeing it for now for backwards compatibility
class BulkEvaluationDB(Model):
app: AppDB = Reference(key_name="app")
organization: OrganizationDB = Reference(key_name="organization")
user: UserDB = Reference(key_name="user")
status: str
evaluation_type: List[str]
evaluation_type_settings: EvaluationTypeSettings
variants: List[ObjectId]
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why are we saving the evaluation of multiple variants in the same object? We want to enable the user to run the eval for multiple variants from the same command. However there is no need to save these results in the same evaluation document. (disregard this comment in case the goal was just to use for human a/b testing)

testset: TestSetDB = Reference(key_name="testsets")
Copy link
Member

@aybruhm aybruhm Dec 11, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Since you're setting the key_name of the testset field to testsets, does this mean that the field is going to be storing multiple testsets? If yes, then I think it'll be advisable to refactor the field to this:

Suggested change
testset: TestSetDB = Reference(key_name="testsets")
testsets: List[ObjectId]

Or, this:

Suggested change
testset: TestSetDB = Reference(key_name="testsets")
testsets: List[TestSetDB]

Otherwise, it is fine to call the Reference object with no default value:

Suggested change
testset: TestSetDB = Reference(key_name="testsets")
testset: TestSetDB = Reference()

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think one eval should link to one test set

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

we might allow running an eval in one batch on multiple testsets, but no reason to save all of these in one documetn

created_at: Optional[datetime] = Field(default=datetime.utcnow())
updated_at: Optional[datetime] = Field(default=datetime.utcnow())

class Config:
collection = "bulk_evaluations"


class EvaluationDB(Model):
app: AppDB = Reference(key_name="app")
Expand Down Expand Up @@ -246,6 +263,25 @@ class EvaluationScenarioDB(Model):
class Config:
collection = "evaluation_scenarios"

# TODO: This should be removed and replaced with EvaluationScenarioDB
# Keppeing it for now for backwards compatibility
class EvaluationScenarioDBForBulkEvaluationDB(Model):
user: UserDB = Reference(key_name="user")
organization: OrganizationDB = Reference(key_name="organization")
evaluation: BulkEvaluationDB = Reference(key_name="bulk_evaluations")
inputs: List[EvaluationScenarioInput]
outputs: List[EvaluationScenarioOutput]
vote: Optional[str]
score: Optional[Union[str, int]]
correct_answer: Optional[str]
created_at: Optional[datetime] = Field(default=datetime.utcnow())
updated_at: Optional[datetime] = Field(default=datetime.utcnow())
is_pinned: Optional[bool]
note: Optional[str]

class Config:
collection = "single_evaluation_scenarios"


class CustomEvaluationDB(Model):
evaluation_name: str
Expand Down
35 changes: 35 additions & 0 deletions agenta-backend/agenta_backend/routers/evaluation_router.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
EvaluationScenarioScoreUpdate,
EvaluationScenarioUpdate,
ExecuteCustomEvaluationCode,
NewBulkEvaluation,
NewEvaluation,
DeleteEvaluation,
EvaluationType,
Expand All @@ -26,6 +27,7 @@
)
from agenta_backend.services.evaluation_service import (
UpdateEvaluationScenarioError,
evaluate_in_bulk,
evaluate_with_ai_critique,
fetch_custom_evaluation_names,
fetch_custom_evaluations,
Expand Down Expand Up @@ -54,6 +56,39 @@
router = APIRouter()


@router.post("/bulk-evaluate/")
async def create_bulk_evaluation(payload: NewBulkEvaluation, request: Request):
try:
user_org_data: dict = await get_user_and_org_id(request.state.user_id)

access_app = await check_access_to_app(
user_org_data=user_org_data,
app_id=payload.app_id,
check_owner=False,
)

if not access_app:
error_msg = f"You do not have access to this app: {payload.app_id}"
return JSONResponse(
{"detail": error_msg},
status_code=400,
)
app = await db_manager.fetch_app_by_id(app_id=payload.app_id)

if app is None:
raise HTTPException(status_code=404, detail="App not found")

new_evaluation_db = await evaluation_service.create_new_bulk_evaluation(
app,
payload,
**user_org_data
)

await evaluate_in_bulk(new_evaluation_db, **user_org_data)
except Exception as e:
raise HTTPException(f"Failed to evaluate AI critique: {str(e)}")


@router.post("/", response_model=SimpleEvaluationOutput)
async def create_evaluation(
payload: NewEvaluation,
Expand Down
15 changes: 15 additions & 0 deletions agenta-backend/agenta_backend/services/db_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
from agenta_backend.models.db_models import (
AppDB,
AppVariantDB,
BulkEvaluationDB,
VariantBaseDB,
ConfigDB,
ConfigVersionDB,
Expand Down Expand Up @@ -1277,6 +1278,20 @@ async def fetch_evaluation_by_id(evaluation_id: str) -> Optional[EvaluationDB]:
return evaluation


async def fetch_bulk_evaluation_by_id(evaluation_id: str) -> Optional[BulkEvaluationDB]:
"""Fetches a evaluation by its ID.
Args:
evaluation_id (str): The ID of the evaluation to fetch.
Returns:
EvaluationDB: The fetched evaluation, or None if no evaluation was found.
"""
assert evaluation_id is not None, "evaluation_id cannot be None"
evaluation = await engine.find_one(
BulkEvaluationDB, BulkEvaluationDB.id == ObjectId(evaluation_id)
)
return evaluation


async def fetch_evaluation_scenario_by_id(
evaluation_scenario_id: str,
) -> Optional[EvaluationScenarioDB]:
Expand Down
Loading
Loading