Commit 7731157

Merge branch 'evaluations-in-backend' of https://github.com/Agenta-AI/agenta into evaluations-in-backend

MohammedMaaz committed Jan 2, 2024
2 parents: c76cc3d + 7a32360
Showing 4 changed files with 137 additions and 90 deletions.
@@ -45,7 +45,7 @@ class EvaluationStatusEnum(str, Enum):
     EVALUATION_INITIALIZED = "EVALUATION_INITIALIZED"
     EVALUATION_STARTED = "EVALUATION_STARTED"
     EVALUATION_FINISHED = "EVALUATION_FINISHED"
-    EVALUATION_ERROR = "EVALUATION_ERROR"
+    EVALUATION_FAILED = "EVALUATION_FAILED"


 class EvaluationScenarioStatusEnum(str, Enum):
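The backend now reports failures as EVALUATION_FAILED. Elsewhere in this commit the status is written as a raw string; the minimal sketch below shows how the renamed member behaves, with the enum redeclared locally because its module path is not shown in this diff.

# Sketch only: the enum is redeclared here since its module path does not
# appear in this diff; the values mirror EvaluationStatusEnum above.
from enum import Enum


class EvaluationStatusEnum(str, Enum):
    EVALUATION_INITIALIZED = "EVALUATION_INITIALIZED"
    EVALUATION_STARTED = "EVALUATION_STARTED"
    EVALUATION_FINISHED = "EVALUATION_FINISHED"
    EVALUATION_FAILED = "EVALUATION_FAILED"


# Because the enum subclasses str, the member compares equal to the raw
# string written by the Celery task below, so either form can be stored.
assert EvaluationStatusEnum.EVALUATION_FAILED == "EVALUATION_FAILED"
failed_update = {"status": EvaluationStatusEnum.EVALUATION_FAILED}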
24 changes: 24 additions & 0 deletions agenta-backend/agenta_backend/services/db_manager.py
@@ -1855,3 +1855,27 @@ async def delete_evaluator_config(evaluator_config_id: str) -> bool:
         )  # checking if delete_result is None (has been deleted)
     except Exception as e:
         raise e
+
+
+async def update_evaluation(
+    evaluation_id: str, updates: Dict[str, Any]
+) -> EvaluationDB:
+    """
+    Update an evaluation in the database with the provided id.
+
+    Arguments:
+        evaluation_id (str): The ID of the evaluation to be updated.
+        updates (Dict[str, Any]): The updates to apply to the evaluation.
+
+    Returns:
+        EvaluationDB: The updated evaluation object.
+    """
+    evaluation = await engine.find_one(
+        EvaluationDB, EvaluationDB.id == ObjectId(evaluation_id)
+    )
+
+    for key, value in updates.items():
+        if key in evaluation.__fields__:
+            setattr(evaluation, key, value)
+    await engine.save(evaluation)
+    return evaluation
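A minimal usage sketch for the new helper follows; the import path matches the file above, while the example evaluation id is hypothetical and the call assumes a reachable database.

# Usage sketch for update_evaluation (signature taken from the diff above);
# the evaluation id is hypothetical and must reference an existing document.
import asyncio

from agenta_backend.services.db_manager import update_evaluation


async def mark_failed(evaluation_id: str) -> None:
    # Only keys present on EvaluationDB are applied; unknown keys are skipped.
    await update_evaluation(evaluation_id, {"status": "EVALUATION_FAILED"})


if __name__ == "__main__":
    loop = asyncio.get_event_loop()  # mirrors the loop handling in the task below
    loop.run_until_complete(mark_failed("6593d1c1e1f2a3b4c5d6e7f8"))  # hypothetical id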
199 changes: 111 additions & 88 deletions agenta-backend/agenta_backend/tasks/evaluations.py
@@ -13,6 +13,7 @@
     fetch_testset_by_id,
     create_new_evaluation_scenario,
     fetch_evaluator_config_by_appId,
+    update_evaluation,
     update_evaluation_with_aggregated_results,
 )
 from agenta_backend.models.db_models import (
@@ -32,107 +33,129 @@ def evaluate(
     app_data: dict, new_evaluation_data: dict, evaluation_id: str, testset_id: str
 ):
     loop = asyncio.get_event_loop()
-    app = AppDB(**app_data)
-    evaluation = NewEvaluation(**new_evaluation_data)
+    try:
+        app = AppDB(**app_data)
+        evaluation = NewEvaluation(**new_evaluation_data)

-    testset = loop.run_until_complete(fetch_testset_by_id(testset_id))
-    new_evaluation_db = loop.run_until_complete(fetch_evaluation_by_id(evaluation_id))
-    evaluators_aggregated_data = defaultdict(list)
+        testset = loop.run_until_complete(fetch_testset_by_id(testset_id))
+        new_evaluation_db = loop.run_until_complete(
+            fetch_evaluation_by_id(evaluation_id)
+        )
+        evaluators_aggregated_data = defaultdict(list)

-    variant_id = str(evaluation.variant_ids[0])
+        variant_id = str(evaluation.variant_ids[0])

-    app_variant_db = loop.run_until_complete(fetch_app_variant_by_id(variant_id))
-    deployment = loop.run_until_complete(
-        get_deployment_by_objectid(app_variant_db.base.deployment)
-    )
+        app_variant_db = loop.run_until_complete(fetch_app_variant_by_id(variant_id))
+        deployment = loop.run_until_complete(
+            get_deployment_by_objectid(app_variant_db.base.deployment)
+        )

-    # TODO: remove if abraham's fix is working
-    uri = deployment.uri.replace("http://localhost", "http://host.docker.internal")
+        # TODO: remove if abraham's fix is working
+        uri = deployment.uri.replace("http://localhost", "http://host.docker.internal")

-    for data_point in testset.csvdata:
-        # 1. We prepare the inputs
-        raw_inputs = (
-            app_variant_db.parameters.get("inputs", [])
-            if app_variant_db.parameters
-            else []
-        )
-        inputs = []
-        if raw_inputs:
-            inputs = [
-                EvaluationScenarioInputDB(
-                    name=input_item["name"],
-                    type="text",
-                    value=data_point[input_item["name"]],
-                )
-                for input_item in raw_inputs
-            ]
+        for data_point in testset.csvdata:
+            # 1. We prepare the inputs
+            raw_inputs = (
+                app_variant_db.parameters.get("inputs", [])
+                if app_variant_db.parameters
+                else []
+            )
+            inputs = []
+            if raw_inputs:
+                inputs = [
+                    EvaluationScenarioInputDB(
+                        name=input_item["name"],
+                        type="text",
+                        value=data_point[input_item["name"]],
+                    )
+                    for input_item in raw_inputs
+                ]

-        #!NOTE: do not remove! this will be used in github workflow!
-        backend_environment = os.environ.get("ENVIRONMENT")
-        if backend_environment is not None and backend_environment == "github":
-            uri = f"http://{deployment.container_name}"
-        else:
-            uri = deployment.uri.replace(
-                "http://localhost", "http://host.docker.internal"
-            )
-        # 2. We get the output from the llm app
-        variant_output = llm_apps_service.get_llm_app_output(uri, data_point)
+            #!NOTE: do not remove! this will be used in github workflow!
+            backend_environment = os.environ.get("ENVIRONMENT")
+            if backend_environment is not None and backend_environment == "github":
+                uri = f"http://{deployment.container_name}"
+            else:
+                uri = deployment.uri.replace(
+                    "http://localhost", "http://host.docker.internal"
+                )
+
+            # 2. We get the output from the llm app
+            try:
+                variant_output = llm_apps_service.get_llm_app_output(uri, data_point)
+            except Exception as e:
+                print(f"Error getting variant output: {e}")
+                loop.run_until_complete(
+                    update_evaluation(evaluation_id, {"status": "EVALUATION_FAILED"})
+                )
+                return

-        # 3. We evaluate
-        evaluators_results: [EvaluationScenarioResult] = []
-        for evaluator_config_id in evaluation.evaluators_configs:
-            evaluator_config = loop.run_until_complete(
-                fetch_evaluator_config(evaluator_config_id)
-            )
+            # 3. We evaluate
+            evaluators_results: [EvaluationScenarioResult] = []
+            for evaluator_config_id in evaluation.evaluators_configs:
+                evaluator_config = loop.run_until_complete(
+                    fetch_evaluator_config(evaluator_config_id)
+                )

-            additional_kwargs = (
-                {
-                    "app_params": app_variant_db.config.parameters,
-                    "inputs": data_point,  # TODO: fetch input from config parameters when #1102 has been fixed
-                }
-                if evaluator_config.evaluator_key == "custom_code_run"
-                else {}
-            )
-            result = evaluators_service.evaluate(
-                evaluator_config.evaluator_key,
-                variant_output,
-                data_point["correct_answer"],
-                evaluator_config.settings_values,
-                **additional_kwargs,
-            )
+                additional_kwargs = (
+                    {
+                        "app_params": app_variant_db.config.parameters,
+                        "inputs": data_point,  # TODO: fetch input from config parameters when #1102 has been fixed
+                    }
+                    if evaluator_config.evaluator_key == "custom_code_run"
+                    else {}
+                )
+                result = evaluators_service.evaluate(
+                    evaluator_config.evaluator_key,
+                    variant_output,
+                    data_point["correct_answer"],
+                    evaluator_config.settings_values,
+                    **additional_kwargs,
+                )

-            result_object = EvaluationScenarioResult(
-                evaluator_config=evaluator_config.id,
-                result=result,
-            )
-            evaluators_results.append(result_object)
-            evaluators_aggregated_data[evaluator_config.evaluator_key].append(result)
+                result_object = EvaluationScenarioResult(
+                    evaluator_config=evaluator_config.id,
+                    result=result,
+                )
+                evaluators_results.append(result_object)
+                evaluators_aggregated_data[evaluator_config.evaluator_key].append(
+                    result
+                )

-        # 4. We create a new evaluation scenario
-        evaluation_scenario = loop.run_until_complete(
-            create_new_evaluation_scenario(
-                user=app.user,
-                organization=app.organization,
-                evaluation=new_evaluation_db,
-                variant_id=variant_id,
-                evaluators_configs=new_evaluation_db.evaluators_configs,
-                inputs=inputs,
-                is_pinned=False,
-                note="",
-                correct_answer=data_point["correct_answer"],
-                outputs=[EvaluationScenarioOutputDB(type="text", value=variant_output)],
-                results=evaluators_results,
-            )
-        )
+            # 4. We create a new evaluation scenario
+            evaluation_scenario = loop.run_until_complete(
+                create_new_evaluation_scenario(
+                    user=app.user,
+                    organization=app.organization,
+                    evaluation=new_evaluation_db,
+                    variant_id=variant_id,
+                    evaluators_configs=new_evaluation_db.evaluators_configs,
+                    inputs=inputs,
+                    is_pinned=False,
+                    note="",
+                    correct_answer=data_point["correct_answer"],
+                    outputs=[
+                        EvaluationScenarioOutputDB(type="text", value=variant_output)
+                    ],
+                    results=evaluators_results,
+                )
+            )

-    aggregated_results = loop.run_until_complete(
-        aggregate_evaluator_results(app, evaluators_aggregated_data)
-    )
-    updated_evaluation = loop.run_until_complete(
-        update_evaluation_with_aggregated_results(
-            new_evaluation_db.id, aggregated_results
-        )
-    )
+        aggregated_results = loop.run_until_complete(
+            aggregate_evaluator_results(app, evaluators_aggregated_data)
+        )
+        updated_evaluation = loop.run_until_complete(
+            update_evaluation_with_aggregated_results(
+                new_evaluation_db.id, aggregated_results
+            )
+        )
+    except Exception as e:
+        print(f"An error occurred during evaluation: {e}")
+        loop.run_until_complete(
+            update_evaluation(evaluation_id, {"status": "EVALUATION_FAILED"})
+        )


 async def aggregate_evaluator_results(
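The control flow the task now follows is easier to see in a condensed, self-contained sketch; the two helpers below are hypothetical stand-ins for the real work shown in the diff (fetching the variant, calling the LLM app, running the evaluators, persisting results).

# Condensed sketch of the error-handling shape introduced above; _set_status
# stands in for update_evaluation and _run_all_scenarios for the per-datapoint
# loop, so the example runs on its own.
import asyncio


async def _set_status(evaluation_id: str, status: str) -> None:
    print(f"evaluation {evaluation_id} -> {status}")


def _run_all_scenarios() -> list:
    raise RuntimeError("LLM app unreachable")  # simulate a failing scenario


def evaluate_sketch(evaluation_id: str) -> None:
    loop = asyncio.get_event_loop()
    try:
        results = _run_all_scenarios()
        print(f"aggregated {len(results)} results")  # only reached on success
    except Exception as exc:
        # Any failure marks the evaluation as failed instead of leaving it
        # stuck in a started state.
        print(f"An error occurred during evaluation: {exc}")
        loop.run_until_complete(_set_status(evaluation_id, "EVALUATION_FAILED"))


evaluate_sketch("demo-id")  # prints the failure path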
2 changes: 1 addition & 1 deletion agenta-web/src/lib/Types.ts
@@ -335,7 +335,7 @@ export enum EvaluationStatus {
     INITIALIZED = "EVALUATION_INITIALIZED",
     STARTED = "EVALUATION_STARTED",
     FINISHED = "EVALUATION_FINISHED",
-    ERROR = "EVALUATION_ERROR",
+    ERROR = "EVALUATION_FAILED",
 }

 export interface _Evaluation {