From 51caab270fac8c5e3c619b601110ba5d7f3a8f9e Mon Sep 17 00:00:00 2001 From: Akrem Abayed Date: Tue, 19 Dec 2023 08:35:38 +0100 Subject: [PATCH 001/414] add basic evaluators --- .gitignore | 1 + .../services/evaluators_service.py | 32 +++++++++++++++++++ 2 files changed, 33 insertions(+) create mode 100644 agenta-backend/agenta_backend/services/evaluators_service.py diff --git a/.gitignore b/.gitignore index fe89181587..b32eb68f0d 100644 --- a/.gitignore +++ b/.gitignore @@ -56,3 +56,4 @@ agenta-web/cypress/screenshots/ agenta-web/cypress/videos/ .nextjs_cache/ +rabbitmq_data \ No newline at end of file diff --git a/agenta-backend/agenta_backend/services/evaluators_service.py b/agenta-backend/agenta_backend/services/evaluators_service.py new file mode 100644 index 0000000000..337175f18a --- /dev/null +++ b/agenta-backend/agenta_backend/services/evaluators_service.py @@ -0,0 +1,32 @@ +import re + +def auto_exact_match(variant_output, correct_answer): + if variant_output == correct_answer: + return 1 + else: + return 0 + + +def auto_similarity_match(variant_output, correct_answer): + set1 = set(variant_output.split()) + set2 = set(correct_answer.split()) + intersect = set1.intersection(set2) + union = set1.union(set2) + + similarity = len(intersect) / len(union) + return similarity + + +def auto_regex_test(test_string, regex, should_match): + re_pattern = re.compile(regex, re.IGNORECASE) + result = bool(re_pattern.search(test_string)) + return result == should_match + + +def evaluate(evaluator_name, correct_answer, variant_output, *additional_args, **additional_kwargs): + try: + evaluation_function = globals()[evaluator_name] + + return evaluation_function(correct_answer, variant_output, *additional_args, **additional_kwargs) + except KeyError: + raise ValueError(f"Evaluation method '{evaluator_name}' not found.") \ No newline at end of file From b9999608782f347ff3354d2f3c93419826f533b5 Mon Sep 17 00:00:00 2001 From: Abram Date: Tue, 19 Dec 2023 08:52:28 +0100 Subject: [PATCH 002/414] Refactor - redesign db models schema for evaluation --- .../agenta_backend/models/db_models.py | 125 ++++++++++-------- 1 file changed, 72 insertions(+), 53 deletions(-) diff --git a/agenta-backend/agenta_backend/models/db_models.py b/agenta-backend/agenta_backend/models/db_models.py index 6a243e57b4..cbe5fed5df 100644 --- a/agenta-backend/agenta_backend/models/db_models.py +++ b/agenta-backend/agenta_backend/models/db_models.py @@ -128,13 +128,11 @@ class Config: class AppVariantDB(Model): - app: AppDB = Reference(key_name="app") + app: AppDB = Reference() variant_name: str - image: ImageDB = Reference(key_name="image") - user: UserDB = Reference(key_name="user") - organization: OrganizationDB = Reference(key_name="organization") - parameters: Dict[str, Any] = Field(default=dict) # TODO: deprecated. remove - previous_variant_name: Optional[str] # TODO: deprecated. remove + image: ImageDB = Reference() + user: UserDB = Reference() + organization: OrganizationDB = Reference() base_name: Optional[str] base: VariantBaseDB = Reference(key_name="bases") config_name: Optional[str] @@ -142,19 +140,15 @@ class AppVariantDB(Model): created_at: Optional[datetime] = Field(default=datetime.utcnow()) updated_at: Optional[datetime] = Field(default=datetime.utcnow()) - is_deleted: bool = Field( # TODO: deprecated. 
remove - default=False - ) # soft deletion for using the template variants - class Config: collection = "app_variants" class AppEnvironmentDB(Model): - app: AppDB = Reference(key_name="app") + app: AppDB = Reference() name: str - user: UserDB = Reference(key_name="user") - organization: OrganizationDB = Reference(key_name="organization") + user: UserDB = Reference() + organization: OrganizationDB = Reference() deployed_app_variant: Optional[ObjectId] deployment: Optional[ObjectId] # reference to deployment created_at: Optional[datetime] = Field(default=datetime.utcnow()) @@ -193,73 +187,98 @@ class Config: collection = "testsets" -class EvaluationTypeSettings(EmbeddedModel): - similarity_threshold: Optional[float] - regex_pattern: Optional[str] - regex_should_match: Optional[bool] - webhook_url: Optional[str] - llm_app_prompt_template: Optional[str] - custom_code_evaluation_id: Optional[str] - evaluation_prompt_template: Optional[str] +class CustomEvaluationDB(Model): + evaluation_name: str + python_code: str + app: AppDB = Reference(key_name="app") + user: UserDB = Reference(key_name="user") + organization: OrganizationDB = Reference(key_name="organization") + created_at: Optional[datetime] = Field(default=datetime.utcnow()) + updated_at: Optional[datetime] = Field(default=datetime.utcnow()) + + class Config: + collection = "custom_evaluations" + + +class EvalSettingsTemplate(EmbeddedModel): + type: str + default: str + description: str + + +class EvaluatorDB(Model): + name: str = Field(required=True) + settings_template: Dict[str, EvalSettingsTemplate] + created_at: datetime = Field(default=datetime.utcnow()) + updated_at: datetime = Field(default=datetime.utcnow()) + + class Config: + collection = "evaluators" + + +class EvalSettingsValue(EmbeddedModel): + parameter: str + threshold_value: float = Field(min_value=0.0, max_value=1.0) + + +class EvaluatorConfigDB(Model): + evaluator: EvaluatorDB = Reference() + settings_value: EvalSettingsValue + created_at: datetime = Field(default=datetime.utcnow()) + updated_at: datetime = Field(default=datetime.utcnow()) + + class Config: + collection = "evaluator_config" + + +class EvaluationScenarioResult(EmbeddedModel): + evaluator: EvaluatorDB = Reference() + result: Any class EvaluationScenarioInput(EmbeddedModel): - input_name: str - input_value: str + name: str + type: str + value: str class EvaluationScenarioOutput(EmbeddedModel): - variant_id: str - variant_output: str + type: str + value: str class EvaluationDB(Model): app: AppDB = Reference(key_name="app") organization: OrganizationDB = Reference(key_name="organization") user: UserDB = Reference(key_name="user") - status: str - evaluation_type: str - evaluation_type_settings: EvaluationTypeSettings - variants: List[ObjectId] - testset: TestSetDB = Reference(key_name="testsets") - created_at: Optional[datetime] = Field(default=datetime.utcnow()) - updated_at: Optional[datetime] = Field(default=datetime.utcnow()) + testset: TestSetDB = Reference() + variants: List[AppVariantDB] + evaluators: List[EvaluatorConfigDB] + created_at: datetime = Field(default=datetime.utcnow()) + updated_at: datetime = Field(default=datetime.utcnow()) class Config: collection = "evaluations" class EvaluationScenarioDB(Model): - user: UserDB = Reference(key_name="user") - organization: OrganizationDB = Reference(key_name="organization") - evaluation: EvaluationDB = Reference(key_name="evaluations") + user: UserDB = Reference() + organization: OrganizationDB = Reference() + evaluation: EvaluationDB = Reference() 
inputs: List[EvaluationScenarioInput] - outputs: List[EvaluationScenarioOutput] # EvaluationScenarioOutput - vote: Optional[str] - score: Optional[Union[str, int]] + outputs: List[EvaluationScenarioOutput] correct_answer: Optional[str] - created_at: Optional[datetime] = Field(default=datetime.utcnow()) - updated_at: Optional[datetime] = Field(default=datetime.utcnow()) is_pinned: Optional[bool] note: Optional[str] + evaluators: List[EvaluatorConfigDB] + results: List[EvaluationScenarioResult] + created_at: datetime = Field(default=datetime.utcnow()) + updated_at: datetime = Field(default=datetime.utcnow()) class Config: collection = "evaluation_scenarios" -class CustomEvaluationDB(Model): - evaluation_name: str - python_code: str - app: AppDB = Reference(key_name="app") - user: UserDB = Reference(key_name="user") - organization: OrganizationDB = Reference(key_name="organization") - created_at: Optional[datetime] = Field(default=datetime.utcnow()) - updated_at: Optional[datetime] = Field(default=datetime.utcnow()) - - class Config: - collection = "custom_evaluations" - - class SpanDB(Model): parent_span_id: Optional[str] meta: Optional[Dict[str, Any]] From f57c7a115d2452fbcc06d73553074db5e62ae830 Mon Sep 17 00:00:00 2001 From: Abram Date: Tue, 19 Dec 2023 09:05:03 +0100 Subject: [PATCH 003/414] :art: Format - ran black --- .../agenta_backend/services/evaluators_service.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/agenta-backend/agenta_backend/services/evaluators_service.py b/agenta-backend/agenta_backend/services/evaluators_service.py index 337175f18a..07bb6b202b 100644 --- a/agenta-backend/agenta_backend/services/evaluators_service.py +++ b/agenta-backend/agenta_backend/services/evaluators_service.py @@ -1,5 +1,6 @@ import re + def auto_exact_match(variant_output, correct_answer): if variant_output == correct_answer: return 1 @@ -23,10 +24,18 @@ def auto_regex_test(test_string, regex, should_match): return result == should_match -def evaluate(evaluator_name, correct_answer, variant_output, *additional_args, **additional_kwargs): +def evaluate( + evaluator_name, + correct_answer, + variant_output, + *additional_args, + **additional_kwargs, +): try: evaluation_function = globals()[evaluator_name] - return evaluation_function(correct_answer, variant_output, *additional_args, **additional_kwargs) + return evaluation_function( + correct_answer, variant_output, *additional_args, **additional_kwargs + ) except KeyError: - raise ValueError(f"Evaluation method '{evaluator_name}' not found.") \ No newline at end of file + raise ValueError(f"Evaluation method '{evaluator_name}' not found.") From 1ba2d9910898da681ad8fba9cb6ba58643ec95a1 Mon Sep 17 00:00:00 2001 From: Akrem Abayed Date: Tue, 19 Dec 2023 10:28:32 +0100 Subject: [PATCH 004/414] add celery and rabbitmq --- docker-compose.yml | 38 ++++++++++++++++++++++++++++++++++++-- 1 file changed, 36 insertions(+), 2 deletions(-) diff --git a/docker-compose.yml b/docker-compose.yml index 723707046c..2b3fc7961d 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -37,6 +37,8 @@ services: - "traefik.http.routers.backend.service=backend" networks: - agenta-network + extra_hosts: + - "host.docker.internal:host-gateway" command: [ "uvicorn", @@ -49,7 +51,7 @@ services: "--log-level", "info", "--root-path", - "/api" + "/api", ] depends_on: mongo: @@ -86,7 +88,7 @@ services: networks: - agenta-network healthcheck: - test: [ "CMD", "mongo", "--eval", "db.adminCommand('ping')" ] + test: ["CMD", "mongo", "--eval", 
"db.adminCommand('ping')"] interval: 10s timeout: 10s retries: 20 @@ -112,6 +114,38 @@ services: volumes: - redis_data:/data + rabbitmq: + image: rabbitmq:3-management + ports: + - "5672:5672" + - "15672:15672" + volumes: + - ./rabbitmq_data:/var/lib/rabbitmq + environment: + RABBITMQ_DEFAULT_USER: "guest" + RABBITMQ_DEFAULT_PASS: "guest" + networks: + - agenta-network + + celery_worker: + build: ./agenta-backend + command: celery -A agenta_backend.main.celery_app worker --loglevel=info + environment: + - MONGODB_URI=mongodb://username:password@mongo:27017 + - REDIS_URL=redis://redis:6379/0 + - CELERY_BROKER_URL=amqp://guest@rabbitmq// + - CELERY_RESULT_BACKEND=redis://redis:6379/0 + - FEATURE_FLAG=oss + + volumes: + - ./agenta-backend/agenta_backend:/app/agenta_backend + - /var/run/docker.sock:/var/run/docker.sock + depends_on: + - rabbitmq + - redis + networks: + - agenta-network + networks: agenta-network: name: agenta-network From b55be13d493dd6cc10282367f85ff4df82e915f1 Mon Sep 17 00:00:00 2001 From: Akrem Abayed Date: Tue, 19 Dec 2023 10:54:49 +0100 Subject: [PATCH 005/414] add seeding evaluators --- agenta-backend/db-seed/Dockerfile | 4 ++++ agenta-backend/db-seed/evaluators.json | 14 ++++++++++++++ docker-compose.yml | 7 +++++++ 3 files changed, 25 insertions(+) create mode 100644 agenta-backend/db-seed/Dockerfile create mode 100644 agenta-backend/db-seed/evaluators.json diff --git a/agenta-backend/db-seed/Dockerfile b/agenta-backend/db-seed/Dockerfile new file mode 100644 index 0000000000..0450d71397 --- /dev/null +++ b/agenta-backend/db-seed/Dockerfile @@ -0,0 +1,4 @@ +FROM mongo:5.0 + +COPY evaluators.json /evaluators.json +CMD mongoimport --host mongo --username username --password password --authenticationDatabase admin --db agenta_v2 --collection evaluators --type json --file /evaluators.json --jsonArray \ No newline at end of file diff --git a/agenta-backend/db-seed/evaluators.json b/agenta-backend/db-seed/evaluators.json new file mode 100644 index 0000000000..51ac77ffdf --- /dev/null +++ b/agenta-backend/db-seed/evaluators.json @@ -0,0 +1,14 @@ +[ + { + "name": "Exact Match", + "key": "auto_exact_match" + }, + { + "name": "Similarity Match", + "key": "auto_similarity_match" + }, + { + "name": "Regex Test", + "key": "auto_regex_test" + } +] \ No newline at end of file diff --git a/docker-compose.yml b/docker-compose.yml index 2b3fc7961d..62fb159749 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -93,6 +93,13 @@ services: timeout: 10s retries: 20 + mongo-seed: + build: ./agenta-backend/db-seed + depends_on: + - mongo + networks: + - agenta-network + mongo_express: image: mongo-express environment: From 32037c37e7b8fdf99012bd4968eccf976c512a82 Mon Sep 17 00:00:00 2001 From: Akrem Abayed Date: Tue, 19 Dec 2023 11:30:42 +0100 Subject: [PATCH 006/414] add celery config --- .../agenta_backend/celery_config.py | 15 ++ agenta-backend/poetry.lock | 211 +++++++++++++++++- agenta-backend/pyproject.toml | 1 + 3 files changed, 226 insertions(+), 1 deletion(-) create mode 100644 agenta-backend/agenta_backend/celery_config.py diff --git a/agenta-backend/agenta_backend/celery_config.py b/agenta-backend/agenta_backend/celery_config.py new file mode 100644 index 0000000000..33057e5723 --- /dev/null +++ b/agenta-backend/agenta_backend/celery_config.py @@ -0,0 +1,15 @@ +import os +from kombu import Exchange, Queue + +BROKER_URL = os.getenv('CELERY_BROKER_URL') +CELERY_RESULT_BACKEND = os.getenv('CELERY_RESULT_BACKEND') +CELERY_TASK_SERIALIZER = 'json' +CELERY_ACCEPT_CONTENT = 
['json'] +CELERY_RESULT_SERIALIZER = 'json' +CELERY_TIMEZONE = 'UTC' + +CELERY_QUEUES = ( + Queue('agenta_backend.tasks.evaluations', + Exchange('agenta_backend.tasks.evaluations'), + routing_key='agenta_backend.tasks.evaluations') +) \ No newline at end of file diff --git a/agenta-backend/poetry.lock b/agenta-backend/poetry.lock index 8261ee057b..683b342b63 100644 --- a/agenta-backend/poetry.lock +++ b/agenta-backend/poetry.lock @@ -140,6 +140,20 @@ files = [ docs = ["sphinx (>=2,<4)", "sphinx_autodoc_typehints (>=1.7.0,<2.0.0)"] uvloop = ["uvloop (>=0.13,<0.15)"] +[[package]] +name = "amqp" +version = "5.2.0" +description = "Low-level AMQP client for Python (fork of amqplib)." +optional = false +python-versions = ">=3.6" +files = [ + {file = "amqp-5.2.0-py3-none-any.whl", hash = "sha256:827cb12fb0baa892aad844fd95258143bce4027fdac4fccddbc43330fd281637"}, + {file = "amqp-5.2.0.tar.gz", hash = "sha256:a1ecff425ad063ad42a486c902807d1482311481c8ad95a72694b2975e75f7fd"}, +] + +[package.dependencies] +vine = ">=5.0.0,<6.0.0" + [[package]] name = "anyio" version = "3.7.1" @@ -232,6 +246,17 @@ files = [ {file = "backoff-2.2.1.tar.gz", hash = "sha256:03f829f5bb1923180821643f8753b0502c3b682293992485b0eef2807afa5cba"}, ] +[[package]] +name = "billiard" +version = "4.2.0" +description = "Python multiprocessing fork with improvements and bugfixes" +optional = false +python-versions = ">=3.7" +files = [ + {file = "billiard-4.2.0-py3-none-any.whl", hash = "sha256:07aa978b308f334ff8282bd4a746e681b3513db5c9a514cbdd810cbbdc19714d"}, + {file = "billiard-4.2.0.tar.gz", hash = "sha256:9a3c3184cb275aa17a732f93f65b20c525d3d9f253722d26a82194803ade5a2c"}, +] + [[package]] name = "boto3" version = "1.29.3" @@ -284,6 +309,61 @@ files = [ {file = "cachetools-5.3.2.tar.gz", hash = "sha256:086ee420196f7b2ab9ca2db2520aca326318b68fe5ba8bc4d49cca91add450f2"}, ] +[[package]] +name = "celery" +version = "5.3.6" +description = "Distributed Task Queue." 
+optional = false +python-versions = ">=3.8" +files = [ + {file = "celery-5.3.6-py3-none-any.whl", hash = "sha256:9da4ea0118d232ce97dff5ed4974587fb1c0ff5c10042eb15278487cdd27d1af"}, + {file = "celery-5.3.6.tar.gz", hash = "sha256:870cc71d737c0200c397290d730344cc991d13a057534353d124c9380267aab9"}, +] + +[package.dependencies] +billiard = ">=4.2.0,<5.0" +click = ">=8.1.2,<9.0" +click-didyoumean = ">=0.3.0" +click-plugins = ">=1.1.1" +click-repl = ">=0.2.0" +kombu = ">=5.3.4,<6.0" +python-dateutil = ">=2.8.2" +tzdata = ">=2022.7" +vine = ">=5.1.0,<6.0" + +[package.extras] +arangodb = ["pyArango (>=2.0.2)"] +auth = ["cryptography (==41.0.5)"] +azureblockblob = ["azure-storage-blob (>=12.15.0)"] +brotli = ["brotli (>=1.0.0)", "brotlipy (>=0.7.0)"] +cassandra = ["cassandra-driver (>=3.25.0,<4)"] +consul = ["python-consul2 (==0.1.5)"] +cosmosdbsql = ["pydocumentdb (==2.3.5)"] +couchbase = ["couchbase (>=3.0.0)"] +couchdb = ["pycouchdb (==1.14.2)"] +django = ["Django (>=2.2.28)"] +dynamodb = ["boto3 (>=1.26.143)"] +elasticsearch = ["elastic-transport (<=8.10.0)", "elasticsearch (<=8.11.0)"] +eventlet = ["eventlet (>=0.32.0)"] +gevent = ["gevent (>=1.5.0)"] +librabbitmq = ["librabbitmq (>=2.0.0)"] +memcache = ["pylibmc (==1.6.3)"] +mongodb = ["pymongo[srv] (>=4.0.2)"] +msgpack = ["msgpack (==1.0.7)"] +pymemcache = ["python-memcached (==1.59)"] +pyro = ["pyro4 (==4.82)"] +pytest = ["pytest-celery (==0.0.0)"] +redis = ["redis (>=4.5.2,!=4.5.5,<6.0.0)"] +s3 = ["boto3 (>=1.26.143)"] +slmq = ["softlayer-messaging (>=1.0.3)"] +solar = ["ephem (==4.1.5)"] +sqlalchemy = ["sqlalchemy (>=1.4.48,<2.1)"] +sqs = ["boto3 (>=1.26.143)", "kombu[sqs] (>=5.3.0)", "pycurl (>=7.43.0.5)", "urllib3 (>=1.26.16)"] +tblib = ["tblib (>=1.3.0)", "tblib (>=1.5.0)"] +yaml = ["PyYAML (>=3.10)"] +zookeeper = ["kazoo (>=1.3.1)"] +zstd = ["zstandard (==0.22.0)"] + [[package]] name = "certifi" version = "2023.11.17" @@ -472,6 +552,55 @@ files = [ [package.dependencies] colorama = {version = "*", markers = "platform_system == \"Windows\""} +[[package]] +name = "click-didyoumean" +version = "0.3.0" +description = "Enables git-like *did-you-mean* feature in click" +optional = false +python-versions = ">=3.6.2,<4.0.0" +files = [ + {file = "click-didyoumean-0.3.0.tar.gz", hash = "sha256:f184f0d851d96b6d29297354ed981b7dd71df7ff500d82fa6d11f0856bee8035"}, + {file = "click_didyoumean-0.3.0-py3-none-any.whl", hash = "sha256:a0713dc7a1de3f06bc0df5a9567ad19ead2d3d5689b434768a6145bff77c0667"}, +] + +[package.dependencies] +click = ">=7" + +[[package]] +name = "click-plugins" +version = "1.1.1" +description = "An extension module for click to enable registering CLI commands via setuptools entry-points." 
+optional = false +python-versions = "*" +files = [ + {file = "click-plugins-1.1.1.tar.gz", hash = "sha256:46ab999744a9d831159c3411bb0c79346d94a444df9a3a3742e9ed63645f264b"}, + {file = "click_plugins-1.1.1-py2.py3-none-any.whl", hash = "sha256:5d262006d3222f5057fd81e1623d4443e41dcda5dc815c06b442aa3c02889fc8"}, +] + +[package.dependencies] +click = ">=4.0" + +[package.extras] +dev = ["coveralls", "pytest (>=3.6)", "pytest-cov", "wheel"] + +[[package]] +name = "click-repl" +version = "0.3.0" +description = "REPL plugin for Click" +optional = false +python-versions = ">=3.6" +files = [ + {file = "click-repl-0.3.0.tar.gz", hash = "sha256:17849c23dba3d667247dc4defe1757fff98694e90fe37474f3feebb69ced26a9"}, + {file = "click_repl-0.3.0-py3-none-any.whl", hash = "sha256:fb7e06deb8da8de86180a33a9da97ac316751c094c6899382da7feeeeb51b812"}, +] + +[package.dependencies] +click = ">=7.0" +prompt-toolkit = ">=3.0.36" + +[package.extras] +testing = ["pytest (>=7.2.1)", "pytest-cov (>=4.0.0)", "tox (>=4.4.3)"] + [[package]] name = "colorama" version = "0.4.6" @@ -906,6 +1035,39 @@ files = [ {file = "jmespath-1.0.1.tar.gz", hash = "sha256:90261b206d6defd58fdd5e85f478bf633a2901798906be2ad389150c5c60edbe"}, ] +[[package]] +name = "kombu" +version = "5.3.4" +description = "Messaging library for Python." +optional = false +python-versions = ">=3.8" +files = [ + {file = "kombu-5.3.4-py3-none-any.whl", hash = "sha256:63bb093fc9bb80cfb3a0972336a5cec1fa7ac5f9ef7e8237c6bf8dda9469313e"}, + {file = "kombu-5.3.4.tar.gz", hash = "sha256:0bb2e278644d11dea6272c17974a3dbb9688a949f3bb60aeb5b791329c44fadc"}, +] + +[package.dependencies] +amqp = ">=5.1.1,<6.0.0" +typing-extensions = {version = "*", markers = "python_version < \"3.10\""} +vine = "*" + +[package.extras] +azureservicebus = ["azure-servicebus (>=7.10.0)"] +azurestoragequeues = ["azure-identity (>=1.12.0)", "azure-storage-queue (>=12.6.0)"] +confluentkafka = ["confluent-kafka (>=2.2.0)"] +consul = ["python-consul2"] +librabbitmq = ["librabbitmq (>=2.0.0)"] +mongodb = ["pymongo (>=4.1.1)"] +msgpack = ["msgpack"] +pyro = ["pyro4"] +qpid = ["qpid-python (>=0.26)", "qpid-tools (>=0.26)"] +redis = ["redis (>=4.5.2,!=4.5.5,<6.0.0)"] +slmq = ["softlayer-messaging (>=1.0.3)"] +sqlalchemy = ["sqlalchemy (>=1.4.48,<2.1)"] +sqs = ["boto3 (>=1.26.143)", "pycurl (>=7.43.0.5)", "urllib3 (>=1.26.16)"] +yaml = ["PyYAML (>=3.10)"] +zookeeper = ["kazoo (>=2.8.0)"] + [[package]] name = "kubernetes" version = "28.1.0" @@ -1346,6 +1508,20 @@ files = [ dev = ["pre-commit", "tox"] testing = ["pytest", "pytest-benchmark"] +[[package]] +name = "prompt-toolkit" +version = "3.0.43" +description = "Library for building powerful interactive command lines in Python" +optional = false +python-versions = ">=3.7.0" +files = [ + {file = "prompt_toolkit-3.0.43-py3-none-any.whl", hash = "sha256:a11a29cb3bf0a28a387fe5122cdb649816a957cd9261dcedf8c9f1fef33eacf6"}, + {file = "prompt_toolkit-3.0.43.tar.gz", hash = "sha256:3527b7af26106cbc65a040bcc84839a3566ec1b051bb0bfe953631e704b0ff7d"}, +] + +[package.dependencies] +wcwidth = "*" + [[package]] name = "pyasn1" version = "0.5.0" @@ -2252,6 +2428,17 @@ files = [ mypy-extensions = ">=0.3.0" typing-extensions = ">=3.7.4" +[[package]] +name = "tzdata" +version = "2023.3" +description = "Provider of IANA time zone data" +optional = false +python-versions = ">=2" +files = [ + {file = "tzdata-2023.3-py2.py3-none-any.whl", hash = "sha256:7e65763eef3120314099b6939b5546db7adce1e7d6f2e179e3df563c70511eda"}, + {file = "tzdata-2023.3.tar.gz", hash = 
"sha256:11ef1e08e54acb0d4f95bdb1be05da659673de4acbd21bf9c69e94cc5e907a3a"}, +] + [[package]] name = "urllib3" version = "1.26.18" @@ -2286,6 +2473,28 @@ h11 = ">=0.8" [package.extras] standard = ["colorama (>=0.4)", "httptools (>=0.5.0)", "python-dotenv (>=0.13)", "pyyaml (>=5.1)", "uvloop (>=0.14.0,!=0.15.0,!=0.15.1)", "watchfiles (>=0.13)", "websockets (>=10.4)"] +[[package]] +name = "vine" +version = "5.1.0" +description = "Python promises." +optional = false +python-versions = ">=3.6" +files = [ + {file = "vine-5.1.0-py3-none-any.whl", hash = "sha256:40fdf3c48b2cfe1c38a49e9ae2da6fda88e4794c810050a728bd7413811fb1dc"}, + {file = "vine-5.1.0.tar.gz", hash = "sha256:8b62e981d35c41049211cf62a0a1242d8c1ee9bd15bb196ce38aefd6799e61e0"}, +] + +[[package]] +name = "wcwidth" +version = "0.2.12" +description = "Measures the displayed width of unicode strings in a terminal" +optional = false +python-versions = "*" +files = [ + {file = "wcwidth-0.2.12-py2.py3-none-any.whl", hash = "sha256:f26ec43d96c8cbfed76a5075dac87680124fa84e0855195a6184da9c187f133c"}, + {file = "wcwidth-0.2.12.tar.gz", hash = "sha256:f01c104efdf57971bcb756f054dd58ddec5204dd15fa31d6503ea57947d97c02"}, +] + [[package]] name = "websocket-client" version = "1.6.4" @@ -2471,4 +2680,4 @@ multidict = ">=4.0" [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "b658134806d28492dfef5cdb4effff9b68d898dd96c73f0a7214767119456592" +content-hash = "98b35b5cfab9773a1695c09dd69b181ef9081b8a7dfebf1fbb22b2e4a3f8fb59" diff --git a/agenta-backend/pyproject.toml b/agenta-backend/pyproject.toml index c452e48ad1..72836f0469 100644 --- a/agenta-backend/pyproject.toml +++ b/agenta-backend/pyproject.toml @@ -31,6 +31,7 @@ asyncer = "^0.0.2" anyio = "==3.7.1" sentry-sdk = {extras = ["fastapi"], version = "^1.34.0"} kubernetes = "^28.1.0" +celery = "^5.3.6" [tool.poetry.group.dev.dependencies] pytest = "^7.3.1" From bd278940d22b208d3ce77520f67901a13c42ccfa Mon Sep 17 00:00:00 2001 From: Akrem Abayed Date: Tue, 19 Dec 2023 12:07:36 +0100 Subject: [PATCH 007/414] config celery --- agenta-backend/agenta_backend/celery_config.py | 2 +- agenta-backend/agenta_backend/main.py | 6 ++++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/agenta-backend/agenta_backend/celery_config.py b/agenta-backend/agenta_backend/celery_config.py index 33057e5723..71523f1ec1 100644 --- a/agenta-backend/agenta_backend/celery_config.py +++ b/agenta-backend/agenta_backend/celery_config.py @@ -11,5 +11,5 @@ CELERY_QUEUES = ( Queue('agenta_backend.tasks.evaluations', Exchange('agenta_backend.tasks.evaluations'), - routing_key='agenta_backend.tasks.evaluations') + routing_key='agenta_backend.tasks.evaluations'), ) \ No newline at end of file diff --git a/agenta-backend/agenta_backend/main.py b/agenta-backend/agenta_backend/main.py index e197696360..7fe9d807ce 100644 --- a/agenta-backend/agenta_backend/main.py +++ b/agenta-backend/agenta_backend/main.py @@ -1,7 +1,9 @@ import os +from celery import Celery from contextlib import asynccontextmanager from agenta_backend.config import settings +from agenta_backend import celery_config from agenta_backend.routers import ( app_router, container_router, @@ -33,6 +35,10 @@ ] +celery_app = Celery('evaluation_app') +celery_app.config_from_object(celery_config) + + @asynccontextmanager async def lifespan(application: FastAPI, cache=True): """ From 60135faf7a78e97b7c1821f3d2f859e65b38810e Mon Sep 17 00:00:00 2001 From: Akrem Abayed Date: Tue, 19 Dec 2023 12:08:00 +0100 Subject: [PATCH 008/414] remove evaluation type 
--- agenta-backend/agenta_backend/models/db_models.py | 1 - agenta-backend/agenta_backend/services/evaluation_service.py | 1 - 2 files changed, 2 deletions(-) diff --git a/agenta-backend/agenta_backend/models/db_models.py b/agenta-backend/agenta_backend/models/db_models.py index cbe5fed5df..7145211dbf 100644 --- a/agenta-backend/agenta_backend/models/db_models.py +++ b/agenta-backend/agenta_backend/models/db_models.py @@ -232,7 +232,6 @@ class Config: class EvaluationScenarioResult(EmbeddedModel): - evaluator: EvaluatorDB = Reference() result: Any diff --git a/agenta-backend/agenta_backend/services/evaluation_service.py b/agenta-backend/agenta_backend/services/evaluation_service.py index 67ad49e8e5..7792d29ded 100644 --- a/agenta-backend/agenta_backend/services/evaluation_service.py +++ b/agenta-backend/agenta_backend/services/evaluation_service.py @@ -28,7 +28,6 @@ EvaluationScenarioDB, UserDB, AppDB, - EvaluationTypeSettings, EvaluationScenarioInput, EvaluationScenarioOutput, CustomEvaluationDB, From 26d16d6e117df4906f481d708d140f7af18aa8c5 Mon Sep 17 00:00:00 2001 From: Akrem Abayed Date: Tue, 19 Dec 2023 14:42:29 +0100 Subject: [PATCH 009/414] add llm_apps service --- .../services/llm_apps_service.py | 29 +++++++++++++++++++ 1 file changed, 29 insertions(+) create mode 100644 agenta-backend/agenta_backend/services/llm_apps_service.py diff --git a/agenta-backend/agenta_backend/services/llm_apps_service.py b/agenta-backend/agenta_backend/services/llm_apps_service.py new file mode 100644 index 0000000000..b716677b04 --- /dev/null +++ b/agenta-backend/agenta_backend/services/llm_apps_service.py @@ -0,0 +1,29 @@ +import httpx + +def get_llm_app_output(uri, input): + try: + url = f"{uri}/generate" + + # TODO: adjust these hardcoded values in this payload + payload = { + "temperature": 1, + "model": "gpt-3.5-turbo", + "max_tokens": -1, + "prompt_system": "You are an expert in geography.", + "prompt_user": f"What is the capital of {input}?", + "top_p": 1, + "inputs": { + "country": input + } + } + + with httpx.Client() as client: + response = client.post(url, json=payload) + response.raise_for_status() + return response.json() + except httpx.HTTPError as e: + print(f"An HTTP error occurred: {e}") + except Exception as e: + print(f"An error occurred: {e}") + + return None \ No newline at end of file From 69b4eaf7ec11fa0cb1398634a8c40049f204a18b Mon Sep 17 00:00:00 2001 From: Abram Date: Tue, 19 Dec 2023 17:25:22 +0100 Subject: [PATCH 010/414] Feat - created evaluator api models --- .../models/api/evaluator_model.py | 25 +++++++++++++++++++ 1 file changed, 25 insertions(+) create mode 100644 agenta-backend/agenta_backend/models/api/evaluator_model.py diff --git a/agenta-backend/agenta_backend/models/api/evaluator_model.py b/agenta-backend/agenta_backend/models/api/evaluator_model.py new file mode 100644 index 0000000000..dc177c0db7 --- /dev/null +++ b/agenta-backend/agenta_backend/models/api/evaluator_model.py @@ -0,0 +1,25 @@ +from typing import List, Dict, Any +from pydantic import BaseModel + + +class EvaluationSettingsTemplate(BaseModel): + type: str + default: str + description: str + + +class Evaluator(BaseModel): + key: str + settings_template: Dict[str, EvaluationSettingsTemplate] + + +class EvaluatorConfig: + evaluator: Evaluator + settings_value: Dict[str, Any] + + +class NewEvaluation(BaseModel): + app_id: str + variant_ids: List[str] + evaluators_configs: List[EvaluatorConfig] + testset_id: str From 70878778de089a9ca789097c9008f66c688735b6 Mon Sep 17 00:00:00 2001 From: Abram Date: 
Tue, 19 Dec 2023 17:25:41 +0100 Subject: [PATCH 011/414] Update - modified evaluation db models --- .../agenta_backend/models/db_models.py | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/agenta-backend/agenta_backend/models/db_models.py b/agenta-backend/agenta_backend/models/db_models.py index 7145211dbf..0eae05c6d9 100644 --- a/agenta-backend/agenta_backend/models/db_models.py +++ b/agenta-backend/agenta_backend/models/db_models.py @@ -200,7 +200,7 @@ class Config: collection = "custom_evaluations" -class EvalSettingsTemplate(EmbeddedModel): +class EvaluationSettingsTemplate(EmbeddedModel): type: str default: str description: str @@ -208,7 +208,8 @@ class EvalSettingsTemplate(EmbeddedModel): class EvaluatorDB(Model): name: str = Field(required=True) - settings_template: Dict[str, EvalSettingsTemplate] + key: str + settings_template: Dict[str, EvaluationSettingsTemplate] created_at: datetime = Field(default=datetime.utcnow()) updated_at: datetime = Field(default=datetime.utcnow()) @@ -216,14 +217,9 @@ class Config: collection = "evaluators" -class EvalSettingsValue(EmbeddedModel): - parameter: str - threshold_value: float = Field(min_value=0.0, max_value=1.0) - - class EvaluatorConfigDB(Model): evaluator: EvaluatorDB = Reference() - settings_value: EvalSettingsValue + settings_value: Dict created_at: datetime = Field(default=datetime.utcnow()) updated_at: datetime = Field(default=datetime.utcnow()) @@ -252,7 +248,7 @@ class EvaluationDB(Model): user: UserDB = Reference(key_name="user") testset: TestSetDB = Reference() variants: List[AppVariantDB] - evaluators: List[EvaluatorConfigDB] + evaluators_configs: List[EvaluatorConfigDB] created_at: datetime = Field(default=datetime.utcnow()) updated_at: datetime = Field(default=datetime.utcnow()) @@ -269,7 +265,7 @@ class EvaluationScenarioDB(Model): correct_answer: Optional[str] is_pinned: Optional[bool] note: Optional[str] - evaluators: List[EvaluatorConfigDB] + evaluators_configs: List[EvaluatorConfigDB] results: List[EvaluationScenarioResult] created_at: datetime = Field(default=datetime.utcnow()) updated_at: datetime = Field(default=datetime.utcnow()) From a60872e06fcb47dae6c10cf35ee23fd008f869f3 Mon Sep 17 00:00:00 2001 From: Abram Date: Tue, 19 Dec 2023 18:01:06 +0100 Subject: [PATCH 012/414] Update - resolve IndexError caused by EvaluatorConfigDB model --- agenta-backend/agenta_backend/models/db_models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/agenta-backend/agenta_backend/models/db_models.py b/agenta-backend/agenta_backend/models/db_models.py index 0eae05c6d9..feef076b7a 100644 --- a/agenta-backend/agenta_backend/models/db_models.py +++ b/agenta-backend/agenta_backend/models/db_models.py @@ -219,7 +219,7 @@ class Config: class EvaluatorConfigDB(Model): evaluator: EvaluatorDB = Reference() - settings_value: Dict + settings_value: Dict[str, Any] created_at: datetime = Field(default=datetime.utcnow()) updated_at: datetime = Field(default=datetime.utcnow()) From e220e70090553009744413ee088d7edd87eab09c Mon Sep 17 00:00:00 2001 From: Akrem Abayed Date: Tue, 19 Dec 2023 19:38:22 +0100 Subject: [PATCH 013/414] add cli logic to test the new evaluation --- agenta-cli/agenta/cli/evaluation_commands.py | 23 +++++++++++ agenta-cli/agenta/cli/main.py | 2 + agenta-cli/agenta/client/client.py | 43 ++++++++++++++++++++ 3 files changed, 68 insertions(+) create mode 100644 agenta-cli/agenta/cli/evaluation_commands.py diff --git a/agenta-cli/agenta/cli/evaluation_commands.py 
b/agenta-cli/agenta/cli/evaluation_commands.py new file mode 100644 index 0000000000..672ecb8735 --- /dev/null +++ b/agenta-cli/agenta/cli/evaluation_commands.py @@ -0,0 +1,23 @@ + +import click +from agenta.client import client + + +@click.group() +def evaluation(): + """Commands for evaluations.""" + pass + + +#TODO: Remove hardcoded values +@evaluation.command(name="run") +def run_evaluation_cli(): + """Run an evaluation.""" + + try: + client.run_evaluation( + app_name="sss", + host="http://localhost", + ) + except Exception as ex: + click.echo(click.style(f"Error while running evaluation: {ex}", fg="red")) diff --git a/agenta-cli/agenta/cli/main.py b/agenta-cli/agenta/cli/main.py index ec0e2301e0..0844833e14 100644 --- a/agenta-cli/agenta/cli/main.py +++ b/agenta-cli/agenta/cli/main.py @@ -12,6 +12,7 @@ from agenta.client import client from agenta.cli import variant_configs from agenta.cli import variant_commands +from agenta.cli import evaluation_commands def print_version(ctx, param, value): @@ -194,6 +195,7 @@ def init(app_name: str): cli.add_command(init) cli.add_command(variant_configs.config) cli.add_command(variant_commands.variant) +cli.add_command(evaluation_commands.evaluation) if __name__ == "__main__": cli() diff --git a/agenta-cli/agenta/client/client.py b/agenta-cli/agenta/client/client.py index 0ca20301ef..e27e8068c4 100644 --- a/agenta-cli/agenta/client/client.py +++ b/agenta-cli/agenta/client/client.py @@ -524,3 +524,46 @@ def retrieve_user_id(host: str, api_key: Optional[str] = None) -> str: return response.json()["id"] except RequestException as e: raise APIRequestError(f"Request failed: {str(e)}") + +from pydantic import BaseModel +# def run_evaluation(app_name: str, host: str, api_key: str = None) -> str: +def run_evaluation(app_name: str, host: str, api_key: str = None) -> str: + """Creates new app on the server. + Args: + app_name (str): Name of the app + host (str): Hostname of the server + api_key (str): The API key to use for the request. + """ + + + + evaluators_configs = [ + { + "evaluator": { + "key": "auto_similarity_match", + } + } + ] + + new_evaluation = { + "app_id": "6577025e60084c599a43e51f", + "variant_ids": [ + "6577025e60084c599a43e525", + # "6570aed55d0eaff2293088e6" + ], + "evaluators_configs": evaluators_configs, + "testset_id": "6577025e60084c599a43e526" + } + + response = requests.post( + f"{host}/api/evaluations/", + json=new_evaluation, + # headers={"Authorization": api_key} if api_key is not None else None, + timeout=600, + ) + if response.status_code != 200: + error_message = response.json() + raise APIRequestError( + f"Request to run evaluations failed with status code {response.status_code} and error message: {error_message}." 
+ ) + return response.json()["app_id"] \ No newline at end of file From bbccccbf2d60af4d7dc70ff82052dbfc41c704da Mon Sep 17 00:00:00 2001 From: Akrem Abayed Date: Tue, 19 Dec 2023 19:39:59 +0100 Subject: [PATCH 014/414] small fixes --- .../models/api/evaluation_model.py | 32 +++++++++++++------ .../models/api/evaluator_model.py | 25 --------------- .../agenta_backend/models/db_models.py | 2 +- 3 files changed, 23 insertions(+), 36 deletions(-) delete mode 100644 agenta-backend/agenta_backend/models/api/evaluator_model.py diff --git a/agenta-backend/agenta_backend/models/api/evaluation_model.py b/agenta-backend/agenta_backend/models/api/evaluation_model.py index 4edf312d38..82abf384c5 100644 --- a/agenta-backend/agenta_backend/models/api/evaluation_model.py +++ b/agenta-backend/agenta_backend/models/api/evaluation_model.py @@ -108,16 +108,6 @@ class EvaluationScenarioScoreUpdate(BaseModel): score: float -class NewEvaluation(BaseModel): - app_id: str - variant_ids: List[str] - evaluation_type: EvaluationType - evaluation_type_settings: Optional[EvaluationTypeSettings] - inputs: List[str] - testset_id: str - status: str - - class DeleteEvaluation(BaseModel): evaluations_ids: List[str] @@ -159,3 +149,25 @@ class ExecuteCustomEvaluationCode(BaseModel): class EvaluationWebhook(BaseModel): score: float + + +class EvaluationSettingsTemplate(BaseModel): + type: str + default: str + description: str + + +class Evaluator(BaseModel): + key: str + + +class EvaluatorConfig(BaseModel): + evaluator: Evaluator + settings_values: Optional[Dict[str, Any]] + + +class NewEvaluation(BaseModel): + app_id: str + variant_ids: List[str] + evaluators_configs: List[EvaluatorConfig] + testset_id: str diff --git a/agenta-backend/agenta_backend/models/api/evaluator_model.py b/agenta-backend/agenta_backend/models/api/evaluator_model.py deleted file mode 100644 index dc177c0db7..0000000000 --- a/agenta-backend/agenta_backend/models/api/evaluator_model.py +++ /dev/null @@ -1,25 +0,0 @@ -from typing import List, Dict, Any -from pydantic import BaseModel - - -class EvaluationSettingsTemplate(BaseModel): - type: str - default: str - description: str - - -class Evaluator(BaseModel): - key: str - settings_template: Dict[str, EvaluationSettingsTemplate] - - -class EvaluatorConfig: - evaluator: Evaluator - settings_value: Dict[str, Any] - - -class NewEvaluation(BaseModel): - app_id: str - variant_ids: List[str] - evaluators_configs: List[EvaluatorConfig] - testset_id: str diff --git a/agenta-backend/agenta_backend/models/db_models.py b/agenta-backend/agenta_backend/models/db_models.py index feef076b7a..70ce531a11 100644 --- a/agenta-backend/agenta_backend/models/db_models.py +++ b/agenta-backend/agenta_backend/models/db_models.py @@ -219,7 +219,7 @@ class Config: class EvaluatorConfigDB(Model): evaluator: EvaluatorDB = Reference() - settings_value: Dict[str, Any] + settings_values: Dict[str, Any] created_at: datetime = Field(default=datetime.utcnow()) updated_at: datetime = Field(default=datetime.utcnow()) From 7f0838c0d2832240dc29de514ee47845f68b2fd7 Mon Sep 17 00:00:00 2001 From: Akrem Abayed Date: Tue, 19 Dec 2023 19:41:35 +0100 Subject: [PATCH 015/414] format --- .../agenta_backend/celery_config.py | 22 ++++++++++--------- agenta-backend/agenta_backend/main.py | 2 +- .../services/llm_apps_service.py | 7 +++--- agenta-cli/agenta/cli/evaluation_commands.py | 3 +-- agenta-cli/agenta/client/client.py | 9 ++++---- 5 files changed, 22 insertions(+), 21 deletions(-) diff --git a/agenta-backend/agenta_backend/celery_config.py 
b/agenta-backend/agenta_backend/celery_config.py index 71523f1ec1..c81c467753 100644 --- a/agenta-backend/agenta_backend/celery_config.py +++ b/agenta-backend/agenta_backend/celery_config.py @@ -1,15 +1,17 @@ import os from kombu import Exchange, Queue -BROKER_URL = os.getenv('CELERY_BROKER_URL') -CELERY_RESULT_BACKEND = os.getenv('CELERY_RESULT_BACKEND') -CELERY_TASK_SERIALIZER = 'json' -CELERY_ACCEPT_CONTENT = ['json'] -CELERY_RESULT_SERIALIZER = 'json' -CELERY_TIMEZONE = 'UTC' +BROKER_URL = os.getenv("CELERY_BROKER_URL") +CELERY_RESULT_BACKEND = os.getenv("CELERY_RESULT_BACKEND") +CELERY_TASK_SERIALIZER = "json" +CELERY_ACCEPT_CONTENT = ["json"] +CELERY_RESULT_SERIALIZER = "json" +CELERY_TIMEZONE = "UTC" CELERY_QUEUES = ( - Queue('agenta_backend.tasks.evaluations', - Exchange('agenta_backend.tasks.evaluations'), - routing_key='agenta_backend.tasks.evaluations'), -) \ No newline at end of file + Queue( + "agenta_backend.tasks.evaluations", + Exchange("agenta_backend.tasks.evaluations"), + routing_key="agenta_backend.tasks.evaluations", + ), +) diff --git a/agenta-backend/agenta_backend/main.py b/agenta-backend/agenta_backend/main.py index 7fe9d807ce..717e844ffa 100644 --- a/agenta-backend/agenta_backend/main.py +++ b/agenta-backend/agenta_backend/main.py @@ -35,7 +35,7 @@ ] -celery_app = Celery('evaluation_app') +celery_app = Celery("evaluation_app") celery_app.config_from_object(celery_config) diff --git a/agenta-backend/agenta_backend/services/llm_apps_service.py b/agenta-backend/agenta_backend/services/llm_apps_service.py index b716677b04..9da5104418 100644 --- a/agenta-backend/agenta_backend/services/llm_apps_service.py +++ b/agenta-backend/agenta_backend/services/llm_apps_service.py @@ -1,5 +1,6 @@ import httpx + def get_llm_app_output(uri, input): try: url = f"{uri}/generate" @@ -12,9 +13,7 @@ def get_llm_app_output(uri, input): "prompt_system": "You are an expert in geography.", "prompt_user": f"What is the capital of {input}?", "top_p": 1, - "inputs": { - "country": input - } + "inputs": {"country": input}, } with httpx.Client() as client: @@ -26,4 +25,4 @@ def get_llm_app_output(uri, input): except Exception as e: print(f"An error occurred: {e}") - return None \ No newline at end of file + return None diff --git a/agenta-cli/agenta/cli/evaluation_commands.py b/agenta-cli/agenta/cli/evaluation_commands.py index 672ecb8735..76e00f9694 100644 --- a/agenta-cli/agenta/cli/evaluation_commands.py +++ b/agenta-cli/agenta/cli/evaluation_commands.py @@ -1,4 +1,3 @@ - import click from agenta.client import client @@ -9,7 +8,7 @@ def evaluation(): pass -#TODO: Remove hardcoded values +# TODO: Remove hardcoded values @evaluation.command(name="run") def run_evaluation_cli(): """Run an evaluation.""" diff --git a/agenta-cli/agenta/client/client.py b/agenta-cli/agenta/client/client.py index e27e8068c4..b4aad9b5fc 100644 --- a/agenta-cli/agenta/client/client.py +++ b/agenta-cli/agenta/client/client.py @@ -525,7 +525,10 @@ def retrieve_user_id(host: str, api_key: Optional[str] = None) -> str: except RequestException as e: raise APIRequestError(f"Request failed: {str(e)}") + from pydantic import BaseModel + + # def run_evaluation(app_name: str, host: str, api_key: str = None) -> str: def run_evaluation(app_name: str, host: str, api_key: str = None) -> str: """Creates new app on the server. @@ -535,8 +538,6 @@ def run_evaluation(app_name: str, host: str, api_key: str = None) -> str: api_key (str): The API key to use for the request. 
""" - - evaluators_configs = [ { "evaluator": { @@ -552,7 +553,7 @@ def run_evaluation(app_name: str, host: str, api_key: str = None) -> str: # "6570aed55d0eaff2293088e6" ], "evaluators_configs": evaluators_configs, - "testset_id": "6577025e60084c599a43e526" + "testset_id": "6577025e60084c599a43e526", } response = requests.post( @@ -566,4 +567,4 @@ def run_evaluation(app_name: str, host: str, api_key: str = None) -> str: raise APIRequestError( f"Request to run evaluations failed with status code {response.status_code} and error message: {error_message}." ) - return response.json()["app_id"] \ No newline at end of file + return response.json()["app_id"] From e9564331219a2d14cc39b3606ff1294b5319b840 Mon Sep 17 00:00:00 2001 From: Akrem Abayed Date: Tue, 19 Dec 2023 19:54:00 +0100 Subject: [PATCH 016/414] put back initial AppVariantDB and AppEnvironmentDB schema --- .../agenta_backend/models/db_models.py | 24 ++++++++++--------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/agenta-backend/agenta_backend/models/db_models.py b/agenta-backend/agenta_backend/models/db_models.py index 70ce531a11..479d699526 100644 --- a/agenta-backend/agenta_backend/models/db_models.py +++ b/agenta-backend/agenta_backend/models/db_models.py @@ -128,11 +128,13 @@ class Config: class AppVariantDB(Model): - app: AppDB = Reference() + app: AppDB = Reference(key_name="app") variant_name: str - image: ImageDB = Reference() - user: UserDB = Reference() - organization: OrganizationDB = Reference() + image: ImageDB = Reference(key_name="image") + user: UserDB = Reference(key_name="user") + organization: OrganizationDB = Reference(key_name="organization") + parameters: Dict[str, Any] = Field(default=dict) # TODO: deprecated. remove + previous_variant_name: Optional[str] # TODO: deprecated. remove base_name: Optional[str] base: VariantBaseDB = Reference(key_name="bases") config_name: Optional[str] @@ -140,22 +142,22 @@ class AppVariantDB(Model): created_at: Optional[datetime] = Field(default=datetime.utcnow()) updated_at: Optional[datetime] = Field(default=datetime.utcnow()) + is_deleted: bool = Field( # TODO: deprecated. 
remove + default=False + ) # soft deletion for using the template variants + class Config: collection = "app_variants" class AppEnvironmentDB(Model): - app: AppDB = Reference() + app: AppDB = Reference(key_name="app") name: str - user: UserDB = Reference() - organization: OrganizationDB = Reference() + user: UserDB = Reference(key_name="user") + organization: OrganizationDB = Reference(key_name="organization") deployed_app_variant: Optional[ObjectId] deployment: Optional[ObjectId] # reference to deployment created_at: Optional[datetime] = Field(default=datetime.utcnow()) - updated_at: Optional[datetime] = Field(default=datetime.utcnow()) - - class Config: - collection = "environments" class TemplateDB(Model): From f6a83e733bacb0d9ef7476c528f4e3cb80075f62 Mon Sep 17 00:00:00 2001 From: Akrem Abayed Date: Wed, 20 Dec 2023 09:08:16 +0100 Subject: [PATCH 017/414] add celery env vars for backend --- docker-compose.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/docker-compose.yml b/docker-compose.yml index 62fb159749..18580f2e70 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -22,6 +22,9 @@ services: - FEATURE_FLAG=oss - AGENTA_TEMPLATE_REPO=agentaai/templates_v2 - POSTHOG_API_KEY=phc_hmVSxIjTW1REBHXgj2aw4HW9X6CXb6FzerBgP9XenC7 + + - CELERY_BROKER_URL=amqp://guest@rabbitmq// + - CELERY_RESULT_BACKEND=redis://redis:6379/0 volumes: - ./agenta-backend/agenta_backend:/app/agenta_backend - ./agenta-backend/tests:/app/tests From cf345f5cc26fa64ffd3bef696e8bacf4426c8fc4 Mon Sep 17 00:00:00 2001 From: Akrem Abayed Date: Wed, 20 Dec 2023 09:10:17 +0100 Subject: [PATCH 018/414] add celery configs --- agenta-backend/agenta_backend/celery_config.py | 6 +++--- agenta-backend/agenta_backend/main.py | 3 +-- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/agenta-backend/agenta_backend/celery_config.py b/agenta-backend/agenta_backend/celery_config.py index c81c467753..df11dad091 100644 --- a/agenta-backend/agenta_backend/celery_config.py +++ b/agenta-backend/agenta_backend/celery_config.py @@ -10,8 +10,8 @@ CELERY_QUEUES = ( Queue( - "agenta_backend.tasks.evaluations", - Exchange("agenta_backend.tasks.evaluations"), - routing_key="agenta_backend.tasks.evaluations", + "agenta_backend.tasks.evaluations.evaluate", + Exchange("agenta_backend.tasks.evaluations.evaluate"), + routing_key="agenta_backend.tasks.evaluations.evaluate", ), ) diff --git a/agenta-backend/agenta_backend/main.py b/agenta-backend/agenta_backend/main.py index 717e844ffa..98af93a2c5 100644 --- a/agenta-backend/agenta_backend/main.py +++ b/agenta-backend/agenta_backend/main.py @@ -34,8 +34,7 @@ "http://0.0.0.0:3001", ] - -celery_app = Celery("evaluation_app") +celery_app = Celery("agenta_app") celery_app.config_from_object(celery_config) From 969a43aa8940785bfb6af43dfda225986393dcde Mon Sep 17 00:00:00 2001 From: Akrem Abayed Date: Wed, 20 Dec 2023 09:10:38 +0100 Subject: [PATCH 019/414] temporarily evaluation route --- .../routers/evaluation_router.py | 24 ++++++++++++++----- 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/agenta-backend/agenta_backend/routers/evaluation_router.py b/agenta-backend/agenta_backend/routers/evaluation_router.py index 20282ee093..c521968955 100644 --- a/agenta-backend/agenta_backend/routers/evaluation_router.py +++ b/agenta-backend/agenta_backend/routers/evaluation_router.py @@ -43,6 +43,9 @@ from agenta_backend.services import db_manager from agenta_backend.models import converters from agenta_backend.services import results_service +from 
agenta_backend.tasks.evaluations import evaluate + +from fastapi.encoders import jsonable_encoder if os.environ["FEATURE_FLAG"] in ["cloud", "ee"]: from agenta_backend.commons.services.selectors import ( # noqa pylint: disable-all @@ -54,7 +57,8 @@ router = APIRouter() -@router.post("/", response_model=SimpleEvaluationOutput) +# @router.post("/", response_model=SimpleEvaluationOutput) +@router.post("/") async def create_evaluation( payload: NewEvaluation, request: Request, @@ -82,11 +86,19 @@ async def create_evaluation( if app is None: raise HTTPException(status_code=404, detail="App not found") - - new_evaluation_db = await evaluation_service.create_new_evaluation( - payload, **user_org_data - ) - return converters.evaluation_db_to_simple_evaluation_output(new_evaluation_db) + # TODO: clean this + # new_evaluation_db = await evaluation_service.create_new_evaluation( + # payload, **user_org_data + # ) + app_data = jsonable_encoder(app) + new_evaluation_data = payload.dict() + # TODO: to review/find a better solution + # We need to serilize the data we pass to celery tasks otherwise we will get serilisation errors + + evaluate.delay(app_data, new_evaluation_data) + + return 200 + # return converters.evaluation_db_to_simple_evaluation_output(new_evaluation_db) except KeyError: raise HTTPException( status_code=400, From d4f3e12594ec9d43a5e9af406f1c8d1c79f42830 Mon Sep 17 00:00:00 2001 From: Akrem Abayed Date: Wed, 20 Dec 2023 09:10:51 +0100 Subject: [PATCH 020/414] add celery task --- .../agenta_backend/tasks/evaluations.py | 27 +++++++++++++++++++ 1 file changed, 27 insertions(+) create mode 100644 agenta-backend/agenta_backend/tasks/evaluations.py diff --git a/agenta-backend/agenta_backend/tasks/evaluations.py b/agenta-backend/agenta_backend/tasks/evaluations.py new file mode 100644 index 0000000000..a5eccfe834 --- /dev/null +++ b/agenta-backend/agenta_backend/tasks/evaluations.py @@ -0,0 +1,27 @@ +from celery import shared_task +import asyncio + +from agenta_backend.services import llm_apps_service +from agenta_backend.services.db_manager import ( + fetch_app_variant_by_id, + get_deployment_by_objectid, + fetch_testset_by_id, + create_new_evaluation_scenario +) +from agenta_backend.models.api.evaluation_model import NewEvaluation, EvaluationScenario, EvaluationScenarioOutput + +from agenta_backend.models.db_models import ( + AppDB +) +# from agenta_backend.celery_init import celery_app + +@shared_task(queue='agenta_backend.tasks.evaluations.evaluate') +def evaluate(app_data, new_evaluation_data): + loop = asyncio.get_event_loop() + new_evaluation = NewEvaluation(**new_evaluation_data) + app = AppDB(**app_data) + testset = loop.run_until_complete(fetch_testset_by_id(new_evaluation.testset_id)) + print("data is ready") + print(testset) + + From 58cfc383112e0e2ed068747aad552f7134403a23 Mon Sep 17 00:00:00 2001 From: Abram Date: Wed, 20 Dec 2023 19:16:25 +0100 Subject: [PATCH 021/414] Update - install watchdog[watchmedo] --- agenta-backend/poetry.lock | 57 +++++++++++++++++++++++++++++++++-- agenta-backend/pyproject.toml | 1 + 2 files changed, 56 insertions(+), 2 deletions(-) diff --git a/agenta-backend/poetry.lock b/agenta-backend/poetry.lock index 683b342b63..fdf7b77937 100644 --- a/agenta-backend/poetry.lock +++ b/agenta-backend/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.6.1 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.7.1 and should not be changed by hand. 
[[package]] name = "aiodocker" @@ -1706,6 +1706,7 @@ files = [ {file = "pymongo-4.6.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:8ab6bcc8e424e07c1d4ba6df96f7fb963bcb48f590b9456de9ebd03b88084fe8"}, {file = "pymongo-4.6.0-cp312-cp312-win32.whl", hash = "sha256:47aa128be2e66abd9d1a9b0437c62499d812d291f17b55185cb4aa33a5f710a4"}, {file = "pymongo-4.6.0-cp312-cp312-win_amd64.whl", hash = "sha256:014e7049dd019a6663747ca7dae328943e14f7261f7c1381045dfc26a04fa330"}, + {file = "pymongo-4.6.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:e24025625bad66895b1bc3ae1647f48f0a92dd014108fb1be404c77f0b69ca67"}, {file = "pymongo-4.6.0-cp37-cp37m-manylinux1_i686.whl", hash = "sha256:288c21ab9531b037f7efa4e467b33176bc73a0c27223c141b822ab4a0e66ff2a"}, {file = "pymongo-4.6.0-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:747c84f4e690fbe6999c90ac97246c95d31460d890510e4a3fa61b7d2b87aa34"}, {file = "pymongo-4.6.0-cp37-cp37m-manylinux2014_aarch64.whl", hash = "sha256:055f5c266e2767a88bb585d01137d9c7f778b0195d3dbf4a487ef0638be9b651"}, @@ -1913,6 +1914,7 @@ files = [ {file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:69b023b2b4daa7548bcfbd4aa3da05b3a74b772db9e23b982788168117739938"}, {file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:81e0b275a9ecc9c0c0c07b4b90ba548307583c125f54d5b6946cfee6360c733d"}, {file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ba336e390cd8e4d1739f42dfe9bb83a3cc2e80f567d8805e11b46f4a943f5515"}, + {file = "PyYAML-6.0.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:326c013efe8048858a6d312ddd31d56e468118ad4cdeda36c719bf5bb6192290"}, {file = "PyYAML-6.0.1-cp310-cp310-win32.whl", hash = "sha256:bd4af7373a854424dabd882decdc5579653d7868b8fb26dc7d0e99f823aa5924"}, {file = "PyYAML-6.0.1-cp310-cp310-win_amd64.whl", hash = "sha256:fd1592b3fdf65fff2ad0004b5e363300ef59ced41c2e6b3a99d4089fa8c5435d"}, {file = "PyYAML-6.0.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:6965a7bc3cf88e5a1c3bd2e0b5c22f8d677dc88a455344035f03399034eb3007"}, @@ -1920,8 +1922,15 @@ files = [ {file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:42f8152b8dbc4fe7d96729ec2b99c7097d656dc1213a3229ca5383f973a5ed6d"}, {file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:062582fca9fabdd2c8b54a3ef1c978d786e0f6b3a1510e0ac93ef59e0ddae2bc"}, {file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d2b04aac4d386b172d5b9692e2d2da8de7bfb6c387fa4f801fbf6fb2e6ba4673"}, + {file = "PyYAML-6.0.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:e7d73685e87afe9f3b36c799222440d6cf362062f78be1013661b00c5c6f678b"}, {file = "PyYAML-6.0.1-cp311-cp311-win32.whl", hash = "sha256:1635fd110e8d85d55237ab316b5b011de701ea0f29d07611174a1b42f1444741"}, {file = "PyYAML-6.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34"}, + {file = "PyYAML-6.0.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28"}, + {file = "PyYAML-6.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9"}, + {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0"}, + {file = "PyYAML-6.0.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4"}, + {file = "PyYAML-6.0.1-cp312-cp312-win32.whl", hash = "sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54"}, + {file = "PyYAML-6.0.1-cp312-cp312-win_amd64.whl", hash = "sha256:0d3304d8c0adc42be59c5f8a4d9e3d7379e6955ad754aa9d6ab7a398b59dd1df"}, {file = "PyYAML-6.0.1-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:50550eb667afee136e9a77d6dc71ae76a44df8b3e51e41b77f6de2932bfe0f47"}, {file = "PyYAML-6.0.1-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1fe35611261b29bd1de0070f0b2f47cb6ff71fa6595c077e42bd0c419fa27b98"}, {file = "PyYAML-6.0.1-cp36-cp36m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:704219a11b772aea0d8ecd7058d0082713c3562b4e271b849ad7dc4a5c90c13c"}, @@ -1938,6 +1947,7 @@ files = [ {file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a0cd17c15d3bb3fa06978b4e8958dcdc6e0174ccea823003a106c7d4d7899ac5"}, {file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:28c119d996beec18c05208a8bd78cbe4007878c6dd15091efb73a30e90539696"}, {file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7e07cbde391ba96ab58e532ff4803f79c4129397514e1413a7dc761ccd755735"}, + {file = "PyYAML-6.0.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:49a183be227561de579b4a36efbb21b3eab9651dd81b1858589f796549873dd6"}, {file = "PyYAML-6.0.1-cp38-cp38-win32.whl", hash = "sha256:184c5108a2aca3c5b3d3bf9395d50893a7ab82a38004c8f61c258d4428e80206"}, {file = "PyYAML-6.0.1-cp38-cp38-win_amd64.whl", hash = "sha256:1e2722cc9fbb45d9b87631ac70924c11d3a401b2d7f410cc0e3bbf249f2dca62"}, {file = "PyYAML-6.0.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:9eb6caa9a297fc2c2fb8862bc5370d0303ddba53ba97e71f08023b6cd73d16a8"}, @@ -1945,6 +1955,7 @@ files = [ {file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5773183b6446b2c99bb77e77595dd486303b4faab2b086e7b17bc6bef28865f6"}, {file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b786eecbdf8499b9ca1d697215862083bd6d2a99965554781d0d8d1ad31e13a0"}, {file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bc1bf2925a1ecd43da378f4db9e4f799775d6367bdb94671027b73b393a7c42c"}, + {file = "PyYAML-6.0.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:04ac92ad1925b2cff1db0cfebffb6ffc43457495c9b3c39d3fcae417d7125dc5"}, {file = "PyYAML-6.0.1-cp39-cp39-win32.whl", hash = "sha256:faca3bdcf85b2fc05d06ff3fbc1f83e1391b3e724afa3feba7d13eeab355484c"}, {file = "PyYAML-6.0.1-cp39-cp39-win_amd64.whl", hash = "sha256:510c9deebc5c0225e8c96813043e62b680ba2f9c50a08d3724c7f28a747d1486"}, {file = "PyYAML-6.0.1.tar.gz", hash = "sha256:bfdf460b1736c775f2ba9f6a92bca30bc2095067b8a9d77876d1fad6cc3b4a43"}, @@ -2484,6 +2495,48 @@ files = [ {file = "vine-5.1.0.tar.gz", hash = "sha256:8b62e981d35c41049211cf62a0a1242d8c1ee9bd15bb196ce38aefd6799e61e0"}, ] +[[package]] +name = "watchdog" +version = "3.0.0" +description = "Filesystem events monitoring" +optional = false +python-versions = ">=3.7" +files = [ + {file = "watchdog-3.0.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:336adfc6f5cc4e037d52db31194f7581ff744b67382eb6021c868322e32eef41"}, + {file = 
"watchdog-3.0.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:a70a8dcde91be523c35b2bf96196edc5730edb347e374c7de7cd20c43ed95397"}, + {file = "watchdog-3.0.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:adfdeab2da79ea2f76f87eb42a3ab1966a5313e5a69a0213a3cc06ef692b0e96"}, + {file = "watchdog-3.0.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:2b57a1e730af3156d13b7fdddfc23dea6487fceca29fc75c5a868beed29177ae"}, + {file = "watchdog-3.0.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:7ade88d0d778b1b222adebcc0927428f883db07017618a5e684fd03b83342bd9"}, + {file = "watchdog-3.0.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:7e447d172af52ad204d19982739aa2346245cc5ba6f579d16dac4bfec226d2e7"}, + {file = "watchdog-3.0.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:9fac43a7466eb73e64a9940ac9ed6369baa39b3bf221ae23493a9ec4d0022674"}, + {file = "watchdog-3.0.0-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:8ae9cda41fa114e28faf86cb137d751a17ffd0316d1c34ccf2235e8a84365c7f"}, + {file = "watchdog-3.0.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:25f70b4aa53bd743729c7475d7ec41093a580528b100e9a8c5b5efe8899592fc"}, + {file = "watchdog-3.0.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:4f94069eb16657d2c6faada4624c39464f65c05606af50bb7902e036e3219be3"}, + {file = "watchdog-3.0.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:7c5f84b5194c24dd573fa6472685b2a27cc5a17fe5f7b6fd40345378ca6812e3"}, + {file = "watchdog-3.0.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:3aa7f6a12e831ddfe78cdd4f8996af9cf334fd6346531b16cec61c3b3c0d8da0"}, + {file = "watchdog-3.0.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:233b5817932685d39a7896b1090353fc8efc1ef99c9c054e46c8002561252fb8"}, + {file = "watchdog-3.0.0-pp37-pypy37_pp73-macosx_10_9_x86_64.whl", hash = "sha256:13bbbb462ee42ec3c5723e1205be8ced776f05b100e4737518c67c8325cf6100"}, + {file = "watchdog-3.0.0-pp38-pypy38_pp73-macosx_10_9_x86_64.whl", hash = "sha256:8f3ceecd20d71067c7fd4c9e832d4e22584318983cabc013dbf3f70ea95de346"}, + {file = "watchdog-3.0.0-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:c9d8c8ec7efb887333cf71e328e39cffbf771d8f8f95d308ea4125bf5f90ba64"}, + {file = "watchdog-3.0.0-py3-none-manylinux2014_aarch64.whl", hash = "sha256:0e06ab8858a76e1219e68c7573dfeba9dd1c0219476c5a44d5333b01d7e1743a"}, + {file = "watchdog-3.0.0-py3-none-manylinux2014_armv7l.whl", hash = "sha256:d00e6be486affb5781468457b21a6cbe848c33ef43f9ea4a73b4882e5f188a44"}, + {file = "watchdog-3.0.0-py3-none-manylinux2014_i686.whl", hash = "sha256:c07253088265c363d1ddf4b3cdb808d59a0468ecd017770ed716991620b8f77a"}, + {file = "watchdog-3.0.0-py3-none-manylinux2014_ppc64.whl", hash = "sha256:5113334cf8cf0ac8cd45e1f8309a603291b614191c9add34d33075727a967709"}, + {file = "watchdog-3.0.0-py3-none-manylinux2014_ppc64le.whl", hash = "sha256:51f90f73b4697bac9c9a78394c3acbbd331ccd3655c11be1a15ae6fe289a8c83"}, + {file = "watchdog-3.0.0-py3-none-manylinux2014_s390x.whl", hash = "sha256:ba07e92756c97e3aca0912b5cbc4e5ad802f4557212788e72a72a47ff376950d"}, + {file = "watchdog-3.0.0-py3-none-manylinux2014_x86_64.whl", hash = "sha256:d429c2430c93b7903914e4db9a966c7f2b068dd2ebdd2fa9b9ce094c7d459f33"}, + {file = "watchdog-3.0.0-py3-none-win32.whl", hash = "sha256:3ed7c71a9dccfe838c2f0b6314ed0d9b22e77d268c67e015450a29036a81f60f"}, + {file = "watchdog-3.0.0-py3-none-win_amd64.whl", hash = "sha256:4c9956d27be0bb08fc5f30d9d0179a855436e655f046d288e2bcc11adfae893c"}, + {file = "watchdog-3.0.0-py3-none-win_ia64.whl", hash = 
"sha256:5d9f3a10e02d7371cd929b5d8f11e87d4bad890212ed3901f9b4d68767bee759"}, + {file = "watchdog-3.0.0.tar.gz", hash = "sha256:4d98a320595da7a7c5a18fc48cb633c2e73cda78f93cac2ef42d42bf609a33f9"}, +] + +[package.dependencies] +PyYAML = {version = ">=3.10", optional = true, markers = "extra == \"watchmedo\""} + +[package.extras] +watchmedo = ["PyYAML (>=3.10)"] + [[package]] name = "wcwidth" version = "0.2.12" @@ -2680,4 +2733,4 @@ multidict = ">=4.0" [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "98b35b5cfab9773a1695c09dd69b181ef9081b8a7dfebf1fbb22b2e4a3f8fb59" +content-hash = "337e72d73feb6823a141ab0fd71374044a6210ce971fe4409edb685f959abbeb" diff --git a/agenta-backend/pyproject.toml b/agenta-backend/pyproject.toml index 72836f0469..df3025c7e5 100644 --- a/agenta-backend/pyproject.toml +++ b/agenta-backend/pyproject.toml @@ -32,6 +32,7 @@ anyio = "==3.7.1" sentry-sdk = {extras = ["fastapi"], version = "^1.34.0"} kubernetes = "^28.1.0" celery = "^5.3.6" +watchdog = {extras = ["watchmedo"], version = "^3.0.0"} [tool.poetry.group.dev.dependencies] pytest = "^7.3.1" From 4eba3a31b8a7829904ca5395dd3a7987e8bc4680 Mon Sep 17 00:00:00 2001 From: Abram Date: Wed, 20 Dec 2023 19:17:44 +0100 Subject: [PATCH 022/414] Update - modified celery_worker command in docker compose --- docker-compose.yml | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/docker-compose.yml b/docker-compose.yml index 18580f2e70..9d71d6e31d 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -22,7 +22,6 @@ services: - FEATURE_FLAG=oss - AGENTA_TEMPLATE_REPO=agentaai/templates_v2 - POSTHOG_API_KEY=phc_hmVSxIjTW1REBHXgj2aw4HW9X6CXb6FzerBgP9XenC7 - - CELERY_BROKER_URL=amqp://guest@rabbitmq// - CELERY_RESULT_BACKEND=redis://redis:6379/0 volumes: @@ -139,14 +138,14 @@ services: celery_worker: build: ./agenta-backend - command: celery -A agenta_backend.main.celery_app worker --loglevel=info + command: > + watchmedo auto-restart --directory=./agenta_backend --pattern=*.py --recursive -- celery -A agenta_backend.main.celery_app worker --concurrency=1 --loglevel=INFO environment: - MONGODB_URI=mongodb://username:password@mongo:27017 - REDIS_URL=redis://redis:6379/0 - CELERY_BROKER_URL=amqp://guest@rabbitmq// - CELERY_RESULT_BACKEND=redis://redis:6379/0 - FEATURE_FLAG=oss - volumes: - ./agenta-backend/agenta_backend:/app/agenta_backend - /var/run/docker.sock:/var/run/docker.sock From d681d7706e93b61f404cd334b9e235a028c39107 Mon Sep 17 00:00:00 2001 From: Akrem Abayed Date: Wed, 20 Dec 2023 19:30:42 +0100 Subject: [PATCH 023/414] update schema and related changes --- .../models/api/evaluation_model.py | 2 +- .../agenta_backend/models/db_models.py | 15 +++-- .../agenta_backend/services/db_manager.py | 65 +++++++++++++++++++ .../services/evaluation_service.py | 10 +-- 4 files changed, 79 insertions(+), 13 deletions(-) diff --git a/agenta-backend/agenta_backend/models/api/evaluation_model.py b/agenta-backend/agenta_backend/models/api/evaluation_model.py index 82abf384c5..e989680135 100644 --- a/agenta-backend/agenta_backend/models/api/evaluation_model.py +++ b/agenta-backend/agenta_backend/models/api/evaluation_model.py @@ -162,7 +162,7 @@ class Evaluator(BaseModel): class EvaluatorConfig(BaseModel): - evaluator: Evaluator + evaluator_key: str settings_values: Optional[Dict[str, Any]] diff --git a/agenta-backend/agenta_backend/models/db_models.py b/agenta-backend/agenta_backend/models/db_models.py index 479d699526..b1fcbf6292 100644 --- a/agenta-backend/agenta_backend/models/db_models.py 
+++ b/agenta-backend/agenta_backend/models/db_models.py @@ -220,8 +220,8 @@ class Config: class EvaluatorConfigDB(Model): - evaluator: EvaluatorDB = Reference() - settings_values: Dict[str, Any] + evaluator_key: str + settings_values: Optional[Dict[str, Any]] = None created_at: datetime = Field(default=datetime.utcnow()) updated_at: datetime = Field(default=datetime.utcnow()) @@ -230,16 +230,17 @@ class Config: class EvaluationScenarioResult(EmbeddedModel): + evaluator_key: str result: Any -class EvaluationScenarioInput(EmbeddedModel): +class EvaluationScenarioInputDB(EmbeddedModel): name: str type: str value: str -class EvaluationScenarioOutput(EmbeddedModel): +class EvaluationScenarioOutputDB(EmbeddedModel): type: str value: str @@ -249,7 +250,7 @@ class EvaluationDB(Model): organization: OrganizationDB = Reference(key_name="organization") user: UserDB = Reference(key_name="user") testset: TestSetDB = Reference() - variants: List[AppVariantDB] + variants: List[ObjectId] evaluators_configs: List[EvaluatorConfigDB] created_at: datetime = Field(default=datetime.utcnow()) updated_at: datetime = Field(default=datetime.utcnow()) @@ -262,8 +263,8 @@ class EvaluationScenarioDB(Model): user: UserDB = Reference() organization: OrganizationDB = Reference() evaluation: EvaluationDB = Reference() - inputs: List[EvaluationScenarioInput] - outputs: List[EvaluationScenarioOutput] + inputs: List[EvaluationScenarioInputDB] + outputs: List[EvaluationScenarioOutputDB] correct_answer: Optional[str] is_pinned: Optional[bool] note: Optional[str] diff --git a/agenta-backend/agenta_backend/services/db_manager.py b/agenta-backend/agenta_backend/services/db_manager.py index 261673b9ba..d95044701c 100644 --- a/agenta-backend/agenta_backend/services/db_manager.py +++ b/agenta-backend/agenta_backend/services/db_manager.py @@ -21,6 +21,10 @@ from agenta_backend.models.db_models import ( AppDB, AppVariantDB, + EvaluationScenarioInputDB, + EvaluationScenarioOutputDB, + EvaluationScenarioResult, + EvaluatorConfigDB, VariantBaseDB, ConfigDB, ConfigVersionDB, @@ -1607,3 +1611,64 @@ async def fetch_app_by_name_and_organization( ) app_db = await engine.find_one(AppDB, query_expression) return app_db + + +async def create_new_evaluation( + app: AppDB, + organization: OrganizationDB, + user: UserDB, + testset:TestSetDB, + variants:[AppVariantDB], + evaluators_configs: [EvaluatorConfigDB], +) -> EvaluationDB: + """Create a new evaluation scenario. + Returns: + EvaluationScenarioDB: The created evaluation scenario. + """ + evaluation = EvaluationDB( + app=app, + organization=organization, + user=user, + testset=testset, + variants=variants, + evaluators_configs=evaluators_configs, + created_at=datetime.now().isoformat(), + updated_at=datetime.now().isoformat(), + ) + await engine.save(evaluation) + return evaluation + + + +async def create_new_evaluation_scenario( + user: UserDB, + organization: OrganizationDB, + evaluation: EvaluationDB, + inputs: List[EvaluationScenarioInputDB], + outputs: List[EvaluationScenarioOutputDB], + correct_answer: Optional[str], + is_pinned: Optional[bool], + note: Optional[str], + evaluators_configs: List[EvaluatorConfigDB], + results: List[EvaluationScenarioResult], +) -> EvaluationScenarioDB: + """Create a new evaluation scenario. + Returns: + EvaluationScenarioDB: The created evaluation scenario. 
+ """ + evaluation_scenario = EvaluationScenarioDB( + user=user, + organization=organization, + evaluation=evaluation, + inputs=inputs, + outputs=outputs, + correct_answer=correct_answer, + is_pinned=is_pinned, + note=note, + evaluators_configs=evaluators_configs, + results=results, + created_at=datetime.utcnow(), + updated_at=datetime.utcnow(), + ) + await engine.save(evaluation_scenario) + return evaluation_scenario \ No newline at end of file diff --git a/agenta-backend/agenta_backend/services/evaluation_service.py b/agenta-backend/agenta_backend/services/evaluation_service.py index 7792d29ded..af706459a3 100644 --- a/agenta-backend/agenta_backend/services/evaluation_service.py +++ b/agenta-backend/agenta_backend/services/evaluation_service.py @@ -28,8 +28,8 @@ EvaluationScenarioDB, UserDB, AppDB, - EvaluationScenarioInput, - EvaluationScenarioOutput, + EvaluationScenarioInputDB, + EvaluationScenarioOutputDB, CustomEvaluationDB, ) @@ -217,7 +217,7 @@ async def prepare_csvdata_and_create_evaluation_scenario( # Create evaluation scenarios list_of_scenario_input = [] for scenario_input in inputs: - eval_scenario_input_instance = EvaluationScenarioInput( + eval_scenario_input_instance = EvaluationScenarioInputDB( input_name=scenario_input["input_name"], input_value=scenario_input["input_value"], ) @@ -406,7 +406,7 @@ async def update_evaluation_scenario( if updated_data["outputs"] is not None: new_outputs = [ - EvaluationScenarioOutput( + EvaluationScenarioOutputDB( variant_id=output["variant_id"], variant_output=output["variant_output"], ).dict() @@ -416,7 +416,7 @@ async def update_evaluation_scenario( if updated_data["inputs"] is not None: new_inputs = [ - EvaluationScenarioInput( + EvaluationScenarioInputDB( input_name=input_item["input_name"], input_value=input_item["input_value"], ).dict() From 772fffe23173ad784892e8834d2d5229cec40541 Mon Sep 17 00:00:00 2001 From: Akrem Abayed Date: Wed, 20 Dec 2023 19:30:58 +0100 Subject: [PATCH 024/414] update payload --- agenta-cli/agenta/client/client.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/agenta-cli/agenta/client/client.py b/agenta-cli/agenta/client/client.py index b4aad9b5fc..66d20b89ed 100644 --- a/agenta-cli/agenta/client/client.py +++ b/agenta-cli/agenta/client/client.py @@ -540,20 +540,18 @@ def run_evaluation(app_name: str, host: str, api_key: str = None) -> str: evaluators_configs = [ { - "evaluator": { - "key": "auto_similarity_match", - } + "evaluator_key": "auto_similarity_match", } ] new_evaluation = { - "app_id": "6577025e60084c599a43e51f", + "app_id": "6581e69500afd8dfe404f765", "variant_ids": [ - "6577025e60084c599a43e525", + "6581e69500afd8dfe404f76b", # "6570aed55d0eaff2293088e6" ], "evaluators_configs": evaluators_configs, - "testset_id": "6577025e60084c599a43e526", + "testset_id": "6581e69500afd8dfe404f76c", } response = requests.post( From 24917fb869fdcf6a1e63ad1974e7769ee2b774fc Mon Sep 17 00:00:00 2001 From: Akrem Abayed Date: Wed, 20 Dec 2023 19:32:10 +0100 Subject: [PATCH 025/414] add celery task to perform evaluations --- .../agenta_backend/tasks/evaluations.py | 66 +++++++++++++++++-- 1 file changed, 60 insertions(+), 6 deletions(-) diff --git a/agenta-backend/agenta_backend/tasks/evaluations.py b/agenta-backend/agenta_backend/tasks/evaluations.py index a5eccfe834..e9899e768c 100644 --- a/agenta-backend/agenta_backend/tasks/evaluations.py +++ b/agenta-backend/agenta_backend/tasks/evaluations.py @@ -1,19 +1,25 @@ +from bson import ObjectId from celery import shared_task import 
asyncio +from datetime import datetime from agenta_backend.services import llm_apps_service from agenta_backend.services.db_manager import ( + create_new_evaluation, fetch_app_variant_by_id, get_deployment_by_objectid, fetch_testset_by_id, - create_new_evaluation_scenario + create_new_evaluation_scenario, ) -from agenta_backend.models.api.evaluation_model import NewEvaluation, EvaluationScenario, EvaluationScenarioOutput +from agenta_backend.models.api.evaluation_model import NewEvaluation from agenta_backend.models.db_models import ( - AppDB + AppDB, + EvaluationScenarioOutputDB, + EvaluationScenarioResult ) -# from agenta_backend.celery_init import celery_app + +from agenta_backend.services import evaluators_service @shared_task(queue='agenta_backend.tasks.evaluations.evaluate') def evaluate(app_data, new_evaluation_data): @@ -21,7 +27,55 @@ def evaluate(app_data, new_evaluation_data): new_evaluation = NewEvaluation(**new_evaluation_data) app = AppDB(**app_data) testset = loop.run_until_complete(fetch_testset_by_id(new_evaluation.testset_id)) - print("data is ready") - print(testset) + + new_evaluation_db = loop.run_until_complete(create_new_evaluation( + app=app, + organization=app.organization, + user=app.user, + testset=testset, + variants=new_evaluation.variant_ids, + evaluators_configs=new_evaluation.evaluators_configs, + )) + + for variant_id in new_evaluation.variant_ids: + variant_id = str(variant_id) + app_variant_db = loop.run_until_complete(fetch_app_variant_by_id(variant_id)) + deployment = loop.run_until_complete(get_deployment_by_objectid(app_variant_db.base.deployment)) + + # TODO: remove if abraham's fix is working + uri = deployment.uri.replace("http://localhost", "http://host.docker.internal") + + for data_point in testset.csvdata: + variant_output = llm_apps_service.get_llm_app_output(uri, data_point) + + results:[EvaluationScenarioResult] = [] + for evaluator_config in new_evaluation.evaluators_configs: + result = evaluators_service.evaluate(evaluator_config.evaluator_key, data_point['correct_answer'], variant_output) + result_object = EvaluationScenarioResult( + evaluator_key=evaluator_config.evaluator_key, + result={ + "type": "number", + "value": result + } + ) + results.append(result_object) + + evaluation_scenario = loop.run_until_complete(create_new_evaluation_scenario( + user=app.user, + organization=app.organization, + evaluation=new_evaluation_db, + evaluators_configs=new_evaluation_db.evaluators_configs, + inputs=[], + is_pinned=False, + note="", + correct_answer=data_point['correct_answer'], + outputs=[EvaluationScenarioOutputDB( + type="text", + value=variant_output + )], + results=results + )) + print("evaluation scenario is ready") + print(evaluation_scenario) From 32e3190528b1a4097b4d31884f50025e766a50e2 Mon Sep 17 00:00:00 2001 From: Akrem Abayed Date: Wed, 20 Dec 2023 19:34:08 +0100 Subject: [PATCH 026/414] format --- .../agenta_backend/services/db_manager.py | 7 +- .../agenta_backend/tasks/evaluations.py | 73 ++++++++++--------- 2 files changed, 42 insertions(+), 38 deletions(-) diff --git a/agenta-backend/agenta_backend/services/db_manager.py b/agenta-backend/agenta_backend/services/db_manager.py index d95044701c..957674a534 100644 --- a/agenta-backend/agenta_backend/services/db_manager.py +++ b/agenta-backend/agenta_backend/services/db_manager.py @@ -1617,8 +1617,8 @@ async def create_new_evaluation( app: AppDB, organization: OrganizationDB, user: UserDB, - testset:TestSetDB, - variants:[AppVariantDB], + testset: TestSetDB, + variants: 
[AppVariantDB], evaluators_configs: [EvaluatorConfigDB], ) -> EvaluationDB: """Create a new evaluation scenario. @@ -1639,7 +1639,6 @@ async def create_new_evaluation( return evaluation - async def create_new_evaluation_scenario( user: UserDB, organization: OrganizationDB, @@ -1671,4 +1670,4 @@ async def create_new_evaluation_scenario( updated_at=datetime.utcnow(), ) await engine.save(evaluation_scenario) - return evaluation_scenario \ No newline at end of file + return evaluation_scenario diff --git a/agenta-backend/agenta_backend/tasks/evaluations.py b/agenta-backend/agenta_backend/tasks/evaluations.py index e9899e768c..0f100521d2 100644 --- a/agenta-backend/agenta_backend/tasks/evaluations.py +++ b/agenta-backend/agenta_backend/tasks/evaluations.py @@ -16,31 +16,36 @@ from agenta_backend.models.db_models import ( AppDB, EvaluationScenarioOutputDB, - EvaluationScenarioResult + EvaluationScenarioResult, ) from agenta_backend.services import evaluators_service -@shared_task(queue='agenta_backend.tasks.evaluations.evaluate') + +@shared_task(queue="agenta_backend.tasks.evaluations.evaluate") def evaluate(app_data, new_evaluation_data): loop = asyncio.get_event_loop() new_evaluation = NewEvaluation(**new_evaluation_data) app = AppDB(**app_data) testset = loop.run_until_complete(fetch_testset_by_id(new_evaluation.testset_id)) - new_evaluation_db = loop.run_until_complete(create_new_evaluation( - app=app, - organization=app.organization, - user=app.user, - testset=testset, - variants=new_evaluation.variant_ids, - evaluators_configs=new_evaluation.evaluators_configs, - )) + new_evaluation_db = loop.run_until_complete( + create_new_evaluation( + app=app, + organization=app.organization, + user=app.user, + testset=testset, + variants=new_evaluation.variant_ids, + evaluators_configs=new_evaluation.evaluators_configs, + ) + ) for variant_id in new_evaluation.variant_ids: variant_id = str(variant_id) app_variant_db = loop.run_until_complete(fetch_app_variant_by_id(variant_id)) - deployment = loop.run_until_complete(get_deployment_by_objectid(app_variant_db.base.deployment)) + deployment = loop.run_until_complete( + get_deployment_by_objectid(app_variant_db.base.deployment) + ) # TODO: remove if abraham's fix is working uri = deployment.uri.replace("http://localhost", "http://host.docker.internal") @@ -48,34 +53,34 @@ def evaluate(app_data, new_evaluation_data): for data_point in testset.csvdata: variant_output = llm_apps_service.get_llm_app_output(uri, data_point) - results:[EvaluationScenarioResult] = [] + results: [EvaluationScenarioResult] = [] for evaluator_config in new_evaluation.evaluators_configs: - result = evaluators_service.evaluate(evaluator_config.evaluator_key, data_point['correct_answer'], variant_output) + result = evaluators_service.evaluate( + evaluator_config.evaluator_key, + data_point["correct_answer"], + variant_output, + ) result_object = EvaluationScenarioResult( evaluator_key=evaluator_config.evaluator_key, - result={ - "type": "number", - "value": result - } + result={"type": "number", "value": result}, ) results.append(result_object) - evaluation_scenario = loop.run_until_complete(create_new_evaluation_scenario( - user=app.user, - organization=app.organization, - evaluation=new_evaluation_db, - evaluators_configs=new_evaluation_db.evaluators_configs, - inputs=[], - is_pinned=False, - note="", - correct_answer=data_point['correct_answer'], - outputs=[EvaluationScenarioOutputDB( - type="text", - value=variant_output - )], - results=results - )) + evaluation_scenario = 
loop.run_until_complete( + create_new_evaluation_scenario( + user=app.user, + organization=app.organization, + evaluation=new_evaluation_db, + evaluators_configs=new_evaluation_db.evaluators_configs, + inputs=[], + is_pinned=False, + note="", + correct_answer=data_point["correct_answer"], + outputs=[ + EvaluationScenarioOutputDB(type="text", value=variant_output) + ], + results=results, + ) + ) print("evaluation scenario is ready") print(evaluation_scenario) - - From 02e1e8daa3e4b652d223ba67c0c182b5fa45508b Mon Sep 17 00:00:00 2001 From: Akrem Abayed Date: Thu, 21 Dec 2023 08:49:13 +0100 Subject: [PATCH 027/414] add calculation of the aggregated results --- .../agenta_backend/tasks/evaluations.py | 42 ++++++++++++++++--- 1 file changed, 36 insertions(+), 6 deletions(-) diff --git a/agenta-backend/agenta_backend/tasks/evaluations.py b/agenta-backend/agenta_backend/tasks/evaluations.py index 0f100521d2..26f4f21d50 100644 --- a/agenta-backend/agenta_backend/tasks/evaluations.py +++ b/agenta-backend/agenta_backend/tasks/evaluations.py @@ -1,3 +1,4 @@ +from collections import defaultdict from bson import ObjectId from celery import shared_task import asyncio @@ -10,13 +11,17 @@ get_deployment_by_objectid, fetch_testset_by_id, create_new_evaluation_scenario, + update_evaluation_with_aggregated_results, ) from agenta_backend.models.api.evaluation_model import NewEvaluation from agenta_backend.models.db_models import ( + AggregatedResult, AppDB, EvaluationScenarioOutputDB, EvaluationScenarioResult, + EvaluatorConfigDB, + Result, ) from agenta_backend.services import evaluators_service @@ -39,6 +44,7 @@ def evaluate(app_data, new_evaluation_data): evaluators_configs=new_evaluation.evaluators_configs, ) ) + evaluators_aggregated_data = defaultdict(list) for variant_id in new_evaluation.variant_ids: variant_id = str(variant_id) @@ -53,18 +59,25 @@ def evaluate(app_data, new_evaluation_data): for data_point in testset.csvdata: variant_output = llm_apps_service.get_llm_app_output(uri, data_point) - results: [EvaluationScenarioResult] = [] + evaluators_results: [EvaluationScenarioResult] = [] for evaluator_config in new_evaluation.evaluators_configs: result = evaluators_service.evaluate( evaluator_config.evaluator_key, data_point["correct_answer"], variant_output, ) + result_object = EvaluationScenarioResult( evaluator_key=evaluator_config.evaluator_key, - result={"type": "number", "value": result}, + result=Result( + type="number", + value=result + ), + ) + evaluators_results.append(result_object) + evaluators_aggregated_data[evaluator_config.evaluator_key].append( + result ) - results.append(result_object) evaluation_scenario = loop.run_until_complete( create_new_evaluation_scenario( @@ -79,8 +92,25 @@ def evaluate(app_data, new_evaluation_data): outputs=[ EvaluationScenarioOutputDB(type="text", value=variant_output) ], - results=results, + results=evaluators_results, ) ) - print("evaluation scenario is ready") - print(evaluation_scenario) + + aggregated_results = aggregate_evaluator_results(evaluators_aggregated_data) + updated_evaluation = loop.run_until_complete(update_evaluation_with_aggregated_results(new_evaluation_db.id, aggregated_results)) + +def aggregate_evaluator_results(evaluators_aggregated_data): + aggregated_results = [] + for evaluator_key, values in evaluators_aggregated_data.items(): + average_value = sum(values) / len(values) if values else 0 + aggregated_result_value:AggregatedResult = AggregatedResult( + evaluator_config=EvaluatorConfigDB( + evaluator_key=evaluator_key + ), + 
result=Result( + type="number", + value=str(average_value) + ) + ) + aggregated_results.append(aggregated_result_value) + return aggregated_results From 294745619f7ecd48a597b052f3cd699df3a9554e Mon Sep 17 00:00:00 2001 From: Akrem Abayed Date: Thu, 21 Dec 2023 10:50:02 +0100 Subject: [PATCH 028/414] put evaluators back to code --- .../agenta_backend/models/api/evaluation_model.py | 4 ---- .../resources/evaluators}/evaluators.json | 14 +++++++++++++- agenta-backend/db-seed/Dockerfile | 4 ---- docker-compose.yml | 7 ------- 4 files changed, 13 insertions(+), 16 deletions(-) rename agenta-backend/{db-seed => agenta_backend/resources/evaluators}/evaluators.json (50%) delete mode 100644 agenta-backend/db-seed/Dockerfile diff --git a/agenta-backend/agenta_backend/models/api/evaluation_model.py b/agenta-backend/agenta_backend/models/api/evaluation_model.py index e989680135..357834392f 100644 --- a/agenta-backend/agenta_backend/models/api/evaluation_model.py +++ b/agenta-backend/agenta_backend/models/api/evaluation_model.py @@ -157,10 +157,6 @@ class EvaluationSettingsTemplate(BaseModel): description: str -class Evaluator(BaseModel): - key: str - - class EvaluatorConfig(BaseModel): evaluator_key: str settings_values: Optional[Dict[str, Any]] diff --git a/agenta-backend/db-seed/evaluators.json b/agenta-backend/agenta_backend/resources/evaluators/evaluators.json similarity index 50% rename from agenta-backend/db-seed/evaluators.json rename to agenta-backend/agenta_backend/resources/evaluators/evaluators.json index 51ac77ffdf..7a81798ef9 100644 --- a/agenta-backend/db-seed/evaluators.json +++ b/agenta-backend/agenta_backend/resources/evaluators/evaluators.json @@ -10,5 +10,17 @@ { "name": "Regex Test", "key": "auto_regex_test" + }, + { + "name": "AI Critique", + "key": "auto_ai_critique" + }, + { + "name": "Code Evaluation", + "key": "custom_code_run" + }, + { + "name": "Webhook test", + "key": "auto_webhook_test" } -] \ No newline at end of file +] diff --git a/agenta-backend/db-seed/Dockerfile b/agenta-backend/db-seed/Dockerfile deleted file mode 100644 index 0450d71397..0000000000 --- a/agenta-backend/db-seed/Dockerfile +++ /dev/null @@ -1,4 +0,0 @@ -FROM mongo:5.0 - -COPY evaluators.json /evaluators.json -CMD mongoimport --host mongo --username username --password password --authenticationDatabase admin --db agenta_v2 --collection evaluators --type json --file /evaluators.json --jsonArray \ No newline at end of file diff --git a/docker-compose.yml b/docker-compose.yml index 9d71d6e31d..c9b59a0946 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -95,13 +95,6 @@ services: timeout: 10s retries: 20 - mongo-seed: - build: ./agenta-backend/db-seed - depends_on: - - mongo - networks: - - agenta-network - mongo_express: image: mongo-express environment: From 6885ea09e3bc8241f9da7d42ea7d016aa470eb8a Mon Sep 17 00:00:00 2001 From: Akrem Abayed Date: Thu, 21 Dec 2023 10:50:48 +0100 Subject: [PATCH 029/414] fix mongo express --- docker-compose.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker-compose.yml b/docker-compose.yml index c9b59a0946..42cc4eefc6 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -96,7 +96,7 @@ services: retries: 20 mongo_express: - image: mongo-express + image: mongo-express:0.54.0 environment: ME_CONFIG_MONGODB_ADMINUSERNAME: username ME_CONFIG_MONGODB_ADMINPASSWORD: password From 804daa92fa10dec6ea34f617e3b05991d95e0c54 Mon Sep 17 00:00:00 2001 From: Akrem Abayed Date: Thu, 21 Dec 2023 10:51:49 +0100 Subject: [PATCH 030/414] adjust 
schemas --- .../agenta_backend/models/db_models.py | 25 ++++++++++--------- 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/agenta-backend/agenta_backend/models/db_models.py b/agenta-backend/agenta_backend/models/db_models.py index b1fcbf6292..1fdb7f7d34 100644 --- a/agenta-backend/agenta_backend/models/db_models.py +++ b/agenta-backend/agenta_backend/models/db_models.py @@ -208,18 +208,8 @@ class EvaluationSettingsTemplate(EmbeddedModel): description: str -class EvaluatorDB(Model): - name: str = Field(required=True) - key: str - settings_template: Dict[str, EvaluationSettingsTemplate] - created_at: datetime = Field(default=datetime.utcnow()) - updated_at: datetime = Field(default=datetime.utcnow()) - - class Config: - collection = "evaluators" - - class EvaluatorConfigDB(Model): + name: str evaluator_key: str settings_values: Optional[Dict[str, Any]] = None created_at: datetime = Field(default=datetime.utcnow()) @@ -229,9 +219,19 @@ class Config: collection = "evaluator_config" +class Result(EmbeddedModel): + type: str + value: str + + class EvaluationScenarioResult(EmbeddedModel): evaluator_key: str - result: Any + result: Result + + +class AggregatedResult(EmbeddedModel): + evaluator_config = EvaluatorConfigDB + result = Result class EvaluationScenarioInputDB(EmbeddedModel): @@ -252,6 +252,7 @@ class EvaluationDB(Model): testset: TestSetDB = Reference() variants: List[ObjectId] evaluators_configs: List[EvaluatorConfigDB] + aggregated_results: List[AggregatedResult] created_at: datetime = Field(default=datetime.utcnow()) updated_at: datetime = Field(default=datetime.utcnow()) From 02211a24accaa7e884da54efe6cce82d50e93d30 Mon Sep 17 00:00:00 2001 From: Akrem Abayed Date: Thu, 21 Dec 2023 10:52:16 +0100 Subject: [PATCH 031/414] update evaluation method --- .../agenta_backend/services/db_manager.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/agenta-backend/agenta_backend/services/db_manager.py b/agenta-backend/agenta_backend/services/db_manager.py index 957674a534..558e03ad41 100644 --- a/agenta-backend/agenta_backend/services/db_manager.py +++ b/agenta-backend/agenta_backend/services/db_manager.py @@ -19,6 +19,7 @@ ) from agenta_backend.services.json_importer_helper import get_json from agenta_backend.models.db_models import ( + AggregatedResult, AppDB, AppVariantDB, EvaluationScenarioInputDB, @@ -1632,6 +1633,7 @@ async def create_new_evaluation( testset=testset, variants=variants, evaluators_configs=evaluators_configs, + aggregated_results=[], created_at=datetime.now().isoformat(), updated_at=datetime.now().isoformat(), ) @@ -1671,3 +1673,18 @@ async def create_new_evaluation_scenario( ) await engine.save(evaluation_scenario) return evaluation_scenario + + +async def update_evaluation_with_aggregated_results( + evaluation_id: ObjectId, aggregated_results: List[AggregatedResult] +) -> EvaluationDB: + evaluation = await engine.find_one(EvaluationDB, EvaluationDB.id == evaluation_id) + + if not evaluation: + raise ValueError("Evaluation not found") + + evaluation.aggregated_results = aggregated_results + evaluation.updated_at = datetime.utcnow().isoformat() + + await engine.save(evaluation) + return evaluation From 6d90ea8fbc1e05ef16d34551a17744daee46f658 Mon Sep 17 00:00:00 2001 From: Akrem Abayed Date: Thu, 21 Dec 2023 10:52:44 +0100 Subject: [PATCH 032/414] format --- .../agenta_backend/tasks/evaluations.py | 23 ++++++++----------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/agenta-backend/agenta_backend/tasks/evaluations.py 
b/agenta-backend/agenta_backend/tasks/evaluations.py index 26f4f21d50..5ded991ca8 100644 --- a/agenta-backend/agenta_backend/tasks/evaluations.py +++ b/agenta-backend/agenta_backend/tasks/evaluations.py @@ -69,10 +69,7 @@ def evaluate(app_data, new_evaluation_data): result_object = EvaluationScenarioResult( evaluator_key=evaluator_config.evaluator_key, - result=Result( - type="number", - value=result - ), + result=Result(type="number", value=result), ) evaluators_results.append(result_object) evaluators_aggregated_data[evaluator_config.evaluator_key].append( @@ -97,20 +94,20 @@ def evaluate(app_data, new_evaluation_data): ) aggregated_results = aggregate_evaluator_results(evaluators_aggregated_data) - updated_evaluation = loop.run_until_complete(update_evaluation_with_aggregated_results(new_evaluation_db.id, aggregated_results)) + updated_evaluation = loop.run_until_complete( + update_evaluation_with_aggregated_results( + new_evaluation_db.id, aggregated_results + ) + ) + def aggregate_evaluator_results(evaluators_aggregated_data): aggregated_results = [] for evaluator_key, values in evaluators_aggregated_data.items(): average_value = sum(values) / len(values) if values else 0 - aggregated_result_value:AggregatedResult = AggregatedResult( - evaluator_config=EvaluatorConfigDB( - evaluator_key=evaluator_key - ), - result=Result( - type="number", - value=str(average_value) - ) + aggregated_result_value: AggregatedResult = AggregatedResult( + evaluator_config=EvaluatorConfigDB(evaluator_key=evaluator_key), + result=Result(type="number", value=str(average_value)), ) aggregated_results.append(aggregated_result_value) return aggregated_results From 73cce3dd688e541c66c79f13604406bb4f31d589 Mon Sep 17 00:00:00 2001 From: Akrem Abayed Date: Thu, 21 Dec 2023 12:07:59 +0100 Subject: [PATCH 033/414] add generated name --- .../agenta_backend/tasks/evaluations.py | 24 ++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/agenta-backend/agenta_backend/tasks/evaluations.py b/agenta-backend/agenta_backend/tasks/evaluations.py index 5ded991ca8..dbec3dda8e 100644 --- a/agenta-backend/agenta_backend/tasks/evaluations.py +++ b/agenta-backend/agenta_backend/tasks/evaluations.py @@ -3,6 +3,8 @@ from celery import shared_task import asyncio from datetime import datetime +from typing import List +import uuid from agenta_backend.services import llm_apps_service from agenta_backend.services.db_manager import ( @@ -13,7 +15,7 @@ create_new_evaluation_scenario, update_evaluation_with_aggregated_results, ) -from agenta_backend.models.api.evaluation_model import NewEvaluation +from agenta_backend.models.api.evaluation_model import EvaluatorConfig, NewEvaluation from agenta_backend.models.db_models import ( AggregatedResult, @@ -32,6 +34,12 @@ def evaluate(app_data, new_evaluation_data): loop = asyncio.get_event_loop() new_evaluation = NewEvaluation(**new_evaluation_data) app = AppDB(**app_data) + + # This will generate a name in case it's run from cli + new_evaluation.evaluators_configs = process_evaluators_configs( + new_evaluation.evaluators_configs + ) + testset = loop.run_until_complete(fetch_testset_by_id(new_evaluation.testset_id)) new_evaluation_db = loop.run_until_complete( @@ -101,6 +109,20 @@ def evaluate(app_data, new_evaluation_data): ) +def process_evaluators_configs( + evaluators_configs: List[EvaluatorConfig], +) -> List[EvaluatorConfigDB]: + """Process evaluators_configs to include names if missing.""" + processed_configs = [] + for config in evaluators_configs: + config_dict = 
config.dict() + if "name" not in config_dict: + config_dict["name"] = f"Evaluator_{uuid.uuid4()}" # Generate a random name + processed_config = EvaluatorConfigDB(**config_dict) + processed_configs.append(processed_config) + return processed_configs + + def aggregate_evaluator_results(evaluators_aggregated_data): aggregated_results = [] for evaluator_key, values in evaluators_aggregated_data.items(): From d2b5fab8a22bbfb082a473074e1b59f5ac4812b9 Mon Sep 17 00:00:00 2001 From: Akrem Abayed Date: Thu, 21 Dec 2023 13:12:46 +0100 Subject: [PATCH 034/414] add evaluator name in aggrefated results --- .../agenta_backend/tasks/evaluations.py | 25 ++++++++++++------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/agenta-backend/agenta_backend/tasks/evaluations.py b/agenta-backend/agenta_backend/tasks/evaluations.py index dbec3dda8e..83007b81cc 100644 --- a/agenta-backend/agenta_backend/tasks/evaluations.py +++ b/agenta-backend/agenta_backend/tasks/evaluations.py @@ -3,7 +3,7 @@ from celery import shared_task import asyncio from datetime import datetime -from typing import List +from typing import List, Tuple, Dict import uuid from agenta_backend.services import llm_apps_service @@ -36,7 +36,7 @@ def evaluate(app_data, new_evaluation_data): app = AppDB(**app_data) # This will generate a name in case it's run from cli - new_evaluation.evaluators_configs = process_evaluators_configs( + new_evaluation.evaluators_configs, evaluator_key_name_mapping = process_evaluators_configs( new_evaluation.evaluators_configs ) @@ -101,7 +101,8 @@ def evaluate(app_data, new_evaluation_data): ) ) - aggregated_results = aggregate_evaluator_results(evaluators_aggregated_data) + aggregated_results = aggregate_evaluator_results(evaluators_aggregated_data, evaluator_key_name_mapping) + updated_evaluation = loop.run_until_complete( update_evaluation_with_aggregated_results( new_evaluation_db.id, aggregated_results @@ -111,25 +112,31 @@ def evaluate(app_data, new_evaluation_data): def process_evaluators_configs( evaluators_configs: List[EvaluatorConfig], -) -> List[EvaluatorConfigDB]: - """Process evaluators_configs to include names if missing.""" +) -> Tuple[List[EvaluatorConfigDB], Dict[str, str]]: + """Process evaluators_configs to include names if missing and return a mapping of evaluator keys to names.""" processed_configs = [] + evaluator_key_name_mapping = {} for config in evaluators_configs: config_dict = config.dict() if "name" not in config_dict: config_dict["name"] = f"Evaluator_{uuid.uuid4()}" # Generate a random name processed_config = EvaluatorConfigDB(**config_dict) processed_configs.append(processed_config) - return processed_configs + evaluator_key_name_mapping[config_dict["evaluator_key"]] = config_dict["name"] + return processed_configs, evaluator_key_name_mapping -def aggregate_evaluator_results(evaluators_aggregated_data): +def aggregate_evaluator_results(evaluators_aggregated_data, evaluator_key_name_mapping): aggregated_results = [] for evaluator_key, values in evaluators_aggregated_data.items(): average_value = sum(values) / len(values) if values else 0 + evaluator_name = evaluator_key_name_mapping.get(evaluator_key, "Unknown Evaluator") aggregated_result_value: AggregatedResult = AggregatedResult( - evaluator_config=EvaluatorConfigDB(evaluator_key=evaluator_key), + evaluator_config=EvaluatorConfigDB( + name=evaluator_name, + evaluator_key=evaluator_key + ), result=Result(type="number", value=str(average_value)), ) aggregated_results.append(aggregated_result_value) - return 
aggregated_results + return aggregated_results \ No newline at end of file From 91a1d3d96e576c7fd21727706c39b5ab7c8b2436 Mon Sep 17 00:00:00 2001 From: MohammedMaaz Date: Thu, 21 Dec 2023 17:39:44 +0500 Subject: [PATCH 035/414] eval v2 types | eval results and evaluators ui with mocking --- agenta-web/package-lock.json | 1 + agenta-web/package.json | 1 + agenta-web/src/components/Sidebar/Sidebar.tsx | 22 +++ .../components/TestSetTable/TestsetTable.tsx | 2 - .../evaluationResults/EvaluationResults.tsx | 137 +++++++++++++++ .../evaluations/evaluationResults/mock.ts | 160 ++++++++++++++++++ .../evaluations/evaluators/EvaluatorCard.tsx | 75 ++++++++ .../evaluations/evaluators/Evaluators.tsx | 56 ++++++ agenta-web/src/lib/Types.ts | 67 ++++++++ agenta-web/src/lib/helpers/utils.ts | 12 ++ agenta-web/src/pages/_app.tsx | 2 + .../apps/[app_id]/evaluations-new/index.tsx | 37 ++++ 12 files changed, 570 insertions(+), 2 deletions(-) create mode 100644 agenta-web/src/components/pages/evaluations/evaluationResults/EvaluationResults.tsx create mode 100644 agenta-web/src/components/pages/evaluations/evaluationResults/mock.ts create mode 100644 agenta-web/src/components/pages/evaluations/evaluators/EvaluatorCard.tsx create mode 100644 agenta-web/src/components/pages/evaluations/evaluators/Evaluators.tsx create mode 100644 agenta-web/src/pages/apps/[app_id]/evaluations-new/index.tsx diff --git a/agenta-web/package-lock.json b/agenta-web/package-lock.json index a6bedc7d31..da84a9d2c8 100644 --- a/agenta-web/package-lock.json +++ b/agenta-web/package-lock.json @@ -33,6 +33,7 @@ "autoprefixer": "10.4.14", "axios": "^1.4.0", "classnames": "^2.3.2", + "dayjs": "^1.11.10", "dotenv": "^16.3.1", "eslint": "8.39.0", "eslint-config-next": "13.3.4", diff --git a/agenta-web/package.json b/agenta-web/package.json index cca57b5be6..b4d786e016 100644 --- a/agenta-web/package.json +++ b/agenta-web/package.json @@ -44,6 +44,7 @@ "autoprefixer": "10.4.14", "axios": "^1.4.0", "classnames": "^2.3.2", + "dayjs": "^1.11.10", "dotenv": "^16.3.1", "eslint": "8.39.0", "eslint-config-next": "13.3.4", diff --git a/agenta-web/src/components/Sidebar/Sidebar.tsx b/agenta-web/src/components/Sidebar/Sidebar.tsx index 58e0f717a0..e35994fc37 100644 --- a/agenta-web/src/components/Sidebar/Sidebar.tsx +++ b/agenta-web/src/components/Sidebar/Sidebar.tsx @@ -271,6 +271,28 @@ const Sidebar: React.FC = () => { + + }> + + {collapsed + ? "Perform 1-to-1 variant comparisons on testsets to identify superior options." 
+ : "Evaluate New"} + + + + div:nth-of-type(1)": { + width: 16, + height: 16, + borderRadius: "50%", + backgroundColor: "#52c41a", + }, + }, + dot: { + width: 3, + height: 3, + borderRadius: "50%", + backgroundColor: "#444", + }, +}) + +interface Props {} + +const EvaluationResults: React.FC = () => { + const {appTheme} = useAppTheme() + const classes = useStyles() + const [rowData, setRowData] = useState<_Evaluation[]>(Mock.evaluations) + + const evaluatorConfigs = useMemo( + () => + uniqBy( + rowData + .map((item) => item.aggregated_results.map((item) => item.evaluator_config)) + .flat(), + "id", + ), + [rowData], + ) + + const [colDefs, setColDefs] = useState[]>([ + {field: "testset.name"}, + { + field: "variants", + valueGetter: (params) => params.data?.variants[0].variantName, + headerName: "Variant", + }, + ...evaluatorConfigs.map( + (config) => + ({ + field: "aggregated_results", + headerComponent: () => ( + + {config.name} + + ), + valueGetter: (params) => + params.data?.aggregated_results.find( + (item) => item.evaluator_config.id === config.id, + )?.result?.value || "", + }) as ColDef<_Evaluation>, + ), + { + field: "status", + cellRenderer: (params: ICellRendererParams) => { + const classes = useStyles() + + return ( +
+
+
{capitalize(params.value)}
+ + {(params.data?.duration || 0) / 1000} +
+ ) + }, + }, + { + field: "created_at", + headerName: "Created", + valueFormatter: (params) => dayjs(params.value).fromNow(), + }, + ]) + + return ( +
+ + + + + +
+ + rowData={rowData} + columnDefs={colDefs} + getRowId={(params) => params.data.id} + /> +
+
+ ) +} + +export default EvaluationResults diff --git a/agenta-web/src/components/pages/evaluations/evaluationResults/mock.ts b/agenta-web/src/components/pages/evaluations/evaluationResults/mock.ts new file mode 100644 index 0000000000..bc86328bb2 --- /dev/null +++ b/agenta-web/src/components/pages/evaluations/evaluationResults/mock.ts @@ -0,0 +1,160 @@ +import { + EvaluationSettingsTemplate, + Evaluator, + EvaluatorConfig, + Org, + TestSet, + User, + Variant, + _Evaluation, +} from "@/lib/Types" +import exactMatchImg from "@/media/target.png" +import similarityImg from "@/media/transparency.png" +import regexImg from "@/media/programming.png" +import webhookImg from "@/media/link.png" +import aiImg from "@/media/artificial-intelligence.png" +import codeImg from "@/media/browser.png" +import {PresetColors} from "antd/es/theme/internal" +import {stringToNumberInRange} from "@/lib/helpers/utils" + +const organizations: Org[] = [ + { + id: "org1", + name: "Organization 1", + description: "This is the description of organization 1", + owner: "user1", + }, +] + +const users: User[] = [ + { + id: "user1", + uid: "user1", + username: "user1", + email: "user1@test.com", + }, +] + +const testsets: TestSet[] = [ + { + id: "testset1", + name: "Test Set 1", + created_at: "2021-01-01T00:00:00.000Z", + updated_at: "2021-01-01T00:00:00.000Z", + csvdata: [], + }, +] + +const variants: Variant[] = [ + { + variantName: "variant1", + templateVariantName: "variant1", + persistent: false, + parameters: {}, + previousVariantName: null, + variantId: "variant1", + baseId: "variant1", + baseName: "variant1", + configId: "config1", + configName: "config1", + }, +] + +const evaluatorSettinsTemplates: EvaluationSettingsTemplate[] = [ + { + type: "number", + default: 0.5, + description: "Threshold for similarity matching", + }, +] + +const evaluators: Evaluator[] = [ + { + name: "Exact Match", + key: "auto_exact_match", + settings_template: {}, + icon_url: exactMatchImg, + }, + { + name: "Similarity", + key: "similarity", + settings_template: { + similarity_threshold: evaluatorSettinsTemplates[0], + }, + icon_url: similarityImg, + }, + { + name: "Regex Test", + key: "auto_regex_test", + settings_template: {}, + icon_url: regexImg, + }, + { + name: "AI Critique", + key: "auto_ai_critique", + settings_template: {}, + icon_url: aiImg, + }, + { + name: "Code Evaluation", + key: "custom_code_run", + settings_template: {}, + icon_url: codeImg, + }, + { + name: "Webhook test", + key: "auto_webhook_test", + settings_template: {}, + icon_url: webhookImg, + }, +].map((item) => ({ + ...(item as Evaluator), + color: PresetColors[stringToNumberInRange(item.key, 0, PresetColors.length - 1)], +})) + +const evaluatorConfigs: EvaluatorConfig[] = [ + { + evaluator_key: "similarity", + name: "Nearly Similar", + settings_values: { + similarity_threshold: 0.4, + }, + created_at: "2021-01-01T00:00:00.000Z", + id: "config1", + }, +] + +const evaluations: _Evaluation[] = [ + { + id: "evaluation1", + organization: organizations[0], + user: users[0], + testset: testsets[0], + status: "completed", + variants: [variants[0]], + aggregated_results: [ + { + evaluator_config: evaluatorConfigs[0], + result: { + type: "number", + value: 32.5, + }, + }, + ], + created_at: "2021-01-01T00:00:00.000Z", + duration: 50000, + }, +] + +const Mock = { + organizations, + users, + testsets, + variants, + evaluatorSettinsTemplates, + evaluators, + evaluatorConfigs, + evaluations, +} + +export default Mock diff --git 
a/agenta-web/src/components/pages/evaluations/evaluators/EvaluatorCard.tsx b/agenta-web/src/components/pages/evaluations/evaluators/EvaluatorCard.tsx new file mode 100644 index 0000000000..bc83bc550a --- /dev/null +++ b/agenta-web/src/components/pages/evaluations/evaluators/EvaluatorCard.tsx @@ -0,0 +1,75 @@ +import React from "react" +import {EvaluatorConfig} from "@/lib/Types" +import {DeleteOutlined, EditOutlined} from "@ant-design/icons" +import {Card, Tag, Typography} from "antd" +import {createUseStyles} from "react-jss" +import Mock from "../evaluationResults/mock" +import dayjs from "dayjs" +import Image from "next/image" +import {useAppTheme} from "@/components/Layout/ThemeContextProvider" + +type StyleProps = { + themeMode: "dark" | "light" +} + +const useStyles = createUseStyles({ + body: { + display: "flex", + flexDirection: "column", + alignItems: "center", + }, + headerRow: { + display: "flex", + alignItems: "center", + alignSelf: "stretch", + justifyContent: "space-between", + marginBottom: "1.5rem", + }, + evaluationImg: ({themeMode}: StyleProps) => ({ + width: 27, + height: 27, + marginRight: "8px", + filter: themeMode === "dark" ? "invert(1)" : "none", + }), + name: { + marginTop: "0.25rem", + marginBottom: 0, + }, +}) + +interface Props { + evaluatorConfig: EvaluatorConfig +} + +const EvaluatorCard: React.FC = ({evaluatorConfig}) => { + const {appTheme} = useAppTheme() + const classes = useStyles({themeMode: appTheme} as StyleProps) + const evaluator = Mock.evaluators.find((item) => item.key === evaluatorConfig.evaluator_key)! + + return ( + , ]}> +
+
+ + {dayjs(evaluatorConfig.created_at).format("DD MMM YY")} + + {evaluator.name} +
+ + {evaluator.icon_url && ( + Exact match + )} + + + {evaluatorConfig.name} + +
+
+ ) +} + +export default EvaluatorCard diff --git a/agenta-web/src/components/pages/evaluations/evaluators/Evaluators.tsx b/agenta-web/src/components/pages/evaluations/evaluators/Evaluators.tsx new file mode 100644 index 0000000000..08f995db6f --- /dev/null +++ b/agenta-web/src/components/pages/evaluations/evaluators/Evaluators.tsx @@ -0,0 +1,56 @@ +import React, {useState} from "react" +import {createUseStyles} from "react-jss" +import Mock from "../evaluationResults/mock" +import EvaluatorCard from "./EvaluatorCard" +import {Button, Space} from "antd" +import {PlusCircleOutlined} from "@ant-design/icons" +import {pickRandom} from "@/lib/helpers/utils" +import {EvaluatorConfig} from "@/lib/Types" + +const useStyles = createUseStyles({ + root: { + display: "flex", + flexDirection: "column", + gap: "1rem", + }, + buttonsGroup: { + alignSelf: "flex-end", + }, + grid: { + display: "grid", + gridTemplateColumns: "repeat(auto-fit, minmax(240px, 1fr))", + gap: "1rem", + }, +}) + +interface Props {} + +const Evaluators: React.FC = () => { + const classes = useStyles() + const [evaluatorConfigs, setEvaluatorConfigs] = useState( + pickRandom(Mock.evaluators, 7).map((item, ix) => ({ + evaluator_key: item.key, + id: ix + "", + name: `Evaluator ${ix}`, + settings_values: {}, + created_at: new Date().toString(), + })), + ) + + return ( +
+ + + +
+ {evaluatorConfigs.map((item) => ( + + ))} +
+
+ ) +} + +export default Evaluators diff --git a/agenta-web/src/lib/Types.ts b/agenta-web/src/lib/Types.ts index 50c4a15d19..c25943d6fa 100644 --- a/agenta-web/src/lib/Types.ts +++ b/agenta-web/src/lib/Types.ts @@ -1,3 +1,4 @@ +import {StaticImageData} from "next/image" import {EvaluationFlow, EvaluationType} from "./enums" export interface testset { @@ -288,3 +289,69 @@ export type ChatMessage = { content: string id?: string } + +type ValueType = number | string | boolean | GenericObject | null + +//evaluation revamp types +export interface EvaluationSettingsTemplate { + type: string + default: ValueType + description: string +} + +export interface Evaluator { + name: string + key: string + settings_template: Record + icon_url?: string | StaticImageData + color?: string +} + +export interface EvaluatorConfig { + id: string + evaluator_key: string + name: string + settings_values: Record + created_at: string +} + +export interface TypedValue { + type: string + value: ValueType +} + +export interface EvaluationScenarioResult { + evaluator: Evaluator + result: TypedValue +} + +export interface _Evaluation { + id: string + organization: Org + user: User + testset: TestSet + status: "completed" | "failed" | "pending" + variants: Variant[] + aggregated_results: { + evaluator_config: EvaluatorConfig + result: TypedValue + }[] + created_at?: string + duration?: number +} + +export interface _EvaluationScenario { + id: string + user: User + organization: Org + evaluation: _Evaluation + inputs: (TypedValue & {name: string})[] + outputs: TypedValue[] + correct_answer?: string + created_at?: Date + updated_at?: Date + is_pinned?: boolean + note?: string + evaluators_configs: EvaluatorConfig[] + results: EvaluationResult[] +} diff --git a/agenta-web/src/lib/helpers/utils.ts b/agenta-web/src/lib/helpers/utils.ts index 79aa0e82ed..3e11530cf1 100644 --- a/agenta-web/src/lib/helpers/utils.ts +++ b/agenta-web/src/lib/helpers/utils.ts @@ -313,3 +313,15 @@ export const shortPoll = async ( await delay(delayMs) } } + +export function pickRandom(arr: T[], len: number) { + const result: T[] = [] + const length = arr.length + + for (let i = 0; i < len; i++) { + const randomIndex = Math.floor(Math.random() * length) + result.push(arr[randomIndex]) + } + + return result +} diff --git a/agenta-web/src/pages/_app.tsx b/agenta-web/src/pages/_app.tsx index 4e3f89b94c..6c6ae23a0e 100644 --- a/agenta-web/src/pages/_app.tsx +++ b/agenta-web/src/pages/_app.tsx @@ -10,6 +10,8 @@ import Layout from "@/components/Layout/Layout" import ThemeContextProvider from "@/components/Layout/ThemeContextProvider" import AppContextProvider from "@/contexts/app.context" import ProfileContextProvider from "@/contexts/profile.context" +import "ag-grid-community/styles/ag-grid.css" +import "ag-grid-community/styles/ag-theme-alpine.css" // Initialize the Posthog client if (typeof window !== "undefined") { diff --git a/agenta-web/src/pages/apps/[app_id]/evaluations-new/index.tsx b/agenta-web/src/pages/apps/[app_id]/evaluations-new/index.tsx new file mode 100644 index 0000000000..d0fcc7cefa --- /dev/null +++ b/agenta-web/src/pages/apps/[app_id]/evaluations-new/index.tsx @@ -0,0 +1,37 @@ +import EvaluationResults from "@/components/pages/evaluations/evaluationResults/EvaluationResults" +import Evaluators from "@/components/pages/evaluations/evaluators/Evaluators" +import {useQueryParam} from "@/hooks/useQuery" +import {SlidersOutlined, UnorderedListOutlined} from "@ant-design/icons" +import {Tabs} from "antd" +import React from "react" + 
+interface Props {} + +const Evaluations: React.FC = () => { + const [tab, setTab] = useQueryParam("tab", "results") + + return ( +
+ , + children: , + }, + { + key: "evaluators", + label: "Evaluators", + icon: , + children: , + }, + ]} + onChange={setTab} + /> +
+ ) +} + +export default Evaluations From 5b48105eede04de8ccfc915138bbe75007e207ac Mon Sep 17 00:00:00 2001 From: Abram Date: Thu, 21 Dec 2023 11:38:12 +0100 Subject: [PATCH 036/414] Update - added new enum to EvaluationStatus and created EvaluationScenarioStatus enum model --- .../agenta_backend/models/api/evaluation_model.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/agenta-backend/agenta_backend/models/api/evaluation_model.py b/agenta-backend/agenta_backend/models/api/evaluation_model.py index 357834392f..8639879a81 100644 --- a/agenta-backend/agenta_backend/models/api/evaluation_model.py +++ b/agenta-backend/agenta_backend/models/api/evaluation_model.py @@ -29,8 +29,12 @@ class EvaluationType(str, Enum): class EvaluationStatusEnum(str, Enum): EVALUATION_INITIALIZED = "EVALUATION_INITIALIZED" EVALUATION_STARTED = "EVALUATION_STARTED" - COMPARISON_RUN_STARTED = "COMPARISON_RUN_STARTED" EVALUATION_FINISHED = "EVALUATION_FINISHED" + EVALUATION_ERROR = "EVALUATION_ERROR" + + +class EvaluationScenarioStatusEnum(str, Enum): + COMPARISON_RUN_STARTED = "COMPARISON_RUN_STARTED" class Evaluation(BaseModel): From a147497928a8fb8412dc979ebb73fcc1600e090e Mon Sep 17 00:00:00 2001 From: Abram Date: Thu, 21 Dec 2023 12:04:51 +0100 Subject: [PATCH 037/414] Update - added status to Evaluation db model --- agenta-backend/agenta_backend/models/db_models.py | 1 + 1 file changed, 1 insertion(+) diff --git a/agenta-backend/agenta_backend/models/db_models.py b/agenta-backend/agenta_backend/models/db_models.py index 1fdb7f7d34..4a99db11ce 100644 --- a/agenta-backend/agenta_backend/models/db_models.py +++ b/agenta-backend/agenta_backend/models/db_models.py @@ -249,6 +249,7 @@ class EvaluationDB(Model): app: AppDB = Reference(key_name="app") organization: OrganizationDB = Reference(key_name="organization") user: UserDB = Reference(key_name="user") + status: str testset: TestSetDB = Reference() variants: List[ObjectId] evaluators_configs: List[EvaluatorConfigDB] From 4cd4ba18ff0c040de8dbb6102330cea68a5391bd Mon Sep 17 00:00:00 2001 From: Abram Date: Thu, 21 Dec 2023 13:18:55 +0100 Subject: [PATCH 038/414] Update - added default status to evaluation --- agenta-backend/agenta_backend/models/db_models.py | 2 +- agenta-backend/agenta_backend/services/db_manager.py | 2 ++ agenta-backend/agenta_backend/tasks/evaluations.py | 3 ++- 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/agenta-backend/agenta_backend/models/db_models.py b/agenta-backend/agenta_backend/models/db_models.py index 4a99db11ce..68af9039e5 100644 --- a/agenta-backend/agenta_backend/models/db_models.py +++ b/agenta-backend/agenta_backend/models/db_models.py @@ -249,7 +249,7 @@ class EvaluationDB(Model): app: AppDB = Reference(key_name="app") organization: OrganizationDB = Reference(key_name="organization") user: UserDB = Reference(key_name="user") - status: str + status: str = Field(default="EVALUATION_INITIALIZED") testset: TestSetDB = Reference() variants: List[ObjectId] evaluators_configs: List[EvaluatorConfigDB] diff --git a/agenta-backend/agenta_backend/services/db_manager.py b/agenta-backend/agenta_backend/services/db_manager.py index 558e03ad41..ea4bb88a76 100644 --- a/agenta-backend/agenta_backend/services/db_manager.py +++ b/agenta-backend/agenta_backend/services/db_manager.py @@ -1619,6 +1619,7 @@ async def create_new_evaluation( organization: OrganizationDB, user: UserDB, testset: TestSetDB, + status: str, variants: [AppVariantDB], evaluators_configs: [EvaluatorConfigDB], ) -> EvaluationDB: @@ -1631,6 
+1632,7 @@ async def create_new_evaluation( organization=organization, user=user, testset=testset, + status=status, variants=variants, evaluators_configs=evaluators_configs, aggregated_results=[], diff --git a/agenta-backend/agenta_backend/tasks/evaluations.py b/agenta-backend/agenta_backend/tasks/evaluations.py index 83007b81cc..5f7bc25ed5 100644 --- a/agenta-backend/agenta_backend/tasks/evaluations.py +++ b/agenta-backend/agenta_backend/tasks/evaluations.py @@ -15,7 +15,7 @@ create_new_evaluation_scenario, update_evaluation_with_aggregated_results, ) -from agenta_backend.models.api.evaluation_model import EvaluatorConfig, NewEvaluation +from agenta_backend.models.api.evaluation_model import EvaluatorConfig, NewEvaluation, EvaluationStatusEnum from agenta_backend.models.db_models import ( AggregatedResult, @@ -48,6 +48,7 @@ def evaluate(app_data, new_evaluation_data): organization=app.organization, user=app.user, testset=testset, + status=EvaluationStatusEnum.EVALUATION_STARTED, variants=new_evaluation.variant_ids, evaluators_configs=new_evaluation.evaluators_configs, ) From ecf3d1dd3fa9d7873bb56626134466bffa9ed57e Mon Sep 17 00:00:00 2001 From: Abram Date: Thu, 21 Dec 2023 14:56:19 +0100 Subject: [PATCH 039/414] Refactor - modified create_new_evaluation logic --- .../services/evaluation_service.py | 107 ++++++------------ 1 file changed, 37 insertions(+), 70 deletions(-) diff --git a/agenta-backend/agenta_backend/services/evaluation_service.py b/agenta-backend/agenta_backend/services/evaluation_service.py index af706459a3..abfd02374f 100644 --- a/agenta-backend/agenta_backend/services/evaluation_service.py +++ b/agenta-backend/agenta_backend/services/evaluation_service.py @@ -17,6 +17,7 @@ EvaluationScenarioUpdate, CreateCustomEvaluation, EvaluationUpdate, + EvaluationStatusEnum ) from agenta_backend.models import converters from agenta_backend.utils.common import engine, check_access_to_app @@ -105,76 +106,6 @@ async def _fetch_evaluation_scenario_and_check_access( return evaluation_scenario -async def create_new_evaluation( - payload: NewEvaluation, **user_org_data: dict -) -> EvaluationDB: - """ - Create a new evaluation based on the provided payload and additional arguments. - - Args: - payload (NewEvaluation): The evaluation payload. - **user_org_data (dict): Additional keyword arguments, e.g., user id. 
- - Returns: - EvaluationDB - """ - user = await get_user(user_uid=user_org_data["uid"]) - - # Initialize evaluation type settings - settings = payload.evaluation_type_settings - evaluation_type_settings = EvaluationTypeSettings( - similarity_threshold=settings.similarity_threshold or 0.0, - regex_pattern=settings.regex_pattern or "", - regex_should_match=settings.regex_should_match or True, - webhook_url=settings.webhook_url or "", - custom_code_evaluation_id=settings.custom_code_evaluation_id or "", - llm_app_prompt_template=settings.llm_app_prompt_template or "", - ) - - current_time = datetime.utcnow() - - # Fetch app - app = await db_manager.fetch_app_by_id(app_id=payload.app_id) - if app is None: - raise HTTPException( - status_code=404, - detail=f"App with id {payload.app_id} does not exist", - ) - - variants = [ObjectId(variant_id) for variant_id in payload.variant_ids] - - testset = await db_manager.fetch_testset_by_id(testset_id=payload.testset_id) - # Initialize and save evaluation instance to database - eval_instance = EvaluationDB( - app=app, - organization=app.organization, # Assuming user has an organization_id attribute - user=user, - status=payload.status, - evaluation_type=payload.evaluation_type, - evaluation_type_settings=evaluation_type_settings, - variants=variants, - testset=testset, - created_at=current_time, - updated_at=current_time, - ) - newEvaluation = await engine.save(eval_instance) - - if newEvaluation is None: - raise HTTPException( - status_code=500, detail="Failed to create evaluation_scenario" - ) - - await prepare_csvdata_and_create_evaluation_scenario( - testset.csvdata, - payload.inputs, - payload.evaluation_type, - newEvaluation, - user, - app, - ) - return newEvaluation - - async def prepare_csvdata_and_create_evaluation_scenario( csvdata: List[Dict[str, str]], payload_inputs: List[str], @@ -916,3 +847,39 @@ async def fetch_custom_evaluation_names( ) ) return list_of_custom_eval_names + + +async def create_new_evaluation( + app_data: dict, new_evaluation_data: dict +) -> Evaluation: + """ + Create a new evaluation based on the provided payload and additional arguments. + + Args: + payload (NewEvaluation): The evaluation payload. + **user_org_data (dict): Additional keyword arguments, e.g., user id. 
+ + Returns: + Evaluation + """ + + from agenta_backend.tasks.evaluations import process_evaluators_configs + + new_evaluation = NewEvaluation(**new_evaluation_data) + app = AppDB(**app_data) + + # This will generate a name in case it's run from cli + new_evaluation.evaluators_configs = process_evaluators_configs( + new_evaluation.evaluators_configs + ) + testset = await db_manager.fetch_testset_by_id(new_evaluation.testset_id) + evaluation_db = await db_manager.create_new_evaluation( + app=app, + organization=app.organization, + user=app.user, + testset=testset, + status=EvaluationStatusEnum.EVALUATION_STARTED, + variants=new_evaluation.variant_ids, + evaluators_configs=new_evaluation.evaluators_configs, + ) + return converters.evaluation_db_to_pydantic(evaluation_db) From 7f566a5b80333bec29ee8a8f7d97950d89593d46 Mon Sep 17 00:00:00 2001 From: Abram Date: Thu, 21 Dec 2023 14:56:58 +0100 Subject: [PATCH 040/414] Update - modified evaluation shared_task logic --- .../agenta_backend/tasks/evaluations.py | 22 ++++++------------- 1 file changed, 7 insertions(+), 15 deletions(-) diff --git a/agenta-backend/agenta_backend/tasks/evaluations.py b/agenta-backend/agenta_backend/tasks/evaluations.py index 5f7bc25ed5..b958422a57 100644 --- a/agenta-backend/agenta_backend/tasks/evaluations.py +++ b/agenta-backend/agenta_backend/tasks/evaluations.py @@ -9,6 +9,7 @@ from agenta_backend.services import llm_apps_service from agenta_backend.services.db_manager import ( create_new_evaluation, + fetch_evaluation_by_id, fetch_app_variant_by_id, get_deployment_by_objectid, fetch_testset_by_id, @@ -30,28 +31,19 @@ @shared_task(queue="agenta_backend.tasks.evaluations.evaluate") -def evaluate(app_data, new_evaluation_data): +def evaluate(app_data, new_evaluation_data, evaluation_id: str, testset_id: str): loop = asyncio.get_event_loop() new_evaluation = NewEvaluation(**new_evaluation_data) app = AppDB(**app_data) - # This will generate a name in case it's run from cli - new_evaluation.evaluators_configs, evaluator_key_name_mapping = process_evaluators_configs( + # NOTE: This will generate a name in case it's run from cli + evaluation_evaluators_configs, evaluator_key_name_mapping = process_evaluators_configs( new_evaluation.evaluators_configs ) - testset = loop.run_until_complete(fetch_testset_by_id(new_evaluation.testset_id)) - + testset = loop.run_until_complete(fetch_testset_by_id(testset_id)) new_evaluation_db = loop.run_until_complete( - create_new_evaluation( - app=app, - organization=app.organization, - user=app.user, - testset=testset, - status=EvaluationStatusEnum.EVALUATION_STARTED, - variants=new_evaluation.variant_ids, - evaluators_configs=new_evaluation.evaluators_configs, - ) + fetch_evaluation_by_id(evaluation_id) ) evaluators_aggregated_data = defaultdict(list) @@ -69,7 +61,7 @@ def evaluate(app_data, new_evaluation_data): variant_output = llm_apps_service.get_llm_app_output(uri, data_point) evaluators_results: [EvaluationScenarioResult] = [] - for evaluator_config in new_evaluation.evaluators_configs: + for evaluator_config in evaluation_evaluators_configs: result = evaluators_service.evaluate( evaluator_config.evaluator_key, data_point["correct_answer"], From 5b9fa1dbbd5c9c39dd13bdfe1e1b3ae4a6dcc26f Mon Sep 17 00:00:00 2001 From: Abram Date: Thu, 21 Dec 2023 14:58:03 +0100 Subject: [PATCH 041/414] Update - modified create evaluation api router --- .../routers/evaluation_router.py | 21 +++++++++---------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git 
a/agenta-backend/agenta_backend/routers/evaluation_router.py b/agenta-backend/agenta_backend/routers/evaluation_router.py index c521968955..93aa889696 100644 --- a/agenta-backend/agenta_backend/routers/evaluation_router.py +++ b/agenta-backend/agenta_backend/routers/evaluation_router.py @@ -83,22 +83,21 @@ async def create_evaluation( status_code=400, ) app = await db_manager.fetch_app_by_id(app_id=payload.app_id) - if app is None: raise HTTPException(status_code=404, detail="App not found") - # TODO: clean this - # new_evaluation_db = await evaluation_service.create_new_evaluation( - # payload, **user_org_data - # ) + app_data = jsonable_encoder(app) new_evaluation_data = payload.dict() - # TODO: to review/find a better solution - # We need to serilize the data we pass to celery tasks otherwise we will get serilisation errors - - evaluate.delay(app_data, new_evaluation_data) + evaluation = await evaluation_service.create_new_evaluation( + app_data=app_data, + new_evaluation_data=new_evaluation_data, + ) - return 200 - # return converters.evaluation_db_to_simple_evaluation_output(new_evaluation_db) + # Start celery task + evaluate.delay( + app_data, new_evaluation_data, evaluation.id, evaluation.testset_id + ) + return evaluation.id except KeyError: raise HTTPException( status_code=400, From 13cc5851a405bfc90c934321259bab3c207b3a6e Mon Sep 17 00:00:00 2001 From: Abram Date: Thu, 21 Dec 2023 14:58:33 +0100 Subject: [PATCH 042/414] Feat - created fetch evaluation status and results api router --- .../routers/evaluation_router.py | 41 +++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/agenta-backend/agenta_backend/routers/evaluation_router.py b/agenta-backend/agenta_backend/routers/evaluation_router.py index 93aa889696..9a052d2187 100644 --- a/agenta-backend/agenta_backend/routers/evaluation_router.py +++ b/agenta-backend/agenta_backend/routers/evaluation_router.py @@ -105,6 +105,47 @@ async def create_evaluation( ) +@router.get("/{evaluation_id}/status/") +async def fetch_evaluation_status(evaluation_id: str, request: Request): + """Fetches the status of the evaluation. + + Args: + evaluation_id (str): the evaluation id + request (Request): the request object + + Returns: + (str): the evaluation status + """ + + try: + # Get user and organization id + user_org_data: dict = await get_user_and_org_id(request.state.user_id) + evaluation = await evaluation_service.fetch_evaluation( + evaluation_id, **user_org_data + ) + return evaluation.status + except Exception as exc: + raise HTTPException(status_code=500, detail=str(exc)) + + +@router.get("/{evaluation_id}/results/") +async def fetch_evaluation_results(evaluation_id: str, request: Request): + """Fetches the results of the evaluation + + Args: + evaluation_id (str): the evaluation id + request (Request): the request object + + Returns: + _type_: _description_ + """ + + try: + ... + except Exception as exc: + raise ... 
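Taken together, the create, status, and results routes give callers a simple polling workflow: POST the evaluation, poll GET /{evaluation_id}/status/ until it reports EVALUATION_FINISHED or EVALUATION_ERROR, then GET /{evaluation_id}/results/. A minimal client-side sketch of that flow follows; the base URL, polling interval, and timeout are illustrative choices, not values fixed by these patches:

    import time

    import requests

    BASE_URL = "http://localhost/api/evaluations"  # assumed deployment URL

    def run_and_wait(payload: dict, poll_every: float = 2.0, timeout: float = 300.0) -> dict:
        # The create route returns the evaluation id at this point in the series
        # (a later patch returns the full evaluation object), so accept either shape.
        created = requests.post(f"{BASE_URL}/", json=payload).json()
        evaluation_id = created["id"] if isinstance(created, dict) else created

        # Poll the status route until the Celery task reaches a terminal state.
        deadline = time.time() + timeout
        while time.time() < deadline:
            status = requests.get(f"{BASE_URL}/{evaluation_id}/status/").json()
            if status in ("EVALUATION_FINISHED", "EVALUATION_ERROR"):
                break
            time.sleep(poll_every)

        # Aggregated results become available once the evaluation has finished.
        return requests.get(f"{BASE_URL}/{evaluation_id}/results/").json()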
+ + @router.put("/{evaluation_id}/") async def update_evaluation_router( request: Request, From 1e57d661c4589aaf7626cc7e4500fe2158b15a55 Mon Sep 17 00:00:00 2001 From: Abram Date: Thu, 21 Dec 2023 15:21:52 +0100 Subject: [PATCH 043/414] Update - clean up Evaluation api model --- agenta-backend/agenta_backend/models/api/evaluation_model.py | 2 -- agenta-backend/agenta_backend/models/converters.py | 2 -- agenta-backend/agenta_backend/services/evaluation_service.py | 2 +- 3 files changed, 1 insertion(+), 5 deletions(-) diff --git a/agenta-backend/agenta_backend/models/api/evaluation_model.py b/agenta-backend/agenta_backend/models/api/evaluation_model.py index 8639879a81..6ee28a92fc 100644 --- a/agenta-backend/agenta_backend/models/api/evaluation_model.py +++ b/agenta-backend/agenta_backend/models/api/evaluation_model.py @@ -42,8 +42,6 @@ class Evaluation(BaseModel): app_id: str user_id: str user_username: str - evaluation_type: EvaluationType - evaluation_type_settings: Optional[EvaluationTypeSettings] variant_ids: List[str] variant_names: List[str] testset_id: str diff --git a/agenta-backend/agenta_backend/models/converters.py b/agenta-backend/agenta_backend/models/converters.py index add5413dc1..355a7937fd 100644 --- a/agenta-backend/agenta_backend/models/converters.py +++ b/agenta-backend/agenta_backend/models/converters.py @@ -73,8 +73,6 @@ async def evaluation_db_to_pydantic( user_id=str(evaluation_db.user.id), user_username=evaluation_db.user.username or "", status=evaluation_db.status, - evaluation_type=evaluation_db.evaluation_type, - evaluation_type_settings=evaluation_db.evaluation_type_settings, variant_ids=[str(variant) for variant in evaluation_db.variants], variant_names=variant_names, testset_id=str(evaluation_db.testset.id), diff --git a/agenta-backend/agenta_backend/services/evaluation_service.py b/agenta-backend/agenta_backend/services/evaluation_service.py index abfd02374f..9d2f521a9b 100644 --- a/agenta-backend/agenta_backend/services/evaluation_service.py +++ b/agenta-backend/agenta_backend/services/evaluation_service.py @@ -882,4 +882,4 @@ async def create_new_evaluation( variants=new_evaluation.variant_ids, evaluators_configs=new_evaluation.evaluators_configs, ) - return converters.evaluation_db_to_pydantic(evaluation_db) + return await converters.evaluation_db_to_pydantic(evaluation_db) From 4136da078a2d9f5cddc506ba39543a35c8ffcc21 Mon Sep 17 00:00:00 2001 From: Abram Date: Thu, 21 Dec 2023 15:25:30 +0100 Subject: [PATCH 044/414] Update - added extra args in evaluate task --- agenta-backend/agenta_backend/tasks/evaluations.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/agenta-backend/agenta_backend/tasks/evaluations.py b/agenta-backend/agenta_backend/tasks/evaluations.py index b958422a57..260b40df58 100644 --- a/agenta-backend/agenta_backend/tasks/evaluations.py +++ b/agenta-backend/agenta_backend/tasks/evaluations.py @@ -31,7 +31,7 @@ @shared_task(queue="agenta_backend.tasks.evaluations.evaluate") -def evaluate(app_data, new_evaluation_data, evaluation_id: str, testset_id: str): +def evaluate(app_data: dict, new_evaluation_data: dict, evaluation_id: str, testset_id: str): loop = asyncio.get_event_loop() new_evaluation = NewEvaluation(**new_evaluation_data) app = AppDB(**app_data) From ea0d97ae579a3d0043534f05d2f0d7e17ee0d532 Mon Sep 17 00:00:00 2001 From: Abram Date: Thu, 21 Dec 2023 15:37:31 +0100 Subject: [PATCH 045/414] Update - modified process_evaluators_configs iin create_new_evaluation service --- 
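Note: process_evaluators_configs (defined in tasks/evaluations.py) returns a tuple of the processed configs plus an evaluator key to name mapping, so callers must unpack the pair; the service below only needs the configs and discards the mapping with "_". A stripped-down illustration of that contract, using plain dicts as stand-ins for the real EvaluatorConfigDB documents:

    from typing import Dict, List, Tuple

    def make_configs(evaluator_keys: List[str]) -> Tuple[List[dict], Dict[str, str]]:
        # Same return shape as process_evaluators_configs: (configs, key -> name mapping).
        configs = [{"evaluator_key": key, "name": f"Evaluator_{key}"} for key in evaluator_keys]
        key_name_mapping = {c["evaluator_key"]: c["name"] for c in configs}
        return configs, key_name_mapping

    configs, _ = make_configs(["auto_exact_match"])  # unpack the tuple; ignore the mapping
    assert configs[0]["name"] == "Evaluator_auto_exact_match"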
agenta-backend/agenta_backend/services/evaluation_service.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/agenta-backend/agenta_backend/services/evaluation_service.py b/agenta-backend/agenta_backend/services/evaluation_service.py index 9d2f521a9b..2464f3e42e 100644 --- a/agenta-backend/agenta_backend/services/evaluation_service.py +++ b/agenta-backend/agenta_backend/services/evaluation_service.py @@ -869,7 +869,7 @@ async def create_new_evaluation( app = AppDB(**app_data) # This will generate a name in case it's run from cli - new_evaluation.evaluators_configs = process_evaluators_configs( + evaluation_evaluators_configs, _ = process_evaluators_configs( new_evaluation.evaluators_configs ) testset = await db_manager.fetch_testset_by_id(new_evaluation.testset_id) @@ -880,6 +880,6 @@ async def create_new_evaluation( testset=testset, status=EvaluationStatusEnum.EVALUATION_STARTED, variants=new_evaluation.variant_ids, - evaluators_configs=new_evaluation.evaluators_configs, + evaluators_configs=evaluation_evaluators_configs, ) return await converters.evaluation_db_to_pydantic(evaluation_db) From 528c70c4aebcf6d02bdcd676df29e46e738b9553 Mon Sep 17 00:00:00 2001 From: Akrem Abayed Date: Thu, 21 Dec 2023 16:29:48 +0100 Subject: [PATCH 046/414] return the whole evaluation object --- agenta-backend/agenta_backend/routers/evaluation_router.py | 2 +- agenta-cli/agenta/client/client.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/agenta-backend/agenta_backend/routers/evaluation_router.py b/agenta-backend/agenta_backend/routers/evaluation_router.py index 9a052d2187..47724402e0 100644 --- a/agenta-backend/agenta_backend/routers/evaluation_router.py +++ b/agenta-backend/agenta_backend/routers/evaluation_router.py @@ -97,7 +97,7 @@ async def create_evaluation( evaluate.delay( app_data, new_evaluation_data, evaluation.id, evaluation.testset_id ) - return evaluation.id + return evaluation except KeyError: raise HTTPException( status_code=400, diff --git a/agenta-cli/agenta/client/client.py b/agenta-cli/agenta/client/client.py index 66d20b89ed..dc5cbdcd9b 100644 --- a/agenta-cli/agenta/client/client.py +++ b/agenta-cli/agenta/client/client.py @@ -565,4 +565,5 @@ def run_evaluation(app_name: str, host: str, api_key: str = None) -> str: raise APIRequestError( f"Request to run evaluations failed with status code {response.status_code} and error message: {error_message}." 
) - return response.json()["app_id"] + print(response.json()) + return response.json() From d2b911461f7033600ed8e56fae1456a1b1a4a0c1 Mon Sep 17 00:00:00 2001 From: Abram Date: Thu, 21 Dec 2023 17:56:45 +0100 Subject: [PATCH 047/414] Feat - created aggregated_result_to_pydantic converter --- agenta-backend/agenta_backend/models/converters.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/agenta-backend/agenta_backend/models/converters.py b/agenta-backend/agenta_backend/models/converters.py index 355a7937fd..0984dfb2c9 100644 --- a/agenta-backend/agenta_backend/models/converters.py +++ b/agenta-backend/agenta_backend/models/converters.py @@ -17,6 +17,7 @@ EvaluationScenarioDB, VariantBaseDB, UserDB, + AggregatedResult, ) from agenta_backend.models.api.api_models import ( AppVariant, @@ -82,6 +83,19 @@ async def evaluation_db_to_pydantic( ) +def aggregated_result_to_pydantic(results: List[AggregatedResult]) -> List[dict]: + list_of_aggregated_results = [] + for aggregated_result in results: + result_dict = { + "evaluator_config": aggregated_result.evaluator_config.json( + exclude={"created_at", "updated_at"} + ), + "result": aggregated_result.result.json(), + } + list_of_aggregated_results.append(result_dict) + return list_of_aggregated_results + + def evaluation_scenario_db_to_pydantic( evaluation_scenario_db: EvaluationScenarioDB, ) -> EvaluationScenario: From 8e522d8a59f55951ca7fb4ce8a1750b8544656bd Mon Sep 17 00:00:00 2001 From: Abram Date: Thu, 21 Dec 2023 17:57:46 +0100 Subject: [PATCH 048/414] Update - implemented retrieve evaluation results evaluation service --- .../agenta_backend/routers/evaluation_router.py | 9 ++++----- .../services/evaluation_service.py | 16 +++++++++++++++- 2 files changed, 19 insertions(+), 6 deletions(-) diff --git a/agenta-backend/agenta_backend/routers/evaluation_router.py b/agenta-backend/agenta_backend/routers/evaluation_router.py index 47724402e0..88e87daf7e 100644 --- a/agenta-backend/agenta_backend/routers/evaluation_router.py +++ b/agenta-backend/agenta_backend/routers/evaluation_router.py @@ -3,6 +3,7 @@ from typing import List, Dict from fastapi.responses import JSONResponse +from fastapi.encoders import jsonable_encoder from fastapi import HTTPException, APIRouter, Body, Request, status, Response from agenta_backend.services.helpers import format_inputs, format_outputs @@ -22,7 +23,6 @@ CreateCustomEvaluation, EvaluationUpdate, EvaluationWebhook, - SimpleEvaluationOutput, ) from agenta_backend.services.evaluation_service import ( UpdateEvaluationScenarioError, @@ -45,7 +45,6 @@ from agenta_backend.services import results_service from agenta_backend.tasks.evaluations import evaluate -from fastapi.encoders import jsonable_encoder if os.environ["FEATURE_FLAG"] in ["cloud", "ee"]: from agenta_backend.commons.services.selectors import ( # noqa pylint: disable-all @@ -57,7 +56,6 @@ router = APIRouter() -# @router.post("/", response_model=SimpleEvaluationOutput) @router.post("/") async def create_evaluation( payload: NewEvaluation, @@ -141,9 +139,10 @@ async def fetch_evaluation_results(evaluation_id: str, request: Request): """ try: - ... + results = await evaluation_service.retrieve_evaluation_results(evaluation_id) + return {**results, "evaluation_id": evaluation_id} except Exception as exc: - raise ... 
+ raise HTTPException(status_code=500, detail=str(exc)) @router.put("/{evaluation_id}/") diff --git a/agenta-backend/agenta_backend/services/evaluation_service.py b/agenta-backend/agenta_backend/services/evaluation_service.py index 2464f3e42e..1e5a9589c5 100644 --- a/agenta-backend/agenta_backend/services/evaluation_service.py +++ b/agenta-backend/agenta_backend/services/evaluation_service.py @@ -17,7 +17,7 @@ EvaluationScenarioUpdate, CreateCustomEvaluation, EvaluationUpdate, - EvaluationStatusEnum + EvaluationStatusEnum, ) from agenta_backend.models import converters from agenta_backend.utils.common import engine, check_access_to_app @@ -883,3 +883,17 @@ async def create_new_evaluation( evaluators_configs=evaluation_evaluators_configs, ) return await converters.evaluation_db_to_pydantic(evaluation_db) + + +async def retrieve_evaluation_results(evaluation_id: str) -> List[dict]: + """Retrieve the aggregated results for a given evaluation. + + Args: + evaluation_id (str): the evaluation id + + Returns: + List[dict]: evaluation aggregated results + """ + + evaluation = await db_manager.fetch_evaluation_by_id(evaluation_id) + return converters.aggregated_result_to_pydantic(evaluation.aggregated_results) From 6b8c9874b49bce7e3807ad19256989ca43413b8b Mon Sep 17 00:00:00 2001 From: Akrem Abayed Date: Thu, 21 Dec 2023 18:23:53 +0100 Subject: [PATCH 049/414] add evaluators endpoint --- agenta-backend/agenta_backend/main.py | 2 ++ .../models/api/evaluation_model.py | 5 +++ .../routers/evaluators_router.py | 32 +++++++++++++++++++ 3 files changed, 39 insertions(+) create mode 100644 agenta-backend/agenta_backend/routers/evaluators_router.py diff --git a/agenta-backend/agenta_backend/main.py b/agenta-backend/agenta_backend/main.py index 98af93a2c5..e11329eeb3 100644 --- a/agenta-backend/agenta_backend/main.py +++ b/agenta-backend/agenta_backend/main.py @@ -9,6 +9,7 @@ container_router, environment_router, evaluation_router, + evaluators_router, observability_router, organization_router, testset_router, @@ -77,6 +78,7 @@ async def lifespan(application: FastAPI, cache=True): app.include_router(app_router.router, prefix="/apps") app.include_router(variants_router.router, prefix="/variants") app.include_router(evaluation_router.router, prefix="/evaluations") +app.include_router(evaluators_router.router, prefix="/evaluators") app.include_router(testset_router.router, prefix="/testsets") app.include_router(container_router.router, prefix="/containers") app.include_router(environment_router.router, prefix="/environments") diff --git a/agenta-backend/agenta_backend/models/api/evaluation_model.py b/agenta-backend/agenta_backend/models/api/evaluation_model.py index 6ee28a92fc..a277281ddd 100644 --- a/agenta-backend/agenta_backend/models/api/evaluation_model.py +++ b/agenta-backend/agenta_backend/models/api/evaluation_model.py @@ -169,3 +169,8 @@ class NewEvaluation(BaseModel): variant_ids: List[str] evaluators_configs: List[EvaluatorConfig] testset_id: str + + +class Evaluator(BaseModel): + name: str + key: str \ No newline at end of file diff --git a/agenta-backend/agenta_backend/routers/evaluators_router.py b/agenta-backend/agenta_backend/routers/evaluators_router.py new file mode 100644 index 0000000000..bb49659117 --- /dev/null +++ b/agenta-backend/agenta_backend/routers/evaluators_router.py @@ -0,0 +1,32 @@ +import os +import json +from typing import List + +from fastapi import HTTPException, APIRouter + +from agenta_backend.models.api.evaluation_model import ( + Evaluator +) + +router = APIRouter() + 
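The GET route defined just below serves whatever agenta_backend/resources/evaluators/evaluators.json contains. That file is not included in this patch series; for the response to validate against the Evaluator response model (name and key), its entries would need to look roughly like the registry sketched here, where the concrete names and keys are examples only:

    import json

    # Illustrative registry contents; the real file may list different evaluators
    # and carry extra fields beyond "name" and "key".
    example_registry = [
        {"name": "Exact Match", "key": "auto_exact_match"},
        {"name": "Similarity Match", "key": "auto_similarity_match"},
    ]
    print(json.dumps(example_registry, indent=2))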
+@router.get("/", response_model=List[Evaluator]) +async def get_evaluators(): + """Fetches a list of evaluators from the hardcoded JSON file. + + Returns: + List[Evaluator]: A list of evaluator objects. + """ + + file_path = 'agenta_backend/resources/evaluators/evaluators.json' + + if not os.path.exists(file_path): + raise HTTPException(status_code=404, detail="Evaluators file not found") + + try: + with open(file_path, 'r') as file: + evaluators = json.load(file) + except Exception as e: + raise HTTPException(status_code=500, detail=f"Error reading evaluators file: {str(e)}") + + return evaluators \ No newline at end of file From 7ec90e661dd81fe6137e79183d6a20fd842f1454 Mon Sep 17 00:00:00 2001 From: Abram Date: Thu, 21 Dec 2023 18:02:15 +0100 Subject: [PATCH 050/414] Update - modified fetch evaluation results api router --- agenta-backend/agenta_backend/routers/evaluation_router.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/agenta-backend/agenta_backend/routers/evaluation_router.py b/agenta-backend/agenta_backend/routers/evaluation_router.py index 88e87daf7e..ba8dbfc24c 100644 --- a/agenta-backend/agenta_backend/routers/evaluation_router.py +++ b/agenta-backend/agenta_backend/routers/evaluation_router.py @@ -140,7 +140,7 @@ async def fetch_evaluation_results(evaluation_id: str, request: Request): try: results = await evaluation_service.retrieve_evaluation_results(evaluation_id) - return {**results, "evaluation_id": evaluation_id} + return {"results": results, "evaluation_id": evaluation_id} except Exception as exc: raise HTTPException(status_code=500, detail=str(exc)) From 2cf36403e2e7f87296f1e43578e20e26eed1e0b2 Mon Sep 17 00:00:00 2001 From: Abram Date: Thu, 21 Dec 2023 19:34:38 +0100 Subject: [PATCH 051/414] Update - refactored AggregatedResult to a model and type its fields --- .../agenta_backend/models/converters.py | 4 +- .../agenta_backend/models/db_models.py | 15 +++++--- .../agenta_backend/tasks/evaluations.py | 37 ++++++++++--------- 3 files changed, 32 insertions(+), 24 deletions(-) diff --git a/agenta-backend/agenta_backend/models/converters.py b/agenta-backend/agenta_backend/models/converters.py index 0984dfb2c9..a85cbdc868 100644 --- a/agenta-backend/agenta_backend/models/converters.py +++ b/agenta-backend/agenta_backend/models/converters.py @@ -17,7 +17,7 @@ EvaluationScenarioDB, VariantBaseDB, UserDB, - AggregatedResult, + AggregatedResultDB, ) from agenta_backend.models.api.api_models import ( AppVariant, @@ -83,7 +83,7 @@ async def evaluation_db_to_pydantic( ) -def aggregated_result_to_pydantic(results: List[AggregatedResult]) -> List[dict]: +def aggregated_result_to_pydantic(results: List[AggregatedResultDB]) -> List[dict]: list_of_aggregated_results = [] for aggregated_result in results: result_dict = { diff --git a/agenta-backend/agenta_backend/models/db_models.py b/agenta-backend/agenta_backend/models/db_models.py index 68af9039e5..6eb33954c9 100644 --- a/agenta-backend/agenta_backend/models/db_models.py +++ b/agenta-backend/agenta_backend/models/db_models.py @@ -221,7 +221,7 @@ class Config: class Result(EmbeddedModel): type: str - value: str + value: Union[str, float, int] class EvaluationScenarioResult(EmbeddedModel): @@ -229,9 +229,14 @@ class EvaluationScenarioResult(EmbeddedModel): result: Result -class AggregatedResult(EmbeddedModel): - evaluator_config = EvaluatorConfigDB - result = Result +class AggregatedResultDB(Model): + evaluator_config: EvaluatorConfigDB = Reference() + result: Result + created_at: datetime = 
Field(default=datetime.utcnow()) + updated_at: datetime = Field(default=datetime.utcnow()) + + class Config: + collection = "aggregated_results" class EvaluationScenarioInputDB(EmbeddedModel): @@ -253,7 +258,7 @@ class EvaluationDB(Model): testset: TestSetDB = Reference() variants: List[ObjectId] evaluators_configs: List[EvaluatorConfigDB] - aggregated_results: List[AggregatedResult] + aggregated_results: List[AggregatedResultDB] created_at: datetime = Field(default=datetime.utcnow()) updated_at: datetime = Field(default=datetime.utcnow()) diff --git a/agenta-backend/agenta_backend/tasks/evaluations.py b/agenta-backend/agenta_backend/tasks/evaluations.py index 260b40df58..d7075d77d4 100644 --- a/agenta-backend/agenta_backend/tasks/evaluations.py +++ b/agenta-backend/agenta_backend/tasks/evaluations.py @@ -8,7 +8,6 @@ from agenta_backend.services import llm_apps_service from agenta_backend.services.db_manager import ( - create_new_evaluation, fetch_evaluation_by_id, fetch_app_variant_by_id, get_deployment_by_objectid, @@ -16,14 +15,14 @@ create_new_evaluation_scenario, update_evaluation_with_aggregated_results, ) -from agenta_backend.models.api.evaluation_model import EvaluatorConfig, NewEvaluation, EvaluationStatusEnum +from agenta_backend.models.api.evaluation_model import EvaluatorConfig, NewEvaluation from agenta_backend.models.db_models import ( - AggregatedResult, AppDB, EvaluationScenarioOutputDB, EvaluationScenarioResult, EvaluatorConfigDB, + AggregatedResultDB, Result, ) @@ -31,20 +30,21 @@ @shared_task(queue="agenta_backend.tasks.evaluations.evaluate") -def evaluate(app_data: dict, new_evaluation_data: dict, evaluation_id: str, testset_id: str): +def evaluate( + app_data: dict, new_evaluation_data: dict, evaluation_id: str, testset_id: str +): loop = asyncio.get_event_loop() new_evaluation = NewEvaluation(**new_evaluation_data) app = AppDB(**app_data) # NOTE: This will generate a name in case it's run from cli - evaluation_evaluators_configs, evaluator_key_name_mapping = process_evaluators_configs( - new_evaluation.evaluators_configs - ) + ( + evaluation_evaluators_configs, + evaluator_key_name_mapping, + ) = process_evaluators_configs(new_evaluation.evaluators_configs) testset = loop.run_until_complete(fetch_testset_by_id(testset_id)) - new_evaluation_db = loop.run_until_complete( - fetch_evaluation_by_id(evaluation_id) - ) + new_evaluation_db = loop.run_until_complete(fetch_evaluation_by_id(evaluation_id)) evaluators_aggregated_data = defaultdict(list) for variant_id in new_evaluation.variant_ids: @@ -55,7 +55,7 @@ def evaluate(app_data: dict, new_evaluation_data: dict, evaluation_id: str, test ) # TODO: remove if abraham's fix is working - uri = deployment.uri.replace("http://localhost", "http://host.docker.internal") + uri = deployment.uri.replace("http://localhost", "http://obsidian") for data_point in testset.csvdata: variant_output = llm_apps_service.get_llm_app_output(uri, data_point) @@ -94,7 +94,9 @@ def evaluate(app_data: dict, new_evaluation_data: dict, evaluation_id: str, test ) ) - aggregated_results = aggregate_evaluator_results(evaluators_aggregated_data, evaluator_key_name_mapping) + aggregated_results = aggregate_evaluator_results( + evaluators_aggregated_data, evaluator_key_name_mapping + ) updated_evaluation = loop.run_until_complete( update_evaluation_with_aggregated_results( @@ -123,13 +125,14 @@ def aggregate_evaluator_results(evaluators_aggregated_data, evaluator_key_name_m aggregated_results = [] for evaluator_key, values in 
evaluators_aggregated_data.items(): average_value = sum(values) / len(values) if values else 0 - evaluator_name = evaluator_key_name_mapping.get(evaluator_key, "Unknown Evaluator") - aggregated_result_value: AggregatedResult = AggregatedResult( + evaluator_name = evaluator_key_name_mapping.get( + evaluator_key, "Unknown Evaluator" + ) + aggregated_result_value = AggregatedResultDB( evaluator_config=EvaluatorConfigDB( - name=evaluator_name, - evaluator_key=evaluator_key + name=evaluator_name, evaluator_key=evaluator_key ), result=Result(type="number", value=str(average_value)), ) aggregated_results.append(aggregated_result_value) - return aggregated_results \ No newline at end of file + return aggregated_results From 66f8c8879693af61a8ed8342e189f24bcb91a7d6 Mon Sep 17 00:00:00 2001 From: Abram Date: Thu, 21 Dec 2023 19:35:15 +0100 Subject: [PATCH 052/414] Update - include status to update evaluation with agg... function --- agenta-backend/agenta_backend/services/db_manager.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/agenta-backend/agenta_backend/services/db_manager.py b/agenta-backend/agenta_backend/services/db_manager.py index ea4bb88a76..7ba43dcf47 100644 --- a/agenta-backend/agenta_backend/services/db_manager.py +++ b/agenta-backend/agenta_backend/services/db_manager.py @@ -19,7 +19,7 @@ ) from agenta_backend.services.json_importer_helper import get_json from agenta_backend.models.db_models import ( - AggregatedResult, + AggregatedResultDB, AppDB, AppVariantDB, EvaluationScenarioInputDB, @@ -34,14 +34,15 @@ EvaluationScenarioDB, ImageDB, OrganizationDB, + DeploymentDB, TemplateDB, TestSetDB, UserDB, ) from agenta_backend.utils.common import check_user_org_access, engine +from agenta_backend.models.api.evaluation_model import EvaluationStatusEnum -from agenta_backend.models.db_models import DeploymentDB from fastapi import HTTPException from fastapi.responses import JSONResponse @@ -1678,13 +1679,14 @@ async def create_new_evaluation_scenario( async def update_evaluation_with_aggregated_results( - evaluation_id: ObjectId, aggregated_results: List[AggregatedResult] + evaluation_id: ObjectId, aggregated_results: List[AggregatedResultDB] ) -> EvaluationDB: evaluation = await engine.find_one(EvaluationDB, EvaluationDB.id == evaluation_id) if not evaluation: raise ValueError("Evaluation not found") + evaluation.status = EvaluationStatusEnum.EVALUATION_FINISHED evaluation.aggregated_results = aggregated_results evaluation.updated_at = datetime.utcnow().isoformat() From b6c32be8bdad8d6ee1b07202082549bf2ae9c1f3 Mon Sep 17 00:00:00 2001 From: Abram Date: Thu, 21 Dec 2023 19:42:35 +0100 Subject: [PATCH 053/414] Refactor - renamed variable for clarity --- agenta-backend/agenta_backend/tasks/evaluations.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/agenta-backend/agenta_backend/tasks/evaluations.py b/agenta-backend/agenta_backend/tasks/evaluations.py index d7075d77d4..ca0be7bade 100644 --- a/agenta-backend/agenta_backend/tasks/evaluations.py +++ b/agenta-backend/agenta_backend/tasks/evaluations.py @@ -128,11 +128,11 @@ def aggregate_evaluator_results(evaluators_aggregated_data, evaluator_key_name_m evaluator_name = evaluator_key_name_mapping.get( evaluator_key, "Unknown Evaluator" ) - aggregated_result_value = AggregatedResultDB( + aggregated_result_db = AggregatedResultDB( evaluator_config=EvaluatorConfigDB( name=evaluator_name, evaluator_key=evaluator_key ), result=Result(type="number", value=str(average_value)), ) - 
aggregated_results.append(aggregated_result_value) + aggregated_results.append(aggregated_result_db) return aggregated_results From 093249c07053c688957ece773b35c09c8171e7be Mon Sep 17 00:00:00 2001 From: Akrem Abayed Date: Fri, 22 Dec 2023 10:43:23 +0100 Subject: [PATCH 054/414] add crud for evaluators configs --- .../models/api/evaluation_model.py | 9 +- .../agenta_backend/models/db_models.py | 5 +- .../routers/evaluators_router.py | 76 +++++++++++++++-- .../agenta_backend/services/db_manager.py | 76 +++++++++++++++++ .../services/evaluation_service.py | 6 +- .../services/evaluator_manager.py | 82 +++++++++++++++++++ .../agenta_backend/tasks/evaluations.py | 44 +++++++--- 7 files changed, 275 insertions(+), 23 deletions(-) create mode 100644 agenta-backend/agenta_backend/services/evaluator_manager.py diff --git a/agenta-backend/agenta_backend/models/api/evaluation_model.py b/agenta-backend/agenta_backend/models/api/evaluation_model.py index a277281ddd..471573f805 100644 --- a/agenta-backend/agenta_backend/models/api/evaluation_model.py +++ b/agenta-backend/agenta_backend/models/api/evaluation_model.py @@ -173,4 +173,11 @@ class NewEvaluation(BaseModel): class Evaluator(BaseModel): name: str - key: str \ No newline at end of file + key: str + + +class NewEvaluatorConfig(BaseModel): + app_id: str + name: str + evaluator_key: str + settings_values: dict diff --git a/agenta-backend/agenta_backend/models/db_models.py b/agenta-backend/agenta_backend/models/db_models.py index 6eb33954c9..8e3f1a1959 100644 --- a/agenta-backend/agenta_backend/models/db_models.py +++ b/agenta-backend/agenta_backend/models/db_models.py @@ -209,6 +209,9 @@ class EvaluationSettingsTemplate(EmbeddedModel): class EvaluatorConfigDB(Model): + app: AppDB = Reference(key_name="app") + organization: OrganizationDB = Reference(key_name="organization") + user: UserDB = Reference(key_name="user") name: str evaluator_key: str settings_values: Optional[Dict[str, Any]] = None @@ -216,7 +219,7 @@ class EvaluatorConfigDB(Model): updated_at: datetime = Field(default=datetime.utcnow()) class Config: - collection = "evaluator_config" + collection = "evaluators_configs" class Result(EmbeddedModel): diff --git a/agenta-backend/agenta_backend/routers/evaluators_router.py b/agenta-backend/agenta_backend/routers/evaluators_router.py index bb49659117..625f21a19f 100644 --- a/agenta-backend/agenta_backend/routers/evaluators_router.py +++ b/agenta-backend/agenta_backend/routers/evaluators_router.py @@ -1,14 +1,40 @@ import os import json from typing import List +import logging -from fastapi import HTTPException, APIRouter +from fastapi import HTTPException, APIRouter, Query +from fastapi.responses import JSONResponse from agenta_backend.models.api.evaluation_model import ( - Evaluator + Evaluator, + EvaluatorConfig, + NewEvaluatorConfig, ) +from agenta_backend.services import ( + db_manager, +) + +from agenta_backend.services.evaluator_manager import ( + get_evaluators_configs, + create_evaluator_config, +) + +from agenta_backend.utils.common import ( + check_access_to_app +) + +if os.environ["FEATURE_FLAG"] in ["cloud", "ee"]: + from agenta_backend.commons.services.selectors import ( # noqa pylint: disable-all + get_user_and_org_id, + ) +else: + from agenta_backend.services.selectors import get_user_and_org_id + router = APIRouter() +logger = logging.getLogger(__name__) + @router.get("/", response_model=List[Evaluator]) async def get_evaluators(): @@ -18,15 +44,53 @@ async def get_evaluators(): List[Evaluator]: A list of evaluator objects. 
""" - file_path = 'agenta_backend/resources/evaluators/evaluators.json' + file_path = "agenta_backend/resources/evaluators/evaluators.json" if not os.path.exists(file_path): raise HTTPException(status_code=404, detail="Evaluators file not found") try: - with open(file_path, 'r') as file: + with open(file_path, "r") as file: evaluators = json.load(file) except Exception as e: - raise HTTPException(status_code=500, detail=f"Error reading evaluators file: {str(e)}") + raise HTTPException( + status_code=500, detail=f"Error reading evaluators file: {str(e)}" + ) + + return evaluators + + +@router.get("/configs/", response_model=List[EvaluatorConfig]) +async def get_evaluator_configs( + app_id: str = Query(), response_model=List[EvaluatorConfig] +): + """Endpoint to fetch evaluator configurations for a specific app. + + Args: + app_id (str): The ID of the app. + + Returns: + List[EvaluatorConfigDB]: A list of evaluator configuration objects. + """ + return await get_evaluators_configs(app_id) + + +@router.post("/configs/", response_model=EvaluatorConfig) +async def create_new_evaluator_config( + payload: NewEvaluatorConfig, +): + """Endpoint to fetch evaluator configurations for a specific app. + + Args: + app_id (str): The ID of the app. + + Returns: + List[EvaluatorConfigDB]: A list of evaluator configuration objects. + """ - return evaluators \ No newline at end of file + return await create_evaluator_config( + app_id=payload.app_id, + name=payload.name, + evaluator_key=payload.evaluator_key, + settings_values=payload.settings_values, + ) diff --git a/agenta-backend/agenta_backend/services/db_manager.py b/agenta-backend/agenta_backend/services/db_manager.py index 7ba43dcf47..2325e1a974 100644 --- a/agenta-backend/agenta_backend/services/db_manager.py +++ b/agenta-backend/agenta_backend/services/db_manager.py @@ -1692,3 +1692,79 @@ async def update_evaluation_with_aggregated_results( await engine.save(evaluation) return evaluation + + +async def fetch_evaluators_configs(app_id: str): + """Fetches a list of evaluator configurations from the database. + + Returns: + List[EvaluatorConfigDB]: A list of evaluator configuration objects. 
+ """ + assert app_id is not None, "evaluation_id cannot be None" + + try: + query_expression = query.eq(EvaluatorConfigDB.app, ObjectId(app_id)) + evaluators_configs: [EvaluatorConfigDB] = await engine.find( + EvaluatorConfigDB, query_expression + ) + return evaluators_configs + except Exception as e: + raise e + + +async def create_evaluator_config( + app: AppDB, + user: UserDB, + organization: OrganizationDB, + name: str, + evaluator_key: str, + settings_values: Optional[Dict[str, Any]] = None, +) -> EvaluatorConfigDB: + """Create a new evaluator configuration in the database.""" + + + new_evaluator_config = EvaluatorConfigDB( + app=app, + user=user, + organization=organization, + name=name, + evaluator_key=evaluator_key, + settings_values=settings_values, + ) + + try: + await engine.save(new_evaluator_config) + return new_evaluator_config + except Exception as e: + raise e + + +async def update_evaluator_config( + evaluator_config_id: str, updates: Dict[str, Any] +) -> EvaluatorConfigDB: + """Edit an existing evaluator configuration in the database.""" + assert evaluator_config_id is not None, "Evaluator Config ID cannot be None" + + try: + updated_evaluator_config = await engine.find_one_and_update( + EvaluatorConfigDB, + query.eq("_id", ObjectId(evaluator_config_id)), + {"$set": updates}, + return_document=True, + ) + return updated_evaluator_config + except Exception as e: + raise e + + +async def delete_evaluator_config(evaluator_config_id: str) -> bool: + """Delete an evaluator configuration from the database.""" + assert evaluator_config_id is not None, "Evaluator Config ID cannot be None" + + try: + delete_result = await engine.find_one_and_delete( + EvaluatorConfigDB, query.eq("_id", ObjectId(evaluator_config_id)) + ) + return delete_result is not None + except Exception as e: + raise e diff --git a/agenta-backend/agenta_backend/services/evaluation_service.py b/agenta-backend/agenta_backend/services/evaluation_service.py index 1e5a9589c5..fe92d30c69 100644 --- a/agenta-backend/agenta_backend/services/evaluation_service.py +++ b/agenta-backend/agenta_backend/services/evaluation_service.py @@ -869,8 +869,8 @@ async def create_new_evaluation( app = AppDB(**app_data) # This will generate a name in case it's run from cli - evaluation_evaluators_configs, _ = process_evaluators_configs( - new_evaluation.evaluators_configs + evaluators_configs, _ = process_evaluators_configs( + app, new_evaluation.evaluators_configs ) testset = await db_manager.fetch_testset_by_id(new_evaluation.testset_id) evaluation_db = await db_manager.create_new_evaluation( @@ -880,7 +880,7 @@ async def create_new_evaluation( testset=testset, status=EvaluationStatusEnum.EVALUATION_STARTED, variants=new_evaluation.variant_ids, - evaluators_configs=evaluation_evaluators_configs, + evaluators_configs=evaluators_configs, ) return await converters.evaluation_db_to_pydantic(evaluation_db) diff --git a/agenta-backend/agenta_backend/services/evaluator_manager.py b/agenta-backend/agenta_backend/services/evaluator_manager.py new file mode 100644 index 0000000000..9e7d9337a8 --- /dev/null +++ b/agenta-backend/agenta_backend/services/evaluator_manager.py @@ -0,0 +1,82 @@ +from typing import Any, Dict, Optional + +from agenta_backend.services.db_manager import ( + fetch_evaluators_configs, + create_evaluator_config as db_manager_create_evaluator_config, + update_evaluator_config, + delete_evaluator_config, + fetch_app_by_id +) + + +from agenta_backend.models.db_models import EvaluatorConfigDB + + +async def 
get_evaluators_configs(app_id: str): + """Get evaluators configs by app_id. + + Args: + app_id (str): The ID of the app. + + Returns: + List[EvaluatorConfigDB]: A list of evaluator configuration objects. + """ + return await fetch_evaluators_configs(app_id) + + +async def create_evaluator_config( + app_id: str, + name: str, + evaluator_key: str, + settings_values: Optional[Dict[str, Any]] = None, +) -> EvaluatorConfigDB: + """ + Create a new evaluator configuration for an app. + + Args: + app_id (str): The ID of the app. + name (str): The name of the evaluator config. + evaluator_key (str): The key of the evaluator. + settings_values (Optional[Dict[str, Any]]): Additional settings for the evaluator. + + Returns: + EvaluatorConfigDB: The newly created evaluator configuration object. + """ + app = await fetch_app_by_id(app_id) + return await db_manager_create_evaluator_config( + app=app, + organization=app.organization, + user=app.user, + name=name, + evaluator_key=evaluator_key, + settings_values=settings_values + ) + + +async def update_evaluator_config( + evaluator_config_id: str, updates: Dict[str, Any] +) -> EvaluatorConfigDB: + """ + Edit an existing evaluator configuration. + + Args: + evaluator_config_id (str): The ID of the evaluator configuration to be updated. + updates (Dict[str, Any]): A dictionary containing the updates. + + Returns: + EvaluatorConfigDB: The updated evaluator configuration object. + """ + return await update_evaluator_config(evaluator_config_id, updates) + + +async def remove_evaluator_config(evaluator_config_id: str) -> bool: + """ + Delete an evaluator configuration. + + Args: + evaluator_config_id (str): The ID of the evaluator configuration to be deleted. + + Returns: + bool: True if the deletion was successful, False otherwise. 
+ """ + return await delete_evaluator_config(evaluator_config_id) diff --git a/agenta-backend/agenta_backend/tasks/evaluations.py b/agenta-backend/agenta_backend/tasks/evaluations.py index ca0be7bade..fab619b37d 100644 --- a/agenta-backend/agenta_backend/tasks/evaluations.py +++ b/agenta-backend/agenta_backend/tasks/evaluations.py @@ -39,9 +39,9 @@ def evaluate( # NOTE: This will generate a name in case it's run from cli ( - evaluation_evaluators_configs, + evaluators_configs, evaluator_key_name_mapping, - ) = process_evaluators_configs(new_evaluation.evaluators_configs) + ) = process_evaluators_configs(app, new_evaluation.evaluators_configs) testset = loop.run_until_complete(fetch_testset_by_id(testset_id)) new_evaluation_db = loop.run_until_complete(fetch_evaluation_by_id(evaluation_id)) @@ -49,19 +49,20 @@ def evaluate( for variant_id in new_evaluation.variant_ids: variant_id = str(variant_id) + app_variant_db = loop.run_until_complete(fetch_app_variant_by_id(variant_id)) deployment = loop.run_until_complete( get_deployment_by_objectid(app_variant_db.base.deployment) ) # TODO: remove if abraham's fix is working - uri = deployment.uri.replace("http://localhost", "http://obsidian") + uri = deployment.uri.replace("http://localhost", "http://host.docker.internal") for data_point in testset.csvdata: variant_output = llm_apps_service.get_llm_app_output(uri, data_point) evaluators_results: [EvaluationScenarioResult] = [] - for evaluator_config in evaluation_evaluators_configs: + for evaluator_config in evaluators_configs: result = evaluators_service.evaluate( evaluator_config.evaluator_key, data_point["correct_answer"], @@ -95,7 +96,7 @@ def evaluate( ) aggregated_results = aggregate_evaluator_results( - evaluators_aggregated_data, evaluator_key_name_mapping + app, evaluators_aggregated_data, evaluator_key_name_mapping ) updated_evaluation = loop.run_until_complete( @@ -106,22 +107,37 @@ def evaluate( def process_evaluators_configs( + app: AppDB, evaluators_configs: List[EvaluatorConfig], ) -> Tuple[List[EvaluatorConfigDB], Dict[str, str]]: """Process evaluators_configs to include names if missing and return a mapping of evaluator keys to names.""" + processed_configs = [] evaluator_key_name_mapping = {} + for config in evaluators_configs: - config_dict = config.dict() - if "name" not in config_dict: - config_dict["name"] = f"Evaluator_{uuid.uuid4()}" # Generate a random name - processed_config = EvaluatorConfigDB(**config_dict) + # Handle the 'name' field with a default value if it's None + name = getattr(config, "name", f"Evaluator_{uuid.uuid4()}") + + processed_config = EvaluatorConfigDB( + user=app.user, + organization=app.organization, + name=name, + app=app, + evaluator_key=config.evaluator_key, + ) + processed_configs.append(processed_config) - evaluator_key_name_mapping[config_dict["evaluator_key"]] = config_dict["name"] + evaluator_key_name_mapping[ + processed_config.evaluator_key + ] = processed_config.name + return processed_configs, evaluator_key_name_mapping -def aggregate_evaluator_results(evaluators_aggregated_data, evaluator_key_name_mapping): +def aggregate_evaluator_results( + app, evaluators_aggregated_data, evaluator_key_name_mapping +): aggregated_results = [] for evaluator_key, values in evaluators_aggregated_data.items(): average_value = sum(values) / len(values) if values else 0 @@ -130,7 +146,11 @@ def aggregate_evaluator_results(evaluators_aggregated_data, evaluator_key_name_m ) aggregated_result_db = AggregatedResultDB( evaluator_config=EvaluatorConfigDB( - 
name=evaluator_name, evaluator_key=evaluator_key + app=app, + user=app.user, + organization=app.organization, + name=evaluator_name, + evaluator_key=evaluator_key, ), result=Result(type="number", value=str(average_value)), ) From 2ca31b5ba54c33f0a1ee0589594bf4fad8e8d3cc Mon Sep 17 00:00:00 2001 From: Akrem Abayed Date: Fri, 22 Dec 2023 12:56:45 +0100 Subject: [PATCH 055/414] remove process_evaluators_configs and improve crud for evaluators configs --- .../models/api/evaluation_model.py | 2 +- .../agenta_backend/services/db_manager.py | 18 ++++++ .../services/evaluation_service.py | 10 ++-- .../services/evaluator_manager.py | 13 +++++ .../agenta_backend/tasks/evaluations.py | 56 ++++++++++--------- 5 files changed, 66 insertions(+), 33 deletions(-) diff --git a/agenta-backend/agenta_backend/models/api/evaluation_model.py b/agenta-backend/agenta_backend/models/api/evaluation_model.py index 471573f805..39bc8bbb57 100644 --- a/agenta-backend/agenta_backend/models/api/evaluation_model.py +++ b/agenta-backend/agenta_backend/models/api/evaluation_model.py @@ -167,7 +167,7 @@ class EvaluatorConfig(BaseModel): class NewEvaluation(BaseModel): app_id: str variant_ids: List[str] - evaluators_configs: List[EvaluatorConfig] + evaluators_configs: List[str] testset_id: str diff --git a/agenta-backend/agenta_backend/services/db_manager.py b/agenta-backend/agenta_backend/services/db_manager.py index 2325e1a974..9e562013a8 100644 --- a/agenta-backend/agenta_backend/services/db_manager.py +++ b/agenta-backend/agenta_backend/services/db_manager.py @@ -1712,6 +1712,24 @@ async def fetch_evaluators_configs(app_id: str): raise e +async def fetch_evaluator_config(evaluator_config_id: str): + """Fetches a list of evaluator configurations from the database. + + Returns: + List[EvaluatorConfigDB]: A list of evaluator configuration objects. 
+ """ + + try: + query_expression = query.eq(EvaluatorConfigDB._id, ObjectId(evaluator_config_id)) + evaluator_config: EvaluatorConfigDB = await engine.find( + EvaluatorConfigDB, query_expression + ) + return evaluator_config + except Exception as e: + raise e + + + async def create_evaluator_config( app: AppDB, user: UserDB, diff --git a/agenta-backend/agenta_backend/services/evaluation_service.py b/agenta-backend/agenta_backend/services/evaluation_service.py index fe92d30c69..fefd326db8 100644 --- a/agenta-backend/agenta_backend/services/evaluation_service.py +++ b/agenta-backend/agenta_backend/services/evaluation_service.py @@ -863,15 +863,15 @@ async def create_new_evaluation( Evaluation """ - from agenta_backend.tasks.evaluations import process_evaluators_configs + # from agenta_backend.tasks.evaluations import process_evaluators_configs new_evaluation = NewEvaluation(**new_evaluation_data) app = AppDB(**app_data) # This will generate a name in case it's run from cli - evaluators_configs, _ = process_evaluators_configs( - app, new_evaluation.evaluators_configs - ) + # evaluators_configs, _ = process_evaluators_configs( + # app, new_evaluation.evaluators_configs + # ) testset = await db_manager.fetch_testset_by_id(new_evaluation.testset_id) evaluation_db = await db_manager.create_new_evaluation( app=app, @@ -880,7 +880,7 @@ async def create_new_evaluation( testset=testset, status=EvaluationStatusEnum.EVALUATION_STARTED, variants=new_evaluation.variant_ids, - evaluators_configs=evaluators_configs, + evaluators_configs=new_evaluation.evaluators_configs, ) return await converters.evaluation_db_to_pydantic(evaluation_db) diff --git a/agenta-backend/agenta_backend/services/evaluator_manager.py b/agenta-backend/agenta_backend/services/evaluator_manager.py index 9e7d9337a8..80fbf37afd 100644 --- a/agenta-backend/agenta_backend/services/evaluator_manager.py +++ b/agenta-backend/agenta_backend/services/evaluator_manager.py @@ -1,6 +1,7 @@ from typing import Any, Dict, Optional from agenta_backend.services.db_manager import ( + fetch_evaluator_config, fetch_evaluators_configs, create_evaluator_config as db_manager_create_evaluator_config, update_evaluator_config, @@ -24,6 +25,18 @@ async def get_evaluators_configs(app_id: str): return await fetch_evaluators_configs(app_id) +async def get_evaluator_config(app_id: str): + """Get evaluators configs by app_id. + + Args: + app_id (str): The ID of the app. + + Returns: + List[EvaluatorConfigDB]: A list of evaluator configuration objects. 
+ """ + return await fetch_evaluator_config(app_id) + + async def create_evaluator_config( app_id: str, name: str, diff --git a/agenta-backend/agenta_backend/tasks/evaluations.py b/agenta-backend/agenta_backend/tasks/evaluations.py index fab619b37d..f132a751a7 100644 --- a/agenta-backend/agenta_backend/tasks/evaluations.py +++ b/agenta-backend/agenta_backend/tasks/evaluations.py @@ -10,6 +10,7 @@ from agenta_backend.services.db_manager import ( fetch_evaluation_by_id, fetch_app_variant_by_id, + fetch_evaluator_config, get_deployment_by_objectid, fetch_testset_by_id, create_new_evaluation_scenario, @@ -38,10 +39,10 @@ def evaluate( app = AppDB(**app_data) # NOTE: This will generate a name in case it's run from cli - ( - evaluators_configs, - evaluator_key_name_mapping, - ) = process_evaluators_configs(app, new_evaluation.evaluators_configs) + # ( + # evaluators_configs, + # evaluator_key_name_mapping, + # ) = process_evaluators_configs(app, new_evaluation.evaluators_configs) testset = loop.run_until_complete(fetch_testset_by_id(testset_id)) new_evaluation_db = loop.run_until_complete(fetch_evaluation_by_id(evaluation_id)) @@ -62,7 +63,8 @@ def evaluate( variant_output = llm_apps_service.get_llm_app_output(uri, data_point) evaluators_results: [EvaluationScenarioResult] = [] - for evaluator_config in evaluators_configs: + for evaluator_config_id in new_evaluation.evaluators_configs: + evaluator_config = fetch_evaluator_config(evaluator_config_id) result = evaluators_service.evaluate( evaluator_config.evaluator_key, data_point["correct_answer"], @@ -106,33 +108,33 @@ def evaluate( ) -def process_evaluators_configs( - app: AppDB, - evaluators_configs: List[EvaluatorConfig], -) -> Tuple[List[EvaluatorConfigDB], Dict[str, str]]: - """Process evaluators_configs to include names if missing and return a mapping of evaluator keys to names.""" +# def process_evaluators_configs( +# app: AppDB, +# evaluators_configs: List[EvaluatorConfig], +# ) -> Tuple[List[EvaluatorConfigDB], Dict[str, str]]: +# """Process evaluators_configs to include names if missing and return a mapping of evaluator keys to names.""" - processed_configs = [] - evaluator_key_name_mapping = {} +# processed_configs = [] +# evaluator_key_name_mapping = {} - for config in evaluators_configs: - # Handle the 'name' field with a default value if it's None - name = getattr(config, "name", f"Evaluator_{uuid.uuid4()}") +# for config in evaluators_configs: +# # Handle the 'name' field with a default value if it's None +# name = getattr(config, "name", f"Evaluator_{uuid.uuid4()}") - processed_config = EvaluatorConfigDB( - user=app.user, - organization=app.organization, - name=name, - app=app, - evaluator_key=config.evaluator_key, - ) +# processed_config = EvaluatorConfigDB( +# user=app.user, +# organization=app.organization, +# name=name, +# app=app, +# evaluator_key=config.evaluator_key, +# ) - processed_configs.append(processed_config) - evaluator_key_name_mapping[ - processed_config.evaluator_key - ] = processed_config.name +# processed_configs.append(processed_config) +# evaluator_key_name_mapping[ +# processed_config.evaluator_key +# ] = processed_config.name - return processed_configs, evaluator_key_name_mapping +# return processed_configs, evaluator_key_name_mapping def aggregate_evaluator_results( From 3789917c671f09e5abf7d3700a00aeaa30be1269 Mon Sep 17 00:00:00 2001 From: Akrem Abayed Date: Fri, 22 Dec 2023 12:57:18 +0100 Subject: [PATCH 056/414] adjust client payload when creating a new evaluation --- 
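Note: after the previous patch, NewEvaluation.evaluators_configs carries the ids of stored EvaluatorConfig documents (created beforehand through POST /evaluators/configs/) instead of inline config objects, and the diff below hardcodes sample ids for local testing. A sketch of the same request without the hardcoded values; the host argument and route path are assumptions for illustration rather than values taken from client.py:

    import requests

    def start_evaluation(host, app_id, variant_ids, testset_id, evaluator_config_ids):
        # Payload shape expected by the create-evaluation route after this change.
        payload = {
            "app_id": app_id,
            "variant_ids": list(variant_ids),
            "evaluators_configs": list(evaluator_config_ids),  # ids of stored EvaluatorConfig documents
            "testset_id": testset_id,
        }
        response = requests.post(f"{host}/evaluations/", json=payload)
        response.raise_for_status()
        return response.json()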
agenta-cli/agenta/client/client.py | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/agenta-cli/agenta/client/client.py b/agenta-cli/agenta/client/client.py index dc5cbdcd9b..d5e4547f74 100644 --- a/agenta-cli/agenta/client/client.py +++ b/agenta-cli/agenta/client/client.py @@ -538,20 +538,14 @@ def run_evaluation(app_name: str, host: str, api_key: str = None) -> str: api_key (str): The API key to use for the request. """ - evaluators_configs = [ - { - "evaluator_key": "auto_similarity_match", - } - ] - new_evaluation = { - "app_id": "6581e69500afd8dfe404f765", + "app_id": "6583e552eb855930ec6b1bdd", "variant_ids": [ - "6581e69500afd8dfe404f76b", + "6583e552eb855930ec6b1be3", # "6570aed55d0eaff2293088e6" ], - "evaluators_configs": evaluators_configs, - "testset_id": "6581e69500afd8dfe404f76c", + "evaluators_configs": ["65856b2b11d53fcce5894ab6"], + "testset_id": "6583e552eb855930ec6b1be4", } response = requests.post( From 10b17b4b31acae7c803a609cdbc8d0567f2503b8 Mon Sep 17 00:00:00 2001 From: Abram Date: Fri, 22 Dec 2023 13:29:52 +0100 Subject: [PATCH 057/414] Update - modified aggregate evaluator results function --- .../agenta_backend/tasks/evaluations.py | 65 ++++--------------- 1 file changed, 13 insertions(+), 52 deletions(-) diff --git a/agenta-backend/agenta_backend/tasks/evaluations.py b/agenta-backend/agenta_backend/tasks/evaluations.py index f132a751a7..cf8de0ed5c 100644 --- a/agenta-backend/agenta_backend/tasks/evaluations.py +++ b/agenta-backend/agenta_backend/tasks/evaluations.py @@ -13,16 +13,17 @@ fetch_evaluator_config, get_deployment_by_objectid, fetch_testset_by_id, + create_evaluator_config, + create_aggregated_results, create_new_evaluation_scenario, update_evaluation_with_aggregated_results, ) -from agenta_backend.models.api.evaluation_model import EvaluatorConfig, NewEvaluation +from agenta_backend.models.api.evaluation_model import NewEvaluation from agenta_backend.models.db_models import ( AppDB, EvaluationScenarioOutputDB, EvaluationScenarioResult, - EvaluatorConfigDB, AggregatedResultDB, Result, ) @@ -38,12 +39,6 @@ def evaluate( new_evaluation = NewEvaluation(**new_evaluation_data) app = AppDB(**app_data) - # NOTE: This will generate a name in case it's run from cli - # ( - # evaluators_configs, - # evaluator_key_name_mapping, - # ) = process_evaluators_configs(app, new_evaluation.evaluators_configs) - testset = loop.run_until_complete(fetch_testset_by_id(testset_id)) new_evaluation_db = loop.run_until_complete(fetch_evaluation_by_id(evaluation_id)) evaluators_aggregated_data = defaultdict(list) @@ -97,10 +92,11 @@ def evaluate( ) ) - aggregated_results = aggregate_evaluator_results( - app, evaluators_aggregated_data, evaluator_key_name_mapping + aggregated_results = loop.run_until_complete( + aggregate_evaluator_results( + app, evaluators_aggregated_data, evaluator_key_name_mapping + ) ) - updated_evaluation = loop.run_until_complete( update_evaluation_with_aggregated_results( new_evaluation_db.id, aggregated_results @@ -108,53 +104,18 @@ def evaluate( ) -# def process_evaluators_configs( -# app: AppDB, -# evaluators_configs: List[EvaluatorConfig], -# ) -> Tuple[List[EvaluatorConfigDB], Dict[str, str]]: -# """Process evaluators_configs to include names if missing and return a mapping of evaluator keys to names.""" - -# processed_configs = [] -# evaluator_key_name_mapping = {} - -# for config in evaluators_configs: -# # Handle the 'name' field with a default value if it's None -# name = getattr(config, "name", 
f"Evaluator_{uuid.uuid4()}") - -# processed_config = EvaluatorConfigDB( -# user=app.user, -# organization=app.organization, -# name=name, -# app=app, -# evaluator_key=config.evaluator_key, -# ) - -# processed_configs.append(processed_config) -# evaluator_key_name_mapping[ -# processed_config.evaluator_key -# ] = processed_config.name - -# return processed_configs, evaluator_key_name_mapping - - -def aggregate_evaluator_results( - app, evaluators_aggregated_data, evaluator_key_name_mapping -): +async def aggregate_evaluator_results( + app: AppDB, evaluators_aggregated_data: dict, evaluator_key_name_mapping: dict +) -> List[AggregatedResultDB]: aggregated_results = [] for evaluator_key, values in evaluators_aggregated_data.items(): average_value = sum(values) / len(values) if values else 0 evaluator_name = evaluator_key_name_mapping.get( evaluator_key, "Unknown Evaluator" ) - aggregated_result_db = AggregatedResultDB( - evaluator_config=EvaluatorConfigDB( - app=app, - user=app.user, - organization=app.organization, - name=evaluator_name, - evaluator_key=evaluator_key, - ), - result=Result(type="number", value=str(average_value)), + evaluator_config = await create_evaluator_config( + app, app.user, app.organization, evaluator_name, evaluator_key ) + aggregated_result_db = await create_aggregated_results(evaluator_config, str(average_value)) aggregated_results.append(aggregated_result_db) return aggregated_results From 4a4b971e2fc9d461f1f9cb0be0372ec0d054d122 Mon Sep 17 00:00:00 2001 From: Abram Date: Fri, 22 Dec 2023 13:30:22 +0100 Subject: [PATCH 058/414] Update - implemented create aggregated results db function --- .../agenta_backend/services/db_manager.py | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/agenta-backend/agenta_backend/services/db_manager.py b/agenta-backend/agenta_backend/services/db_manager.py index 9e562013a8..950be749c2 100644 --- a/agenta-backend/agenta_backend/services/db_manager.py +++ b/agenta-backend/agenta_backend/services/db_manager.py @@ -19,6 +19,7 @@ ) from agenta_backend.services.json_importer_helper import get_json from agenta_backend.models.db_models import ( + Result, AggregatedResultDB, AppDB, AppVariantDB, @@ -1694,6 +1695,22 @@ async def update_evaluation_with_aggregated_results( return evaluation +async def create_aggregated_results( + evaluator_config: EvaluatorConfigDB, average_value: Any +) -> AggregatedResultDB: + """Create an aggregated results in the database.""" + + aggregated_result = AggregatedResultDB( + evaluator_config=evaluator_config, + result=Result(type="number", value=average_value), + ) + try: + await engine.save(aggregated_result) + return aggregated_result + except Exception as e: + raise e + + async def fetch_evaluators_configs(app_id: str): """Fetches a list of evaluator configurations from the database. 
@@ -1740,7 +1757,6 @@ async def create_evaluator_config( ) -> EvaluatorConfigDB: """Create a new evaluator configuration in the database.""" - new_evaluator_config = EvaluatorConfigDB( app=app, user=user, From f8ea7bc4ae6ba428bf92114fe973fe42b149b458 Mon Sep 17 00:00:00 2001 From: Abram Date: Fri, 22 Dec 2023 16:40:21 +0100 Subject: [PATCH 059/414] Refactor - modified evaluate task and clean up code --- .../agenta_backend/models/db_models.py | 2 +- .../routers/evaluation_router.py | 1 + .../agenta_backend/services/db_manager.py | 38 ++++++++++++++---- .../services/evaluation_service.py | 21 +++++----- .../services/evaluator_manager.py | 8 ++-- .../agenta_backend/tasks/evaluations.py | 40 ++++++++----------- 6 files changed, 63 insertions(+), 47 deletions(-) diff --git a/agenta-backend/agenta_backend/models/db_models.py b/agenta-backend/agenta_backend/models/db_models.py index 8e3f1a1959..98589d05fe 100644 --- a/agenta-backend/agenta_backend/models/db_models.py +++ b/agenta-backend/agenta_backend/models/db_models.py @@ -233,7 +233,7 @@ class EvaluationScenarioResult(EmbeddedModel): class AggregatedResultDB(Model): - evaluator_config: EvaluatorConfigDB = Reference() + evaluator_config: ObjectId result: Result created_at: datetime = Field(default=datetime.utcnow()) updated_at: datetime = Field(default=datetime.utcnow()) diff --git a/agenta-backend/agenta_backend/routers/evaluation_router.py b/agenta-backend/agenta_backend/routers/evaluation_router.py index ba8dbfc24c..79a758eab1 100644 --- a/agenta-backend/agenta_backend/routers/evaluation_router.py +++ b/agenta-backend/agenta_backend/routers/evaluation_router.py @@ -89,6 +89,7 @@ async def create_evaluation( evaluation = await evaluation_service.create_new_evaluation( app_data=app_data, new_evaluation_data=new_evaluation_data, + evaluators_configs=payload.evaluators_configs ) # Start celery task diff --git a/agenta-backend/agenta_backend/services/db_manager.py b/agenta-backend/agenta_backend/services/db_manager.py index 950be749c2..783ffed422 100644 --- a/agenta-backend/agenta_backend/services/db_manager.py +++ b/agenta-backend/agenta_backend/services/db_manager.py @@ -1623,7 +1623,7 @@ async def create_new_evaluation( testset: TestSetDB, status: str, variants: [AppVariantDB], - evaluators_configs: [EvaluatorConfigDB], + evaluators_configs: List[EvaluatorConfigDB], ) -> EvaluationDB: """Create a new evaluation scenario. Returns: @@ -1696,12 +1696,12 @@ async def update_evaluation_with_aggregated_results( async def create_aggregated_results( - evaluator_config: EvaluatorConfigDB, average_value: Any + evaluator_config_id: str, average_value: Any ) -> AggregatedResultDB: """Create an aggregated results in the database.""" aggregated_result = AggregatedResultDB( - evaluator_config=evaluator_config, + evaluator_config=ObjectId(evaluator_config_id), result=Result(type="number", value=average_value), ) try: @@ -1730,15 +1730,15 @@ async def fetch_evaluators_configs(app_id: str): async def fetch_evaluator_config(evaluator_config_id: str): - """Fetches a list of evaluator configurations from the database. + """Fetch evaluator configurations from the database. Returns: - List[EvaluatorConfigDB]: A list of evaluator configuration objects. + EvaluatorConfigDB: the evaluator configuration object. 
""" try: - query_expression = query.eq(EvaluatorConfigDB._id, ObjectId(evaluator_config_id)) - evaluator_config: EvaluatorConfigDB = await engine.find( + query_expression = query.eq(EvaluatorConfigDB.id, ObjectId(evaluator_config_id)) + evaluator_config: EvaluatorConfigDB = await engine.find_one( EvaluatorConfigDB, query_expression ) return evaluator_config @@ -1746,6 +1746,30 @@ async def fetch_evaluator_config(evaluator_config_id: str): raise e +async def fetch_evaluator_config_by_appId( + app_id: str, evaluator_name: str +) -> EvaluatorConfigDB: + """Fetch the evaluator config from the database using the app Id and evaluator name. + + Args: + app_id (str): The app Id + evaluator_name (str): The name of the evaluator + + Returns: + EvaluatorConfigDB: the evaluator configuration object. + """ + + try: + query_expression = query.eq(EvaluatorConfigDB.app, ObjectId(app_id)) & query.eq( + EvaluatorConfigDB.evaluator_key, evaluator_name + ) + evaluator_config: EvaluatorConfigDB = await engine.find_one( + EvaluatorConfigDB, query_expression + ) + return evaluator_config + except Exception as e: + raise e + async def create_evaluator_config( app: AppDB, diff --git a/agenta-backend/agenta_backend/services/evaluation_service.py b/agenta-backend/agenta_backend/services/evaluation_service.py index fefd326db8..fd7251b736 100644 --- a/agenta-backend/agenta_backend/services/evaluation_service.py +++ b/agenta-backend/agenta_backend/services/evaluation_service.py @@ -850,29 +850,28 @@ async def fetch_custom_evaluation_names( async def create_new_evaluation( - app_data: dict, new_evaluation_data: dict + app_data: dict, new_evaluation_data: dict, evaluators_configs: List[str] ) -> Evaluation: """ - Create a new evaluation based on the provided payload and additional arguments. + Create a new evaluation. Args: - payload (NewEvaluation): The evaluation payload. - **user_org_data (dict): Additional keyword arguments, e.g., user id. 
+ app_data (dict): Required app data + new_evaluation_data (dict): Required new evaluation data + evaluators_configs (List[str]): List of evaluator configurations Returns: Evaluation """ - # from agenta_backend.tasks.evaluations import process_evaluators_configs - new_evaluation = NewEvaluation(**new_evaluation_data) app = AppDB(**app_data) - # This will generate a name in case it's run from cli - # evaluators_configs, _ = process_evaluators_configs( - # app, new_evaluation.evaluators_configs - # ) testset = await db_manager.fetch_testset_by_id(new_evaluation.testset_id) + evaluators_configs_db = [ + await db_manager.fetch_evaluator_config(evaluator_config) + for evaluator_config in evaluators_configs + ] evaluation_db = await db_manager.create_new_evaluation( app=app, organization=app.organization, @@ -880,7 +879,7 @@ async def create_new_evaluation( testset=testset, status=EvaluationStatusEnum.EVALUATION_STARTED, variants=new_evaluation.variant_ids, - evaluators_configs=new_evaluation.evaluators_configs, + evaluators_configs=evaluators_configs_db, ) return await converters.evaluation_db_to_pydantic(evaluation_db) diff --git a/agenta-backend/agenta_backend/services/evaluator_manager.py b/agenta-backend/agenta_backend/services/evaluator_manager.py index 80fbf37afd..049e95c4d4 100644 --- a/agenta-backend/agenta_backend/services/evaluator_manager.py +++ b/agenta-backend/agenta_backend/services/evaluator_manager.py @@ -25,16 +25,16 @@ async def get_evaluators_configs(app_id: str): return await fetch_evaluators_configs(app_id) -async def get_evaluator_config(app_id: str): +async def get_evaluator_config(config_id: str): """Get evaluators configs by app_id. Args: - app_id (str): The ID of the app. + config_id (str): The ID of the evaluator configuration. Returns: - List[EvaluatorConfigDB]: A list of evaluator configuration objects. + EvaluatorConfigDB: the evaluator configuration object. 
""" - return await fetch_evaluator_config(app_id) + return await fetch_evaluator_config(config_id) async def create_evaluator_config( diff --git a/agenta-backend/agenta_backend/tasks/evaluations.py b/agenta-backend/agenta_backend/tasks/evaluations.py index cf8de0ed5c..c3bc652bcc 100644 --- a/agenta-backend/agenta_backend/tasks/evaluations.py +++ b/agenta-backend/agenta_backend/tasks/evaluations.py @@ -1,10 +1,7 @@ -from collections import defaultdict -from bson import ObjectId -from celery import shared_task import asyncio -from datetime import datetime -from typing import List, Tuple, Dict -import uuid +from typing import List +from celery import shared_task +from collections import defaultdict from agenta_backend.services import llm_apps_service from agenta_backend.services.db_manager import ( @@ -13,13 +10,11 @@ fetch_evaluator_config, get_deployment_by_objectid, fetch_testset_by_id, - create_evaluator_config, create_aggregated_results, create_new_evaluation_scenario, + fetch_evaluator_config_by_appId, update_evaluation_with_aggregated_results, ) -from agenta_backend.models.api.evaluation_model import NewEvaluation - from agenta_backend.models.db_models import ( AppDB, EvaluationScenarioOutputDB, @@ -27,8 +22,8 @@ AggregatedResultDB, Result, ) - from agenta_backend.services import evaluators_service +from agenta_backend.models.api.evaluation_model import NewEvaluation @shared_task(queue="agenta_backend.tasks.evaluations.evaluate") @@ -36,14 +31,14 @@ def evaluate( app_data: dict, new_evaluation_data: dict, evaluation_id: str, testset_id: str ): loop = asyncio.get_event_loop() - new_evaluation = NewEvaluation(**new_evaluation_data) app = AppDB(**app_data) + evaluation = NewEvaluation(**new_evaluation_data) testset = loop.run_until_complete(fetch_testset_by_id(testset_id)) new_evaluation_db = loop.run_until_complete(fetch_evaluation_by_id(evaluation_id)) evaluators_aggregated_data = defaultdict(list) - for variant_id in new_evaluation.variant_ids: + for variant_id in evaluation.variant_ids: variant_id = str(variant_id) app_variant_db = loop.run_until_complete(fetch_app_variant_by_id(variant_id)) @@ -58,8 +53,10 @@ def evaluate( variant_output = llm_apps_service.get_llm_app_output(uri, data_point) evaluators_results: [EvaluationScenarioResult] = [] - for evaluator_config_id in new_evaluation.evaluators_configs: - evaluator_config = fetch_evaluator_config(evaluator_config_id) + for evaluator_config_id in evaluation.evaluators_configs: + evaluator_config = loop.run_until_complete( + fetch_evaluator_config(evaluator_config_id) + ) result = evaluators_service.evaluate( evaluator_config.evaluator_key, data_point["correct_answer"], @@ -93,9 +90,7 @@ def evaluate( ) aggregated_results = loop.run_until_complete( - aggregate_evaluator_results( - app, evaluators_aggregated_data, evaluator_key_name_mapping - ) + aggregate_evaluator_results(app, evaluators_aggregated_data) ) updated_evaluation = loop.run_until_complete( update_evaluation_with_aggregated_results( @@ -105,17 +100,14 @@ def evaluate( async def aggregate_evaluator_results( - app: AppDB, evaluators_aggregated_data: dict, evaluator_key_name_mapping: dict + app: AppDB, evaluators_aggregated_data: dict ) -> List[AggregatedResultDB]: aggregated_results = [] for evaluator_key, values in evaluators_aggregated_data.items(): average_value = sum(values) / len(values) if values else 0 - evaluator_name = evaluator_key_name_mapping.get( - evaluator_key, "Unknown Evaluator" - ) - evaluator_config = await create_evaluator_config( - app, app.user, 
app.organization, evaluator_name, evaluator_key + evaluator_config = await fetch_evaluator_config_by_appId(app.id, evaluator_key) + aggregated_result_db = await create_aggregated_results( + str(evaluator_config.id), str(average_value) ) - aggregated_result_db = await create_aggregated_results(evaluator_config, str(average_value)) aggregated_results.append(aggregated_result_db) return aggregated_results From bd3048363ac455f07ccdb3220206db031eb82246 Mon Sep 17 00:00:00 2001 From: Abram Date: Fri, 22 Dec 2023 17:04:17 +0100 Subject: [PATCH 060/414] Refactor - change AggregatedResult db model to embedded model --- .../agenta_backend/models/converters.py | 12 +++-------- .../agenta_backend/models/db_models.py | 9 ++------- .../agenta_backend/services/db_manager.py | 20 ++----------------- .../agenta_backend/tasks/evaluations.py | 12 +++++------ 4 files changed, 13 insertions(+), 40 deletions(-) diff --git a/agenta-backend/agenta_backend/models/converters.py b/agenta-backend/agenta_backend/models/converters.py index a85cbdc868..b567321f57 100644 --- a/agenta-backend/agenta_backend/models/converters.py +++ b/agenta-backend/agenta_backend/models/converters.py @@ -17,7 +17,7 @@ EvaluationScenarioDB, VariantBaseDB, UserDB, - AggregatedResultDB, + AggregatedResult, ) from agenta_backend.models.api.api_models import ( AppVariant, @@ -83,16 +83,10 @@ async def evaluation_db_to_pydantic( ) -def aggregated_result_to_pydantic(results: List[AggregatedResultDB]) -> List[dict]: +def aggregated_result_to_pydantic(results: List[AggregatedResult]) -> List[dict]: list_of_aggregated_results = [] for aggregated_result in results: - result_dict = { - "evaluator_config": aggregated_result.evaluator_config.json( - exclude={"created_at", "updated_at"} - ), - "result": aggregated_result.result.json(), - } - list_of_aggregated_results.append(result_dict) + list_of_aggregated_results.append(aggregated_result.dict()) return list_of_aggregated_results diff --git a/agenta-backend/agenta_backend/models/db_models.py b/agenta-backend/agenta_backend/models/db_models.py index 98589d05fe..ed23a959b1 100644 --- a/agenta-backend/agenta_backend/models/db_models.py +++ b/agenta-backend/agenta_backend/models/db_models.py @@ -232,14 +232,9 @@ class EvaluationScenarioResult(EmbeddedModel): result: Result -class AggregatedResultDB(Model): +class AggregatedResult(EmbeddedModel): evaluator_config: ObjectId result: Result - created_at: datetime = Field(default=datetime.utcnow()) - updated_at: datetime = Field(default=datetime.utcnow()) - - class Config: - collection = "aggregated_results" class EvaluationScenarioInputDB(EmbeddedModel): @@ -261,7 +256,7 @@ class EvaluationDB(Model): testset: TestSetDB = Reference() variants: List[ObjectId] evaluators_configs: List[EvaluatorConfigDB] - aggregated_results: List[AggregatedResultDB] + aggregated_results: List[AggregatedResult] created_at: datetime = Field(default=datetime.utcnow()) updated_at: datetime = Field(default=datetime.utcnow()) diff --git a/agenta-backend/agenta_backend/services/db_manager.py b/agenta-backend/agenta_backend/services/db_manager.py index 783ffed422..a1f724d951 100644 --- a/agenta-backend/agenta_backend/services/db_manager.py +++ b/agenta-backend/agenta_backend/services/db_manager.py @@ -20,7 +20,7 @@ from agenta_backend.services.json_importer_helper import get_json from agenta_backend.models.db_models import ( Result, - AggregatedResultDB, + AggregatedResult, AppDB, AppVariantDB, EvaluationScenarioInputDB, @@ -1680,7 +1680,7 @@ async def create_new_evaluation_scenario( 
async def update_evaluation_with_aggregated_results( - evaluation_id: ObjectId, aggregated_results: List[AggregatedResultDB] + evaluation_id: ObjectId, aggregated_results: List[AggregatedResult] ) -> EvaluationDB: evaluation = await engine.find_one(EvaluationDB, EvaluationDB.id == evaluation_id) @@ -1695,22 +1695,6 @@ async def update_evaluation_with_aggregated_results( return evaluation -async def create_aggregated_results( - evaluator_config_id: str, average_value: Any -) -> AggregatedResultDB: - """Create an aggregated results in the database.""" - - aggregated_result = AggregatedResultDB( - evaluator_config=ObjectId(evaluator_config_id), - result=Result(type="number", value=average_value), - ) - try: - await engine.save(aggregated_result) - return aggregated_result - except Exception as e: - raise e - - async def fetch_evaluators_configs(app_id: str): """Fetches a list of evaluator configurations from the database. diff --git a/agenta-backend/agenta_backend/tasks/evaluations.py b/agenta-backend/agenta_backend/tasks/evaluations.py index c3bc652bcc..6b2e72d445 100644 --- a/agenta-backend/agenta_backend/tasks/evaluations.py +++ b/agenta-backend/agenta_backend/tasks/evaluations.py @@ -10,7 +10,6 @@ fetch_evaluator_config, get_deployment_by_objectid, fetch_testset_by_id, - create_aggregated_results, create_new_evaluation_scenario, fetch_evaluator_config_by_appId, update_evaluation_with_aggregated_results, @@ -19,7 +18,7 @@ AppDB, EvaluationScenarioOutputDB, EvaluationScenarioResult, - AggregatedResultDB, + AggregatedResult, Result, ) from agenta_backend.services import evaluators_service @@ -101,13 +100,14 @@ def evaluate( async def aggregate_evaluator_results( app: AppDB, evaluators_aggregated_data: dict -) -> List[AggregatedResultDB]: +) -> List[AggregatedResult]: aggregated_results = [] for evaluator_key, values in evaluators_aggregated_data.items(): average_value = sum(values) / len(values) if values else 0 evaluator_config = await fetch_evaluator_config_by_appId(app.id, evaluator_key) - aggregated_result_db = await create_aggregated_results( - str(evaluator_config.id), str(average_value) + aggregated_result = AggregatedResult( + evaluator_config=evaluator_config.id, + result=Result(type="number", value=average_value), ) - aggregated_results.append(aggregated_result_db) + aggregated_results.append(aggregated_result) return aggregated_results From d3b5638c3e1203e00925b0cc1a894225764964c1 Mon Sep 17 00:00:00 2001 From: Akrem Abayed Date: Fri, 22 Dec 2023 18:00:58 +0100 Subject: [PATCH 061/414] refactor --- .../routers/evaluators_router.py | 26 ++++++++++++++----- .../services/evaluator_manager.py | 23 ++++++---------- 2 files changed, 28 insertions(+), 21 deletions(-) diff --git a/agenta-backend/agenta_backend/routers/evaluators_router.py b/agenta-backend/agenta_backend/routers/evaluators_router.py index 625f21a19f..411ed2e94d 100644 --- a/agenta-backend/agenta_backend/routers/evaluators_router.py +++ b/agenta-backend/agenta_backend/routers/evaluators_router.py @@ -16,10 +16,7 @@ db_manager, ) -from agenta_backend.services.evaluator_manager import ( - get_evaluators_configs, - create_evaluator_config, -) +from agenta_backend.services import evaluator_manager from agenta_backend.utils.common import ( check_access_to_app @@ -72,7 +69,7 @@ async def get_evaluator_configs( Returns: List[EvaluatorConfigDB]: A list of evaluator configuration objects. 
""" - return await get_evaluators_configs(app_id) + return await evaluator_manager.get_evaluators_configs(app_id) @router.post("/configs/", response_model=EvaluatorConfig) @@ -88,9 +85,26 @@ async def create_new_evaluator_config( List[EvaluatorConfigDB]: A list of evaluator configuration objects. """ - return await create_evaluator_config( + return await evaluator_manager.create_evaluator_config( app_id=payload.app_id, name=payload.name, evaluator_key=payload.evaluator_key, settings_values=payload.settings_values, ) + + +@router.delete("/configs/{evaluator_id}/", response_model=bool) +async def delete_evaluator_config(evaluator_id: str): + """Endpoint to delete a specific evaluator configuration. + + Args: + evaluator_id (str): The unique identifier of the evaluator configuration. + + Returns: + bool: True if deletion was successful, False otherwise. + """ + try: + success = await evaluator_manager.delete_evaluator_config(evaluator_id) + return success + except Exception as e: + raise HTTPException(status_code=500, detail=f"Error deleting evaluator configuration: {str(e)}") diff --git a/agenta-backend/agenta_backend/services/evaluator_manager.py b/agenta-backend/agenta_backend/services/evaluator_manager.py index 049e95c4d4..ee99f1a45f 100644 --- a/agenta-backend/agenta_backend/services/evaluator_manager.py +++ b/agenta-backend/agenta_backend/services/evaluator_manager.py @@ -1,13 +1,6 @@ from typing import Any, Dict, Optional -from agenta_backend.services.db_manager import ( - fetch_evaluator_config, - fetch_evaluators_configs, - create_evaluator_config as db_manager_create_evaluator_config, - update_evaluator_config, - delete_evaluator_config, - fetch_app_by_id -) +from agenta_backend.services import db_manager from agenta_backend.models.db_models import EvaluatorConfigDB @@ -22,7 +15,7 @@ async def get_evaluators_configs(app_id: str): Returns: List[EvaluatorConfigDB]: A list of evaluator configuration objects. """ - return await fetch_evaluators_configs(app_id) + return await db_manager.fetch_evaluators_configs(app_id) async def get_evaluator_config(config_id: str): @@ -34,7 +27,7 @@ async def get_evaluator_config(config_id: str): Returns: EvaluatorConfigDB: the evaluator configuration object. """ - return await fetch_evaluator_config(config_id) + return await db_manager.fetch_evaluator_config(config_id) async def create_evaluator_config( @@ -55,8 +48,8 @@ async def create_evaluator_config( Returns: EvaluatorConfigDB: The newly created evaluator configuration object. """ - app = await fetch_app_by_id(app_id) - return await db_manager_create_evaluator_config( + app = await db_manager.fetch_app_by_id(app_id) + return await db_manager.create_evaluator_config( app=app, organization=app.organization, user=app.user, @@ -79,10 +72,10 @@ async def update_evaluator_config( Returns: EvaluatorConfigDB: The updated evaluator configuration object. """ - return await update_evaluator_config(evaluator_config_id, updates) + return await db_manager.update_evaluator_config(evaluator_config_id, updates) -async def remove_evaluator_config(evaluator_config_id: str) -> bool: +async def delete_evaluator_config(evaluator_config_id: str) -> bool: """ Delete an evaluator configuration. @@ -92,4 +85,4 @@ async def remove_evaluator_config(evaluator_config_id: str) -> bool: Returns: bool: True if the deletion was successful, False otherwise. 
""" - return await delete_evaluator_config(evaluator_config_id) + return await db_manager.delete_evaluator_config(evaluator_config_id) From dfd1a0e266ad9037386aa7f5b7bbd739bad77b82 Mon Sep 17 00:00:00 2001 From: Akrem Abayed Date: Fri, 22 Dec 2023 18:01:46 +0100 Subject: [PATCH 062/414] delete evaluator config --- agenta-backend/agenta_backend/services/db_manager.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/agenta-backend/agenta_backend/services/db_manager.py b/agenta-backend/agenta_backend/services/db_manager.py index a1f724d951..7b2bfba7a4 100644 --- a/agenta-backend/agenta_backend/services/db_manager.py +++ b/agenta-backend/agenta_backend/services/db_manager.py @@ -1804,9 +1804,7 @@ async def delete_evaluator_config(evaluator_config_id: str) -> bool: assert evaluator_config_id is not None, "Evaluator Config ID cannot be None" try: - delete_result = await engine.find_one_and_delete( - EvaluatorConfigDB, query.eq("_id", ObjectId(evaluator_config_id)) - ) + delete_result = remove_document_using_driver(str(evaluator_config_id), "evaluators_configs") return delete_result is not None except Exception as e: raise e From 7bacb393bad916ef14b80861aeb9e38876d0e5cc Mon Sep 17 00:00:00 2001 From: Abram Date: Fri, 22 Dec 2023 20:44:52 +0100 Subject: [PATCH 063/414] Refactor - type evaluators_service functions --- .../services/evaluators_service.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/agenta-backend/agenta_backend/services/evaluators_service.py b/agenta-backend/agenta_backend/services/evaluators_service.py index 07bb6b202b..74cd867c00 100644 --- a/agenta-backend/agenta_backend/services/evaluators_service.py +++ b/agenta-backend/agenta_backend/services/evaluators_service.py @@ -1,14 +1,15 @@ import re +from typing import Any -def auto_exact_match(variant_output, correct_answer): +def auto_exact_match(variant_output: str, correct_answer: str): if variant_output == correct_answer: return 1 else: return 0 -def auto_similarity_match(variant_output, correct_answer): +def auto_similarity_match(variant_output: str, correct_answer: str): set1 = set(variant_output.split()) set2 = set(correct_answer.split()) intersect = set1.intersection(set2) @@ -18,22 +19,21 @@ def auto_similarity_match(variant_output, correct_answer): return similarity -def auto_regex_test(test_string, regex, should_match): +def auto_regex_test(test_string: str, regex: Any, should_match: bool): re_pattern = re.compile(regex, re.IGNORECASE) result = bool(re_pattern.search(test_string)) return result == should_match def evaluate( - evaluator_name, - correct_answer, - variant_output, - *additional_args, - **additional_kwargs, + evaluator_name: str, + correct_answer: str, + variant_output :str, + *additional_args: dict, + **additional_kwargs: dict, ): try: evaluation_function = globals()[evaluator_name] - return evaluation_function( correct_answer, variant_output, *additional_args, **additional_kwargs ) From 6d5db97f898736a7f39278d528cf0cc09e3b0476 Mon Sep 17 00:00:00 2001 From: Akrem Abayed Date: Fri, 22 Dec 2023 23:06:19 +0100 Subject: [PATCH 064/414] another schema update. 
hopefully the last --- agenta-backend/agenta_backend/models/db_models.py | 6 +++--- agenta-backend/agenta_backend/services/db_manager.py | 2 +- .../agenta_backend/services/evaluation_service.py | 7 ++----- agenta-backend/agenta_backend/tasks/evaluations.py | 4 +++- 4 files changed, 9 insertions(+), 10 deletions(-) diff --git a/agenta-backend/agenta_backend/models/db_models.py b/agenta-backend/agenta_backend/models/db_models.py index ed23a959b1..b061fb0744 100644 --- a/agenta-backend/agenta_backend/models/db_models.py +++ b/agenta-backend/agenta_backend/models/db_models.py @@ -228,7 +228,7 @@ class Result(EmbeddedModel): class EvaluationScenarioResult(EmbeddedModel): - evaluator_key: str + evaluator_config: ObjectId result: Result @@ -255,7 +255,7 @@ class EvaluationDB(Model): status: str = Field(default="EVALUATION_INITIALIZED") testset: TestSetDB = Reference() variants: List[ObjectId] - evaluators_configs: List[EvaluatorConfigDB] + evaluators_configs: List[ObjectId] aggregated_results: List[AggregatedResult] created_at: datetime = Field(default=datetime.utcnow()) updated_at: datetime = Field(default=datetime.utcnow()) @@ -273,7 +273,7 @@ class EvaluationScenarioDB(Model): correct_answer: Optional[str] is_pinned: Optional[bool] note: Optional[str] - evaluators_configs: List[EvaluatorConfigDB] + evaluators_configs: List[ObjectId] results: List[EvaluationScenarioResult] created_at: datetime = Field(default=datetime.utcnow()) updated_at: datetime = Field(default=datetime.utcnow()) diff --git a/agenta-backend/agenta_backend/services/db_manager.py b/agenta-backend/agenta_backend/services/db_manager.py index 7b2bfba7a4..efa1e34f4b 100644 --- a/agenta-backend/agenta_backend/services/db_manager.py +++ b/agenta-backend/agenta_backend/services/db_manager.py @@ -1623,7 +1623,7 @@ async def create_new_evaluation( testset: TestSetDB, status: str, variants: [AppVariantDB], - evaluators_configs: List[EvaluatorConfigDB], + evaluators_configs: List[str], ) -> EvaluationDB: """Create a new evaluation scenario. 
Returns: diff --git a/agenta-backend/agenta_backend/services/evaluation_service.py b/agenta-backend/agenta_backend/services/evaluation_service.py index fd7251b736..5d8c2b225a 100644 --- a/agenta-backend/agenta_backend/services/evaluation_service.py +++ b/agenta-backend/agenta_backend/services/evaluation_service.py @@ -868,10 +868,7 @@ async def create_new_evaluation( app = AppDB(**app_data) testset = await db_manager.fetch_testset_by_id(new_evaluation.testset_id) - evaluators_configs_db = [ - await db_manager.fetch_evaluator_config(evaluator_config) - for evaluator_config in evaluators_configs - ] + evaluation_db = await db_manager.create_new_evaluation( app=app, organization=app.organization, @@ -879,7 +876,7 @@ async def create_new_evaluation( testset=testset, status=EvaluationStatusEnum.EVALUATION_STARTED, variants=new_evaluation.variant_ids, - evaluators_configs=evaluators_configs_db, + evaluators_configs=new_evaluation.evaluators_configs, ) return await converters.evaluation_db_to_pydantic(evaluation_db) diff --git a/agenta-backend/agenta_backend/tasks/evaluations.py b/agenta-backend/agenta_backend/tasks/evaluations.py index 6b2e72d445..32b609b875 100644 --- a/agenta-backend/agenta_backend/tasks/evaluations.py +++ b/agenta-backend/agenta_backend/tasks/evaluations.py @@ -1,5 +1,6 @@ import asyncio from typing import List +from bson import ObjectId from celery import shared_task from collections import defaultdict @@ -56,6 +57,7 @@ def evaluate( evaluator_config = loop.run_until_complete( fetch_evaluator_config(evaluator_config_id) ) + result = evaluators_service.evaluate( evaluator_config.evaluator_key, data_point["correct_answer"], @@ -63,7 +65,7 @@ def evaluate( ) result_object = EvaluationScenarioResult( - evaluator_key=evaluator_config.evaluator_key, + evaluator_config=evaluator_config.id, result=Result(type="number", value=result), ) evaluators_results.append(result_object) From 0d1a3787d145b3caec6fc8e077d27526117e6a1c Mon Sep 17 00:00:00 2001 From: Akrem Abayed Date: Sun, 24 Dec 2023 08:48:06 +0100 Subject: [PATCH 065/414] fix auto similarity match --- .../agenta_backend/services/evaluators_service.py | 15 ++++++++++----- .../agenta_backend/tasks/evaluations.py | 1 + 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/agenta-backend/agenta_backend/services/evaluators_service.py b/agenta-backend/agenta_backend/services/evaluators_service.py index 07bb6b202b..518619a30e 100644 --- a/agenta-backend/agenta_backend/services/evaluators_service.py +++ b/agenta-backend/agenta_backend/services/evaluators_service.py @@ -1,21 +1,25 @@ import re +from langchain.chains import LLMChain +from langchain.llms import OpenAI +from langchain.prompts import PromptTemplate - -def auto_exact_match(variant_output, correct_answer): +def auto_exact_match(variant_output, correct_answer, settings_values): if variant_output == correct_answer: return 1 else: return 0 -def auto_similarity_match(variant_output, correct_answer): +def auto_similarity_match(variant_output, correct_answer, settings_values): set1 = set(variant_output.split()) set2 = set(correct_answer.split()) intersect = set1.intersection(set2) union = set1.union(set2) similarity = len(intersect) / len(union) - return similarity + + is_similar = True if similarity > settings_values["similarity_threshold"] else False + return is_similar def auto_regex_test(test_string, regex, should_match): @@ -28,6 +32,7 @@ def evaluate( evaluator_name, correct_answer, variant_output, + settings_values, *additional_args, **additional_kwargs, ): @@ -35,7 
+40,7 @@ def evaluate( evaluation_function = globals()[evaluator_name] return evaluation_function( - correct_answer, variant_output, *additional_args, **additional_kwargs + correct_answer, variant_output, settings_values, *additional_args, **additional_kwargs ) except KeyError: raise ValueError(f"Evaluation method '{evaluator_name}' not found.") diff --git a/agenta-backend/agenta_backend/tasks/evaluations.py b/agenta-backend/agenta_backend/tasks/evaluations.py index 32b609b875..91db38978e 100644 --- a/agenta-backend/agenta_backend/tasks/evaluations.py +++ b/agenta-backend/agenta_backend/tasks/evaluations.py @@ -62,6 +62,7 @@ def evaluate( evaluator_config.evaluator_key, data_point["correct_answer"], variant_output, + evaluator_config.settings_values ) result_object = EvaluationScenarioResult( From 2246aa86b101f60a63260edee87ec495d10da394 Mon Sep 17 00:00:00 2001 From: Abram Date: Sun, 24 Dec 2023 09:19:40 +0100 Subject: [PATCH 066/414] Update - added bool to value union --- agenta-backend/agenta_backend/models/db_models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/agenta-backend/agenta_backend/models/db_models.py b/agenta-backend/agenta_backend/models/db_models.py index ed23a959b1..37ebd6f3e3 100644 --- a/agenta-backend/agenta_backend/models/db_models.py +++ b/agenta-backend/agenta_backend/models/db_models.py @@ -224,7 +224,7 @@ class Config: class Result(EmbeddedModel): type: str - value: Union[str, float, int] + value: Union[str, float, int, bool] class EvaluationScenarioResult(EmbeddedModel): From 9e7d9652133cd8f3e399eeb31396be73d195dfd4 Mon Sep 17 00:00:00 2001 From: Abram Date: Sun, 24 Dec 2023 09:20:31 +0100 Subject: [PATCH 067/414] Update - modified evaluate task to accomodate auto_regex_test --- .../agenta_backend/tasks/evaluations.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/agenta-backend/agenta_backend/tasks/evaluations.py b/agenta-backend/agenta_backend/tasks/evaluations.py index 6b2e72d445..aaf71c6668 100644 --- a/agenta-backend/agenta_backend/tasks/evaluations.py +++ b/agenta-backend/agenta_backend/tasks/evaluations.py @@ -56,11 +56,18 @@ def evaluate( evaluator_config = loop.run_until_complete( fetch_evaluator_config(evaluator_config_id) ) - result = evaluators_service.evaluate( - evaluator_config.evaluator_key, - data_point["correct_answer"], - variant_output, - ) + if evaluator_config.evaluator_key in ["auto_regex_test"]: + result = evaluators_service.auto_regex_test( + variant_output, + evaluator_config.settings_values.get("regex_pattern"), + evaluator_config.settings_values.get("regex_should_match"), + ) # result will come out as a bool (True or False) + else: + result = evaluators_service.evaluate( + evaluator_config.evaluator_key, + data_point["correct_answer"], + variant_output, + ) result_object = EvaluationScenarioResult( evaluator_key=evaluator_config.evaluator_key, From cac2df5b1c52aa85fbef75b7611f48a145ac79cb Mon Sep 17 00:00:00 2001 From: Abram Date: Sun, 24 Dec 2023 09:21:00 +0100 Subject: [PATCH 068/414] Update - modified module to hint-type args --- agenta-backend/agenta_backend/services/evaluators_service.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/agenta-backend/agenta_backend/services/evaluators_service.py b/agenta-backend/agenta_backend/services/evaluators_service.py index 74cd867c00..b4ce089e72 100644 --- a/agenta-backend/agenta_backend/services/evaluators_service.py +++ b/agenta-backend/agenta_backend/services/evaluators_service.py @@ -29,7 +29,7 @@ def evaluate( 
evaluator_name: str, correct_answer: str, variant_output :str, - *additional_args: dict, + *additional_args: tuple, **additional_kwargs: dict, ): try: From a983aeb346265c9486ba41149ffe3b2e39806849 Mon Sep 17 00:00:00 2001 From: Abram Date: Fri, 22 Dec 2023 20:44:52 +0100 Subject: [PATCH 069/414] Refactor - type evaluators_service functions --- .../services/evaluators_service.py | 21 ++++++++++--------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/agenta-backend/agenta_backend/services/evaluators_service.py b/agenta-backend/agenta_backend/services/evaluators_service.py index 518619a30e..5d2762cdf7 100644 --- a/agenta-backend/agenta_backend/services/evaluators_service.py +++ b/agenta-backend/agenta_backend/services/evaluators_service.py @@ -1,16 +1,18 @@ import re +from typing import Any from langchain.chains import LLMChain from langchain.llms import OpenAI from langchain.prompts import PromptTemplate -def auto_exact_match(variant_output, correct_answer, settings_values): + +def auto_exact_match(variant_output: str, correct_answer: str, settings_values: dict): if variant_output == correct_answer: return 1 else: return 0 -def auto_similarity_match(variant_output, correct_answer, settings_values): +def auto_similarity_match(variant_output: str, correct_answer: str, settings_values: dict): set1 = set(variant_output.split()) set2 = set(correct_answer.split()) intersect = set1.intersection(set2) @@ -22,23 +24,22 @@ def auto_similarity_match(variant_output, correct_answer, settings_values): return is_similar -def auto_regex_test(test_string, regex, should_match): +def auto_regex_test(test_string: str, regex: Any, should_match: bool): re_pattern = re.compile(regex, re.IGNORECASE) result = bool(re_pattern.search(test_string)) return result == should_match def evaluate( - evaluator_name, - correct_answer, - variant_output, - settings_values, - *additional_args, - **additional_kwargs, + evaluator_name: str, + correct_answer: str, + variant_output :str, + settings_values: dict, + *additional_args: dict, + **additional_kwargs: dict, ): try: evaluation_function = globals()[evaluator_name] - return evaluation_function( correct_answer, variant_output, settings_values, *additional_args, **additional_kwargs ) From 110ba057fdc5c66ec9a319529008dd9c3fa45116 Mon Sep 17 00:00:00 2001 From: Abram Date: Sun, 24 Dec 2023 09:19:40 +0100 Subject: [PATCH 070/414] Update - added bool to value union --- agenta-backend/agenta_backend/models/db_models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/agenta-backend/agenta_backend/models/db_models.py b/agenta-backend/agenta_backend/models/db_models.py index b061fb0744..b3b1870d1f 100644 --- a/agenta-backend/agenta_backend/models/db_models.py +++ b/agenta-backend/agenta_backend/models/db_models.py @@ -224,7 +224,7 @@ class Config: class Result(EmbeddedModel): type: str - value: Union[str, float, int] + value: Union[str, float, int, bool] class EvaluationScenarioResult(EmbeddedModel): From 29d797a2e824e4c855def45fa6f86a8d0b6fad12 Mon Sep 17 00:00:00 2001 From: Abram Date: Sun, 24 Dec 2023 09:20:31 +0100 Subject: [PATCH 071/414] Update - modified evaluate task to accomodate auto_regex_test --- .../agenta_backend/tasks/evaluations.py | 20 ++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/agenta-backend/agenta_backend/tasks/evaluations.py b/agenta-backend/agenta_backend/tasks/evaluations.py index 91db38978e..0f379ff041 100644 --- a/agenta-backend/agenta_backend/tasks/evaluations.py +++ 
b/agenta-backend/agenta_backend/tasks/evaluations.py @@ -57,13 +57,19 @@ def evaluate( evaluator_config = loop.run_until_complete( fetch_evaluator_config(evaluator_config_id) ) - - result = evaluators_service.evaluate( - evaluator_config.evaluator_key, - data_point["correct_answer"], - variant_output, - evaluator_config.settings_values - ) + if evaluator_config.evaluator_key == "auto_regex_test": + result = evaluators_service.auto_regex_test( + variant_output, + evaluator_config.settings_values.get("regex_pattern"), + evaluator_config.settings_values.get("regex_should_match"), + ) # result will come out as a bool (True or False) + else: + result = evaluators_service.evaluate( + evaluator_config.evaluator_key, + data_point["correct_answer"], + variant_output, + evaluator_config.settings_values + ) result_object = EvaluationScenarioResult( evaluator_config=evaluator_config.id, From a4914f9e0e471ad48eca935bd36940a797b22914 Mon Sep 17 00:00:00 2001 From: Abram Date: Sun, 24 Dec 2023 09:21:00 +0100 Subject: [PATCH 072/414] Update - modified module to hint-type args --- agenta-backend/agenta_backend/services/evaluators_service.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/agenta-backend/agenta_backend/services/evaluators_service.py b/agenta-backend/agenta_backend/services/evaluators_service.py index 5d2762cdf7..f6129fdf85 100644 --- a/agenta-backend/agenta_backend/services/evaluators_service.py +++ b/agenta-backend/agenta_backend/services/evaluators_service.py @@ -35,7 +35,7 @@ def evaluate( correct_answer: str, variant_output :str, settings_values: dict, - *additional_args: dict, + *additional_args: tuple, **additional_kwargs: dict, ): try: From b19e838c1b7e966b2b6038023e9a7ee62d0c31c2 Mon Sep 17 00:00:00 2001 From: Akrem Abayed Date: Sun, 24 Dec 2023 10:45:26 +0100 Subject: [PATCH 073/414] format --- .../agenta_backend/routers/evaluators_router.py | 8 ++++---- agenta-backend/agenta_backend/services/db_manager.py | 4 +++- .../agenta_backend/services/evaluator_manager.py | 2 +- 3 files changed, 8 insertions(+), 6 deletions(-) diff --git a/agenta-backend/agenta_backend/routers/evaluators_router.py b/agenta-backend/agenta_backend/routers/evaluators_router.py index 411ed2e94d..cc3b933b0a 100644 --- a/agenta-backend/agenta_backend/routers/evaluators_router.py +++ b/agenta-backend/agenta_backend/routers/evaluators_router.py @@ -18,9 +18,7 @@ from agenta_backend.services import evaluator_manager -from agenta_backend.utils.common import ( - check_access_to_app -) +from agenta_backend.utils.common import check_access_to_app if os.environ["FEATURE_FLAG"] in ["cloud", "ee"]: from agenta_backend.commons.services.selectors import ( # noqa pylint: disable-all @@ -107,4 +105,6 @@ async def delete_evaluator_config(evaluator_id: str): success = await evaluator_manager.delete_evaluator_config(evaluator_id) return success except Exception as e: - raise HTTPException(status_code=500, detail=f"Error deleting evaluator configuration: {str(e)}") + raise HTTPException( + status_code=500, detail=f"Error deleting evaluator configuration: {str(e)}" + ) diff --git a/agenta-backend/agenta_backend/services/db_manager.py b/agenta-backend/agenta_backend/services/db_manager.py index efa1e34f4b..221b6bb57c 100644 --- a/agenta-backend/agenta_backend/services/db_manager.py +++ b/agenta-backend/agenta_backend/services/db_manager.py @@ -1804,7 +1804,9 @@ async def delete_evaluator_config(evaluator_config_id: str) -> bool: assert evaluator_config_id is not None, "Evaluator Config ID cannot be None" try: - 
delete_result = remove_document_using_driver(str(evaluator_config_id), "evaluators_configs") + delete_result = remove_document_using_driver( + str(evaluator_config_id), "evaluators_configs" + ) return delete_result is not None except Exception as e: raise e diff --git a/agenta-backend/agenta_backend/services/evaluator_manager.py b/agenta-backend/agenta_backend/services/evaluator_manager.py index ee99f1a45f..62538f78db 100644 --- a/agenta-backend/agenta_backend/services/evaluator_manager.py +++ b/agenta-backend/agenta_backend/services/evaluator_manager.py @@ -55,7 +55,7 @@ async def create_evaluator_config( user=app.user, name=name, evaluator_key=evaluator_key, - settings_values=settings_values + settings_values=settings_values, ) From 581b0ed80dde0612853f32b862bbf92108f90a9b Mon Sep 17 00:00:00 2001 From: Akrem Abayed Date: Sun, 24 Dec 2023 10:46:21 +0100 Subject: [PATCH 074/414] add result as return type in all evaluators methods --- .../services/evaluators_service.py | 22 ++++++++++++------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/agenta-backend/agenta_backend/services/evaluators_service.py b/agenta-backend/agenta_backend/services/evaluators_service.py index 518619a30e..ff305f331e 100644 --- a/agenta-backend/agenta_backend/services/evaluators_service.py +++ b/agenta-backend/agenta_backend/services/evaluators_service.py @@ -2,15 +2,16 @@ from langchain.chains import LLMChain from langchain.llms import OpenAI from langchain.prompts import PromptTemplate +from agenta_backend.services.db_manager import Result -def auto_exact_match(variant_output, correct_answer, settings_values): - if variant_output == correct_answer: - return 1 - else: - return 0 +def auto_exact_match(variant_output, correct_answer, settings_values) -> Result: + exact_match = True if variant_output == correct_answer else False + result = Result(type="bool", value=exact_match) + return result -def auto_similarity_match(variant_output, correct_answer, settings_values): + +def auto_similarity_match(variant_output, correct_answer, settings_values) -> Result: set1 = set(variant_output.split()) set2 = set(correct_answer.split()) intersect = set1.intersection(set2) @@ -19,7 +20,8 @@ def auto_similarity_match(variant_output, correct_answer, settings_values): similarity = len(intersect) / len(union) is_similar = True if similarity > settings_values["similarity_threshold"] else False - return is_similar + result = Result(type="bool", value=is_similar) + return result def auto_regex_test(test_string, regex, should_match): @@ -40,7 +42,11 @@ def evaluate( evaluation_function = globals()[evaluator_name] return evaluation_function( - correct_answer, variant_output, settings_values, *additional_args, **additional_kwargs + correct_answer, + variant_output, + settings_values, + *additional_args, + **additional_kwargs, ) except KeyError: raise ValueError(f"Evaluation method '{evaluator_name}' not found.") From 25a2fe64eb4ce6c2549b3c6c496f27e25132c039 Mon Sep 17 00:00:00 2001 From: Akrem Abayed Date: Sun, 24 Dec 2023 10:46:49 +0100 Subject: [PATCH 075/414] add inputs --- .../agenta_backend/tasks/evaluations.py | 24 +++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/agenta-backend/agenta_backend/tasks/evaluations.py b/agenta-backend/agenta_backend/tasks/evaluations.py index 91db38978e..7489e8df4b 100644 --- a/agenta-backend/agenta_backend/tasks/evaluations.py +++ b/agenta-backend/agenta_backend/tasks/evaluations.py @@ -17,6 +17,7 @@ ) from agenta_backend.models.db_models import ( AppDB, + 
EvaluationScenarioInputDB, EvaluationScenarioOutputDB, EvaluationScenarioResult, AggregatedResult, @@ -50,6 +51,8 @@ def evaluate( uri = deployment.uri.replace("http://localhost", "http://host.docker.internal") for data_point in testset.csvdata: + + # 1. We call the llm app variant_output = llm_apps_service.get_llm_app_output(uri, data_point) evaluators_results: [EvaluationScenarioResult] = [] @@ -57,30 +60,43 @@ def evaluate( evaluator_config = loop.run_until_complete( fetch_evaluator_config(evaluator_config_id) ) - + # 2. We evaluate result = evaluators_service.evaluate( evaluator_config.evaluator_key, data_point["correct_answer"], variant_output, - evaluator_config.settings_values + evaluator_config.settings_values, ) result_object = EvaluationScenarioResult( evaluator_config=evaluator_config.id, - result=Result(type="number", value=result), + result=result, ) evaluators_results.append(result_object) evaluators_aggregated_data[evaluator_config.evaluator_key].append( result ) + # 3. We add inputs + raw_inputs = app_variant_db.parameters.get('inputs', []) if app_variant_db.parameters else [] + inputs = [] + if raw_inputs: + inputs = [ + EvaluationScenarioInputDB( + name=input_item['name'], + type='text', + value=data_point[input_item['name']] + ) for input_item in raw_inputs + ] + + # 4. We create a new evaluation scenario evaluation_scenario = loop.run_until_complete( create_new_evaluation_scenario( user=app.user, organization=app.organization, evaluation=new_evaluation_db, evaluators_configs=new_evaluation_db.evaluators_configs, - inputs=[], + inputs=inputs, is_pinned=False, note="", correct_answer=data_point["correct_answer"], From 428bdf4ad589229d52bce098904e2e9e8748242c Mon Sep 17 00:00:00 2001 From: Abram Date: Fri, 22 Dec 2023 20:44:52 +0100 Subject: [PATCH 076/414] Refactor - type evaluators_service functions --- .../services/evaluators_service.py | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/agenta-backend/agenta_backend/services/evaluators_service.py b/agenta-backend/agenta_backend/services/evaluators_service.py index ff305f331e..12af067459 100644 --- a/agenta-backend/agenta_backend/services/evaluators_service.py +++ b/agenta-backend/agenta_backend/services/evaluators_service.py @@ -1,17 +1,18 @@ import re +from typing import Any from langchain.chains import LLMChain from langchain.llms import OpenAI from langchain.prompts import PromptTemplate from agenta_backend.services.db_manager import Result -def auto_exact_match(variant_output, correct_answer, settings_values) -> Result: +def auto_exact_match(variant_output: str, correct_answer: str, settings_values: dict) -> Result: exact_match = True if variant_output == correct_answer else False result = Result(type="bool", value=exact_match) return result -def auto_similarity_match(variant_output, correct_answer, settings_values) -> Result: +def auto_similarity_match(variant_output: str, correct_answer: str, settings_values: dict) -> Result: set1 = set(variant_output.split()) set2 = set(correct_answer.split()) intersect = set1.intersection(set2) @@ -24,23 +25,22 @@ def auto_similarity_match(variant_output, correct_answer, settings_values) -> Re return result -def auto_regex_test(test_string, regex, should_match): +def auto_regex_test(test_string: str, regex: Any, should_match: bool): re_pattern = re.compile(regex, re.IGNORECASE) result = bool(re_pattern.search(test_string)) return result == should_match def evaluate( - evaluator_name, - correct_answer, - variant_output, - settings_values, - 
*additional_args, - **additional_kwargs, + evaluator_name: str, + correct_answer: str, + variant_output :str, + settings_values: dict, + *additional_args: dict, + **additional_kwargs: dict, ): try: evaluation_function = globals()[evaluator_name] - return evaluation_function( correct_answer, variant_output, From 4a59d8afce9f73021a5c6006c4e3d9d2d2a88df2 Mon Sep 17 00:00:00 2001 From: Abram Date: Sun, 24 Dec 2023 09:19:40 +0100 Subject: [PATCH 077/414] Update - added bool to value union --- agenta-backend/agenta_backend/models/db_models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/agenta-backend/agenta_backend/models/db_models.py b/agenta-backend/agenta_backend/models/db_models.py index b061fb0744..b3b1870d1f 100644 --- a/agenta-backend/agenta_backend/models/db_models.py +++ b/agenta-backend/agenta_backend/models/db_models.py @@ -224,7 +224,7 @@ class Config: class Result(EmbeddedModel): type: str - value: Union[str, float, int] + value: Union[str, float, int, bool] class EvaluationScenarioResult(EmbeddedModel): From 17dc852e5a06bc66b349e81c018436fa5eb36261 Mon Sep 17 00:00:00 2001 From: Abram Date: Sun, 24 Dec 2023 09:20:31 +0100 Subject: [PATCH 078/414] Update - modified evaluate task to accomodate auto_regex_test --- .../agenta_backend/tasks/evaluations.py | 20 +++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/agenta-backend/agenta_backend/tasks/evaluations.py b/agenta-backend/agenta_backend/tasks/evaluations.py index 7489e8df4b..2dd52fb7ab 100644 --- a/agenta-backend/agenta_backend/tasks/evaluations.py +++ b/agenta-backend/agenta_backend/tasks/evaluations.py @@ -60,13 +60,21 @@ def evaluate( evaluator_config = loop.run_until_complete( fetch_evaluator_config(evaluator_config_id) ) + # 2. We evaluate - result = evaluators_service.evaluate( - evaluator_config.evaluator_key, - data_point["correct_answer"], - variant_output, - evaluator_config.settings_values, - ) + if evaluator_config.evaluator_key == "auto_regex_test": + result = evaluators_service.auto_regex_test( + variant_output, + evaluator_config.settings_values.get("regex_pattern"), + evaluator_config.settings_values.get("regex_should_match"), + ) + else: + result = evaluators_service.evaluate( + evaluator_config.evaluator_key, + data_point["correct_answer"], + variant_output, + evaluator_config.settings_values + ) result_object = EvaluationScenarioResult( evaluator_config=evaluator_config.id, From bca4b1509fe2938d089933decba9261f00c679a7 Mon Sep 17 00:00:00 2001 From: Abram Date: Sun, 24 Dec 2023 09:21:00 +0100 Subject: [PATCH 079/414] Update - modified module to hint-type args --- agenta-backend/agenta_backend/services/evaluators_service.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/agenta-backend/agenta_backend/services/evaluators_service.py b/agenta-backend/agenta_backend/services/evaluators_service.py index 12af067459..b56f1b4cb5 100644 --- a/agenta-backend/agenta_backend/services/evaluators_service.py +++ b/agenta-backend/agenta_backend/services/evaluators_service.py @@ -36,7 +36,7 @@ def evaluate( correct_answer: str, variant_output :str, settings_values: dict, - *additional_args: dict, + *additional_args: tuple, **additional_kwargs: dict, ): try: From 1b3e299554233fbfe6b067e7be17bc00b31bd288 Mon Sep 17 00:00:00 2001 From: Akrem Abayed Date: Sun, 24 Dec 2023 11:02:52 +0100 Subject: [PATCH 080/414] commented out the aggregation logic temp --- .../agenta_backend/tasks/evaluations.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) 
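Note (not part of the commit message): the block disabled below is the step that folds each evaluator's per-scenario results into one aggregated value per evaluator config. A minimal sketch of that aggregation, assuming the collected per-scenario values are numeric; the names mirror the task code earlier in this series and are illustrative, not the final implementation:

    from typing import Dict, List

    def average_per_evaluator(data: Dict[str, List[float]]) -> Dict[str, float]:
        # One mean value per evaluator key; an empty list falls back to 0.
        return {key: (sum(vals) / len(vals) if vals else 0) for key, vals in data.items()}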
diff --git a/agenta-backend/agenta_backend/tasks/evaluations.py b/agenta-backend/agenta_backend/tasks/evaluations.py index 7489e8df4b..d4dc9e911d 100644 --- a/agenta-backend/agenta_backend/tasks/evaluations.py +++ b/agenta-backend/agenta_backend/tasks/evaluations.py @@ -107,14 +107,14 @@ def evaluate( ) ) - aggregated_results = loop.run_until_complete( - aggregate_evaluator_results(app, evaluators_aggregated_data) - ) - updated_evaluation = loop.run_until_complete( - update_evaluation_with_aggregated_results( - new_evaluation_db.id, aggregated_results - ) - ) + # aggregated_results = loop.run_until_complete( + # aggregate_evaluator_results(app, evaluators_aggregated_data) + # ) + # updated_evaluation = loop.run_until_complete( + # update_evaluation_with_aggregated_results( + # new_evaluation_db.id, aggregated_results + # ) + # ) async def aggregate_evaluator_results( From 04a67a0728104f223d86a467cc22f7d0a11e4283 Mon Sep 17 00:00:00 2001 From: Abram Date: Sun, 24 Dec 2023 11:06:42 +0100 Subject: [PATCH 081/414] Update - modified auto_regex_test evaluator --- .../agenta_backend/services/evaluators_service.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/agenta-backend/agenta_backend/services/evaluators_service.py b/agenta-backend/agenta_backend/services/evaluators_service.py index b56f1b4cb5..3355d39253 100644 --- a/agenta-backend/agenta_backend/services/evaluators_service.py +++ b/agenta-backend/agenta_backend/services/evaluators_service.py @@ -25,10 +25,10 @@ def auto_similarity_match(variant_output: str, correct_answer: str, settings_val return result -def auto_regex_test(test_string: str, regex: Any, should_match: bool): +def auto_regex_test(test_string: str, regex: Any, should_match: bool) -> Result: re_pattern = re.compile(regex, re.IGNORECASE) - result = bool(re_pattern.search(test_string)) - return result == should_match + result = bool(re_pattern.search(test_string)) == should_match + return Result(type="bool", value=result) def evaluate( From ecd4e86c31bce6652fdbd37c1dc0c6ec354dcae4 Mon Sep 17 00:00:00 2001 From: Abram Date: Sun, 24 Dec 2023 11:07:19 +0100 Subject: [PATCH 082/414] Update - modified evaluations --- agenta-backend/agenta_backend/tasks/evaluations.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/agenta-backend/agenta_backend/tasks/evaluations.py b/agenta-backend/agenta_backend/tasks/evaluations.py index 2dd52fb7ab..e04937b27d 100644 --- a/agenta-backend/agenta_backend/tasks/evaluations.py +++ b/agenta-backend/agenta_backend/tasks/evaluations.py @@ -124,7 +124,7 @@ def evaluate( ) ) - +# TODO: find a good solution for aggregating evaluator results async def aggregate_evaluator_results( app: AppDB, evaluators_aggregated_data: dict ) -> List[AggregatedResult]: From f183e35dba673e7ddf58ff84f1877f316bcc651c Mon Sep 17 00:00:00 2001 From: Abram Date: Sun, 24 Dec 2023 11:08:51 +0100 Subject: [PATCH 083/414] :art: Format - ran black --- .../routers/evaluation_router.py | 2 +- .../services/evaluators_service.py | 10 +++++++--- .../agenta_backend/tasks/evaluations.py | 19 ++++++++++++------- 3 files changed, 20 insertions(+), 11 deletions(-) diff --git a/agenta-backend/agenta_backend/routers/evaluation_router.py b/agenta-backend/agenta_backend/routers/evaluation_router.py index 79a758eab1..39a21f9926 100644 --- a/agenta-backend/agenta_backend/routers/evaluation_router.py +++ b/agenta-backend/agenta_backend/routers/evaluation_router.py @@ -89,7 +89,7 @@ async def create_evaluation( evaluation = await 
evaluation_service.create_new_evaluation( app_data=app_data, new_evaluation_data=new_evaluation_data, - evaluators_configs=payload.evaluators_configs + evaluators_configs=payload.evaluators_configs, ) # Start celery task diff --git a/agenta-backend/agenta_backend/services/evaluators_service.py b/agenta-backend/agenta_backend/services/evaluators_service.py index 3355d39253..06335b55c3 100644 --- a/agenta-backend/agenta_backend/services/evaluators_service.py +++ b/agenta-backend/agenta_backend/services/evaluators_service.py @@ -6,13 +6,17 @@ from agenta_backend.services.db_manager import Result -def auto_exact_match(variant_output: str, correct_answer: str, settings_values: dict) -> Result: +def auto_exact_match( + variant_output: str, correct_answer: str, settings_values: dict +) -> Result: exact_match = True if variant_output == correct_answer else False result = Result(type="bool", value=exact_match) return result -def auto_similarity_match(variant_output: str, correct_answer: str, settings_values: dict) -> Result: +def auto_similarity_match( + variant_output: str, correct_answer: str, settings_values: dict +) -> Result: set1 = set(variant_output.split()) set2 = set(correct_answer.split()) intersect = set1.intersection(set2) @@ -34,7 +38,7 @@ def auto_regex_test(test_string: str, regex: Any, should_match: bool) -> Result: def evaluate( evaluator_name: str, correct_answer: str, - variant_output :str, + variant_output: str, settings_values: dict, *additional_args: tuple, **additional_kwargs: dict, diff --git a/agenta-backend/agenta_backend/tasks/evaluations.py b/agenta-backend/agenta_backend/tasks/evaluations.py index e04937b27d..04a491480d 100644 --- a/agenta-backend/agenta_backend/tasks/evaluations.py +++ b/agenta-backend/agenta_backend/tasks/evaluations.py @@ -51,7 +51,6 @@ def evaluate( uri = deployment.uri.replace("http://localhost", "http://host.docker.internal") for data_point in testset.csvdata: - # 1. We call the llm app variant_output = llm_apps_service.get_llm_app_output(uri, data_point) @@ -73,7 +72,7 @@ def evaluate( evaluator_config.evaluator_key, data_point["correct_answer"], variant_output, - evaluator_config.settings_values + evaluator_config.settings_values, ) result_object = EvaluationScenarioResult( @@ -86,15 +85,20 @@ def evaluate( ) # 3. We add inputs - raw_inputs = app_variant_db.parameters.get('inputs', []) if app_variant_db.parameters else [] + raw_inputs = ( + app_variant_db.parameters.get("inputs", []) + if app_variant_db.parameters + else [] + ) inputs = [] if raw_inputs: inputs = [ EvaluationScenarioInputDB( - name=input_item['name'], - type='text', - value=data_point[input_item['name']] - ) for input_item in raw_inputs + name=input_item["name"], + type="text", + value=data_point[input_item["name"]], + ) + for input_item in raw_inputs ] # 4. 
We create a new evaluation scenario @@ -124,6 +128,7 @@ def evaluate( ) ) + # TODO: find a good solution for aggregating evaluator results async def aggregate_evaluator_results( app: AppDB, evaluators_aggregated_data: dict From 750b3e330979e544732f3ef28997ef56c51d0d76 Mon Sep 17 00:00:00 2001 From: Abram Date: Sun, 24 Dec 2023 11:30:14 +0100 Subject: [PATCH 084/414] Update - comment out code in aggregate evaluator results function --- .../agenta_backend/tasks/evaluations.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/agenta-backend/agenta_backend/tasks/evaluations.py b/agenta-backend/agenta_backend/tasks/evaluations.py index 04a491480d..18ac785f2f 100644 --- a/agenta-backend/agenta_backend/tasks/evaluations.py +++ b/agenta-backend/agenta_backend/tasks/evaluations.py @@ -129,17 +129,17 @@ def evaluate( ) -# TODO: find a good solution for aggregating evaluator results async def aggregate_evaluator_results( app: AppDB, evaluators_aggregated_data: dict ) -> List[AggregatedResult]: aggregated_results = [] - for evaluator_key, values in evaluators_aggregated_data.items(): - average_value = sum(values) / len(values) if values else 0 - evaluator_config = await fetch_evaluator_config_by_appId(app.id, evaluator_key) - aggregated_result = AggregatedResult( - evaluator_config=evaluator_config.id, - result=Result(type="number", value=average_value), - ) - aggregated_results.append(aggregated_result) + # TODO: find a good solution for aggregating evaluator results + # for evaluator_key, values in evaluators_aggregated_data.items(): + # average_value = sum(values) / len(values) if values else 0 + # evaluator_config = await fetch_evaluator_config_by_appId(app.id, evaluator_key) + # aggregated_result = AggregatedResult( + # evaluator_config=evaluator_config.id, + # result=Result(type="number", value=average_value), + # ) + # aggregated_results.append(aggregated_result) return aggregated_results From a9d76ce29a4a1f0fcddbd4d962f8b9e6b7702647 Mon Sep 17 00:00:00 2001 From: Abram Date: Sun, 24 Dec 2023 11:49:32 +0100 Subject: [PATCH 085/414] Update - refactor auto_regex_test service --- .../services/evaluators_service.py | 8 +++++--- .../agenta_backend/tasks/evaluations.py | 19 ++++++------------- 2 files changed, 11 insertions(+), 16 deletions(-) diff --git a/agenta-backend/agenta_backend/services/evaluators_service.py b/agenta-backend/agenta_backend/services/evaluators_service.py index 06335b55c3..cb0ad54da8 100644 --- a/agenta-backend/agenta_backend/services/evaluators_service.py +++ b/agenta-backend/agenta_backend/services/evaluators_service.py @@ -29,9 +29,11 @@ def auto_similarity_match( return result -def auto_regex_test(test_string: str, regex: Any, should_match: bool) -> Result: - re_pattern = re.compile(regex, re.IGNORECASE) - result = bool(re_pattern.search(test_string)) == should_match +def auto_regex_test(_, test_string: str, settings_values: dict) -> Result: + re_pattern = re.compile(settings_values["regex_pattern"], re.IGNORECASE) + result = ( + bool(re_pattern.search(test_string)) == settings_values["regex_should_match"] + ) return Result(type="bool", value=result) diff --git a/agenta-backend/agenta_backend/tasks/evaluations.py b/agenta-backend/agenta_backend/tasks/evaluations.py index 18ac785f2f..1c43be0200 100644 --- a/agenta-backend/agenta_backend/tasks/evaluations.py +++ b/agenta-backend/agenta_backend/tasks/evaluations.py @@ -61,19 +61,12 @@ def evaluate( ) # 2. 
We evaluate - if evaluator_config.evaluator_key == "auto_regex_test": - result = evaluators_service.auto_regex_test( - variant_output, - evaluator_config.settings_values.get("regex_pattern"), - evaluator_config.settings_values.get("regex_should_match"), - ) - else: - result = evaluators_service.evaluate( - evaluator_config.evaluator_key, - data_point["correct_answer"], - variant_output, - evaluator_config.settings_values, - ) + result = evaluators_service.evaluate( + evaluator_config.evaluator_key, + data_point["correct_answer"], + variant_output, + evaluator_config.settings_values, + ) result_object = EvaluationScenarioResult( evaluator_config=evaluator_config.id, From 9642fd29f802f6677d8796337b2f484f6d9b992b Mon Sep 17 00:00:00 2001 From: Akrem Abayed Date: Sun, 24 Dec 2023 12:19:04 +0100 Subject: [PATCH 086/414] move ai critique to evaluators --- .../services/evaluation_service.py | 64 ------------------- .../services/evaluators_service.py | 64 +++++++++++++++++++ 2 files changed, 64 insertions(+), 64 deletions(-) diff --git a/agenta-backend/agenta_backend/services/evaluation_service.py b/agenta-backend/agenta_backend/services/evaluation_service.py index 5d8c2b225a..8e048d0371 100644 --- a/agenta-backend/agenta_backend/services/evaluation_service.py +++ b/agenta-backend/agenta_backend/services/evaluation_service.py @@ -413,70 +413,6 @@ async def get_evaluation_scenario_score( } -def evaluate_with_ai_critique( - llm_app_prompt_template: str, - llm_app_inputs: list, - correct_answer: str, - app_variant_output: str, - evaluation_prompt_template: str, - open_ai_key: str, - temperature: float = 0.9, -) -> str: - """Evaluate a response using an AI critique based on provided - - An evaluation prompt, - - An LLM App prompt, - - An LLM App output, - - a correct answer. 
- - Args: - llm_app_prompt_template (str): the prompt template of the llm app variant - llm_app_inputs (list): parameters - correct_answer (str): correct answer - app_variant_output (str): the output of an ll app variant with given parameters - evaluation_prompt_template (str): evaluation prompt set by an agenta user in the ai evaluation view - - Returns: - str: returns an evaluation - """ - llm = OpenAI(openai_api_key=open_ai_key, temperature=temperature) - - input_variables = [] - - # List of default variables - default_vars = [ - "app_variant_output", - "llm_app_prompt_template", - "correct_answer", - ] - - # Check default variables - for var in default_vars: - if "{%s}" % var in evaluation_prompt_template: - input_variables.append(var) - - # Iterate over llm_app_inputs and check if the variable name exists in the evaluation_prompt_template - for input_item in llm_app_inputs: - if "{%s}" % input_item["input_name"] in evaluation_prompt_template: - input_variables.append(input_item["input_name"]) - - chain_run_args = { - "llm_app_prompt_template": llm_app_prompt_template, - "correct_answer": correct_answer, - "app_variant_output": app_variant_output, - } - - for input_item in llm_app_inputs: - chain_run_args[input_item["input_name"]] = input_item["input_value"] - - prompt = PromptTemplate( - input_variables=input_variables, template=evaluation_prompt_template - ) - chain = LLMChain(llm=llm, prompt=prompt) - - output = chain.run(**chain_run_args) - return output.strip() - - def _extend_with_evaluation(evaluation_type: EvaluationType): evaluation = {} if ( diff --git a/agenta-backend/agenta_backend/services/evaluators_service.py b/agenta-backend/agenta_backend/services/evaluators_service.py index cb0ad54da8..008c87c587 100644 --- a/agenta-backend/agenta_backend/services/evaluators_service.py +++ b/agenta-backend/agenta_backend/services/evaluators_service.py @@ -37,6 +37,70 @@ def auto_regex_test(_, test_string: str, settings_values: dict) -> Result: return Result(type="bool", value=result) +def auto_ai_critique( + llm_app_prompt_template: str, + llm_app_inputs: list, + correct_answer: str, + app_variant_output: str, + evaluation_prompt_template: str, + open_ai_key: str, + temperature: float = 0.9, +) -> str: + """Evaluate a response using an AI critique based on provided + - An evaluation prompt, + - An LLM App prompt, + - An LLM App output, + - a correct answer. 
+ + Args: + llm_app_prompt_template (str): the prompt template of the llm app variant + llm_app_inputs (list): parameters + correct_answer (str): correct answer + app_variant_output (str): the output of an ll app variant with given parameters + evaluation_prompt_template (str): evaluation prompt set by an agenta user in the ai evaluation view + + Returns: + str: returns an evaluation + """ + llm = OpenAI(openai_api_key=open_ai_key, temperature=temperature) + + input_variables = [] + + # List of default variables + default_vars = [ + "app_variant_output", + "llm_app_prompt_template", + "correct_answer", + ] + + # Check default variables + for var in default_vars: + if "{%s}" % var in evaluation_prompt_template: + input_variables.append(var) + + # Iterate over llm_app_inputs and check if the variable name exists in the evaluation_prompt_template + for input_item in llm_app_inputs: + if "{%s}" % input_item["input_name"] in evaluation_prompt_template: + input_variables.append(input_item["input_name"]) + + chain_run_args = { + "llm_app_prompt_template": llm_app_prompt_template, + "correct_answer": correct_answer, + "app_variant_output": app_variant_output, + } + + for input_item in llm_app_inputs: + chain_run_args[input_item["input_name"]] = input_item["input_value"] + + prompt = PromptTemplate( + input_variables=input_variables, template=evaluation_prompt_template + ) + chain = LLMChain(llm=llm, prompt=prompt) + + output = chain.run(**chain_run_args) + return output.strip() + + def evaluate( evaluator_name: str, correct_answer: str, From 088d215c3e291886b614bf353610cad8281d2094 Mon Sep 17 00:00:00 2001 From: Akrem Abayed Date: Sun, 24 Dec 2023 13:23:56 +0100 Subject: [PATCH 087/414] remove import --- agenta-backend/agenta_backend/routers/evaluation_router.py | 1 - 1 file changed, 1 deletion(-) diff --git a/agenta-backend/agenta_backend/routers/evaluation_router.py b/agenta-backend/agenta_backend/routers/evaluation_router.py index 39a21f9926..4bf3f64264 100644 --- a/agenta-backend/agenta_backend/routers/evaluation_router.py +++ b/agenta-backend/agenta_backend/routers/evaluation_router.py @@ -26,7 +26,6 @@ ) from agenta_backend.services.evaluation_service import ( UpdateEvaluationScenarioError, - evaluate_with_ai_critique, fetch_custom_evaluation_names, fetch_custom_evaluations, fetch_custom_evaluation_detail, From 24783a212b52cd144e91e752889192b45fb19e21 Mon Sep 17 00:00:00 2001 From: Akrem Abayed Date: Sun, 24 Dec 2023 14:54:40 +0100 Subject: [PATCH 088/414] move inputs to the first of the evaluation process --- .../agenta_backend/tasks/evaluations.py | 38 +++++++++---------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/agenta-backend/agenta_backend/tasks/evaluations.py b/agenta-backend/agenta_backend/tasks/evaluations.py index f474771e9d..12bad85843 100644 --- a/agenta-backend/agenta_backend/tasks/evaluations.py +++ b/agenta-backend/agenta_backend/tasks/evaluations.py @@ -51,16 +51,33 @@ def evaluate( uri = deployment.uri.replace("http://localhost", "http://host.docker.internal") for data_point in testset.csvdata: - # 1. We call the llm app + # 1. We prepare the inputs + raw_inputs = ( + app_variant_db.parameters.get("inputs", []) + if app_variant_db.parameters + else [] + ) + inputs = [] + if raw_inputs: + inputs = [ + EvaluationScenarioInputDB( + name=input_item["name"], + type="text", + value=data_point[input_item["name"]], + ) + for input_item in raw_inputs + ] + + # 2. 
We get the output from the llm app variant_output = llm_apps_service.get_llm_app_output(uri, data_point) + # 3. We evaluate evaluators_results: [EvaluationScenarioResult] = [] for evaluator_config_id in evaluation.evaluators_configs: evaluator_config = loop.run_until_complete( fetch_evaluator_config(evaluator_config_id) ) - # 2. We evaluate result = evaluators_service.evaluate( evaluator_config.evaluator_key, data_point["correct_answer"], @@ -77,23 +94,6 @@ def evaluate( result ) - # 3. We add inputs - raw_inputs = ( - app_variant_db.parameters.get("inputs", []) - if app_variant_db.parameters - else [] - ) - inputs = [] - if raw_inputs: - inputs = [ - EvaluationScenarioInputDB( - name=input_item["name"], - type="text", - value=data_point[input_item["name"]], - ) - for input_item in raw_inputs - ] - # 4. We create a new evaluation scenario evaluation_scenario = loop.run_until_complete( create_new_evaluation_scenario( From fa2fdf83694d40ce32d468c665eac6d9c8a9c295 Mon Sep 17 00:00:00 2001 From: Abram Date: Sun, 24 Dec 2023 15:44:59 +0100 Subject: [PATCH 089/414] Update - modified evaluator_configs get/post api routers --- .../models/api/evaluation_model.py | 1 + .../routers/evaluators_router.py | 24 ++++++++++++++----- 2 files changed, 19 insertions(+), 6 deletions(-) diff --git a/agenta-backend/agenta_backend/models/api/evaluation_model.py b/agenta-backend/agenta_backend/models/api/evaluation_model.py index 39bc8bbb57..6b54501f0b 100644 --- a/agenta-backend/agenta_backend/models/api/evaluation_model.py +++ b/agenta-backend/agenta_backend/models/api/evaluation_model.py @@ -160,6 +160,7 @@ class EvaluationSettingsTemplate(BaseModel): class EvaluatorConfig(BaseModel): + id: str evaluator_key: str settings_values: Optional[Dict[str, Any]] diff --git a/agenta-backend/agenta_backend/routers/evaluators_router.py b/agenta-backend/agenta_backend/routers/evaluators_router.py index cc3b933b0a..e8537bf70c 100644 --- a/agenta-backend/agenta_backend/routers/evaluators_router.py +++ b/agenta-backend/agenta_backend/routers/evaluators_router.py @@ -56,9 +56,7 @@ async def get_evaluators(): @router.get("/configs/", response_model=List[EvaluatorConfig]) -async def get_evaluator_configs( - app_id: str = Query(), response_model=List[EvaluatorConfig] -): +async def get_evaluator_configs(app_id: str = Query()): """Endpoint to fetch evaluator configurations for a specific app. Args: @@ -67,7 +65,16 @@ async def get_evaluator_configs( Returns: List[EvaluatorConfigDB]: A list of evaluator configuration objects. """ - return await evaluator_manager.get_evaluators_configs(app_id) + + configs_db = await evaluator_manager.get_evaluators_configs(app_id) + return [ + EvaluatorConfig( + id=str(config_db.id), + evaluator_key=config_db.evaluator_key, + settings_values=config_db.settings_values, + ) + for config_db in configs_db + ] @router.post("/configs/", response_model=EvaluatorConfig) @@ -80,15 +87,20 @@ async def create_new_evaluator_config( app_id (str): The ID of the app. Returns: - List[EvaluatorConfigDB]: A list of evaluator configuration objects. + EvaluatorConfigDB: Evaluator configuration api model. 
""" - return await evaluator_manager.create_evaluator_config( + config_db = await evaluator_manager.create_evaluator_config( app_id=payload.app_id, name=payload.name, evaluator_key=payload.evaluator_key, settings_values=payload.settings_values, ) + return EvaluatorConfig( + id=str(config_db.id), + evaluator_key=config_db.evaluator_key, + settings_values=config_db.settings_values, + ) @router.delete("/configs/{evaluator_id}/", response_model=bool) From 0c680a5267a719869c817413387a0dd848a0a62c Mon Sep 17 00:00:00 2001 From: Abram Date: Sun, 24 Dec 2023 15:45:31 +0100 Subject: [PATCH 090/414] Update - modified wehook example fake api router --- agenta-backend/agenta_backend/routers/evaluation_router.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/agenta-backend/agenta_backend/routers/evaluation_router.py b/agenta-backend/agenta_backend/routers/evaluation_router.py index 39a21f9926..d1be65d36e 100644 --- a/agenta-backend/agenta_backend/routers/evaluation_router.py +++ b/agenta-backend/agenta_backend/routers/evaluation_router.py @@ -630,4 +630,6 @@ async def webhook_example_fake(): """ # return a random score b/w 0 and 1 - return {"score": secrets.SystemRandom.random()} + random_generator = secrets.SystemRandom() + random_number = random_generator.random() + return {"score": random_number} From 398527d775a199f126762aade7bd556deb7bc99a Mon Sep 17 00:00:00 2001 From: Abram Date: Sun, 24 Dec 2023 16:38:16 +0100 Subject: [PATCH 091/414] Feat - created auto_webhook_test evaluator service --- .../services/evaluators_service.py | 45 +++++++++++++++---- 1 file changed, 36 insertions(+), 9 deletions(-) diff --git a/agenta-backend/agenta_backend/services/evaluators_service.py b/agenta-backend/agenta_backend/services/evaluators_service.py index cb0ad54da8..ff25a5ef33 100644 --- a/agenta-backend/agenta_backend/services/evaluators_service.py +++ b/agenta-backend/agenta_backend/services/evaluators_service.py @@ -1,5 +1,6 @@ import re -from typing import Any +import httpx +from typing import Any, Dict, Tuple from langchain.chains import LLMChain from langchain.llms import OpenAI from langchain.prompts import PromptTemplate @@ -7,7 +8,7 @@ def auto_exact_match( - variant_output: str, correct_answer: str, settings_values: dict + variant_output: str, correct_answer: str, settings_values: Dict[str, Any] ) -> Result: exact_match = True if variant_output == correct_answer else False result = Result(type="bool", value=exact_match) @@ -15,7 +16,7 @@ def auto_exact_match( def auto_similarity_match( - variant_output: str, correct_answer: str, settings_values: dict + variant_output: str, correct_answer: str, settings_values: Dict[str, Any] ) -> Result: set1 = set(variant_output.split()) set2 = set(correct_answer.split()) @@ -29,22 +30,48 @@ def auto_similarity_match( return result -def auto_regex_test(_, test_string: str, settings_values: dict) -> Result: +def auto_regex_test( + variant_output: str, correct_answer: str, settings_values: Dict[str, Any] +) -> Result: re_pattern = re.compile(settings_values["regex_pattern"], re.IGNORECASE) result = ( - bool(re_pattern.search(test_string)) == settings_values["regex_should_match"] + bool(re_pattern.search(variant_output)) == settings_values["regex_should_match"] ) return Result(type="bool", value=result) +def auto_webhook_test( + variant_output: str, correct_answer: str, settings_values: Dict[str, Any] +) -> Result: + try: + with httpx.Client() as client: + response = client.post( + url=settings_values["webhook_url"], 
json=settings_values["webhook_body"] + ) + response.raise_for_status() + response_data = response.json() + score = response_data.get("score", None) + if not score: + raise httpx.HTTPError("Webhook did not return a score") + if score < 0 or score > 1: + raise httpx.HTTPError( + "Webhook returned an invalid score. Score must be between 0 and 1" + ) + return Result(type="number", value=score) + except httpx.HTTPError as e: + print(f"An HTTP error occurred: {e}") + except Exception as e: + print(f"An error occurred: {e}") + + def evaluate( evaluator_name: str, correct_answer: str, variant_output: str, - settings_values: dict, - *additional_args: tuple, - **additional_kwargs: dict, -): + settings_values: Dict[str, Any], + *additional_args: Tuple[Any], + **additional_kwargs: Dict[str, Any], +) -> Result: try: evaluation_function = globals()[evaluator_name] return evaluation_function( From 15393c71bddc9b3ed0dadf2ba4b7b1de60582e4d Mon Sep 17 00:00:00 2001 From: Abram Date: Sun, 24 Dec 2023 16:39:10 +0100 Subject: [PATCH 092/414] Update - modified evaluate task --- agenta-backend/agenta_backend/tasks/evaluations.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/agenta-backend/agenta_backend/tasks/evaluations.py b/agenta-backend/agenta_backend/tasks/evaluations.py index 1c43be0200..3febe3b1c7 100644 --- a/agenta-backend/agenta_backend/tasks/evaluations.py +++ b/agenta-backend/agenta_backend/tasks/evaluations.py @@ -63,8 +63,8 @@ def evaluate( # 2. We evaluate result = evaluators_service.evaluate( evaluator_config.evaluator_key, - data_point["correct_answer"], variant_output, + data_point["correct_answer"], evaluator_config.settings_values, ) From 9b5a3dff499f82abfc938cb95192d057a91839e9 Mon Sep 17 00:00:00 2001 From: Akrem Abayed Date: Sun, 24 Dec 2023 21:07:56 +0100 Subject: [PATCH 093/414] add auto ai critique --- .../services/evaluators_service.py | 38 ++++++++++--------- 1 file changed, 20 insertions(+), 18 deletions(-) diff --git a/agenta-backend/agenta_backend/services/evaluators_service.py b/agenta-backend/agenta_backend/services/evaluators_service.py index 008c87c587..5a4a96a0eb 100644 --- a/agenta-backend/agenta_backend/services/evaluators_service.py +++ b/agenta-backend/agenta_backend/services/evaluators_service.py @@ -38,13 +38,7 @@ def auto_regex_test(_, test_string: str, settings_values: dict) -> Result: def auto_ai_critique( - llm_app_prompt_template: str, - llm_app_inputs: list, - correct_answer: str, - app_variant_output: str, - evaluation_prompt_template: str, - open_ai_key: str, - temperature: float = 0.9, + variant_output: str, correct_answer: str, settings_values: dict ) -> str: """Evaluate a response using an AI critique based on provided - An evaluation prompt, @@ -56,49 +50,57 @@ def auto_ai_critique( llm_app_prompt_template (str): the prompt template of the llm app variant llm_app_inputs (list): parameters correct_answer (str): correct answer - app_variant_output (str): the output of an ll app variant with given parameters + variant_output (str): the output of an ll app variant with given parameters evaluation_prompt_template (str): evaluation prompt set by an agenta user in the ai evaluation view Returns: str: returns an evaluation """ - llm = OpenAI(openai_api_key=open_ai_key, temperature=temperature) + llm = OpenAI( + openai_api_key=settings_values["open_ai_key"], + temperature=settings_values["temperature"], + ) input_variables = [] # List of default variables default_vars = [ - "app_variant_output", + "variant_output", "llm_app_prompt_template", 
"correct_answer", ] # Check default variables for var in default_vars: - if "{%s}" % var in evaluation_prompt_template: + if "{%s}" % var in settings_values["evaluation_prompt_template"]: input_variables.append(var) # Iterate over llm_app_inputs and check if the variable name exists in the evaluation_prompt_template - for input_item in llm_app_inputs: - if "{%s}" % input_item["input_name"] in evaluation_prompt_template: + for input_item in settings_values["llm_app_inputs"]: + if ( + "{%s}" % input_item["input_name"] + in settings_values["evaluation_prompt_template"] + ): input_variables.append(input_item["input_name"]) chain_run_args = { - "llm_app_prompt_template": llm_app_prompt_template, + "llm_app_prompt_template": settings_values["llm_app_prompt_template"], "correct_answer": correct_answer, - "app_variant_output": app_variant_output, + "variant_output": variant_output, } - for input_item in llm_app_inputs: + for input_item in settings_values["llm_app_inputs"]: chain_run_args[input_item["input_name"]] = input_item["input_value"] prompt = PromptTemplate( - input_variables=input_variables, template=evaluation_prompt_template + input_variables=input_variables, + template=settings_values["evaluation_prompt_template"], ) chain = LLMChain(llm=llm, prompt=prompt) output = chain.run(**chain_run_args) - return output.strip() + + return Result(type="text", value=output.strip()) def evaluate( From a3227b1d6ed56791a548a0726e19b71befca0d10 Mon Sep 17 00:00:00 2001 From: Abram Date: Mon, 25 Dec 2023 10:05:25 +0100 Subject: [PATCH 094/414] Update - remove jsonschema from supported modules in sandbox --- agenta-backend/agenta_backend/services/security/sandbox.py | 1 - 1 file changed, 1 deletion(-) diff --git a/agenta-backend/agenta_backend/services/security/sandbox.py b/agenta-backend/agenta_backend/services/security/sandbox.py index 63974b8c01..d58800e658 100644 --- a/agenta-backend/agenta_backend/services/security/sandbox.py +++ b/agenta-backend/agenta_backend/services/security/sandbox.py @@ -61,7 +61,6 @@ def execute_code_safely( "random", "datetime", "json", - "jsonschema", "requests", "numpy", ] From 1ee4cac9f18b321a36e88e98669d0495157e3ae1 Mon Sep 17 00:00:00 2001 From: Abram Date: Mon, 25 Dec 2023 10:06:02 +0100 Subject: [PATCH 095/414] Update - include additional kwargs for custom code evaluation --- agenta-backend/agenta_backend/tasks/evaluations.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/agenta-backend/agenta_backend/tasks/evaluations.py b/agenta-backend/agenta_backend/tasks/evaluations.py index 076e3e2cd0..b014f4dce6 100644 --- a/agenta-backend/agenta_backend/tasks/evaluations.py +++ b/agenta-backend/agenta_backend/tasks/evaluations.py @@ -24,6 +24,7 @@ Result, ) from agenta_backend.services import evaluators_service +from agenta_backend.services.helpers import format_inputs from agenta_backend.models.api.evaluation_model import NewEvaluation @@ -78,11 +79,22 @@ def evaluate( fetch_evaluator_config(evaluator_config_id) ) + # Format inputs for custom code + formatted_inputs = format_inputs(testset.csvdata) + additional_kwargs = ( + { + "app_params": app_variant_db.config.parameters, + "inputs": formatted_inputs, + } + if evaluator_config.evaluator_key == "custom_code_run" + else {} + ) result = evaluators_service.evaluate( evaluator_config.evaluator_key, variant_output, data_point["correct_answer"], evaluator_config.settings_values, + **additional_kwargs ) result_object = EvaluationScenarioResult( From 7303a1b071c925f180f673180b37cfc31f3c2d3a Mon Sep 17 00:00:00 2001 
From: Abram Date: Mon, 25 Dec 2023 10:06:40 +0100 Subject: [PATCH 096/414] Feat - created custom_code_run evaluator service --- .../services/evaluators_service.py | 20 +++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/agenta-backend/agenta_backend/services/evaluators_service.py b/agenta-backend/agenta_backend/services/evaluators_service.py index 71f9182c4b..5fc96035c3 100644 --- a/agenta-backend/agenta_backend/services/evaluators_service.py +++ b/agenta-backend/agenta_backend/services/evaluators_service.py @@ -2,6 +2,7 @@ import httpx from typing import Any, Dict, Tuple +from agenta_backend.services.security import sandbox from agenta_backend.services.db_manager import Result from langchain.llms import OpenAI @@ -66,6 +67,25 @@ def auto_webhook_test( print(f"An error occurred: {e}") +def custom_code_run( + variant_output: str, + correct_answer: str, + settings_values: Dict[str, Any], + **kwargs: Dict[str, Any], +) -> Result: + try: + result = sandbox.execute_code_safely( + app_params=kwargs["app_params"], + inputs=kwargs["inputs"], + output=variant_output, + correct_answer=correct_answer, + code=settings_values["python_code"], + ) + return Result(type="number", value=result) + except Exception as exc: + raise exc + + def auto_ai_critique( llm_app_prompt_template: str, llm_app_inputs: list, From 621abd2283a688e46c094f98c7eda599a2d990d8 Mon Sep 17 00:00:00 2001 From: Abram Date: Mon, 25 Dec 2023 10:07:28 +0100 Subject: [PATCH 097/414] Update - modified format_inputs helper function --- agenta-backend/agenta_backend/services/helpers.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/agenta-backend/agenta_backend/services/helpers.py b/agenta-backend/agenta_backend/services/helpers.py index 321be807c7..e848000e10 100644 --- a/agenta-backend/agenta_backend/services/helpers.py +++ b/agenta-backend/agenta_backend/services/helpers.py @@ -15,7 +15,8 @@ def format_inputs(list_of_dictionaries: List[Dict[str, Any]]) -> Dict: formatted_dictionary = {} for dictionary in list_of_dictionaries: - formatted_dictionary[dictionary["input_name"]] = dictionary["input_value"] + input_name = list(dictionary.keys())[0] + formatted_dictionary[input_name] = dictionary[input_name] return formatted_dictionary From 835a1782f467a03a8b075a6316978ced4bcb9959 Mon Sep 17 00:00:00 2001 From: Abram Date: Mon, 25 Dec 2023 10:51:49 +0100 Subject: [PATCH 098/414] Update - clean up evaluate task --- agenta-backend/agenta_backend/services/helpers.py | 3 +-- agenta-backend/agenta_backend/tasks/evaluations.py | 5 +---- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/agenta-backend/agenta_backend/services/helpers.py b/agenta-backend/agenta_backend/services/helpers.py index e848000e10..321be807c7 100644 --- a/agenta-backend/agenta_backend/services/helpers.py +++ b/agenta-backend/agenta_backend/services/helpers.py @@ -15,8 +15,7 @@ def format_inputs(list_of_dictionaries: List[Dict[str, Any]]) -> Dict: formatted_dictionary = {} for dictionary in list_of_dictionaries: - input_name = list(dictionary.keys())[0] - formatted_dictionary[input_name] = dictionary[input_name] + formatted_dictionary[dictionary["input_name"]] = dictionary["input_value"] return formatted_dictionary diff --git a/agenta-backend/agenta_backend/tasks/evaluations.py b/agenta-backend/agenta_backend/tasks/evaluations.py index b014f4dce6..b843d95953 100644 --- a/agenta-backend/agenta_backend/tasks/evaluations.py +++ b/agenta-backend/agenta_backend/tasks/evaluations.py @@ -24,7 +24,6 @@ Result, ) from agenta_backend.services 
import evaluators_service -from agenta_backend.services.helpers import format_inputs from agenta_backend.models.api.evaluation_model import NewEvaluation @@ -79,12 +78,10 @@ def evaluate( fetch_evaluator_config(evaluator_config_id) ) - # Format inputs for custom code - formatted_inputs = format_inputs(testset.csvdata) additional_kwargs = ( { "app_params": app_variant_db.config.parameters, - "inputs": formatted_inputs, + "inputs": data_point, } if evaluator_config.evaluator_key == "custom_code_run" else {} From 5b946a2898dffb11211e686f1a5edc93844f321d Mon Sep 17 00:00:00 2001 From: Abram Date: Mon, 25 Dec 2023 14:04:29 +0100 Subject: [PATCH 099/414] Update - add comment in additional_kwargs inputs key --- agenta-backend/agenta_backend/tasks/evaluations.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/agenta-backend/agenta_backend/tasks/evaluations.py b/agenta-backend/agenta_backend/tasks/evaluations.py index b843d95953..54694366c9 100644 --- a/agenta-backend/agenta_backend/tasks/evaluations.py +++ b/agenta-backend/agenta_backend/tasks/evaluations.py @@ -81,7 +81,7 @@ def evaluate( additional_kwargs = ( { "app_params": app_variant_db.config.parameters, - "inputs": data_point, + "inputs": data_point, # TODO: fetch input from config parameters when #1102 has been fixed } if evaluator_config.evaluator_key == "custom_code_run" else {} From a15600486a862141bc053dda579a41bcfba9c992 Mon Sep 17 00:00:00 2001 From: Akrem Abayed Date: Mon, 25 Dec 2023 16:58:59 +0100 Subject: [PATCH 100/414] update evaluators --- .../models/api/evaluation_model.py | 2 + .../resources/evaluators/evaluators.json | 50 ++++++++++++++++--- 2 files changed, 46 insertions(+), 6 deletions(-) diff --git a/agenta-backend/agenta_backend/models/api/evaluation_model.py b/agenta-backend/agenta_backend/models/api/evaluation_model.py index 6b54501f0b..7befadb178 100644 --- a/agenta-backend/agenta_backend/models/api/evaluation_model.py +++ b/agenta-backend/agenta_backend/models/api/evaluation_model.py @@ -175,6 +175,8 @@ class NewEvaluation(BaseModel): class Evaluator(BaseModel): name: str key: str + icon: str + settings_template: dict class NewEvaluatorConfig(BaseModel): diff --git a/agenta-backend/agenta_backend/resources/evaluators/evaluators.json b/agenta-backend/agenta_backend/resources/evaluators/evaluators.json index 7a81798ef9..673528ec46 100644 --- a/agenta-backend/agenta_backend/resources/evaluators/evaluators.json +++ b/agenta-backend/agenta_backend/resources/evaluators/evaluators.json @@ -1,26 +1,64 @@ [ { "name": "Exact Match", - "key": "auto_exact_match" + "key": "auto_exact_match", + "icon": "", + "settings_template": {} }, { "name": "Similarity Match", - "key": "auto_similarity_match" + "key": "auto_similarity_match", + "icon": "", + "settings_template": { + "threshold": 0.5 + } }, { "name": "Regex Test", - "key": "auto_regex_test" + "key": "auto_regex_test", + "icon": "", + "settings_template": { + "regex_pattern": { + "type": "regex", + "default": "", + "description": "Pattern (ex: ^this_word\\d{3}$)" + } + } }, { "name": "AI Critique", - "key": "auto_ai_critique" + "key": "auto_ai_critique", + "icon": "", + "settings_template": { + "prompt_template": { + "type": "text", + "default": "We have an LLM App that we want to evaluate its outputs....", + "description": "" + } + } }, { "name": "Code Evaluation", - "key": "custom_code_run" + "key": "auto_custom_code_run", + "icon": "", + "settings_template": {} }, { "name": "Webhook test", - "key": "auto_webhook_test" + "key": "auto_webhook_test", + "icon": 
"", + "settings_template": {} + }, + { + "name": "A/B Test", + "key": "human_a_b_testing", + "icon": "", + "settings_template": {} + }, + { + "name": "Single Model Test", + "key": "human_single_model_test", + "icon": "", + "settings_template": {} } ] From d58e7f34469e93cc0656917e33d389e8e854f94c Mon Sep 17 00:00:00 2001 From: Abram Date: Mon, 25 Dec 2023 18:27:26 +0100 Subject: [PATCH 101/414] Update - modified aggregate evaluator results --- .../agenta_backend/models/db_models.py | 2 +- .../services/evaluators_service.py | 16 ++++--- .../agenta_backend/tasks/evaluations.py | 42 +++++++++++-------- 3 files changed, 35 insertions(+), 25 deletions(-) diff --git a/agenta-backend/agenta_backend/models/db_models.py b/agenta-backend/agenta_backend/models/db_models.py index b3b1870d1f..91f08bb627 100644 --- a/agenta-backend/agenta_backend/models/db_models.py +++ b/agenta-backend/agenta_backend/models/db_models.py @@ -224,7 +224,7 @@ class Config: class Result(EmbeddedModel): type: str - value: Union[str, float, int, bool] + value: Any class EvaluationScenarioResult(EmbeddedModel): diff --git a/agenta-backend/agenta_backend/services/evaluators_service.py b/agenta-backend/agenta_backend/services/evaluators_service.py index 94d9a3abe8..8c830f48df 100644 --- a/agenta-backend/agenta_backend/services/evaluators_service.py +++ b/agenta-backend/agenta_backend/services/evaluators_service.py @@ -48,12 +48,16 @@ def auto_webhook_test( ) -> Result: try: with httpx.Client() as client: - response = client.post( - url=settings_values["webhook_url"], json=settings_values["webhook_body"] - ) - response.raise_for_status() - response_data = response.json() - score = response_data.get("score", None) + # response = client.post( + # url=settings_values["webhook_url"], json=settings_values["webhook_body"] + # ) + # response.raise_for_status() + # response_data = response.json() + # score = response_data.get("score", None) + import secrets + + random_generator = secrets.SystemRandom() + score = random_generator.random() if not score: raise httpx.HTTPError("Webhook did not return a score") if score < 0 or score > 1: diff --git a/agenta-backend/agenta_backend/tasks/evaluations.py b/agenta-backend/agenta_backend/tasks/evaluations.py index b843d95953..88905dd375 100644 --- a/agenta-backend/agenta_backend/tasks/evaluations.py +++ b/agenta-backend/agenta_backend/tasks/evaluations.py @@ -81,7 +81,7 @@ def evaluate( additional_kwargs = ( { "app_params": app_variant_db.config.parameters, - "inputs": data_point, + "inputs": data_point, # TODO: fetch input from config parameters when #1102 has been fixed } if evaluator_config.evaluator_key == "custom_code_run" else {} @@ -121,27 +121,33 @@ def evaluate( ) ) - # aggregated_results = loop.run_until_complete( - # aggregate_evaluator_results(app, evaluators_aggregated_data) - # ) - # updated_evaluation = loop.run_until_complete( - # update_evaluation_with_aggregated_results( - # new_evaluation_db.id, aggregated_results - # ) - # ) + aggregated_results = loop.run_until_complete( + aggregate_evaluator_results(app, evaluators_aggregated_data) + ) + updated_evaluation = loop.run_until_complete( + update_evaluation_with_aggregated_results( + new_evaluation_db.id, aggregated_results + ) + ) async def aggregate_evaluator_results( app: AppDB, evaluators_aggregated_data: dict ) -> List[AggregatedResult]: aggregated_results = [] - # TODO: find a good solution for aggregating evaluator results - # for evaluator_key, values in evaluators_aggregated_data.items(): - # average_value = sum(values) / 
len(values) if values else 0 - # evaluator_config = await fetch_evaluator_config_by_appId(app.id, evaluator_key) - # aggregated_result = AggregatedResult( - # evaluator_config=evaluator_config.id, - # result=Result(type="number", value=average_value), - # ) - # aggregated_results.append(aggregated_result) + for evaluator_key, results in evaluators_aggregated_data.items(): + if evaluator_key != "auto_ai_critique": + average_value = ( + sum([result.value for result in results]) / len(results) + if results + else 0 + ) + evaluator_config = await fetch_evaluator_config_by_appId( + app.id, evaluator_key + ) + aggregated_result = AggregatedResult( + evaluator_config=evaluator_config.id, + result=Result(type="number", value=average_value), + ) + aggregated_results.append(aggregated_result) return aggregated_results From 21c39b52d80ae58ee06410568388a6b12e745867 Mon Sep 17 00:00:00 2001 From: Abram Date: Mon, 25 Dec 2023 18:31:26 +0100 Subject: [PATCH 102/414] :art: Format - ran black --- .../services/evaluators_service.py | 16 ++++++---------- .../agenta_backend/tasks/evaluations.py | 2 +- 2 files changed, 7 insertions(+), 11 deletions(-) diff --git a/agenta-backend/agenta_backend/services/evaluators_service.py b/agenta-backend/agenta_backend/services/evaluators_service.py index 8c830f48df..94d9a3abe8 100644 --- a/agenta-backend/agenta_backend/services/evaluators_service.py +++ b/agenta-backend/agenta_backend/services/evaluators_service.py @@ -48,16 +48,12 @@ def auto_webhook_test( ) -> Result: try: with httpx.Client() as client: - # response = client.post( - # url=settings_values["webhook_url"], json=settings_values["webhook_body"] - # ) - # response.raise_for_status() - # response_data = response.json() - # score = response_data.get("score", None) - import secrets - - random_generator = secrets.SystemRandom() - score = random_generator.random() + response = client.post( + url=settings_values["webhook_url"], json=settings_values["webhook_body"] + ) + response.raise_for_status() + response_data = response.json() + score = response_data.get("score", None) if not score: raise httpx.HTTPError("Webhook did not return a score") if score < 0 or score > 1: diff --git a/agenta-backend/agenta_backend/tasks/evaluations.py b/agenta-backend/agenta_backend/tasks/evaluations.py index 6e982d58e3..88905dd375 100644 --- a/agenta-backend/agenta_backend/tasks/evaluations.py +++ b/agenta-backend/agenta_backend/tasks/evaluations.py @@ -81,7 +81,7 @@ def evaluate( additional_kwargs = ( { "app_params": app_variant_db.config.parameters, - "inputs": data_point, # TODO: fetch input from config parameters when #1102 has been fixed + "inputs": data_point, # TODO: fetch input from config parameters when #1102 has been fixed } if evaluator_config.evaluator_key == "custom_code_run" else {} From 33150f81ae8e6fa0190ec45d9d44c328c4d3971f Mon Sep 17 00:00:00 2001 From: Akrem Abayed Date: Mon, 25 Dec 2023 19:20:43 +0100 Subject: [PATCH 103/414] skip auto evaluation flow for human evaluation --- .../routers/evaluation_router.py | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/agenta-backend/agenta_backend/routers/evaluation_router.py b/agenta-backend/agenta_backend/routers/evaluation_router.py index 7c7c90fe1f..5cb4b3af4e 100644 --- a/agenta-backend/agenta_backend/routers/evaluation_router.py +++ b/agenta-backend/agenta_backend/routers/evaluation_router.py @@ -90,12 +90,17 @@ async def create_evaluation( new_evaluation_data=new_evaluation_data, evaluators_configs=payload.evaluators_configs, ) - - 
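The aggregation reinstated in patch 101 reduces each evaluator's per-scenario results to a simple mean and skips the free-text AI critique, whose output has no numeric score. A rough, self-contained sketch of that reduction is shown below; plain floats stand in for the Result/AggregatedResult ODM models and the evaluator keys are illustrative.

    # Per-evaluator averaging in the spirit of patch 101; plain floats stand in
    # for the Result/AggregatedResult models, and the keys are illustrative.
    from typing import Dict, List


    def aggregate_evaluator_results(
        evaluators_aggregated_data: Dict[str, List[float]]
    ) -> Dict[str, float]:
        aggregated_results: Dict[str, float] = {}
        for evaluator_key, values in evaluators_aggregated_data.items():
            if evaluator_key == "auto_ai_critique":
                continue  # free-text critique output has no numeric score to average
            aggregated_results[evaluator_key] = (
                sum(values) / len(values) if values else 0
            )
        return aggregated_results


    print(aggregate_evaluator_results({"auto_exact_match": [1, 0, 1, 1], "auto_ai_critique": []}))
    # -> {'auto_exact_match': 0.75}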
# Start celery task - evaluate.delay( - app_data, new_evaluation_data, evaluation.id, evaluation.testset_id - ) - return evaluation + if ( + payload.evaluators_configs.len == 1 + and payload.evaluators_configs.evaluator_key + in ["human_a_b_testing", "human_single_model_test"] + ): + return evaluation + else: + evaluate.delay( + app_data, new_evaluation_data, evaluation.id, evaluation.testset_id + ) + return evaluation except KeyError: raise HTTPException( status_code=400, @@ -121,7 +126,7 @@ async def fetch_evaluation_status(evaluation_id: str, request: Request): evaluation = await evaluation_service.fetch_evaluation( evaluation_id, **user_org_data ) - return evaluation.status + return {"status": evaluation.status} except Exception as exc: raise HTTPException(status_code=500, detail=str(exc)) From 0417b2deefa94f162f192fc6cb1eb4c3cf626834 Mon Sep 17 00:00:00 2001 From: MohammedMaaz Date: Tue, 26 Dec 2023 12:38:13 +0500 Subject: [PATCH 104/414] UI: new evaluation, new evaluator, evaluation scenarios --- agenta-web/src/components/Layout/Layout.tsx | 187 ++++++++-------- agenta-web/src/components/Sidebar/Sidebar.tsx | 2 +- .../evaluationResults/EvaluationResults.tsx | 198 +++++++++++------ .../evaluationResults/NewEvaluationModal.tsx | 190 +++++++++++++++++ .../evaluations/evaluationResults/mock.ts | 181 ++++++++++++++-- .../EvaluationScenarios.tsx | 149 +++++++++++++ .../evaluations/evaluators/EvaluatorCard.tsx | 20 +- .../evaluations/evaluators/Evaluators.tsx | 61 ++++-- .../evaluators/NewEvaluatorModal.tsx | 201 ++++++++++++++++++ agenta-web/src/hooks/useAppId.ts | 6 + agenta-web/src/lib/Types.ts | 28 ++- agenta-web/src/lib/helpers/axiosConfig.ts | 2 + agenta-web/src/lib/helpers/colors.ts | 18 ++ agenta-web/src/lib/services/api.ts | 5 + .../evaluations-new/[evaluation_id]/index.tsx | 8 + .../apps/[app_id]/evaluations-new/index.tsx | 1 + agenta-web/src/services/evaluations/index.ts | 103 +++++++++ 17 files changed, 1144 insertions(+), 216 deletions(-) create mode 100644 agenta-web/src/components/pages/evaluations/evaluationResults/NewEvaluationModal.tsx create mode 100644 agenta-web/src/components/pages/evaluations/evaluationScenarios/EvaluationScenarios.tsx create mode 100644 agenta-web/src/components/pages/evaluations/evaluators/NewEvaluatorModal.tsx create mode 100644 agenta-web/src/hooks/useAppId.ts create mode 100644 agenta-web/src/pages/apps/[app_id]/evaluations-new/[evaluation_id]/index.tsx create mode 100644 agenta-web/src/services/evaluations/index.ts diff --git a/agenta-web/src/components/Layout/Layout.tsx b/agenta-web/src/components/Layout/Layout.tsx index db0de4bf51..1cc9aee47e 100644 --- a/agenta-web/src/components/Layout/Layout.tsx +++ b/agenta-web/src/components/Layout/Layout.tsx @@ -17,6 +17,7 @@ import Image from "next/image" import moonIcon from "@/media/night.png" import sunIcon from "@/media/sun.png" import {useProfileData} from "@/contexts/profile.context" +import {ThemeProvider} from "react-jss" const {Content, Footer} = Layout @@ -124,6 +125,7 @@ const App: React.FC = ({children}) => { const router = useRouter() const appId = router.query.app_id as string const isDarkTheme = appTheme === "dark" + const {token} = theme.useToken() useEffect(() => { if (user && isDemo()) { @@ -215,101 +217,106 @@ const App: React.FC = ({children}) => { algorithm: isDarkTheme ? theme.darkAlgorithm : theme.defaultAlgorithm, }} > - - - - - - Apps}, - {title: capitalizedAppName}, - ]} - /> -
- toggleAppTheme("system"), - }, - { - key: "light", - label: "Light", - onClick: () => toggleAppTheme("light"), - }, - { - key: "dark", - label: "Dark", - onClick: () => toggleAppTheme("dark"), - }, - ], - selectedKeys: [themeMode], - }} + + + + + + + Apps}, + {title: capitalizedAppName}, + ]} + /> +
+ toggleAppTheme("system"), + }, + { + key: "light", + label: "Light", + onClick: () => toggleAppTheme("light"), + }, + { + key: "dark", + label: "Dark", + onClick: () => toggleAppTheme("dark"), + }, + ], + selectedKeys: [themeMode], + }} + > + e.preventDefault()}> + + {`Curren + + + + + +
+
+ + {children} + +
+
-
- - {children} - -
-
- - - - - - - - - - - -
Copyright © {new Date().getFullYear()} | Agenta.
-
+ + + +
Copyright © {new Date().getFullYear()} | Agenta.
+ +
- + )} diff --git a/agenta-web/src/components/Sidebar/Sidebar.tsx b/agenta-web/src/components/Sidebar/Sidebar.tsx index e35994fc37..46eeee7235 100644 --- a/agenta-web/src/components/Sidebar/Sidebar.tsx +++ b/agenta-web/src/components/Sidebar/Sidebar.tsx @@ -278,7 +278,7 @@ const Sidebar: React.FC = () => { ? "Perform 1-to-1 variant comparisons on testsets to identify superior options." : "" } - key="evaluations" + key="evaluations-new" > }> div:nth-of-type(1)": { - width: 16, - height: 16, + width: 6, + height: 6, borderRadius: "50%", - backgroundColor: "#52c41a", }, }, dot: { @@ -44,67 +50,111 @@ const useStyles = createUseStyles({ }, }) +const statusMapper = (token: GlobalToken) => ({ + [EvaluationStatus.INITIALIZED]: { + label: "Queued", + color: token.colorTextSecondary, + }, + [EvaluationStatus.STARTED]: { + label: "Running", + color: token.colorWarning, + }, + [EvaluationStatus.FINISHED]: { + label: "Completed", + color: token.colorSuccess, + }, + [EvaluationStatus.ERROR]: { + label: "Failed", + color: token.colorError, + }, +}) + interface Props {} const EvaluationResults: React.FC = () => { const {appTheme} = useAppTheme() const classes = useStyles() - const [rowData, setRowData] = useState<_Evaluation[]>(Mock.evaluations) + const appId = useAppId() + const router = useRouter() + const [evaluations, setEvaluations] = useState<_Evaluation[]>([]) + const [newEvalModalOpen, setNewEvalModalOpen] = useState(false) + const [fetching, setFetching] = useState(false) + const {token} = theme.useToken() + + const fetcher = () => { + setFetching(true) + fetchAllEvaluations(appId) + .then(setEvaluations) + .catch(console.error) + .finally(() => setFetching(false)) + } + + useEffect(() => { + fetcher() + }, [appId]) const evaluatorConfigs = useMemo( () => uniqBy( - rowData + evaluations .map((item) => item.aggregated_results.map((item) => item.evaluator_config)) .flat(), "id", ), - [rowData], + [evaluations], ) - const [colDefs, setColDefs] = useState[]>([ - {field: "testset.name"}, - { - field: "variants", - valueGetter: (params) => params.data?.variants[0].variantName, - headerName: "Variant", - }, - ...evaluatorConfigs.map( - (config) => - ({ - field: "aggregated_results", - headerComponent: () => ( - - {config.name} - - ), - valueGetter: (params) => - params.data?.aggregated_results.find( - (item) => item.evaluator_config.id === config.id, - )?.result?.value || "", - }) as ColDef<_Evaluation>, - ), - { - field: "status", - cellRenderer: (params: ICellRendererParams) => { - const classes = useStyles() + const colDefs = useMemo(() => { + const colDefs: ColDef<_Evaluation>[] = [ + {field: "testset.name"}, + { + field: "variants", + valueGetter: (params) => params.data?.variants[0].variantName, + headerName: "Variant", + }, + ...evaluatorConfigs.map( + (config) => + ({ + field: "aggregated_results", + headerComponent: () => ( + + {config.name} + + ), + valueGetter: (params) => + params.data?.aggregated_results.find( + (item) => item.evaluator_config.id === config.id, + )?.result?.value || "", + }) as ColDef<_Evaluation>, + ), + { + field: "status", + cellRenderer: (params: ICellRendererParams<_Evaluation>) => { + const classes = useStyles() + const {label, color} = statusMapper(token)[params.value as EvaluationStatus] - return ( -
-
-
{capitalize(params.value)}
- - {(params.data?.duration || 0) / 1000} -
- ) + return ( + +
+ {label} + + + {dayjs + .duration(params.data?.duration || 0, "milliseconds") + .humanize()} + + + ) + }, }, - }, - { - field: "created_at", - headerName: "Created", - valueFormatter: (params) => dayjs(params.value).fromNow(), - }, - ]) + { + field: "created_at", + headerName: "Created", + valueFormatter: (params) => dayjs(params.value).fromNow(), + }, + ] + return colDefs + }, [evaluatorConfigs]) return (
@@ -115,21 +165,39 @@ const EvaluationResults: React.FC = () => { - -
- - rowData={rowData} - columnDefs={colDefs} - getRowId={(params) => params.data.id} - /> -
+ +
+ + rowData={evaluations} + columnDefs={colDefs} + getRowId={(params) => params.data.id} + onRowClicked={(params) => + router.push(`/${router.asPath}/${params.data?.id}`) + } + /> +
+
+ + setNewEvalModalOpen(false)} + onSuccess={() => { + setNewEvalModalOpen(false) + fetcher() + }} + />
) } diff --git a/agenta-web/src/components/pages/evaluations/evaluationResults/NewEvaluationModal.tsx b/agenta-web/src/components/pages/evaluations/evaluationResults/NewEvaluationModal.tsx new file mode 100644 index 0000000000..9cbeea823f --- /dev/null +++ b/agenta-web/src/components/pages/evaluations/evaluationResults/NewEvaluationModal.tsx @@ -0,0 +1,190 @@ +import {useAppId} from "@/hooks/useAppId" +import {Evaluator, EvaluatorConfig, JSSTheme, Variant, testset} from "@/lib/Types" +import {fetchTestsets, fetchVariants} from "@/lib/services/api" +import { + CreateEvaluationData, + createEvalutaiton, + fetchAllEvaluatorConfigs, + fetchAllEvaluators, +} from "@/services/evaluations" +import {PlusOutlined} from "@ant-design/icons" +import {Form, Modal, Select, Spin, Tag, Typography} from "antd" +import dayjs from "dayjs" +import Image from "next/image" +import React, {useEffect, useState} from "react" +import {createUseStyles} from "react-jss" + +const useStyles = createUseStyles((theme: JSSTheme) => ({ + spinContainer: { + display: "grid", + placeItems: "center", + height: "100%", + }, + selector: { + width: 300, + }, + evaluationImg: { + width: 20, + height: 20, + marginRight: 12, + filter: theme.isDark ? "invert(1)" : "none", + }, + configRow: { + display: "flex", + alignItems: "center", + justifyContent: "space-between", + }, + configRowContent: { + display: "flex", + alignItems: "center", + }, + date: { + fontSize: "0.75rem", + color: "#888", + }, + tag: { + transform: "scale(0.8)", + }, +})) + +type Props = { + onSuccess?: () => void +} & React.ComponentProps + +const NewEvaluationModal: React.FC = ({onSuccess, ...props}) => { + const classes = useStyles() + const appId = useAppId() + const [fetching, setFetching] = useState(false) + const [testSets, setTestSets] = useState([]) + const [variants, setVariants] = useState([]) + const [evaluatorConfigs, setEvaluatorConfigs] = useState([]) + const [evaluators, setEvaluators] = useState([]) + const [submitLoading, setSubmitLoading] = useState(false) + const [form] = Form.useForm() + + useEffect(() => { + setFetching(true) + form.resetFields() + Promise.all([ + fetchTestsets(appId), + fetchVariants(appId), + fetchAllEvaluatorConfigs(appId), + fetchAllEvaluators(), + ]) + .then(([testSets, variants, evaluatorConfigs, evaluators]) => { + setTestSets(testSets) + setVariants(variants) + setEvaluatorConfigs(evaluatorConfigs) + setEvaluators(evaluators) + }) + .catch(console.error) + .finally(() => setFetching(false)) + }, [props.open, appId]) + + const onSubmit = (values: CreateEvaluationData) => { + setSubmitLoading(true) + createEvalutaiton(appId, values) + .then(onSuccess) + .catch(console.error) + .finally(() => setSubmitLoading(false)) + } + + return ( + , loading: submitLoading}} + {...props} + > + +
+ + + + + + + + + +
+
+
+ ) +} + +export default NewEvaluationModal diff --git a/agenta-web/src/components/pages/evaluations/evaluationResults/mock.ts b/agenta-web/src/components/pages/evaluations/evaluationResults/mock.ts index bc86328bb2..69942606b5 100644 --- a/agenta-web/src/components/pages/evaluations/evaluationResults/mock.ts +++ b/agenta-web/src/components/pages/evaluations/evaluationResults/mock.ts @@ -1,5 +1,6 @@ import { EvaluationSettingsTemplate, + EvaluationStatus, Evaluator, EvaluatorConfig, Org, @@ -7,6 +8,7 @@ import { User, Variant, _Evaluation, + _EvaluationScenario, } from "@/lib/Types" import exactMatchImg from "@/media/target.png" import similarityImg from "@/media/transparency.png" @@ -14,8 +16,8 @@ import regexImg from "@/media/programming.png" import webhookImg from "@/media/link.png" import aiImg from "@/media/artificial-intelligence.png" import codeImg from "@/media/browser.png" -import {PresetColors} from "antd/es/theme/internal" -import {stringToNumberInRange} from "@/lib/helpers/utils" +import {pickRandom, stringToNumberInRange} from "@/lib/helpers/utils" +import {getTagColors} from "@/lib/helpers/colors" const organizations: Org[] = [ { @@ -65,6 +67,43 @@ const evaluatorSettinsTemplates: EvaluationSettingsTemplate[] = [ type: "number", default: 0.5, description: "Threshold for similarity matching", + label: "Similarity Threshold", + }, + { + type: "text", + description: "Threshold for similarity matching", + label: "System Prompt", + }, + { + type: "code", + description: "Python code for evaluation", + label: "Code", + default: `from typing import Dict + + def evaluate( + app_params: Dict[str, str], + inputs: Dict[str, str], + output: str, + correct_answer: str + ) -> float: + # ... + return 0.75 # Replace with your calculated score`, + }, + { + type: "boolean", + default: false, + description: "Whether to use the default webhook", + label: "Use Default Webhook", + }, + { + type: "regex", + description: "Regex pattern ex: ^[0-9]{3}-[0-9]{3}-[0-9]{4}$", + label: "Regex", + }, + { + type: "string", + description: "URL of the webhook", + label: "Webhook URL", }, ] @@ -86,43 +125,48 @@ const evaluators: Evaluator[] = [ { name: "Regex Test", key: "auto_regex_test", - settings_template: {}, + settings_template: { + regex_pattern: evaluatorSettinsTemplates[4], + regex_should_match: evaluatorSettinsTemplates[3], + }, icon_url: regexImg, }, { name: "AI Critique", key: "auto_ai_critique", - settings_template: {}, + settings_template: { + llm_app_prompt_template: evaluatorSettinsTemplates[1], + }, icon_url: aiImg, }, { name: "Code Evaluation", key: "custom_code_run", - settings_template: {}, + settings_template: { + custom_code_evaluation_id: evaluatorSettinsTemplates[2], + }, icon_url: codeImg, }, { - name: "Webhook test", + name: "Webhook Test", key: "auto_webhook_test", - settings_template: {}, + settings_template: { + webhook_url: evaluatorSettinsTemplates[5], + }, icon_url: webhookImg, }, ].map((item) => ({ ...(item as Evaluator), - color: PresetColors[stringToNumberInRange(item.key, 0, PresetColors.length - 1)], + color: getTagColors()[stringToNumberInRange(item.key, 0, getTagColors().length - 1)], })) -const evaluatorConfigs: EvaluatorConfig[] = [ - { - evaluator_key: "similarity", - name: "Nearly Similar", - settings_values: { - similarity_threshold: 0.4, - }, - created_at: "2021-01-01T00:00:00.000Z", - id: "config1", - }, -] +const evaluatorConfigs: EvaluatorConfig[] = pickRandom(evaluators, 7).map((item, ix) => ({ + evaluator_key: item.key, + id: ix + "", + name: `Evaluator ${ix}`, + 
settings_values: {}, + created_at: new Date().toString(), +})) const evaluations: _Evaluation[] = [ { @@ -130,7 +174,7 @@ const evaluations: _Evaluation[] = [ organization: organizations[0], user: users[0], testset: testsets[0], - status: "completed", + status: EvaluationStatus.FINISHED, variants: [variants[0]], aggregated_results: [ { @@ -144,6 +188,102 @@ const evaluations: _Evaluation[] = [ created_at: "2021-01-01T00:00:00.000Z", duration: 50000, }, + { + id: "evaluation2", + organization: organizations[0], + user: users[0], + testset: testsets[0], + status: EvaluationStatus.INITIALIZED, + variants: [variants[0]], + aggregated_results: [ + { + evaluator_config: evaluatorConfigs[1], + result: { + type: "string", + value: "passed", + }, + }, + ], + created_at: "2022-01-01T00:00:00.000Z", + duration: 120000, + }, + { + id: "evaluation2", + organization: organizations[0], + user: users[0], + testset: testsets[0], + status: EvaluationStatus.STARTED, + variants: [variants[0]], + aggregated_results: [ + { + evaluator_config: evaluatorConfigs[2], + result: { + type: "string", + value: "valid", + }, + }, + ], + created_at: "2022-05-01T00:00:00.000Z", + duration: 120000, + }, + { + id: "evaluation2", + organization: organizations[0], + user: users[0], + testset: testsets[0], + status: EvaluationStatus.ERROR, + variants: [variants[0]], + aggregated_results: [ + { + evaluator_config: evaluatorConfigs[0], + result: { + type: "number", + value: 15, + }, + }, + ], + created_at: "2023-05-01T00:00:00.000Z", + duration: 2000, + }, +] + +const evaluationScenarios: _EvaluationScenario[] = [ + { + id: "evaluationScenario1", + user: users[0], + organization: organizations[0], + evaluation: evaluations[0], + inputs: [ + { + name: "country", + type: "text", + value: "Sample input text", + }, + ], + outputs: [ + { + type: "number", + value: 32.5, + }, + ], + correct_answer: { + type: "number", + value: 28, + }, + created_at: "2021-01-01T00:00:00.000Z", + updated_at: "2021-01-01T00:00:00.000Z", + is_pinned: false, + note: "This is a note", + evaluators_configs: [evaluatorConfigs[0]], + results: [ + { + evaluator: evaluators.find( + (item) => item.key === evaluatorConfigs[0].evaluator_key, + )!, + result: 12, + }, + ], + }, ] const Mock = { @@ -155,6 +295,7 @@ const Mock = { evaluators, evaluatorConfigs, evaluations, + evaluationScenarios, } export default Mock diff --git a/agenta-web/src/components/pages/evaluations/evaluationScenarios/EvaluationScenarios.tsx b/agenta-web/src/components/pages/evaluations/evaluationScenarios/EvaluationScenarios.tsx new file mode 100644 index 0000000000..c0b9f9e6bc --- /dev/null +++ b/agenta-web/src/components/pages/evaluations/evaluationScenarios/EvaluationScenarios.tsx @@ -0,0 +1,149 @@ +import {useAppTheme} from "@/components/Layout/ThemeContextProvider" +import {useAppId} from "@/hooks/useAppId" +import {JSSTheme, _Evaluation, _EvaluationScenario} from "@/lib/Types" +import {fetchAllEvaluationScenarios, fetchEvaluation} from "@/services/evaluations" +import {DeleteOutlined, DownloadOutlined} from "@ant-design/icons" +import {ColDef} from "ag-grid-community" +import {AgGridReact} from "ag-grid-react" +import {Spin, Typography} from "antd" +import dayjs from "dayjs" +import {useRouter} from "next/router" +import React, {useEffect, useMemo, useState} from "react" +import {createUseStyles} from "react-jss" + +const useStyles = createUseStyles((theme: JSSTheme) => ({ + header: { + marginTop: "1rem", + display: "flex", + alignItems: "center", + justifyContent: "space-between", + + 
"& > h3": { + margin: 0, + }, + + "& > :last-child": { + display: "flex", + alignItems: "center", + gap: "1rem", + }, + }, + date: { + fontSize: "0.75rem", + color: theme.colorTextSecondary, + display: "inline-block", + marginBottom: "1rem", + }, + table: { + height: 500, + }, +})) + +interface Props {} + +const EvaluationScenarios: React.FC = () => { + const router = useRouter() + const appId = useAppId() + const classes = useStyles() + const {appTheme} = useAppTheme() + const evaluationId = router.query.evaluation_id as string + const [scenarios, setScenarios] = useState<_EvaluationScenario[]>([]) + const [evalaution, setEvaluation] = useState<_Evaluation>() + const [fetching, setFetching] = useState(false) + + const colDefs = useMemo(() => { + const colDefs: ColDef<_EvaluationScenario>[] = [] + if (!scenarios.length || !evalaution) return colDefs + + scenarios[0]?.inputs.forEach((input, index) => { + colDefs.push({ + headerName: `Input: ${input.name}`, + field: `inputs.${index}`, + valueGetter: (params) => { + return params.data?.inputs[index].value || "" + }, + }) + }) + colDefs.push({ + headerName: "Expected Output", + field: "correct_answer", + valueGetter: (params) => { + return params.data?.correct_answer?.value || "" + }, + }) + evalaution?.variants.forEach((variant, index) => { + colDefs.push({ + headerName: `Output (${variant.variantName})`, + field: `outputs.${index}`, + valueGetter: (params) => { + return params.data?.outputs[index].value || "" + }, + }) + }) + scenarios[0]?.evaluators_configs.forEach((config, index) => { + colDefs.push({ + headerName: `Evaluator: ${config.name}`, + field: `results`, + valueGetter: (params) => { + return ( + params.data?.results.find( + (item) => item.evaluator.key === config.evaluator_key, + )?.result || "" + ) + }, + }) + }) + return colDefs + }, [evalaution, scenarios]) + + const fetcher = () => { + setFetching(true) + Promise.all([ + fetchAllEvaluationScenarios(appId, evaluationId), + fetchEvaluation(appId, evaluationId), + ]) + .then(([scenarios, evaluation]) => { + setScenarios(scenarios) + setEvaluation(evaluation) + }) + .catch(console.error) + .finally(() => setFetching(false)) + } + + useEffect(() => { + fetcher() + }, [appId, evaluationId]) + + return ( +
+
+ + Evaluation Result (Testset: {evalaution?.testset.name || ""}) + +
+ + +
+
+ + {dayjs(evalaution?.created_at).format("MM DD YYYY | H:M a")} + + + +
+ + rowData={scenarios} + columnDefs={colDefs} + getRowId={(params) => params.data.id} + /> +
+
+
+ ) +} + +export default EvaluationScenarios diff --git a/agenta-web/src/components/pages/evaluations/evaluators/EvaluatorCard.tsx b/agenta-web/src/components/pages/evaluations/evaluators/EvaluatorCard.tsx index bc83bc550a..97cc664ce0 100644 --- a/agenta-web/src/components/pages/evaluations/evaluators/EvaluatorCard.tsx +++ b/agenta-web/src/components/pages/evaluations/evaluators/EvaluatorCard.tsx @@ -1,18 +1,13 @@ import React from "react" -import {EvaluatorConfig} from "@/lib/Types" +import {EvaluatorConfig, JSSTheme} from "@/lib/Types" import {DeleteOutlined, EditOutlined} from "@ant-design/icons" import {Card, Tag, Typography} from "antd" import {createUseStyles} from "react-jss" import Mock from "../evaluationResults/mock" import dayjs from "dayjs" import Image from "next/image" -import {useAppTheme} from "@/components/Layout/ThemeContextProvider" -type StyleProps = { - themeMode: "dark" | "light" -} - -const useStyles = createUseStyles({ +const useStyles = createUseStyles((theme: JSSTheme) => ({ body: { display: "flex", flexDirection: "column", @@ -25,25 +20,24 @@ const useStyles = createUseStyles({ justifyContent: "space-between", marginBottom: "1.5rem", }, - evaluationImg: ({themeMode}: StyleProps) => ({ + evaluationImg: { width: 27, height: 27, marginRight: "8px", - filter: themeMode === "dark" ? "invert(1)" : "none", - }), + filter: theme.isDark ? "invert(1)" : "none", + }, name: { marginTop: "0.25rem", marginBottom: 0, }, -}) +})) interface Props { evaluatorConfig: EvaluatorConfig } const EvaluatorCard: React.FC = ({evaluatorConfig}) => { - const {appTheme} = useAppTheme() - const classes = useStyles({themeMode: appTheme} as StyleProps) + const classes = useStyles() const evaluator = Mock.evaluators.find((item) => item.key === evaluatorConfig.evaluator_key)! 
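The card now resolves its styles from a react-jss theme typed as JSSTheme (antd's GlobalToken plus an isDark flag) instead of passing the theme mode down as a style prop. That only works if a provider higher in the tree merges the antd token with the dark-mode flag; that wiring presumably lives in ThemeContextProvider and is not part of this diff, so the following is only a sketch of the assumed setup.

    // Assumed wiring, for illustration only — the real provider is not shown in this patch.
    import React from "react"
    import {ThemeProvider} from "react-jss"
    import {theme as antdTheme} from "antd"
    import {JSSTheme} from "@/lib/Types"

    const WithJssTheme: React.FC<{isDark: boolean; children: React.ReactNode}> = ({isDark, children}) => {
        const {token} = antdTheme.useToken()
        // Merge the antd design token with the dark-mode flag so useStyles callbacks can read theme.isDark
        const jssTheme: JSSTheme = {...token, isDark}
        return <ThemeProvider theme={jssTheme}>{children}</ThemeProvider>
    }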
return ( diff --git a/agenta-web/src/components/pages/evaluations/evaluators/Evaluators.tsx b/agenta-web/src/components/pages/evaluations/evaluators/Evaluators.tsx index 08f995db6f..e9d414a3f0 100644 --- a/agenta-web/src/components/pages/evaluations/evaluators/Evaluators.tsx +++ b/agenta-web/src/components/pages/evaluations/evaluators/Evaluators.tsx @@ -1,11 +1,12 @@ -import React, {useState} from "react" +import React, {useEffect, useState} from "react" import {createUseStyles} from "react-jss" -import Mock from "../evaluationResults/mock" import EvaluatorCard from "./EvaluatorCard" -import {Button, Space} from "antd" +import {Button, Space, Spin} from "antd" import {PlusCircleOutlined} from "@ant-design/icons" -import {pickRandom} from "@/lib/helpers/utils" import {EvaluatorConfig} from "@/lib/Types" +import NewEvaluatorModal from "./NewEvaluatorModal" +import {useAppId} from "@/hooks/useAppId" +import {fetchAllEvaluatorConfigs} from "@/services/evaluations" const useStyles = createUseStyles({ root: { @@ -27,28 +28,50 @@ interface Props {} const Evaluators: React.FC = () => { const classes = useStyles() - const [evaluatorConfigs, setEvaluatorConfigs] = useState( - pickRandom(Mock.evaluators, 7).map((item, ix) => ({ - evaluator_key: item.key, - id: ix + "", - name: `Evaluator ${ix}`, - settings_values: {}, - created_at: new Date().toString(), - })), - ) + const appId = useAppId() + const [evaluatorConfigs, setEvaluatorConfigs] = useState([]) + const [newEvalModalOpen, setNewEvalModalOpen] = useState(false) + const [fetching, setFetching] = useState(false) + + const fetcher = () => { + setFetching(true) + fetchAllEvaluatorConfigs(appId) + .then(setEvaluatorConfigs) + .catch(console.error) + .finally(() => setFetching(false)) + } + + useEffect(() => { + fetcher() + }, []) return (
- -
- {evaluatorConfigs.map((item) => ( - - ))} -
+ +
+ {evaluatorConfigs.map((item) => ( + + ))} +
+
+ + setNewEvalModalOpen(false)} + onSuccess={() => { + setNewEvalModalOpen(false) + fetcher() + }} + />
) } diff --git a/agenta-web/src/components/pages/evaluations/evaluators/NewEvaluatorModal.tsx b/agenta-web/src/components/pages/evaluations/evaluators/NewEvaluatorModal.tsx new file mode 100644 index 0000000000..7ca71a4a1d --- /dev/null +++ b/agenta-web/src/components/pages/evaluations/evaluators/NewEvaluatorModal.tsx @@ -0,0 +1,201 @@ +import {useAppTheme} from "@/components/Layout/ThemeContextProvider" +import {useAppId} from "@/hooks/useAppId" +import {EvaluationSettingsTemplate, Evaluator, JSSTheme} from "@/lib/Types" +import {isValidRegex} from "@/lib/helpers/validators" +import { + CreateEvaluationConfigData, + createEvaluatorConfig, + fetchAllEvaluators, +} from "@/services/evaluations" +import {InfoCircleOutlined, PlusOutlined} from "@ant-design/icons" +import {Editor} from "@monaco-editor/react" +import {Form, Input, InputNumber, Modal, Radio, Spin, Switch, Tooltip, theme} from "antd" +import {Rule} from "antd/es/form" +import Image from "next/image" +import React, {useEffect, useMemo, useState} from "react" +import {createUseStyles} from "react-jss" + +const useStyles = createUseStyles((theme: JSSTheme) => ({ + spinContainer: { + display: "grid", + placeItems: "center", + height: "100%", + }, + label: { + display: "flex", + alignItems: "center", + gap: "0.5rem", + }, + evaluationImg: { + width: 20, + height: 20, + marginRight: "8px", + filter: theme.isDark ? "invert(1)" : "none", + }, + radioBtn: { + display: "flex", + alignItems: "center", + gap: "0.325rem", + }, +})) + +type DynamicFormFieldProps = EvaluationSettingsTemplate & { + name: string | string[] +} + +const DynamicFormField: React.FC = ({ + name, + label, + type, + default: defaultVal, + description, +}) => { + const {appTheme} = useAppTheme() + const classes = useStyles() + const {token} = theme.useToken() + + const rules: Rule[] = [{required: true, message: "This field is required"}] + if (type === "regex") + rules.push({ + validator: (_, value) => + new Promise((res, rej) => + isValidRegex(value) ? res("") : rej("Regex pattern is not valid"), + ), + }) + + return ( + + {label} + {description && ( + + + + )} +
+ } + initialValue={defaultVal} + rules={rules} + > + {type === "string" || type === "regex" ? ( + + ) : type === "number" ? ( + + ) : type === "boolean" ? ( + + ) : type === "text" ? ( + + ) : type === "code" ? ( + + ) : null} + + ) +} + +type Props = { + onSuccess?: () => void +} & React.ComponentProps + +const NewEvaluatorModal: React.FC = ({onSuccess, ...props}) => { + const classes = useStyles() + const [fetching, setFetching] = useState(false) + const [evaluators, setEvaluators] = useState([]) + const [selectedEval, setSelectedEval] = useState(null) + const [submitLoading, setSubmitLoading] = useState(false) + const appId = useAppId() + const [form] = Form.useForm() + + const evalFields = useMemo( + () => + Object.keys(selectedEval?.settings_template || {}).map((key) => ({ + key, + ...selectedEval?.settings_template[key]!, + })), + [selectedEval], + ) + + useEffect(() => { + setFetching(true) + setSelectedEval(null) + form.resetFields() + fetchAllEvaluators() + .then(setEvaluators) + .catch(console.error) + .finally(() => setFetching(false)) + }, [props.open]) + + const onSubmit = (values: CreateEvaluationConfigData) => { + setSubmitLoading(true) + createEvaluatorConfig(appId, values) + .then(onSuccess) + .catch(console.error) + .finally(() => setSubmitLoading(false)) + } + + return ( + , loading: submitLoading}} + {...props} + > + +
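Regex-type fields are validated with isValidRegex from "@/lib/helpers/validators", whose body is not included in this patch. A plausible minimal implementation — an assumption, shown only to make the validation rule above concrete — simply tries to compile the pattern:

    // Assumed implementation of the imported validator (the real one lives in "@/lib/helpers/validators").
    export function isValidRegex(pattern: string): boolean {
        try {
            new RegExp(pattern) // throws a SyntaxError on malformed patterns
            return true
        } catch {
            return false
        }
    }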
+ + + + + + setSelectedEval( + evaluators.find((item) => item.key === e.target.value) || null, + ) + } + > + {evaluators.map((evaluator) => ( + +
+ {evaluator.icon_url && ( + Exact match + )} + {evaluator.name} +
+
+ ))} +
+
+ {evalFields.map((field) => ( + + ))} + +
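Each entry of the selected evaluator's settings_template is rendered by one DynamicFormField, so on submit the form yields a CreateEvaluationConfigData object for createEvaluatorConfig. The exact field name paths are elided in the JSX above, so the payload below is only an illustrative shape, using the similarity evaluator as an example:

    // Illustrative payload only; this assumes the dynamic fields are nested under settings_values.
    import {CreateEvaluationConfigData, createEvaluatorConfig} from "@/services/evaluations"

    const examplePayload: CreateEvaluationConfigData = {
        name: "Nearly Similar",
        evaluator_key: "auto_similarity_match",
        settings_values: {similarity_threshold: 0.5},
    }
    // The service POSTs this together with the current app_id to the evaluator-configs endpoint:
    // createEvaluatorConfig(appId, examplePayload)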
+
+ ) +} + +export default NewEvaluatorModal diff --git a/agenta-web/src/hooks/useAppId.ts b/agenta-web/src/hooks/useAppId.ts new file mode 100644 index 0000000000..19718d52bf --- /dev/null +++ b/agenta-web/src/hooks/useAppId.ts @@ -0,0 +1,6 @@ +import {useRouter} from "next/router" + +export const useAppId = (): string => { + const router = useRouter() + return (router.query.app_id ?? "") as string +} diff --git a/agenta-web/src/lib/Types.ts b/agenta-web/src/lib/Types.ts index c25943d6fa..cd96747ae5 100644 --- a/agenta-web/src/lib/Types.ts +++ b/agenta-web/src/lib/Types.ts @@ -1,5 +1,8 @@ import {StaticImageData} from "next/image" import {EvaluationFlow, EvaluationType} from "./enums" +import {GlobalToken} from "antd" + +export type JSSTheme = GlobalToken & {isDark: boolean} export interface testset { _id: string @@ -291,11 +294,13 @@ export type ChatMessage = { } type ValueType = number | string | boolean | GenericObject | null +type ValueTypeOptions = "text" | "number" | "boolean" | "string" | "code" | "regex" //evaluation revamp types export interface EvaluationSettingsTemplate { - type: string - default: ValueType + type: ValueTypeOptions + label: string + default?: ValueType description: string } @@ -316,7 +321,7 @@ export interface EvaluatorConfig { } export interface TypedValue { - type: string + type: ValueTypeOptions value: ValueType } @@ -325,12 +330,19 @@ export interface EvaluationScenarioResult { result: TypedValue } +export enum EvaluationStatus { + INITIALIZED = "EVALUATION_INITIALIZED", + STARTED = "EVALUATION_STARTED", + FINISHED = "EVALUATION_FINISHED", + ERROR = "EVALUATION_ERROR", +} + export interface _Evaluation { id: string organization: Org user: User testset: TestSet - status: "completed" | "failed" | "pending" + status: EvaluationStatus variants: Variant[] aggregated_results: { evaluator_config: EvaluatorConfig @@ -347,11 +359,11 @@ export interface _EvaluationScenario { evaluation: _Evaluation inputs: (TypedValue & {name: string})[] outputs: TypedValue[] - correct_answer?: string - created_at?: Date - updated_at?: Date + correct_answer?: TypedValue + created_at?: string + updated_at?: string is_pinned?: boolean note?: string evaluators_configs: EvaluatorConfig[] - results: EvaluationResult[] + results: {evaluator: Evaluator; result: ValueType}[] } diff --git a/agenta-web/src/lib/helpers/axiosConfig.ts b/agenta-web/src/lib/helpers/axiosConfig.ts index 8829470c38..258ad9f864 100644 --- a/agenta-web/src/lib/helpers/axiosConfig.ts +++ b/agenta-web/src/lib/helpers/axiosConfig.ts @@ -2,8 +2,10 @@ import axiosApi from "axios" import {getErrorMessage, globalErrorHandler} from "./errorHandler" import {signOut} from "supertokens-auth-react/recipe/thirdpartypasswordless" import router from "next/router" +import {getAgentaApiUrl} from "./utils" const axios = axiosApi.create({ + baseURL: getAgentaApiUrl(), headers: { "Content-Type": "application/json", }, diff --git a/agenta-web/src/lib/helpers/colors.ts b/agenta-web/src/lib/helpers/colors.ts index f4772ebab6..fa1f114573 100644 --- a/agenta-web/src/lib/helpers/colors.ts +++ b/agenta-web/src/lib/helpers/colors.ts @@ -40,6 +40,22 @@ const colors = [ "#0099FF", ] +const tagColors = [ + "blue", + "purple", + "cyan", + "green", + "magenta", + "pink", + "red", + "orange", + "yellow", + "volcano", + "geekblue", + "lime", + "gold", +] + export const getGradientFromStr = (value: string) => { return gradients[stringToNumberInRange(value, 0, gradients.length - 1)] } @@ -61,3 +77,5 @@ export const fadeColor = (hex: string, opacity: 
number) => { // Create the faded color in RGBA format return `rgba(${r}, ${g}, ${b}, ${opacity})` } + +export const getTagColors = () => [...tagColors] diff --git a/agenta-web/src/lib/services/api.ts b/agenta-web/src/lib/services/api.ts index eb72630ba0..b1eb5471f1 100644 --- a/agenta-web/src/lib/services/api.ts +++ b/agenta-web/src/lib/services/api.ts @@ -252,6 +252,11 @@ export const useLoadTestsetsList = (appId: string) => { } } +export const fetchTestsets = async (appId: string) => { + const response = await axios.get(`${getAgentaApiUrl()}/api/testsets/?app_id=${appId}`) + return response.data +} + export async function createNewTestset(appId: string, testsetName: string, testsetData: any) { const response = await axios.post(`${getAgentaApiUrl()}/api/testsets/${appId}/`, { name: testsetName, diff --git a/agenta-web/src/pages/apps/[app_id]/evaluations-new/[evaluation_id]/index.tsx b/agenta-web/src/pages/apps/[app_id]/evaluations-new/[evaluation_id]/index.tsx new file mode 100644 index 0000000000..fb2d997f15 --- /dev/null +++ b/agenta-web/src/pages/apps/[app_id]/evaluations-new/[evaluation_id]/index.tsx @@ -0,0 +1,8 @@ +import React from "react" +import EvaluationScenarios from "@/components/pages/evaluations/evaluationScenarios/EvaluationScenarios" + +const EvaluationDetail = () => { + return +} + +export default EvaluationDetail diff --git a/agenta-web/src/pages/apps/[app_id]/evaluations-new/index.tsx b/agenta-web/src/pages/apps/[app_id]/evaluations-new/index.tsx index d0fcc7cefa..e84d84462c 100644 --- a/agenta-web/src/pages/apps/[app_id]/evaluations-new/index.tsx +++ b/agenta-web/src/pages/apps/[app_id]/evaluations-new/index.tsx @@ -13,6 +13,7 @@ const Evaluations: React.FC = () => { return (
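The new baseURL on the shared axios instance (axiosConfig.ts above) is what lets the evaluation services added in this commit call relative paths such as "/api/evaluators/". getAgentaApiUrl is an existing helper whose body is not in this patch; the sketch below is only an assumption of its behavior, with the environment variable name being a guess.

    // Hypothetical sketch; the real helper is exported from "@/lib/helpers/utils".
    export const getAgentaApiUrl = (): string => {
        // Prefer an explicitly configured backend host, otherwise fall back to same-origin paths
        return process.env.NEXT_PUBLIC_AGENTA_API_URL || ""
    }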
{ + const tagColors = getTagColors() + + await delay(1000) + return Mock.evaluators + + const response = await axios.get(`/api/evaluators/`) + return (response.data || []).map((item: Evaluator) => ({ + ...item, + color: tagColors[stringToNumberInRange(item.key, 0, tagColors.length - 1)], + })) as Evaluator[] +} + +// Evaluator Configs +export const fetchAllEvaluatorConfigs = async (appId: string) => { + await delay(1000) + return Mock.evaluatorConfigs + + const response = await axios.get(`/api/evaluators/configs`) + return response.data as EvaluatorConfig[] +} + +export const deleteEvaluatorConfig = async (appId: string, configId: string) => { + return axios.delete(`/api/evaluators/configs/${configId}`) +} + +export type CreateEvaluationConfigData = Omit +export const createEvaluatorConfig = async (appId: string, config: CreateEvaluationConfigData) => { + await delay(1000) + return console.log("create evaluation config", config) + return axios.post(`/api/evaluators/configs`, {...config, app_id: appId}) +} + +// Evaluations +export const fetchAllEvaluations = async (appId: string) => { + await delay(1000) + return Mock.evaluations + + const response = await axios.get(`/api/evaluations`, {params: {app_id: appId}}) + return response.data as _Evaluation[] +} + +export const fetchEvaluation = async (appId: string, evaluationId: string) => { + await delay(1000) + return Mock.evaluations[0] + + const response = await axios.get(`/api/evaluations/${evaluationId}`, { + params: {app_id: appId}, + }) + return response.data as _Evaluation +} + +export const fetchEvaluationStatus = async (appId: string, evaluationId: string) => { + await delay(1000) + return {status: pickRandom(Object.values(EvaluationStatus), 1)[0]} + + const response = await axios.get(`/api/evaluations/${evaluationId}/status`, { + params: {app_id: appId}, + }) + return response.data as {status: EvaluationStatus} +} + +export type CreateEvaluationData = { + testset: string[] + variants: string[] + evaluator_configs: string[] +} +export const createEvalutaiton = async (appId: string, evaluation: CreateEvaluationData) => { + await delay(1000) + return console.log("create evaluation", evaluation) + return axios.post(`/api/evaluations`, {...evaluation, app_id: appId}) +} + +// Evaluation Scenarios +export const fetchAllEvaluationScenarios = async (appId: string, evaluationId: string) => { + await delay(1000) + return Mock.evaluationScenarios + + const response = await axios.get(`/api/evaluations/${evaluationId}/evaluation_scenarios`, { + params: {app_id: appId}, + }) + return response.data as _EvaluationScenario[] +} From e4373cab6c6759902b5f227db79cc83ec83a0ec8 Mon Sep 17 00:00:00 2001 From: Akrem Abayed Date: Tue, 26 Dec 2023 11:16:55 +0100 Subject: [PATCH 105/414] removed icon --- .../agenta_backend/resources/evaluators/evaluators.json | 8 -------- 1 file changed, 8 deletions(-) diff --git a/agenta-backend/agenta_backend/resources/evaluators/evaluators.json b/agenta-backend/agenta_backend/resources/evaluators/evaluators.json index 673528ec46..45a582286d 100644 --- a/agenta-backend/agenta_backend/resources/evaluators/evaluators.json +++ b/agenta-backend/agenta_backend/resources/evaluators/evaluators.json @@ -2,13 +2,11 @@ { "name": "Exact Match", "key": "auto_exact_match", - "icon": "", "settings_template": {} }, { "name": "Similarity Match", "key": "auto_similarity_match", - "icon": "", "settings_template": { "threshold": 0.5 } @@ -16,7 +14,6 @@ { "name": "Regex Test", "key": "auto_regex_test", - "icon": "", "settings_template": { 
"regex_pattern": { "type": "regex", @@ -28,7 +25,6 @@ { "name": "AI Critique", "key": "auto_ai_critique", - "icon": "", "settings_template": { "prompt_template": { "type": "text", @@ -40,25 +36,21 @@ { "name": "Code Evaluation", "key": "auto_custom_code_run", - "icon": "", "settings_template": {} }, { "name": "Webhook test", "key": "auto_webhook_test", - "icon": "", "settings_template": {} }, { "name": "A/B Test", "key": "human_a_b_testing", - "icon": "", "settings_template": {} }, { "name": "Single Model Test", "key": "human_single_model_test", - "icon": "", "settings_template": {} } ] From df07043f5933a41a483e16f4dbfac20c70c8e89a Mon Sep 17 00:00:00 2001 From: Akrem Abayed Date: Tue, 26 Dec 2023 11:28:24 +0100 Subject: [PATCH 106/414] remove human evaluations check for now --- .../agenta_backend/routers/evaluation_router.py | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/agenta-backend/agenta_backend/routers/evaluation_router.py b/agenta-backend/agenta_backend/routers/evaluation_router.py index 5cb4b3af4e..eb1d09ff77 100644 --- a/agenta-backend/agenta_backend/routers/evaluation_router.py +++ b/agenta-backend/agenta_backend/routers/evaluation_router.py @@ -90,17 +90,11 @@ async def create_evaluation( new_evaluation_data=new_evaluation_data, evaluators_configs=payload.evaluators_configs, ) - if ( - payload.evaluators_configs.len == 1 - and payload.evaluators_configs.evaluator_key - in ["human_a_b_testing", "human_single_model_test"] - ): - return evaluation - else: - evaluate.delay( - app_data, new_evaluation_data, evaluation.id, evaluation.testset_id - ) - return evaluation + + evaluate.delay( + app_data, new_evaluation_data, evaluation.id, evaluation.testset_id + ) + return evaluation except KeyError: raise HTTPException( status_code=400, From 5ab2b0f7d3dc6a87ee729f439e9eb00968f9ccb6 Mon Sep 17 00:00:00 2001 From: Akrem Abayed Date: Tue, 26 Dec 2023 11:48:26 +0100 Subject: [PATCH 107/414] fix return response for evaluator config --- .../agenta_backend/models/api/evaluation_model.py | 2 +- agenta-backend/agenta_backend/models/converters.py | 11 +++++++++++ .../agenta_backend/routers/evaluators_router.py | 8 ++------ .../agenta_backend/services/evaluator_manager.py | 4 +++- 4 files changed, 17 insertions(+), 8 deletions(-) diff --git a/agenta-backend/agenta_backend/models/api/evaluation_model.py b/agenta-backend/agenta_backend/models/api/evaluation_model.py index 7befadb178..53e786cdf3 100644 --- a/agenta-backend/agenta_backend/models/api/evaluation_model.py +++ b/agenta-backend/agenta_backend/models/api/evaluation_model.py @@ -161,6 +161,7 @@ class EvaluationSettingsTemplate(BaseModel): class EvaluatorConfig(BaseModel): id: str + name: str evaluator_key: str settings_values: Optional[Dict[str, Any]] @@ -175,7 +176,6 @@ class NewEvaluation(BaseModel): class Evaluator(BaseModel): name: str key: str - icon: str settings_template: dict diff --git a/agenta-backend/agenta_backend/models/converters.py b/agenta-backend/agenta_backend/models/converters.py index b567321f57..30e52c6fea 100644 --- a/agenta-backend/agenta_backend/models/converters.py +++ b/agenta-backend/agenta_backend/models/converters.py @@ -5,6 +5,7 @@ from agenta_backend.models.api.user_models import User from agenta_backend.models.db_models import ( AppVariantDB, + EvaluatorConfigDB, ImageDB, TemplateDB, AppDB, @@ -39,6 +40,7 @@ SimpleEvaluationOutput, EvaluationScenario, Evaluation, + EvaluatorConfig, ) import logging @@ -294,3 +296,12 @@ def user_db_to_pydantic(user_db: UserDB) -> User: 
username=user_db.username, email=user_db.email, ).dict(exclude_unset=True) + + +def evaluator_config_db_to_pydantic(evaluator_config: EvaluatorConfigDB): + return EvaluatorConfig( + id=str(evaluator_config.id), + name=evaluator_config.name, + evaluator_key=evaluator_config.evaluator_key, + settings_values=evaluator_config.settings_values, + ) diff --git a/agenta-backend/agenta_backend/routers/evaluators_router.py b/agenta-backend/agenta_backend/routers/evaluators_router.py index e8537bf70c..34a0d77bcd 100644 --- a/agenta-backend/agenta_backend/routers/evaluators_router.py +++ b/agenta-backend/agenta_backend/routers/evaluators_router.py @@ -90,17 +90,13 @@ async def create_new_evaluator_config( EvaluatorConfigDB: Evaluator configuration api model. """ - config_db = await evaluator_manager.create_evaluator_config( + evaluator_config = await evaluator_manager.create_evaluator_config( app_id=payload.app_id, name=payload.name, evaluator_key=payload.evaluator_key, settings_values=payload.settings_values, ) - return EvaluatorConfig( - id=str(config_db.id), - evaluator_key=config_db.evaluator_key, - settings_values=config_db.settings_values, - ) + return evaluator_config @router.delete("/configs/{evaluator_id}/", response_model=bool) diff --git a/agenta-backend/agenta_backend/services/evaluator_manager.py b/agenta-backend/agenta_backend/services/evaluator_manager.py index 62538f78db..7ac111815f 100644 --- a/agenta-backend/agenta_backend/services/evaluator_manager.py +++ b/agenta-backend/agenta_backend/services/evaluator_manager.py @@ -4,6 +4,7 @@ from agenta_backend.models.db_models import EvaluatorConfigDB +from agenta_backend.models.converters import evaluator_config_db_to_pydantic async def get_evaluators_configs(app_id: str): @@ -49,7 +50,7 @@ async def create_evaluator_config( EvaluatorConfigDB: The newly created evaluator configuration object. """ app = await db_manager.fetch_app_by_id(app_id) - return await db_manager.create_evaluator_config( + evaluator_config = await db_manager.create_evaluator_config( app=app, organization=app.organization, user=app.user, @@ -57,6 +58,7 @@ async def create_evaluator_config( evaluator_key=evaluator_key, settings_values=settings_values, ) + return evaluator_config_db_to_pydantic(evaluator_config=evaluator_config) async def update_evaluator_config( From ce7c3a6e9fe47a4d7696003ab902badda8b4cc70 Mon Sep 17 00:00:00 2001 From: Akrem Abayed Date: Tue, 26 Dec 2023 13:15:38 +0100 Subject: [PATCH 108/414] fixed get evaluator config and get evaluators configs --- .../routers/evaluators_router.py | 30 +++++++++++-------- .../services/evaluator_manager.py | 28 +++++++++-------- 2 files changed, 33 insertions(+), 25 deletions(-) diff --git a/agenta-backend/agenta_backend/routers/evaluators_router.py b/agenta-backend/agenta_backend/routers/evaluators_router.py index 34a0d77bcd..f01ec6c9f8 100644 --- a/agenta-backend/agenta_backend/routers/evaluators_router.py +++ b/agenta-backend/agenta_backend/routers/evaluators_router.py @@ -66,15 +66,19 @@ async def get_evaluator_configs(app_id: str = Query()): List[EvaluatorConfigDB]: A list of evaluator configuration objects. 
""" - configs_db = await evaluator_manager.get_evaluators_configs(app_id) - return [ - EvaluatorConfig( - id=str(config_db.id), - evaluator_key=config_db.evaluator_key, - settings_values=config_db.settings_values, - ) - for config_db in configs_db - ] + evaluators_configs = await evaluator_manager.get_evaluators_configs(app_id) + return evaluators_configs + +@router.get("/configs/{evaluator_config_id}/", response_model=EvaluatorConfig) +async def get_evaluator_config(evaluator_config_id: str): + """Endpoint to fetch evaluator configurations for a specific app. + + Returns: + List[EvaluatorConfigDB]: A list of evaluator configuration objects. + """ + + evaluators_configs = await evaluator_manager.get_evaluator_config(evaluator_config_id) + return evaluators_configs @router.post("/configs/", response_model=EvaluatorConfig) @@ -99,18 +103,18 @@ async def create_new_evaluator_config( return evaluator_config -@router.delete("/configs/{evaluator_id}/", response_model=bool) -async def delete_evaluator_config(evaluator_id: str): +@router.delete("/configs/{evaluator_config_id}/", response_model=bool) +async def delete_evaluator_config(evaluator_config_id: str): """Endpoint to delete a specific evaluator configuration. Args: - evaluator_id (str): The unique identifier of the evaluator configuration. + evaluator_config_id (str): The unique identifier of the evaluator configuration. Returns: bool: True if deletion was successful, False otherwise. """ try: - success = await evaluator_manager.delete_evaluator_config(evaluator_id) + success = await evaluator_manager.delete_evaluator_config(evaluator_config_id) return success except Exception as e: raise HTTPException( diff --git a/agenta-backend/agenta_backend/services/evaluator_manager.py b/agenta-backend/agenta_backend/services/evaluator_manager.py index 7ac111815f..d84b194b4c 100644 --- a/agenta-backend/agenta_backend/services/evaluator_manager.py +++ b/agenta-backend/agenta_backend/services/evaluator_manager.py @@ -1,34 +1,38 @@ -from typing import Any, Dict, Optional +from typing import Any, Dict, Optional, List from agenta_backend.services import db_manager from agenta_backend.models.db_models import EvaluatorConfigDB +from agenta_backend.models.api.evaluation_model import EvaluatorConfig from agenta_backend.models.converters import evaluator_config_db_to_pydantic - -async def get_evaluators_configs(app_id: str): - """Get evaluators configs by app_id. +async def get_evaluators_configs(app_id: str) -> List[EvaluatorConfig]: + """ + Get evaluators configs by app_id. Args: app_id (str): The ID of the app. Returns: - List[EvaluatorConfigDB]: A list of evaluator configuration objects. + List[EvaluatorConfig]: A list of evaluator configuration objects. """ - return await db_manager.fetch_evaluators_configs(app_id) + evaluator_configs_db = await db_manager.fetch_evaluators_configs(app_id) + return [evaluator_config_db_to_pydantic(evaluator_config_db) for evaluator_config_db in evaluator_configs_db] -async def get_evaluator_config(config_id: str): - """Get evaluators configs by app_id. +async def get_evaluator_config(evaluator_config_id: str) -> EvaluatorConfig: + """ + Get an evaluator configuration by its ID. Args: - config_id (str): The ID of the evaluator configuration. + evaluator_config_id (str): The ID of the evaluator configuration. Returns: - EvaluatorConfigDB: the evaluator configuration object. + EvaluatorConfig: The evaluator configuration object. 
""" - return await db_manager.fetch_evaluator_config(config_id) + evaluator_config_db = await db_manager.fetch_evaluator_config(evaluator_config_id) + return evaluator_config_db_to_pydantic(evaluator_config_db) async def create_evaluator_config( @@ -36,7 +40,7 @@ async def create_evaluator_config( name: str, evaluator_key: str, settings_values: Optional[Dict[str, Any]] = None, -) -> EvaluatorConfigDB: +) -> EvaluatorConfig: """ Create a new evaluator configuration for an app. From c97d6bfb927318d0697424f3ffe080db66d48ff8 Mon Sep 17 00:00:00 2001 From: Akrem Abayed Date: Tue, 26 Dec 2023 13:26:49 +0100 Subject: [PATCH 109/414] update default values for auto_custom_code_run and auto_webhook_test --- .../resources/evaluators/evaluators.json | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/agenta-backend/agenta_backend/resources/evaluators/evaluators.json b/agenta-backend/agenta_backend/resources/evaluators/evaluators.json index 45a582286d..d4adff5d5d 100644 --- a/agenta-backend/agenta_backend/resources/evaluators/evaluators.json +++ b/agenta-backend/agenta_backend/resources/evaluators/evaluators.json @@ -36,12 +36,24 @@ { "name": "Code Evaluation", "key": "auto_custom_code_run", - "settings_template": {} + "settings_template": { + "code": { + "type": "code", + "default": "from typing import Dict\n\ndef evaluate(\n app_params: Dict[str, str],\n inputs: Dict[str, str],\n output: str,\n correct_answer: str\n) -> float:\n # ...\n return 0.75 # Replace with your calculated score", + "description": "url" + } + } }, { "name": "Webhook test", "key": "auto_webhook_test", - "settings_template": {} + "settings_template": { + "webhook_url": { + "type": "string", + "default": "https://cloud.agenta.ai/api/evaluations/webhook_example_fake", + "description": "url" + } + } }, { "name": "A/B Test", From 615b40a3b95aaffe01a6c8b368bda64e7f612d04 Mon Sep 17 00:00:00 2001 From: Akrem Abayed Date: Tue, 26 Dec 2023 13:29:13 +0100 Subject: [PATCH 110/414] fix similarity threshold --- .../agenta_backend/resources/evaluators/evaluators.json | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/agenta-backend/agenta_backend/resources/evaluators/evaluators.json b/agenta-backend/agenta_backend/resources/evaluators/evaluators.json index d4adff5d5d..5deeaf8bf4 100644 --- a/agenta-backend/agenta_backend/resources/evaluators/evaluators.json +++ b/agenta-backend/agenta_backend/resources/evaluators/evaluators.json @@ -8,7 +8,11 @@ "name": "Similarity Match", "key": "auto_similarity_match", "settings_template": { - "threshold": 0.5 + "similarity_threshold": { + "type": "number", + "default": 0.5, + "description": "" + } } }, { From 28a6d75348f175e8fe5eaa86bb066dc361739439 Mon Sep 17 00:00:00 2001 From: Akrem Abayed Date: Tue, 26 Dec 2023 13:39:08 +0100 Subject: [PATCH 111/414] add label and fix description --- .../resources/evaluators/evaluators.json | 34 +++++++++++++------ 1 file changed, 24 insertions(+), 10 deletions(-) diff --git a/agenta-backend/agenta_backend/resources/evaluators/evaluators.json b/agenta-backend/agenta_backend/resources/evaluators/evaluators.json index 5deeaf8bf4..52218c0f25 100644 --- a/agenta-backend/agenta_backend/resources/evaluators/evaluators.json +++ b/agenta-backend/agenta_backend/resources/evaluators/evaluators.json @@ -2,16 +2,20 @@ { "name": "Exact Match", "key": "auto_exact_match", - "settings_template": {} + "settings_template": { + "label": "Exact Match Settings", + "description": "Settings for the Exact Match evaluator" + } }, { "name": 
"Similarity Match", "key": "auto_similarity_match", "settings_template": { "similarity_threshold": { + "label": "Similarity Threshold", "type": "number", "default": 0.5, - "description": "" + "description": "The threshold value for similarity comparison" } } }, @@ -20,9 +24,10 @@ "key": "auto_regex_test", "settings_template": { "regex_pattern": { + "label": "Regex Pattern", "type": "regex", "default": "", - "description": "Pattern (ex: ^this_word\\d{3}$)" + "description": "Pattern for regex testing (ex: ^this_word\\d{3}$)" } } }, @@ -31,9 +36,10 @@ "key": "auto_ai_critique", "settings_template": { "prompt_template": { + "label": "Prompt Template", "type": "text", - "default": "We have an LLM App that we want to evaluate its outputs....", - "description": "" + "default": "We have an LLM App that we want to evaluate its outputs. Based on the prompt and the parameters provided below evaluate the output based on the evaluation strategy below: Evaluation strategy: 0 to 10 0 is very bad and 10 is very good. Prompt: {llm_app_prompt_template} Inputs: country: {country} Correct Answer:{correct_answer} Evaluate this: {variant_output} Answer ONLY with one of the given grading or evaluation options.", + "description": "Template for AI critique prompts" } } }, @@ -42,9 +48,10 @@ "key": "auto_custom_code_run", "settings_template": { "code": { + "label": "Evaluation Code", "type": "code", "default": "from typing import Dict\n\ndef evaluate(\n app_params: Dict[str, str],\n inputs: Dict[str, str],\n output: str,\n correct_answer: str\n) -> float:\n # ...\n return 0.75 # Replace with your calculated score", - "description": "url" + "description": "Code for evaluating submissions" } } }, @@ -53,20 +60,27 @@ "key": "auto_webhook_test", "settings_template": { "webhook_url": { + "label": "Webhook URL", "type": "string", "default": "https://cloud.agenta.ai/api/evaluations/webhook_example_fake", - "description": "url" + "description": "URL for the webhook test" } } }, { "name": "A/B Test", "key": "human_a_b_testing", - "settings_template": {} + "settings_template": { + "label": "A/B Testing Settings", + "description": "Settings for A/B testing configurations" + } }, { "name": "Single Model Test", "key": "human_single_model_test", - "settings_template": {} + "settings_template": { + "label": "Single Model Testing Settings", + "description": "Settings for single model testing configurations" + } } -] +] \ No newline at end of file From 909eaec41f3408433787d60c67291ddc1f8779f9 Mon Sep 17 00:00:00 2001 From: MohammedMaaz Date: Tue, 26 Dec 2023 23:23:33 +0500 Subject: [PATCH 112/414] BE integration - step 1 --- .../evaluationResults/EvaluationResults.tsx | 47 ++++++++++++- .../evaluationResults/NewEvaluationModal.tsx | 4 +- .../evaluations/evaluationResults/mock.ts | 22 ++++--- .../evaluations/evaluators/EvaluatorCard.tsx | 24 ++++++- .../evaluations/evaluators/Evaluators.tsx | 22 +++++-- .../evaluators/NewEvaluatorModal.tsx | 15 ++++- agenta-web/src/lib/helpers/utils.ts | 25 +++++-- agenta-web/src/lib/services/api.ts | 32 ++++----- agenta-web/src/services/evaluations/index.ts | 66 +++++++++++-------- 9 files changed, 182 insertions(+), 75 deletions(-) diff --git a/agenta-web/src/components/pages/evaluations/evaluationResults/EvaluationResults.tsx b/agenta-web/src/components/pages/evaluations/evaluationResults/EvaluationResults.tsx index 57a55185a8..f7892b25d8 100644 --- a/agenta-web/src/components/pages/evaluations/evaluationResults/EvaluationResults.tsx +++ 
b/agenta-web/src/components/pages/evaluations/evaluationResults/EvaluationResults.tsx @@ -1,4 +1,4 @@ -import React, {useEffect, useMemo, useState} from "react" +import React, {useEffect, useMemo, useRef, useState} from "react" import {AgGridReact} from "ag-grid-react" import {useAppTheme} from "@/components/Layout/ThemeContextProvider" import {ColDef, ICellRendererParams} from "ag-grid-community" @@ -12,8 +12,10 @@ import relativeTime from "dayjs/plugin/relativeTime" import duration from "dayjs/plugin/duration" import NewEvaluationModal from "./NewEvaluationModal" import {useAppId} from "@/hooks/useAppId" -import {fetchAllEvaluations} from "@/services/evaluations" +import {fetchAllEvaluations, fetchEvaluationStatus} from "@/services/evaluations" import {useRouter} from "next/router" +import {useUpdateEffect} from "usehooks-ts" +import {shortPoll} from "@/lib/helpers/utils" dayjs.extend(relativeTime) dayjs.extend(duration) @@ -79,8 +81,19 @@ const EvaluationResults: React.FC = () => { const [evaluations, setEvaluations] = useState<_Evaluation[]>([]) const [newEvalModalOpen, setNewEvalModalOpen] = useState(false) const [fetching, setFetching] = useState(false) + const stoppers = useRef() const {token} = theme.useToken() + const runningEvaluationIds = useMemo( + () => + evaluations + .filter((item) => + [EvaluationStatus.INITIALIZED, EvaluationStatus.STARTED].includes(item.status), + ) + .map((item) => item.id), + [evaluations], + ) + const fetcher = () => { setFetching(true) fetchAllEvaluations(appId) @@ -93,6 +106,36 @@ const EvaluationResults: React.FC = () => { fetcher() }, [appId]) + //update status of running evaluations through short polling + useUpdateEffect(() => { + stoppers.current?.() + + if (runningEvaluationIds.length) { + stoppers.current = shortPoll( + () => + Promise.all(runningEvaluationIds.map((id) => fetchEvaluationStatus(appId, id))) + .then((res) => { + setEvaluations((prev) => { + const newEvals = [...prev] + runningEvaluationIds.forEach((id, ix) => { + const index = newEvals.findIndex((e) => e.id === id) + if (index !== -1) { + newEvals[index].status = res[ix].status + } + }) + return newEvals + }) + }) + .catch(console.error), + {delayMs: 2000, timeoutMs: Infinity}, + ).stopper + } + + return () => { + stoppers.current?.() + } + }, [JSON.stringify(runningEvaluationIds)]) + const evaluatorConfigs = useMemo( () => uniqBy( diff --git a/agenta-web/src/components/pages/evaluations/evaluationResults/NewEvaluationModal.tsx b/agenta-web/src/components/pages/evaluations/evaluationResults/NewEvaluationModal.tsx index 9cbeea823f..7aa842d5fd 100644 --- a/agenta-web/src/components/pages/evaluations/evaluationResults/NewEvaluationModal.tsx +++ b/agenta-web/src/components/pages/evaluations/evaluationResults/NewEvaluationModal.tsx @@ -106,7 +106,7 @@ const NewEvaluationModal: React.FC = ({onSuccess, ...props}) => { layout="vertical" > @@ -119,7 +119,7 @@ const NewEvaluationModal: React.FC = ({onSuccess, ...props}) => { diff --git a/agenta-web/src/components/pages/evaluations/evaluationResults/mock.ts b/agenta-web/src/components/pages/evaluations/evaluationResults/mock.ts index 69942606b5..d3aa0206c4 100644 --- a/agenta-web/src/components/pages/evaluations/evaluationResults/mock.ts +++ b/agenta-web/src/components/pages/evaluations/evaluationResults/mock.ts @@ -19,6 +19,15 @@ import codeImg from "@/media/browser.png" import {pickRandom, stringToNumberInRange} from "@/lib/helpers/utils" import {getTagColors} from "@/lib/helpers/colors" +const evaluatorIconsMap = { + auto_exact_match: 
exactMatchImg, + similarity: similarityImg, + auto_regex_test: regexImg, + auto_webhook_test: webhookImg, + auto_ai_critique: aiImg, + custom_code_run: codeImg, +} + const organizations: Org[] = [ { id: "org1", @@ -112,15 +121,13 @@ const evaluators: Evaluator[] = [ name: "Exact Match", key: "auto_exact_match", settings_template: {}, - icon_url: exactMatchImg, }, { name: "Similarity", - key: "similarity", + key: "auto_similarity_match", settings_template: { similarity_threshold: evaluatorSettinsTemplates[0], }, - icon_url: similarityImg, }, { name: "Regex Test", @@ -129,7 +136,6 @@ const evaluators: Evaluator[] = [ regex_pattern: evaluatorSettinsTemplates[4], regex_should_match: evaluatorSettinsTemplates[3], }, - icon_url: regexImg, }, { name: "AI Critique", @@ -137,7 +143,6 @@ const evaluators: Evaluator[] = [ settings_template: { llm_app_prompt_template: evaluatorSettinsTemplates[1], }, - icon_url: aiImg, }, { name: "Code Evaluation", @@ -145,7 +150,6 @@ const evaluators: Evaluator[] = [ settings_template: { custom_code_evaluation_id: evaluatorSettinsTemplates[2], }, - icon_url: codeImg, }, { name: "Webhook Test", @@ -153,10 +157,10 @@ const evaluators: Evaluator[] = [ settings_template: { webhook_url: evaluatorSettinsTemplates[5], }, - icon_url: webhookImg, }, ].map((item) => ({ ...(item as Evaluator), + icon_url: evaluatorIconsMap[item.key as keyof typeof evaluatorIconsMap], color: getTagColors()[stringToNumberInRange(item.key, 0, getTagColors().length - 1)], })) @@ -208,7 +212,7 @@ const evaluations: _Evaluation[] = [ duration: 120000, }, { - id: "evaluation2", + id: "evaluation3", organization: organizations[0], user: users[0], testset: testsets[0], @@ -227,7 +231,7 @@ const evaluations: _Evaluation[] = [ duration: 120000, }, { - id: "evaluation2", + id: "evaluation4", organization: organizations[0], user: users[0], testset: testsets[0], diff --git a/agenta-web/src/components/pages/evaluations/evaluators/EvaluatorCard.tsx b/agenta-web/src/components/pages/evaluations/evaluators/EvaluatorCard.tsx index 97cc664ce0..024dff6585 100644 --- a/agenta-web/src/components/pages/evaluations/evaluators/EvaluatorCard.tsx +++ b/agenta-web/src/components/pages/evaluations/evaluators/EvaluatorCard.tsx @@ -6,6 +6,8 @@ import {createUseStyles} from "react-jss" import Mock from "../evaluationResults/mock" import dayjs from "dayjs" import Image from "next/image" +import AlertPopup from "@/components/AlertPopup/AlertPopup" +import {deleteEvaluatorConfig} from "@/services/evaluations" const useStyles = createUseStyles((theme: JSSTheme) => ({ body: { @@ -34,14 +36,32 @@ const useStyles = createUseStyles((theme: JSSTheme) => ({ interface Props { evaluatorConfig: EvaluatorConfig + onEdit?: () => void + onSuccessDelete?: () => void } -const EvaluatorCard: React.FC = ({evaluatorConfig}) => { +const EvaluatorCard: React.FC = ({evaluatorConfig, onEdit, onSuccessDelete}) => { const classes = useStyles() const evaluator = Mock.evaluators.find((item) => item.key === evaluatorConfig.evaluator_key)! + const onDelete = () => { + AlertPopup({ + title: "Delete evaluator", + message: "Are you sure you want to delete this evaluator?", + onOk: () => + deleteEvaluatorConfig(evaluatorConfig.id) + .then(onSuccessDelete) + .catch(console.error), + }) + } + return ( - , ]}> + , + , + ]} + >
diff --git a/agenta-web/src/components/pages/evaluations/evaluators/Evaluators.tsx b/agenta-web/src/components/pages/evaluations/evaluators/Evaluators.tsx index e9d414a3f0..00829fab4c 100644 --- a/agenta-web/src/components/pages/evaluations/evaluators/Evaluators.tsx +++ b/agenta-web/src/components/pages/evaluations/evaluators/Evaluators.tsx @@ -19,7 +19,7 @@ const useStyles = createUseStyles({ }, grid: { display: "grid", - gridTemplateColumns: "repeat(auto-fit, minmax(240px, 1fr))", + gridTemplateColumns: "repeat(auto-fit, minmax(200px, 320px))", gap: "1rem", }, }) @@ -31,6 +31,7 @@ const Evaluators: React.FC = () => { const appId = useAppId() const [evaluatorConfigs, setEvaluatorConfigs] = useState([]) const [newEvalModalOpen, setNewEvalModalOpen] = useState(false) + const [editIndex, setEditIndex] = useState(-1) const [fetching, setFetching] = useState(false) const fetcher = () => { @@ -51,15 +52,26 @@ const Evaluators: React.FC = () => {
- {evaluatorConfigs.map((item) => ( - + {evaluatorConfigs.map((item, ix) => ( + { + setEditIndex(ix) + setNewEvalModalOpen(true) + }} + onSuccessDelete={fetcher} + /> ))}
@@ -71,6 +83,8 @@ const Evaluators: React.FC = () => { setNewEvalModalOpen(false) fetcher() }} + editMode={editIndex !== -1} + initialValues={evaluatorConfigs[editIndex]} />
) diff --git a/agenta-web/src/components/pages/evaluations/evaluators/NewEvaluatorModal.tsx b/agenta-web/src/components/pages/evaluations/evaluators/NewEvaluatorModal.tsx index 7ca71a4a1d..ef1b3b2af8 100644 --- a/agenta-web/src/components/pages/evaluations/evaluators/NewEvaluatorModal.tsx +++ b/agenta-web/src/components/pages/evaluations/evaluators/NewEvaluatorModal.tsx @@ -96,9 +96,16 @@ const DynamicFormField: React.FC = ({ type Props = { onSuccess?: () => void + initialValues?: CreateEvaluationConfigData + editMode?: boolean } & React.ComponentProps -const NewEvaluatorModal: React.FC = ({onSuccess, ...props}) => { +const NewEvaluatorModal: React.FC = ({ + onSuccess, + editMode = false, + initialValues, + ...props +}) => { const classes = useStyles() const [fetching, setFetching] = useState(false) const [evaluators, setEvaluators] = useState([]) @@ -118,7 +125,9 @@ const NewEvaluatorModal: React.FC = ({onSuccess, ...props}) => { useEffect(() => { setFetching(true) - setSelectedEval(null) + setSelectedEval( + evaluators.find((item) => item.key === initialValues?.evaluator_key) || null, + ) form.resetFields() fetchAllEvaluators() .then(setEvaluators) @@ -144,6 +153,7 @@ const NewEvaluatorModal: React.FC = ({onSuccess, ...props}) => { >
= ({onSuccess, ...props}) => { rules={[{required: true, message: "This field is required"}]} > setSelectedEval( evaluators.find((item) => item.key === e.target.value) || null, diff --git a/agenta-web/src/lib/helpers/utils.ts b/agenta-web/src/lib/helpers/utils.ts index 3e11530cf1..65f26eb76f 100644 --- a/agenta-web/src/lib/helpers/utils.ts +++ b/agenta-web/src/lib/helpers/utils.ts @@ -300,17 +300,30 @@ export async function batchExecute( return results } -export const shortPoll = async ( +export const shortPoll = ( func: Function, {delayMs, timeoutMs = 2000}: {delayMs: number; timeoutMs?: number}, ) => { let startTime = Date.now() let shouldContinue = true - while (shouldContinue && Date.now() - startTime < timeoutMs) { - try { - shouldContinue = await func() - } catch {} - await delay(delayMs) + + const executor = async () => { + while (shouldContinue && Date.now() - startTime < timeoutMs) { + try { + await func() + } catch {} + await delay(delayMs) + } + if (Date.now() - startTime >= timeoutMs) throw new Error("timeout") + } + + const promise = executor() + + return { + stopper: () => { + shouldContinue = false + }, + promise, } } diff --git a/agenta-web/src/lib/services/api.ts b/agenta-web/src/lib/services/api.ts index b1eb5471f1..bc0af33976 100644 --- a/agenta-web/src/lib/services/api.ts +++ b/agenta-web/src/lib/services/api.ts @@ -24,7 +24,7 @@ import { fromEvaluationScenarioResponseToEvaluationScenario, } from "../transformers" import {EvaluationFlow, EvaluationType} from "../enums" -import {delay, getAgentaApiUrl, removeKeys} from "../helpers/utils" +import {delay, getAgentaApiUrl, removeKeys, shortPoll} from "../helpers/utils" import {useProfileData} from "@/contexts/profile.context" /** * Raw interface for the parameters parsed from the openapi.json @@ -567,25 +567,17 @@ export const waitForAppToStart = async ({ }) => { const _variant = variant || (await fetchVariants(appId, true))[0] if (_variant) { - const shortPoll = async () => { - let started = false - while (!started) { - try { - await getVariantParametersFromOpenAPI( - appId, - _variant.variantId, - _variant.baseId, - true, - ) - started = true - } catch {} - await delay(interval) - } - } - await Promise.race([ - shortPoll(), - new Promise((_, rej) => setTimeout(() => rej(new Error("timeout")), timeout)), - ]) + const {stopper, promise} = shortPoll( + () => + getVariantParametersFromOpenAPI( + appId, + _variant.variantId, + _variant.baseId, + true, + ).then(() => stopper()), + {delayMs: interval, timeoutMs: timeout}, + ) + await promise } } diff --git a/agenta-web/src/services/evaluations/index.ts b/agenta-web/src/services/evaluations/index.ts index f5e0b4505e..b1141333a4 100644 --- a/agenta-web/src/services/evaluations/index.ts +++ b/agenta-web/src/services/evaluations/index.ts @@ -9,6 +9,12 @@ import { } from "@/lib/Types" import {getTagColors} from "@/lib/helpers/colors" import {delay, pickRandom, stringToNumberInRange} from "@/lib/helpers/utils" +import exactMatchImg from "@/media/target.png" +import similarityImg from "@/media/transparency.png" +import regexImg from "@/media/programming.png" +import webhookImg from "@/media/link.png" +import aiImg from "@/media/artificial-intelligence.png" +import codeImg from "@/media/browser.png" //Prefix convention: // - fetch: GET single entity from server @@ -17,38 +23,44 @@ import {delay, pickRandom, stringToNumberInRange} from "@/lib/helpers/utils" // - update: PUT data to server // - delete: DELETE data from server +const evaluatorIconsMap = { + auto_exact_match: exactMatchImg, 
+ auto_similarity_match: similarityImg, + auto_regex_test: regexImg, + auto_webhook_test: webhookImg, + auto_ai_critique: aiImg, + auto_custom_code_run: codeImg, +} + //Evaluators export const fetchAllEvaluators = async () => { + // await delay(1000) + // return Mock.evaluators const tagColors = getTagColors() - await delay(1000) - return Mock.evaluators - const response = await axios.get(`/api/evaluators/`) - return (response.data || []).map((item: Evaluator) => ({ - ...item, - color: tagColors[stringToNumberInRange(item.key, 0, tagColors.length - 1)], - })) as Evaluator[] + return (response.data || []) + .filter((item: Evaluator) => !item.key.startsWith("human")) + .map((item: Evaluator) => ({ + ...item, + icon_url: evaluatorIconsMap[item.key as keyof typeof evaluatorIconsMap], + color: tagColors[stringToNumberInRange(item.key, 0, tagColors.length - 1)], + })) as Evaluator[] } // Evaluator Configs export const fetchAllEvaluatorConfigs = async (appId: string) => { - await delay(1000) - return Mock.evaluatorConfigs - - const response = await axios.get(`/api/evaluators/configs`) + const response = await axios.get(`/api/evaluators/configs/`, {params: {app_id: appId}}) return response.data as EvaluatorConfig[] } -export const deleteEvaluatorConfig = async (appId: string, configId: string) => { - return axios.delete(`/api/evaluators/configs/${configId}`) -} - export type CreateEvaluationConfigData = Omit export const createEvaluatorConfig = async (appId: string, config: CreateEvaluationConfigData) => { - await delay(1000) - return console.log("create evaluation config", config) - return axios.post(`/api/evaluators/configs`, {...config, app_id: appId}) + return axios.post(`/api/evaluators/configs/`, {...config, app_id: appId}) +} + +export const deleteEvaluatorConfig = async (configId: string) => { + return axios.delete(`/api/evaluators/configs/${configId}`) } // Evaluations @@ -56,7 +68,7 @@ export const fetchAllEvaluations = async (appId: string) => { await delay(1000) return Mock.evaluations - const response = await axios.get(`/api/evaluations`, {params: {app_id: appId}}) + const response = await axios.get(`/api/evaluations/`, {params: {app_id: appId}}) return response.data as _Evaluation[] } @@ -64,7 +76,7 @@ export const fetchEvaluation = async (appId: string, evaluationId: string) => { await delay(1000) return Mock.evaluations[0] - const response = await axios.get(`/api/evaluations/${evaluationId}`, { + const response = await axios.get(`/api/evaluations/${evaluationId}/`, { params: {app_id: appId}, }) return response.data as _Evaluation @@ -74,21 +86,19 @@ export const fetchEvaluationStatus = async (appId: string, evaluationId: string) await delay(1000) return {status: pickRandom(Object.values(EvaluationStatus), 1)[0]} - const response = await axios.get(`/api/evaluations/${evaluationId}/status`, { + const response = await axios.get(`/api/evaluations/${evaluationId}/status/`, { params: {app_id: appId}, }) return response.data as {status: EvaluationStatus} } export type CreateEvaluationData = { - testset: string[] - variants: string[] - evaluator_configs: string[] + testset_id: string + variant_ids: string[] + evaluators_configs: string[] } export const createEvalutaiton = async (appId: string, evaluation: CreateEvaluationData) => { - await delay(1000) - return console.log("create evaluation", evaluation) - return axios.post(`/api/evaluations`, {...evaluation, app_id: appId}) + return axios.post(`/api/evaluations/`, {...evaluation, app_id: appId}) } // Evaluation Scenarios @@ -96,7 +106,7 
@@ export const fetchAllEvaluationScenarios = async (appId: string, evaluationId: s await delay(1000) return Mock.evaluationScenarios - const response = await axios.get(`/api/evaluations/${evaluationId}/evaluation_scenarios`, { + const response = await axios.get(`/api/evaluations/${evaluationId}/evaluation_scenarios/`, { params: {app_id: appId}, }) return response.data as _EvaluationScenario[] From d33f1fb4cec25f72d1cf31f771f50e085f9e860b Mon Sep 17 00:00:00 2001 From: Abram Date: Wed, 27 Dec 2023 11:01:44 +0100 Subject: [PATCH 113/414] Update - change backend_host to host.docker.internal --- .../tests/observability_router/test_observability_router.py | 2 +- .../tests/organization_router/test_organization_router.py | 2 +- .../agenta_backend/tests/testset_router/test_testset_router.py | 2 +- .../tests/user_profile_router/test_user_profile.py | 2 +- .../tests/variants_router/test_app_variant_router.py | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/agenta-backend/agenta_backend/tests/observability_router/test_observability_router.py b/agenta-backend/agenta_backend/tests/observability_router/test_observability_router.py index 35dfb450fe..28b765749d 100644 --- a/agenta-backend/agenta_backend/tests/observability_router/test_observability_router.py +++ b/agenta-backend/agenta_backend/tests/observability_router/test_observability_router.py @@ -28,7 +28,7 @@ timeout = httpx.Timeout(timeout=5, read=None, write=5) # Set global variables -BACKEND_API_HOST = "http://localhost:8001" +BACKEND_API_HOST = "http://host.docker.internal/api" @pytest.mark.asyncio diff --git a/agenta-backend/agenta_backend/tests/organization_router/test_organization_router.py b/agenta-backend/agenta_backend/tests/organization_router/test_organization_router.py index db06d86840..0142d6acbe 100644 --- a/agenta-backend/agenta_backend/tests/organization_router/test_organization_router.py +++ b/agenta-backend/agenta_backend/tests/organization_router/test_organization_router.py @@ -16,7 +16,7 @@ timeout = httpx.Timeout(timeout=5, read=None, write=5) # Set global variables -BACKEND_API_HOST = "http://localhost:8001" +BACKEND_API_HOST = "http://host.docker.internal/api" @pytest.mark.asyncio diff --git a/agenta-backend/agenta_backend/tests/testset_router/test_testset_router.py b/agenta-backend/agenta_backend/tests/testset_router/test_testset_router.py index 894fc40f08..fec3275c00 100644 --- a/agenta-backend/agenta_backend/tests/testset_router/test_testset_router.py +++ b/agenta-backend/agenta_backend/tests/testset_router/test_testset_router.py @@ -18,7 +18,7 @@ timeout = httpx.Timeout(timeout=5, read=None, write=5) # Set global variables -BACKEND_API_HOST = "http://localhost:8001" +BACKEND_API_HOST = "http://host.docker.internal/api" TESTSET_SUBMODULE_DIR = Path(__file__).parent diff --git a/agenta-backend/agenta_backend/tests/user_profile_router/test_user_profile.py b/agenta-backend/agenta_backend/tests/user_profile_router/test_user_profile.py index 6d7912aaf3..8833560c6b 100644 --- a/agenta-backend/agenta_backend/tests/user_profile_router/test_user_profile.py +++ b/agenta-backend/agenta_backend/tests/user_profile_router/test_user_profile.py @@ -15,7 +15,7 @@ timeout = httpx.Timeout(timeout=5, read=None, write=5) # Set global variables -BACKEND_API_HOST = "http://localhost:8001" +BACKEND_API_HOST = "http://host.docker.internal/api" @pytest.mark.asyncio diff --git a/agenta-backend/agenta_backend/tests/variants_router/test_app_variant_router.py 
b/agenta-backend/agenta_backend/tests/variants_router/test_app_variant_router.py index 7ece91b624..64b4ae533b 100644 --- a/agenta-backend/agenta_backend/tests/variants_router/test_app_variant_router.py +++ b/agenta-backend/agenta_backend/tests/variants_router/test_app_variant_router.py @@ -29,7 +29,7 @@ logger.setLevel(logging.DEBUG) # Set global variables -BACKEND_API_HOST = "http://localhost:8001" +BACKEND_API_HOST = "http://host.docker.internal/api" @pytest.mark.asyncio From c112d5b266aac2cf71dcf45234b80b063e7244d4 Mon Sep 17 00:00:00 2001 From: Abram Date: Wed, 27 Dec 2023 11:02:45 +0100 Subject: [PATCH 114/414] Update - modified delete_evaluator_config db function --- agenta-backend/agenta_backend/services/db_manager.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/agenta-backend/agenta_backend/services/db_manager.py b/agenta-backend/agenta_backend/services/db_manager.py index 221b6bb57c..a95eeee304 100644 --- a/agenta-backend/agenta_backend/services/db_manager.py +++ b/agenta-backend/agenta_backend/services/db_manager.py @@ -1807,6 +1807,6 @@ async def delete_evaluator_config(evaluator_config_id: str) -> bool: delete_result = remove_document_using_driver( str(evaluator_config_id), "evaluators_configs" ) - return delete_result is not None + return delete_result is None # checking if delete_result is None (has been deleted) except Exception as e: raise e From 8f934cd5a4c603a85fb6f01c1b298b4cbd5af827 Mon Sep 17 00:00:00 2001 From: Akrem Abayed Date: Wed, 27 Dec 2023 12:06:57 +0100 Subject: [PATCH 115/414] add a setting for regex --- .../agenta_backend/resources/evaluators/evaluators.json | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/agenta-backend/agenta_backend/resources/evaluators/evaluators.json b/agenta-backend/agenta_backend/resources/evaluators/evaluators.json index 52218c0f25..82f1e2b42a 100644 --- a/agenta-backend/agenta_backend/resources/evaluators/evaluators.json +++ b/agenta-backend/agenta_backend/resources/evaluators/evaluators.json @@ -28,6 +28,12 @@ "type": "regex", "default": "", "description": "Pattern for regex testing (ex: ^this_word\\d{3}$)" + }, + "regex_should_match": { + "label": "Match/Mismatch", + "type": "boolean", + "default": true, + "description": "If the regex should match or mismatch" } } }, From 9968400e3095fdc1fb219f45c0d73db13c6de499 Mon Sep 17 00:00:00 2001 From: Abram Date: Wed, 27 Dec 2023 12:12:53 +0100 Subject: [PATCH 116/414] Feat - prepare fixtures for evaluations --- .../variants_evaluators_router/conftest.py | 174 ++++++++++++++++++ 1 file changed, 174 insertions(+) create mode 100644 agenta-backend/agenta_backend/tests/variants_evaluators_router/conftest.py diff --git a/agenta-backend/agenta_backend/tests/variants_evaluators_router/conftest.py b/agenta-backend/agenta_backend/tests/variants_evaluators_router/conftest.py new file mode 100644 index 0000000000..ec87a7cfcc --- /dev/null +++ b/agenta-backend/agenta_backend/tests/variants_evaluators_router/conftest.py @@ -0,0 +1,174 @@ +import os +import httpx +import pytest +from pathlib import Path +from bson import ObjectId +from datetime import datetime + +from agenta_backend.models.db_engine import DBEngine +from agenta_backend.services.json_importer_helper import get_json +from agenta_backend.models.db_models import ( + AppDB, + AppVariantDB, + OrganizationDB, + TestSetDB, +) + + +# Initialize database engine +engine = DBEngine().engine() + +# Set global variables +BASE_URI = "http://host.docker.internal/" +BACKEND_URI = BASE_URI + "api/" +PARENT_DIRECTORY = 
Path(os.path.dirname(__file__)).parent.parent +OPEN_AI_KEY = "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" + + +@pytest.fixture(scope="session") +def fetch_templates(): + response = httpx.get(f"{BACKEND_URI}containers/templates/") + response_data = response.json() + return response_data + + +@pytest.fixture(scope="session") +def use_open_ai_key(): + return OPEN_AI_KEY + + +@pytest.fixture(scope="session") +def fetch_single_prompt_template(fetch_templates): + return fetch_templates[1] + + +@pytest.fixture(scope="session") +def ensure_frontend_reachable(): + response = httpx.get(f"{BASE_URI}apps/") + response.raise_for_status() + return response.text + + +@pytest.fixture() +async def fetch_app(): + apps = await engine.find(AppDB) + return {"app_id": str(apps[0].id)} + + +@pytest.fixture() +async def fetch_app_variant(fetch_app): + app = await fetch_app + app_variant = await engine.find_one( + AppVariantDB, AppVariantDB.app == ObjectId(app["app_id"]) + ) + return {"variant_id": str(app_variant.id), "app_id": app["app_id"]} + + +@pytest.fixture() +async def prepare_testset_csvdata(fetch_app_variant): + app_variant = await fetch_app_variant + app_db = await engine.find_one(AppDB, AppDB.id == ObjectId(app_variant["app_id"])) + org_db = await engine.find_one( + OrganizationDB, OrganizationDB.id == ObjectId(app_db.user.organizations[0]) + ) + json_path = os.path.join( + PARENT_DIRECTORY, + "resources", + "default_testsets", + "evaluation_testset.json", + ) + + csvdata = get_json(json_path) + testset = { + "name": f"{app_db.app_name}_testset", + "app_name": app_db.app_name, + "created_at": datetime.now().isoformat(), + "csvdata": csvdata, + } + testset_db = TestSetDB(**testset, app=app_db, user=app_db.user, organization=org_db) + await engine.save(testset_db) + return { + "testset_id": str(testset_db.id), + "variant_id": app_variant["variant_id"], + "app_id": app_variant["app_id"], + } + + +@pytest.fixture() +async def create_app_from_template(fetch_app, fetch_single_prompt_template): + payload = { + "app_name": fetch_app["app_name"], + "template_id": fetch_single_prompt_template["id"], + "env_vars": {"OPENAI_API_KEY": OPEN_AI_KEY}, + } + response = httpx.post( + f"{BACKEND_URI}/apps/app_and_variant_from_template/", json=payload + ) + return response.json() + + +@pytest.fixture() +async def auto_exact_match_evaluator_config(fetch_app): + app = await fetch_app + return { + "app_id": app["app_id"], + "name": "ExactMatchEvaluator", + "evaluator_key": "auto_exact_match", + "settings_values": {}, + } + + +@pytest.fixture() +async def auto_similarity_match_evaluator_config(fetch_app): + app = await fetch_app + return { + "app_id": app["app_id"], + "name": "SimilarityMatchEvaluator", + "evaluator_key": "auto_similarity_match", + "settings_values": {"similarity_threshold": 0.3}, + } + + +@pytest.fixture() +async def auto_regex_test_evaluator_config(fetch_app): + app = await fetch_app + return { + "app_id": app["app_id"], + "name": "RegexEvaluator", + "evaluator_key": "auto_regex_test", + "settings_values": { + "regex_pattern": "^value\\d{3}$", + "regex_should_match": False, + }, + } + + +@pytest.fixture() +async def auto_webhook_test_evaluator_config(fetch_app): + app = await fetch_app + return { + "app_id": app["app_id"], + "name": "WebhookEvaluator", + "evaluator_key": "auto_webhook_test", + "settings_values": { + "webhook_url": f"{BACKEND_URI}evaluations/webhook_example_fake/", + "webhook_body": {}, + }, + } + + +@pytest.fixture() +async def auto_ai_critique_evaluator_config(fetch_app): + app = 
await fetch_app + return { + "app_id": app["app_id"], + "name": "AICritique_Evaluator", + "evaluator_key": "auto_ai_critique", + "settings_values": { + "open_ai_key": OPEN_AI_KEY, + "temperature": 0.9, + "evaluation_prompt_template": "We have an LLM App that we want to evaluate its outputs. Based on the prompt and the parameters provided below evaluate the output based on the evaluation strategy below: Evaluation strategy: 0 to 10 0 is very bad and 10 is very good. Prompt: {llm_app_prompt_template} Inputs: country: {country} Correct Answer:{correct_answer} Evaluate this: {variant_output} Answer ONLY with one of the given grading or evaluation options.", + "llm_app_prompt_template": "", + "llm_app_inputs": [{"input_name": "country", "input_value": "tunisia"}], + }, + } From 1b2f2f82694c341ac3915595acf5aff39ad0688e Mon Sep 17 00:00:00 2001 From: Abram Date: Wed, 27 Dec 2023 12:13:44 +0100 Subject: [PATCH 117/414] Feat - implemented testcases for evaluator configs and evaluation creation, etc --- .../test_evaluators_router.py | 205 ++++++++++++++++++ 1 file changed, 205 insertions(+) create mode 100644 agenta-backend/agenta_backend/tests/variants_evaluators_router/test_evaluators_router.py diff --git a/agenta-backend/agenta_backend/tests/variants_evaluators_router/test_evaluators_router.py b/agenta-backend/agenta_backend/tests/variants_evaluators_router/test_evaluators_router.py new file mode 100644 index 0000000000..d85e4899c3 --- /dev/null +++ b/agenta-backend/agenta_backend/tests/variants_evaluators_router/test_evaluators_router.py @@ -0,0 +1,205 @@ +import httpx +import pytest +import asyncio + +from agenta_backend.models.db_engine import DBEngine +from agenta_backend.models.db_models import EvaluationDB +from agenta_backend.models.api.evaluation_model import Evaluation, EvaluationStatusEnum + + +# Initialize database engine +engine = DBEngine().engine() + +# Initialize http client +test_client = httpx.AsyncClient() +timeout = httpx.Timeout(timeout=5, read=None, write=5) + +# Set global variables +BACKEND_API_HOST = "http://host.docker.internal/api" + + +@pytest.mark.asyncio +async def test_get_evaluators_endpoint(): + response = await test_client.get( + f"{BACKEND_API_HOST}/evaluators/", + timeout=timeout, + ) + assert response.status_code == 200 + assert len(response.json()) == 8 # currently we have 8 evaluators + + +@pytest.mark.asyncio +async def test_create_auto_exact_match_evaluator_config( + auto_exact_match_evaluator_config, +): + payload = await auto_exact_match_evaluator_config + response = await test_client.post( + f"{BACKEND_API_HOST}/evaluators/configs/", json=payload, timeout=timeout + ) + assert response.status_code == 200 + assert response.json()["evaluator_key"] == payload["evaluator_key"] + assert response.json()["settings_values"] == payload["settings_values"] + + +@pytest.mark.asyncio +async def test_create_auto_similarity_match_evaluator_config( + auto_similarity_match_evaluator_config, +): + payload = await auto_similarity_match_evaluator_config + response = await test_client.post( + f"{BACKEND_API_HOST}/evaluators/configs/", json=payload, timeout=timeout + ) + assert response.status_code == 200 + assert response.json()["evaluator_key"] == payload["evaluator_key"] + assert response.json()["settings_values"] == payload["settings_values"] + + +@pytest.mark.asyncio +async def test_create_auto_regex_test_evaluator_config( + auto_regex_test_evaluator_config, +): + payload = await auto_regex_test_evaluator_config + payload["settings_values"]["regex_pattern"] = 
"^Nigeria\\d{3}$" + response = await test_client.post( + f"{BACKEND_API_HOST}/evaluators/configs/", json=payload, timeout=timeout + ) + assert response.status_code == 200 + assert response.json()["evaluator_key"] == payload["evaluator_key"] + assert response.json()["settings_values"] == payload["settings_values"] + + +@pytest.mark.asyncio +async def test_create_auto_webhook_test_evaluator_config( + auto_webhook_test_evaluator_config, +): + payload = await auto_webhook_test_evaluator_config + response = await test_client.post( + f"{BACKEND_API_HOST}/evaluators/configs/", json=payload, timeout=timeout + ) + assert response.status_code == 200 + assert response.json()["evaluator_key"] == payload["evaluator_key"] + assert response.json()["settings_values"] == payload["settings_values"] + + +@pytest.mark.asyncio +async def test_create_auto_ai_critique_evaluator_config( + auto_ai_critique_evaluator_config, +): + payload = await auto_ai_critique_evaluator_config + response = await test_client.post( + f"{BACKEND_API_HOST}/evaluators/configs/", json=payload, timeout=timeout + ) + assert response.status_code == 200 + assert response.json()["evaluator_key"] == payload["evaluator_key"] + assert response.json()["settings_values"] == payload["settings_values"] + + +@pytest.mark.asyncio +async def test_get_evaluator_configs(fetch_app): + app = await fetch_app + response = await test_client.get( + f"{BACKEND_API_HOST}/evaluators/configs/?app_id={app['app_id']}", + timeout=timeout, + ) + assert response.status_code == 200 + assert type(response.json()) == list + + +@pytest.mark.asyncio +async def test_create_evaluation(prepare_testset_csvdata): + # Fetch app variant and testset + testset = await prepare_testset_csvdata + + # Prepare payload + payload = { + "app_id": testset["app_id"], + "variant_ids": [ + testset["variant_id"], + ], + "evaluators_configs": [], + "testset_id": "", + } + + # Fetch evaluator configs + response = await test_client.get( + f"{BACKEND_API_HOST}/evaluators/configs/?app_id={testset['app_id']}", + timeout=timeout, + ) + list_of_configs_ids = [] + evaluator_configs = response.json() + for evaluator_config in evaluator_configs: + list_of_configs_ids.append(evaluator_config["id"]) + + # Update payload with list of configs ids and testset id + payload["evaluators_configs"] = list_of_configs_ids + payload["testset_id"] = testset["testset_id"] + + # Make request to create evaluation + response = await test_client.post( + f"{BACKEND_API_HOST}/evaluations/", json=payload, timeout=timeout + ) + response_data = response.json() + + assert response.status_code == 200 + assert response_data["app_id"] == payload["app_id"] + assert response_data["status"] == EvaluationStatusEnum.EVALUATION_INITIALIZED + assert response_data is not None and isinstance(response_data, Evaluation) + + +@pytest.mark.asyncio +async def test_fetch_evaluation_status(): + evaluations = await engine.find(EvaluationDB) # will return only one in this case + evaluation = evaluations[0] + + # Prepare short-polling request + max_attempts = 10 + intervals = 2 # seconds + for _ in range(max_attempts): + response = await test_client.get( + f"{BACKEND_API_HOST}/evaluations/{str(evaluation.id)}/status/", + timeout=timeout, + ) + response_data = response.json() + if response_data["status"] == EvaluationStatusEnum.EVALUATION_FINISHED: + assert True + return + asyncio.sleep(intervals) + + assert ( + False + ), f"Evaluation status did not become '{EvaluationStatusEnum.EVALUATION_FINISHED}' within the specified polling time" + + 
+@pytest.mark.asyncio +async def test_fetch_evaluation_results(): + evaluations = await engine.find(EvaluationDB) # will return only one in this case + evaluation = evaluations[0] + + response = await test_client.get( + f"{BACKEND_API_HOST}/evaluations/{str(evaluation.id)}/results/", timeout=timeout + ) + response_data = response.json() + + assert response.status_code == 200 + assert response_data["evaluation_id"] == str(evaluation.id) + assert len(response_data["results"]) == 5 + + +@pytest.mark.asyncio +async def test_delete_evaluator_config(fetch_app): + app = await fetch_app + response = await test_client.get( + f"{BACKEND_API_HOST}/evaluators/configs/?app_id={app['app_id']}", + timeout=timeout, + ) + list_of_deleted_configs = [] + evaluator_configs = response.json() + for evaluator_config in evaluator_configs: + response = await test_client.delete( + f"{BACKEND_API_HOST}/evaluators/configs/{str(evaluator_config['id'])}/", + timeout=timeout, + ) + list_of_deleted_configs.append(response.json()) + + count_of_deleted_configs = sum(list_of_deleted_configs) + assert len(evaluator_configs) == count_of_deleted_configs From b4d4bdaa79c1f0d58516b861451140cb5e675277 Mon Sep 17 00:00:00 2001 From: Akrem Abayed Date: Wed, 27 Dec 2023 13:53:30 +0100 Subject: [PATCH 118/414] fix update evaluator config --- .../models/api/evaluation_model.py | 6 ++++ .../routers/evaluators_router.py | 22 ++++++++++++- .../agenta_backend/services/db_manager.py | 31 ++++++++++++------- .../services/evaluator_manager.py | 11 +++++-- 4 files changed, 55 insertions(+), 15 deletions(-) diff --git a/agenta-backend/agenta_backend/models/api/evaluation_model.py b/agenta-backend/agenta_backend/models/api/evaluation_model.py index 53e786cdf3..cf894aa8ed 100644 --- a/agenta-backend/agenta_backend/models/api/evaluation_model.py +++ b/agenta-backend/agenta_backend/models/api/evaluation_model.py @@ -184,3 +184,9 @@ class NewEvaluatorConfig(BaseModel): name: str evaluator_key: str settings_values: dict + + +class UpdateEvaluatorConfig(BaseModel): + name: Optional[str] + evaluator_key: Optional[str] + settings_values: Optional[dict] diff --git a/agenta-backend/agenta_backend/routers/evaluators_router.py b/agenta-backend/agenta_backend/routers/evaluators_router.py index f01ec6c9f8..71dafc5c43 100644 --- a/agenta-backend/agenta_backend/routers/evaluators_router.py +++ b/agenta-backend/agenta_backend/routers/evaluators_router.py @@ -10,6 +10,7 @@ Evaluator, EvaluatorConfig, NewEvaluatorConfig, + UpdateEvaluatorConfig, ) from agenta_backend.services import ( @@ -69,6 +70,7 @@ async def get_evaluator_configs(app_id: str = Query()): evaluators_configs = await evaluator_manager.get_evaluators_configs(app_id) return evaluators_configs + @router.get("/configs/{evaluator_config_id}/", response_model=EvaluatorConfig) async def get_evaluator_config(evaluator_config_id: str): """Endpoint to fetch evaluator configurations for a specific app. @@ -77,7 +79,9 @@ async def get_evaluator_config(evaluator_config_id: str): List[EvaluatorConfigDB]: A list of evaluator configuration objects. 
""" - evaluators_configs = await evaluator_manager.get_evaluator_config(evaluator_config_id) + evaluators_configs = await evaluator_manager.get_evaluator_config( + evaluator_config_id + ) return evaluators_configs @@ -103,6 +107,22 @@ async def create_new_evaluator_config( return evaluator_config +@router.put("/configs/{evaluator_config_id}/", response_model=EvaluatorConfig) +async def get_evaluator_config( + evaluator_config_id: str, payload: UpdateEvaluatorConfig +): + """Endpoint to fetch evaluator configurations for a specific app. + + Returns: + List[EvaluatorConfigDB]: A list of evaluator configuration objects. + """ + + evaluators_configs = await evaluator_manager.update_evaluator_config( + evaluator_config_id=evaluator_config_id, updates=payload + ) + return evaluators_configs + + @router.delete("/configs/{evaluator_config_id}/", response_model=bool) async def delete_evaluator_config(evaluator_config_id: str): """Endpoint to delete a specific evaluator configuration. diff --git a/agenta-backend/agenta_backend/services/db_manager.py b/agenta-backend/agenta_backend/services/db_manager.py index 221b6bb57c..35032fa500 100644 --- a/agenta-backend/agenta_backend/services/db_manager.py +++ b/agenta-backend/agenta_backend/services/db_manager.py @@ -1784,19 +1784,26 @@ async def create_evaluator_config( async def update_evaluator_config( evaluator_config_id: str, updates: Dict[str, Any] ) -> EvaluatorConfigDB: - """Edit an existing evaluator configuration in the database.""" - assert evaluator_config_id is not None, "Evaluator Config ID cannot be None" + """ + Update an evaluator configuration in the database with the provided id. - try: - updated_evaluator_config = await engine.find_one_and_update( - EvaluatorConfigDB, - query.eq("_id", ObjectId(evaluator_config_id)), - {"$set": updates}, - return_document=True, - ) - return updated_evaluator_config - except Exception as e: - raise e + Arguments: + evaluator_config_id (str): The ID of the evaluator configuration to be updated. + updates (Dict[str, Any]): The updates to apply to the evaluator configuration. + + Returns: + EvaluatorConfigDB: The updated evaluator configuration object. + """ + evaluator_config = await engine.find_one( + EvaluatorConfigDB, EvaluatorConfigDB.id == ObjectId(evaluator_config_id) + ) + updates_dict = updates.dict(exclude_unset=True) + + for key, value in updates_dict.items(): + if key in evaluator_config.__fields__: + setattr(evaluator_config, key, value) + await engine.save(evaluator_config) + return evaluator_config async def delete_evaluator_config(evaluator_config_id: str) -> bool: diff --git a/agenta-backend/agenta_backend/services/evaluator_manager.py b/agenta-backend/agenta_backend/services/evaluator_manager.py index d84b194b4c..faceface00 100644 --- a/agenta-backend/agenta_backend/services/evaluator_manager.py +++ b/agenta-backend/agenta_backend/services/evaluator_manager.py @@ -7,6 +7,7 @@ from agenta_backend.models.api.evaluation_model import EvaluatorConfig from agenta_backend.models.converters import evaluator_config_db_to_pydantic + async def get_evaluators_configs(app_id: str) -> List[EvaluatorConfig]: """ Get evaluators configs by app_id. @@ -18,7 +19,10 @@ async def get_evaluators_configs(app_id: str) -> List[EvaluatorConfig]: List[EvaluatorConfig]: A list of evaluator configuration objects. 
""" evaluator_configs_db = await db_manager.fetch_evaluators_configs(app_id) - return [evaluator_config_db_to_pydantic(evaluator_config_db) for evaluator_config_db in evaluator_configs_db] + return [ + evaluator_config_db_to_pydantic(evaluator_config_db) + for evaluator_config_db in evaluator_configs_db + ] async def get_evaluator_config(evaluator_config_id: str) -> EvaluatorConfig: @@ -78,7 +82,10 @@ async def update_evaluator_config( Returns: EvaluatorConfigDB: The updated evaluator configuration object. """ - return await db_manager.update_evaluator_config(evaluator_config_id, updates) + evaluator_config = await db_manager.update_evaluator_config( + evaluator_config_id, updates + ) + return evaluator_config_db_to_pydantic(evaluator_config=evaluator_config) async def delete_evaluator_config(evaluator_config_id: str) -> bool: From b22b1c82a96311f094c399062b22cdf871a94890 Mon Sep 17 00:00:00 2001 From: MohammedMaaz Date: Wed, 27 Dec 2023 18:06:03 +0500 Subject: [PATCH 119/414] BE integration - step 2 | UI enhancements --- .../agenta_backend/models/converters.py | 2 +- .../evaluationResults/EvaluationResults.tsx | 69 ++++++-- .../evaluationResults/NewEvaluationModal.tsx | 7 +- .../evaluations/evaluationResults/mock.ts | 8 +- .../EvaluationScenarios.tsx | 2 +- .../evaluations/evaluators/EvaluatorCard.tsx | 28 +++- .../evaluations/evaluators/Evaluators.tsx | 34 +++- .../evaluators/NewEvaluatorModal.tsx | 154 +++++++++--------- agenta-web/src/lib/Types.ts | 14 +- agenta-web/src/lib/atoms/evaluation.ts | 4 +- agenta-web/src/services/evaluations/index.ts | 67 +++++--- 11 files changed, 254 insertions(+), 135 deletions(-) diff --git a/agenta-backend/agenta_backend/models/converters.py b/agenta-backend/agenta_backend/models/converters.py index 30e52c6fea..63079c98f9 100644 --- a/agenta-backend/agenta_backend/models/converters.py +++ b/agenta-backend/agenta_backend/models/converters.py @@ -100,7 +100,7 @@ def evaluation_scenario_db_to_pydantic( evaluation_id=str(evaluation_scenario_db.evaluation.id), inputs=evaluation_scenario_db.inputs, outputs=evaluation_scenario_db.outputs, - vote=evaluation_scenario_db.vote, + vote="", score=evaluation_scenario_db.score, correct_answer=evaluation_scenario_db.correct_answer, is_pinned=evaluation_scenario_db.is_pinned or False, diff --git a/agenta-web/src/components/pages/evaluations/evaluationResults/EvaluationResults.tsx b/agenta-web/src/components/pages/evaluations/evaluationResults/EvaluationResults.tsx index f7892b25d8..53bb7f6fd3 100644 --- a/agenta-web/src/components/pages/evaluations/evaluationResults/EvaluationResults.tsx +++ b/agenta-web/src/components/pages/evaluations/evaluationResults/EvaluationResults.tsx @@ -5,27 +5,29 @@ import {ColDef, ICellRendererParams} from "ag-grid-community" import {createUseStyles} from "react-jss" import {Button, GlobalToken, Space, Spin, Typography, theme} from "antd" import {DeleteOutlined, PlusCircleOutlined, SlidersOutlined, SwapOutlined} from "@ant-design/icons" -import {EvaluationStatus, _Evaluation} from "@/lib/Types" +import {EvaluationStatus, JSSTheme, _Evaluation} from "@/lib/Types" import {uniqBy} from "lodash" import dayjs from "dayjs" import relativeTime from "dayjs/plugin/relativeTime" import duration from "dayjs/plugin/duration" import NewEvaluationModal from "./NewEvaluationModal" import {useAppId} from "@/hooks/useAppId" -import {fetchAllEvaluations, fetchEvaluationStatus} from "@/services/evaluations" +import {deleteEvaluations, fetchAllEvaluations, fetchEvaluationStatus} from "@/services/evaluations" 
import {useRouter} from "next/router" import {useUpdateEffect} from "usehooks-ts" import {shortPoll} from "@/lib/helpers/utils" +import AlertPopup from "@/components/AlertPopup/AlertPopup" dayjs.extend(relativeTime) dayjs.extend(duration) -const useStyles = createUseStyles({ +const useStyles = createUseStyles((theme: JSSTheme) => ({ root: { display: "flex", flexDirection: "column", gap: "1rem", }, table: { + width: "100%", height: 500, }, buttonsGroup: { @@ -39,18 +41,22 @@ const useStyles = createUseStyles({ marginBottom: 0, "& > div:nth-of-type(1)": { - width: 6, height: 6, + aspectRatio: 1 / 1, borderRadius: "50%", }, }, dot: { - width: 3, height: 3, + aspectRatio: 1 / 1, borderRadius: "50%", - backgroundColor: "#444", + backgroundColor: theme.colorTextSecondary, + marginTop: 2, }, -}) + date: { + color: theme.colorTextSecondary, + }, +})) const statusMapper = (token: GlobalToken) => ({ [EvaluationStatus.INITIALIZED]: { @@ -81,6 +87,7 @@ const EvaluationResults: React.FC = () => { const [evaluations, setEvaluations] = useState<_Evaluation[]>([]) const [newEvalModalOpen, setNewEvalModalOpen] = useState(false) const [fetching, setFetching] = useState(false) + const [selected, setSelected] = useState<_Evaluation[]>([]) const stoppers = useRef() const {token} = theme.useToken() @@ -94,6 +101,17 @@ const EvaluationResults: React.FC = () => { [evaluations], ) + const onDelete = () => { + AlertPopup({ + title: "Delete Evaluations", + message: `Are you sure you want to delete all ${selected.length} selected evaluations?`, + onOk: () => + deleteEvaluations(selected.map((item) => item.id)) + .catch(console.error) + .then(fetcher), + }) + } + const fetcher = () => { setFetching(true) fetchAllEvaluations(appId) @@ -113,7 +131,7 @@ const EvaluationResults: React.FC = () => { if (runningEvaluationIds.length) { stoppers.current = shortPoll( () => - Promise.all(runningEvaluationIds.map((id) => fetchEvaluationStatus(appId, id))) + Promise.all(runningEvaluationIds.map((id) => fetchEvaluationStatus(id))) .then((res) => { setEvaluations((prev) => { const newEvals = [...prev] @@ -149,6 +167,12 @@ const EvaluationResults: React.FC = () => { const colDefs = useMemo(() => { const colDefs: ColDef<_Evaluation>[] = [ + { + field: "id", + headerCheckboxSelection: true, + checkboxSelection: true, + showDisabledCheckboxes: true, + }, {field: "testset.name"}, { field: "variants", @@ -181,7 +205,7 @@ const EvaluationResults: React.FC = () => {
{label} - + {dayjs .duration(params.data?.duration || 0, "milliseconds") .humanize()} @@ -202,10 +226,28 @@ const EvaluationResults: React.FC = () => { return (
- -
diff --git a/agenta-web/src/components/pages/evaluations/evaluationResults/NewEvaluationModal.tsx b/agenta-web/src/components/pages/evaluations/evaluationResults/NewEvaluationModal.tsx index 7aa842d5fd..ecfc5b6d32 100644 --- a/agenta-web/src/components/pages/evaluations/evaluationResults/NewEvaluationModal.tsx +++ b/agenta-web/src/components/pages/evaluations/evaluationResults/NewEvaluationModal.tsx @@ -8,7 +8,7 @@ import { fetchAllEvaluators, } from "@/services/evaluations" import {PlusOutlined} from "@ant-design/icons" -import {Form, Modal, Select, Spin, Tag, Typography} from "antd" +import {Divider, Form, Modal, Select, Spin, Tag, Typography} from "antd" import dayjs from "dayjs" import Image from "next/image" import React, {useEffect, useState} from "react" @@ -45,6 +45,10 @@ const useStyles = createUseStyles((theme: JSSTheme) => ({ tag: { transform: "scale(0.8)", }, + divider: { + margin: "1rem -1.5rem", + width: "unset", + }, })) type Props = { @@ -97,6 +101,7 @@ const NewEvaluationModal: React.FC = ({onSuccess, ...props}) => { okButtonProps={{icon: , loading: submitLoading}} {...props} > + = () => { setFetching(true) Promise.all([ fetchAllEvaluationScenarios(appId, evaluationId), - fetchEvaluation(appId, evaluationId), + fetchEvaluation(evaluationId), ]) .then(([scenarios, evaluation]) => { setScenarios(scenarios) diff --git a/agenta-web/src/components/pages/evaluations/evaluators/EvaluatorCard.tsx b/agenta-web/src/components/pages/evaluations/evaluators/EvaluatorCard.tsx index 024dff6585..ecf21b21b3 100644 --- a/agenta-web/src/components/pages/evaluations/evaluators/EvaluatorCard.tsx +++ b/agenta-web/src/components/pages/evaluations/evaluators/EvaluatorCard.tsx @@ -3,13 +3,19 @@ import {EvaluatorConfig, JSSTheme} from "@/lib/Types" import {DeleteOutlined, EditOutlined} from "@ant-design/icons" import {Card, Tag, Typography} from "antd" import {createUseStyles} from "react-jss" -import Mock from "../evaluationResults/mock" import dayjs from "dayjs" import Image from "next/image" import AlertPopup from "@/components/AlertPopup/AlertPopup" import {deleteEvaluatorConfig} from "@/services/evaluations" +import {useAtom} from "jotai" +import {evaluatorsAtom} from "@/lib/atoms/evaluation" const useStyles = createUseStyles((theme: JSSTheme) => ({ + card: { + "& .ant-card-body": { + padding: "1.25rem 0.75rem 1rem 1rem", + }, + }, body: { display: "flex", flexDirection: "column", @@ -23,14 +29,20 @@ const useStyles = createUseStyles((theme: JSSTheme) => ({ marginBottom: "1.5rem", }, evaluationImg: { - width: 27, - height: 27, + width: 32, + height: 32, marginRight: "8px", filter: theme.isDark ? "invert(1)" : "none", }, name: { - marginTop: "0.25rem", - marginBottom: 0, + marginTop: "0.5rem", + marginBottom: "0 !important", + fontWeight: "500 !important", + fontSize: "1rem", + }, + date: { + fontSize: "0.75rem", + color: theme.colorTextSecondary, }, })) @@ -42,7 +54,8 @@ interface Props { const EvaluatorCard: React.FC = ({evaluatorConfig, onEdit, onSuccessDelete}) => { const classes = useStyles() - const evaluator = Mock.evaluators.find((item) => item.key === evaluatorConfig.evaluator_key)! + const [evaluators] = useAtom(evaluatorsAtom) + const evaluator = evaluators.find((item) => item.key === evaluatorConfig.evaluator_key)! const onDelete = () => { AlertPopup({ @@ -57,6 +70,7 @@ const EvaluatorCard: React.FC = ({evaluatorConfig, onEdit, onSuccessDelet return ( , , @@ -64,7 +78,7 @@ const EvaluatorCard: React.FC = ({evaluatorConfig, onEdit, onSuccessDelet >
- + {dayjs(evaluatorConfig.created_at).format("DD MMM YY")} {evaluator.name} diff --git a/agenta-web/src/components/pages/evaluations/evaluators/Evaluators.tsx b/agenta-web/src/components/pages/evaluations/evaluators/Evaluators.tsx index 00829fab4c..7c7c621780 100644 --- a/agenta-web/src/components/pages/evaluations/evaluators/Evaluators.tsx +++ b/agenta-web/src/components/pages/evaluations/evaluators/Evaluators.tsx @@ -1,12 +1,14 @@ -import React, {useEffect, useState} from "react" +import React, {useEffect, useMemo, useState} from "react" import {createUseStyles} from "react-jss" import EvaluatorCard from "./EvaluatorCard" -import {Button, Space, Spin} from "antd" +import {Button, Input, Space, Spin} from "antd" import {PlusCircleOutlined} from "@ant-design/icons" import {EvaluatorConfig} from "@/lib/Types" import NewEvaluatorModal from "./NewEvaluatorModal" import {useAppId} from "@/hooks/useAppId" -import {fetchAllEvaluatorConfigs} from "@/services/evaluations" +import {fetchAllEvaluatorConfigs, fetchAllEvaluators} from "@/services/evaluations" +import {useAtom} from "jotai" +import {evaluatorsAtom} from "@/lib/atoms/evaluation" const useStyles = createUseStyles({ root: { @@ -19,7 +21,7 @@ const useStyles = createUseStyles({ }, grid: { display: "grid", - gridTemplateColumns: "repeat(auto-fit, minmax(200px, 320px))", + gridTemplateColumns: "repeat(auto-fill, minmax(min(260px, 100%), 1fr))", gap: "1rem", }, }) @@ -31,13 +33,18 @@ const Evaluators: React.FC = () => { const appId = useAppId() const [evaluatorConfigs, setEvaluatorConfigs] = useState([]) const [newEvalModalOpen, setNewEvalModalOpen] = useState(false) + const [_, setEvaluators] = useAtom(evaluatorsAtom) const [editIndex, setEditIndex] = useState(-1) const [fetching, setFetching] = useState(false) + const [searchTerm, setSearchTerm] = useState("") const fetcher = () => { setFetching(true) - fetchAllEvaluatorConfigs(appId) - .then(setEvaluatorConfigs) + Promise.all([fetchAllEvaluatorConfigs(appId), fetchAllEvaluators()]) + .then(([configs, evaluators]) => { + setEvaluatorConfigs(configs) + setEvaluators(evaluators) + }) .catch(console.error) .finally(() => setFetching(false)) } @@ -46,9 +53,22 @@ const Evaluators: React.FC = () => { fetcher() }, []) + const filtered = useMemo(() => { + if (!searchTerm) return evaluatorConfigs + return evaluatorConfigs.filter((item) => + item.name.toLowerCase().includes(searchTerm.toLowerCase()), + ) + }, [searchTerm, evaluatorConfigs]) + return (
+ setSearchTerm(term)} + placeholder="Search" + allowClear + enterButton + /> From 0a1ac285515bdb76c60f988ec7aa0a42407f824a Mon Sep 17 00:00:00 2001 From: Kaosiso Ezealigo Date: Wed, 27 Dec 2023 16:06:23 +0100 Subject: [PATCH 121/414] compare view route --- .../pages/apps/[app_id]/evaluations-new/compare/index.tsx | 8 ++++++++ 1 file changed, 8 insertions(+) create mode 100644 agenta-web/src/pages/apps/[app_id]/evaluations-new/compare/index.tsx diff --git a/agenta-web/src/pages/apps/[app_id]/evaluations-new/compare/index.tsx b/agenta-web/src/pages/apps/[app_id]/evaluations-new/compare/index.tsx new file mode 100644 index 0000000000..4dd22327da --- /dev/null +++ b/agenta-web/src/pages/apps/[app_id]/evaluations-new/compare/index.tsx @@ -0,0 +1,8 @@ +import EvaluationCompare from "@/components/pages/evaluations/evaluationCompare/evaluationCompare" +import React from "react" + +const EvaluationCompareDetails = () => { + return +} + +export default EvaluationCompareDetails From 6157a99c6c0a67ddbf7c7905da1546ecb3581a62 Mon Sep 17 00:00:00 2001 From: Kaosiso Ezealigo Date: Wed, 27 Dec 2023 16:12:02 +0100 Subject: [PATCH 122/414] modifies import path --- .../src/pages/apps/[app_id]/evaluations-new/compare/index.tsx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/agenta-web/src/pages/apps/[app_id]/evaluations-new/compare/index.tsx b/agenta-web/src/pages/apps/[app_id]/evaluations-new/compare/index.tsx index 4dd22327da..0e346a3b7b 100644 --- a/agenta-web/src/pages/apps/[app_id]/evaluations-new/compare/index.tsx +++ b/agenta-web/src/pages/apps/[app_id]/evaluations-new/compare/index.tsx @@ -1,5 +1,5 @@ -import EvaluationCompare from "@/components/pages/evaluations/evaluationCompare/evaluationCompare" import React from "react" +import EvaluationCompare from "@/components/pages/evaluations/evaluationCompare/EvaluationCompare" const EvaluationCompareDetails = () => { return From 345ac2660e3e1276fa0ef1f046d170f50db45ea0 Mon Sep 17 00:00:00 2001 From: Kaosiso Ezealigo Date: Wed, 27 Dec 2023 16:16:36 +0100 Subject: [PATCH 123/414] eval comparison view --- .../evaluationCompare/EvaluationCompare.tsx | 152 ++++++++++++++++++ 1 file changed, 152 insertions(+) create mode 100644 agenta-web/src/components/pages/evaluations/evaluationCompare/EvaluationCompare.tsx diff --git a/agenta-web/src/components/pages/evaluations/evaluationCompare/EvaluationCompare.tsx b/agenta-web/src/components/pages/evaluations/evaluationCompare/EvaluationCompare.tsx new file mode 100644 index 0000000000..e6ef8a3ffc --- /dev/null +++ b/agenta-web/src/components/pages/evaluations/evaluationCompare/EvaluationCompare.tsx @@ -0,0 +1,152 @@ +import {useAppTheme} from "@/components/Layout/ThemeContextProvider" +import {useAppId} from "@/hooks/useAppId" +import {JSSTheme, _Evaluation, _EvaluationScenario} from "@/lib/Types" +import {fetchAllEvaluationScenarios, fetchEvaluation} from "@/services/evaluations" +import {ColDef} from "ag-grid-community" +import {AgGridReact} from "ag-grid-react" +import {Space, Spin, Tag, Typography} from "antd" +import dayjs from "dayjs" +import {useRouter} from "next/router" +import React, {useEffect, useMemo, useState} from "react" +import {createUseStyles} from "react-jss" + +const useStyles = createUseStyles((theme: JSSTheme) => ({ + header: { + margin: "1rem 0", + "& > h3": { + textAlign: "center", + }, + }, + date: { + fontSize: "0.75rem", + color: theme.colorTextSecondary, + display: "inline-block", + marginBottom: "1rem", + }, + table: { + height: 500, + }, +})) + +interface Props {} + 
+const EvaluationCompareMode: React.FC = () => { + const router = useRouter() + const appId = useAppId() + const classes = useStyles() + const {appTheme} = useAppTheme() + const evaluationId = router.query.evaluation_id as string + const [scenarios, setScenarios] = useState<_EvaluationScenario[]>([]) + const [evalaution, setEvaluation] = useState<_Evaluation>() + const [fetching, setFetching] = useState(false) + + const colDefs = useMemo(() => { + const colDefs: ColDef<_EvaluationScenario>[] = [] + if (!scenarios.length || !evalaution) return colDefs + + scenarios[0]?.inputs.forEach((input, index) => { + colDefs.push({ + headerName: `Input: ${input.name}`, + field: `inputs.${index}`, + valueGetter: (params) => { + return params.data?.inputs[index].value || "" + }, + }) + }) + colDefs.push({ + headerName: "Expected Output", + field: "correct_answer", + valueGetter: (params) => { + return params.data?.correct_answer?.value || "" + }, + }) + + Array.from({length: 3}).map((_) => { + evalaution?.variants.forEach((variant, index) => { + colDefs.push({ + headerName: `Output (${variant.variantName})`, + field: `outputs.${index}`, + valueGetter: (params) => { + return params.data?.outputs[index].value || "" + }, + }) + }) + scenarios[0]?.evaluators_configs.forEach((config, index) => { + colDefs.push({ + headerName: `Evaluator: ${config.name}`, + field: `results`, + valueGetter: (params) => { + return ( + params.data?.results.find( + (item) => item.evaluator.key === config.evaluator_key, + )?.result || "" + ) + }, + }) + }) + }) + + return colDefs + }, [evalaution, scenarios]) + + const fetcher = () => { + setFetching(true) + Promise.all([ + fetchAllEvaluationScenarios(appId, evaluationId), + fetchEvaluation(evaluationId), + ]) + .then(([scenarios, evaluation]) => { + setScenarios(scenarios) + setEvaluation(evaluation) + }) + .catch(console.error) + .finally(() => setFetching(false)) + } + + useEffect(() => { + fetcher() + }, [appId, evaluationId]) + + const handleDeleteVariant = (variantId: string) => { + console.log(variantId) + } + + return ( +
+
+ + Testset: {evalaution?.testset.name || ""} + + + Variants: + {evalaution?.variants?.map((variant) => ( + handleDeleteVariant(variant.variantId)} + closable + > + {variant.variantName} + + ))} + +
+ + +
+ + rowData={scenarios} + columnDefs={colDefs} + getRowId={(params) => params.data.id} + /> +
+
+
+ ) +} + +export default EvaluationCompareMode From c0a4482f26fcc2034397c08b2a9f6ebe87f998e1 Mon Sep 17 00:00:00 2001 From: Kaosiso Ezealigo Date: Wed, 27 Dec 2023 17:09:17 +0100 Subject: [PATCH 124/414] typo fix --- .../pages/evaluations/evaluationResults/EvaluationResults.tsx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/agenta-web/src/components/pages/evaluations/evaluationResults/EvaluationResults.tsx b/agenta-web/src/components/pages/evaluations/evaluationResults/EvaluationResults.tsx index 483c6c2bdf..745216a71c 100644 --- a/agenta-web/src/components/pages/evaluations/evaluationResults/EvaluationResults.tsx +++ b/agenta-web/src/components/pages/evaluations/evaluationResults/EvaluationResults.tsx @@ -248,7 +248,7 @@ const EvaluationResults: React.FC = () => { type="primary" onClick={() => router.push( - `/apps/${appId}/evaluations-new/compare/?evalautions=${selected + `/apps/${appId}/evaluations-new/compare/?evaluations=${selected .map((item) => item.id) .join(",")}`, ) From 1eb3a8e0f716a1be54aa84ce86942195a45949ce Mon Sep 17 00:00:00 2001 From: Kaosiso Ezealigo Date: Wed, 27 Dec 2023 21:14:11 +0100 Subject: [PATCH 125/414] modify fetch functions to handle multiple evaluation IDs --- .../evaluationCompare/EvaluationCompare.tsx | 124 ++++++++++-------- 1 file changed, 72 insertions(+), 52 deletions(-) diff --git a/agenta-web/src/components/pages/evaluations/evaluationCompare/EvaluationCompare.tsx b/agenta-web/src/components/pages/evaluations/evaluationCompare/EvaluationCompare.tsx index e6ef8a3ffc..ebad490966 100644 --- a/agenta-web/src/components/pages/evaluations/evaluationCompare/EvaluationCompare.tsx +++ b/agenta-web/src/components/pages/evaluations/evaluationCompare/EvaluationCompare.tsx @@ -35,9 +35,9 @@ const EvaluationCompareMode: React.FC = () => { const appId = useAppId() const classes = useStyles() const {appTheme} = useAppTheme() - const evaluationId = router.query.evaluation_id as string + const evaluationIds = router.query.evaluations as string const [scenarios, setScenarios] = useState<_EvaluationScenario[]>([]) - const [evalaution, setEvaluation] = useState<_Evaluation>() + const [evalaution, setEvaluation] = useState<_Evaluation[]>() const [fetching, setFetching] = useState(false) const colDefs = useMemo(() => { @@ -61,51 +61,68 @@ const EvaluationCompareMode: React.FC = () => { }, }) - Array.from({length: 3}).map((_) => { - evalaution?.variants.forEach((variant, index) => { - colDefs.push({ - headerName: `Output (${variant.variantName})`, - field: `outputs.${index}`, - valueGetter: (params) => { - return params.data?.outputs[index].value || "" - }, - }) - }) - scenarios[0]?.evaluators_configs.forEach((config, index) => { - colDefs.push({ - headerName: `Evaluator: ${config.name}`, - field: `results`, - valueGetter: (params) => { - return ( - params.data?.results.find( - (item) => item.evaluator.key === config.evaluator_key, - )?.result || "" - ) - }, - }) - }) - }) + evalaution.map( + (evalaution) => + evalaution?.variants.forEach((variant, index) => { + colDefs.push({ + headerName: `Output (${variant.variantName})`, + field: `outputs.${index}`, + valueGetter: (params) => { + return params.data?.outputs[index].value || "" + }, + }) + }), + ) + + scenarios.map( + (scenario) => + scenario?.evaluators_configs.forEach((config, index) => { + colDefs.push({ + headerName: `Evaluator: ${config.name}`, + field: `results`, + valueGetter: (params) => { + return ( + params.data?.results.find( + (item) => item.evaluator.key === config.evaluator_key, + )?.result 
|| "" + ) + }, + }) + }), + ) return colDefs }, [evalaution, scenarios]) - const fetcher = () => { - setFetching(true) - Promise.all([ - fetchAllEvaluationScenarios(appId, evaluationId), - fetchEvaluation(evaluationId), - ]) - .then(([scenarios, evaluation]) => { - setScenarios(scenarios) - setEvaluation(evaluation) - }) - .catch(console.error) - .finally(() => setFetching(false)) - } - useEffect(() => { + const fetcher = async () => { + setFetching(true) + + try { + const evaluationIdsArray = evaluationIds?.split(",") || [] + + const fetchPromises = evaluationIdsArray.map((evalId) => { + return Promise.all([ + fetchAllEvaluationScenarios(appId, evalId), + fetchEvaluation(evalId), + ]) + }) + + const results = await Promise.all(fetchPromises) + const fetchedScenarios = results.map(([[scenarios]]) => scenarios) + const fetchedEvaluations = results.map(([_, evaluation]) => evaluation) + + setScenarios(fetchedScenarios) + setEvaluation(fetchedEvaluations) + } catch (error) { + console.error(error) + } finally { + setFetching(false) + } + } + fetcher() - }, [appId, evaluationId]) + }, [appId, evaluationIds]) const handleDeleteVariant = (variantId: string) => { console.log(variantId) @@ -115,20 +132,23 @@ const EvaluationCompareMode: React.FC = () => {
- Testset: {evalaution?.testset.name || ""} + Testset: {evalaution ? evalaution[0]?.testset.name : ""} Variants: - {evalaution?.variants?.map((variant) => ( - handleDeleteVariant(variant.variantId)} - closable - > - {variant.variantName} - - ))} + {evalaution?.map( + (evalaution) => + evalaution?.variants?.map((variant) => ( + handleDeleteVariant(variant.variantId)} + closable + > + {variant.variantName} + + )), + )}
From d4d365c3dfb1a2ebd0fa22bdccec67a74b338a3b Mon Sep 17 00:00:00 2001 From: Akrem Abayed Date: Wed, 27 Dec 2023 22:37:31 +0100 Subject: [PATCH 126/414] add aggregated results in evaluations response --- .../models/api/evaluation_model.py | 11 +++++++++++ .../agenta_backend/models/converters.py | 16 ++++++++++++---- 2 files changed, 23 insertions(+), 4 deletions(-) diff --git a/agenta-backend/agenta_backend/models/api/evaluation_model.py b/agenta-backend/agenta_backend/models/api/evaluation_model.py index cf894aa8ed..c65776354e 100644 --- a/agenta-backend/agenta_backend/models/api/evaluation_model.py +++ b/agenta-backend/agenta_backend/models/api/evaluation_model.py @@ -37,6 +37,16 @@ class EvaluationScenarioStatusEnum(str, Enum): COMPARISON_RUN_STARTED = "COMPARISON_RUN_STARTED" +class Result(BaseModel): + type: str + value: Any + + +class AggregatedResult(BaseModel): + evaluator_config: str + result: Result + + class Evaluation(BaseModel): id: str app_id: str @@ -47,6 +57,7 @@ class Evaluation(BaseModel): testset_id: str testset_name: str status: str + aggregated_results: List[AggregatedResult] created_at: datetime updated_at: datetime diff --git a/agenta-backend/agenta_backend/models/converters.py b/agenta-backend/agenta_backend/models/converters.py index 63079c98f9..d99c9c914b 100644 --- a/agenta-backend/agenta_backend/models/converters.py +++ b/agenta-backend/agenta_backend/models/converters.py @@ -80,16 +80,24 @@ async def evaluation_db_to_pydantic( variant_names=variant_names, testset_id=str(evaluation_db.testset.id), testset_name=evaluation_db.testset.name, + aggregated_results=aggregated_result_to_pydantic( + evaluation_db.aggregated_results + ), created_at=evaluation_db.created_at, updated_at=evaluation_db.updated_at, ) def aggregated_result_to_pydantic(results: List[AggregatedResult]) -> List[dict]: - list_of_aggregated_results = [] - for aggregated_result in results: - list_of_aggregated_results.append(aggregated_result.dict()) - return list_of_aggregated_results + return [ + { + "evaluator_config": str( + result.evaluator_config + ), + "result": result.result.dict(), + } + for result in results + ] def evaluation_scenario_db_to_pydantic( From 5e5dedd74573219b94aec3d225a57fcc7c12a573 Mon Sep 17 00:00:00 2001 From: Abram Date: Wed, 27 Dec 2023 22:52:34 +0100 Subject: [PATCH 127/414] Update - modified test_create_evaluation testcase --- .../variants_evaluators_router/test_evaluators_router.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/agenta-backend/agenta_backend/tests/variants_evaluators_router/test_evaluators_router.py b/agenta-backend/agenta_backend/tests/variants_evaluators_router/test_evaluators_router.py index d85e4899c3..2ad7074ce4 100644 --- a/agenta-backend/agenta_backend/tests/variants_evaluators_router/test_evaluators_router.py +++ b/agenta-backend/agenta_backend/tests/variants_evaluators_router/test_evaluators_router.py @@ -117,7 +117,7 @@ async def test_create_evaluation(prepare_testset_csvdata): testset["variant_id"], ], "evaluators_configs": [], - "testset_id": "", + "testset_id": "" } # Fetch evaluator configs @@ -142,7 +142,7 @@ async def test_create_evaluation(prepare_testset_csvdata): assert response.status_code == 200 assert response_data["app_id"] == payload["app_id"] - assert response_data["status"] == EvaluationStatusEnum.EVALUATION_INITIALIZED + assert response_data["status"] == EvaluationStatusEnum.EVALUATION_STARTED assert response_data is not None and isinstance(response_data, Evaluation) @@ -163,7 +163,7 @@ async def 
test_fetch_evaluation_status(): if response_data["status"] == EvaluationStatusEnum.EVALUATION_FINISHED: assert True return - asyncio.sleep(intervals) + await asyncio.sleep(intervals) assert ( False From 04b88f79b1fe7b34964bb507ac89437cc72a4fb8 Mon Sep 17 00:00:00 2001 From: Abram Date: Wed, 27 Dec 2023 22:53:14 +0100 Subject: [PATCH 128/414] Update - created fixture to create app from template --- .../variants_evaluators_router/conftest.py | 44 +++++++++++-------- 1 file changed, 26 insertions(+), 18 deletions(-) diff --git a/agenta-backend/agenta_backend/tests/variants_evaluators_router/conftest.py b/agenta-backend/agenta_backend/tests/variants_evaluators_router/conftest.py index ec87a7cfcc..8b952b5d54 100644 --- a/agenta-backend/agenta_backend/tests/variants_evaluators_router/conftest.py +++ b/agenta-backend/agenta_backend/tests/variants_evaluators_router/conftest.py @@ -52,7 +52,11 @@ def ensure_frontend_reachable(): @pytest.fixture() async def fetch_app(): apps = await engine.find(AppDB) - return {"app_id": str(apps[0].id)} + return { + "app_id": str(apps[0].id), + "app_name": apps[0].app_name, + "org_id": str(apps[0].user.organizations[0]), + } @pytest.fixture() @@ -65,17 +69,34 @@ async def fetch_app_variant(fetch_app): @pytest.fixture() -async def prepare_testset_csvdata(fetch_app_variant): - app_variant = await fetch_app_variant +async def create_app_from_template(fetch_app, fetch_single_prompt_template): + app = await fetch_app + payload = { + "app_name": app["app_name"], + "template_id": fetch_single_prompt_template["id"], + "env_vars": {"OPENAI_API_KEY": OPEN_AI_KEY}, + "organization_id": app["org_id"], + } + print("Payload: ", payload) + response = httpx.post( + f"{BACKEND_URI}apps/app_and_variant_from_template/", json=payload + ) + return response.json() + + +@pytest.fixture() +async def prepare_testset_csvdata(create_app_from_template): + app_variant = await create_app_from_template + print("AppV: ", app_variant) app_db = await engine.find_one(AppDB, AppDB.id == ObjectId(app_variant["app_id"])) org_db = await engine.find_one( - OrganizationDB, OrganizationDB.id == ObjectId(app_db.user.organizations[0]) + OrganizationDB, OrganizationDB.id == ObjectId(app_variant["organization_id"]) ) json_path = os.path.join( PARENT_DIRECTORY, "resources", "default_testsets", - "evaluation_testset.json", + "chat_openai_testset.json", ) csvdata = get_json(json_path) @@ -94,19 +115,6 @@ async def prepare_testset_csvdata(fetch_app_variant): } -@pytest.fixture() -async def create_app_from_template(fetch_app, fetch_single_prompt_template): - payload = { - "app_name": fetch_app["app_name"], - "template_id": fetch_single_prompt_template["id"], - "env_vars": {"OPENAI_API_KEY": OPEN_AI_KEY}, - } - response = httpx.post( - f"{BACKEND_URI}/apps/app_and_variant_from_template/", json=payload - ) - return response.json() - - @pytest.fixture() async def auto_exact_match_evaluator_config(fetch_app): app = await fetch_app From 037b071a0297d42d3fb9df4e7ad479c7a915f4d9 Mon Sep 17 00:00:00 2001 From: Akrem Abayed Date: Wed, 27 Dec 2023 23:07:34 +0100 Subject: [PATCH 129/414] add results to evaluation scenarios and fix response --- .../models/api/evaluation_model.py | 8 +++++-- .../agenta_backend/models/converters.py | 22 ++++++++++++++----- .../agenta_backend/models/db_models.py | 6 ++--- 3 files changed, 26 insertions(+), 10 deletions(-) diff --git a/agenta-backend/agenta_backend/models/api/evaluation_model.py b/agenta-backend/agenta_backend/models/api/evaluation_model.py index c65776354e..3a6dbd0019 100644 --- 
a/agenta-backend/agenta_backend/models/api/evaluation_model.py +++ b/agenta-backend/agenta_backend/models/api/evaluation_model.py @@ -75,6 +75,11 @@ class EvaluationUpdate(BaseModel): evaluation_type_settings: Optional[EvaluationTypeSettings] +class EvaluationScenarioResult(BaseModel): + evaluator_config: str + result: Result + + class EvaluationScenarioInput(BaseModel): input_name: str input_value: str @@ -90,12 +95,11 @@ class EvaluationScenario(BaseModel): evaluation_id: str inputs: List[EvaluationScenarioInput] outputs: List[EvaluationScenarioOutput] - vote: Optional[str] - score: Optional[Union[str, int]] evaluation: Optional[str] correct_answer: Optional[str] is_pinned: Optional[bool] note: Optional[str] + results: List[EvaluationScenarioResult] class AICritiqueCreate(BaseModel): diff --git a/agenta-backend/agenta_backend/models/converters.py b/agenta-backend/agenta_backend/models/converters.py index d99c9c914b..a1e51dbd61 100644 --- a/agenta-backend/agenta_backend/models/converters.py +++ b/agenta-backend/agenta_backend/models/converters.py @@ -5,6 +5,7 @@ from agenta_backend.models.api.user_models import User from agenta_backend.models.db_models import ( AppVariantDB, + EvaluationScenarioResult, EvaluatorConfigDB, ImageDB, TemplateDB, @@ -91,9 +92,19 @@ async def evaluation_db_to_pydantic( def aggregated_result_to_pydantic(results: List[AggregatedResult]) -> List[dict]: return [ { - "evaluator_config": str( - result.evaluator_config - ), + "evaluator_config": str(result.evaluator_config), + "result": result.result.dict(), + } + for result in results + ] + + +def evaluation_scenarios_results_to_pydantic( + results: List[EvaluationScenarioResult], +) -> List[dict]: + return [ + { + "evaluator_config": str(result.evaluator_config), "result": result.result.dict(), } for result in results @@ -108,11 +119,12 @@ def evaluation_scenario_db_to_pydantic( evaluation_id=str(evaluation_scenario_db.evaluation.id), inputs=evaluation_scenario_db.inputs, outputs=evaluation_scenario_db.outputs, - vote="", - score=evaluation_scenario_db.score, correct_answer=evaluation_scenario_db.correct_answer, is_pinned=evaluation_scenario_db.is_pinned or False, note=evaluation_scenario_db.note or "", + results=evaluation_scenarios_results_to_pydantic( + evaluation_scenario_db.results + ), ) diff --git a/agenta-backend/agenta_backend/models/db_models.py b/agenta-backend/agenta_backend/models/db_models.py index 91f08bb627..66a1d34dcf 100644 --- a/agenta-backend/agenta_backend/models/db_models.py +++ b/agenta-backend/agenta_backend/models/db_models.py @@ -265,9 +265,9 @@ class Config: class EvaluationScenarioDB(Model): - user: UserDB = Reference() - organization: OrganizationDB = Reference() - evaluation: EvaluationDB = Reference() + user: UserDB = Reference(key_name="user") + organization: OrganizationDB = Reference(key_name="organization") + evaluation: EvaluationDB = Reference(key_name="evaluations") inputs: List[EvaluationScenarioInputDB] outputs: List[EvaluationScenarioOutputDB] correct_answer: Optional[str] From 84bb5b960ee59efd7e2a8014e83f490c347eaeb9 Mon Sep 17 00:00:00 2001 From: MohammedMaaz Date: Thu, 28 Dec 2023 03:30:37 +0500 Subject: [PATCH 130/414] duration counter and ui --- .../evaluationResults/EvaluationResults.tsx | 30 ++++++++++++------- .../evaluationResults/NewEvaluationModal.tsx | 2 +- .../EvaluationScenarios.tsx | 2 +- .../evaluations/evaluators/EvaluatorCard.tsx | 2 +- .../evaluators/NewEvaluatorModal.tsx | 2 +- agenta-web/src/hooks/useDurationCounter.ts | 17 +++++++++++ 
agenta-web/src/lib/helpers/utils.ts | 21 +++++++++++++ agenta-web/src/services/evaluations/index.ts | 10 +++++-- 8 files changed, 69 insertions(+), 17 deletions(-) create mode 100644 agenta-web/src/hooks/useDurationCounter.ts diff --git a/agenta-web/src/components/pages/evaluations/evaluationResults/EvaluationResults.tsx b/agenta-web/src/components/pages/evaluations/evaluationResults/EvaluationResults.tsx index 53bb7f6fd3..ac3ccb169a 100644 --- a/agenta-web/src/components/pages/evaluations/evaluationResults/EvaluationResults.tsx +++ b/agenta-web/src/components/pages/evaluations/evaluationResults/EvaluationResults.tsx @@ -15,8 +15,9 @@ import {useAppId} from "@/hooks/useAppId" import {deleteEvaluations, fetchAllEvaluations, fetchEvaluationStatus} from "@/services/evaluations" import {useRouter} from "next/router" import {useUpdateEffect} from "usehooks-ts" -import {shortPoll} from "@/lib/helpers/utils" +import {durationToStr, shortPoll} from "@/lib/helpers/utils" import AlertPopup from "@/components/AlertPopup/AlertPopup" +import {useDurationCounter} from "@/hooks/useDurationCounter" dayjs.extend(relativeTime) dayjs.extend(duration) @@ -28,7 +29,7 @@ const useStyles = createUseStyles((theme: JSSTheme) => ({ }, table: { width: "100%", - height: 500, + height: "calc(100vh - 260px)", }, buttonsGroup: { alignSelf: "flex-end", @@ -50,11 +51,11 @@ const useStyles = createUseStyles((theme: JSSTheme) => ({ height: 3, aspectRatio: 1 / 1, borderRadius: "50%", - backgroundColor: theme.colorTextSecondary, + backgroundColor: "#8c8c8c", marginTop: 2, }, date: { - color: theme.colorTextSecondary, + color: "#8c8c8c", }, })) @@ -168,20 +169,24 @@ const EvaluationResults: React.FC = () => { const colDefs = useMemo(() => { const colDefs: ColDef<_Evaluation>[] = [ { + minWidth: 280, field: "id", + flex: 1, headerCheckboxSelection: true, checkboxSelection: true, showDisabledCheckboxes: true, }, - {field: "testset.name"}, + {field: "testset.name", flex: 1}, { field: "variants", + flex: 1, valueGetter: (params) => params.data?.variants[0].variantName, headerName: "Variant", }, ...evaluatorConfigs.map( (config) => ({ + flex: 1, field: "aggregated_results", headerComponent: () => ( @@ -195,9 +200,17 @@ const EvaluationResults: React.FC = () => { }) as ColDef<_Evaluation>, ), { + flex: 1, field: "status", + minWidth: 220, cellRenderer: (params: ICellRendererParams<_Evaluation>) => { const classes = useStyles() + const duration = useDurationCounter( + params.data?.duration || 0, + [EvaluationStatus.STARTED, EvaluationStatus.INITIALIZED].includes( + params.value, + ), + ) const {label, color} = statusMapper(token)[params.value as EvaluationStatus] return ( @@ -205,16 +218,13 @@ const EvaluationResults: React.FC = () => {
{label} - - {dayjs - .duration(params.data?.duration || 0, "milliseconds") - .humanize()} - + {duration} ) }, }, { + flex: 1, field: "created_at", headerName: "Created", valueFormatter: (params) => dayjs(params.value).fromNow(), diff --git a/agenta-web/src/components/pages/evaluations/evaluationResults/NewEvaluationModal.tsx b/agenta-web/src/components/pages/evaluations/evaluationResults/NewEvaluationModal.tsx index ecfc5b6d32..2757e27eb8 100644 --- a/agenta-web/src/components/pages/evaluations/evaluationResults/NewEvaluationModal.tsx +++ b/agenta-web/src/components/pages/evaluations/evaluationResults/NewEvaluationModal.tsx @@ -40,7 +40,7 @@ const useStyles = createUseStyles((theme: JSSTheme) => ({ }, date: { fontSize: "0.75rem", - color: "#888", + color: "#8c8c8c", }, tag: { transform: "scale(0.8)", diff --git a/agenta-web/src/components/pages/evaluations/evaluationScenarios/EvaluationScenarios.tsx b/agenta-web/src/components/pages/evaluations/evaluationScenarios/EvaluationScenarios.tsx index 96a3e971e0..05c85fa6bc 100644 --- a/agenta-web/src/components/pages/evaluations/evaluationScenarios/EvaluationScenarios.tsx +++ b/agenta-web/src/components/pages/evaluations/evaluationScenarios/EvaluationScenarios.tsx @@ -30,7 +30,7 @@ const useStyles = createUseStyles((theme: JSSTheme) => ({ }, date: { fontSize: "0.75rem", - color: theme.colorTextSecondary, + color: "#8c8c8c", display: "inline-block", marginBottom: "1rem", }, diff --git a/agenta-web/src/components/pages/evaluations/evaluators/EvaluatorCard.tsx b/agenta-web/src/components/pages/evaluations/evaluators/EvaluatorCard.tsx index ecf21b21b3..8b6357d3cb 100644 --- a/agenta-web/src/components/pages/evaluations/evaluators/EvaluatorCard.tsx +++ b/agenta-web/src/components/pages/evaluations/evaluators/EvaluatorCard.tsx @@ -42,7 +42,7 @@ const useStyles = createUseStyles((theme: JSSTheme) => ({ }, date: { fontSize: "0.75rem", - color: theme.colorTextSecondary, + color: "#8c8c8c", }, })) diff --git a/agenta-web/src/components/pages/evaluations/evaluators/NewEvaluatorModal.tsx b/agenta-web/src/components/pages/evaluations/evaluators/NewEvaluatorModal.tsx index 3e94717b3c..ac66535940 100644 --- a/agenta-web/src/components/pages/evaluations/evaluators/NewEvaluatorModal.tsx +++ b/agenta-web/src/components/pages/evaluations/evaluators/NewEvaluatorModal.tsx @@ -154,7 +154,7 @@ const NewEvaluatorModal: React.FC = ({ : , loading: submitLoading, diff --git a/agenta-web/src/hooks/useDurationCounter.ts b/agenta-web/src/hooks/useDurationCounter.ts new file mode 100644 index 0000000000..ab884339d6 --- /dev/null +++ b/agenta-web/src/hooks/useDurationCounter.ts @@ -0,0 +1,17 @@ +import {durationToStr} from "@/lib/helpers/utils" +import {useEffect, useState} from "react" + +export const useDurationCounter = (duration: number, isRunning: boolean = true) => { + const [elapsed, setElapsed] = useState(duration) + + useEffect(() => { + if (isRunning) { + const interval = setInterval(() => { + setElapsed((prev) => prev + 100) + }, 100) + return () => clearInterval(interval) + } + }, [isRunning]) + + return durationToStr(elapsed) +} diff --git a/agenta-web/src/lib/helpers/utils.ts b/agenta-web/src/lib/helpers/utils.ts index 65f26eb76f..093d0d4ef0 100644 --- a/agenta-web/src/lib/helpers/utils.ts +++ b/agenta-web/src/lib/helpers/utils.ts @@ -3,6 +3,7 @@ import {EvaluationType} from "../enums" import {GenericObject} from "../Types" import promiseRetry from "promise-retry" import {getErrorMessage} from "./errorHandler" +import dayjs from "dayjs" const 
llmAvailableProvidersToken = "llmAvailableProvidersToken" @@ -338,3 +339,23 @@ export function pickRandom(arr: T[], len: number) { return result } + +export function durationToStr(duration: number) { + const days = Math.floor(dayjs.duration(duration, "milliseconds").asDays()) + const hours = Math.floor(dayjs.duration(duration, "milliseconds").asHours()) + const mins = Math.floor(dayjs.duration(duration, "milliseconds").asMinutes()) + const secs = Math.floor(dayjs.duration(duration, "milliseconds").asSeconds()) + + if (days > 0) return `${days} days` + if (hours > 0) return `${hours} hours` + if (mins > 0) return `${mins} mins` + return `${secs} seconds` +} + +type DayjsDate = Parameters[0] +export function getDurationStr(date1: DayjsDate, date2: DayjsDate) { + const d1 = dayjs(date1) + const d2 = dayjs(date2) + + return durationToStr(d2.diff(d1, "milliseconds")) +} diff --git a/agenta-web/src/services/evaluations/index.ts b/agenta-web/src/services/evaluations/index.ts index 7d367c59a2..f3a399c1d5 100644 --- a/agenta-web/src/services/evaluations/index.ts +++ b/agenta-web/src/services/evaluations/index.ts @@ -62,11 +62,11 @@ export const updateEvaluatorConfig = async ( configId: string, config: Partial, ) => { - return axios.put(`/api/evaluators/configs/${configId}`, config) + return axios.put(`/api/evaluators/configs/${configId}/`, config) } export const deleteEvaluatorConfig = async (configId: string) => { - return axios.delete(`/api/evaluators/configs/${configId}`) + return axios.delete(`/api/evaluators/configs/${configId}/`) } // Evaluations @@ -75,7 +75,11 @@ const evaluationTransformer = (item: any) => ({ appId: item.app_id, created_at: item.created_at, updated_at: item.updated_at, - duration: dayjs(item.updated_at).diff(dayjs(item.created_at), "milliseconds"), + duration: dayjs( + [EvaluationStatus.STARTED, EvaluationStatus.INITIALIZED].includes(item.status) + ? 
Date.now() + : item.updated_at, + ).diff(dayjs(item.created_at), "milliseconds"), status: item.status, testset: { id: item.testset_id, From 31c58232f8abffc2846b70139e1559ed44b70e9a Mon Sep 17 00:00:00 2001 From: Akrem Abayed Date: Thu, 28 Dec 2023 08:48:00 +0100 Subject: [PATCH 131/414] include aggregated results for ai critique --- .../agenta_backend/tasks/evaluations.py | 27 +++++++++---------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/agenta-backend/agenta_backend/tasks/evaluations.py b/agenta-backend/agenta_backend/tasks/evaluations.py index 88905dd375..89b41374ef 100644 --- a/agenta-backend/agenta_backend/tasks/evaluations.py +++ b/agenta-backend/agenta_backend/tasks/evaluations.py @@ -136,18 +136,17 @@ async def aggregate_evaluator_results( ) -> List[AggregatedResult]: aggregated_results = [] for evaluator_key, results in evaluators_aggregated_data.items(): - if evaluator_key != "auto_ai_critique": - average_value = ( - sum([result.value for result in results]) / len(results) - if results - else 0 - ) - evaluator_config = await fetch_evaluator_config_by_appId( - app.id, evaluator_key - ) - aggregated_result = AggregatedResult( - evaluator_config=evaluator_config.id, - result=Result(type="number", value=average_value), - ) - aggregated_results.append(aggregated_result) + average_value = ( + sum([result.value for result in results]) / len(results) + if results + else 0 + ) + evaluator_config = await fetch_evaluator_config_by_appId( + app.id, evaluator_key + ) + aggregated_result = AggregatedResult( + evaluator_config=evaluator_config.id, + result=Result(type="number", value=average_value), + ) + aggregated_results.append(aggregated_result) return aggregated_results From b67676811f0dfc48b20483564f31fa0a8b8cc8d7 Mon Sep 17 00:00:00 2001 From: Abram Date: Thu, 28 Dec 2023 09:51:35 +0100 Subject: [PATCH 132/414] Update - change value type to Any --- .../agenta_backend/models/api/evaluation_model.py | 9 +++++---- agenta-backend/agenta_backend/models/db_models.py | 2 +- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/agenta-backend/agenta_backend/models/api/evaluation_model.py b/agenta-backend/agenta_backend/models/api/evaluation_model.py index 3a6dbd0019..7c1b558ca4 100644 --- a/agenta-backend/agenta_backend/models/api/evaluation_model.py +++ b/agenta-backend/agenta_backend/models/api/evaluation_model.py @@ -81,13 +81,14 @@ class EvaluationScenarioResult(BaseModel): class EvaluationScenarioInput(BaseModel): - input_name: str - input_value: str + name: str + type: str + value: Any class EvaluationScenarioOutput(BaseModel): - variant_id: str - variant_output: str + type: str + value: Any class EvaluationScenario(BaseModel): diff --git a/agenta-backend/agenta_backend/models/db_models.py b/agenta-backend/agenta_backend/models/db_models.py index 66a1d34dcf..79bdfa2af7 100644 --- a/agenta-backend/agenta_backend/models/db_models.py +++ b/agenta-backend/agenta_backend/models/db_models.py @@ -245,7 +245,7 @@ class EvaluationScenarioInputDB(EmbeddedModel): class EvaluationScenarioOutputDB(EmbeddedModel): type: str - value: str + value: Any class EvaluationDB(Model): From 3a1b76bfb5a432c9d159769ca1d063eaf94b0941 Mon Sep 17 00:00:00 2001 From: Abram Date: Thu, 28 Dec 2023 09:53:46 +0100 Subject: [PATCH 133/414] Update - modified evaluation scenario db to pydantic converters --- agenta-backend/agenta_backend/models/converters.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/agenta-backend/agenta_backend/models/converters.py 
b/agenta-backend/agenta_backend/models/converters.py index a1e51dbd61..268c1e1c9c 100644 --- a/agenta-backend/agenta_backend/models/converters.py +++ b/agenta-backend/agenta_backend/models/converters.py @@ -42,6 +42,8 @@ EvaluationScenario, Evaluation, EvaluatorConfig, + EvaluationScenarioInput, + EvaluationScenarioOutput ) import logging @@ -117,8 +119,8 @@ def evaluation_scenario_db_to_pydantic( return EvaluationScenario( id=str(evaluation_scenario_db.id), evaluation_id=str(evaluation_scenario_db.evaluation.id), - inputs=evaluation_scenario_db.inputs, - outputs=evaluation_scenario_db.outputs, + inputs=[EvaluationScenarioInput(**scenario_input.dict()) for scenario_input in evaluation_scenario_db.inputs], + outputs=[EvaluationScenarioOutput(**scenario_output.dict()) for scenario_output in evaluation_scenario_db.outputs], correct_answer=evaluation_scenario_db.correct_answer, is_pinned=evaluation_scenario_db.is_pinned or False, note=evaluation_scenario_db.note or "", From bbffecf4cebc0c832faf7202b375b593d3883f23 Mon Sep 17 00:00:00 2001 From: Abram Date: Thu, 28 Dec 2023 09:55:01 +0100 Subject: [PATCH 134/414] :art: Format - ran black --- agenta-backend/agenta_backend/models/converters.py | 12 +++++++++--- agenta-backend/agenta_backend/tasks/evaluations.py | 8 ++------ 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/agenta-backend/agenta_backend/models/converters.py b/agenta-backend/agenta_backend/models/converters.py index 268c1e1c9c..8b8a8a1c7b 100644 --- a/agenta-backend/agenta_backend/models/converters.py +++ b/agenta-backend/agenta_backend/models/converters.py @@ -43,7 +43,7 @@ Evaluation, EvaluatorConfig, EvaluationScenarioInput, - EvaluationScenarioOutput + EvaluationScenarioOutput, ) import logging @@ -119,8 +119,14 @@ def evaluation_scenario_db_to_pydantic( return EvaluationScenario( id=str(evaluation_scenario_db.id), evaluation_id=str(evaluation_scenario_db.evaluation.id), - inputs=[EvaluationScenarioInput(**scenario_input.dict()) for scenario_input in evaluation_scenario_db.inputs], - outputs=[EvaluationScenarioOutput(**scenario_output.dict()) for scenario_output in evaluation_scenario_db.outputs], + inputs=[ + EvaluationScenarioInput(**scenario_input.dict()) + for scenario_input in evaluation_scenario_db.inputs + ], + outputs=[ + EvaluationScenarioOutput(**scenario_output.dict()) + for scenario_output in evaluation_scenario_db.outputs + ], correct_answer=evaluation_scenario_db.correct_answer, is_pinned=evaluation_scenario_db.is_pinned or False, note=evaluation_scenario_db.note or "", diff --git a/agenta-backend/agenta_backend/tasks/evaluations.py b/agenta-backend/agenta_backend/tasks/evaluations.py index 89b41374ef..35bea882fb 100644 --- a/agenta-backend/agenta_backend/tasks/evaluations.py +++ b/agenta-backend/agenta_backend/tasks/evaluations.py @@ -137,13 +137,9 @@ async def aggregate_evaluator_results( aggregated_results = [] for evaluator_key, results in evaluators_aggregated_data.items(): average_value = ( - sum([result.value for result in results]) / len(results) - if results - else 0 - ) - evaluator_config = await fetch_evaluator_config_by_appId( - app.id, evaluator_key + sum([result.value for result in results]) / len(results) if results else 0 ) + evaluator_config = await fetch_evaluator_config_by_appId(app.id, evaluator_key) aggregated_result = AggregatedResult( evaluator_config=evaluator_config.id, result=Result(type="number", value=average_value), From e9a346e5b26130c84639657a0ae9855fd6f86ae2 Mon Sep 17 00:00:00 2001 From: Akrem Abayed Date: Thu, 28 Dec 
2023 15:58:52 +0100 Subject: [PATCH 135/414] aggregated results contains evaluator config --- .../models/api/evaluation_model.py | 28 +++++++++---------- .../agenta_backend/models/converters.py | 23 +++++++++------ 2 files changed, 29 insertions(+), 22 deletions(-) diff --git a/agenta-backend/agenta_backend/models/api/evaluation_model.py b/agenta-backend/agenta_backend/models/api/evaluation_model.py index 7c1b558ca4..0d105b0805 100644 --- a/agenta-backend/agenta_backend/models/api/evaluation_model.py +++ b/agenta-backend/agenta_backend/models/api/evaluation_model.py @@ -4,6 +4,19 @@ from typing import Optional, List, Dict, Any, Union +class Evaluator(BaseModel): + name: str + key: str + settings_template: dict + + +class EvaluatorConfig(BaseModel): + id: str + name: str + evaluator_key: str + settings_values: Optional[Dict[str, Any]] + + class EvaluationTypeSettings(BaseModel): similarity_threshold: Optional[float] regex_pattern: Optional[str] @@ -43,7 +56,7 @@ class Result(BaseModel): class AggregatedResult(BaseModel): - evaluator_config: str + evaluator_config: EvaluatorConfig result: Result @@ -175,13 +188,6 @@ class EvaluationSettingsTemplate(BaseModel): description: str -class EvaluatorConfig(BaseModel): - id: str - name: str - evaluator_key: str - settings_values: Optional[Dict[str, Any]] - - class NewEvaluation(BaseModel): app_id: str variant_ids: List[str] @@ -189,12 +195,6 @@ class NewEvaluation(BaseModel): testset_id: str -class Evaluator(BaseModel): - name: str - key: str - settings_template: dict - - class NewEvaluatorConfig(BaseModel): app_id: str name: str diff --git a/agenta-backend/agenta_backend/models/converters.py b/agenta-backend/agenta_backend/models/converters.py index 8b8a8a1c7b..5701d9e2ce 100644 --- a/agenta-backend/agenta_backend/models/converters.py +++ b/agenta-backend/agenta_backend/models/converters.py @@ -83,7 +83,7 @@ async def evaluation_db_to_pydantic( variant_names=variant_names, testset_id=str(evaluation_db.testset.id), testset_name=evaluation_db.testset.name, - aggregated_results=aggregated_result_to_pydantic( + aggregated_results= await aggregated_result_to_pydantic( evaluation_db.aggregated_results ), created_at=evaluation_db.created_at, @@ -91,14 +91,21 @@ async def evaluation_db_to_pydantic( ) -def aggregated_result_to_pydantic(results: List[AggregatedResult]) -> List[dict]: - return [ - { - "evaluator_config": str(result.evaluator_config), +async def aggregated_result_to_pydantic(results: List[AggregatedResult]) -> List[dict]: + transformed_results = [] + for result in results: + evaluator_config_db = await db_manager.fetch_evaluator_config(str(result.evaluator_config)) + evaluator_config_dict = evaluator_config_db.dict() if evaluator_config_db else None + + if evaluator_config_dict: + evaluator_config_dict['id'] = str(evaluator_config_dict['id']) + + transformed_results.append({ + "evaluator_config": evaluator_config_dict, "result": result.result.dict(), - } - for result in results - ] + }) + + return transformed_results def evaluation_scenarios_results_to_pydantic( From 1acbc3b8f39ca6da210ae5e455aeb875bdf09969 Mon Sep 17 00:00:00 2001 From: Abram Date: Thu, 28 Dec 2023 16:21:15 +0100 Subject: [PATCH 136/414] Update - change httpexception to exception base --- agenta-backend/agenta_backend/routers/app_router.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/agenta-backend/agenta_backend/routers/app_router.py b/agenta-backend/agenta_backend/routers/app_router.py index b7d7e50cbf..bba060f7d6 100644 --- 
a/agenta-backend/agenta_backend/routers/app_router.py +++ b/agenta-backend/agenta_backend/routers/app_router.py @@ -339,9 +339,8 @@ async def create_app_and_variant_from_template( app_name, organization_id, **user_org_data ) if app is not None: - raise HTTPException( - status_code=400, - detail=f"App with name {app_name} already exists", + raise Exception( + f"App with name {app_name} already exists", ) logger.debug("Step 4: Creating new app and initializing environments") @@ -385,9 +384,7 @@ async def create_app_and_variant_from_template( logger.debug("Step 8: Starting variant and injecting environment variables") if os.environ["FEATURE_FLAG"] in ["cloud", "ee"]: if not os.environ["OPENAI_API_KEY"]: - raise HTTPException( - status_code=400, - detail="Unable to start app container. Please file an issue by clicking on the button below.", + raise Exception("Unable to start app container. Please file an issue by clicking on the button below.", ) envvars = { **(payload.env_vars or {}), From 0b8970e019d71e47912103ea9f86df31e299d236 Mon Sep 17 00:00:00 2001 From: Abram Date: Thu, 28 Dec 2023 16:21:41 +0100 Subject: [PATCH 137/414] Update - modified create evaluation router --- agenta-backend/agenta_backend/routers/evaluation_router.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/agenta-backend/agenta_backend/routers/evaluation_router.py b/agenta-backend/agenta_backend/routers/evaluation_router.py index 5cb4b3af4e..719b1bb5d5 100644 --- a/agenta-backend/agenta_backend/routers/evaluation_router.py +++ b/agenta-backend/agenta_backend/routers/evaluation_router.py @@ -91,7 +91,7 @@ async def create_evaluation( evaluators_configs=payload.evaluators_configs, ) if ( - payload.evaluators_configs.len == 1 + len(payload.evaluators_configs) == 1 and payload.evaluators_configs.evaluator_key in ["human_a_b_testing", "human_single_model_test"] ): From c48a2d023c07590f6c804ee12b37cc2b9b0ea6e9 Mon Sep 17 00:00:00 2001 From: Abram Date: Thu, 28 Dec 2023 16:24:02 +0100 Subject: [PATCH 138/414] Update - modified fixtures --- .../variants_evaluators_router/conftest.py | 115 +++++------------- 1 file changed, 33 insertions(+), 82 deletions(-) diff --git a/agenta-backend/agenta_backend/tests/variants_evaluators_router/conftest.py b/agenta-backend/agenta_backend/tests/variants_evaluators_router/conftest.py index 8b952b5d54..90ff918451 100644 --- a/agenta-backend/agenta_backend/tests/variants_evaluators_router/conftest.py +++ b/agenta-backend/agenta_backend/tests/variants_evaluators_router/conftest.py @@ -2,16 +2,11 @@ import httpx import pytest from pathlib import Path -from bson import ObjectId -from datetime import datetime from agenta_backend.models.db_engine import DBEngine -from agenta_backend.services.json_importer_helper import get_json from agenta_backend.models.db_models import ( - AppDB, - AppVariantDB, + UserDB, OrganizationDB, - TestSetDB, ) @@ -42,84 +37,44 @@ def fetch_single_prompt_template(fetch_templates): return fetch_templates[1] -@pytest.fixture(scope="session") -def ensure_frontend_reachable(): - response = httpx.get(f"{BASE_URI}apps/") - response.raise_for_status() - return response.text +@pytest.fixture() +async def fetch_user_organization(): + organization = await engine.find(OrganizationDB) + return {"org_id": str(organization[0].id)} @pytest.fixture() -async def fetch_app(): - apps = await engine.find(AppDB) +def app_from_template(): return { - "app_id": str(apps[0].id), - "app_name": apps[0].app_name, - "org_id": str(apps[0].user.organizations[0]), + "app_name": "string", 
+ "env_vars": {"OPENAI_API_KEY": OPEN_AI_KEY}, + "organization_id": "string", + "template_id": "string", } -@pytest.fixture() -async def fetch_app_variant(fetch_app): - app = await fetch_app - app_variant = await engine.find_one( - AppVariantDB, AppVariantDB.app == ObjectId(app["app_id"]) - ) - return {"variant_id": str(app_variant.id), "app_id": app["app_id"]} +@pytest.fixture(scope="session") +async def create_user_and_organization(): + user = await engine.find_one(UserDB, UserDB.uid == "0") + if user is None: + create_user = UserDB(uid="xxxx", username="evaluator") + await engine.save(create_user) + org = OrganizationDB(type="evaluator", owner=str(create_user.id)) + await engine.save(org) -@pytest.fixture() -async def create_app_from_template(fetch_app, fetch_single_prompt_template): - app = await fetch_app - payload = { - "app_name": app["app_name"], - "template_id": fetch_single_prompt_template["id"], - "env_vars": {"OPENAI_API_KEY": OPEN_AI_KEY}, - "organization_id": app["org_id"], - } - print("Payload: ", payload) - response = httpx.post( - f"{BACKEND_URI}apps/app_and_variant_from_template/", json=payload - ) - return response.json() - + create_user.organizations.append(org.id) + await engine.save(create_user) + await engine.save(org) -@pytest.fixture() -async def prepare_testset_csvdata(create_app_from_template): - app_variant = await create_app_from_template - print("AppV: ", app_variant) - app_db = await engine.find_one(AppDB, AppDB.id == ObjectId(app_variant["app_id"])) - org_db = await engine.find_one( - OrganizationDB, OrganizationDB.id == ObjectId(app_variant["organization_id"]) - ) - json_path = os.path.join( - PARENT_DIRECTORY, - "resources", - "default_testsets", - "chat_openai_testset.json", - ) - - csvdata = get_json(json_path) - testset = { - "name": f"{app_db.app_name}_testset", - "app_name": app_db.app_name, - "created_at": datetime.now().isoformat(), - "csvdata": csvdata, - } - testset_db = TestSetDB(**testset, app=app_db, user=app_db.user, organization=org_db) - await engine.save(testset_db) - return { - "testset_id": str(testset_db.id), - "variant_id": app_variant["variant_id"], - "app_id": app_variant["app_id"], - } + return create_user + return user @pytest.fixture() -async def auto_exact_match_evaluator_config(fetch_app): - app = await fetch_app +def auto_exact_match_evaluator_config(): return { - "app_id": app["app_id"], + "app_id": "string", "name": "ExactMatchEvaluator", "evaluator_key": "auto_exact_match", "settings_values": {}, @@ -127,10 +82,9 @@ async def auto_exact_match_evaluator_config(fetch_app): @pytest.fixture() -async def auto_similarity_match_evaluator_config(fetch_app): - app = await fetch_app +def auto_similarity_match_evaluator_config(): return { - "app_id": app["app_id"], + "app_id": "string", "name": "SimilarityMatchEvaluator", "evaluator_key": "auto_similarity_match", "settings_values": {"similarity_threshold": 0.3}, @@ -138,10 +92,9 @@ async def auto_similarity_match_evaluator_config(fetch_app): @pytest.fixture() -async def auto_regex_test_evaluator_config(fetch_app): - app = await fetch_app +def auto_regex_test_evaluator_config(): return { - "app_id": app["app_id"], + "app_id": "string", "name": "RegexEvaluator", "evaluator_key": "auto_regex_test", "settings_values": { @@ -152,10 +105,9 @@ async def auto_regex_test_evaluator_config(fetch_app): @pytest.fixture() -async def auto_webhook_test_evaluator_config(fetch_app): - app = await fetch_app +def auto_webhook_test_evaluator_config(): return { - "app_id": app["app_id"], + "app_id": 
"string", "name": "WebhookEvaluator", "evaluator_key": "auto_webhook_test", "settings_values": { @@ -166,10 +118,9 @@ async def auto_webhook_test_evaluator_config(fetch_app): @pytest.fixture() -async def auto_ai_critique_evaluator_config(fetch_app): - app = await fetch_app +def auto_ai_critique_evaluator_config(): return { - "app_id": app["app_id"], + "app_id": "string", "name": "AICritique_Evaluator", "evaluator_key": "auto_ai_critique", "settings_values": { From 425abb82da43128bb3bc68ef1412ae1fab40b1d5 Mon Sep 17 00:00:00 2001 From: Abram Date: Thu, 28 Dec 2023 16:24:23 +0100 Subject: [PATCH 139/414] Update - modified test cases for app variant and evaluators router --- .../test_evaluators_router.py | 93 +++++++++++++------ .../test_app_variant_router.py | 2 +- 2 files changed, 67 insertions(+), 28 deletions(-) diff --git a/agenta-backend/agenta_backend/tests/variants_evaluators_router/test_evaluators_router.py b/agenta-backend/agenta_backend/tests/variants_evaluators_router/test_evaluators_router.py index 2ad7074ce4..e5fd0c7909 100644 --- a/agenta-backend/agenta_backend/tests/variants_evaluators_router/test_evaluators_router.py +++ b/agenta-backend/agenta_backend/tests/variants_evaluators_router/test_evaluators_router.py @@ -1,10 +1,9 @@ import httpx import pytest import asyncio - from agenta_backend.models.db_engine import DBEngine -from agenta_backend.models.db_models import EvaluationDB -from agenta_backend.models.api.evaluation_model import Evaluation, EvaluationStatusEnum +from agenta_backend.models.api.evaluation_model import EvaluationStatusEnum +from agenta_backend.models.db_models import EvaluationDB, AppDB, TestSetDB, AppVariantDB # Initialize database engine @@ -15,9 +14,26 @@ timeout = httpx.Timeout(timeout=5, read=None, write=5) # Set global variables +APP_NAME = "evaluation_in_backend" BACKEND_API_HOST = "http://host.docker.internal/api" +@pytest.mark.asyncio +async def test_create_app_from_template( + app_from_template, create_user_and_organization, fetch_single_prompt_template +): + user = await create_user_and_organization + payload = app_from_template + payload["app_name"] = APP_NAME + payload["organization_id"] = str(user.organizations[0]) + payload["template_id"] = fetch_single_prompt_template["id"] + + response = httpx.post( + f"{BACKEND_API_HOST}/apps/app_and_variant_from_template/", json=payload + ) + assert response.status_code == 200 + + @pytest.mark.asyncio async def test_get_evaluators_endpoint(): response = await test_client.get( @@ -32,7 +48,10 @@ async def test_get_evaluators_endpoint(): async def test_create_auto_exact_match_evaluator_config( auto_exact_match_evaluator_config, ): - payload = await auto_exact_match_evaluator_config + app = await engine.find_one(AppDB, AppDB.app_name == APP_NAME) + payload = auto_exact_match_evaluator_config + payload["app_id"] = str(app.id) + response = await test_client.post( f"{BACKEND_API_HOST}/evaluators/configs/", json=payload, timeout=timeout ) @@ -45,7 +64,10 @@ async def test_create_auto_exact_match_evaluator_config( async def test_create_auto_similarity_match_evaluator_config( auto_similarity_match_evaluator_config, ): - payload = await auto_similarity_match_evaluator_config + app = await engine.find_one(AppDB, AppDB.app_name == APP_NAME) + payload = auto_similarity_match_evaluator_config + payload["app_id"] = str(app.id) + response = await test_client.post( f"{BACKEND_API_HOST}/evaluators/configs/", json=payload, timeout=timeout ) @@ -58,8 +80,11 @@ async def test_create_auto_similarity_match_evaluator_config( 
async def test_create_auto_regex_test_evaluator_config( auto_regex_test_evaluator_config, ): - payload = await auto_regex_test_evaluator_config - payload["settings_values"]["regex_pattern"] = "^Nigeria\\d{3}$" + app = await engine.find_one(AppDB, AppDB.app_name == APP_NAME) + payload = auto_regex_test_evaluator_config + payload["app_id"] = str(app.id) + payload["settings_values"]["regex_pattern"] = "^ig\\d{3}$" + response = await test_client.post( f"{BACKEND_API_HOST}/evaluators/configs/", json=payload, timeout=timeout ) @@ -72,7 +97,10 @@ async def test_create_auto_regex_test_evaluator_config( async def test_create_auto_webhook_test_evaluator_config( auto_webhook_test_evaluator_config, ): - payload = await auto_webhook_test_evaluator_config + app = await engine.find_one(AppDB, AppDB.app_name == APP_NAME) + payload = auto_webhook_test_evaluator_config + payload["app_id"] = str(app.id) + response = await test_client.post( f"{BACKEND_API_HOST}/evaluators/configs/", json=payload, timeout=timeout ) @@ -85,7 +113,10 @@ async def test_create_auto_webhook_test_evaluator_config( async def test_create_auto_ai_critique_evaluator_config( auto_ai_critique_evaluator_config, ): - payload = await auto_ai_critique_evaluator_config + app = await engine.find_one(AppDB, AppDB.app_name == APP_NAME) + payload = auto_ai_critique_evaluator_config + payload["app_id"] = str(app.id) + response = await test_client.post( f"{BACKEND_API_HOST}/evaluators/configs/", json=payload, timeout=timeout ) @@ -95,10 +126,10 @@ async def test_create_auto_ai_critique_evaluator_config( @pytest.mark.asyncio -async def test_get_evaluator_configs(fetch_app): - app = await fetch_app +async def test_get_evaluator_configs(): + app = await engine.find_one(AppDB, AppDB.app_name == APP_NAME) response = await test_client.get( - f"{BACKEND_API_HOST}/evaluators/configs/?app_id={app['app_id']}", + f"{BACKEND_API_HOST}/evaluators/configs/?app_id={str(app.id)}", timeout=timeout, ) assert response.status_code == 200 @@ -106,23 +137,23 @@ async def test_get_evaluator_configs(fetch_app): @pytest.mark.asyncio -async def test_create_evaluation(prepare_testset_csvdata): - # Fetch app variant and testset - testset = await prepare_testset_csvdata +async def test_create_evaluation(): + # Fetch app, app_variant and testset + app = await engine.find_one(AppDB, AppDB.app_name == APP_NAME) + app_variant = await engine.find_one(AppVariantDB, AppVariantDB.app == app.id) + testset = await engine.find_one(TestSetDB, TestSetDB.app == app.id) # Prepare payload payload = { - "app_id": testset["app_id"], - "variant_ids": [ - testset["variant_id"], - ], + "app_id": str(app.id), + "variant_ids": [str(app_variant.id)], "evaluators_configs": [], - "testset_id": "" + "testset_id": str(testset.id), } # Fetch evaluator configs response = await test_client.get( - f"{BACKEND_API_HOST}/evaluators/configs/?app_id={testset['app_id']}", + f"{BACKEND_API_HOST}/evaluators/configs/?app_id={payload['app_id']}", timeout=timeout, ) list_of_configs_ids = [] @@ -130,20 +161,21 @@ async def test_create_evaluation(prepare_testset_csvdata): for evaluator_config in evaluator_configs: list_of_configs_ids.append(evaluator_config["id"]) - # Update payload with list of configs ids and testset id + # Update payload with list of configs ids payload["evaluators_configs"] = list_of_configs_ids - payload["testset_id"] = testset["testset_id"] + print("Payload: ", payload) # Make request to create evaluation response = await test_client.post( f"{BACKEND_API_HOST}/evaluations/", json=payload, 
timeout=timeout ) response_data = response.json() + print("RD: ", response_data) assert response.status_code == 200 assert response_data["app_id"] == payload["app_id"] assert response_data["status"] == EvaluationStatusEnum.EVALUATION_STARTED - assert response_data is not None and isinstance(response_data, Evaluation) + assert response_data is not None @pytest.mark.asyncio @@ -186,10 +218,10 @@ async def test_fetch_evaluation_results(): @pytest.mark.asyncio -async def test_delete_evaluator_config(fetch_app): - app = await fetch_app +async def test_delete_evaluator_config(): + app = await engine.find_one(AppDB, AppDB.app_name == APP_NAME) response = await test_client.get( - f"{BACKEND_API_HOST}/evaluators/configs/?app_id={app['app_id']}", + f"{BACKEND_API_HOST}/evaluators/configs/?app_id={str(app.id)}", timeout=timeout, ) list_of_deleted_configs = [] @@ -203,3 +235,10 @@ async def test_delete_evaluator_config(fetch_app): count_of_deleted_configs = sum(list_of_deleted_configs) assert len(evaluator_configs) == count_of_deleted_configs + + +# @pytest.mark.asyncio +# async def remove_running_template_app_container(): +# app = await engine.find_one(AppDB, AppDB.app_name == APP_NAME) +# container_name = f"{app.app_name}-app-{str(app.id)}" +# assert True diff --git a/agenta-backend/agenta_backend/tests/variants_router/test_app_variant_router.py b/agenta-backend/agenta_backend/tests/variants_router/test_app_variant_router.py index 64b4ae533b..86baaca8c5 100644 --- a/agenta-backend/agenta_backend/tests/variants_router/test_app_variant_router.py +++ b/agenta-backend/agenta_backend/tests/variants_router/test_app_variant_router.py @@ -54,7 +54,7 @@ async def test_list_apps(): response = await test_client.get(f"{BACKEND_API_HOST}/apps/") assert response.status_code == 200 - assert len(response.json()) == 2 + assert len(response.json()) == 3 @pytest.mark.asyncio From 4ee18b482f2a4b757d22273d9a685a4e2d6a46d7 Mon Sep 17 00:00:00 2001 From: Abram Date: Thu, 28 Dec 2023 16:28:21 +0100 Subject: [PATCH 140/414] Update - modified conftest --- .../agenta_backend/tests/variants_evaluators_router/conftest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/agenta-backend/agenta_backend/tests/variants_evaluators_router/conftest.py b/agenta-backend/agenta_backend/tests/variants_evaluators_router/conftest.py index 90ff918451..d78a72d80a 100644 --- a/agenta-backend/agenta_backend/tests/variants_evaluators_router/conftest.py +++ b/agenta-backend/agenta_backend/tests/variants_evaluators_router/conftest.py @@ -17,7 +17,7 @@ BASE_URI = "http://host.docker.internal/" BACKEND_URI = BASE_URI + "api/" PARENT_DIRECTORY = Path(os.path.dirname(__file__)).parent.parent -OPEN_AI_KEY = "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" +OPEN_AI_KEY = "sk-xxxxxx" @pytest.fixture(scope="session") From f56d062628efba555f133bdd6f1cfa5c7ac98c74 Mon Sep 17 00:00:00 2001 From: Abram Date: Thu, 28 Dec 2023 19:16:23 +0100 Subject: [PATCH 141/414] Update - read open ai key from env --- .../tests/variants_evaluators_router/conftest.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/agenta-backend/agenta_backend/tests/variants_evaluators_router/conftest.py b/agenta-backend/agenta_backend/tests/variants_evaluators_router/conftest.py index d78a72d80a..0bdcaf101d 100644 --- a/agenta-backend/agenta_backend/tests/variants_evaluators_router/conftest.py +++ b/agenta-backend/agenta_backend/tests/variants_evaluators_router/conftest.py @@ -1,7 +1,6 @@ import os import httpx import pytest -from pathlib import 
Path from agenta_backend.models.db_engine import DBEngine from agenta_backend.models.db_models import ( @@ -14,10 +13,8 @@ engine = DBEngine().engine() # Set global variables -BASE_URI = "http://host.docker.internal/" -BACKEND_URI = BASE_URI + "api/" -PARENT_DIRECTORY = Path(os.path.dirname(__file__)).parent.parent -OPEN_AI_KEY = "sk-xxxxxx" +OPEN_AI_KEY = os.environ.get("OPEN_AI_KEY") +BACKEND_URI = "http://host.docker.internal/api/" @pytest.fixture(scope="session") From 57e019f853a44b2cacbe40499a1769d27117c500 Mon Sep 17 00:00:00 2001 From: Abram Date: Thu, 28 Dec 2023 19:16:58 +0100 Subject: [PATCH 142/414] Update - modified run-backend-tests workflow --- .github/workflows/run-backend-tests.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/run-backend-tests.yml b/.github/workflows/run-backend-tests.yml index 5a5f2b78e3..c3c99cdae5 100644 --- a/.github/workflows/run-backend-tests.yml +++ b/.github/workflows/run-backend-tests.yml @@ -1,6 +1,8 @@ name: Run Backend tests on: [pull_request] +env: + OPENAI_API_KEY: ${{ secrets.NEXT_PUBLIC_OPENAI_API_KEY }} jobs: continous_integration: @@ -9,6 +11,8 @@ jobs: - uses: actions/checkout@v3 - name: Start Docker Compose + env: + OPEN_AI_KEY: ${{ secrets.NEXT_PUBLIC_OPENAI_API_KEY }} run: docker-compose -f "docker-compose.test.yml" up -d --build - name: Install Curl From 579455c82d1009e3c562cabc2b732faf5bd73282 Mon Sep 17 00:00:00 2001 From: Abram Date: Thu, 28 Dec 2023 19:48:05 +0100 Subject: [PATCH 143/414] Update - refactor variants router test conftest fixtures --- .../tests/variants_router/conftest.py | 76 +++++++------------ 1 file changed, 28 insertions(+), 48 deletions(-) diff --git a/agenta-backend/agenta_backend/tests/variants_router/conftest.py b/agenta-backend/agenta_backend/tests/variants_router/conftest.py index f7c8d817aa..8b7917b7c0 100644 --- a/agenta-backend/agenta_backend/tests/variants_router/conftest.py +++ b/agenta-backend/agenta_backend/tests/variants_router/conftest.py @@ -21,71 +21,51 @@ logger.setLevel(logging.DEBUG) -@pytest.fixture(scope="function") +@pytest.fixture() async def get_first_user_object(): """Get the user object from the database or create a new one if not found.""" - try: - user = await engine.find_one(UserDB, UserDB.uid == "0") - if user is None: - create_user = UserDB(uid="0") - await engine.save(create_user) + user = await engine.find_one(UserDB, UserDB.uid == "0") + if user is None: + create_user = UserDB(uid="0") + await engine.save(create_user) - org = OrganizationDB(type="default", owner=str(create_user.id)) - await engine.save(org) + org = OrganizationDB(type="default", owner=str(create_user.id)) + await engine.save(org) - create_user.organizations.append(org.id) - await engine.save(create_user) - await engine.save(org) + create_user.organizations.append(org.id) + await engine.save(create_user) + await engine.save(org) - return create_user - else: - return user - except Exception as e: - pytest.fail(f"Failed to get or create the first user: {e}") + return create_user + return user -@pytest.fixture(scope="function") +@pytest.fixture() async def get_second_user_object(): """Create a second user object.""" - try: - user = await engine.find_one(UserDB, UserDB.uid == "1") - if user is None: - create_user = UserDB( - uid="1", username="test_user1", email="test_user1@email.com" - ) - await engine.save(create_user) - - org = OrganizationDB(type="default", owner=str(create_user.id)) - await engine.save(org) + user = await engine.find_one(UserDB, UserDB.uid == "1") + if user is None: + 
create_user = UserDB( + uid="1", username="test_user1", email="test_user1@email.com" + ) + await engine.save(create_user) - create_user.organizations.append(org.id) - await engine.save(create_user) - await engine.save(org) + org = OrganizationDB(type="default", owner=str(create_user.id)) + await engine.save(org) - return create_user - else: - return user + create_user.organizations.append(org.id) + await engine.save(create_user) + await engine.save(org) - except Exception as e: - pytest.fail(f"Failed to get or create the second user: {e}") + return create_user + return user @pytest.fixture() -async def get_first_user_app(): - user = await engine.find_one(UserDB, UserDB.uid == "0") - if user is None: - user = UserDB(uid="0") - await engine.save(user) - - organization = OrganizationDB(type="default", owner=str(user.id)) - await engine.save(organization) - - user.organizations.append(organization.id) - await engine.save(user) - await engine.save(organization) - +async def get_first_user_app(get_first_user_object): + user = await get_first_user_object organization = await selectors.get_user_own_org(user.uid) app = AppDB(app_name="myapp", organization=organization, user=user) From e5421aa422061779ee71e43ee6ecc19292cecc35 Mon Sep 17 00:00:00 2001 From: Abram Date: Thu, 28 Dec 2023 19:48:54 +0100 Subject: [PATCH 144/414] Refactor - rename create_user_and_organization fixture to fetch_user --- .../tests/variants_evaluators_router/conftest.py | 16 ++-------------- 1 file changed, 2 insertions(+), 14 deletions(-) diff --git a/agenta-backend/agenta_backend/tests/variants_evaluators_router/conftest.py b/agenta-backend/agenta_backend/tests/variants_evaluators_router/conftest.py index 0bdcaf101d..441f2e62d8 100644 --- a/agenta-backend/agenta_backend/tests/variants_evaluators_router/conftest.py +++ b/agenta-backend/agenta_backend/tests/variants_evaluators_router/conftest.py @@ -13,7 +13,7 @@ engine = DBEngine().engine() # Set global variables -OPEN_AI_KEY = os.environ.get("OPEN_AI_KEY") +OPEN_AI_KEY = os.environ.get("OPENAI_API_KEY") BACKEND_URI = "http://host.docker.internal/api/" @@ -51,20 +51,8 @@ def app_from_template(): @pytest.fixture(scope="session") -async def create_user_and_organization(): +async def fetch_user(): user = await engine.find_one(UserDB, UserDB.uid == "0") - if user is None: - create_user = UserDB(uid="xxxx", username="evaluator") - await engine.save(create_user) - - org = OrganizationDB(type="evaluator", owner=str(create_user.id)) - await engine.save(org) - - create_user.organizations.append(org.id) - await engine.save(create_user) - await engine.save(org) - - return create_user return user From 3e7f65650c6e9b340aebee3ee3285fb82c3e6766 Mon Sep 17 00:00:00 2001 From: Abram Date: Thu, 28 Dec 2023 19:50:39 +0100 Subject: [PATCH 145/414] :art: Format - ran black --- agenta-backend/agenta_backend/routers/app_router.py | 3 ++- .../agenta_backend/routers/evaluation_router.py | 11 ++++++----- agenta-backend/agenta_backend/services/db_manager.py | 4 +++- 3 files changed, 11 insertions(+), 7 deletions(-) diff --git a/agenta-backend/agenta_backend/routers/app_router.py b/agenta-backend/agenta_backend/routers/app_router.py index bba060f7d6..d67775e4a7 100644 --- a/agenta-backend/agenta_backend/routers/app_router.py +++ b/agenta-backend/agenta_backend/routers/app_router.py @@ -384,7 +384,8 @@ async def create_app_and_variant_from_template( logger.debug("Step 8: Starting variant and injecting environment variables") if os.environ["FEATURE_FLAG"] in ["cloud", "ee"]: if not 
os.environ["OPENAI_API_KEY"]: - raise Exception("Unable to start app container. Please file an issue by clicking on the button below.", + raise Exception( + "Unable to start app container. Please file an issue by clicking on the button below.", ) envvars = { **(payload.env_vars or {}), diff --git a/agenta-backend/agenta_backend/routers/evaluation_router.py b/agenta-backend/agenta_backend/routers/evaluation_router.py index 719b1bb5d5..2d788ace31 100644 --- a/agenta-backend/agenta_backend/routers/evaluation_router.py +++ b/agenta-backend/agenta_backend/routers/evaluation_router.py @@ -90,11 +90,12 @@ async def create_evaluation( new_evaluation_data=new_evaluation_data, evaluators_configs=payload.evaluators_configs, ) - if ( - len(payload.evaluators_configs) == 1 - and payload.evaluators_configs.evaluator_key - in ["human_a_b_testing", "human_single_model_test"] - ): + if len( + payload.evaluators_configs + ) == 1 and payload.evaluators_configs.evaluator_key in [ + "human_a_b_testing", + "human_single_model_test", + ]: return evaluation else: evaluate.delay( diff --git a/agenta-backend/agenta_backend/services/db_manager.py b/agenta-backend/agenta_backend/services/db_manager.py index a95eeee304..eaa2c4ab5f 100644 --- a/agenta-backend/agenta_backend/services/db_manager.py +++ b/agenta-backend/agenta_backend/services/db_manager.py @@ -1807,6 +1807,8 @@ async def delete_evaluator_config(evaluator_config_id: str) -> bool: delete_result = remove_document_using_driver( str(evaluator_config_id), "evaluators_configs" ) - return delete_result is None # checking if delete_result is None (has been deleted) + return ( + delete_result is None + ) # checking if delete_result is None (has been deleted) except Exception as e: raise e From 7909b9d2cfa8b51f38875a72abd0be6d342f1c01 Mon Sep 17 00:00:00 2001 From: Abram Date: Thu, 28 Dec 2023 21:05:46 +0100 Subject: [PATCH 146/414] Cleanup - remove redundant exception raise --- agenta-backend/agenta_backend/models/db_engine.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/agenta-backend/agenta_backend/models/db_engine.py b/agenta-backend/agenta_backend/models/db_engine.py index 6bb56359e6..6d3434a2a6 100644 --- a/agenta-backend/agenta_backend/models/db_engine.py +++ b/agenta-backend/agenta_backend/models/db_engine.py @@ -58,9 +58,6 @@ def engine(self) -> AIOEngine: ) logger.info(f"Using {self.mode} database...") return aio_engine - raise ValueError( - "Mode of database is unknown. Did you mean 'default' or 'test'?" 
- ) def remove_db(self) -> None: """ From d294e364a68920eca1f5c746d17aba6689ec5ab6 Mon Sep 17 00:00:00 2001 From: Abram Date: Thu, 28 Dec 2023 22:15:39 +0100 Subject: [PATCH 147/414] Refactor - removed print statement --- .../variants_evaluators_router/test_evaluators_router.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/agenta-backend/agenta_backend/tests/variants_evaluators_router/test_evaluators_router.py b/agenta-backend/agenta_backend/tests/variants_evaluators_router/test_evaluators_router.py index e5fd0c7909..6cede68733 100644 --- a/agenta-backend/agenta_backend/tests/variants_evaluators_router/test_evaluators_router.py +++ b/agenta-backend/agenta_backend/tests/variants_evaluators_router/test_evaluators_router.py @@ -20,9 +20,9 @@ @pytest.mark.asyncio async def test_create_app_from_template( - app_from_template, create_user_and_organization, fetch_single_prompt_template + app_from_template, fetch_user, fetch_single_prompt_template ): - user = await create_user_and_organization + user = await fetch_user payload = app_from_template payload["app_name"] = APP_NAME payload["organization_id"] = str(user.organizations[0]) @@ -163,14 +163,12 @@ async def test_create_evaluation(): # Update payload with list of configs ids payload["evaluators_configs"] = list_of_configs_ids - print("Payload: ", payload) # Make request to create evaluation response = await test_client.post( f"{BACKEND_API_HOST}/evaluations/", json=payload, timeout=timeout ) response_data = response.json() - print("RD: ", response_data) assert response.status_code == 200 assert response_data["app_id"] == payload["app_id"] @@ -183,7 +181,7 @@ async def test_fetch_evaluation_status(): evaluations = await engine.find(EvaluationDB) # will return only one in this case evaluation = evaluations[0] - # Prepare short-polling request + # Prepare and start short-polling request max_attempts = 10 intervals = 2 # seconds for _ in range(max_attempts): From e11b53fdc9dc5b80540b5a493c9fe038cbc35479 Mon Sep 17 00:00:00 2001 From: Abram Date: Thu, 28 Dec 2023 22:24:31 +0100 Subject: [PATCH 148/414] Update - revert back to localhost:8001 --- .../agenta_backend/models/converters.py | 22 ++++++++++++------- .../test_observability_router.py | 2 +- .../test_organization_router.py | 2 +- .../testset_router/test_testset_router.py | 2 +- .../user_profile_router/test_user_profile.py | 2 +- .../variants_evaluators_router/conftest.py | 2 +- .../test_evaluators_router.py | 2 +- .../test_app_variant_router.py | 2 +- 8 files changed, 21 insertions(+), 15 deletions(-) diff --git a/agenta-backend/agenta_backend/models/converters.py b/agenta-backend/agenta_backend/models/converters.py index 5701d9e2ce..c9773b3c64 100644 --- a/agenta-backend/agenta_backend/models/converters.py +++ b/agenta-backend/agenta_backend/models/converters.py @@ -83,7 +83,7 @@ async def evaluation_db_to_pydantic( variant_names=variant_names, testset_id=str(evaluation_db.testset.id), testset_name=evaluation_db.testset.name, - aggregated_results= await aggregated_result_to_pydantic( + aggregated_results=await aggregated_result_to_pydantic( evaluation_db.aggregated_results ), created_at=evaluation_db.created_at, @@ -94,16 +94,22 @@ async def evaluation_db_to_pydantic( async def aggregated_result_to_pydantic(results: List[AggregatedResult]) -> List[dict]: transformed_results = [] for result in results: - evaluator_config_db = await db_manager.fetch_evaluator_config(str(result.evaluator_config)) - evaluator_config_dict = evaluator_config_db.dict() if 
evaluator_config_db else None + evaluator_config_db = await db_manager.fetch_evaluator_config( + str(result.evaluator_config) + ) + evaluator_config_dict = ( + evaluator_config_db.dict() if evaluator_config_db else None + ) if evaluator_config_dict: - evaluator_config_dict['id'] = str(evaluator_config_dict['id']) + evaluator_config_dict["id"] = str(evaluator_config_dict["id"]) - transformed_results.append({ - "evaluator_config": evaluator_config_dict, - "result": result.result.dict(), - }) + transformed_results.append( + { + "evaluator_config": evaluator_config_dict, + "result": result.result.dict(), + } + ) return transformed_results diff --git a/agenta-backend/agenta_backend/tests/observability_router/test_observability_router.py b/agenta-backend/agenta_backend/tests/observability_router/test_observability_router.py index 28b765749d..c093d8a597 100644 --- a/agenta-backend/agenta_backend/tests/observability_router/test_observability_router.py +++ b/agenta-backend/agenta_backend/tests/observability_router/test_observability_router.py @@ -28,7 +28,7 @@ timeout = httpx.Timeout(timeout=5, read=None, write=5) # Set global variables -BACKEND_API_HOST = "http://host.docker.internal/api" +BACKEND_API_HOST = "http://localhost:8001/api" @pytest.mark.asyncio diff --git a/agenta-backend/agenta_backend/tests/organization_router/test_organization_router.py b/agenta-backend/agenta_backend/tests/organization_router/test_organization_router.py index 0142d6acbe..7f84bcaed5 100644 --- a/agenta-backend/agenta_backend/tests/organization_router/test_organization_router.py +++ b/agenta-backend/agenta_backend/tests/organization_router/test_organization_router.py @@ -16,7 +16,7 @@ timeout = httpx.Timeout(timeout=5, read=None, write=5) # Set global variables -BACKEND_API_HOST = "http://host.docker.internal/api" +BACKEND_API_HOST = "http://localhost:8001/api" @pytest.mark.asyncio diff --git a/agenta-backend/agenta_backend/tests/testset_router/test_testset_router.py b/agenta-backend/agenta_backend/tests/testset_router/test_testset_router.py index fec3275c00..0ff3077647 100644 --- a/agenta-backend/agenta_backend/tests/testset_router/test_testset_router.py +++ b/agenta-backend/agenta_backend/tests/testset_router/test_testset_router.py @@ -18,7 +18,7 @@ timeout = httpx.Timeout(timeout=5, read=None, write=5) # Set global variables -BACKEND_API_HOST = "http://host.docker.internal/api" +BACKEND_API_HOST = "http://localhost:8001/api" TESTSET_SUBMODULE_DIR = Path(__file__).parent diff --git a/agenta-backend/agenta_backend/tests/user_profile_router/test_user_profile.py b/agenta-backend/agenta_backend/tests/user_profile_router/test_user_profile.py index 8833560c6b..22f03670a9 100644 --- a/agenta-backend/agenta_backend/tests/user_profile_router/test_user_profile.py +++ b/agenta-backend/agenta_backend/tests/user_profile_router/test_user_profile.py @@ -15,7 +15,7 @@ timeout = httpx.Timeout(timeout=5, read=None, write=5) # Set global variables -BACKEND_API_HOST = "http://host.docker.internal/api" +BACKEND_API_HOST = "http://localhost:8001/api" @pytest.mark.asyncio diff --git a/agenta-backend/agenta_backend/tests/variants_evaluators_router/conftest.py b/agenta-backend/agenta_backend/tests/variants_evaluators_router/conftest.py index 441f2e62d8..b29b7ac56e 100644 --- a/agenta-backend/agenta_backend/tests/variants_evaluators_router/conftest.py +++ b/agenta-backend/agenta_backend/tests/variants_evaluators_router/conftest.py @@ -14,7 +14,7 @@ # Set global variables OPEN_AI_KEY = os.environ.get("OPENAI_API_KEY") -BACKEND_URI = 
"http://host.docker.internal/api/" +BACKEND_URI = "http://localhost:8001/api/" @pytest.fixture(scope="session") diff --git a/agenta-backend/agenta_backend/tests/variants_evaluators_router/test_evaluators_router.py b/agenta-backend/agenta_backend/tests/variants_evaluators_router/test_evaluators_router.py index 6cede68733..7a313f774f 100644 --- a/agenta-backend/agenta_backend/tests/variants_evaluators_router/test_evaluators_router.py +++ b/agenta-backend/agenta_backend/tests/variants_evaluators_router/test_evaluators_router.py @@ -15,7 +15,7 @@ # Set global variables APP_NAME = "evaluation_in_backend" -BACKEND_API_HOST = "http://host.docker.internal/api" +BACKEND_API_HOST = "http://localhost:8001/api" @pytest.mark.asyncio diff --git a/agenta-backend/agenta_backend/tests/variants_router/test_app_variant_router.py b/agenta-backend/agenta_backend/tests/variants_router/test_app_variant_router.py index 86baaca8c5..d6ab8ce258 100644 --- a/agenta-backend/agenta_backend/tests/variants_router/test_app_variant_router.py +++ b/agenta-backend/agenta_backend/tests/variants_router/test_app_variant_router.py @@ -29,7 +29,7 @@ logger.setLevel(logging.DEBUG) # Set global variables -BACKEND_API_HOST = "http://host.docker.internal/api" +BACKEND_API_HOST = "http://localhost:8001/api" @pytest.mark.asyncio From a4e4c9961ef0f8f9d6a4088a11c60dfb9f780007 Mon Sep 17 00:00:00 2001 From: Abram Date: Fri, 29 Dec 2023 12:05:16 +0100 Subject: [PATCH 149/414] Update - include calculation for ai critique --- .../agenta_backend/tasks/evaluations.py | 25 ++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) diff --git a/agenta-backend/agenta_backend/tasks/evaluations.py b/agenta-backend/agenta_backend/tasks/evaluations.py index 35bea882fb..c6e34a8f4b 100644 --- a/agenta-backend/agenta_backend/tasks/evaluations.py +++ b/agenta-backend/agenta_backend/tasks/evaluations.py @@ -136,9 +136,28 @@ async def aggregate_evaluator_results( ) -> List[AggregatedResult]: aggregated_results = [] for evaluator_key, results in evaluators_aggregated_data.items(): - average_value = ( - sum([result.value for result in results]) / len(results) if results else 0 - ) + if evaluator_key != "auto_ai_critique": + average_value = ( + sum([result.value for result in results]) / len(results) + if results + else 0 + ) + elif evaluator_key == "auto_ai_critique": + try: + average_value = ( + sum( + [ + int(result.value) + for result in results + if isinstance(int(result.value), int) + ] + ) + / len(results) + if results + else 0 + ) + except TypeError: + average_value = None evaluator_config = await fetch_evaluator_config_by_appId(app.id, evaluator_key) aggregated_result = AggregatedResult( evaluator_config=evaluator_config.id, From 556157284a86a04929d8f4cd607a7d27959f567d Mon Sep 17 00:00:00 2001 From: Abram Date: Fri, 29 Dec 2023 12:49:54 +0100 Subject: [PATCH 150/414] Update - modified aggregated result to pydantic converter --- .../agenta_backend/models/converters.py | 26 ++++++++++--------- 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/agenta-backend/agenta_backend/models/converters.py b/agenta-backend/agenta_backend/models/converters.py index 5701d9e2ce..ab7058fecc 100644 --- a/agenta-backend/agenta_backend/models/converters.py +++ b/agenta-backend/agenta_backend/models/converters.py @@ -1,5 +1,6 @@ """Converts db models to pydantic models """ +import json from typing import List from agenta_backend.services import db_manager from agenta_backend.models.api.user_models import User @@ -83,7 +84,7 @@ async def 
evaluation_db_to_pydantic( variant_names=variant_names, testset_id=str(evaluation_db.testset.id), testset_name=evaluation_db.testset.name, - aggregated_results= await aggregated_result_to_pydantic( + aggregated_results=await aggregated_result_to_pydantic( evaluation_db.aggregated_results ), created_at=evaluation_db.created_at, @@ -94,17 +95,18 @@ async def evaluation_db_to_pydantic( async def aggregated_result_to_pydantic(results: List[AggregatedResult]) -> List[dict]: transformed_results = [] for result in results: - evaluator_config_db = await db_manager.fetch_evaluator_config(str(result.evaluator_config)) - evaluator_config_dict = evaluator_config_db.dict() if evaluator_config_db else None - - if evaluator_config_dict: - evaluator_config_dict['id'] = str(evaluator_config_dict['id']) - - transformed_results.append({ - "evaluator_config": evaluator_config_dict, - "result": result.result.dict(), - }) - + evaluator_config_db = await db_manager.fetch_evaluator_config( + str(result.evaluator_config) + ) + evaluator_config_dict = ( + evaluator_config_db.json() if evaluator_config_db else None + ) + transformed_results.append( + { + "evaluator_config": json.loads(evaluator_config_dict), + "result": result.result.dict(), + } + ) return transformed_results From fd4cebbcec962f0f162bbe62dcc2bda8e18710a3 Mon Sep 17 00:00:00 2001 From: Abram Date: Fri, 29 Dec 2023 12:50:37 +0100 Subject: [PATCH 151/414] Update - check for access rights in evaluation --- .../agenta_backend/services/evaluation_service.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/agenta-backend/agenta_backend/services/evaluation_service.py b/agenta-backend/agenta_backend/services/evaluation_service.py index 8e048d0371..290745aba8 100644 --- a/agenta-backend/agenta_backend/services/evaluation_service.py +++ b/agenta-backend/agenta_backend/services/evaluation_service.py @@ -817,7 +817,7 @@ async def create_new_evaluation( return await converters.evaluation_db_to_pydantic(evaluation_db) -async def retrieve_evaluation_results(evaluation_id: str) -> List[dict]: +async def retrieve_evaluation_results(evaluation_id: str, **user_org_data: dict) -> List[dict]: """Retrieve the aggregated results for a given evaluation. 
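A brief aside on the aggregation added in PATCH 149 above: it averages each evaluator's scenario results, casting AI-critique outputs to integers and falling back when casting fails. A simplified standalone sketch of that arithmetic, using plain values instead of the backend's Result objects:

```python
# Simplified sketch of the per-evaluator averaging in PATCH 149; the real code
# iterates Result objects and catches TypeError specifically.
from typing import List, Optional


def aggregate(evaluator_key: str, values: List) -> Optional[float]:
    if not values:
        return 0
    if evaluator_key != "auto_ai_critique":
        # Numeric evaluators: plain mean of the scenario results.
        return sum(values) / len(values)
    try:
        # AI critique returns strings; average the ones that parse as integers.
        return sum(int(v) for v in values) / len(values)
    except (TypeError, ValueError):
        # Fall back when casting fails (the patch handles this with TypeError).
        return None


print(aggregate("auto_exact_match", [1, 0, 1]))   # ~0.667
print(aggregate("auto_ai_critique", ["8", "6"]))  # 7.0
```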
Args: @@ -827,5 +827,14 @@ async def retrieve_evaluation_results(evaluation_id: str) -> List[dict]: List[dict]: evaluation aggregated results """ + # Check for access rights evaluation = await db_manager.fetch_evaluation_by_id(evaluation_id) - return converters.aggregated_result_to_pydantic(evaluation.aggregated_results) + access = await check_access_to_app( + user_org_data=user_org_data, app_id=str(evaluation.app.id) + ) + if not access: + raise HTTPException( + status_code=403, + detail=f"You do not have access to this app: {str(evaluation.app.id)}", + ) + return await converters.aggregated_result_to_pydantic(evaluation.aggregated_results) From ec3a03c1c60f7ba4d5b9715af182cdecc8f09a82 Mon Sep 17 00:00:00 2001 From: Abram Date: Fri, 29 Dec 2023 12:51:12 +0100 Subject: [PATCH 152/414] Update - modified fetch evaluation results endpoint --- agenta-backend/agenta_backend/routers/evaluation_router.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/agenta-backend/agenta_backend/routers/evaluation_router.py b/agenta-backend/agenta_backend/routers/evaluation_router.py index eb1d09ff77..0039931a1f 100644 --- a/agenta-backend/agenta_backend/routers/evaluation_router.py +++ b/agenta-backend/agenta_backend/routers/evaluation_router.py @@ -138,7 +138,9 @@ async def fetch_evaluation_results(evaluation_id: str, request: Request): """ try: - results = await evaluation_service.retrieve_evaluation_results(evaluation_id) + # Get user and organization id + user_org_data: dict = await get_user_and_org_id(request.state.user_id) + results = await evaluation_service.retrieve_evaluation_results(evaluation_id, **user_org_data) return {"results": results, "evaluation_id": evaluation_id} except Exception as exc: raise HTTPException(status_code=500, detail=str(exc)) From 6470e8be4a2954c52fc5915c4b28dcdf3195e91d Mon Sep 17 00:00:00 2001 From: Abram Date: Fri, 29 Dec 2023 12:52:03 +0100 Subject: [PATCH 153/414] :art: Format - ran black --- agenta-backend/agenta_backend/routers/evaluation_router.py | 4 +++- agenta-backend/agenta_backend/services/evaluation_service.py | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/agenta-backend/agenta_backend/routers/evaluation_router.py b/agenta-backend/agenta_backend/routers/evaluation_router.py index 0039931a1f..3d56cb927f 100644 --- a/agenta-backend/agenta_backend/routers/evaluation_router.py +++ b/agenta-backend/agenta_backend/routers/evaluation_router.py @@ -140,7 +140,9 @@ async def fetch_evaluation_results(evaluation_id: str, request: Request): try: # Get user and organization id user_org_data: dict = await get_user_and_org_id(request.state.user_id) - results = await evaluation_service.retrieve_evaluation_results(evaluation_id, **user_org_data) + results = await evaluation_service.retrieve_evaluation_results( + evaluation_id, **user_org_data + ) return {"results": results, "evaluation_id": evaluation_id} except Exception as exc: raise HTTPException(status_code=500, detail=str(exc)) diff --git a/agenta-backend/agenta_backend/services/evaluation_service.py b/agenta-backend/agenta_backend/services/evaluation_service.py index 290745aba8..e6a6ac0ded 100644 --- a/agenta-backend/agenta_backend/services/evaluation_service.py +++ b/agenta-backend/agenta_backend/services/evaluation_service.py @@ -817,7 +817,9 @@ async def create_new_evaluation( return await converters.evaluation_db_to_pydantic(evaluation_db) -async def retrieve_evaluation_results(evaluation_id: str, **user_org_data: dict) -> List[dict]: +async def retrieve_evaluation_results( 
+ evaluation_id: str, **user_org_data: dict +) -> List[dict]: """Retrieve the aggregated results for a given evaluation. Args: From ce730ec5cade5c68e5da8df3cfa483cdc0a4afa5 Mon Sep 17 00:00:00 2001 From: Abram Date: Fri, 29 Dec 2023 14:00:54 +0100 Subject: [PATCH 154/414] Update - renamed openai api key --- .github/workflows/run-backend-tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/run-backend-tests.yml b/.github/workflows/run-backend-tests.yml index c3c99cdae5..e89faa8476 100644 --- a/.github/workflows/run-backend-tests.yml +++ b/.github/workflows/run-backend-tests.yml @@ -12,7 +12,7 @@ jobs: - name: Start Docker Compose env: - OPEN_AI_KEY: ${{ secrets.NEXT_PUBLIC_OPENAI_API_KEY }} + OPENAI_API_KEY: ${{ secrets.NEXT_PUBLIC_OPENAI_API_KEY }} run: docker-compose -f "docker-compose.test.yml" up -d --build - name: Install Curl From 361cdec790bc21eca0308cf2e51ca561178784ac Mon Sep 17 00:00:00 2001 From: Abram Date: Fri, 29 Dec 2023 14:06:14 +0100 Subject: [PATCH 155/414] Update - added required services in backend compose --- docker-compose.test.yml | 39 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 38 insertions(+), 1 deletion(-) diff --git a/docker-compose.test.yml b/docker-compose.test.yml index 522ea4a1d2..a261f7d769 100644 --- a/docker-compose.test.yml +++ b/docker-compose.test.yml @@ -26,7 +26,7 @@ services: "8001", "--reload", "--log-level", - "info" + "info", ] ports: - "8000:8001" @@ -37,6 +37,7 @@ services: - agenta-test-network agenta-web: + container_name: agenta-web-test build: context: ./agenta-web dockerfile: dev.Dockerfile @@ -51,6 +52,7 @@ services: mongo: image: mongo:5.0 + container_name: agenta-mongo-test environment: MONGO_INITDB_ROOT_USERNAME: username MONGO_INITDB_ROOT_PASSWORD: password @@ -66,11 +68,46 @@ services: redis: image: redis:latest + container_name: agenta-redis-test networks: - agenta-test-network volumes: - redis_data:/data + rabbitmq: + image: rabbitmq:3-management + container_name: agenta-rabbitmq-test + ports: + - "5672:5672" + - "15672:15672" + volumes: + - ./rabbitmq_data:/var/lib/rabbitmq + environment: + RABBITMQ_DEFAULT_USER: "guest" + RABBITMQ_DEFAULT_PASS: "guest" + networks: + - agenta-test-network + + celery_worker: + build: ./agenta-backend + container_name: agenta-celery_worker-test + command: > + watchmedo auto-restart --directory=./agenta_backend --pattern=*.py --recursive -- celery -A agenta_backend.main.celery_app worker --concurrency=1 --loglevel=INFO + environment: + - MONGODB_URI=mongodb://username:password@mongo:27017 + - REDIS_URL=redis://redis:6379/0 + - CELERY_BROKER_URL=amqp://guest@rabbitmq// + - CELERY_RESULT_BACKEND=redis://redis:6379/0 + - FEATURE_FLAG=oss + volumes: + - ./agenta-backend/agenta_backend:/app/agenta_backend + - /var/run/docker.sock:/var/run/docker.sock + depends_on: + - rabbitmq + - redis + networks: + - agenta-test-network + networks: agenta-test-network: name: agenta-test-network From 82a1303cdee5baa6e8005d988f6b0f1b1abcc1cb Mon Sep 17 00:00:00 2001 From: Abram Date: Fri, 29 Dec 2023 14:10:39 +0100 Subject: [PATCH 156/414] Refactor - revert back to using localhost:8001 --- .../tests/observability_router/test_observability_router.py | 2 +- .../tests/organization_router/test_organization_router.py | 2 +- .../agenta_backend/tests/testset_router/test_testset_router.py | 2 +- .../tests/user_profile_router/test_user_profile.py | 2 +- .../agenta_backend/tests/variants_evaluators_router/conftest.py | 2 +- 
.../tests/variants_evaluators_router/test_evaluators_router.py | 2 +- .../tests/variants_router/test_app_variant_router.py | 2 +- 7 files changed, 7 insertions(+), 7 deletions(-) diff --git a/agenta-backend/agenta_backend/tests/observability_router/test_observability_router.py b/agenta-backend/agenta_backend/tests/observability_router/test_observability_router.py index c093d8a597..35dfb450fe 100644 --- a/agenta-backend/agenta_backend/tests/observability_router/test_observability_router.py +++ b/agenta-backend/agenta_backend/tests/observability_router/test_observability_router.py @@ -28,7 +28,7 @@ timeout = httpx.Timeout(timeout=5, read=None, write=5) # Set global variables -BACKEND_API_HOST = "http://localhost:8001/api" +BACKEND_API_HOST = "http://localhost:8001" @pytest.mark.asyncio diff --git a/agenta-backend/agenta_backend/tests/organization_router/test_organization_router.py b/agenta-backend/agenta_backend/tests/organization_router/test_organization_router.py index 7f84bcaed5..db06d86840 100644 --- a/agenta-backend/agenta_backend/tests/organization_router/test_organization_router.py +++ b/agenta-backend/agenta_backend/tests/organization_router/test_organization_router.py @@ -16,7 +16,7 @@ timeout = httpx.Timeout(timeout=5, read=None, write=5) # Set global variables -BACKEND_API_HOST = "http://localhost:8001/api" +BACKEND_API_HOST = "http://localhost:8001" @pytest.mark.asyncio diff --git a/agenta-backend/agenta_backend/tests/testset_router/test_testset_router.py b/agenta-backend/agenta_backend/tests/testset_router/test_testset_router.py index 0ff3077647..894fc40f08 100644 --- a/agenta-backend/agenta_backend/tests/testset_router/test_testset_router.py +++ b/agenta-backend/agenta_backend/tests/testset_router/test_testset_router.py @@ -18,7 +18,7 @@ timeout = httpx.Timeout(timeout=5, read=None, write=5) # Set global variables -BACKEND_API_HOST = "http://localhost:8001/api" +BACKEND_API_HOST = "http://localhost:8001" TESTSET_SUBMODULE_DIR = Path(__file__).parent diff --git a/agenta-backend/agenta_backend/tests/user_profile_router/test_user_profile.py b/agenta-backend/agenta_backend/tests/user_profile_router/test_user_profile.py index 22f03670a9..6d7912aaf3 100644 --- a/agenta-backend/agenta_backend/tests/user_profile_router/test_user_profile.py +++ b/agenta-backend/agenta_backend/tests/user_profile_router/test_user_profile.py @@ -15,7 +15,7 @@ timeout = httpx.Timeout(timeout=5, read=None, write=5) # Set global variables -BACKEND_API_HOST = "http://localhost:8001/api" +BACKEND_API_HOST = "http://localhost:8001" @pytest.mark.asyncio diff --git a/agenta-backend/agenta_backend/tests/variants_evaluators_router/conftest.py b/agenta-backend/agenta_backend/tests/variants_evaluators_router/conftest.py index b29b7ac56e..65d6a13bca 100644 --- a/agenta-backend/agenta_backend/tests/variants_evaluators_router/conftest.py +++ b/agenta-backend/agenta_backend/tests/variants_evaluators_router/conftest.py @@ -14,7 +14,7 @@ # Set global variables OPEN_AI_KEY = os.environ.get("OPENAI_API_KEY") -BACKEND_URI = "http://localhost:8001/api/" +BACKEND_URI = "http://localhost:8001/" @pytest.fixture(scope="session") diff --git a/agenta-backend/agenta_backend/tests/variants_evaluators_router/test_evaluators_router.py b/agenta-backend/agenta_backend/tests/variants_evaluators_router/test_evaluators_router.py index 7a313f774f..77913cc144 100644 --- a/agenta-backend/agenta_backend/tests/variants_evaluators_router/test_evaluators_router.py +++ 
b/agenta-backend/agenta_backend/tests/variants_evaluators_router/test_evaluators_router.py @@ -15,7 +15,7 @@ # Set global variables APP_NAME = "evaluation_in_backend" -BACKEND_API_HOST = "http://localhost:8001/api" +BACKEND_API_HOST = "http://localhost:8001" @pytest.mark.asyncio diff --git a/agenta-backend/agenta_backend/tests/variants_router/test_app_variant_router.py b/agenta-backend/agenta_backend/tests/variants_router/test_app_variant_router.py index d6ab8ce258..24a6fb8daa 100644 --- a/agenta-backend/agenta_backend/tests/variants_router/test_app_variant_router.py +++ b/agenta-backend/agenta_backend/tests/variants_router/test_app_variant_router.py @@ -29,7 +29,7 @@ logger.setLevel(logging.DEBUG) # Set global variables -BACKEND_API_HOST = "http://localhost:8001/api" +BACKEND_API_HOST = "http://localhost:8001" @pytest.mark.asyncio From 7c622cec3b90a061a906b61f8d89ac1980887507 Mon Sep 17 00:00:00 2001 From: Abram Date: Fri, 29 Dec 2023 14:29:31 +0100 Subject: [PATCH 157/414] Update - drop v2 database --- agenta-backend/agenta_backend/models/db_engine.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/agenta-backend/agenta_backend/models/db_engine.py b/agenta-backend/agenta_backend/models/db_engine.py index 6d3434a2a6..8e79c9cf0c 100644 --- a/agenta-backend/agenta_backend/models/db_engine.py +++ b/agenta-backend/agenta_backend/models/db_engine.py @@ -67,5 +67,7 @@ def remove_db(self) -> None: client = MongoClient(self.db_url) if self.mode == "default": client.drop_database("agenta") + elif self.mode == "v2": + client.drop_database("agenta_v2") elif self.mode == "test": client.drop_database("agenta_test") From e972b00893da554554091d93b62c04f05ecb4fc7 Mon Sep 17 00:00:00 2001 From: Abram Date: Fri, 29 Dec 2023 15:03:55 +0100 Subject: [PATCH 158/414] Update - remove running template app container --- .../test_evaluators_router.py | 22 ++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/agenta-backend/agenta_backend/tests/variants_evaluators_router/test_evaluators_router.py b/agenta-backend/agenta_backend/tests/variants_evaluators_router/test_evaluators_router.py index 77913cc144..8f32293bb1 100644 --- a/agenta-backend/agenta_backend/tests/variants_evaluators_router/test_evaluators_router.py +++ b/agenta-backend/agenta_backend/tests/variants_evaluators_router/test_evaluators_router.py @@ -235,8 +235,20 @@ async def test_delete_evaluator_config(): assert len(evaluator_configs) == count_of_deleted_configs -# @pytest.mark.asyncio -# async def remove_running_template_app_container(): -# app = await engine.find_one(AppDB, AppDB.app_name == APP_NAME) -# container_name = f"{app.app_name}-app-{str(app.id)}" -# assert True +@pytest.mark.asyncio +async def test_remove_running_template_app_container(): + import docker + + # Connect to the Docker daemon + client = docker.from_env() + app = await engine.find_one(AppDB, AppDB.app_name == APP_NAME) + container_name = f"{app.app_name}-app-{str(app.organization.id)}" + try: + # Retrieve container + container = client.containers.get(container_name) + # Stop and remove container + container.stop() + container.remove() + assert True + except: + assert False From fc1a9cd5c207b611de4423cd40b70cbbfab062d2 Mon Sep 17 00:00:00 2001 From: Abram Date: Fri, 29 Dec 2023 15:17:56 +0100 Subject: [PATCH 159/414] Update - include required env vars --- docker-compose.test.yml | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/docker-compose.test.yml b/docker-compose.test.yml index a261f7d769..6c95750d6f 100644 --- 
a/docker-compose.test.yml +++ b/docker-compose.test.yml @@ -8,7 +8,11 @@ services: - MONGODB_URI=mongodb://username:password@mongo:27017/ - REDIS_URL=redis://redis:6379/0 - ENVIRONMENT=development - - DATABASE_MODE=test + - BARE_DOMAIN_NAME=localhost:8001 + - DOMAIN_NAME=http://localhost:8001 + - CELERY_BROKER_URL=amqp://guest@rabbitmq// + - CELERY_RESULT_BACKEND=redis://redis:6379/0 + - DATABASE_MODE=v2 - FEATURE_FLAG=oss - AGENTA_TEMPLATE_REPO=agentaai/templates_v2 - POSTHOG_API_KEY=phc_hmVSxIjTW1REBHXgj2aw4HW9X6CXb6FzerBgP9XenC7 From 7b77d9f86a31a625a6454b88c08f549ef5954892 Mon Sep 17 00:00:00 2001 From: Akrem Abayed Date: Fri, 29 Dec 2023 15:50:45 +0100 Subject: [PATCH 160/414] add basic schema contract for annotations --- agenta-backend/agenta_backend/main.py | 2 + .../models/api/annotation_models.py | 24 ++++ .../routers/annotations_router.py | 132 ++++++++++++++++++ 3 files changed, 158 insertions(+) create mode 100644 agenta-backend/agenta_backend/models/api/annotation_models.py create mode 100644 agenta-backend/agenta_backend/routers/annotations_router.py diff --git a/agenta-backend/agenta_backend/main.py b/agenta-backend/agenta_backend/main.py index e11329eeb3..2820174dbb 100644 --- a/agenta-backend/agenta_backend/main.py +++ b/agenta-backend/agenta_backend/main.py @@ -8,6 +8,7 @@ app_router, container_router, environment_router, + annotations_router, evaluation_router, evaluators_router, observability_router, @@ -77,6 +78,7 @@ async def lifespan(application: FastAPI, cache=True): app.include_router(user_profile.router, prefix="/profile") app.include_router(app_router.router, prefix="/apps") app.include_router(variants_router.router, prefix="/variants") +app.include_router(annotations_router.router, prefix="/annotations") app.include_router(evaluation_router.router, prefix="/evaluations") app.include_router(evaluators_router.router, prefix="/evaluators") app.include_router(testset_router.router, prefix="/testsets") diff --git a/agenta-backend/agenta_backend/models/api/annotation_models.py b/agenta-backend/agenta_backend/models/api/annotation_models.py new file mode 100644 index 0000000000..2576aa5a98 --- /dev/null +++ b/agenta-backend/agenta_backend/models/api/annotation_models.py @@ -0,0 +1,24 @@ +from pydantic import BaseModel +from typing import List + + +class Annotation(BaseModel): + app_id: str + variants_ids: List[str] + annotation_key: str + testset_id: str + aggregated_results: List + + +class NewAnnotation(BaseModel): + app_id: str + variants_ids: List[str] + annotation_key: str + testset_id: str + + +class AnnotationScenarioUpdate(BaseModel): + app_id: str + variants_ids: List[str] + annotation_key: str + testset_id: str diff --git a/agenta-backend/agenta_backend/routers/annotations_router.py b/agenta-backend/agenta_backend/routers/annotations_router.py new file mode 100644 index 0000000000..d844ad8873 --- /dev/null +++ b/agenta-backend/agenta_backend/routers/annotations_router.py @@ -0,0 +1,132 @@ +import os +import secrets +from typing import List, Dict + +from fastapi.responses import JSONResponse +from fastapi.encoders import jsonable_encoder +from fastapi import HTTPException, APIRouter, Body, Request, status, Response + +from agenta_backend.models.api.annotation_models import ( + Annotation, + NewAnnotation, + AnnotationScenarioUpdate +) + +from agenta_backend.utils.common import check_access_to_app +from agenta_backend.services import db_manager + +if os.environ["FEATURE_FLAG"] in ["cloud", "ee"]: + from agenta_backend.commons.services.selectors import ( # noqa 
pylint: disable-all + get_user_and_org_id, + ) +else: + from agenta_backend.services.selectors import get_user_and_org_id + +router = APIRouter() + +@router.post("/") +async def create_annotation( + payload: NewAnnotation, + request: Request, +): + """Creates a new annotation document + Raises: + HTTPException: _description_ + Returns: + _description_ + """ + try: + user_org_data: dict = await get_user_and_org_id(request.state.user_id) + access_app = await check_access_to_app( + user_org_data=user_org_data, + app_id=payload.app_id, + check_owner=False, + ) + if not access_app: + error_msg = f"You do not have access to this app: {payload.app_id}" + return JSONResponse( + {"detail": error_msg}, + status_code=400, + ) + app = await db_manager.fetch_app_by_id(app_id=payload.app_id) + if app is None: + raise HTTPException(status_code=404, detail="App not found") + + app_data = jsonable_encoder(app) + new_annotation_data = payload.dict() + annotation = await annotation_service.create_new_annotation( + app_data=app_data, + new_annotation_data=new_annotation_data, + ) + + return annotation + except KeyError: + raise HTTPException( + status_code=400, + detail="columns in the annotation set should match the names of the inputs in the variant", + ) + + +@router.get("/", response_model=List[Annotation]) +async def fetch_list_annotations( + app_id: str, + request: Request, +): + """Fetches a list of annotations, optionally filtered by an app ID. + + Args: + app_id (Optional[str]): An optional app ID to filter the annotations. + + Returns: + List[Annotation]: A list of annotations. + """ + user_org_data = await get_user_and_org_id(request.state.user_id) + return await annotation_service.fetch_list_annotations( + app_id=app_id, **user_org_data + ) + + +@router.get("/{annotation_id}/", response_model=Annotation) +async def fetch_annotation( + annotation_id: str, + request: Request, +): + """Fetches a single annotation based on its ID. + + Args: + annotation_id (str): The ID of the annotation to fetch. + + Returns: + Annotation: The fetched annotation. + """ + user_org_data = await get_user_and_org_id(request.state.user_id) + return await annotation_service.fetch_annotation(annotation_id, **user_org_data) + + +@router.put( + "/{annotation_id}/annotation_scenario/{annotation_scenario_id}/" +) +async def update_annotation_scenario_router( + annotation_id: str, + annotation_scenario_id: str, + annotation_scenario: AnnotationScenarioUpdate, + request: Request, +): + """Updates an annotation scenario's data. + + Raises: + HTTPException: If update fails or unauthorized. + + Returns: + None: 204 No Content status code upon successful update. 
+ """ + user_org_data = await get_user_and_org_id(request.state.user_id) + try: + await update_annotation_scenario( + annotation_scenario_id, + annotation_scenario, + **user_org_data, + ) + return Response(status_code=status.HTTP_204_NO_CONTENT) + except UpdateAnnotationScenarioError as e: + raise HTTPException(status_code=500, detail=str(e)) from e \ No newline at end of file From 8e1178d6dea8b37d234cf763d72c04e8b43e9484 Mon Sep 17 00:00:00 2001 From: Abram Date: Fri, 29 Dec 2023 19:55:44 +0100 Subject: [PATCH 161/414] Update - modified run-backend-tests workflow --- .github/workflows/run-backend-tests.yml | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/.github/workflows/run-backend-tests.yml b/.github/workflows/run-backend-tests.yml index e89faa8476..e60a236a33 100644 --- a/.github/workflows/run-backend-tests.yml +++ b/.github/workflows/run-backend-tests.yml @@ -10,14 +10,18 @@ jobs: steps: - uses: actions/checkout@v3 + - name: Set Environment Variables + run: | + echo "OPENAI_API_KEY=${{ secrets.NEXT_PUBLIC_OPENAI_API_KEY }}" >> $GITHUB_ENV + + - name: Install Curl + run: sudo apt install curl -y + - name: Start Docker Compose env: OPENAI_API_KEY: ${{ secrets.NEXT_PUBLIC_OPENAI_API_KEY }} run: docker-compose -f "docker-compose.test.yml" up -d --build - - name: Install Curl - run: sudo apt install curl -y - - name: Wait for Backend Service run: | while true; do @@ -28,6 +32,8 @@ jobs: done - name: Run tests + env: + OPENAI_API_KEY: ${{ secrets.NEXT_PUBLIC_OPENAI_API_KEY }} run: docker exec agenta-backend-test pytest - name: Stop Docker Compose From f61c1190f98b35470cf0ed00fd1da49d771eeddd Mon Sep 17 00:00:00 2001 From: Abram Date: Fri, 29 Dec 2023 21:20:42 +0100 Subject: [PATCH 162/414] Update - added openai api key in backend service --- docker-compose.test.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/docker-compose.test.yml b/docker-compose.test.yml index 6c95750d6f..dabe04ddb1 100644 --- a/docker-compose.test.yml +++ b/docker-compose.test.yml @@ -14,6 +14,7 @@ services: - CELERY_RESULT_BACKEND=redis://redis:6379/0 - DATABASE_MODE=v2 - FEATURE_FLAG=oss + - OPENAI_API_KEY=${OPENAI_API_KEY} - AGENTA_TEMPLATE_REPO=agentaai/templates_v2 - POSTHOG_API_KEY=phc_hmVSxIjTW1REBHXgj2aw4HW9X6CXb6FzerBgP9XenC7 volumes: From ebb5577402856f85cc71a96fab4c7ae41a0634e4 Mon Sep 17 00:00:00 2001 From: Abram Date: Fri, 29 Dec 2023 21:23:00 +0100 Subject: [PATCH 163/414] Update - set openai_api_key in compose build command --- .github/workflows/run-backend-tests.yml | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/.github/workflows/run-backend-tests.yml b/.github/workflows/run-backend-tests.yml index e60a236a33..b0aca88317 100644 --- a/.github/workflows/run-backend-tests.yml +++ b/.github/workflows/run-backend-tests.yml @@ -1,8 +1,6 @@ name: Run Backend tests on: [pull_request] -env: - OPENAI_API_KEY: ${{ secrets.NEXT_PUBLIC_OPENAI_API_KEY }} jobs: continous_integration: @@ -18,9 +16,7 @@ jobs: run: sudo apt install curl -y - name: Start Docker Compose - env: - OPENAI_API_KEY: ${{ secrets.NEXT_PUBLIC_OPENAI_API_KEY }} - run: docker-compose -f "docker-compose.test.yml" up -d --build + run: OPENAI_API_KEY=${{ secrets.NEXT_PUBLIC_OPENAI_API_KEY }} docker-compose -f "docker-compose.test.yml" up -d --build - name: Wait for Backend Service run: | @@ -32,8 +28,6 @@ jobs: done - name: Run tests - env: - OPENAI_API_KEY: ${{ secrets.NEXT_PUBLIC_OPENAI_API_KEY }} run: docker exec agenta-backend-test pytest - name: Stop Docker Compose From 
2d8739bfe7fc4907e89ff1491eb4ec76cb6dd4b3 Mon Sep 17 00:00:00 2001 From: Abram Date: Fri, 29 Dec 2023 21:36:13 +0100 Subject: [PATCH 164/414] Update - added extra steps to fetch template images --- .github/workflows/run-backend-tests.yml | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/.github/workflows/run-backend-tests.yml b/.github/workflows/run-backend-tests.yml index b0aca88317..979dc49e96 100644 --- a/.github/workflows/run-backend-tests.yml +++ b/.github/workflows/run-backend-tests.yml @@ -18,6 +18,18 @@ jobs: - name: Start Docker Compose run: OPENAI_API_KEY=${{ secrets.NEXT_PUBLIC_OPENAI_API_KEY }} docker-compose -f "docker-compose.test.yml" up -d --build + - name: Restart Backend Service To Fetch Template Images + run: docker container agenta-backend-test restart + + - name: Check Templates Exists + run: | + while true; do + if curl -s http://localhost:8000/containers/templates/; then + break + fi + sleep 5 + done + - name: Wait for Backend Service run: | while true; do From c96134dbd909e2a2532daf4ac2af8c0e2d99c227 Mon Sep 17 00:00:00 2001 From: Abram Date: Fri, 29 Dec 2023 21:40:39 +0100 Subject: [PATCH 165/414] :art: Format - ran black --- .../agenta_backend/routers/annotations_router.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/agenta-backend/agenta_backend/routers/annotations_router.py b/agenta-backend/agenta_backend/routers/annotations_router.py index d844ad8873..5e068e6417 100644 --- a/agenta-backend/agenta_backend/routers/annotations_router.py +++ b/agenta-backend/agenta_backend/routers/annotations_router.py @@ -9,7 +9,7 @@ from agenta_backend.models.api.annotation_models import ( Annotation, NewAnnotation, - AnnotationScenarioUpdate + AnnotationScenarioUpdate, ) from agenta_backend.utils.common import check_access_to_app @@ -24,6 +24,7 @@ router = APIRouter() + @router.post("/") async def create_annotation( payload: NewAnnotation, @@ -103,9 +104,7 @@ async def fetch_annotation( return await annotation_service.fetch_annotation(annotation_id, **user_org_data) -@router.put( - "/{annotation_id}/annotation_scenario/{annotation_scenario_id}/" -) +@router.put("/{annotation_id}/annotation_scenario/{annotation_scenario_id}/") async def update_annotation_scenario_router( annotation_id: str, annotation_scenario_id: str, @@ -129,4 +128,4 @@ async def update_annotation_scenario_router( ) return Response(status_code=status.HTTP_204_NO_CONTENT) except UpdateAnnotationScenarioError as e: - raise HTTPException(status_code=500, detail=str(e)) from e \ No newline at end of file + raise HTTPException(status_code=500, detail=str(e)) from e From dfe16dd3d695211b5de730ec9381382ef73b758e Mon Sep 17 00:00:00 2001 From: Abram Date: Fri, 29 Dec 2023 21:46:14 +0100 Subject: [PATCH 166/414] Update - correct command to restart backend --- .github/workflows/run-backend-tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/run-backend-tests.yml b/.github/workflows/run-backend-tests.yml index 979dc49e96..ac07dea79c 100644 --- a/.github/workflows/run-backend-tests.yml +++ b/.github/workflows/run-backend-tests.yml @@ -19,7 +19,7 @@ jobs: run: OPENAI_API_KEY=${{ secrets.NEXT_PUBLIC_OPENAI_API_KEY }} docker-compose -f "docker-compose.test.yml" up -d --build - name: Restart Backend Service To Fetch Template Images - run: docker container agenta-backend-test restart + run: docker container restart agenta-backend-test - name: Check Templates Exists run: | From d99a1a8e832dbf6abd9672b419771d6ac8e1b7f9 Mon Sep 17 00:00:00 2001 
From: Abram Date: Sat, 30 Dec 2023 12:20:06 +0100 Subject: [PATCH 167/414] Retry 1: include print statement to discover the cause of failing create_app_from_template testcase --- .../tests/variants_evaluators_router/test_evaluators_router.py | 1 + 1 file changed, 1 insertion(+) diff --git a/agenta-backend/agenta_backend/tests/variants_evaluators_router/test_evaluators_router.py b/agenta-backend/agenta_backend/tests/variants_evaluators_router/test_evaluators_router.py index 8f32293bb1..4c4d8966f6 100644 --- a/agenta-backend/agenta_backend/tests/variants_evaluators_router/test_evaluators_router.py +++ b/agenta-backend/agenta_backend/tests/variants_evaluators_router/test_evaluators_router.py @@ -31,6 +31,7 @@ async def test_create_app_from_template( response = httpx.post( f"{BACKEND_API_HOST}/apps/app_and_variant_from_template/", json=payload ) + print("Response: ", response.json()) assert response.status_code == 200 From 241e1d746f24e3a2d32f6b3ef3ee535a6199fed0 Mon Sep 17 00:00:00 2001 From: Abram Date: Sat, 30 Dec 2023 12:38:15 +0100 Subject: [PATCH 168/414] Retry 2: raise a runtime error when starting container fails to get full traceback --- .../services/deployment_manager.py | 21 +++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/agenta-backend/agenta_backend/services/deployment_manager.py b/agenta-backend/agenta_backend/services/deployment_manager.py index 8a3b71b2ad..f356bf3f55 100644 --- a/agenta-backend/agenta_backend/services/deployment_manager.py +++ b/agenta-backend/agenta_backend/services/deployment_manager.py @@ -37,12 +37,21 @@ async def start_service( logger.debug(f"container_name: {container_name}") logger.debug(f"env_vars: {env_vars}") - results = docker_utils.start_container( - image_name=app_variant_db.image.tags, - uri_path=uri_path, - container_name=container_name, - env_vars=env_vars, - ) + try: + results = docker_utils.start_container( + image_name=app_variant_db.image.tags, + uri_path=uri_path, + container_name=container_name, + env_vars=env_vars, + ) + except Exception as exc: + import traceback + + full_traceback = traceback.format_exc() + raise RuntimeError( + f"An error occurred while starting the container: {exc}\n\nFull Traceback:\n{full_traceback}" + ) + uri = results["uri"] container_id = results["container_id"] container_name = results["container_name"] From 82001e4cff2896cb1b6ecfaaa9418fe3f59a0c38 Mon Sep 17 00:00:00 2001 From: Abram Date: Sat, 30 Dec 2023 12:44:48 +0100 Subject: [PATCH 169/414] Retry 3: raise a runtime error when starting container fails to get full traceback in start_variant function --- .../agenta_backend/services/app_manager.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/agenta-backend/agenta_backend/services/app_manager.py b/agenta-backend/agenta_backend/services/app_manager.py index b15c335c4f..d19f5a6812 100644 --- a/agenta-backend/agenta_backend/services/app_manager.py +++ b/agenta-backend/agenta_backend/services/app_manager.py @@ -97,9 +97,15 @@ async def start_variant( logger.error( f"Error starting Docker container for app variant {db_app_variant.app.app_name}/{db_app_variant.variant_name}: {str(e)}" ) - raise Exception( - f"Failed to start Docker container for app variant {db_app_variant.app.app_name}/{db_app_variant.variant_name} \n {str(e)}" - ) from e + # raise Exception( + # f"Failed to start Docker container for app variant {db_app_variant.app.app_name}/{db_app_variant.variant_name} \n {str(e)}" + # ) from e + import traceback + + full_traceback = 
traceback.format_exc() + raise RuntimeError( + f"An error occurred while starting the container: {str(e)}\n\nFull Traceback:\n{full_traceback}" + ) return URI(uri=deployment.uri) From 0086bfd3dca1a337a0b173c0b4a4991f0dfc66ff Mon Sep 17 00:00:00 2001 From: Abram Date: Sat, 30 Dec 2023 12:55:47 +0100 Subject: [PATCH 170/414] Retry 4: put client.containers.run logic in a try-except to get the full traceback of error occuring --- .../agenta_backend/services/app_manager.py | 10 +++---- .../services/deployment_manager.py | 20 +++++--------- .../agenta_backend/services/docker_utils.py | 26 ++++++++++++------- 3 files changed, 28 insertions(+), 28 deletions(-) diff --git a/agenta-backend/agenta_backend/services/app_manager.py b/agenta-backend/agenta_backend/services/app_manager.py index d19f5a6812..754e06afd5 100644 --- a/agenta-backend/agenta_backend/services/app_manager.py +++ b/agenta-backend/agenta_backend/services/app_manager.py @@ -100,12 +100,12 @@ async def start_variant( # raise Exception( # f"Failed to start Docker container for app variant {db_app_variant.app.app_name}/{db_app_variant.variant_name} \n {str(e)}" # ) from e - import traceback + # import traceback - full_traceback = traceback.format_exc() - raise RuntimeError( - f"An error occurred while starting the container: {str(e)}\n\nFull Traceback:\n{full_traceback}" - ) + # full_traceback = traceback.format_exc() + # raise RuntimeError( + # f"An error occurred while starting the container: {str(e)}\n\nFull Traceback:\n{full_traceback}" + # ) return URI(uri=deployment.uri) diff --git a/agenta-backend/agenta_backend/services/deployment_manager.py b/agenta-backend/agenta_backend/services/deployment_manager.py index f356bf3f55..a43e302a98 100644 --- a/agenta-backend/agenta_backend/services/deployment_manager.py +++ b/agenta-backend/agenta_backend/services/deployment_manager.py @@ -37,20 +37,12 @@ async def start_service( logger.debug(f"container_name: {container_name}") logger.debug(f"env_vars: {env_vars}") - try: - results = docker_utils.start_container( - image_name=app_variant_db.image.tags, - uri_path=uri_path, - container_name=container_name, - env_vars=env_vars, - ) - except Exception as exc: - import traceback - - full_traceback = traceback.format_exc() - raise RuntimeError( - f"An error occurred while starting the container: {exc}\n\nFull Traceback:\n{full_traceback}" - ) + results = docker_utils.start_container( + image_name=app_variant_db.image.tags, + uri_path=uri_path, + container_name=container_name, + env_vars=env_vars, + ) uri = results["uri"] container_id = results["container_id"] diff --git a/agenta-backend/agenta_backend/services/docker_utils.py b/agenta-backend/agenta_backend/services/docker_utils.py index 73c0b88a3e..55ffb52ea6 100644 --- a/agenta-backend/agenta_backend/services/docker_utils.py +++ b/agenta-backend/agenta_backend/services/docker_utils.py @@ -106,15 +106,23 @@ def start_container( env_vars = {} if env_vars is None else env_vars extra_hosts = {"host.docker.internal": "host-gateway"} - container = client.containers.run( - image, - detach=True, - labels=labels, - network="agenta-network", - name=container_name, - environment=env_vars, - extra_hosts=extra_hosts, - ) + try: + container = client.containers.run( + image, + detach=True, + labels=labels, + network="agenta-network", + name=container_name, + environment=env_vars, + extra_hosts=extra_hosts, + ) + except Exception as e: + import traceback + + full_traceback = traceback.format_exc() + raise RuntimeError( + f"An error occurred while running the 
container: {str(e)}\n\nFull Traceback:\n{full_traceback}" + ) # Check the container's status sleep(0.5) container.reload() # Refresh container data From c56ab8212b73840a1f639e09dfdb116b6604ce0d Mon Sep 17 00:00:00 2001 From: Abram Date: Sat, 30 Dec 2023 13:05:26 +0100 Subject: [PATCH 171/414] Retry 5: include raise exception to know cause of error in start_variant and view docker logs in workflow --- .github/workflows/run-backend-tests.yml | 4 ++++ .../agenta_backend/services/app_manager.py | 12 +++--------- 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/.github/workflows/run-backend-tests.yml b/.github/workflows/run-backend-tests.yml index ac07dea79c..d30e416986 100644 --- a/.github/workflows/run-backend-tests.yml +++ b/.github/workflows/run-backend-tests.yml @@ -42,5 +42,9 @@ jobs: - name: Run tests run: docker exec agenta-backend-test pytest + - name: Docker logs + if: always() # + run: docker ps -q | xargs -I {} docker logs {} + - name: Stop Docker Compose run: docker-compose down diff --git a/agenta-backend/agenta_backend/services/app_manager.py b/agenta-backend/agenta_backend/services/app_manager.py index 754e06afd5..b15c335c4f 100644 --- a/agenta-backend/agenta_backend/services/app_manager.py +++ b/agenta-backend/agenta_backend/services/app_manager.py @@ -97,15 +97,9 @@ async def start_variant( logger.error( f"Error starting Docker container for app variant {db_app_variant.app.app_name}/{db_app_variant.variant_name}: {str(e)}" ) - # raise Exception( - # f"Failed to start Docker container for app variant {db_app_variant.app.app_name}/{db_app_variant.variant_name} \n {str(e)}" - # ) from e - # import traceback - - # full_traceback = traceback.format_exc() - # raise RuntimeError( - # f"An error occurred while starting the container: {str(e)}\n\nFull Traceback:\n{full_traceback}" - # ) + raise Exception( + f"Failed to start Docker container for app variant {db_app_variant.app.app_name}/{db_app_variant.variant_name} \n {str(e)}" + ) from e return URI(uri=deployment.uri) From 2a55e8f11274e2f8b603d4511925198aef603034 Mon Sep 17 00:00:00 2001 From: Abram Date: Sat, 30 Dec 2023 13:21:15 +0100 Subject: [PATCH 172/414] Retry 5: refactor test compose to use agenta-network and revert start_container back to previous state --- .../agenta_backend/services/docker_utils.py | 26 +++++++------------ docker-compose.test.yml | 16 ++++++------ 2 files changed, 17 insertions(+), 25 deletions(-) diff --git a/agenta-backend/agenta_backend/services/docker_utils.py b/agenta-backend/agenta_backend/services/docker_utils.py index 55ffb52ea6..73c0b88a3e 100644 --- a/agenta-backend/agenta_backend/services/docker_utils.py +++ b/agenta-backend/agenta_backend/services/docker_utils.py @@ -106,23 +106,15 @@ def start_container( env_vars = {} if env_vars is None else env_vars extra_hosts = {"host.docker.internal": "host-gateway"} - try: - container = client.containers.run( - image, - detach=True, - labels=labels, - network="agenta-network", - name=container_name, - environment=env_vars, - extra_hosts=extra_hosts, - ) - except Exception as e: - import traceback - - full_traceback = traceback.format_exc() - raise RuntimeError( - f"An error occurred while running the container: {str(e)}\n\nFull Traceback:\n{full_traceback}" - ) + container = client.containers.run( + image, + detach=True, + labels=labels, + network="agenta-network", + name=container_name, + environment=env_vars, + extra_hosts=extra_hosts, + ) # Check the container's status sleep(0.5) container.reload() # Refresh container data diff --git 
a/docker-compose.test.yml b/docker-compose.test.yml index dabe04ddb1..ac6e5fde34 100644 --- a/docker-compose.test.yml +++ b/docker-compose.test.yml @@ -39,7 +39,7 @@ services: mongo: condition: service_healthy networks: - - agenta-test-network + - agenta-network agenta-web: container_name: agenta-web-test @@ -53,7 +53,7 @@ services: ports: - "3000:3000" networks: - - agenta-test-network + - agenta-network mongo: image: mongo:5.0 @@ -69,13 +69,13 @@ services: timeout: 10s retries: 20 networks: - - agenta-test-network + - agenta-network redis: image: redis:latest container_name: agenta-redis-test networks: - - agenta-test-network + - agenta-network volumes: - redis_data:/data @@ -91,7 +91,7 @@ services: RABBITMQ_DEFAULT_USER: "guest" RABBITMQ_DEFAULT_PASS: "guest" networks: - - agenta-test-network + - agenta-network celery_worker: build: ./agenta-backend @@ -111,11 +111,11 @@ services: - rabbitmq - redis networks: - - agenta-test-network + - agenta-network networks: - agenta-test-network: - name: agenta-test-network + agenta-network: + name: agenta-network volumes: mongodb_data: From 3319157104836b39c168ab765b4d28a829c3f3e1 Mon Sep 17 00:00:00 2001 From: Abram Date: Sat, 30 Dec 2023 14:53:04 +0100 Subject: [PATCH 173/414] Update - added testcase to know the status of app from template container --- .../test_evaluators_router.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/agenta-backend/agenta_backend/tests/variants_evaluators_router/test_evaluators_router.py b/agenta-backend/agenta_backend/tests/variants_evaluators_router/test_evaluators_router.py index 4c4d8966f6..87be8e6d44 100644 --- a/agenta-backend/agenta_backend/tests/variants_evaluators_router/test_evaluators_router.py +++ b/agenta-backend/agenta_backend/tests/variants_evaluators_router/test_evaluators_router.py @@ -31,10 +31,22 @@ async def test_create_app_from_template( response = httpx.post( f"{BACKEND_API_HOST}/apps/app_and_variant_from_template/", json=payload ) - print("Response: ", response.json()) assert response.status_code == 200 +@pytest.mark.asyncio +async def test_app_from_template_container_is_running(): + app = await engine.find_one(AppDB, AppDB.app_name == APP_NAME) + + response = httpx.client( + f"{BACKEND_API_HOST}/{str(app.organization.id)}/{app.app_name}/app/openapi.json" + ) + response_data = response.json() + assert response.status_code == 200 + assert "openapi" in response_data + assert isinstance(response_data, dict) + + @pytest.mark.asyncio async def test_get_evaluators_endpoint(): response = await test_client.get( From 938f4177110d2ac67841c5a72ac09a5f76a4e728 Mon Sep 17 00:00:00 2001 From: Abram Date: Sat, 30 Dec 2023 14:53:33 +0100 Subject: [PATCH 174/414] Update - extra-hosts to backend service --- docker-compose.test.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docker-compose.test.yml b/docker-compose.test.yml index ac6e5fde34..333ed82e8b 100644 --- a/docker-compose.test.yml +++ b/docker-compose.test.yml @@ -38,6 +38,8 @@ services: depends_on: mongo: condition: service_healthy + extra_hosts: + - "host.docker.internal:host-gateway" networks: - agenta-network From 1275043e3a3a175c7aaba6935609ff699fb6ca89 Mon Sep 17 00:00:00 2001 From: Akrem Abayed Date: Sun, 31 Dec 2023 07:42:48 +0100 Subject: [PATCH 175/414] add fetch annotation + fetch annotation + create annotation --- .../models/api/annotation_models.py | 26 +++- .../agenta_backend/models/converters.py | 14 ++ .../agenta_backend/models/db_models.py | 35 +++++ .../routers/annotations_router.py | 17 ++- 
.../services/annotation_manager.py | 126 ++++++++++++++++++ .../agenta_backend/services/db_manager.py | 63 +++++++++ 6 files changed, 268 insertions(+), 13 deletions(-) create mode 100644 agenta-backend/agenta_backend/services/annotation_manager.py diff --git a/agenta-backend/agenta_backend/models/api/annotation_models.py b/agenta-backend/agenta_backend/models/api/annotation_models.py index 2576aa5a98..d78dede6b7 100644 --- a/agenta-backend/agenta_backend/models/api/annotation_models.py +++ b/agenta-backend/agenta_backend/models/api/annotation_models.py @@ -1,11 +1,19 @@ from pydantic import BaseModel -from typing import List +from typing import List, Any +from enum import Enum + + +class AnnotationStatusEnum(str, Enum): + ANNOTATION_INITIALIZED = "ANNOTATION_INITIALIZED" + ANNOTATION_STARTED = "ANNOTATION_STARTED" + ANNOTATION_FINISHED = "ANNOTATION_FINISHED" + ANNOTATION_ERROR = "ANNOTATION_ERROR" class Annotation(BaseModel): app_id: str variants_ids: List[str] - annotation_key: str + annotation_name: str testset_id: str aggregated_results: List @@ -13,12 +21,22 @@ class Annotation(BaseModel): class NewAnnotation(BaseModel): app_id: str variants_ids: List[str] - annotation_key: str + annotation_name: str testset_id: str class AnnotationScenarioUpdate(BaseModel): app_id: str variants_ids: List[str] - annotation_key: str + annotation_name: str testset_id: str + + +class AnnotationScenario(BaseModel): + annotation: str + + +class AnnotationScenarioInput(BaseModel): + name: str + type: str + value: Any diff --git a/agenta-backend/agenta_backend/models/converters.py b/agenta-backend/agenta_backend/models/converters.py index ab7058fecc..228e149e53 100644 --- a/agenta-backend/agenta_backend/models/converters.py +++ b/agenta-backend/agenta_backend/models/converters.py @@ -5,6 +5,7 @@ from agenta_backend.services import db_manager from agenta_backend.models.api.user_models import User from agenta_backend.models.db_models import ( + AnnotationsDB, AppVariantDB, EvaluationScenarioResult, EvaluatorConfigDB, @@ -47,6 +48,8 @@ EvaluationScenarioOutput, ) +from agenta_backend.models.api.annotation_models import Annotation + import logging logger = logging.getLogger(__name__) @@ -342,3 +345,14 @@ def evaluator_config_db_to_pydantic(evaluator_config: EvaluatorConfigDB): evaluator_key=evaluator_config.evaluator_key, settings_values=evaluator_config.settings_values, ) + + +def annotation_db_to_pydantic(annotation_db: AnnotationsDB): + return Annotation( + id=str(annotation_db.id), + app_id=str(annotation_db.app.id), + annotation_name=annotation_db.annotation_name, + variants_ids=[str(variants_id) for variants_id in annotation_db.variants_ids], + testset_id=str(annotation_db.testset_id), + aggregated_results=annotation_db.aggregated_results, + ) diff --git a/agenta-backend/agenta_backend/models/db_models.py b/agenta-backend/agenta_backend/models/db_models.py index 79bdfa2af7..534068692a 100644 --- a/agenta-backend/agenta_backend/models/db_models.py +++ b/agenta-backend/agenta_backend/models/db_models.py @@ -330,3 +330,38 @@ class TraceDB(Model): class Config: collection = "traces" + + +class ABTestingAggregatedResult(EmbeddedModel): + variant_id: str + result: Result + + +class AnnotationsDB(Model): + app: AppDB = Reference(key_name="app") + organization: OrganizationDB = Reference(key_name="organization") + user: UserDB = Reference(key_name="user") + variants_ids: List[ObjectId] + testset_id: ObjectId + status: str = Field(default="ANNOTATION_INITIALIZED") + annotation_name: str + aggregated_results: 
List[ABTestingAggregatedResult] + created_at: datetime = Field(default=datetime.utcnow()) + updated_at: datetime = Field(default=datetime.utcnow()) + + +class AnnotationsScenariosDB(Model): + app: AppDB = Reference(key_name="app") + organization: OrganizationDB = Reference(key_name="organization") + user: UserDB = Reference(key_name="user") + variants_ids: List[ObjectId] + + inputs: List[EvaluationScenarioInputDB] + outputs: List[EvaluationScenarioOutputDB] + correct_answer: Optional[str] + is_pinned: Optional[bool] + note: Optional[str] + evaluators_configs: List[ObjectId] + results: List[EvaluationScenarioResult] + created_at: datetime = Field(default=datetime.utcnow()) + updated_at: datetime = Field(default=datetime.utcnow()) diff --git a/agenta-backend/agenta_backend/routers/annotations_router.py b/agenta-backend/agenta_backend/routers/annotations_router.py index d844ad8873..87d0870857 100644 --- a/agenta-backend/agenta_backend/routers/annotations_router.py +++ b/agenta-backend/agenta_backend/routers/annotations_router.py @@ -9,11 +9,11 @@ from agenta_backend.models.api.annotation_models import ( Annotation, NewAnnotation, - AnnotationScenarioUpdate + AnnotationScenarioUpdate, ) from agenta_backend.utils.common import check_access_to_app -from agenta_backend.services import db_manager +from agenta_backend.services import db_manager, annotation_manager if os.environ["FEATURE_FLAG"] in ["cloud", "ee"]: from agenta_backend.commons.services.selectors import ( # noqa pylint: disable-all @@ -24,6 +24,7 @@ router = APIRouter() + @router.post("/") async def create_annotation( payload: NewAnnotation, @@ -54,7 +55,7 @@ async def create_annotation( app_data = jsonable_encoder(app) new_annotation_data = payload.dict() - annotation = await annotation_service.create_new_annotation( + annotation = await annotation_manager.create_new_annotation( app_data=app_data, new_annotation_data=new_annotation_data, ) @@ -81,7 +82,7 @@ async def fetch_list_annotations( List[Annotation]: A list of annotations. """ user_org_data = await get_user_and_org_id(request.state.user_id) - return await annotation_service.fetch_list_annotations( + return await annotation_manager.fetch_list_annotations( app_id=app_id, **user_org_data ) @@ -100,12 +101,10 @@ async def fetch_annotation( Annotation: The fetched annotation. 
""" user_org_data = await get_user_and_org_id(request.state.user_id) - return await annotation_service.fetch_annotation(annotation_id, **user_org_data) + return await annotation_manager.fetch_annotation(annotation_id, **user_org_data) -@router.put( - "/{annotation_id}/annotation_scenario/{annotation_scenario_id}/" -) +@router.put("/{annotation_id}/annotation_scenario/{annotation_scenario_id}/") async def update_annotation_scenario_router( annotation_id: str, annotation_scenario_id: str, @@ -129,4 +128,4 @@ async def update_annotation_scenario_router( ) return Response(status_code=status.HTTP_204_NO_CONTENT) except UpdateAnnotationScenarioError as e: - raise HTTPException(status_code=500, detail=str(e)) from e \ No newline at end of file + raise HTTPException(status_code=500, detail=str(e)) from e diff --git a/agenta-backend/agenta_backend/services/annotation_manager.py b/agenta-backend/agenta_backend/services/annotation_manager.py new file mode 100644 index 0000000000..8f39e8096b --- /dev/null +++ b/agenta-backend/agenta_backend/services/annotation_manager.py @@ -0,0 +1,126 @@ +import os +import secrets +from typing import List, Dict + +from bson import ObjectId +from fastapi import HTTPException + +from agenta_backend.services import db_manager +from agenta_backend.models import converters +from agenta_backend.models.api.annotation_models import ( + Annotation, + AnnotationScenario, + AnnotationScenarioInput, + AnnotationStatusEnum, + NewAnnotation, + AnnotationScenarioUpdate, +) + + +from agenta_backend.models.db_models import ( + AnnotationsDB, + AppDB, +) + +from agenta_backend.utils.common import engine, check_access_to_app + + +async def _fetch_annotation_and_check_access( + annotation_id: str, **user_org_data: dict +) -> AnnotationsDB: + + annotation = await db_manager.fetch_annotation_by_id(annotation_id=annotation_id) + + if annotation is None: + raise HTTPException( + status_code=404, + detail=f"Annotation with id {annotation_id} not found", + ) + + access = await check_access_to_app( + user_org_data=user_org_data, app_id=annotation.app.id + ) + if not access: + raise HTTPException( + status_code=403, + detail=f"You do not have access to this app: {str(annotation.app.id)}", + ) + return annotation + + +async def fetch_list_annotations( + app_id: str, + **user_org_data: dict, +) -> List[Annotation]: + """ + Fetches a list of annotations based on the provided filtering criteria. + + Args: + app_id (str): The app ID to filter the annotations. + user_org_data (dict): User and organization data. + + Returns: + List[Annotation]: A list of annotations. + """ + + access = await check_access_to_app(user_org_data=user_org_data, app_id=app_id) + if not access: + raise HTTPException( + status_code=403, + detail=f"You do not have access to this app: {app_id}", + ) + + annotations_db = await db_manager.fetch_annotations_by_app_id(app_id=app_id) + + return [ + converters.annotation_db_to_pydantic(annotation) + for annotation in annotations_db + ] + + +async def fetch_annotation(annotation_id: str, **user_org_data: dict) -> Annotation: + """ + Fetches a single annotation based on its ID. + + Args: + annotation_id (str): The ID of the annotation. + user_org_data (dict): User and organization data. + + Returns: + Annotation: The fetched annotation. 
+ """ + annotation = await _fetch_annotation_and_check_access( + annotation_id=annotation_id, **user_org_data + ) + return converters.annotation_db_to_pydantic(annotation) + + +async def create_new_annotation( + app_data: dict, new_annotation_data: dict +) -> Annotation: + """ + Create a new annotation. + + Args: + app_data (dict): Required app data + new_annotation_data (dict): Required new annotation data + + Returns: + Annotation + """ + + new_annotation = NewAnnotation(**new_annotation_data) + app = AppDB(**app_data) + + print("are we here fel annotation!") + annotation_db = await db_manager.create_new_annotation( + app=app, + organization=app.organization, + user=app.user, + annotation_name=new_annotation.annotation_name, + testset_id=new_annotation.testset_id, + status=AnnotationStatusEnum.ANNOTATION_STARTED, + variants_ids=new_annotation.variants_ids, + ) + return converters.annotation_db_to_pydantic(annotation_db) + diff --git a/agenta-backend/agenta_backend/services/db_manager.py b/agenta-backend/agenta_backend/services/db_manager.py index 35032fa500..a6fdf09d2c 100644 --- a/agenta-backend/agenta_backend/services/db_manager.py +++ b/agenta-backend/agenta_backend/services/db_manager.py @@ -19,6 +19,7 @@ ) from agenta_backend.services.json_importer_helper import get_json from agenta_backend.models.db_models import ( + AnnotationsDB, Result, AggregatedResult, AppDB, @@ -1817,3 +1818,65 @@ async def delete_evaluator_config(evaluator_config_id: str) -> bool: return delete_result is not None except Exception as e: raise e + + +async def fetch_annotations_by_app_id(app_id: str) -> List[AnnotationsDB]: + """ + Fetches annotations from the database based on the provided app ID. + + Args: + app_id (str): The app ID to filter the annotations. + + Returns: + List[AnnotationsDB]: A list of annotation database objects. + """ + annotations_db = await engine.find( + AnnotationsDB, AnnotationsDB.app == ObjectId(app_id) + ) + return annotations_db + + +async def create_new_annotation( + app: AppDB, + organization: OrganizationDB, + user: UserDB, + testset_id: str, + status: str, + variants_ids: [str], + annotation_name: str, +) -> AnnotationsDB: + """Create a new annotation scenario. + Returns: + Annotation: The created annotation scenario. + """ + annotation = AnnotationsDB( + app=app, + organization=organization, + user=user, + testset_id=testset_id, + variants_ids=variants_ids, + annotation_name=annotation_name, + status=status, + aggregated_results=[], + created_at=datetime.now().isoformat(), + updated_at=datetime.now().isoformat(), + ) + await engine.save(annotation) + return annotation + + +async def fetch_annotation_by_id(annotation_id: str) -> Optional[AnnotationsDB]: + """ + Fetches an annotation from the database based on its ID. + + Args: + annotation_id (str): The unique identifier of the annotation. + + Returns: + Optional[AnnotationsDB]: The annotation database object if found, otherwise None. 
+ """ + + annotation = await engine.find_one( + AnnotationsDB, AnnotationsDB.id == ObjectId(annotation_id) + ) + return annotation From 1bd9c0f88dc668b9a5a09b26b714047a66106ec1 Mon Sep 17 00:00:00 2001 From: Akrem Abayed Date: Sun, 31 Dec 2023 07:48:54 +0100 Subject: [PATCH 176/414] remove print --- agenta-backend/agenta_backend/services/annotation_manager.py | 1 - 1 file changed, 1 deletion(-) diff --git a/agenta-backend/agenta_backend/services/annotation_manager.py b/agenta-backend/agenta_backend/services/annotation_manager.py index 8f39e8096b..4f53d8ee9e 100644 --- a/agenta-backend/agenta_backend/services/annotation_manager.py +++ b/agenta-backend/agenta_backend/services/annotation_manager.py @@ -112,7 +112,6 @@ async def create_new_annotation( new_annotation = NewAnnotation(**new_annotation_data) app = AppDB(**app_data) - print("are we here fel annotation!") annotation_db = await db_manager.create_new_annotation( app=app, organization=app.organization, From f8ee0f0ab9616f1ae612f818b831199defd3cd9c Mon Sep 17 00:00:00 2001 From: Akrem Abayed Date: Sun, 31 Dec 2023 09:56:31 +0100 Subject: [PATCH 177/414] create batch annotations scenarios --- .../agenta_backend/models/db_models.py | 35 ++++-- .../services/annotation_manager.py | 116 ++++++++++++++++++ .../agenta_backend/services/db_manager.py | 77 +++++++++++- .../services/evaluation_service.py | 1 + 4 files changed, 219 insertions(+), 10 deletions(-) diff --git a/agenta-backend/agenta_backend/models/db_models.py b/agenta-backend/agenta_backend/models/db_models.py index 534068692a..51988d76f0 100644 --- a/agenta-backend/agenta_backend/models/db_models.py +++ b/agenta-backend/agenta_backend/models/db_models.py @@ -332,7 +332,23 @@ class Config: collection = "traces" -class ABTestingAggregatedResult(EmbeddedModel): +class AnnotationScenarioInputDB(EmbeddedModel): + name: str + type: str + value: str + + +class AnnotationScenarioOutputDB(EmbeddedModel): + type: str + value: Any + + +class AnnoationResult(EmbeddedModel): + variant_id: str + result: Result + + +class AnnoatationScenarioResult(EmbeddedModel): variant_id: str result: Result @@ -345,23 +361,26 @@ class AnnotationsDB(Model): testset_id: ObjectId status: str = Field(default="ANNOTATION_INITIALIZED") annotation_name: str - aggregated_results: List[ABTestingAggregatedResult] + aggregated_results: List[AnnoationResult] created_at: datetime = Field(default=datetime.utcnow()) updated_at: datetime = Field(default=datetime.utcnow()) + class Config: + collection = "annotations" class AnnotationsScenariosDB(Model): app: AppDB = Reference(key_name="app") organization: OrganizationDB = Reference(key_name="organization") user: UserDB = Reference(key_name="user") + annotation_id: ObjectId variants_ids: List[ObjectId] - - inputs: List[EvaluationScenarioInputDB] - outputs: List[EvaluationScenarioOutputDB] - correct_answer: Optional[str] + inputs: List[AnnotationScenarioInputDB] + outputs: List[AnnotationScenarioOutputDB] is_pinned: Optional[bool] note: Optional[str] - evaluators_configs: List[ObjectId] - results: List[EvaluationScenarioResult] + results: List[AnnoatationScenarioResult] created_at: datetime = Field(default=datetime.utcnow()) updated_at: datetime = Field(default=datetime.utcnow()) + + class Config: + collection = "annotations_scenarios" \ No newline at end of file diff --git a/agenta-backend/agenta_backend/services/annotation_manager.py b/agenta-backend/agenta_backend/services/annotation_manager.py index 4f53d8ee9e..9bc334d13e 100644 --- 
a/agenta-backend/agenta_backend/services/annotation_manager.py +++ b/agenta-backend/agenta_backend/services/annotation_manager.py @@ -1,3 +1,4 @@ +import datetime import os import secrets from typing import List, Dict @@ -19,6 +20,7 @@ from agenta_backend.models.db_models import ( AnnotationsDB, + AnnotationsScenariosDB, AppDB, ) @@ -48,6 +50,39 @@ async def _fetch_annotation_and_check_access( return annotation +async def _fetch_annotation_scenario_and_check_access( + annotation_scenario_id: str, **user_org_data: dict +) -> AnnotationsScenariosDB: + # Fetch the annotation scenario by ID + annotation_scenario = await db_manager.fetch_annotation_scenario_by_id( + annotation_scenario_id=annotation_scenario_id + ) + if annotation_scenario is None: + raise HTTPException( + status_code=404, + detail=f"Annotation scenario with id {annotation_scenario_id} not found", + ) + annotation = annotation_scenario.annotation + + # Check if the annotation exists + if annotation is None: + raise HTTPException( + status_code=404, + detail=f"Annotation scenario for annotation scenario with id {annotation_scenario_id} not found", + ) + + # Check for access rights + access = await check_access_to_app( + user_org_data=user_org_data, app_id=annotation.app.id + ) + if not access: + raise HTTPException( + status_code=403, + detail=f"You do not have access to this app: {str(annotation.app.id)}", + ) + return annotation_scenario + + async def fetch_list_annotations( app_id: str, **user_org_data: dict, @@ -112,6 +147,8 @@ async def create_new_annotation( new_annotation = NewAnnotation(**new_annotation_data) app = AppDB(**app_data) + testset = await db_manager.fetch_testset_by_id(new_annotation.testset_id) + annotation_db = await db_manager.create_new_annotation( app=app, organization=app.organization, @@ -121,5 +158,84 @@ async def create_new_annotation( status=AnnotationStatusEnum.ANNOTATION_STARTED, variants_ids=new_annotation.variants_ids, ) + + annotations_scenarios = [] + for datapoint in testset.csvdata: + # TODO: make inputs dynamic + annotation_scenario = { + "annotation_id": ObjectId(annotation_db.id), + "inputs": [{"input_name": "country", "input_value": datapoint['country']}], + "user": ObjectId(app.user.id), + "organization": ObjectId(app.organization.id) + } + annotations_scenarios.append(annotation_scenario) + + db_manager.insert_many_documents_using_driver(annotations_scenarios, 'annotations_scenarios_db') + return converters.annotation_db_to_pydantic(annotation_db) + +async def create_annotation_scenario( + annotation_id: str, payload: AnnotationScenario, **user_org_data: dict +) -> None: + """ + Create a new annotation scenario. + + Args: + annotation_id (str): The ID of the annotation. + payload (AnnotationScenario): Annotation scenario data. + user_org_data (dict): User and organization data. + + Raises: + HTTPException: If annotation not found or access denied. 
+ """ + + scenario_inputs = [ + AnnotationScenarioInput( + input_name=input_item.input_name, + input_value=input_item.input_value, + ) + for input_item in payload.inputs + ] + + new_annotation_scenario = AnnotationsScenariosDB( + user=new_annotation_scenario.user, + organization=new_annotation_scenario.organization, + annotation_id=annotation_id, + inputs=scenario_inputs, + outputs=[], + is_pinned=False, + note="", + created_at=datetime.utcnow(), + updated_at=datetime.utcnow(), + ) + + await engine.save(new_annotation_scenario) + + +async def update_annotation_scenario( + annotation_scenario_id: str, + annotation_scenario_data: AnnotationScenarioUpdate, + **user_org_data, +) -> None: + """ + Updates an annotation scenario. + + Args: + annotation_scenario_id (str): The ID of the annotation scenario. + annotation_scenario_data (AnnotationScenarioUpdate): New data for the scenario. + annotation_type (AnnotationType): Type of the annotation. + user_org_data (dict): User and organization data. + + Raises: + HTTPException: If annotation scenario not found or access denied. + """ + annotation_scenario = await _fetch_annotation_scenario_and_check_access( + annotation_scenario_id=annotation_scenario_id, + **user_org_data, + ) + + updated_data = annotation_scenario_data.dict() + updated_data["updated_at"] = datetime.utcnow() + + await engine.save(annotation_scenario) diff --git a/agenta-backend/agenta_backend/services/db_manager.py b/agenta-backend/agenta_backend/services/db_manager.py index a6fdf09d2c..01492d7315 100644 --- a/agenta-backend/agenta_backend/services/db_manager.py +++ b/agenta-backend/agenta_backend/services/db_manager.py @@ -1,5 +1,6 @@ import os import logging +import pymongo from pathlib import Path from bson import ObjectId from datetime import datetime @@ -20,6 +21,7 @@ from agenta_backend.services.json_importer_helper import get_json from agenta_backend.models.db_models import ( AnnotationsDB, + AnnotationsScenariosDB, Result, AggregatedResult, AppDB, @@ -1432,8 +1434,6 @@ async def remove_old_template_from_db(tag_ids: list) -> None: def remove_document_using_driver(document_id: str, collection_name: str) -> None: """Deletes document from using pymongo driver""" - import pymongo - client = pymongo.MongoClient(os.environ["MONGODB_URI"]) db = client.get_database("agenta_v2") @@ -1880,3 +1880,76 @@ async def fetch_annotation_by_id(annotation_id: str) -> Optional[AnnotationsDB]: AnnotationsDB, AnnotationsDB.id == ObjectId(annotation_id) ) return annotation + + +async def fetch_annotation_scenario_by_id(annotation_id: str) -> Optional[AnnotationsScenariosDB]: + """ + Fetches an annotation from the database based on its ID. + + Args: + annotation_id (str): The unique identifier of the annotation. + + Returns: + Optional[AnnotationsDB]: The annotation database object if found, otherwise None. + """ + + annotation = await engine.find_one( + AnnotationsScenariosDB, AnnotationsScenariosDB.id == ObjectId(annotation_id) + ) + return annotation + + +async def create_annotation_scenario( + annotation: AnnotationsDB, + scenario_inputs: List[dict], + user: UserDB, + organization: OrganizationDB +) -> AnnotationsScenariosDB: + """ + Create a new annotation scenario in the database. + + Args: + annotation (AnnotationsDB): The annotation to which the scenario belongs. + scenario_inputs (List[dict]): List of inputs for the annotation scenario. + user (UserDB): User information. + organization (OrganizationDB): Organization information. 
+ + Returns: + AnnotationsScenariosDB: The created annotation scenario. + """ + new_annotation_scenario = AnnotationsScenariosDB( + user=user, + organization=organization, + annotation=annotation, + inputs=scenario_inputs, + outputs=[], + is_pinned=False, + note="", + created_at=datetime.utcnow(), + updated_at=datetime.utcnow(), + ) + await engine.save(new_annotation_scenario) + return new_annotation_scenario + + +def insert_many_documents_using_driver(documents: list, collection_name: str) -> None: + """ + Inserts multiple documents into a MongoDB collection using the pymongo driver. + + Args: + documents (list): A list of dictionaries, each representing a document to insert. + collection_name (str): The name of the MongoDB collection where documents will be inserted. + """ + client = pymongo.MongoClient(os.environ["MONGODB_URI"]) + db = client.get_database("agenta_v2") + + collection = db.get_collection(collection_name) + + for document in documents: + if '_id' in document and isinstance(document['_id'], str): + document['_id'] = ObjectId(document['_id']) + + inserted = collection.insert_many(documents) + print( + f"Inserted {len(inserted.inserted_ids)} documents into {collection_name} collection. Acknowledged: {inserted.acknowledged}" + ) diff --git a/agenta-backend/agenta_backend/services/evaluation_service.py b/agenta-backend/agenta_backend/services/evaluation_service.py index e6a6ac0ded..14001db5d1 100644 --- a/agenta-backend/agenta_backend/services/evaluation_service.py +++ b/agenta-backend/agenta_backend/services/evaluation_service.py @@ -12,6 +12,7 @@ EvaluationScenario, CustomEvaluationOutput, CustomEvaluationDetail, + EvaluationScenarioInput, EvaluationType, NewEvaluation, EvaluationScenarioUpdate, From d362ea112d8ec3ac692122b800adae2c5a859e1b Mon Sep 17 00:00:00 2001 From: Akrem Abayed Date: Sun, 31 Dec 2023 09:57:24 +0100 Subject: [PATCH 178/414] format --- agenta-backend/agenta_backend/models/db_models.py | 3 ++- .../agenta_backend/services/annotation_manager.py | 9 +++++---- agenta-backend/agenta_backend/services/db_manager.py | 10 ++++++---- 3 files changed, 13 insertions(+), 9 deletions(-) diff --git a/agenta-backend/agenta_backend/models/db_models.py b/agenta-backend/agenta_backend/models/db_models.py index 51988d76f0..5afb15f1aa 100644 --- a/agenta-backend/agenta_backend/models/db_models.py +++ b/agenta-backend/agenta_backend/models/db_models.py @@ -368,6 +368,7 @@ class AnnotationsDB(Model): class Config: collection = "annotations" + class AnnotationsScenariosDB(Model): app: AppDB = Reference(key_name="app") organization: OrganizationDB = Reference(key_name="organization") @@ -383,4 +384,4 @@ class AnnotationsScenariosDB(Model): updated_at: datetime = Field(default=datetime.utcnow()) class Config: - collection = "annotations_scenarios" \ No newline at end of file + collection = "annotations_scenarios" diff --git a/agenta-backend/agenta_backend/services/annotation_manager.py b/agenta-backend/agenta_backend/services/annotation_manager.py index 9bc334d13e..c0922a33fd 100644 --- a/agenta-backend/agenta_backend/services/annotation_manager.py +++ b/agenta-backend/agenta_backend/services/annotation_manager.py @@ -30,7 +30,6 @@ async def _fetch_annotation_and_check_access( annotation_id: str, **user_org_data: dict ) -> AnnotationsDB: - annotation = await db_manager.fetch_annotation_by_id(annotation_id=annotation_id) if annotation is None: @@ -164,13 +163,15 @@ async def create_new_annotation( # TODO: make inputs dynamic annotation_scenario = { "annotation_id": 
ObjectId(annotation_db.id), - "inputs": [{"input_name": "country", "input_value": datapoint['country']}], + "inputs": [{"input_name": "country", "input_value": datapoint["country"]}], "user": ObjectId(app.user.id), - "organization": ObjectId(app.organization.id) + "organization": ObjectId(app.organization.id), } annotations_scenarios.append(annotation_scenario) - db_manager.insert_many_documents_using_driver(annotations_scenarios, 'annotations_scenarios_db') + db_manager.insert_many_documents_using_driver( + annotations_scenarios, "annotations_scenarios_db" + ) return converters.annotation_db_to_pydantic(annotation_db) diff --git a/agenta-backend/agenta_backend/services/db_manager.py b/agenta-backend/agenta_backend/services/db_manager.py index 01492d7315..1cc0218b99 100644 --- a/agenta-backend/agenta_backend/services/db_manager.py +++ b/agenta-backend/agenta_backend/services/db_manager.py @@ -1882,7 +1882,9 @@ async def fetch_annotation_by_id(annotation_id: str) -> Optional[AnnotationsDB]: return annotation -async def fetch_annotation_scenario_by_id(annotation_id: str) -> Optional[AnnotationsScenariosDB]: +async def fetch_annotation_scenario_by_id( + annotation_id: str, +) -> Optional[AnnotationsScenariosDB]: """ Fetches an annotation from the database based on its ID. @@ -1903,7 +1905,7 @@ async def create_annotation_scenario( annotation: AnnotationsDB, scenario_inputs: List[dict], user: UserDB, - organization: OrganizationDB + organization: OrganizationDB, ) -> AnnotationsScenariosDB: """ Create a new annotation scenario in the database. @@ -1946,8 +1948,8 @@ def insert_many_documents_using_driver(documents: list, collection_name: str) -> collection = db.get_collection(collection_name) for document in documents: - if '_id' in document and isinstance(document['_id'], str): - document['_id'] = ObjectId(document['_id']) + if "_id" in document and isinstance(document["_id"], str): + document["_id"] = ObjectId(document["_id"]) inserted = collection.insert_many(documents) print( From 5ab2902c70e24d833992be69ce07d22013cc5352 Mon Sep 17 00:00:00 2001 From: Akrem Abayed Date: Sun, 31 Dec 2023 16:17:31 +0100 Subject: [PATCH 179/414] move result to shared models --- agenta-backend/agenta_backend/models/api/api_models.py | 5 +++++ .../agenta_backend/models/api/evaluation_model.py | 6 +----- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/agenta-backend/agenta_backend/models/api/api_models.py b/agenta-backend/agenta_backend/models/api/api_models.py index 8bd6aef44a..5319788dd9 100644 --- a/agenta-backend/agenta_backend/models/api/api_models.py +++ b/agenta-backend/agenta_backend/models/api/api_models.py @@ -5,6 +5,11 @@ from pydantic import BaseModel +class Result(BaseModel): + type: str + value: Any + + class GetConfigReponse(BaseModel): config_id: str config_name: str diff --git a/agenta-backend/agenta_backend/models/api/evaluation_model.py b/agenta-backend/agenta_backend/models/api/evaluation_model.py index 0d105b0805..b25dec6988 100644 --- a/agenta-backend/agenta_backend/models/api/evaluation_model.py +++ b/agenta-backend/agenta_backend/models/api/evaluation_model.py @@ -2,6 +2,7 @@ from datetime import datetime from pydantic import BaseModel, Field from typing import Optional, List, Dict, Any, Union +from agenta_backend.models.api.api_models import Result class Evaluator(BaseModel): @@ -50,11 +51,6 @@ class EvaluationScenarioStatusEnum(str, Enum): COMPARISON_RUN_STARTED = "COMPARISON_RUN_STARTED" -class Result(BaseModel): - type: str - value: Any - - class 
AggregatedResult(BaseModel): evaluator_config: EvaluatorConfig result: Result From bd8a16c346f3e59c21f2019d44855c381ca56256 Mon Sep 17 00:00:00 2001 From: Akrem Abayed Date: Sun, 31 Dec 2023 16:29:40 +0100 Subject: [PATCH 180/414] Add annotation_scenario resource and improve update_annotation_scenario --- .../models/api/annotation_models.py | 33 +++++++++-- .../agenta_backend/models/converters.py | 34 ++++++++++- .../agenta_backend/models/db_models.py | 1 - .../services/annotation_manager.py | 48 ++++----------- .../agenta_backend/services/db_manager.py | 59 +++++++++++++++---- 5 files changed, 121 insertions(+), 54 deletions(-) diff --git a/agenta-backend/agenta_backend/models/api/annotation_models.py b/agenta-backend/agenta_backend/models/api/annotation_models.py index d78dede6b7..94ffcc23d4 100644 --- a/agenta-backend/agenta_backend/models/api/annotation_models.py +++ b/agenta-backend/agenta_backend/models/api/annotation_models.py @@ -1,6 +1,7 @@ from pydantic import BaseModel -from typing import List, Any +from typing import Optional, List, Any from enum import Enum +from agenta_backend.models.api.api_models import Result class AnnotationStatusEnum(str, Enum): @@ -11,6 +12,7 @@ class AnnotationStatusEnum(str, Enum): class Annotation(BaseModel): + id: str app_id: str variants_ids: List[str] annotation_name: str @@ -26,14 +28,33 @@ class NewAnnotation(BaseModel): class AnnotationScenarioUpdate(BaseModel): - app_id: str - variants_ids: List[str] - annotation_name: str - testset_id: str + result: Result + + +class AnnotationScenarioInput(BaseModel): + name: str + type: str + value: Any + + +class AnnotationScenarioOutput(BaseModel): + type: str + value: Any + + +class AnnoatationScenarioResult(BaseModel): + variant_id: str + result: Result class AnnotationScenario(BaseModel): - annotation: str + id: Optional[str] + annotation_id: str + inputs: List[AnnotationScenarioInput] + outputs: List[AnnotationScenarioOutput] + is_pinned: Optional[bool] + note: Optional[str] + result: AnnoatationScenarioResult class AnnotationScenarioInput(BaseModel): diff --git a/agenta-backend/agenta_backend/models/converters.py b/agenta-backend/agenta_backend/models/converters.py index 228e149e53..6519099049 100644 --- a/agenta-backend/agenta_backend/models/converters.py +++ b/agenta-backend/agenta_backend/models/converters.py @@ -5,7 +5,9 @@ from agenta_backend.services import db_manager from agenta_backend.models.api.user_models import User from agenta_backend.models.db_models import ( + AnnoatationScenarioResult, AnnotationsDB, + AnnotationsScenariosDB, AppVariantDB, EvaluationScenarioResult, EvaluatorConfigDB, @@ -48,7 +50,12 @@ EvaluationScenarioOutput, ) -from agenta_backend.models.api.annotation_models import Annotation +from agenta_backend.models.api.annotation_models import ( + Annotation, + AnnotationScenario, + AnnotationScenarioInput, + AnnotationScenarioOutput, +) import logging @@ -356,3 +363,28 @@ def annotation_db_to_pydantic(annotation_db: AnnotationsDB): testset_id=str(annotation_db.testset_id), aggregated_results=annotation_db.aggregated_results, ) + + +def annotation_scenario_db_to_pydantic( + annotation_scenario_db: AnnotationsScenariosDB, +) -> AnnotationScenario: + return AnnotationScenario( + id=str(annotation_scenario_db.id), + annotation_id=str(annotation_scenario_db.annotation_id), + inputs=[ + AnnotationScenarioInput(**input_dict) + for input_dict in annotation_scenario_db.inputs + ], + outputs=[ + AnnotationScenarioOutput(**output_dict) + for output_dict in 
annotation_scenario_db.outputs + ], + is_pinned=annotation_scenario_db.is_pinned, + note=annotation_scenario_db.note, + results=[ + AnnoatationScenarioResult(**result_dict) + for result_dict in annotation_scenario_db.results + ], + created_at=annotation_scenario_db.created_at, + updated_at=annotation_scenario_db.updated_at, + ) diff --git a/agenta-backend/agenta_backend/models/db_models.py b/agenta-backend/agenta_backend/models/db_models.py index 5afb15f1aa..13aa2c448f 100644 --- a/agenta-backend/agenta_backend/models/db_models.py +++ b/agenta-backend/agenta_backend/models/db_models.py @@ -374,7 +374,6 @@ class AnnotationsScenariosDB(Model): organization: OrganizationDB = Reference(key_name="organization") user: UserDB = Reference(key_name="user") annotation_id: ObjectId - variants_ids: List[ObjectId] inputs: List[AnnotationScenarioInputDB] outputs: List[AnnotationScenarioOutputDB] is_pinned: Optional[bool] diff --git a/agenta-backend/agenta_backend/services/annotation_manager.py b/agenta-backend/agenta_backend/services/annotation_manager.py index c0922a33fd..23a2420776 100644 --- a/agenta-backend/agenta_backend/services/annotation_manager.py +++ b/agenta-backend/agenta_backend/services/annotation_manager.py @@ -1,7 +1,7 @@ import datetime import os import secrets -from typing import List, Dict +from typing import Any, List, Dict from bson import ObjectId from fastapi import HTTPException @@ -146,8 +146,6 @@ async def create_new_annotation( new_annotation = NewAnnotation(**new_annotation_data) app = AppDB(**app_data) - testset = await db_manager.fetch_testset_by_id(new_annotation.testset_id) - annotation_db = await db_manager.create_new_annotation( app=app, organization=app.organization, @@ -158,21 +156,6 @@ async def create_new_annotation( variants_ids=new_annotation.variants_ids, ) - annotations_scenarios = [] - for datapoint in testset.csvdata: - # TODO: make inputs dynamic - annotation_scenario = { - "annotation_id": ObjectId(annotation_db.id), - "inputs": [{"input_name": "country", "input_value": datapoint["country"]}], - "user": ObjectId(app.user.id), - "organization": ObjectId(app.organization.id), - } - annotations_scenarios.append(annotation_scenario) - - db_manager.insert_many_documents_using_driver( - annotations_scenarios, "annotations_scenarios_db" - ) - return converters.annotation_db_to_pydantic(annotation_db) @@ -216,27 +199,22 @@ async def create_annotation_scenario( async def update_annotation_scenario( annotation_scenario_id: str, - annotation_scenario_data: AnnotationScenarioUpdate, + updates: Dict[str, Any], **user_org_data, -) -> None: +) -> AnnotationScenario: """ - Updates an annotation scenario. + Edit an existing annotation scenario. Args: - annotation_scenario_id (str): The ID of the annotation scenario. - annotation_scenario_data (AnnotationScenarioUpdate): New data for the scenario. - annotation_type (AnnotationType): Type of the annotation. - user_org_data (dict): User and organization data. + annotation_scenario_id (str): The ID of the annotation scenario to be updated. + updates (Dict[str, Any]): A dictionary containing the updates. - Raises: - HTTPException: If annotation scenario not found or access denied. + Returns: + AnnotationScenario: The updated annotation scenario object. 
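+
+    Example:
+        Sketch of a partial update; the field names exist on
+        AnnotationsScenariosDB, while the concrete values and the scenario id
+        are placeholders:
+
+            updated = await update_annotation_scenario(
+                annotation_scenario_id=scenario_id,
+                updates={"is_pinned": True, "note": "reviewed"},
+                **user_org_data,
+            )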
""" - annotation_scenario = await _fetch_annotation_scenario_and_check_access( - annotation_scenario_id=annotation_scenario_id, - **user_org_data, + print("update_annotation_scenario") + annotation_scenario = await db_manager.update_annotation_scenario( + annotation_scenario_id, updates ) - - updated_data = annotation_scenario_data.dict() - updated_data["updated_at"] = datetime.utcnow() - - await engine.save(annotation_scenario) + print(annotation_scenario) + return converters.annotation_scenario_db_to_pydantic(annotation_scenario) diff --git a/agenta-backend/agenta_backend/services/db_manager.py b/agenta-backend/agenta_backend/services/db_manager.py index 1cc0218b99..7cdf89ef3e 100644 --- a/agenta-backend/agenta_backend/services/db_manager.py +++ b/agenta-backend/agenta_backend/services/db_manager.py @@ -1861,8 +1861,8 @@ async def create_new_annotation( created_at=datetime.now().isoformat(), updated_at=datetime.now().isoformat(), ) - await engine.save(annotation) - return annotation + new_annotation = await engine.save(annotation) + return new_annotation async def fetch_annotation_by_id(annotation_id: str) -> Optional[AnnotationsDB]: @@ -1901,11 +1901,16 @@ async def fetch_annotation_scenario_by_id( return annotation -async def create_annotation_scenario( - annotation: AnnotationsDB, - scenario_inputs: List[dict], - user: UserDB, +async def create_new_annotation_scenario( + app: AppDB, organization: OrganizationDB, + user: UserDB, + annotation_id: str, + inputs: List[dict], + outputs: List[dict], + isPinned: bool, + results: List, + note: str, ) -> AnnotationsScenariosDB: """ Create a new annotation scenario in the database. @@ -1920,13 +1925,15 @@ async def create_annotation_scenario( AnnotationsScenariosDB: The created annotation scenario. """ new_annotation_scenario = AnnotationsScenariosDB( + app=app, user=user, organization=organization, - annotation=annotation, - inputs=scenario_inputs, - outputs=[], - is_pinned=False, - note="", + annotation_id=annotation_id, + inputs=inputs, + outputs=outputs, + is_pinned=isPinned, + note=note, + results=results, created_at=datetime.utcnow(), updated_at=datetime.utcnow(), ) @@ -1955,3 +1962,33 @@ def insert_many_documents_using_driver(documents: list, collection_name: str) -> print( f"Inserted {len(inserted.inserted_ids)} documents into {collection_name} collection. Acknowledged: {inserted.acknowledged}" ) + + +async def update_annotation_scenario( + annotation_scenario_id: str, updates: Dict[str, Any] +) -> AnnotationsScenariosDB: + """ + Update an annotation scenario in the database with the provided id. + + Arguments: + annotation_scenario_id (str): The ID of the annotation scenario to be updated. + updates (Dict[str, Any]): The updates to apply to the annotation scenario. + + Returns: + AnnotationsScenariosDB: The updated annotation scenario object. 
+ """ + + annotation_scenario = await engine.find_one( + AnnotationsScenariosDB, + AnnotationsScenariosDB.id == ObjectId(annotation_scenario_id), + ) + + if not annotation_scenario: + raise HTTPException(status_code=404, detail="Annotation scenario not found") + + for key, value in updates.items(): + if hasattr(annotation_scenario, key): + setattr(annotation_scenario, key, value) + + await engine.save(annotation_scenario) + return annotation_scenario From 237044bf635f0aedc1d66de25b0481f2bb1e73d4 Mon Sep 17 00:00:00 2001 From: Akrem Abayed Date: Sun, 31 Dec 2023 16:31:19 +0100 Subject: [PATCH 181/414] add celery task to prepare annotations scenarios --- .../agenta_backend/celery_config.py | 5 ++ .../routers/annotations_router.py | 27 +++--- .../agenta_backend/tasks/annotations.py | 83 +++++++++++++++++++ 3 files changed, 105 insertions(+), 10 deletions(-) create mode 100644 agenta-backend/agenta_backend/tasks/annotations.py diff --git a/agenta-backend/agenta_backend/celery_config.py b/agenta-backend/agenta_backend/celery_config.py index df11dad091..1b643786a9 100644 --- a/agenta-backend/agenta_backend/celery_config.py +++ b/agenta-backend/agenta_backend/celery_config.py @@ -14,4 +14,9 @@ Exchange("agenta_backend.tasks.evaluations.evaluate"), routing_key="agenta_backend.tasks.evaluations.evaluate", ), + Queue( + "agenta_backend.tasks.annotations.prepare_scenarios", + Exchange("agenta_backend.tasks.annotations.prepare_scenarios"), + routing_key="agenta_backend.tasks.annotations.prepare_scenarios", + ), ) diff --git a/agenta-backend/agenta_backend/routers/annotations_router.py b/agenta-backend/agenta_backend/routers/annotations_router.py index 87d0870857..31fe1de639 100644 --- a/agenta-backend/agenta_backend/routers/annotations_router.py +++ b/agenta-backend/agenta_backend/routers/annotations_router.py @@ -12,9 +12,14 @@ AnnotationScenarioUpdate, ) +from agenta_backend.services.annotation_manager import update_annotation_scenario +from agenta_backend.tasks.evaluations import evaluate + from agenta_backend.utils.common import check_access_to_app from agenta_backend.services import db_manager, annotation_manager +from agenta_backend.tasks.annotations import prepare_scenarios + if os.environ["FEATURE_FLAG"] in ["cloud", "ee"]: from agenta_backend.commons.services.selectors import ( # noqa pylint: disable-all get_user_and_org_id, @@ -29,7 +34,7 @@ async def create_annotation( payload: NewAnnotation, request: Request, -): +)-> Annotation: """Creates a new annotation document Raises: HTTPException: _description_ @@ -60,6 +65,10 @@ async def create_annotation( new_annotation_data=new_annotation_data, ) + prepare_scenarios.delay( + app_data, new_annotation_data, annotation.id, annotation.testset_id + ) + return annotation except KeyError: raise HTTPException( @@ -120,12 +129,10 @@ async def update_annotation_scenario_router( None: 204 No Content status code upon successful update. 
""" user_org_data = await get_user_and_org_id(request.state.user_id) - try: - await update_annotation_scenario( - annotation_scenario_id, - annotation_scenario, - **user_org_data, - ) - return Response(status_code=status.HTTP_204_NO_CONTENT) - except UpdateAnnotationScenarioError as e: - raise HTTPException(status_code=500, detail=str(e)) from e + + await update_annotation_scenario( + annotation_scenario_id, + annotation_scenario, + **user_org_data, + ) + return Response(status_code=status.HTTP_204_NO_CONTENT) diff --git a/agenta-backend/agenta_backend/tasks/annotations.py b/agenta-backend/agenta_backend/tasks/annotations.py new file mode 100644 index 0000000000..b51a0c62fc --- /dev/null +++ b/agenta-backend/agenta_backend/tasks/annotations.py @@ -0,0 +1,83 @@ +import asyncio +from typing import List +from bson import ObjectId +from celery import shared_task +from collections import defaultdict + +from agenta_backend.services import llm_apps_service +from agenta_backend.services.db_manager import ( + fetch_annotation_by_id, + fetch_app_variant_by_id, + get_deployment_by_objectid, + fetch_testset_by_id, + create_new_annotation_scenario, +) +from agenta_backend.models.db_models import ( + AppDB, + AnnotationScenarioInputDB, + AnnotationScenarioOutputDB, + AnnotationScenarioInputDB, + AnnoatationScenarioResult, +) + +from agenta_backend.models.api.annotation_models import NewAnnotation + +@shared_task(queue="agenta_backend.tasks.annotations.prepare_scenarios") +def prepare_scenarios( + app_data: dict, new_annotation_data: dict, annotation_id: str, testset_id: str +): + loop = asyncio.get_event_loop() + app = AppDB(**app_data) + annotation = NewAnnotation(**new_annotation_data) + + testset = loop.run_until_complete(fetch_testset_by_id(testset_id)) + new_annotation_db = loop.run_until_complete(fetch_annotation_by_id(annotation_id)) + + for variant_id in annotation.variants_ids: + variant_id = str(variant_id) + + app_variant_db = loop.run_until_complete(fetch_app_variant_by_id(variant_id)) + deployment = loop.run_until_complete( + get_deployment_by_objectid(app_variant_db.base.deployment) + ) + + uri = deployment.uri.replace("http://localhost", "http://host.docker.internal") + + for data_point in testset.csvdata: + # 1. We prepare the inputs + raw_inputs = ( + app_variant_db.parameters.get("inputs", []) + if app_variant_db.parameters + else [] + ) + inputs = [] + if raw_inputs: + inputs = [ + AnnotationScenarioInputDB( + name=input_item["name"], + type="text", + value=data_point[input_item["name"]], + ) + for input_item in raw_inputs + ] + + # 2. We get the output from the llm app + # TODO: make outputs for all variants + variant_output = llm_apps_service.get_llm_app_output(uri, data_point) + + # 3. 
We create a new annotation scenario + annotation_scenario = loop.run_until_complete( + create_new_annotation_scenario( + app=app, + user=app.user, + organization=app.organization, + annotation_id=new_annotation_db.id, + inputs= inputs, + outputs=[ + AnnotationScenarioOutputDB(type="text", value=variant_output) + ], + isPinned= False, + note="", + results=[] + ) + ) From fd8803ae0af645aa1a03c3b99933a7f8ca854e79 Mon Sep 17 00:00:00 2001 From: Akrem Abayed Date: Sun, 31 Dec 2023 16:41:23 +0100 Subject: [PATCH 182/414] format --- .../agenta_backend/routers/annotations_router.py | 2 +- agenta-backend/agenta_backend/tasks/annotations.py | 7 ++++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/agenta-backend/agenta_backend/routers/annotations_router.py b/agenta-backend/agenta_backend/routers/annotations_router.py index 31fe1de639..aca56946d2 100644 --- a/agenta-backend/agenta_backend/routers/annotations_router.py +++ b/agenta-backend/agenta_backend/routers/annotations_router.py @@ -34,7 +34,7 @@ async def create_annotation( payload: NewAnnotation, request: Request, -)-> Annotation: +) -> Annotation: """Creates a new annotation document Raises: HTTPException: _description_ diff --git a/agenta-backend/agenta_backend/tasks/annotations.py b/agenta-backend/agenta_backend/tasks/annotations.py index b51a0c62fc..f2bb11194e 100644 --- a/agenta-backend/agenta_backend/tasks/annotations.py +++ b/agenta-backend/agenta_backend/tasks/annotations.py @@ -22,6 +22,7 @@ from agenta_backend.models.api.annotation_models import NewAnnotation + @shared_task(queue="agenta_backend.tasks.annotations.prepare_scenarios") def prepare_scenarios( app_data: dict, new_annotation_data: dict, annotation_id: str, testset_id: str @@ -72,12 +73,12 @@ def prepare_scenarios( user=app.user, organization=app.organization, annotation_id=new_annotation_db.id, - inputs= inputs, + inputs=inputs, outputs=[ AnnotationScenarioOutputDB(type="text", value=variant_output) ], - isPinned= False, + isPinned=False, note="", - results=[] + results=[], ) ) From dc2c6a533f22efc88280a5fd9f3227481bca32d8 Mon Sep 17 00:00:00 2001 From: Abram Date: Sun, 31 Dec 2023 20:52:41 +0100 Subject: [PATCH 183/414] Update - added backoff retry-on-exception --- .../services/llm_apps_service.py | 45 ++++++++++--------- 1 file changed, 23 insertions(+), 22 deletions(-) diff --git a/agenta-backend/agenta_backend/services/llm_apps_service.py b/agenta-backend/agenta_backend/services/llm_apps_service.py index 9da5104418..3f253340ff 100644 --- a/agenta-backend/agenta_backend/services/llm_apps_service.py +++ b/agenta-backend/agenta_backend/services/llm_apps_service.py @@ -1,28 +1,29 @@ import httpx +import backoff +@backoff.on_exception( + backoff.expo, + (httpx.TimeoutException, httpx.ConnectTimeout, httpx.ConnectError), + max_tries=2, +) def get_llm_app_output(uri, input): - try: - url = f"{uri}/generate" + url = f"{uri}/generate" - # TODO: adjust these hardcoded values in this payload - payload = { - "temperature": 1, - "model": "gpt-3.5-turbo", - "max_tokens": -1, - "prompt_system": "You are an expert in geography.", - "prompt_user": f"What is the capital of {input}?", - "top_p": 1, - "inputs": {"country": input}, - } + # TODO: adjust these hardcoded values in this payload + payload = { + "temperature": 1, + "model": "gpt-3.5-turbo", + "max_tokens": -1, + "prompt_system": "You are an expert in geography.", + "prompt_user": f"What is the capital of {input}?", + "top_p": 1, + "inputs": {"country": input}, + } - with httpx.Client() as client: - response = 
client.post(url, json=payload) - response.raise_for_status() - return response.json() - except httpx.HTTPError as e: - print(f"An HTTP error occurred: {e}") - except Exception as e: - print(f"An error occurred: {e}") - - return None + with httpx.Client() as client: + response = client.post( + url, json=payload, timeout=httpx.Timeout(timeout=5, read=None, write=5) + ) + response.raise_for_status() + return response.json() From cd41eefaa449c9398f88ce47bed2578938cb72b3 Mon Sep 17 00:00:00 2001 From: Abram Date: Sun, 31 Dec 2023 21:04:12 +0100 Subject: [PATCH 184/414] Update - modified backend url for auto webhook test evaluator config fixture --- .../tests/variants_evaluators_router/conftest.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/agenta-backend/agenta_backend/tests/variants_evaluators_router/conftest.py b/agenta-backend/agenta_backend/tests/variants_evaluators_router/conftest.py index 65d6a13bca..b75c270a2b 100644 --- a/agenta-backend/agenta_backend/tests/variants_evaluators_router/conftest.py +++ b/agenta-backend/agenta_backend/tests/variants_evaluators_router/conftest.py @@ -13,8 +13,9 @@ engine = DBEngine().engine() # Set global variables +BACKEND_URI = "http://localhost:8000/" +ENVIRONMENT = os.environ.get("ENVIRONMENT") OPEN_AI_KEY = os.environ.get("OPENAI_API_KEY") -BACKEND_URI = "http://localhost:8001/" @pytest.fixture(scope="session") @@ -91,12 +92,17 @@ def auto_regex_test_evaluator_config(): @pytest.fixture() def auto_webhook_test_evaluator_config(): + url = ( + "http://host.docker.internal/api" + if ENVIRONMENT == "development" + else "http://agenta-backend-test:8000" + ) return { "app_id": "string", "name": "WebhookEvaluator", "evaluator_key": "auto_webhook_test", "settings_values": { - "webhook_url": f"{BACKEND_URI}evaluations/webhook_example_fake/", + "webhook_url": f"{url}/evaluations/webhook_example_fake/", "webhook_body": {}, }, } From 86d076bc1295392582f63fc9060228a96b7d4808 Mon Sep 17 00:00:00 2001 From: Abram Date: Sun, 31 Dec 2023 21:05:13 +0100 Subject: [PATCH 185/414] Update - modified testcase to know app from template container is running --- .../test_evaluators_router.py | 35 ++++++++++++++----- 1 file changed, 26 insertions(+), 9 deletions(-) diff --git a/agenta-backend/agenta_backend/tests/variants_evaluators_router/test_evaluators_router.py b/agenta-backend/agenta_backend/tests/variants_evaluators_router/test_evaluators_router.py index 87be8e6d44..c62c93e1e7 100644 --- a/agenta-backend/agenta_backend/tests/variants_evaluators_router/test_evaluators_router.py +++ b/agenta-backend/agenta_backend/tests/variants_evaluators_router/test_evaluators_router.py @@ -1,9 +1,16 @@ import httpx import pytest import asyncio + from agenta_backend.models.db_engine import DBEngine from agenta_backend.models.api.evaluation_model import EvaluationStatusEnum -from agenta_backend.models.db_models import EvaluationDB, AppDB, TestSetDB, AppVariantDB +from agenta_backend.models.db_models import ( + EvaluationDB, + AppDB, + TestSetDB, + AppVariantDB, + DeploymentDB, +) # Initialize database engine @@ -15,7 +22,7 @@ # Set global variables APP_NAME = "evaluation_in_backend" -BACKEND_API_HOST = "http://localhost:8001" +BACKEND_API_HOST = "http://localhost:8000" @pytest.mark.asyncio @@ -37,14 +44,24 @@ async def test_create_app_from_template( @pytest.mark.asyncio async def test_app_from_template_container_is_running(): app = await engine.find_one(AppDB, AppDB.app_name == APP_NAME) + deployment = await engine.find_one(DeploymentDB, DeploymentDB.app == 
app.id) - response = httpx.client( - f"{BACKEND_API_HOST}/{str(app.organization.id)}/{app.app_name}/app/openapi.json" - ) - response_data = response.json() - assert response.status_code == 200 - assert "openapi" in response_data - assert isinstance(response_data, dict) + # Prepare and start short-polling request + max_attempts = 10 + intervals = 2 # seconds + for _ in range(max_attempts): + uri = deployment.uri.replace("http://localhost", "http://host.docker.internal") + response = httpx.get(url=uri + "/openapi.json", timeout=timeout) + if response.status_code == 200: + response_data = response.json() + assert "openapi" in response_data + assert isinstance(response_data, dict) + return + await asyncio.sleep(intervals) + + assert ( + False + ), f"Could not reach {app.app_name} running container within the specified polling time" @pytest.mark.asyncio From d37b378ca54b707970a722151e20d50fb9d6cdfc Mon Sep 17 00:00:00 2001 From: Abram Date: Sun, 31 Dec 2023 21:06:26 +0100 Subject: [PATCH 186/414] Update - added traefix and label to agenta-backend and modified backend host in tests modules --- .../test_observability_router.py | 2 +- .../test_organization_router.py | 2 +- .../testset_router/test_testset_router.py | 2 +- .../user_profile_router/test_user_profile.py | 2 +- .../test_app_variant_router.py | 2 +- docker-compose.test.yml | 49 +++++++++++-------- 6 files changed, 33 insertions(+), 26 deletions(-) diff --git a/agenta-backend/agenta_backend/tests/observability_router/test_observability_router.py b/agenta-backend/agenta_backend/tests/observability_router/test_observability_router.py index 35dfb450fe..33a22d47fe 100644 --- a/agenta-backend/agenta_backend/tests/observability_router/test_observability_router.py +++ b/agenta-backend/agenta_backend/tests/observability_router/test_observability_router.py @@ -28,7 +28,7 @@ timeout = httpx.Timeout(timeout=5, read=None, write=5) # Set global variables -BACKEND_API_HOST = "http://localhost:8001" +BACKEND_API_HOST = "http://localhost:8000" @pytest.mark.asyncio diff --git a/agenta-backend/agenta_backend/tests/organization_router/test_organization_router.py b/agenta-backend/agenta_backend/tests/organization_router/test_organization_router.py index db06d86840..54e29df6ce 100644 --- a/agenta-backend/agenta_backend/tests/organization_router/test_organization_router.py +++ b/agenta-backend/agenta_backend/tests/organization_router/test_organization_router.py @@ -16,7 +16,7 @@ timeout = httpx.Timeout(timeout=5, read=None, write=5) # Set global variables -BACKEND_API_HOST = "http://localhost:8001" +BACKEND_API_HOST = "http://localhost:8000" @pytest.mark.asyncio diff --git a/agenta-backend/agenta_backend/tests/testset_router/test_testset_router.py b/agenta-backend/agenta_backend/tests/testset_router/test_testset_router.py index 894fc40f08..6c95449239 100644 --- a/agenta-backend/agenta_backend/tests/testset_router/test_testset_router.py +++ b/agenta-backend/agenta_backend/tests/testset_router/test_testset_router.py @@ -18,7 +18,7 @@ timeout = httpx.Timeout(timeout=5, read=None, write=5) # Set global variables -BACKEND_API_HOST = "http://localhost:8001" +BACKEND_API_HOST = "http://localhost:8000" TESTSET_SUBMODULE_DIR = Path(__file__).parent diff --git a/agenta-backend/agenta_backend/tests/user_profile_router/test_user_profile.py b/agenta-backend/agenta_backend/tests/user_profile_router/test_user_profile.py index 6d7912aaf3..b921247c33 100644 --- a/agenta-backend/agenta_backend/tests/user_profile_router/test_user_profile.py +++ 
b/agenta-backend/agenta_backend/tests/user_profile_router/test_user_profile.py @@ -15,7 +15,7 @@ timeout = httpx.Timeout(timeout=5, read=None, write=5) # Set global variables -BACKEND_API_HOST = "http://localhost:8001" +BACKEND_API_HOST = "http://localhost:8000" @pytest.mark.asyncio diff --git a/agenta-backend/agenta_backend/tests/variants_router/test_app_variant_router.py b/agenta-backend/agenta_backend/tests/variants_router/test_app_variant_router.py index 24a6fb8daa..e90be8c90c 100644 --- a/agenta-backend/agenta_backend/tests/variants_router/test_app_variant_router.py +++ b/agenta-backend/agenta_backend/tests/variants_router/test_app_variant_router.py @@ -29,7 +29,7 @@ logger.setLevel(logging.DEBUG) # Set global variables -BACKEND_API_HOST = "http://localhost:8001" +BACKEND_API_HOST = "http://localhost:8000" @pytest.mark.asyncio diff --git a/docker-compose.test.yml b/docker-compose.test.yml index 333ed82e8b..8bb638f6b0 100644 --- a/docker-compose.test.yml +++ b/docker-compose.test.yml @@ -1,6 +1,17 @@ version: '3.8' - services: + reverse-proxy: + image: traefik:v2.10 + container_name: agenta-reverse_proxy-test + command: --api.dashboard=true --api.insecure=true --providers.docker --entrypoints.web.address=:80 + ports: + - "80:80" + - "8080:8080" + volumes: + - /var/run/docker.sock:/var/run/docker.sock + networks: + - agenta-network + backend: build: ./agenta-backend container_name: agenta-backend-test @@ -8,8 +19,8 @@ services: - MONGODB_URI=mongodb://username:password@mongo:27017/ - REDIS_URL=redis://redis:6379/0 - ENVIRONMENT=development - - BARE_DOMAIN_NAME=localhost:8001 - - DOMAIN_NAME=http://localhost:8001 + - BARE_DOMAIN_NAME=localhost + - DOMAIN_NAME=http://localhost - CELERY_BROKER_URL=amqp://guest@rabbitmq// - CELERY_RESULT_BACKEND=redis://redis:6379/0 - DATABASE_MODE=v2 @@ -21,6 +32,14 @@ services: - ./agenta-backend/agenta_backend:/app/agenta_backend - ./agenta-backend/tests:/app/tests - /var/run/docker.sock:/var/run/docker.sock + labels: + - "traefik.http.routers.backend.rule=PathPrefix(`/api/`)" + - "traefik.http.routers.backend.entrypoints=web" + - "traefik.http.middlewares.backend-strip.stripprefix.prefixes=/api" + - "traefik.http.middlewares.backend-strip.stripprefix.forceslash=true" + - "traefik.http.routers.backend.middlewares=backend-strip" + - "traefik.http.services.backend.loadbalancer.server.port=8000" + - "traefik.http.routers.backend.service=backend" command: [ "uvicorn", @@ -28,32 +47,18 @@ services: "--host", "0.0.0.0", "--port", - "8001", + "8000", "--reload", "--log-level", "info", + "--root-path", + "/api", ] - ports: - - "8000:8001" depends_on: mongo: condition: service_healthy extra_hosts: - - "host.docker.internal:host-gateway" - networks: - - agenta-network - - agenta-web: - container_name: agenta-web-test - build: - context: ./agenta-web - dockerfile: dev.Dockerfile - volumes: - - ./agenta-web/src:/app/src - - ./agenta-web/public:/app/public - - .nextjs_cache:/app/.next - ports: - - "3000:3000" + - host.docker.internal:host-gateway networks: - agenta-network @@ -112,6 +117,8 @@ services: depends_on: - rabbitmq - redis + extra_hosts: + - host.docker.internal:host-gateway networks: - agenta-network From 5e6c8298e3e55fc0d64a87a0ffdc1187bae3ce9a Mon Sep 17 00:00:00 2001 From: Abram Date: Sun, 31 Dec 2023 21:16:13 +0100 Subject: [PATCH 187/414] Update - modified step 5 and 6 in run-backend-tests workflow --- .github/workflows/run-backend-tests.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/run-backend-tests.yml 
b/.github/workflows/run-backend-tests.yml index d30e416986..ad2423c141 100644 --- a/.github/workflows/run-backend-tests.yml +++ b/.github/workflows/run-backend-tests.yml @@ -24,7 +24,7 @@ jobs: - name: Check Templates Exists run: | while true; do - if curl -s http://localhost:8000/containers/templates/; then + if curl -s http://localhost/api/containers/templates/; then break fi sleep 5 @@ -33,7 +33,7 @@ jobs: - name: Wait for Backend Service run: | while true; do - if curl -s http://localhost:8000/openapi.json; then + if curl -s http://localhost/api/openapi.json; then break fi sleep 5 From 60f3eddb1305dd898b4fbc3a9cbb4d86a722e69c Mon Sep 17 00:00:00 2001 From: Abram Date: Sun, 31 Dec 2023 21:45:39 +0100 Subject: [PATCH 188/414] Update - added environment to switch between backend api host --- .github/workflows/run-backend-tests.yml | 2 +- .../observability_router/test_observability_router.py | 8 ++++++-- .../organization_router/test_organization_router.py | 9 +++++++-- .../tests/testset_router/test_testset_router.py | 11 ++++++++--- .../tests/user_profile_router/test_user_profile.py | 10 +++++++--- .../tests/variants_evaluators_router/conftest.py | 8 ++++++-- .../test_evaluators_router.py | 7 ++++++- .../tests/variants_router/test_app_variant_router.py | 7 ++++++- docker-compose.test.yml | 2 +- 9 files changed, 48 insertions(+), 16 deletions(-) diff --git a/.github/workflows/run-backend-tests.yml b/.github/workflows/run-backend-tests.yml index ad2423c141..dab591f9fa 100644 --- a/.github/workflows/run-backend-tests.yml +++ b/.github/workflows/run-backend-tests.yml @@ -16,7 +16,7 @@ jobs: run: sudo apt install curl -y - name: Start Docker Compose - run: OPENAI_API_KEY=${{ secrets.NEXT_PUBLIC_OPENAI_API_KEY }} docker-compose -f "docker-compose.test.yml" up -d --build + run: OPENAI_API_KEY=${{ secrets.NEXT_PUBLIC_OPENAI_API_KEY }} ENVIRONMENT=test docker-compose -f "docker-compose.test.yml" up -d --build - name: Restart Backend Service To Fetch Template Images run: docker container restart agenta-backend-test diff --git a/agenta-backend/agenta_backend/tests/observability_router/test_observability_router.py b/agenta-backend/agenta_backend/tests/observability_router/test_observability_router.py index 33a22d47fe..225c99c4b9 100644 --- a/agenta-backend/agenta_backend/tests/observability_router/test_observability_router.py +++ b/agenta-backend/agenta_backend/tests/observability_router/test_observability_router.py @@ -1,4 +1,4 @@ -import json +import os import pytest import random from typing import List @@ -28,7 +28,11 @@ timeout = httpx.Timeout(timeout=5, read=None, write=5) # Set global variables -BACKEND_API_HOST = "http://localhost:8000" +ENVIRONMENT = os.environ.get("ENVIRONMENT") +if ENVIRONMENT == "development": + BACKEND_API_HOST = "http://localhost:8000" +elif ENVIRONMENT == "test": # github actions environment + BACKEND_API_HOST = "http://localhost/api" @pytest.mark.asyncio diff --git a/agenta-backend/agenta_backend/tests/organization_router/test_organization_router.py b/agenta-backend/agenta_backend/tests/organization_router/test_organization_router.py index 54e29df6ce..fde28b7d8a 100644 --- a/agenta-backend/agenta_backend/tests/organization_router/test_organization_router.py +++ b/agenta-backend/agenta_backend/tests/organization_router/test_organization_router.py @@ -1,4 +1,4 @@ -import pytest +import os from agenta_backend.services import selectors from agenta_backend.models.db_models import UserDB @@ -6,6 +6,7 @@ from agenta_backend.models.api.organization_models import 
OrganizationOutput import httpx +import pytest # Initialize database engine @@ -16,7 +17,11 @@ timeout = httpx.Timeout(timeout=5, read=None, write=5) # Set global variables -BACKEND_API_HOST = "http://localhost:8000" +ENVIRONMENT = os.environ.get("ENVIRONMENT") +if ENVIRONMENT == "development": + BACKEND_API_HOST = "http://localhost:8000" +elif ENVIRONMENT == "test": # github actions environment + BACKEND_API_HOST = "http://localhost/api" @pytest.mark.asyncio diff --git a/agenta-backend/agenta_backend/tests/testset_router/test_testset_router.py b/agenta-backend/agenta_backend/tests/testset_router/test_testset_router.py index 6c95449239..a238e7b23b 100644 --- a/agenta-backend/agenta_backend/tests/testset_router/test_testset_router.py +++ b/agenta-backend/agenta_backend/tests/testset_router/test_testset_router.py @@ -1,4 +1,4 @@ -import pytest +import os from pathlib import Path from agenta_backend.models.db_engine import DBEngine @@ -6,9 +6,10 @@ AppDB, TestSetDB, ) - import httpx +import pytest + # Initialize database engine engine = DBEngine().engine() @@ -18,7 +19,11 @@ timeout = httpx.Timeout(timeout=5, read=None, write=5) # Set global variables -BACKEND_API_HOST = "http://localhost:8000" +ENVIRONMENT = os.environ.get("ENVIRONMENT") +if ENVIRONMENT == "development": + BACKEND_API_HOST = "http://localhost:8000" +elif ENVIRONMENT == "test": # github actions environment + BACKEND_API_HOST = "http://localhost/api" TESTSET_SUBMODULE_DIR = Path(__file__).parent diff --git a/agenta-backend/agenta_backend/tests/user_profile_router/test_user_profile.py b/agenta-backend/agenta_backend/tests/user_profile_router/test_user_profile.py index b921247c33..b902f9bc57 100644 --- a/agenta-backend/agenta_backend/tests/user_profile_router/test_user_profile.py +++ b/agenta-backend/agenta_backend/tests/user_profile_router/test_user_profile.py @@ -1,11 +1,11 @@ +import os +import httpx import pytest from agenta_backend.models.db_models import UserDB from agenta_backend.models.db_engine import DBEngine from agenta_backend.models.api.user_models import User -import httpx - # Initialize database engine engine = DBEngine().engine() @@ -15,7 +15,11 @@ timeout = httpx.Timeout(timeout=5, read=None, write=5) # Set global variables -BACKEND_API_HOST = "http://localhost:8000" +ENVIRONMENT = os.environ.get("ENVIRONMENT") +if ENVIRONMENT == "development": + BACKEND_API_HOST = "http://localhost:8000" +elif ENVIRONMENT == "test": # github actions environment + BACKEND_API_HOST = "http://localhost/api" @pytest.mark.asyncio diff --git a/agenta-backend/agenta_backend/tests/variants_evaluators_router/conftest.py b/agenta-backend/agenta_backend/tests/variants_evaluators_router/conftest.py index b75c270a2b..93c44e832f 100644 --- a/agenta-backend/agenta_backend/tests/variants_evaluators_router/conftest.py +++ b/agenta-backend/agenta_backend/tests/variants_evaluators_router/conftest.py @@ -13,14 +13,18 @@ engine = DBEngine().engine() # Set global variables -BACKEND_URI = "http://localhost:8000/" ENVIRONMENT = os.environ.get("ENVIRONMENT") OPEN_AI_KEY = os.environ.get("OPENAI_API_KEY") +if ENVIRONMENT == "development": + BACKEND_API_HOST = "http://localhost:8000" +elif ENVIRONMENT == "test": # github actions environment + BACKEND_API_HOST = "http://localhost/api" + @pytest.fixture(scope="session") def fetch_templates(): - response = httpx.get(f"{BACKEND_URI}containers/templates/") + response = httpx.get(f"{BACKEND_API_HOST}/containers/templates/") response_data = response.json() return response_data diff --git 
a/agenta-backend/agenta_backend/tests/variants_evaluators_router/test_evaluators_router.py b/agenta-backend/agenta_backend/tests/variants_evaluators_router/test_evaluators_router.py index c62c93e1e7..959f7d30ae 100644 --- a/agenta-backend/agenta_backend/tests/variants_evaluators_router/test_evaluators_router.py +++ b/agenta-backend/agenta_backend/tests/variants_evaluators_router/test_evaluators_router.py @@ -1,3 +1,4 @@ +import os import httpx import pytest import asyncio @@ -22,7 +23,11 @@ # Set global variables APP_NAME = "evaluation_in_backend" -BACKEND_API_HOST = "http://localhost:8000" +ENVIRONMENT = os.environ.get("ENVIRONMENT") +if ENVIRONMENT == "development": + BACKEND_API_HOST = "http://localhost:8000" +elif ENVIRONMENT == "test": # github actions environment + BACKEND_API_HOST = "http://localhost/api" @pytest.mark.asyncio diff --git a/agenta-backend/agenta_backend/tests/variants_router/test_app_variant_router.py b/agenta-backend/agenta_backend/tests/variants_router/test_app_variant_router.py index e90be8c90c..edc1a39a29 100644 --- a/agenta-backend/agenta_backend/tests/variants_router/test_app_variant_router.py +++ b/agenta-backend/agenta_backend/tests/variants_router/test_app_variant_router.py @@ -1,3 +1,4 @@ +import os import httpx import pytest import logging @@ -29,7 +30,11 @@ logger.setLevel(logging.DEBUG) # Set global variables -BACKEND_API_HOST = "http://localhost:8000" +ENVIRONMENT = os.environ.get("ENVIRONMENT") +if ENVIRONMENT == "development": + BACKEND_API_HOST = "http://localhost:8000" +elif ENVIRONMENT == "test": # github actions environment + BACKEND_API_HOST = "http://localhost/api" @pytest.mark.asyncio diff --git a/docker-compose.test.yml b/docker-compose.test.yml index 8bb638f6b0..36fb9d4f5c 100644 --- a/docker-compose.test.yml +++ b/docker-compose.test.yml @@ -18,7 +18,7 @@ services: environment: - MONGODB_URI=mongodb://username:password@mongo:27017/ - REDIS_URL=redis://redis:6379/0 - - ENVIRONMENT=development + - ENVIRONMENT=${ENVIRONMENT} - BARE_DOMAIN_NAME=localhost - DOMAIN_NAME=http://localhost - CELERY_BROKER_URL=amqp://guest@rabbitmq// From 4f5501ec165f1bb09d776160f1542176d26fefb7 Mon Sep 17 00:00:00 2001 From: Abram Date: Mon, 1 Jan 2024 01:12:43 +0100 Subject: [PATCH 189/414] Cleanup - set development and test backend environment to run tests --- .github/workflows/run-backend-tests.yml | 2 +- .../services/llm_apps_service.py | 4 ++- .../agenta_backend/tasks/evaluations.py | 14 +++++--- .../test_observability_router.py | 6 ++-- .../test_organization_router.py | 6 ++-- .../testset_router/test_testset_router.py | 7 ++-- .../user_profile_router/test_user_profile.py | 6 ++-- .../variants_evaluators_router/conftest.py | 14 +++----- .../test_evaluators_router.py | 33 +++---------------- .../test_app_variant_router.py | 6 ++-- docker-compose.test.yml | 18 +++++----- 11 files changed, 47 insertions(+), 69 deletions(-) diff --git a/.github/workflows/run-backend-tests.yml b/.github/workflows/run-backend-tests.yml index dab591f9fa..f23594c564 100644 --- a/.github/workflows/run-backend-tests.yml +++ b/.github/workflows/run-backend-tests.yml @@ -16,7 +16,7 @@ jobs: run: sudo apt install curl -y - name: Start Docker Compose - run: OPENAI_API_KEY=${{ secrets.NEXT_PUBLIC_OPENAI_API_KEY }} ENVIRONMENT=test docker-compose -f "docker-compose.test.yml" up -d --build + run: OPENAI_API_KEY=${{ secrets.NEXT_PUBLIC_OPENAI_API_KEY }} ENVIRONMENT=github docker-compose -f "docker-compose.test.yml" up -d --build - name: Restart Backend Service To Fetch Template Images run: docker 
container restart agenta-backend-test diff --git a/agenta-backend/agenta_backend/services/llm_apps_service.py b/agenta-backend/agenta_backend/services/llm_apps_service.py index 3f253340ff..d556f526f7 100644 --- a/agenta-backend/agenta_backend/services/llm_apps_service.py +++ b/agenta-backend/agenta_backend/services/llm_apps_service.py @@ -1,3 +1,5 @@ +from typing import Any + import httpx import backoff @@ -7,7 +9,7 @@ (httpx.TimeoutException, httpx.ConnectTimeout, httpx.ConnectError), max_tries=2, ) -def get_llm_app_output(uri, input): +def get_llm_app_output(uri: str, input: Any) -> Any: url = f"{uri}/generate" # TODO: adjust these hardcoded values in this payload diff --git a/agenta-backend/agenta_backend/tasks/evaluations.py b/agenta-backend/agenta_backend/tasks/evaluations.py index c6e34a8f4b..18eb5c40a9 100644 --- a/agenta-backend/agenta_backend/tasks/evaluations.py +++ b/agenta-backend/agenta_backend/tasks/evaluations.py @@ -1,6 +1,6 @@ +import os import asyncio from typing import List -from bson import ObjectId from celery import shared_task from collections import defaultdict @@ -47,8 +47,14 @@ def evaluate( get_deployment_by_objectid(app_variant_db.base.deployment) ) - # TODO: remove if abraham's fix is working - uri = deployment.uri.replace("http://localhost", "http://host.docker.internal") + #!NOTE: do not remove! this will be used in github workflow! + backend_environment = os.environ.get("ENVIRONMENT") + if backend_environment is not None and backend_environment == "github": + uri = f"http://{deployment.container_name}" + else: + uri = deployment.uri.replace( + "http://localhost", "http://host.docker.internal" + ) for data_point in testset.csvdata: # 1. We prepare the inputs @@ -91,7 +97,7 @@ def evaluate( variant_output, data_point["correct_answer"], evaluator_config.settings_values, - **additional_kwargs + **additional_kwargs, ) result_object = EvaluationScenarioResult( diff --git a/agenta-backend/agenta_backend/tests/observability_router/test_observability_router.py b/agenta-backend/agenta_backend/tests/observability_router/test_observability_router.py index 225c99c4b9..d37e8e683f 100644 --- a/agenta-backend/agenta_backend/tests/observability_router/test_observability_router.py +++ b/agenta-backend/agenta_backend/tests/observability_router/test_observability_router.py @@ -30,9 +30,9 @@ # Set global variables ENVIRONMENT = os.environ.get("ENVIRONMENT") if ENVIRONMENT == "development": - BACKEND_API_HOST = "http://localhost:8000" -elif ENVIRONMENT == "test": # github actions environment - BACKEND_API_HOST = "http://localhost/api" + BACKEND_API_HOST = "http://host.docker.internal/api" +elif ENVIRONMENT == "github": + BACKEND_API_HOST = "http://agenta-backend-test:8000" @pytest.mark.asyncio diff --git a/agenta-backend/agenta_backend/tests/organization_router/test_organization_router.py b/agenta-backend/agenta_backend/tests/organization_router/test_organization_router.py index fde28b7d8a..c19d32711e 100644 --- a/agenta-backend/agenta_backend/tests/organization_router/test_organization_router.py +++ b/agenta-backend/agenta_backend/tests/organization_router/test_organization_router.py @@ -19,9 +19,9 @@ # Set global variables ENVIRONMENT = os.environ.get("ENVIRONMENT") if ENVIRONMENT == "development": - BACKEND_API_HOST = "http://localhost:8000" -elif ENVIRONMENT == "test": # github actions environment - BACKEND_API_HOST = "http://localhost/api" + BACKEND_API_HOST = "http://host.docker.internal/api" +elif ENVIRONMENT == "github": + BACKEND_API_HOST = 
"http://agenta-backend-test:8000" @pytest.mark.asyncio diff --git a/agenta-backend/agenta_backend/tests/testset_router/test_testset_router.py b/agenta-backend/agenta_backend/tests/testset_router/test_testset_router.py index a238e7b23b..0d6fe4cf1e 100644 --- a/agenta-backend/agenta_backend/tests/testset_router/test_testset_router.py +++ b/agenta-backend/agenta_backend/tests/testset_router/test_testset_router.py @@ -21,10 +21,9 @@ # Set global variables ENVIRONMENT = os.environ.get("ENVIRONMENT") if ENVIRONMENT == "development": - BACKEND_API_HOST = "http://localhost:8000" -elif ENVIRONMENT == "test": # github actions environment - BACKEND_API_HOST = "http://localhost/api" -TESTSET_SUBMODULE_DIR = Path(__file__).parent + BACKEND_API_HOST = "http://host.docker.internal/api" +elif ENVIRONMENT == "github": + BACKEND_API_HOST = "http://agenta-backend-test:8000" # TODO: test_csv_upload_file diff --git a/agenta-backend/agenta_backend/tests/user_profile_router/test_user_profile.py b/agenta-backend/agenta_backend/tests/user_profile_router/test_user_profile.py index b902f9bc57..182ce64e93 100644 --- a/agenta-backend/agenta_backend/tests/user_profile_router/test_user_profile.py +++ b/agenta-backend/agenta_backend/tests/user_profile_router/test_user_profile.py @@ -17,9 +17,9 @@ # Set global variables ENVIRONMENT = os.environ.get("ENVIRONMENT") if ENVIRONMENT == "development": - BACKEND_API_HOST = "http://localhost:8000" -elif ENVIRONMENT == "test": # github actions environment - BACKEND_API_HOST = "http://localhost/api" + BACKEND_API_HOST = "http://host.docker.internal/api" +elif ENVIRONMENT == "github": + BACKEND_API_HOST = "http://agenta-backend-test:8000" @pytest.mark.asyncio diff --git a/agenta-backend/agenta_backend/tests/variants_evaluators_router/conftest.py b/agenta-backend/agenta_backend/tests/variants_evaluators_router/conftest.py index 93c44e832f..53abc772a7 100644 --- a/agenta-backend/agenta_backend/tests/variants_evaluators_router/conftest.py +++ b/agenta-backend/agenta_backend/tests/variants_evaluators_router/conftest.py @@ -15,11 +15,10 @@ # Set global variables ENVIRONMENT = os.environ.get("ENVIRONMENT") OPEN_AI_KEY = os.environ.get("OPENAI_API_KEY") - if ENVIRONMENT == "development": - BACKEND_API_HOST = "http://localhost:8000" -elif ENVIRONMENT == "test": # github actions environment - BACKEND_API_HOST = "http://localhost/api" + BACKEND_API_HOST = "http://host.docker.internal/api" +elif ENVIRONMENT == "github": + BACKEND_API_HOST = "http://agenta-backend-test:8000" @pytest.fixture(scope="session") @@ -96,17 +95,12 @@ def auto_regex_test_evaluator_config(): @pytest.fixture() def auto_webhook_test_evaluator_config(): - url = ( - "http://host.docker.internal/api" - if ENVIRONMENT == "development" - else "http://agenta-backend-test:8000" - ) return { "app_id": "string", "name": "WebhookEvaluator", "evaluator_key": "auto_webhook_test", "settings_values": { - "webhook_url": f"{url}/evaluations/webhook_example_fake/", + "webhook_url": f"{BACKEND_API_HOST}/evaluations/webhook_example_fake/", "webhook_body": {}, }, } diff --git a/agenta-backend/agenta_backend/tests/variants_evaluators_router/test_evaluators_router.py b/agenta-backend/agenta_backend/tests/variants_evaluators_router/test_evaluators_router.py index 959f7d30ae..92434b1b6c 100644 --- a/agenta-backend/agenta_backend/tests/variants_evaluators_router/test_evaluators_router.py +++ b/agenta-backend/agenta_backend/tests/variants_evaluators_router/test_evaluators_router.py @@ -25,9 +25,9 @@ APP_NAME = "evaluation_in_backend" ENVIRONMENT = 
os.environ.get("ENVIRONMENT") if ENVIRONMENT == "development": - BACKEND_API_HOST = "http://localhost:8000" -elif ENVIRONMENT == "test": # github actions environment - BACKEND_API_HOST = "http://localhost/api" + BACKEND_API_HOST = "http://host.docker.internal/api" +elif ENVIRONMENT == "github": + BACKEND_API_HOST = "http://agenta-backend-test:8000" @pytest.mark.asyncio @@ -46,29 +46,6 @@ async def test_create_app_from_template( assert response.status_code == 200 -@pytest.mark.asyncio -async def test_app_from_template_container_is_running(): - app = await engine.find_one(AppDB, AppDB.app_name == APP_NAME) - deployment = await engine.find_one(DeploymentDB, DeploymentDB.app == app.id) - - # Prepare and start short-polling request - max_attempts = 10 - intervals = 2 # seconds - for _ in range(max_attempts): - uri = deployment.uri.replace("http://localhost", "http://host.docker.internal") - response = httpx.get(url=uri + "/openapi.json", timeout=timeout) - if response.status_code == 200: - response_data = response.json() - assert "openapi" in response_data - assert isinstance(response_data, dict) - return - await asyncio.sleep(intervals) - - assert ( - False - ), f"Could not reach {app.app_name} running container within the specified polling time" - - @pytest.mark.asyncio async def test_get_evaluators_endpoint(): response = await test_client.get( @@ -277,10 +254,10 @@ async def test_remove_running_template_app_container(): # Connect to the Docker daemon client = docker.from_env() app = await engine.find_one(AppDB, AppDB.app_name == APP_NAME) - container_name = f"{app.app_name}-app-{str(app.organization.id)}" + deployment = await engine.find_one(DeploymentDB, DeploymentDB.app == app.id) try: # Retrieve container - container = client.containers.get(container_name) + container = client.containers.get(deployment.container_name) # Stop and remove container container.stop() container.remove() diff --git a/agenta-backend/agenta_backend/tests/variants_router/test_app_variant_router.py b/agenta-backend/agenta_backend/tests/variants_router/test_app_variant_router.py index edc1a39a29..65df2ea317 100644 --- a/agenta-backend/agenta_backend/tests/variants_router/test_app_variant_router.py +++ b/agenta-backend/agenta_backend/tests/variants_router/test_app_variant_router.py @@ -32,9 +32,9 @@ # Set global variables ENVIRONMENT = os.environ.get("ENVIRONMENT") if ENVIRONMENT == "development": - BACKEND_API_HOST = "http://localhost:8000" -elif ENVIRONMENT == "test": # github actions environment - BACKEND_API_HOST = "http://localhost/api" + BACKEND_API_HOST = "http://host.docker.internal/api" +elif ENVIRONMENT == "github": + BACKEND_API_HOST = "http://agenta-backend-test:8000" @pytest.mark.asyncio diff --git a/docker-compose.test.yml b/docker-compose.test.yml index 36fb9d4f5c..fe2525335a 100644 --- a/docker-compose.test.yml +++ b/docker-compose.test.yml @@ -2,7 +2,6 @@ version: '3.8' services: reverse-proxy: image: traefik:v2.10 - container_name: agenta-reverse_proxy-test command: --api.dashboard=true --api.insecure=true --providers.docker --entrypoints.web.address=:80 ports: - "80:80" @@ -32,14 +31,6 @@ services: - ./agenta-backend/agenta_backend:/app/agenta_backend - ./agenta-backend/tests:/app/tests - /var/run/docker.sock:/var/run/docker.sock - labels: - - "traefik.http.routers.backend.rule=PathPrefix(`/api/`)" - - "traefik.http.routers.backend.entrypoints=web" - - "traefik.http.middlewares.backend-strip.stripprefix.prefixes=/api" - - "traefik.http.middlewares.backend-strip.stripprefix.forceslash=true" - - 
"traefik.http.routers.backend.middlewares=backend-strip" - - "traefik.http.services.backend.loadbalancer.server.port=8000" - - "traefik.http.routers.backend.service=backend" command: [ "uvicorn", @@ -54,6 +45,14 @@ services: "--root-path", "/api", ] + labels: + - "traefik.http.routers.backend.rule=PathPrefix(`/api/`)" + - "traefik.http.routers.backend.entrypoints=web" + - "traefik.http.middlewares.backend-strip.stripprefix.prefixes=/api" + - "traefik.http.middlewares.backend-strip.stripprefix.forceslash=true" + - "traefik.http.routers.backend.middlewares=backend-strip" + - "traefik.http.services.backend.loadbalancer.server.port=8000" + - "traefik.http.routers.backend.service=backend" depends_on: mongo: condition: service_healthy @@ -108,6 +107,7 @@ services: environment: - MONGODB_URI=mongodb://username:password@mongo:27017 - REDIS_URL=redis://redis:6379/0 + - ENVIRONMENT=${ENVIRONMENT} - CELERY_BROKER_URL=amqp://guest@rabbitmq// - CELERY_RESULT_BACKEND=redis://redis:6379/0 - FEATURE_FLAG=oss From b6d71a3919b9ba37ca9ff0347fa6c28b1261c9bc Mon Sep 17 00:00:00 2001 From: Abram Date: Mon, 1 Jan 2024 01:16:27 +0100 Subject: [PATCH 190/414] Update - revert back to localhost for github environment --- .github/workflows/run-backend-tests.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/run-backend-tests.yml b/.github/workflows/run-backend-tests.yml index f23594c564..40723fcbec 100644 --- a/.github/workflows/run-backend-tests.yml +++ b/.github/workflows/run-backend-tests.yml @@ -24,7 +24,7 @@ jobs: - name: Check Templates Exists run: | while true; do - if curl -s http://localhost/api/containers/templates/; then + if curl -s http://localhost:8000/containers/templates/; then break fi sleep 5 @@ -33,7 +33,7 @@ jobs: - name: Wait for Backend Service run: | while true; do - if curl -s http://localhost/api/openapi.json; then + if curl -s http://localhost:8000/openapi.json; then break fi sleep 5 From a11e8b93fb37692a8ecd29f8481d2264bf583967 Mon Sep 17 00:00:00 2001 From: Abram Date: Mon, 1 Jan 2024 01:20:11 +0100 Subject: [PATCH 191/414] Update - set reachable hostname in action workflow --- .github/workflows/run-backend-tests.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/run-backend-tests.yml b/.github/workflows/run-backend-tests.yml index 40723fcbec..1c9d9072dc 100644 --- a/.github/workflows/run-backend-tests.yml +++ b/.github/workflows/run-backend-tests.yml @@ -24,7 +24,7 @@ jobs: - name: Check Templates Exists run: | while true; do - if curl -s http://localhost:8000/containers/templates/; then + if curl -s http://agenta-backend-test:8000/containers/templates/; then break fi sleep 5 @@ -33,7 +33,7 @@ jobs: - name: Wait for Backend Service run: | while true; do - if curl -s http://localhost:8000/openapi.json; then + if curl -s http://agenta-backend-test:8000/openapi.json; then break fi sleep 5 From 5e176db45acea5ab48b2d9348b183326d8920bf5 Mon Sep 17 00:00:00 2001 From: Abram Date: Mon, 1 Jan 2024 08:52:42 +0100 Subject: [PATCH 192/414] Retry 6: debug step 5 to know why backend isn't accessible --- .github/workflows/run-backend-tests.yml | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/.github/workflows/run-backend-tests.yml b/.github/workflows/run-backend-tests.yml index 1c9d9072dc..da604b5e39 100644 --- a/.github/workflows/run-backend-tests.yml +++ b/.github/workflows/run-backend-tests.yml @@ -24,23 +24,29 @@ jobs: - name: Check Templates Exists run: | while true; do - if curl 
-s http://agenta-backend-test:8000/containers/templates/; then + if curl -s http://localhost/api/containers/templates/; then break fi sleep 5 done - - - name: Wait for Backend Service - run: | while true; do - if curl -s http://agenta-backend-test:8000/openapi.json; then + if curl -s http://localhost:8000/containers/templates/; then break fi sleep 5 done - - name: Run tests - run: docker exec agenta-backend-test pytest + # - name: Wait for Backend Service + # run: | + # while true; do + # if curl -s http://localhost/api/openapi.json; then + # break + # fi + # sleep 5 + # done + + # - name: Run tests + # run: docker exec agenta-backend-test pytest - name: Docker logs if: always() # From 4b268eb02aaa27ffce8008216dd7788f7c6c4c2b Mon Sep 17 00:00:00 2001 From: Abram Date: Mon, 1 Jan 2024 08:58:29 +0100 Subject: [PATCH 193/414] Retry 6(b): debug step 5 to know why backend isn't accessible --- .github/workflows/run-backend-tests.yml | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/.github/workflows/run-backend-tests.yml b/.github/workflows/run-backend-tests.yml index da604b5e39..750842118d 100644 --- a/.github/workflows/run-backend-tests.yml +++ b/.github/workflows/run-backend-tests.yml @@ -23,18 +23,10 @@ jobs: - name: Check Templates Exists run: | - while true; do - if curl -s http://localhost/api/containers/templates/; then - break - fi - sleep 5 - done - while true; do - if curl -s http://localhost:8000/containers/templates/; then - break - fi - sleep 5 - done + curl -s http://localhost/api/containers/templates/ + curl -s http://localhost:8000/containers/templates/ + curl -s http://localhost:80/api/containers/templates/ + curl -s http://localhost:8000/api/containers/templates/ # - name: Wait for Backend Service # run: | From 27cbe4151847355a29db0850ca0f759ace98364f Mon Sep 17 00:00:00 2001 From: Abram Date: Mon, 1 Jan 2024 09:01:15 +0100 Subject: [PATCH 194/414] Retry 6(c): debug step 5 to know why backend isn't accessible --- .github/workflows/run-backend-tests.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/run-backend-tests.yml b/.github/workflows/run-backend-tests.yml index 750842118d..639fa56e0d 100644 --- a/.github/workflows/run-backend-tests.yml +++ b/.github/workflows/run-backend-tests.yml @@ -23,7 +23,6 @@ jobs: - name: Check Templates Exists run: | - curl -s http://localhost/api/containers/templates/ curl -s http://localhost:8000/containers/templates/ curl -s http://localhost:80/api/containers/templates/ curl -s http://localhost:8000/api/containers/templates/ From c15a9d49ee865b0b064ca8c8ca501d484c92067e Mon Sep 17 00:00:00 2001 From: Abram Date: Mon, 1 Jan 2024 09:13:58 +0100 Subject: [PATCH 195/414] Retry 6(d): debug step 5 to know why backend isn't accessible --- .github/workflows/run-backend-tests.yml | 21 ++++++--------------- 1 file changed, 6 insertions(+), 15 deletions(-) diff --git a/.github/workflows/run-backend-tests.yml b/.github/workflows/run-backend-tests.yml index 639fa56e0d..acb6d8ff7f 100644 --- a/.github/workflows/run-backend-tests.yml +++ b/.github/workflows/run-backend-tests.yml @@ -18,23 +18,14 @@ jobs: - name: Start Docker Compose run: OPENAI_API_KEY=${{ secrets.NEXT_PUBLIC_OPENAI_API_KEY }} ENVIRONMENT=github docker-compose -f "docker-compose.test.yml" up -d --build - - name: Restart Backend Service To Fetch Template Images - run: docker container restart agenta-backend-test + - name: Wait for services to be ready + run: sleep 10 - name: Check Templates Exists - run: | - curl -s 
http://localhost:8000/containers/templates/ - curl -s http://localhost:80/api/containers/templates/ - curl -s http://localhost:8000/api/containers/templates/ - - # - name: Wait for Backend Service - # run: | - # while true; do - # if curl -s http://localhost/api/openapi.json; then - # break - # fi - # sleep 5 - # done + run: curl -i http://localhost/api/containers/templates/ + + - name: Wait for Backend Service + run: curl -i http://localhost/api/openapi.json # - name: Run tests # run: docker exec agenta-backend-test pytest From 8d80795aa7aca829fefc56f018daaebc026586fa Mon Sep 17 00:00:00 2001 From: Abram Date: Mon, 1 Jan 2024 09:16:53 +0100 Subject: [PATCH 196/414] Retry 6(e): debug step 5 to know why backend isn't accessible --- .github/workflows/run-backend-tests.yml | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/.github/workflows/run-backend-tests.yml b/.github/workflows/run-backend-tests.yml index acb6d8ff7f..025719a286 100644 --- a/.github/workflows/run-backend-tests.yml +++ b/.github/workflows/run-backend-tests.yml @@ -22,10 +22,22 @@ jobs: run: sleep 10 - name: Check Templates Exists - run: curl -i http://localhost/api/containers/templates/ + run: | + while true; do + if curl -i http://localhost/api/containers/templates/; then + break + fi + sleep 5 + done - name: Wait for Backend Service - run: curl -i http://localhost/api/openapi.json + run: | + while true; do + if curl -i http://localhost/api/openapi.json; then + break + fi + sleep 5 + done # - name: Run tests # run: docker exec agenta-backend-test pytest From abf6bc0c58c645827b2c88f3199b0fb8c9f4099c Mon Sep 17 00:00:00 2001 From: Abram Date: Mon, 1 Jan 2024 09:21:09 +0100 Subject: [PATCH 197/414] Retry 6(f): debug step 5 to know why backend isn't accessible --- .github/workflows/run-backend-tests.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/run-backend-tests.yml b/.github/workflows/run-backend-tests.yml index 025719a286..90ef64bffd 100644 --- a/.github/workflows/run-backend-tests.yml +++ b/.github/workflows/run-backend-tests.yml @@ -18,8 +18,8 @@ jobs: - name: Start Docker Compose run: OPENAI_API_KEY=${{ secrets.NEXT_PUBLIC_OPENAI_API_KEY }} ENVIRONMENT=github docker-compose -f "docker-compose.test.yml" up -d --build - - name: Wait for services to be ready - run: sleep 10 + - name: Restart Backend Service To Fetch Template Images + run: docker container restart agenta-backend-test - name: Check Templates Exists run: | From bc08f2132b2359871079de39cd7cc02c57f05f84 Mon Sep 17 00:00:00 2001 From: Abram Date: Mon, 1 Jan 2024 09:24:05 +0100 Subject: [PATCH 198/414] Retry 6(g): debug step 5 to know why backend isn't accessible --- .github/workflows/run-backend-tests.yml | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/.github/workflows/run-backend-tests.yml b/.github/workflows/run-backend-tests.yml index 90ef64bffd..3d94ac85f1 100644 --- a/.github/workflows/run-backend-tests.yml +++ b/.github/workflows/run-backend-tests.yml @@ -23,21 +23,11 @@ jobs: - name: Check Templates Exists run: | - while true; do - if curl -i http://localhost/api/containers/templates/; then - break - fi - sleep 5 - done + sleep 10 && curl -i http://localhost/api/containers/templates/ - name: Wait for Backend Service run: | - while true; do - if curl -i http://localhost/api/openapi.json; then - break - fi - sleep 5 - done + sleep 10 && curl -i http://localhost/api/openapi.json # - name: Run tests # run: docker exec agenta-backend-test pytest From 
0a45de0bc0a3d44ba20fcc9ffc0c08a8a4969a95 Mon Sep 17 00:00:00 2001 From: Abram Date: Mon, 1 Jan 2024 09:27:17 +0100 Subject: [PATCH 199/414] Update - make use of localhost/api in github workflow --- .github/workflows/run-backend-tests.yml | 4 ++-- .../tests/observability_router/test_observability_router.py | 2 +- .../tests/organization_router/test_organization_router.py | 2 +- .../tests/testset_router/test_testset_router.py | 2 +- .../tests/user_profile_router/test_user_profile.py | 2 +- .../tests/variants_evaluators_router/conftest.py | 2 +- .../variants_evaluators_router/test_evaluators_router.py | 2 +- .../tests/variants_router/test_app_variant_router.py | 2 +- 8 files changed, 9 insertions(+), 9 deletions(-) diff --git a/.github/workflows/run-backend-tests.yml b/.github/workflows/run-backend-tests.yml index 3d94ac85f1..8e5145e681 100644 --- a/.github/workflows/run-backend-tests.yml +++ b/.github/workflows/run-backend-tests.yml @@ -29,8 +29,8 @@ jobs: run: | sleep 10 && curl -i http://localhost/api/openapi.json - # - name: Run tests - # run: docker exec agenta-backend-test pytest + - name: Run tests + run: sleep 10 && docker exec agenta-backend-test pytest - name: Docker logs if: always() # diff --git a/agenta-backend/agenta_backend/tests/observability_router/test_observability_router.py b/agenta-backend/agenta_backend/tests/observability_router/test_observability_router.py index d37e8e683f..a8ca204190 100644 --- a/agenta-backend/agenta_backend/tests/observability_router/test_observability_router.py +++ b/agenta-backend/agenta_backend/tests/observability_router/test_observability_router.py @@ -32,7 +32,7 @@ if ENVIRONMENT == "development": BACKEND_API_HOST = "http://host.docker.internal/api" elif ENVIRONMENT == "github": - BACKEND_API_HOST = "http://agenta-backend-test:8000" + BACKEND_API_HOST = "http://localhost/api" @pytest.mark.asyncio diff --git a/agenta-backend/agenta_backend/tests/organization_router/test_organization_router.py b/agenta-backend/agenta_backend/tests/organization_router/test_organization_router.py index c19d32711e..0b07f995bf 100644 --- a/agenta-backend/agenta_backend/tests/organization_router/test_organization_router.py +++ b/agenta-backend/agenta_backend/tests/organization_router/test_organization_router.py @@ -21,7 +21,7 @@ if ENVIRONMENT == "development": BACKEND_API_HOST = "http://host.docker.internal/api" elif ENVIRONMENT == "github": - BACKEND_API_HOST = "http://agenta-backend-test:8000" + BACKEND_API_HOST = "http://localhost/api" @pytest.mark.asyncio diff --git a/agenta-backend/agenta_backend/tests/testset_router/test_testset_router.py b/agenta-backend/agenta_backend/tests/testset_router/test_testset_router.py index 0d6fe4cf1e..531de8e9b4 100644 --- a/agenta-backend/agenta_backend/tests/testset_router/test_testset_router.py +++ b/agenta-backend/agenta_backend/tests/testset_router/test_testset_router.py @@ -23,7 +23,7 @@ if ENVIRONMENT == "development": BACKEND_API_HOST = "http://host.docker.internal/api" elif ENVIRONMENT == "github": - BACKEND_API_HOST = "http://agenta-backend-test:8000" + BACKEND_API_HOST = "http://localhost/api" # TODO: test_csv_upload_file diff --git a/agenta-backend/agenta_backend/tests/user_profile_router/test_user_profile.py b/agenta-backend/agenta_backend/tests/user_profile_router/test_user_profile.py index 182ce64e93..c547431f73 100644 --- a/agenta-backend/agenta_backend/tests/user_profile_router/test_user_profile.py +++ b/agenta-backend/agenta_backend/tests/user_profile_router/test_user_profile.py @@ -19,7 +19,7 @@ if ENVIRONMENT == 
"development": BACKEND_API_HOST = "http://host.docker.internal/api" elif ENVIRONMENT == "github": - BACKEND_API_HOST = "http://agenta-backend-test:8000" + BACKEND_API_HOST = "http://localhost/api" @pytest.mark.asyncio diff --git a/agenta-backend/agenta_backend/tests/variants_evaluators_router/conftest.py b/agenta-backend/agenta_backend/tests/variants_evaluators_router/conftest.py index 53abc772a7..65a318ec80 100644 --- a/agenta-backend/agenta_backend/tests/variants_evaluators_router/conftest.py +++ b/agenta-backend/agenta_backend/tests/variants_evaluators_router/conftest.py @@ -18,7 +18,7 @@ if ENVIRONMENT == "development": BACKEND_API_HOST = "http://host.docker.internal/api" elif ENVIRONMENT == "github": - BACKEND_API_HOST = "http://agenta-backend-test:8000" + BACKEND_API_HOST = "http://localhost/api" @pytest.fixture(scope="session") diff --git a/agenta-backend/agenta_backend/tests/variants_evaluators_router/test_evaluators_router.py b/agenta-backend/agenta_backend/tests/variants_evaluators_router/test_evaluators_router.py index 92434b1b6c..9a2247885a 100644 --- a/agenta-backend/agenta_backend/tests/variants_evaluators_router/test_evaluators_router.py +++ b/agenta-backend/agenta_backend/tests/variants_evaluators_router/test_evaluators_router.py @@ -27,7 +27,7 @@ if ENVIRONMENT == "development": BACKEND_API_HOST = "http://host.docker.internal/api" elif ENVIRONMENT == "github": - BACKEND_API_HOST = "http://agenta-backend-test:8000" + BACKEND_API_HOST = "http://localhost/api" @pytest.mark.asyncio diff --git a/agenta-backend/agenta_backend/tests/variants_router/test_app_variant_router.py b/agenta-backend/agenta_backend/tests/variants_router/test_app_variant_router.py index 65df2ea317..aa53dbc964 100644 --- a/agenta-backend/agenta_backend/tests/variants_router/test_app_variant_router.py +++ b/agenta-backend/agenta_backend/tests/variants_router/test_app_variant_router.py @@ -34,7 +34,7 @@ if ENVIRONMENT == "development": BACKEND_API_HOST = "http://host.docker.internal/api" elif ENVIRONMENT == "github": - BACKEND_API_HOST = "http://agenta-backend-test:8000" + BACKEND_API_HOST = "http://localhost/api" @pytest.mark.asyncio From ff0a2e1aa99bbaedb71ab61b2bf3864d413ceb82 Mon Sep 17 00:00:00 2001 From: Abram Date: Mon, 1 Jan 2024 09:35:47 +0100 Subject: [PATCH 200/414] Update - make use of agenta-backend-test:8000 in pytest instead of localhost/api --- .../tests/observability_router/test_observability_router.py | 2 +- .../tests/organization_router/test_organization_router.py | 2 +- .../agenta_backend/tests/testset_router/test_testset_router.py | 2 +- .../tests/user_profile_router/test_user_profile.py | 2 +- .../agenta_backend/tests/variants_evaluators_router/conftest.py | 2 +- .../tests/variants_evaluators_router/test_evaluators_router.py | 2 +- .../tests/variants_router/test_app_variant_router.py | 2 +- 7 files changed, 7 insertions(+), 7 deletions(-) diff --git a/agenta-backend/agenta_backend/tests/observability_router/test_observability_router.py b/agenta-backend/agenta_backend/tests/observability_router/test_observability_router.py index a8ca204190..d37e8e683f 100644 --- a/agenta-backend/agenta_backend/tests/observability_router/test_observability_router.py +++ b/agenta-backend/agenta_backend/tests/observability_router/test_observability_router.py @@ -32,7 +32,7 @@ if ENVIRONMENT == "development": BACKEND_API_HOST = "http://host.docker.internal/api" elif ENVIRONMENT == "github": - BACKEND_API_HOST = "http://localhost/api" + BACKEND_API_HOST = "http://agenta-backend-test:8000" @pytest.mark.asyncio 
diff --git a/agenta-backend/agenta_backend/tests/organization_router/test_organization_router.py b/agenta-backend/agenta_backend/tests/organization_router/test_organization_router.py index 0b07f995bf..c19d32711e 100644 --- a/agenta-backend/agenta_backend/tests/organization_router/test_organization_router.py +++ b/agenta-backend/agenta_backend/tests/organization_router/test_organization_router.py @@ -21,7 +21,7 @@ if ENVIRONMENT == "development": BACKEND_API_HOST = "http://host.docker.internal/api" elif ENVIRONMENT == "github": - BACKEND_API_HOST = "http://localhost/api" + BACKEND_API_HOST = "http://agenta-backend-test:8000" @pytest.mark.asyncio diff --git a/agenta-backend/agenta_backend/tests/testset_router/test_testset_router.py b/agenta-backend/agenta_backend/tests/testset_router/test_testset_router.py index 531de8e9b4..0d6fe4cf1e 100644 --- a/agenta-backend/agenta_backend/tests/testset_router/test_testset_router.py +++ b/agenta-backend/agenta_backend/tests/testset_router/test_testset_router.py @@ -23,7 +23,7 @@ if ENVIRONMENT == "development": BACKEND_API_HOST = "http://host.docker.internal/api" elif ENVIRONMENT == "github": - BACKEND_API_HOST = "http://localhost/api" + BACKEND_API_HOST = "http://agenta-backend-test:8000" # TODO: test_csv_upload_file diff --git a/agenta-backend/agenta_backend/tests/user_profile_router/test_user_profile.py b/agenta-backend/agenta_backend/tests/user_profile_router/test_user_profile.py index c547431f73..182ce64e93 100644 --- a/agenta-backend/agenta_backend/tests/user_profile_router/test_user_profile.py +++ b/agenta-backend/agenta_backend/tests/user_profile_router/test_user_profile.py @@ -19,7 +19,7 @@ if ENVIRONMENT == "development": BACKEND_API_HOST = "http://host.docker.internal/api" elif ENVIRONMENT == "github": - BACKEND_API_HOST = "http://localhost/api" + BACKEND_API_HOST = "http://agenta-backend-test:8000" @pytest.mark.asyncio diff --git a/agenta-backend/agenta_backend/tests/variants_evaluators_router/conftest.py b/agenta-backend/agenta_backend/tests/variants_evaluators_router/conftest.py index 65a318ec80..53abc772a7 100644 --- a/agenta-backend/agenta_backend/tests/variants_evaluators_router/conftest.py +++ b/agenta-backend/agenta_backend/tests/variants_evaluators_router/conftest.py @@ -18,7 +18,7 @@ if ENVIRONMENT == "development": BACKEND_API_HOST = "http://host.docker.internal/api" elif ENVIRONMENT == "github": - BACKEND_API_HOST = "http://localhost/api" + BACKEND_API_HOST = "http://agenta-backend-test:8000" @pytest.fixture(scope="session") diff --git a/agenta-backend/agenta_backend/tests/variants_evaluators_router/test_evaluators_router.py b/agenta-backend/agenta_backend/tests/variants_evaluators_router/test_evaluators_router.py index 9a2247885a..92434b1b6c 100644 --- a/agenta-backend/agenta_backend/tests/variants_evaluators_router/test_evaluators_router.py +++ b/agenta-backend/agenta_backend/tests/variants_evaluators_router/test_evaluators_router.py @@ -27,7 +27,7 @@ if ENVIRONMENT == "development": BACKEND_API_HOST = "http://host.docker.internal/api" elif ENVIRONMENT == "github": - BACKEND_API_HOST = "http://localhost/api" + BACKEND_API_HOST = "http://agenta-backend-test:8000" @pytest.mark.asyncio diff --git a/agenta-backend/agenta_backend/tests/variants_router/test_app_variant_router.py b/agenta-backend/agenta_backend/tests/variants_router/test_app_variant_router.py index aa53dbc964..65df2ea317 100644 --- a/agenta-backend/agenta_backend/tests/variants_router/test_app_variant_router.py +++ 
b/agenta-backend/agenta_backend/tests/variants_router/test_app_variant_router.py @@ -34,7 +34,7 @@ if ENVIRONMENT == "development": BACKEND_API_HOST = "http://host.docker.internal/api" elif ENVIRONMENT == "github": - BACKEND_API_HOST = "http://localhost/api" + BACKEND_API_HOST = "http://agenta-backend-test:8000" @pytest.mark.asyncio From d77c5f89a0a8bf460cbd1a89b9fff5448b07c969 Mon Sep 17 00:00:00 2001 From: Akrem Abayed Date: Mon, 1 Jan 2024 12:02:35 +0100 Subject: [PATCH 201/414] remove array of results --- .../models/api/annotation_models.py | 9 ++---- .../agenta_backend/models/converters.py | 14 ++++------ .../agenta_backend/models/db_models.py | 4 +-- .../routers/annotations_router.py | 20 ++++++++++++- .../services/annotation_manager.py | 28 ++++++++++++++++++- .../agenta_backend/services/db_manager.py | 3 +- .../agenta_backend/tasks/annotations.py | 3 +- 7 files changed, 57 insertions(+), 24 deletions(-) diff --git a/agenta-backend/agenta_backend/models/api/annotation_models.py b/agenta-backend/agenta_backend/models/api/annotation_models.py index 94ffcc23d4..0744fa6d97 100644 --- a/agenta-backend/agenta_backend/models/api/annotation_models.py +++ b/agenta-backend/agenta_backend/models/api/annotation_models.py @@ -42,7 +42,7 @@ class AnnotationScenarioOutput(BaseModel): value: Any -class AnnoatationScenarioResult(BaseModel): +class AnnotationScenarioResult(BaseModel): variant_id: str result: Result @@ -54,10 +54,5 @@ class AnnotationScenario(BaseModel): outputs: List[AnnotationScenarioOutput] is_pinned: Optional[bool] note: Optional[str] - result: AnnoatationScenarioResult + result: AnnotationScenarioResult - -class AnnotationScenarioInput(BaseModel): - name: str - type: str - value: Any diff --git a/agenta-backend/agenta_backend/models/converters.py b/agenta-backend/agenta_backend/models/converters.py index 6519099049..08a214198f 100644 --- a/agenta-backend/agenta_backend/models/converters.py +++ b/agenta-backend/agenta_backend/models/converters.py @@ -5,7 +5,7 @@ from agenta_backend.services import db_manager from agenta_backend.models.api.user_models import User from agenta_backend.models.db_models import ( - AnnoatationScenarioResult, + AnnotationScenarioResult, AnnotationsDB, AnnotationsScenariosDB, AppVariantDB, @@ -372,19 +372,15 @@ def annotation_scenario_db_to_pydantic( id=str(annotation_scenario_db.id), annotation_id=str(annotation_scenario_db.annotation_id), inputs=[ - AnnotationScenarioInput(**input_dict) - for input_dict in annotation_scenario_db.inputs + AnnotationScenarioInput(**input_dict.dict()) for input_dict in annotation_scenario_db.inputs ], outputs=[ - AnnotationScenarioOutput(**output_dict) - for output_dict in annotation_scenario_db.outputs + AnnotationScenarioOutput(**output_dict.dict()) for output_dict in annotation_scenario_db.outputs ], is_pinned=annotation_scenario_db.is_pinned, note=annotation_scenario_db.note, - results=[ - AnnoatationScenarioResult(**result_dict) - for result_dict in annotation_scenario_db.results - ], + result=AnnotationScenarioResult(**annotation_scenario_db.result.dict()), + created_at=annotation_scenario_db.created_at, updated_at=annotation_scenario_db.updated_at, ) diff --git a/agenta-backend/agenta_backend/models/db_models.py b/agenta-backend/agenta_backend/models/db_models.py index 13aa2c448f..d8441f7d17 100644 --- a/agenta-backend/agenta_backend/models/db_models.py +++ b/agenta-backend/agenta_backend/models/db_models.py @@ -348,7 +348,7 @@ class AnnoationResult(EmbeddedModel): result: Result -class 
AnnoatationScenarioResult(EmbeddedModel): +class AnnotationScenarioResult(EmbeddedModel): variant_id: str result: Result @@ -378,7 +378,7 @@ class AnnotationsScenariosDB(Model): outputs: List[AnnotationScenarioOutputDB] is_pinned: Optional[bool] note: Optional[str] - results: List[AnnoatationScenarioResult] + result: Optional[AnnotationScenarioResult] = None created_at: datetime = Field(default=datetime.utcnow()) updated_at: datetime = Field(default=datetime.utcnow()) diff --git a/agenta-backend/agenta_backend/routers/annotations_router.py b/agenta-backend/agenta_backend/routers/annotations_router.py index aca56946d2..24a423c42c 100644 --- a/agenta-backend/agenta_backend/routers/annotations_router.py +++ b/agenta-backend/agenta_backend/routers/annotations_router.py @@ -8,6 +8,7 @@ from agenta_backend.models.api.annotation_models import ( Annotation, + AnnotationScenario, NewAnnotation, AnnotationScenarioUpdate, ) @@ -113,7 +114,24 @@ async def fetch_annotation( return await annotation_manager.fetch_annotation(annotation_id, **user_org_data) -@router.put("/{annotation_id}/annotation_scenario/{annotation_scenario_id}/") +@router.get("/{annotation_id}/annotations_scenarios/", response_model=List[AnnotationScenario]) +async def fetch_annotations_scenarios( + annotation_id: str, + request: Request, +): + """Fetches a single annotation based on its ID. + + Args: + annotation_id (str): The ID of the annotation to fetch. + + Returns: + Annotation: The fetched annotation. + """ + user_org_data = await get_user_and_org_id(request.state.user_id) + return await annotation_manager.fetch_annotations_scenarios(annotation_id, **user_org_data) + + +@router.put("/{annotation_id}/annotations_scenarios/{annotation_scenario_id}/") async def update_annotation_scenario_router( annotation_id: str, annotation_scenario_id: str, diff --git a/agenta-backend/agenta_backend/services/annotation_manager.py b/agenta-backend/agenta_backend/services/annotation_manager.py index 23a2420776..3ea525101d 100644 --- a/agenta-backend/agenta_backend/services/annotation_manager.py +++ b/agenta-backend/agenta_backend/services/annotation_manager.py @@ -197,6 +197,32 @@ async def create_annotation_scenario( await engine.save(new_annotation_scenario) +async def fetch_annotations_scenarios(annotation_id: str, **user_org_data: dict) -> [AnnotationScenario]: + """ + Fetches a single annotation based on its ID. + + Args: + annotation_id (str): The ID of the annotation. + user_org_data (dict): User and organization data. + + Returns: + Annotation: The fetched annotation. + """ + annotation = await _fetch_annotation_and_check_access( + annotation_id=annotation_id, + **user_org_data, + ) + scenarios = await engine.find( + AnnotationsScenariosDB, + AnnotationsScenariosDB.annotation_id == ObjectId(annotation_id), + ) + annotations_scenarios = [ + converters.annotation_scenario_db_to_pydantic(scenario) + for scenario in scenarios + ] + return annotations_scenarios + + async def update_annotation_scenario( annotation_scenario_id: str, updates: Dict[str, Any], @@ -212,7 +238,7 @@ async def update_annotation_scenario( Returns: AnnotationScenario: The updated annotation scenario object. 
""" - print("update_annotation_scenario") + annotation_scenario = await db_manager.update_annotation_scenario( annotation_scenario_id, updates ) diff --git a/agenta-backend/agenta_backend/services/db_manager.py b/agenta-backend/agenta_backend/services/db_manager.py index 7cdf89ef3e..0f420afc76 100644 --- a/agenta-backend/agenta_backend/services/db_manager.py +++ b/agenta-backend/agenta_backend/services/db_manager.py @@ -1909,7 +1909,6 @@ async def create_new_annotation_scenario( inputs: List[dict], outputs: List[dict], isPinned: bool, - results: List, note: str, ) -> AnnotationsScenariosDB: """ @@ -1933,7 +1932,7 @@ async def create_new_annotation_scenario( outputs=outputs, is_pinned=isPinned, note=note, - results=results, + result=None, created_at=datetime.utcnow(), updated_at=datetime.utcnow(), ) diff --git a/agenta-backend/agenta_backend/tasks/annotations.py b/agenta-backend/agenta_backend/tasks/annotations.py index f2bb11194e..37a81da0e7 100644 --- a/agenta-backend/agenta_backend/tasks/annotations.py +++ b/agenta-backend/agenta_backend/tasks/annotations.py @@ -17,7 +17,7 @@ AnnotationScenarioInputDB, AnnotationScenarioOutputDB, AnnotationScenarioInputDB, - AnnoatationScenarioResult, + AnnotationScenarioResult, ) from agenta_backend.models.api.annotation_models import NewAnnotation @@ -79,6 +79,5 @@ def prepare_scenarios( ], isPinned=False, note="", - results=[], ) ) From a5282b7c88053853929ed6e04436b092aae1bf89 Mon Sep 17 00:00:00 2001 From: Akrem Abayed Date: Mon, 1 Jan 2024 22:31:22 +0100 Subject: [PATCH 202/414] added variant_id into evaluation scenario --- agenta-backend/agenta_backend/models/db_models.py | 1 + agenta-backend/agenta_backend/services/db_manager.py | 2 ++ agenta-backend/agenta_backend/tasks/evaluations.py | 1 + 3 files changed, 4 insertions(+) diff --git a/agenta-backend/agenta_backend/models/db_models.py b/agenta-backend/agenta_backend/models/db_models.py index d8441f7d17..8e8471a123 100644 --- a/agenta-backend/agenta_backend/models/db_models.py +++ b/agenta-backend/agenta_backend/models/db_models.py @@ -268,6 +268,7 @@ class EvaluationScenarioDB(Model): user: UserDB = Reference(key_name="user") organization: OrganizationDB = Reference(key_name="organization") evaluation: EvaluationDB = Reference(key_name="evaluations") + variant_id: ObjectId inputs: List[EvaluationScenarioInputDB] outputs: List[EvaluationScenarioOutputDB] correct_answer: Optional[str] diff --git a/agenta-backend/agenta_backend/services/db_manager.py b/agenta-backend/agenta_backend/services/db_manager.py index 0f420afc76..0e60598b20 100644 --- a/agenta-backend/agenta_backend/services/db_manager.py +++ b/agenta-backend/agenta_backend/services/db_manager.py @@ -1650,6 +1650,7 @@ async def create_new_evaluation_scenario( user: UserDB, organization: OrganizationDB, evaluation: EvaluationDB, + variant_id: str, inputs: List[EvaluationScenarioInputDB], outputs: List[EvaluationScenarioOutputDB], correct_answer: Optional[str], @@ -1666,6 +1667,7 @@ async def create_new_evaluation_scenario( user=user, organization=organization, evaluation=evaluation, + variant_id=ObjectId(variant_id), inputs=inputs, outputs=outputs, correct_answer=correct_answer, diff --git a/agenta-backend/agenta_backend/tasks/evaluations.py b/agenta-backend/agenta_backend/tasks/evaluations.py index c6e34a8f4b..664908d5b6 100644 --- a/agenta-backend/agenta_backend/tasks/evaluations.py +++ b/agenta-backend/agenta_backend/tasks/evaluations.py @@ -109,6 +109,7 @@ def evaluate( user=app.user, organization=app.organization, 
evaluation=new_evaluation_db, + variant_id=variant_id, evaluators_configs=new_evaluation_db.evaluators_configs, inputs=inputs, is_pinned=False, From 07393019aeb5943d5605580add56c515ca8c3dbb Mon Sep 17 00:00:00 2001 From: Akrem Abayed Date: Mon, 1 Jan 2024 22:47:54 +0100 Subject: [PATCH 203/414] 1 evaluation per 1 variant --- .../routers/evaluation_router.py | 31 ++-- .../agenta_backend/tasks/evaluations.py | 146 +++++++++--------- 2 files changed, 94 insertions(+), 83 deletions(-) diff --git a/agenta-backend/agenta_backend/routers/evaluation_router.py b/agenta-backend/agenta_backend/routers/evaluation_router.py index 3d56cb927f..ca83452243 100644 --- a/agenta-backend/agenta_backend/routers/evaluation_router.py +++ b/agenta-backend/agenta_backend/routers/evaluation_router.py @@ -84,17 +84,28 @@ async def create_evaluation( raise HTTPException(status_code=404, detail="App not found") app_data = jsonable_encoder(app) - new_evaluation_data = payload.dict() - evaluation = await evaluation_service.create_new_evaluation( - app_data=app_data, - new_evaluation_data=new_evaluation_data, - evaluators_configs=payload.evaluators_configs, - ) + evaluations = [] + + for variant_id in payload.variant_ids: + new_evaluation_data = { + "app_id": payload.app_id, + "variant_ids": [variant_id], # Only this variant ID + "evaluators_configs": payload.evaluators_configs, + "testset_id": payload.testset_id + } + + evaluation = await evaluation_service.create_new_evaluation( + app_data=app_data, + new_evaluation_data=new_evaluation_data, + evaluators_configs=payload.evaluators_configs, + ) - evaluate.delay( - app_data, new_evaluation_data, evaluation.id, evaluation.testset_id - ) - return evaluation + evaluate.delay( + app_data, new_evaluation_data, evaluation.id, evaluation.testset_id + ) + evaluations.append(evaluation) + + return evaluations except KeyError: raise HTTPException( status_code=400, diff --git a/agenta-backend/agenta_backend/tasks/evaluations.py b/agenta-backend/agenta_backend/tasks/evaluations.py index 664908d5b6..7c84ac3dae 100644 --- a/agenta-backend/agenta_backend/tasks/evaluations.py +++ b/agenta-backend/agenta_backend/tasks/evaluations.py @@ -39,88 +39,88 @@ def evaluate( new_evaluation_db = loop.run_until_complete(fetch_evaluation_by_id(evaluation_id)) evaluators_aggregated_data = defaultdict(list) - for variant_id in evaluation.variant_ids: - variant_id = str(variant_id) - app_variant_db = loop.run_until_complete(fetch_app_variant_by_id(variant_id)) - deployment = loop.run_until_complete( - get_deployment_by_objectid(app_variant_db.base.deployment) - ) - - # TODO: remove if abraham's fix is working - uri = deployment.uri.replace("http://localhost", "http://host.docker.internal") + variant_id = str(evaluation.variant_ids[0]) - for data_point in testset.csvdata: - # 1. We prepare the inputs - raw_inputs = ( - app_variant_db.parameters.get("inputs", []) - if app_variant_db.parameters - else [] - ) - inputs = [] - if raw_inputs: - inputs = [ - EvaluationScenarioInputDB( - name=input_item["name"], - type="text", - value=data_point[input_item["name"]], - ) - for input_item in raw_inputs - ] + app_variant_db = loop.run_until_complete(fetch_app_variant_by_id(variant_id)) + deployment = loop.run_until_complete( + get_deployment_by_objectid(app_variant_db.base.deployment) + ) - # 2. 
We get the output from the llm app - variant_output = llm_apps_service.get_llm_app_output(uri, data_point) + # TODO: remove if abraham's fix is working + uri = deployment.uri.replace("http://localhost", "http://host.docker.internal") - # 3. We evaluate - evaluators_results: [EvaluationScenarioResult] = [] - for evaluator_config_id in evaluation.evaluators_configs: - evaluator_config = loop.run_until_complete( - fetch_evaluator_config(evaluator_config_id) + for data_point in testset.csvdata: + # 1. We prepare the inputs + raw_inputs = ( + app_variant_db.parameters.get("inputs", []) + if app_variant_db.parameters + else [] + ) + inputs = [] + if raw_inputs: + inputs = [ + EvaluationScenarioInputDB( + name=input_item["name"], + type="text", + value=data_point[input_item["name"]], ) + for input_item in raw_inputs + ] - additional_kwargs = ( - { - "app_params": app_variant_db.config.parameters, - "inputs": data_point, # TODO: fetch input from config parameters when #1102 has been fixed - } - if evaluator_config.evaluator_key == "custom_code_run" - else {} - ) - result = evaluators_service.evaluate( - evaluator_config.evaluator_key, - variant_output, - data_point["correct_answer"], - evaluator_config.settings_values, - **additional_kwargs - ) + # 2. We get the output from the llm app + variant_output = llm_apps_service.get_llm_app_output(uri, data_point) - result_object = EvaluationScenarioResult( - evaluator_config=evaluator_config.id, - result=result, - ) - evaluators_results.append(result_object) - evaluators_aggregated_data[evaluator_config.evaluator_key].append( - result - ) + # 3. We evaluate + evaluators_results: [EvaluationScenarioResult] = [] + for evaluator_config_id in evaluation.evaluators_configs: + evaluator_config = loop.run_until_complete( + fetch_evaluator_config(evaluator_config_id) + ) - # 4. We create a new evaluation scenario - evaluation_scenario = loop.run_until_complete( - create_new_evaluation_scenario( - user=app.user, - organization=app.organization, - evaluation=new_evaluation_db, - variant_id=variant_id, - evaluators_configs=new_evaluation_db.evaluators_configs, - inputs=inputs, - is_pinned=False, - note="", - correct_answer=data_point["correct_answer"], - outputs=[ - EvaluationScenarioOutputDB(type="text", value=variant_output) - ], - results=evaluators_results, - ) + additional_kwargs = ( + { + "app_params": app_variant_db.config.parameters, + "inputs": data_point, # TODO: fetch input from config parameters when #1102 has been fixed + } + if evaluator_config.evaluator_key == "custom_code_run" + else {} + ) + result = evaluators_service.evaluate( + evaluator_config.evaluator_key, + variant_output, + data_point["correct_answer"], + evaluator_config.settings_values, + **additional_kwargs + ) + + result_object = EvaluationScenarioResult( + evaluator_config=evaluator_config.id, + result=result, ) + evaluators_results.append(result_object) + evaluators_aggregated_data[evaluator_config.evaluator_key].append( + result + ) + + # 4. 
We create a new evaluation scenario + evaluation_scenario = loop.run_until_complete( + create_new_evaluation_scenario( + user=app.user, + organization=app.organization, + evaluation=new_evaluation_db, + variant_id=variant_id, + evaluators_configs=new_evaluation_db.evaluators_configs, + inputs=inputs, + is_pinned=False, + note="", + correct_answer=data_point["correct_answer"], + outputs=[ + EvaluationScenarioOutputDB(type="text", value=variant_output) + ], + results=evaluators_results, + ) + ) aggregated_results = loop.run_until_complete( aggregate_evaluator_results(app, evaluators_aggregated_data) From 6cacff711841b96b6c4e863c990eb97d3ec14922 Mon Sep 17 00:00:00 2001 From: MohammedMaaz Date: Tue, 2 Jan 2024 11:34:35 +0500 Subject: [PATCH 204/414] filters | fixes | expand/copy --- agenta-web/dev.Dockerfile | 28 +- agenta-web/package-lock.json | 18 +- agenta-web/package.json | 4 +- agenta-web/src/components/Sidebar/Sidebar.tsx | 22 ++ .../evaluationCompare/EvaluationCompare.tsx | 256 +++++++++++------- .../evaluationResults/EvaluationResults.tsx | 170 +++++++++++- .../EvaluationScenarios.tsx | 139 ++++++---- agenta-web/src/lib/Types.ts | 13 +- .../annotations/[annotation_id]/index.tsx | 9 + .../pages/apps/[app_id]/annotations/index.tsx | 10 + agenta-web/src/services/evaluations/index.ts | 19 +- 11 files changed, 491 insertions(+), 197 deletions(-) create mode 100644 agenta-web/src/pages/apps/[app_id]/annotations/[annotation_id]/index.tsx create mode 100644 agenta-web/src/pages/apps/[app_id]/annotations/index.tsx diff --git a/agenta-web/dev.Dockerfile b/agenta-web/dev.Dockerfile index 6f86dbd847..92f6ea5bbc 100644 --- a/agenta-web/dev.Dockerfile +++ b/agenta-web/dev.Dockerfile @@ -3,14 +3,14 @@ FROM node:18-alpine WORKDIR /app # Install dependencies based on the preferred package manager -COPY package.json yarn.lock* package-lock.json* pnpm-lock.yaml* ./ -RUN \ - if [ -f yarn.lock ]; then yarn --frozen-lockfile; \ - elif [ -f package-lock.json ]; then npm i; \ - elif [ -f pnpm-lock.yaml ]; then yarn global add pnpm && pnpm i; \ - # Allow install without lockfile, so example works even without Node.js installed locally - else echo "Warning: Lockfile not found. It is recommended to commit lockfiles to version control." && yarn install; \ - fi +# COPY package.json yarn.lock* package-lock.json* pnpm-lock.yaml* ./ +# RUN \ +# if [ -f yarn.lock ]; then yarn --frozen-lockfile; \ +# elif [ -f package-lock.json ]; then npm i; \ +# elif [ -f pnpm-lock.yaml ]; then yarn global add pnpm && pnpm i; \ +# # Allow install without lockfile, so example works even without Node.js installed locally +# else echo "Warning: Lockfile not found. It is recommended to commit lockfiles to version control." && yarn install; \ +# fi COPY src ./src COPY public ./public @@ -28,10 +28,10 @@ COPY sentry.* . 
# Note: Don't expose ports here, Compose will handle that for us # Start Next.js in development mode based on the preferred package manager -CMD \ - if [ -f yarn.lock ]; then yarn dev; \ - elif [ -f package-lock.json ]; then npm run dev; \ - elif [ -f pnpm-lock.yaml ]; then pnpm dev; \ - else yarn dev; \ - fi +# CMD \ +# if [ -f yarn.lock ]; then yarn dev; \ +# elif [ -f package-lock.json ]; then npm run dev; \ +# elif [ -f pnpm-lock.yaml ]; then pnpm dev; \ +# else yarn dev; \ +# fi diff --git a/agenta-web/package-lock.json b/agenta-web/package-lock.json index da84a9d2c8..6f4ce2c284 100644 --- a/agenta-web/package-lock.json +++ b/agenta-web/package-lock.json @@ -27,8 +27,8 @@ "@types/react-highlight-words": "^0.16.4", "@types/react-syntax-highlighter": "^15.5.7", "@types/uuid": "^9.0.7", - "ag-grid-community": "^30.0.6", - "ag-grid-react": "^30.0.3", + "ag-grid-community": "^31.0.1", + "ag-grid-react": "^31.0.1", "antd": "^5.4.7", "autoprefixer": "10.4.14", "axios": "^1.4.0", @@ -1766,19 +1766,19 @@ } }, "node_modules/ag-grid-community": { - "version": "30.2.1", - "resolved": "https://registry.npmjs.org/ag-grid-community/-/ag-grid-community-30.2.1.tgz", - "integrity": "sha512-1slonXskJbbI9ybhTx//4YKfJpZVAEnHL8dui1rQJRSXKByUi+/f7XtvkLsbgBkawoWbqvRAySjYtvz80+kBfA==" + "version": "31.0.1", + "resolved": "https://registry.npmjs.org/ag-grid-community/-/ag-grid-community-31.0.1.tgz", + "integrity": "sha512-RZQlW1DTOJHsUR/tnbnTJQKgAnDlHi05YYyTe5AgNor/1TlX1hoYdcqrGsJjvcHQgTjeEgzWOL0yf+KcqXZzxg==" }, "node_modules/ag-grid-react": { - "version": "30.2.1", - "resolved": "https://registry.npmjs.org/ag-grid-react/-/ag-grid-react-30.2.1.tgz", - "integrity": "sha512-WYt5ZstSoPEGAcTqXBdaonihXtapZdjTHZ3dc3xTK1xIdbF0/Vw4zDWCQSsG5H4M5CeUKjvbeHx7kKM1Yiah3g==", + "version": "31.0.1", + "resolved": "https://registry.npmjs.org/ag-grid-react/-/ag-grid-react-31.0.1.tgz", + "integrity": "sha512-9nmYPsgH1YUDUDOTiyaFsysoNAx/y72ovFJKuOffZC1V7OrQMadyP6DbqGFWCqzzoLJOY7azOr51dDQzAIXLpw==", "dependencies": { + "ag-grid-community": "~31.0.1", "prop-types": "^15.8.1" }, "peerDependencies": { - "ag-grid-community": "~30.2.1", "react": "^16.3.0 || ^17.0.0 || ^18.0.0", "react-dom": "^16.3.0 || ^17.0.0 || ^18.0.0" } diff --git a/agenta-web/package.json b/agenta-web/package.json index b4d786e016..7f74a72908 100644 --- a/agenta-web/package.json +++ b/agenta-web/package.json @@ -38,8 +38,8 @@ "@types/react-highlight-words": "^0.16.4", "@types/react-syntax-highlighter": "^15.5.7", "@types/uuid": "^9.0.7", - "ag-grid-community": "^30.0.6", - "ag-grid-react": "^30.0.3", + "ag-grid-community": "^31.0.1", + "ag-grid-react": "^31.0.1", "antd": "^5.4.7", "autoprefixer": "10.4.14", "axios": "^1.4.0", diff --git a/agenta-web/src/components/Sidebar/Sidebar.tsx b/agenta-web/src/components/Sidebar/Sidebar.tsx index 46eeee7235..2d9fb3204b 100644 --- a/agenta-web/src/components/Sidebar/Sidebar.tsx +++ b/agenta-web/src/components/Sidebar/Sidebar.tsx @@ -293,6 +293,28 @@ const Sidebar: React.FC = () => { + + }> + + {collapsed + ? "Perform 1-to-1 variant comparisons on testsets to identify superior options." 
+ : "Annotations"} + + + + ({ - header: { - margin: "1rem 0", - "& > h3": { - textAlign: "center", - }, - }, - date: { - fontSize: "0.75rem", - color: theme.colorTextSecondary, - display: "inline-block", - marginBottom: "1rem", - }, table: { - height: 500, + height: "calc(100vh - 240px)", + }, + infoRow: { + marginTop: "1rem", + margin: "0.75rem 0", + display: "flex", + alignItems: "center", + justifyContent: "space-between", }, })) interface Props {} const EvaluationCompareMode: React.FC = () => { - const router = useRouter() const appId = useAppId() const classes = useStyles() const {appTheme} = useAppTheme() - const evaluationIds = router.query.evaluations as string + const [evaluationIds, setEvaluationIds] = useQueryParam("evaluations") const [scenarios, setScenarios] = useState<_EvaluationScenario[]>([]) - const [evalaution, setEvaluation] = useState<_Evaluation[]>() const [fetching, setFetching] = useState(false) + const gridRef = useRef>() + + const evalautions = useMemo(() => { + return uniqBy( + scenarios.map((scenario) => scenario.evaluation), + "id", + ) + }, [scenarios]) + + const colors = useMemo(() => getTagColors(), [evalautions]) const colDefs = useMemo(() => { const colDefs: ColDef<_EvaluationScenario>[] = [] - if (!scenarios.length || !evalaution) return colDefs - - scenarios[0]?.inputs.forEach((input, index) => { - colDefs.push({ - headerName: `Input: ${input.name}`, - field: `inputs.${index}`, - valueGetter: (params) => { - return params.data?.inputs[index].value || "" - }, - }) - }) + if (!scenarios.length || !evalautions.length) return colDefs + colDefs.push({ headerName: "Expected Output", + minWidth: 280, + flex: 1, field: "correct_answer", + ...getFilterParams("text"), valueGetter: (params) => { - return params.data?.correct_answer?.value || "" + return params.data?.correct_answer?.toString() || "" }, + pinned: "left", + cellRenderer: LongTextCellRenderer, }) - evalaution.map( - (evalaution) => - evalaution?.variants.forEach((variant, index) => { - colDefs.push({ - headerName: `Output (${variant.variantName})`, - field: `outputs.${index}`, - valueGetter: (params) => { - return params.data?.outputs[index].value || "" - }, + evalautions.forEach((evalaution, vi) => { + evalaution?.variants.forEach((variant, index) => { + scenarios + .find((scenario) => scenario.evaluation.id === evalaution.id) + ?.inputs.forEach((input, index) => { + colDefs.push({ + headerComponent: () => ( + + Input: {input.name} + {variant.variantName} + + ), + minWidth: 200, + flex: 1, + field: `inputs.${index}`, + ...getFilterParams(input.type === "number" ? 
"number" : "text"), + valueGetter: (params) => { + return getTypedValue(params.data?.inputs[index]) + }, + cellRenderer: LongTextCellRenderer, + }) }) - }), - ) - - scenarios.map( - (scenario) => - scenario?.evaluators_configs.forEach((config, index) => { + colDefs.push({ + headerComponent: () => ( + + Output + {variant.variantName} + + ), + minWidth: 280, + flex: 1, + field: `outputs.${index}`, + ...getFilterParams("text"), + valueGetter: (params) => { + return getTypedValue(params.data?.outputs[index]) + }, + cellRenderer: LongTextCellRenderer, + }) + evalaution.aggregated_results.forEach(({evaluator_config: config}) => { colDefs.push({ - headerName: `Evaluator: ${config.name}`, - field: `results`, + flex: 1, + headerComponent: () => ( + + Evaluator: {config.name} + {variant.variantName} + + ), + field: "results", + ...getFilterParams("text"), valueGetter: (params) => { - return ( + return getTypedValue( params.data?.results.find( - (item) => item.evaluator.key === config.evaluator_key, - )?.result || "" + (item) => item.evaluator_config === config.id, + )?.result, ) }, }) - }), - ) + }) + }) + }) return colDefs - }, [evalaution, scenarios]) + }, [scenarios]) + + const fetcher = () => { + setFetching(true) + Promise.all( + (evaluationIds?.split(",") || []).map((evalId) => + fetchAllEvaluationScenarios(appId, evalId), + ), + ) + .then((scenariosNest) => { + setScenarios(scenariosNest.flat(1)) + setTimeout(() => { + if (!gridRef.current) return + + const ids: string[] = + gridRef.current.api + .getColumns() + ?.filter((column) => column.getColDef().field?.startsWith("results")) + ?.map((item) => item.getColId()) || [] + gridRef.current.api.autoSizeColumns(ids, false) + setFetching(false) + }, 100) + }) + .catch(() => setFetching(false)) + } useEffect(() => { - const fetcher = async () => { - setFetching(true) - - try { - const evaluationIdsArray = evaluationIds?.split(",") || [] - - const fetchPromises = evaluationIdsArray.map((evalId) => { - return Promise.all([ - fetchAllEvaluationScenarios(appId, evalId), - fetchEvaluation(evalId), - ]) - }) - - const results = await Promise.all(fetchPromises) - const fetchedScenarios = results.map(([[scenarios]]) => scenarios) - const fetchedEvaluations = results.map(([_, evaluation]) => evaluation) - - setScenarios(fetchedScenarios) - setEvaluation(fetchedEvaluations) - } catch (error) { - console.error(error) - } finally { - setFetching(false) - } - } - fetcher() }, [appId, evaluationIds]) - const handleDeleteVariant = (variantId: string) => { - console.log(variantId) + const handleDeleteVariant = (evalId: string) => { + setEvaluationIds( + evaluationIds + ?.split(",") + .filter((item) => item !== evalId) + .join(","), + ) + } + + const onExport = () => { + if (!gridRef.current) return + const {currentApp} = getAppValues() + gridRef.current.api.exportDataAsCsv({ + fileName: `${currentApp?.app_name}_${evalautions + .map(({variants}) => variants[0].variantName) + .join("_")}.csv`, + }) } return (
-
- - Testset: {evalaution ? evalaution[0]?.testset.name : ""} - - - Variants: - {evalaution?.map( - (evalaution) => - evalaution?.variants?.map((variant) => ( + Evaluations Comparison +
+ + + Testset: + {evalautions[0]?.testset.name || ""} + + + Variants: +
+ {evalautions?.map(({variants, id}, vi) => ( handleDeleteVariant(variant.variantId)} + key={id} + color={colors[vi]} + onClose={() => handleDeleteVariant(id)} closable > - {variant.variantName} + {variants[0].variantName} - )), - )} + ))} +
+
+ + +
@@ -159,6 +216,7 @@ const EvaluationCompareMode: React.FC = () => { } ${classes.table}`} > + ref={gridRef as any} rowData={scenarios} columnDefs={colDefs} getRowId={(params) => params.data.id} diff --git a/agenta-web/src/components/pages/evaluations/evaluationResults/EvaluationResults.tsx b/agenta-web/src/components/pages/evaluations/evaluationResults/EvaluationResults.tsx index 03582ed803..ddca14361b 100644 --- a/agenta-web/src/components/pages/evaluations/evaluationResults/EvaluationResults.tsx +++ b/agenta-web/src/components/pages/evaluations/evaluationResults/EvaluationResults.tsx @@ -1,12 +1,20 @@ -import React, {useEffect, useMemo, useRef, useState} from "react" +import React, {useCallback, useEffect, useMemo, useRef, useState} from "react" import {AgGridReact} from "ag-grid-react" import {useAppTheme} from "@/components/Layout/ThemeContextProvider" import {ColDef, ICellRendererParams} from "ag-grid-community" import {createUseStyles} from "react-jss" -import {Button, GlobalToken, Space, Spin, Typography, theme} from "antd" -import {DeleteOutlined, PlusCircleOutlined, SlidersOutlined, SwapOutlined} from "@ant-design/icons" -import {EvaluationStatus, JSSTheme, _Evaluation} from "@/lib/Types" -import {uniqBy} from "lodash" +import {Button, GlobalToken, Space, Spin, Typography, message, theme} from "antd" +import { + CopyOutlined, + DeleteOutlined, + FullscreenExitOutlined, + FullscreenOutlined, + PlusCircleOutlined, + SlidersOutlined, + SwapOutlined, +} from "@ant-design/icons" +import {EvaluationStatus, GenericObject, JSSTheme, TypedValue, _Evaluation} from "@/lib/Types" +import {capitalize, round, uniqBy} from "lodash" import dayjs from "dayjs" import relativeTime from "dayjs/plugin/relativeTime" import duration from "dayjs/plugin/duration" @@ -15,7 +23,7 @@ import {useAppId} from "@/hooks/useAppId" import {deleteEvaluations, fetchAllEvaluations, fetchEvaluationStatus} from "@/services/evaluations" import {useRouter} from "next/router" import {useUpdateEffect} from "usehooks-ts" -import {durationToStr, shortPoll} from "@/lib/helpers/utils" +import {shortPoll} from "@/lib/helpers/utils" import AlertPopup from "@/components/AlertPopup/AlertPopup" import {useDurationCounter} from "@/hooks/useDurationCounter" dayjs.extend(relativeTime) @@ -57,6 +65,27 @@ const useStyles = createUseStyles((theme: JSSTheme) => ({ date: { color: "#8c8c8c", }, + longCell: { + height: "100%", + position: "relative", + overflow: "hidden", + textOverflow: "ellipsis", + whiteSpace: "nowrap", + "& .ant-space": { + position: "absolute", + bottom: 2, + right: 0, + height: 35, + backgroundColor: theme.colorBgContainer, + padding: "0.5rem", + borderRadius: theme.borderRadius, + border: `1px solid ${theme.colorBorder}`, + display: "none", + }, + "&:hover .ant-space": { + display: "inline-flex", + }, + }, })) const statusMapper = (token: GlobalToken) => ({ @@ -78,9 +107,100 @@ const statusMapper = (token: GlobalToken) => ({ }, }) -interface Props {} +export function getTypedValue(res?: TypedValue) { + const {value, type} = res || {} + return type === "number" + ? round(Number(value), 2) + : ["boolean", "bool"].includes(type as string) + ? 
capitalize(value?.toString()) + : value?.toString() +} + +export function getFilterParams(type: "number" | "text" | "date") { + const filterParams: GenericObject = {} + if (type == "date") { + filterParams.comparator = function ( + filterLocalDateAtMidnight: Date, + cellValue: string | null, + ) { + if (cellValue == null) return -1 + const cellDate = dayjs(cellValue).startOf("day").toDate() + if (filterLocalDateAtMidnight.getTime() === cellDate.getTime()) { + return 0 + } + if (cellDate < filterLocalDateAtMidnight) { + return -1 + } + if (cellDate > filterLocalDateAtMidnight) { + return 1 + } + } + } + + return { + sortable: true, + floatingFilter: true, + filter: + type === "number" + ? "agNumberColumnFilter" + : type === "date" + ? "agDateColumnFilter" + : "agTextColumnFilter", + cellDataType: type, + filterParams, + } +} + +export function LongTextCellRenderer(params: ICellRendererParams) { + const {value, api, node} = params + const [expanded, setExpanded] = useState( + node.rowHeight !== api.getSizesForCurrentTheme().rowHeight, + ) + const classes = useStyles() + + const onCopy = useCallback(() => { + navigator.clipboard + .writeText(value as string) + .then(() => { + message.success("Copied to clipboard") + }) + .catch(console.error) + }, []) + + const onExpand = useCallback(() => { + node.setRowHeight(api.getSizesForCurrentTheme().rowHeight * (expanded ? 1 : 5)) + api.onRowHeightChanged() + }, [expanded]) -const EvaluationResults: React.FC = () => { + useEffect(() => { + node.addEventListener("heightChanged", () => { + setExpanded(node.rowHeight !== api.getSizesForCurrentTheme().rowHeight) + }) + }, []) + + return ( +
+ {value} + + {expanded ? ( + + ) : ( + + )} + + +
+ ) +} + +interface Props { + type?: "auto" | "human" +} + +const EvaluationResults: React.FC = ({type = "auto"}) => { const {appTheme} = useAppTheme() const classes = useStyles() const appId = useAppId() @@ -175,15 +295,26 @@ const EvaluationResults: React.FC = () => { headerCheckboxSelection: true, checkboxSelection: true, showDisabledCheckboxes: true, + ...getFilterParams("text"), + pinned: "left", + }, + { + field: "testset.name", + flex: 1, + minWidth: 160, + tooltipValueGetter: (params) => params.value, + ...getFilterParams("text"), }, - {field: "testset.name", flex: 1, minWidth: 160}, { field: "variants", flex: 1, minWidth: 160, valueGetter: (params) => params.data?.variants.map((item) => item.variantName).join(","), - headerName: "Variant", + headerName: "Variant(s)", + tooltipValueGetter: (params) => + params.data?.variants.map((item) => item.variantName).join(","), + ...getFilterParams("text"), }, ...evaluatorConfigs.map( (config) => @@ -196,16 +327,26 @@ const EvaluationResults: React.FC = () => { {config.name} ), + ...getFilterParams("number"), valueGetter: (params) => - params.data?.aggregated_results.find( - (item) => item.evaluator_config.id === config.id, - )?.result?.value || "", + getTypedValue( + params.data?.aggregated_results.find( + (item) => item.evaluator_config.id === config.id, + )?.result, + ), + tooltipValueGetter: (params) => + params.data?.aggregated_results + .find((item) => item.evaluator_config.id === config.id) + ?.result?.value?.toString() || "", }) as ColDef<_Evaluation>, ), { flex: 1, field: "status", minWidth: 200, + ...getFilterParams("text"), + filterValueGetter: (params) => + statusMapper(token)[params.data?.status as EvaluationStatus].label, cellRenderer: (params: ICellRendererParams<_Evaluation>) => { const classes = useStyles() const duration = useDurationCounter( @@ -231,6 +372,7 @@ const EvaluationResults: React.FC = () => { field: "created_at", headerName: "Created", minWidth: 120, + ...getFilterParams("date"), valueFormatter: (params) => dayjs(params.value).fromNow(), }, ] @@ -290,11 +432,13 @@ const EvaluationResults: React.FC = () => { columnDefs={colDefs} getRowId={(params) => params.data.id} onRowClicked={(params) => + EvaluationStatus.FINISHED === params.data?.status && router.push(`/apps/${appId}/evaluations-new/${params.data?.id}`) } rowSelection="multiple" suppressRowClickSelection onSelectionChanged={(event) => setSelected(event.api.getSelectedRows())} + tooltipShowDelay={0} />
diff --git a/agenta-web/src/components/pages/evaluations/evaluationScenarios/EvaluationScenarios.tsx b/agenta-web/src/components/pages/evaluations/evaluationScenarios/EvaluationScenarios.tsx index 1da8e3a523..7208c411da 100644 --- a/agenta-web/src/components/pages/evaluations/evaluationScenarios/EvaluationScenarios.tsx +++ b/agenta-web/src/components/pages/evaluations/evaluationScenarios/EvaluationScenarios.tsx @@ -1,42 +1,38 @@ import {useAppTheme} from "@/components/Layout/ThemeContextProvider" import {useAppId} from "@/hooks/useAppId" import {JSSTheme, _Evaluation, _EvaluationScenario} from "@/lib/Types" -import {fetchAllEvaluationScenarios, fetchEvaluation} from "@/services/evaluations" +import {deleteEvaluations, fetchAllEvaluationScenarios} from "@/services/evaluations" import {DeleteOutlined, DownloadOutlined} from "@ant-design/icons" import {ColDef} from "ag-grid-community" import {AgGridReact} from "ag-grid-react" -import {Spin, Typography} from "antd" +import {Space, Spin, Tooltip, Typography} from "antd" import dayjs from "dayjs" import {useRouter} from "next/router" -import React, {useEffect, useMemo, useState} from "react" +import React, {useEffect, useMemo, useRef, useState} from "react" import {createUseStyles} from "react-jss" +import { + LongTextCellRenderer, + getFilterParams, + getTypedValue, +} from "../evaluationResults/EvaluationResults" +import {getAppValues} from "@/contexts/app.context" +import AlertPopup from "@/components/AlertPopup/AlertPopup" const useStyles = createUseStyles((theme: JSSTheme) => ({ - header: { + infoRow: { marginTop: "1rem", + margin: "0.75rem 0", display: "flex", alignItems: "center", justifyContent: "space-between", - - "& > h3": { - margin: 0, - }, - - "& > :last-child": { - display: "flex", - alignItems: "center", - gap: "1rem", - }, }, date: { - marginTop: "0.25rem", fontSize: "0.75rem", color: "#8c8c8c", display: "inline-block", - marginBottom: "1rem", }, table: { - height: "calc(100vh - 220px)", + height: "calc(100vh - 240px)", }, })) @@ -49,8 +45,9 @@ const EvaluationScenarios: React.FC = () => { const {appTheme} = useAppTheme() const evaluationId = router.query.evaluation_id as string const [scenarios, setScenarios] = useState<_EvaluationScenario[]>([]) - const [evalaution, setEvaluation] = useState<_Evaluation>() const [fetching, setFetching] = useState(false) + const gridRef = useRef>() + const evalaution = scenarios[0]?.evaluation const colDefs = useMemo(() => { const colDefs: ColDef<_EvaluationScenario>[] = [] @@ -58,38 +55,47 @@ const EvaluationScenarios: React.FC = () => { scenarios[0]?.inputs.forEach((input, index) => { colDefs.push({ + flex: 1, headerName: `Input: ${input.name}`, + ...getFilterParams(input.type === "number" ? 
"number" : "text"), field: `inputs.${index}`, valueGetter: (params) => { - return params.data?.inputs[index].value || "" + return getTypedValue(params.data?.inputs[index]) }, + cellRenderer: LongTextCellRenderer, }) }) colDefs.push({ + flex: 1, headerName: "Expected Output", field: "correct_answer", + ...getFilterParams("text"), valueGetter: (params) => { - return params.data?.correct_answer?.value || "" + return params.data?.correct_answer?.toString() || "" }, + cellRenderer: LongTextCellRenderer, }) - evalaution?.variants.forEach((variant, index) => { + evalaution?.variants.forEach((_, index) => { colDefs.push({ - headerName: `Output (${variant.variantName})`, - field: `outputs.${index}`, + flex: 1, + headerName: "Output", + ...getFilterParams("text"), + field: `outputs.0`, valueGetter: (params) => { - return params.data?.outputs[index].value || "" + return getTypedValue(params.data?.outputs[index]) }, + cellRenderer: LongTextCellRenderer, }) }) - scenarios[0]?.evaluators_configs.forEach((config, index) => { + scenarios[0]?.evaluators_configs.forEach((config) => { colDefs.push({ headerName: `Evaluator: ${config.name}`, field: `results`, + ...getFilterParams("text"), valueGetter: (params) => { - return ( - params.data?.results.find( - (item) => item.evaluator.key === config.evaluator_key, - )?.result || "" + return getTypedValue( + params.data?.results.find((item) => item.evaluator_config === config.id) + ?.result, ) }, }) @@ -99,13 +105,20 @@ const EvaluationScenarios: React.FC = () => { const fetcher = () => { setFetching(true) - Promise.all([ - fetchAllEvaluationScenarios(appId, evaluationId), - fetchEvaluation(evaluationId), - ]) - .then(([scenarios, evaluation]) => { + fetchAllEvaluationScenarios(appId, evaluationId) + .then((scenarios) => { setScenarios(scenarios) - setEvaluation(evaluation) + setTimeout(() => { + if (!gridRef.current) return + + const ids: string[] = + gridRef.current.api + .getColumns() + ?.filter((column) => column.getColDef().field === "results") + ?.map((item) => item.getColId()) || [] + gridRef.current.api.autoSizeColumns(ids, false) + setFetching(false) + }, 100) }) .catch(console.error) .finally(() => setFetching(false)) @@ -115,20 +128,53 @@ const EvaluationScenarios: React.FC = () => { fetcher() }, [appId, evaluationId]) + const onExport = () => { + if (!gridRef.current) return + const {currentApp} = getAppValues() + gridRef.current.api.exportDataAsCsv({ + fileName: `${currentApp?.app_name}_${evalaution.variants[0].variantName}.csv`, + }) + } + + const onDelete = () => { + AlertPopup({ + title: "Delete Evaluation", + message: "Are you sure you want to delete this evaluation?", + onOk: () => + deleteEvaluations([evaluationId]) + .then(() => router.push(`/apps/${appId}/evaluations-new`)) + .catch(console.error), + }) + } + return (
-
- - Evaluation Result (Testset: {evalaution?.testset.name || ""}) - -
- - -
+ Evaluation Results +
+ + + {dayjs(evalaution?.created_at).format("DD MMM YYYY | h:m a")} + + + Testset: + {evalaution?.testset.name || ""} + + + Variant: + + {evalaution?.variants[0].variantName || ""} + + + + + + + + + + +
- - {dayjs(evalaution?.created_at).format("DD MMM YYYY | h:m a")} -
= () => { } ${classes.table}`} > + ref={gridRef as any} rowData={scenarios} columnDefs={colDefs} getRowId={(params) => params.data.id} diff --git a/agenta-web/src/lib/Types.ts b/agenta-web/src/lib/Types.ts index 1f7153b31d..4e0e50a709 100644 --- a/agenta-web/src/lib/Types.ts +++ b/agenta-web/src/lib/Types.ts @@ -294,7 +294,7 @@ export type ChatMessage = { } type ValueType = number | string | boolean | GenericObject | null -type ValueTypeOptions = "text" | "number" | "boolean" | "string" | "code" | "regex" +type ValueTypeOptions = "text" | "number" | "boolean" | "bool" | "string" | "code" | "regex" //evaluation revamp types export interface EvaluationSettingsTemplate { @@ -360,16 +360,13 @@ export interface _Evaluation { export interface _EvaluationScenario { id: string - user: User - organization: Org + evaluation_id: string evaluation: _Evaluation + evaluators_configs: EvaluatorConfig[] inputs: (TypedValue & {name: string})[] outputs: TypedValue[] - correct_answer?: TypedValue - created_at?: string - updated_at?: string + correct_answer?: string is_pinned?: boolean note?: string - evaluators_configs: EvaluatorConfig[] - results: {evaluator: Evaluator; result: ValueType}[] + results: {evaluator_config: string; result: TypedValue}[] } diff --git a/agenta-web/src/pages/apps/[app_id]/annotations/[annotation_id]/index.tsx b/agenta-web/src/pages/apps/[app_id]/annotations/[annotation_id]/index.tsx new file mode 100644 index 0000000000..d4a63b599a --- /dev/null +++ b/agenta-web/src/pages/apps/[app_id]/annotations/[annotation_id]/index.tsx @@ -0,0 +1,9 @@ +import React from "react" + +interface Props {} + +const AnnotationScenarios: React.FC = () => { + return
+} + +export default AnnotationScenarios diff --git a/agenta-web/src/pages/apps/[app_id]/annotations/index.tsx b/agenta-web/src/pages/apps/[app_id]/annotations/index.tsx new file mode 100644 index 0000000000..68b1be8ff8 --- /dev/null +++ b/agenta-web/src/pages/apps/[app_id]/annotations/index.tsx @@ -0,0 +1,10 @@ +import EvaluationResults from "@/components/pages/evaluations/evaluationResults/EvaluationResults" +import React from "react" + +interface Props {} + +const Annotations: React.FC = () => { + return +} + +export default Annotations diff --git a/agenta-web/src/services/evaluations/index.ts b/agenta-web/src/services/evaluations/index.ts index e89c29deda..61363072cb 100644 --- a/agenta-web/src/services/evaluations/index.ts +++ b/agenta-web/src/services/evaluations/index.ts @@ -125,11 +125,18 @@ export const deleteEvaluations = async (evaluationsIds: string[]) => { // Evaluation Scenarios export const fetchAllEvaluationScenarios = async (appId: string, evaluationId: string) => { - // await delay(1000) - // return Mock.evaluationScenarios - - const response = await axios.get(`/api/evaluations/${evaluationId}/evaluation_scenarios/`, { - params: {app_id: appId}, + const [{data: evaluationScenarios}, evaluation] = await Promise.all([ + axios.get(`/api/evaluations/${evaluationId}/evaluation_scenarios/`, { + params: {app_id: appId}, + }), + fetchEvaluation(evaluationId), + ]) + + evaluationScenarios.forEach((scenario: _EvaluationScenario) => { + scenario.evaluation = evaluation + scenario.evaluators_configs = evaluation.aggregated_results.map( + (item) => item.evaluator_config, + ) }) - return response.data as _EvaluationScenario[] + return evaluationScenarios as _EvaluationScenario[] } From 18310d9c7711592698407dcf837af0ffe874409f Mon Sep 17 00:00:00 2001 From: MohammedMaaz Date: Tue, 2 Jan 2024 14:57:31 +0500 Subject: [PATCH 205/414] annotation services and global date transformer in axios interceptor --- agenta-web/src/components/Sidebar/Sidebar.tsx | 4 +- .../evaluationResults/EvaluationResults.tsx | 46 ++++++++------- agenta-web/src/hooks/useDurationCounter.ts | 4 +- agenta-web/src/lib/Types.ts | 23 ++++++++ agenta-web/src/lib/helpers/axiosConfig.ts | 15 ++++- agenta-web/src/lib/helpers/utils.ts | 10 ++-- agenta-web/src/services/evaluations/index.ts | 57 +++++++++++++++++++ 7 files changed, 129 insertions(+), 30 deletions(-) diff --git a/agenta-web/src/components/Sidebar/Sidebar.tsx b/agenta-web/src/components/Sidebar/Sidebar.tsx index 2d9fb3204b..b1744feac6 100644 --- a/agenta-web/src/components/Sidebar/Sidebar.tsx +++ b/agenta-web/src/components/Sidebar/Sidebar.tsx @@ -293,7 +293,7 @@ const Sidebar: React.FC = () => { - { : "Annotations"} - + */} ) => { + const classes = useStyles() + const {token} = theme.useToken() + const duration = useDurationCounter( + params.data?.duration || 0, + [EvaluationStatus.STARTED, EvaluationStatus.INITIALIZED].includes(params.value), + ) + const {label, color} = statusMapper(token)[params.value as EvaluationStatus] + + return ( + +
+ {label} + + {duration} + + ) + }, + (prev, next) => prev.value === next.value && prev.data?.duration === next.data?.duration, +) + interface Props { type?: "auto" | "human" } @@ -343,35 +365,17 @@ const EvaluationResults: React.FC = ({type = "auto"}) => { { flex: 1, field: "status", - minWidth: 200, + minWidth: 185, ...getFilterParams("text"), filterValueGetter: (params) => statusMapper(token)[params.data?.status as EvaluationStatus].label, - cellRenderer: (params: ICellRendererParams<_Evaluation>) => { - const classes = useStyles() - const duration = useDurationCounter( - params.data?.duration || 0, - [EvaluationStatus.STARTED, EvaluationStatus.INITIALIZED].includes( - params.value, - ), - ) - const {label, color} = statusMapper(token)[params.value as EvaluationStatus] - - return ( - -
- {label} - - {duration} - - ) - }, + cellRenderer: StatusRenderer, }, { flex: 1, field: "created_at", headerName: "Created", - minWidth: 120, + minWidth: 160, ...getFilterParams("date"), valueFormatter: (params) => dayjs(params.value).fromNow(), }, diff --git a/agenta-web/src/hooks/useDurationCounter.ts b/agenta-web/src/hooks/useDurationCounter.ts index ab884339d6..74baaed0e0 100644 --- a/agenta-web/src/hooks/useDurationCounter.ts +++ b/agenta-web/src/hooks/useDurationCounter.ts @@ -7,8 +7,8 @@ export const useDurationCounter = (duration: number, isRunning: boolean = true) useEffect(() => { if (isRunning) { const interval = setInterval(() => { - setElapsed((prev) => prev + 100) - }, 100) + setElapsed((prev) => prev + 1000) + }, 1000) return () => clearInterval(interval) } }, [isRunning]) diff --git a/agenta-web/src/lib/Types.ts b/agenta-web/src/lib/Types.ts index 4e0e50a709..2f33533ee0 100644 --- a/agenta-web/src/lib/Types.ts +++ b/agenta-web/src/lib/Types.ts @@ -370,3 +370,26 @@ export interface _EvaluationScenario { note?: string results: {evaluator_config: string; result: TypedValue}[] } + +export interface Annotation { + id: string + app_id: string + variants: {variantId: string; variantName: string}[] + annotation_name: "flag" | "score" + testset: { + id: string + name: string + } + aggregated_results: string[] +} + +export interface AnnotationScenario { + id: string + annotation_id: string + annotation: Annotation + inputs: (TypedValue & {name: string})[] + outputs: TypedValue[] + is_pinned?: boolean + note?: string + result: TypedValue +} diff --git a/agenta-web/src/lib/helpers/axiosConfig.ts b/agenta-web/src/lib/helpers/axiosConfig.ts index 258ad9f864..cbdb76ef95 100644 --- a/agenta-web/src/lib/helpers/axiosConfig.ts +++ b/agenta-web/src/lib/helpers/axiosConfig.ts @@ -3,6 +3,7 @@ import {getErrorMessage, globalErrorHandler} from "./errorHandler" import {signOut} from "supertokens-auth-react/recipe/thirdpartypasswordless" import router from "next/router" import {getAgentaApiUrl} from "./utils" +import {isObject} from "lodash" const axios = axiosApi.create({ baseURL: getAgentaApiUrl(), @@ -12,7 +13,19 @@ const axios = axiosApi.create({ }) axios.interceptors.response.use( - (response) => response, + (response) => { + const {data} = response + // deep convert all UTC dats to local + if (data && isObject(data)) + response.data = JSON.parse(JSON.stringify(data), (k, v) => { + return ["created_at", "updated_at"].includes(k) && + typeof v === "string" && + !v.endsWith("Z") + ? 
v + "Z" + : v + }) + return response + }, (error) => { // if axios config has _ignoreError set to true, then don't handle error if (error.config?._ignoreError) throw error diff --git a/agenta-web/src/lib/helpers/utils.ts b/agenta-web/src/lib/helpers/utils.ts index 093d0d4ef0..01abcbb1d5 100644 --- a/agenta-web/src/lib/helpers/utils.ts +++ b/agenta-web/src/lib/helpers/utils.ts @@ -4,6 +4,8 @@ import {GenericObject} from "../Types" import promiseRetry from "promise-retry" import {getErrorMessage} from "./errorHandler" import dayjs from "dayjs" +import utc from "dayjs/plugin/utc" +dayjs.extend(utc) const llmAvailableProvidersToken = "llmAvailableProvidersToken" @@ -346,10 +348,10 @@ export function durationToStr(duration: number) { const mins = Math.floor(dayjs.duration(duration, "milliseconds").asMinutes()) const secs = Math.floor(dayjs.duration(duration, "milliseconds").asSeconds()) - if (days > 0) return `${days} days` - if (hours > 0) return `${hours} hours` - if (mins > 0) return `${mins} mins` - return `${secs} seconds` + if (days > 0) return `${days}d ${hours}h` + if (hours > 0) return `${hours}h ${mins}m` + if (mins > 0) return `${mins}m ${secs}s` + return `${secs}s` } type DayjsDate = Parameters[0] diff --git a/agenta-web/src/services/evaluations/index.ts b/agenta-web/src/services/evaluations/index.ts index 61363072cb..a2cc6fc55e 100644 --- a/agenta-web/src/services/evaluations/index.ts +++ b/agenta-web/src/services/evaluations/index.ts @@ -1,8 +1,11 @@ import axios from "@/lib//helpers/axiosConfig" import { + Annotation, + AnnotationScenario, EvaluationStatus, Evaluator, EvaluatorConfig, + TypedValue, _Evaluation, _EvaluationScenario, } from "@/lib/Types" @@ -140,3 +143,57 @@ export const fetchAllEvaluationScenarios = async (appId: string, evaluationId: s }) return evaluationScenarios as _EvaluationScenario[] } + +//annotations +export const fetchAllAnnotations = async (appId: string) => { + const response = await axios.get(`/api/annotations/`, {params: {app_id: appId}}) + return response.data.map(evaluationTransformer) as Annotation[] +} + +export const fetchAnnotation = async (annotationId: string) => { + const response = await axios.get(`/api/annotations/${annotationId}/`) + return evaluationTransformer(response.data) as unknown as Annotation +} + +export const fetchAnnotationStatus = async (annotationId: string) => { + const response = await axios.get(`/api/annotations/${annotationId}/status/`) + return response.data as {status: EvaluationStatus} +} + +export const createAnnotation = async ( + appId: string, + annotation: Omit & + Pick, +) => { + return axios.post(`/api/annotations/`, {...annotation, app_id: appId}) +} + +export const deleteAnnotations = async (annotationsIds: string[]) => { + return axios.delete(`/api/annotations/`, {data: {annotations_ids: annotationsIds}}) +} + +// Annotation Scenarios +export const fetchAllAnnotationScenarios = async (appId: string, annotationId: string) => { + const [{data: annotationScenarios}, annotation] = await Promise.all([ + axios.get(`/api/annotations/${annotationId}/annotation_scenarios/`, { + params: {app_id: appId}, + }), + fetchAnnotation(annotationId), + ]) + + annotationScenarios.forEach((scenario: AnnotationScenario) => { + scenario.annotation = annotation + }) + return annotationScenarios as AnnotationScenario[] +} + +export const updateAnnotationScenario = async ( + annotationId: string, + annotationScenarioId: string, + data: Pick, +) => { + return axios.put( + 
`/api/annotations/${annotationId}/annotation_scenarios/${annotationScenarioId}`, + data, + ) +} From 372e7c7c48542b5c98d2f9a5df16d157ad5e32f4 Mon Sep 17 00:00:00 2001 From: Akrem Abayed Date: Tue, 2 Jan 2024 11:25:36 +0100 Subject: [PATCH 206/414] add same old models renamed with human evals --- .../agenta_backend/models/db_models.py | 48 +++++++++++++++++-- 1 file changed, 45 insertions(+), 3 deletions(-) diff --git a/agenta-backend/agenta_backend/models/db_models.py b/agenta-backend/agenta_backend/models/db_models.py index 8e8471a123..0ed516cfd9 100644 --- a/agenta-backend/agenta_backend/models/db_models.py +++ b/agenta-backend/agenta_backend/models/db_models.py @@ -248,6 +248,49 @@ class EvaluationScenarioOutputDB(EmbeddedModel): value: Any +class HumanEvaluationScenarioInput(EmbeddedModel): + input_name: str + input_value: str + + +class HumanEvaluationScenarioOutput(EmbeddedModel): + variant_id: str + variant_output: str + + +class HumanEvaluationDB(Model): + app: AppDB = Reference(key_name="app") + organization: OrganizationDB = Reference(key_name="organization") + user: UserDB = Reference(key_name="user") + status: str + evaluation_type: str + variants: List[ObjectId] + testset: TestSetDB = Reference(key_name="testsets") + created_at: Optional[datetime] = Field(default=datetime.utcnow()) + updated_at: Optional[datetime] = Field(default=datetime.utcnow()) + + class Config: + collection = "evaluations" + + +class HumanEvaluationScenarioDB(Model): + user: UserDB = Reference(key_name="user") + organization: OrganizationDB = Reference(key_name="organization") + evaluation: HumanEvaluationDB = Reference(key_name="evaluations") + inputs: List[HumanEvaluationScenarioInput] + outputs: List[HumanEvaluationScenarioOutput] + vote: Optional[str] + score: Optional[Union[str, int]] + correct_answer: Optional[str] + created_at: Optional[datetime] = Field(default=datetime.utcnow()) + updated_at: Optional[datetime] = Field(default=datetime.utcnow()) + is_pinned: Optional[bool] + note: Optional[str] + + class Config: + collection = "evaluation_scenarios" + + class EvaluationDB(Model): app: AppDB = Reference(key_name="app") organization: OrganizationDB = Reference(key_name="organization") @@ -350,7 +393,6 @@ class AnnoationResult(EmbeddedModel): class AnnotationScenarioResult(EmbeddedModel): - variant_id: str result: Result @@ -359,7 +401,7 @@ class AnnotationsDB(Model): organization: OrganizationDB = Reference(key_name="organization") user: UserDB = Reference(key_name="user") variants_ids: List[ObjectId] - testset_id: ObjectId + testset: TestSetDB = Reference() status: str = Field(default="ANNOTATION_INITIALIZED") annotation_name: str aggregated_results: List[AnnoationResult] @@ -379,7 +421,7 @@ class AnnotationsScenariosDB(Model): outputs: List[AnnotationScenarioOutputDB] is_pinned: Optional[bool] note: Optional[str] - result: Optional[AnnotationScenarioResult] = None + result: Optional[Union[dict, Result]] = Field(default=None) created_at: datetime = Field(default=datetime.utcnow()) updated_at: datetime = Field(default=datetime.utcnow()) From b60ee592f5ceda3f01acc3ca92885c803f96d814 Mon Sep 17 00:00:00 2001 From: Akrem Abayed Date: Tue, 2 Jan 2024 11:26:29 +0100 Subject: [PATCH 207/414] add old eval for human evals --- .../models/api/evaluation_model.py | 10 +++ .../routers/evaluation_router.py | 45 +++++++++++ .../services/evaluation_service.py | 76 ++++++++++++++++++- 3 files changed, 128 insertions(+), 3 deletions(-) diff --git a/agenta-backend/agenta_backend/models/api/evaluation_model.py 
b/agenta-backend/agenta_backend/models/api/evaluation_model.py index b25dec6988..1262120eb7 100644 --- a/agenta-backend/agenta_backend/models/api/evaluation_model.py +++ b/agenta-backend/agenta_backend/models/api/evaluation_model.py @@ -56,6 +56,16 @@ class AggregatedResult(BaseModel): result: Result +class NewHumanEvaluation(BaseModel): + app_id: str + variant_ids: List[str] + evaluation_type: EvaluationType + evaluation_type_settings: Optional[EvaluationTypeSettings] + inputs: List[str] + testset_id: str + status: str + + class Evaluation(BaseModel): id: str app_id: str diff --git a/agenta-backend/agenta_backend/routers/evaluation_router.py b/agenta-backend/agenta_backend/routers/evaluation_router.py index ca83452243..3fc0b6a670 100644 --- a/agenta-backend/agenta_backend/routers/evaluation_router.py +++ b/agenta-backend/agenta_backend/routers/evaluation_router.py @@ -23,6 +23,8 @@ CreateCustomEvaluation, EvaluationUpdate, EvaluationWebhook, + NewHumanEvaluation, + SimpleEvaluationOutput, ) from agenta_backend.services.evaluation_service import ( UpdateEvaluationScenarioError, @@ -55,6 +57,49 @@ router = APIRouter() +@router.post( + "/human-evaluations/", response_model=SimpleEvaluationOutput, operation_id="create_evaluation" +) +async def create_evaluation( + payload: NewHumanEvaluation, + request: Request, +): + """Creates a new comparison table document + Raises: + HTTPException: _description_ + Returns: + _description_ + """ + try: + user_org_data: dict = await get_user_and_org_id(request.state.user_id) + access_app = await check_access_to_app( + user_org_data=user_org_data, + app_id=payload.app_id, + check_owner=False, + ) + if not access_app: + error_msg = f"You do not have access to this app: {payload.app_id}" + return JSONResponse( + {"detail": error_msg}, + status_code=400, + ) + app = await db_manager.fetch_app_by_id(app_id=payload.app_id) + + if app is None: + raise HTTPException(status_code=404, detail="App not found") + + new_evaluation_db = await evaluation_service.create_new_human_evaluation( + payload, **user_org_data + ) + print(new_evaluation_db) + return converters.evaluation_db_to_simple_evaluation_output(new_evaluation_db) + except KeyError: + raise HTTPException( + status_code=400, + detail="columns in the test set should match the names of the inputs in the variant", + ) + + @router.post("/") async def create_evaluation( payload: NewEvaluation, diff --git a/agenta-backend/agenta_backend/services/evaluation_service.py b/agenta-backend/agenta_backend/services/evaluation_service.py index 14001db5d1..0474ace172 100644 --- a/agenta-backend/agenta_backend/services/evaluation_service.py +++ b/agenta-backend/agenta_backend/services/evaluation_service.py @@ -19,6 +19,7 @@ CreateCustomEvaluation, EvaluationUpdate, EvaluationStatusEnum, + NewHumanEvaluation, ) from agenta_backend.models import converters from agenta_backend.utils.common import engine, check_access_to_app @@ -28,6 +29,9 @@ AppVariantDB, EvaluationDB, EvaluationScenarioDB, + HumanEvaluationDB, + HumanEvaluationScenarioDB, + HumanEvaluationScenarioInput, UserDB, AppDB, EvaluationScenarioInputDB, @@ -111,7 +115,7 @@ async def prepare_csvdata_and_create_evaluation_scenario( csvdata: List[Dict[str, str]], payload_inputs: List[str], evaluation_type: EvaluationType, - new_evaluation: EvaluationDB, + new_evaluation: HumanEvaluationDB, user: UserDB, app: AppDB, ): @@ -149,7 +153,7 @@ async def prepare_csvdata_and_create_evaluation_scenario( # Create evaluation scenarios list_of_scenario_input = [] for scenario_input in 
inputs: - eval_scenario_input_instance = EvaluationScenarioInputDB( + eval_scenario_input_instance = HumanEvaluationScenarioInput( input_name=scenario_input["input_name"], input_value=scenario_input["input_value"], ) @@ -164,7 +168,7 @@ async def prepare_csvdata_and_create_evaluation_scenario( **_extend_with_correct_answer(evaluation_type, datum), } - eval_scenario_instance = EvaluationScenarioDB( + eval_scenario_instance = HumanEvaluationScenarioDB( **evaluation_scenario_payload, user=user, organization=app.organization, @@ -786,6 +790,69 @@ async def fetch_custom_evaluation_names( return list_of_custom_eval_names +async def create_new_human_evaluation( + payload: NewHumanEvaluation, **user_org_data: dict +) -> EvaluationDB: + """ + Create a new evaluation based on the provided payload and additional arguments. + + Args: + payload (NewEvaluation): The evaluation payload. + **user_org_data (dict): Additional keyword arguments, e.g., user id. + + Returns: + EvaluationDB + """ + user = await get_user(user_uid=user_org_data["uid"]) + + # Initialize evaluation type settings + settings = payload.evaluation_type_settings + evaluation_type_settings = {} + + current_time = datetime.utcnow() + + # Fetch app + app = await db_manager.fetch_app_by_id(app_id=payload.app_id) + if app is None: + raise HTTPException( + status_code=404, + detail=f"App with id {payload.app_id} does not exist", + ) + + variants = [ObjectId(variant_id) for variant_id in payload.variant_ids] + + testset = await db_manager.fetch_testset_by_id(testset_id=payload.testset_id) + # Initialize and save evaluation instance to database + eval_instance = HumanEvaluationDB( + app=app, + organization=app.organization, # Assuming user has an organization_id attribute + user=user, + status=payload.status, + evaluation_type=payload.evaluation_type, + evaluation_type_settings=evaluation_type_settings, + variants=variants, + testset=testset, + created_at=current_time, + updated_at=current_time, + ) + newEvaluation = await engine.save(eval_instance) + + if newEvaluation is None: + raise HTTPException( + status_code=500, detail="Failed to create evaluation_scenario" + ) + + await prepare_csvdata_and_create_evaluation_scenario( + testset.csvdata, + payload.inputs, + payload.evaluation_type, + newEvaluation, + user, + app, + ) + return newEvaluation + + async def create_new_evaluation( app_data: dict, new_evaluation_data: dict, evaluators_configs: List[str] ) -> Evaluation: @@ -818,6 +885,9 @@ async def create_new_evaluation( return await converters.evaluation_db_to_pydantic(evaluation_db) + + + async def retrieve_evaluation_results( evaluation_id: str, **user_org_data: dict ) -> List[dict]: From e8e188a1b6aeab45a33c3e745415646321891d36 Mon Sep 17 00:00:00 2001 From: MohammedMaaz Date: Tue, 2 Jan 2024 16:06:08 +0500 Subject: [PATCH 208/414] old evaluations cleanup - p1 --- .../cypress/e2e/ab-testing-evaluation.cy.ts | 4 +- agenta-web/cypress/e2e/app-navigation.cy.ts | 6 +- .../AICritiqueEvaluationTable.tsx | 532 ----------------- .../CustomCodeRunEvaluationTable.tsx | 544 ------------------ .../EvaluationTableWithChat.tsx | 153 ----- .../ExactMatchEvaluationTable.tsx | 438 -------------- .../EvaluationTable/RegexEvaluationTable.tsx | 505 ---------------- .../SimilarityMatchEvaluationTable.tsx | 471 --------------- .../WebhookEvaluationTable.tsx | 460 --------------- .../Evaluations/AutomaticEvaluationResult.tsx | 323 ----------- .../Evaluations/CustomPythonCode.tsx | 231 -------- .../components/Evaluations/Evaluations.tsx | 254 +------- 
.../Evaluations/HumanEvaluationResult.tsx | 10 +- agenta-web/src/components/Sidebar/Sidebar.tsx | 26 +- .../evaluationResults/EvaluationResults.tsx | 4 +- .../evaluations/evaluationResults/mock.ts | 305 ---------- .../EvaluationScenarios.tsx | 2 +- agenta-web/src/lib/services/api.ts | 51 +- .../annotations/[annotation_id]/index.tsx | 9 - .../pages/apps/[app_id]/annotations/index.tsx | 11 +- .../apps/[app_id]/evaluations-new/index.tsx | 69 --- .../[evaluation_id]/auto_ai_critique.tsx | 69 --- .../[evaluation_id]/auto_exact_match.tsx | 69 --- .../[evaluation_id]/auto_regex_test.tsx | 69 --- .../[evaluation_id]/auto_similarity_match.tsx | 69 --- .../[evaluation_id]/auto_webhook_test.tsx | 69 --- .../[evaluation_id]/custom_code_run.tsx | 71 --- .../[evaluation_id]/human_a_b_testing.tsx | 71 --- .../[evaluation_id]/index.tsx | 0 .../[evaluation_id]/similarity_match.tsx | 69 --- .../[evaluation_id]/single_model_test.tsx | 72 --- .../compare/index.tsx | 0 .../evaluations/create_custom_evaluation.tsx | 50 -- .../evaluations/custom_evaluations/[id].tsx | 86 --- .../pages/apps/[app_id]/evaluations/index.tsx | 70 ++- 35 files changed, 137 insertions(+), 5105 deletions(-) delete mode 100644 agenta-web/src/components/EvaluationTable/AICritiqueEvaluationTable.tsx delete mode 100644 agenta-web/src/components/EvaluationTable/CustomCodeRunEvaluationTable.tsx delete mode 100644 agenta-web/src/components/EvaluationTable/EvaluationTableWithChat.tsx delete mode 100644 agenta-web/src/components/EvaluationTable/ExactMatchEvaluationTable.tsx delete mode 100644 agenta-web/src/components/EvaluationTable/RegexEvaluationTable.tsx delete mode 100644 agenta-web/src/components/EvaluationTable/SimilarityMatchEvaluationTable.tsx delete mode 100644 agenta-web/src/components/EvaluationTable/WebhookEvaluationTable.tsx delete mode 100644 agenta-web/src/components/Evaluations/AutomaticEvaluationResult.tsx delete mode 100644 agenta-web/src/components/Evaluations/CustomPythonCode.tsx delete mode 100644 agenta-web/src/components/pages/evaluations/evaluationResults/mock.ts delete mode 100644 agenta-web/src/pages/apps/[app_id]/annotations/[annotation_id]/index.tsx delete mode 100644 agenta-web/src/pages/apps/[app_id]/evaluations-new/index.tsx delete mode 100644 agenta-web/src/pages/apps/[app_id]/evaluations/[evaluation_id]/auto_ai_critique.tsx delete mode 100644 agenta-web/src/pages/apps/[app_id]/evaluations/[evaluation_id]/auto_exact_match.tsx delete mode 100644 agenta-web/src/pages/apps/[app_id]/evaluations/[evaluation_id]/auto_regex_test.tsx delete mode 100644 agenta-web/src/pages/apps/[app_id]/evaluations/[evaluation_id]/auto_similarity_match.tsx delete mode 100644 agenta-web/src/pages/apps/[app_id]/evaluations/[evaluation_id]/auto_webhook_test.tsx delete mode 100644 agenta-web/src/pages/apps/[app_id]/evaluations/[evaluation_id]/custom_code_run.tsx delete mode 100644 agenta-web/src/pages/apps/[app_id]/evaluations/[evaluation_id]/human_a_b_testing.tsx rename agenta-web/src/pages/apps/[app_id]/{evaluations-new => evaluations}/[evaluation_id]/index.tsx (100%) delete mode 100644 agenta-web/src/pages/apps/[app_id]/evaluations/[evaluation_id]/similarity_match.tsx delete mode 100644 agenta-web/src/pages/apps/[app_id]/evaluations/[evaluation_id]/single_model_test.tsx rename agenta-web/src/pages/apps/[app_id]/{evaluations-new => evaluations}/compare/index.tsx (100%) delete mode 100644 agenta-web/src/pages/apps/[app_id]/evaluations/create_custom_evaluation.tsx delete mode 100644 
agenta-web/src/pages/apps/[app_id]/evaluations/custom_evaluations/[id].tsx diff --git a/agenta-web/cypress/e2e/ab-testing-evaluation.cy.ts b/agenta-web/cypress/e2e/ab-testing-evaluation.cy.ts index 2e58da24ab..e0e24fb0ec 100644 --- a/agenta-web/cypress/e2e/ab-testing-evaluation.cy.ts +++ b/agenta-web/cypress/e2e/ab-testing-evaluation.cy.ts @@ -39,8 +39,8 @@ describe("A/B Testing Evaluation workflow", () => { context("When executing the evaluation", () => { it("Should successfully execute the evaluation process", () => { - cy.visit(`/apps/${app_id}/evaluations`) - cy.url().should("include", "/evaluations") + cy.visit(`/apps/${app_id}/annotations`) + cy.url().should("include", "/annotations") cy.clickLinkAndWait('[data-cy="abTesting-button"]') cy.get('[data-cy="variants-dropdown-0"]').trigger("mouseover") diff --git a/agenta-web/cypress/e2e/app-navigation.cy.ts b/agenta-web/cypress/e2e/app-navigation.cy.ts index 7b29ea0324..c6a3cdc04c 100644 --- a/agenta-web/cypress/e2e/app-navigation.cy.ts +++ b/agenta-web/cypress/e2e/app-navigation.cy.ts @@ -31,11 +31,7 @@ describe("App Navigation without errors", () => { it("should navigate successfully to Evaluations", () => { cy.clickLinkAndWait('[data-cy="app-evaluations-link"]') cy.location("pathname").should("include", "/evaluations") - cy.get('[data-cy="evaluations-container"]').within(() => { - cy.contains("1. Select an evaluation type") - cy.contains("2. Which variants would you like to evaluate") - cy.contains("3. Which testset you want to use?") - }) + //TOOD add more assertions specific to the new evaluations page }) if (isDemo()) { diff --git a/agenta-web/src/components/EvaluationTable/AICritiqueEvaluationTable.tsx b/agenta-web/src/components/EvaluationTable/AICritiqueEvaluationTable.tsx deleted file mode 100644 index 3034fb7787..0000000000 --- a/agenta-web/src/components/EvaluationTable/AICritiqueEvaluationTable.tsx +++ /dev/null @@ -1,532 +0,0 @@ -import {useState, useEffect} from "react" -import type {ColumnType} from "antd/es/table" -import {LineChartOutlined} from "@ant-design/icons" -import { - Button, - Card, - Col, - Input, - Row, - Space, - Spin, - Statistic, - Table, - Tag, - Typography, - message, -} from "antd" -import {Evaluation} from "@/lib/Types" -import { - updateEvaluationScenario, - callVariant, - fetchEvaluationResults, - updateEvaluation, - evaluateAICritiqueForEvalScenario, -} from "@/lib/services/api" -import {useVariants} from "@/lib/hooks/useVariant" -import {useRouter} from "next/router" -import {EvaluationFlow, EvaluationType} from "@/lib/enums" -import {batchExecute, getApikeys} from "@/lib/helpers/utils" -import {createUseStyles} from "react-jss" -import {exportAICritiqueEvaluationData} from "@/lib/helpers/evaluate" -import SecondaryButton from "../SecondaryButton/SecondaryButton" -import {useAppTheme} from "../Layout/ThemeContextProvider" -import {contentToChatMessageString, testsetRowToChatMessages} from "@/lib/helpers/testset" -import ParamsForm from "../Playground/ParamsForm/ParamsForm" - -const {Title} = Typography - -interface AICritiqueEvaluationTableProps { - evaluation: Evaluation - columnsCount: number - evaluationScenarios: AICritiqueEvaluationTableRow[] -} - -interface AICritiqueEvaluationTableRow { - id?: string - inputs: { - input_name: string - input_value: string - }[] - outputs: { - variant_id: string - variant_output: string - }[] - columnData0: string - correctAnswer: string - score: string - evaluationFlow: EvaluationFlow -} - -type StyleProps = { - themeMode: "dark" | "light" -} -/** - * - * 
@param evaluation - Evaluation object - * @param evaluationScenarios - Evaluation rows - * @param columnsCount - Number of variants to compare face to face (per default 2) - * @returns - */ - -const useStyles = createUseStyles({ - appVariant: { - backgroundColor: "rgb(201 255 216)", - color: "rgb(0 0 0)", - padding: 4, - borderRadius: 5, - }, - inputTestContainer: { - display: "flex", - justifyContent: "space-between", - }, - inputTest: { - backgroundColor: "rgb(201 255 216)", - color: "rgb(0 0 0)", - padding: 4, - borderRadius: 5, - }, - recordInput: { - marginBottom: 10, - }, - tag: { - fontSize: "14px", - }, - card: ({themeMode}: StyleProps) => ({ - marginTop: 16, - width: "100%", - border: "1px solid #ccc", - marginRight: "24px", - marginBottom: 30, - background: themeMode === "light" ? "rgb(246 253 245)" : "#000000", - "& .ant-card-head": { - minHeight: 44, - padding: "0px 12px", - }, - "& .ant-card-body": { - padding: "4px 16px", - border: "0px solid #ccc", - }, - }), - cardTextarea: { - height: 120, - padding: "0px 0px", - }, - row: {marginBottom: 20}, - evaluationResult: ({themeMode}: StyleProps) => ({ - padding: "30px 10px", - marginBottom: 20, - border: "1px solid #ccc", - background: themeMode === "light" ? "rgb(244 244 244)" : "#000000", - color: themeMode === "light" ? "#000" : "#fff", - borderRadius: 5, - }), - h3: { - marginTop: 0, - }, - resultDataRow: { - maxWidth: "100%", - overflowX: "auto", - whiteSpace: "nowrap", - }, - resultDataCol: { - display: "inline-block", - }, - resultDataCard: { - width: 200, - margin: "0 4px", - }, - stat: { - "& .ant-statistic-content-value": { - color: "#3f8600", - }, - }, -}) - -const AICritiqueEvaluationTable: React.FC = ({ - evaluation, - evaluationScenarios, - columnsCount, -}) => { - const {appTheme} = useAppTheme() - const classes = useStyles({themeMode: appTheme} as StyleProps) - const router = useRouter() - const appId = router.query.app_id as string - - const variants = evaluation.variants - - const variantData = useVariants(appId, variants) - - const [rows, setRows] = useState([]) - const [evaluationPromptTemplate, setEvaluationPromptTemplate] = useState( - evaluation.evaluationTypeSettings.evaluationPromptTemplate || - `We have an LLM App that we want to evaluate its outputs. -Based on the prompt and the parameters provided below evaluate the output based on the evaluation strategy below: - -Evaluation strategy: 0 to 10 0 is very bad and 10 is very good. - -Prompt: {llm_app_prompt_template} -Inputs: {inputs} -Correct Answer:{correct_answer} -Evaluate this: {app_variant_output} - -Answer ONLY with one of the given grading or evaluation options. 
-`, - ) - - const [shouldFetchResults, setShouldFetchResults] = useState(false) - const [evaluationStatus, setEvaluationStatus] = useState(evaluation.status) - const [evaluationResults, setEvaluationResults] = useState(null) - - useEffect(() => { - if ( - variantData && - variantData[0] && - variantData[0].inputParams && - variantData[0].inputParams.length > 0 - ) { - const llmAppInputs = variantData[0].inputParams - .map((param) => `${param.name}: {${param.name}}`) - .join(", ") - setEvaluationPromptTemplate(evaluationPromptTemplate.replace("{inputs}", llmAppInputs)) - } - }, [variantData]) - - useEffect(() => { - if (evaluationScenarios) { - setRows(evaluationScenarios) - } - }, [evaluationScenarios]) - - useEffect(() => { - if (evaluationStatus === EvaluationFlow.EVALUATION_FINISHED && shouldFetchResults) { - fetchEvaluationResults(evaluation.id) - .then((data) => setEvaluationResults(data)) - .catch((err) => console.error("Failed to fetch results:", err)) - .then(() => { - updateEvaluation(evaluation.id, { - status: EvaluationFlow.EVALUATION_FINISHED, - evaluation_type_settings: { - evaluation_prompt_template: evaluationPromptTemplate, - }, - }) - }) - .catch((err) => console.error("Failed to fetch results:", err)) - } - }, [evaluationStatus, evaluation.id]) - - const handleInputChange = (value: any, name: string, rowIndex: any) => { - const newRows = [...rows] - const ip = newRows[rowIndex].inputs.find((ip) => ip.input_name === name) - if (ip) ip.input_value = value - setRows(newRows) - } - - const runAllEvaluations = async () => { - try { - setEvaluationStatus(EvaluationFlow.EVALUATION_STARTED) - await batchExecute(rows.map((_, rowIndex) => () => runEvaluation(rowIndex))) - setEvaluationStatus(EvaluationFlow.EVALUATION_FINISHED) - console.log("All evaluations finished.") - } catch (err) { - console.error("An error occurred:", err) - setEvaluationStatus(EvaluationFlow.EVALUATION_FAILED) - } - } - - const runEvaluation = async (rowIndex: number) => { - const inputParamsDict = rows[rowIndex].inputs.reduce((acc: {[key: string]: any}, item) => { - acc[item.input_name] = item.input_value - return acc - }, {}) - - const columnsDataNames = ["columnData0"] - let idx = 0 - for (const columnName of columnsDataNames) { - setRowValue(rowIndex, "evaluationFlow", EvaluationFlow.COMPARISON_RUN_STARTED) - - let result = await callVariant( - inputParamsDict, - variantData[idx].inputParams!, - variantData[idx].optParams!, - appId || "", - variants[idx].baseId || "", - variantData[idx].isChatVariant - ? 
testsetRowToChatMessages(evaluation.testset.csvdata[rowIndex], false) - : [], - ) - if (variantData[idx].isChatVariant) result = contentToChatMessageString(result) - - setRowValue(rowIndex, columnName as any, result) - await evaluate(rowIndex) - setShouldFetchResults(true) - if (rowIndex === rows.length - 1) { - message.success("Evaluation Results Saved") - } - idx++ - } - } - - const evaluate = async (rowNumber: number) => { - const evaluation_scenario_id = rows[rowNumber].id - const outputVariantX = rows[rowNumber].columnData0 - - if (evaluation_scenario_id) { - const data = { - outputs: [{variant_id: variants[0].variantId, variant_output: outputVariantX}], - } - - const aiCritiqueScoreResponse = await evaluateAICritiqueForEvalScenario({ - correct_answer: rows[rowNumber].correctAnswer, - llm_app_prompt_template: evaluation.llmAppPromptTemplate, - inputs: rows[rowNumber].inputs, - outputs: data.outputs, - evaluation_prompt_template: evaluationPromptTemplate, - open_ai_key: getApikeys(), - }) - - try { - const responseData = await updateEvaluationScenario( - evaluation.id, - evaluation_scenario_id, - {...data, score: aiCritiqueScoreResponse.data}, - evaluation.evaluationType as EvaluationType, - ) - setRowValue(rowNumber, "evaluationFlow", EvaluationFlow.EVALUATION_FINISHED) - setRowValue(rowNumber, "score", aiCritiqueScoreResponse.data) - } catch (err) { - console.error(err) - } - } - } - - const setRowValue = ( - rowIndex: number, - columnKey: keyof AICritiqueEvaluationTableRow, - value: any, - ) => { - const newRows = [...rows] - newRows[rowIndex][columnKey] = value as never - setRows(newRows) - } - - const dynamicColumns: ColumnType[] = Array.from( - {length: columnsCount}, - (_, i) => { - const columnKey = `columnData${i}` - - return { - title: ( -
- App Variant: - - {variants ? variants[i].variantName : ""} - -
- ), - dataIndex: columnKey, - key: columnKey, - width: "30%", - render: (text: any, record: AICritiqueEvaluationTableRow, rowIndex: number) => { - if ( - record.evaluationFlow === EvaluationFlow.COMPARISON_RUN_STARTED && - evaluationStatus === EvaluationFlow.EVALUATION_STARTED - ) { - return ( -
- -
- ) - } - if ( - record.evaluationFlow === EvaluationFlow.COMPARISON_RUN_STARTED && - evaluationStatus === EvaluationFlow.EVALUATION_FAILED - ) { - return - } - if (record.outputs && record.outputs.length > 0) { - const outputValue = record.outputs.find( - (output: any) => output.variant_id === variants[i].variantId, - )?.variant_output - return
{outputValue}
- } - return text - }, - } - }, - ) - - const columns = [ - { - key: "1", - width: "30%", - title: ( -
-
- Inputs (Test set: - {evaluation.testset.name} - ) -
-
- ), - dataIndex: "inputs", - render: (text: any, record: AICritiqueEvaluationTableRow, rowIndex: number) => ( -
- {evaluation.testset.testsetChatColumn ? ( - evaluation.testset.csvdata[rowIndex][ - evaluation.testset.testsetChatColumn - ] || " - " - ) : ( - - handleInputChange(value, name, rowIndex) - } - inputParams={ - variantData[0].inputParams?.map((item) => ({ - ...item, - value: record.inputs.find((ip) => ip.input_name === item.name) - ?.input_value, - })) || [] - } - /> - )} -
- ), - }, - ...dynamicColumns, - { - title: "Correct Answer", - dataIndex: "correctAnswer", - key: "correctAnswer", - width: "30%", - - render: (text: any, record: any, rowIndex: number) =>
{record.correctAnswer}
, - }, - { - title: "Evaluation", - dataIndex: "evaluation", - key: "score", - width: 200, - align: "center" as "left" | "right" | "center", - render: (score: string, record: any) => { - if ( - record.evaluationFlow === EvaluationFlow.COMPARISON_RUN_STARTED && - evaluationStatus === EvaluationFlow.EVALUATION_STARTED - ) { - return - } - if ( - record.evaluationFlow === EvaluationFlow.COMPARISON_RUN_STARTED && - evaluationStatus === EvaluationFlow.EVALUATION_FAILED - ) { - return - } - let tagColor = "" - - return ( - - -
- {score !== "" && ( - - {record.score} - - )} -
-
-
- ) - }, - }, - ] - - const onChangeEvaluationPromptTemplate = (e: any) => { - setEvaluationPromptTemplate(e.target.value) - } - - return ( -
- AI Critique Evaluation -
-
- - - -
- - - - - exportAICritiqueEvaluationData(evaluation, rows)} - disabled={evaluationStatus !== EvaluationFlow.EVALUATION_FINISHED} - > - Export results - - - - -
-
-
- {evaluationStatus === EvaluationFlow.EVALUATION_FAILED && ( -
Failed to run evaluation
- )} - - {evaluationStatus === EvaluationFlow.EVALUATION_INITIALIZED && ( -
Run evaluation to see results!
- )} - - {evaluationStatus === EvaluationFlow.EVALUATION_STARTED && } - - {evaluationStatus === EvaluationFlow.EVALUATION_FINISHED && - evaluationResults && - evaluationResults.results_data && ( -
-

Results Data:

- - {Object.entries(evaluationResults.results_data).map( - ([key, value], index) => ( - - - - - - ), - )} - -
- )} -
-
-
- - - - ) -} - -export default AICritiqueEvaluationTable diff --git a/agenta-web/src/components/EvaluationTable/CustomCodeRunEvaluationTable.tsx b/agenta-web/src/components/EvaluationTable/CustomCodeRunEvaluationTable.tsx deleted file mode 100644 index 711321cf82..0000000000 --- a/agenta-web/src/components/EvaluationTable/CustomCodeRunEvaluationTable.tsx +++ /dev/null @@ -1,544 +0,0 @@ -import {useState, useEffect} from "react" -import type {ColumnType} from "antd/es/table" -import {CodeOutlined, LineChartOutlined} from "@ant-design/icons" -import { - Button, - Card, - Col, - Input, - Modal, - Row, - Space, - Spin, - Statistic, - Table, - Typography, - message, -} from "antd" -import {CustomEvaluation, Evaluation} from "@/lib/Types" -import { - updateEvaluationScenario, - callVariant, - fetchEvaluationResults, - updateEvaluation, - executeCustomEvaluationCode, - loadTestset, - updateEvaluationScenarioScore, - fetchEvaluationScenarioResults, - fetchCustomEvaluationDetail, -} from "@/lib/services/api" -import {useVariants} from "@/lib/hooks/useVariant" -import {useRouter} from "next/router" -import {EvaluationFlow, EvaluationType} from "@/lib/enums" -import {batchExecute, getApikeys} from "@/lib/helpers/utils" -import {createUseStyles} from "react-jss" -import SecondaryButton from "../SecondaryButton/SecondaryButton" -import {exportCustomCodeEvaluationData} from "@/lib/helpers/evaluate" -import CodeBlock from "../DynamicCodeBlock/CodeBlock" -import {contentToChatMessageString, testsetRowToChatMessages} from "@/lib/helpers/testset" -import ParamsForm from "../Playground/ParamsForm/ParamsForm" - -const {Title} = Typography - -interface CustomCodeEvaluationTableProps { - evaluation: Evaluation - columnsCount: number - customEvaluationId: string - evaluationScenarios: CustomCodeEvaluationTableRow[] -} - -interface CustomCodeEvaluationTableRow { - id?: string - inputs: { - input_name: string - input_value: string - }[] - outputs: { - variant_id: string - variant_output: string - }[] - columnData0: string - correctAnswer: string - evaluation: string - codeResult: string - evaluationFlow: EvaluationFlow -} - -interface IVariantInputs { - input_name: string - input_value: string -} - -interface IScenarioScore { - scenario_id: string - score: string -} - -/** - * - * @param evaluation - Evaluation object - * @param evaluationScenarios - Evaluation rows - * @param columnsCount - Number of variants to compare face to face (per default 2) - * @returns - */ - -const useStyles = createUseStyles({ - appVariant: { - backgroundColor: "rgb(201 255 216)", - color: "rgb(0 0 0)", - padding: 4, - borderRadius: 5, - }, - inputTestContainer: { - display: "flex", - justifyContent: "space-between", - }, - inputTest: { - backgroundColor: "rgb(201 255 216)", - color: "rgb(0 0 0)", - padding: 4, - borderRadius: 5, - }, - recordInput: { - marginBottom: 10, - }, - tag: { - fontSize: "14px", - }, - card: { - marginBottom: 20, - }, - codeButton: { - marginBottom: 20, - }, - cardTextarea: { - height: 120, - padding: "0px 0px", - }, - row: {marginBottom: 20}, - evaluationResult: { - padding: "30px 10px", - marginBottom: 20, - backgroundColor: "rgb(244 244 244)", - border: "1px solid #ccc", - borderRadius: 5, - }, - h3: { - marginTop: 0, - }, - resultDataRow: { - maxWidth: "100%", - overflowX: "auto", - whiteSpace: "nowrap", - }, - resultDataCol: { - display: "inline-block", - }, - resultDataCard: { - width: 200, - margin: "0 4px", - }, - stat: { - "& .ant-statistic-content-value": { - color: "#3f8600", - }, - }, - 
codeBlockContainer: { - marginTop: 24, - }, -}) - -const CustomCodeRunEvaluationTable: React.FC = ({ - evaluation, - evaluationScenarios, - columnsCount, - customEvaluationId, -}) => { - const classes = useStyles() - const router = useRouter() - const appId = router.query.app_id as string - - const variants = evaluation.variants - - const variantData = useVariants(appId, variants) - - const [rows, setRows] = useState([]) - - const [shouldFetchResults, setShouldFetchResults] = useState(false) - const [evaluationStatus, setEvaluationStatus] = useState(evaluation.status) - const [evaluationResults, setEvaluationResults] = useState(null) - const [evaluationTestsets, setEvaluationTestsets] = useState([]) - const [listScenariosResult, setListScenariosResult] = useState([]) - const [customEvaluation, setCustomEvaluation] = useState() - const [modalOpen, setModalOpen] = useState(false) - - useEffect(() => { - if (customEvaluationId && customEvaluation?.id !== customEvaluationId) { - fetchCustomEvaluationDetail(customEvaluationId) - .then(setCustomEvaluation) - .catch(console.error) - } - }, [customEvaluationId]) - - useEffect(() => { - if (evaluationScenarios) { - setRows(evaluationScenarios) - Promise.all(evaluationScenarios.map((item) => retrieveScenarioScore(item.id!))) - } - }, [evaluationScenarios]) - - useEffect(() => { - if (evaluationStatus === EvaluationFlow.EVALUATION_FINISHED) { - fetchEvaluationResults(evaluation.id) - .then((data) => setEvaluationResults(data)) - .catch((err) => console.error("Failed to fetch results:", err)) - .then(() => { - updateEvaluation(evaluation.id, {status: EvaluationFlow.EVALUATION_FINISHED}) - }) - .catch((err) => console.error("Failed to fetch results:", err)) - } - }, [evaluationStatus, evaluation.id]) - - useEffect(() => { - const getTests = async () => { - const data = await loadTestset(evaluation.testset._id) - if (data.csvdata.length > 0) { - setEvaluationTestsets(data.csvdata) - } - } - - getTests() - }, [evaluation]) - - const handleInputChange = (value: any, name: string, rowIndex: any) => { - const newRows = [...rows] - const ip = newRows[rowIndex].inputs.find((ip) => ip.input_name === name) - if (ip) ip.input_value = value - setRows(newRows) - } - - const runAllEvaluations = async () => { - try { - setEvaluationStatus(EvaluationFlow.EVALUATION_STARTED) - await batchExecute(rows.map((_, rowIndex) => () => runEvaluation(rowIndex))) - setEvaluationStatus(EvaluationFlow.EVALUATION_FINISHED) - console.log("All evaluations finished.") - } catch (err) { - console.error("An error occurred:", err) - } - } - - const runEvaluation = async (rowIndex: number) => { - const inputParamsDict = rows[rowIndex].inputs.reduce((acc: {[key: string]: any}, item) => { - acc[item.input_name] = item.input_value - return acc - }, {}) - - const columnsDataNames = ["columnData0"] - let idx = 0 - for (const columnName of columnsDataNames) { - setRowValue(rowIndex, "evaluationFlow", EvaluationFlow.COMPARISON_RUN_STARTED) - - let result = await callVariant( - inputParamsDict, - variantData[idx].inputParams!, - variantData[idx].optParams!, - appId || "", - variants[idx].baseId || "", - variantData[idx].isChatVariant - ? 
testsetRowToChatMessages(evaluation.testset.csvdata[rowIndex], false) - : [], - ) - if (variantData[idx].isChatVariant) result = contentToChatMessageString(result) - - setRowValue(rowIndex, columnName as any, result) - await evaluate(rowIndex) - setShouldFetchResults(true) - if (rowIndex === rows.length - 1) { - message.success("Evaluation Results Saved") - } - idx++ - } - } - - const correctAnswer = (variantInputs: Array) => { - const {input_name, input_value} = variantInputs[0] - const filteredData: any = evaluationTestsets.filter( - (item) => item[input_name] === input_value, - )[0] - return filteredData?.correct_answer - } - - const calcScenarioScore = (ix: number) => { - const item = rows[ix] - - let score = +item.codeResult - if (!item.codeResult && item.outputs.length && listScenariosResult.length) { - score = +(listScenariosResult.find((res) => res.scenario_id === item.id)?.score || 0) - } - if (isNaN(score)) score = 0 - - return score.toFixed(2) - } - - const retrieveScenarioScore = async (scenario_id: string) => { - const response: any = await fetchEvaluationScenarioResults(scenario_id) - setListScenariosResult((prev) => [...prev, response.data as IScenarioScore]) - } - - const evaluate = async (rowNumber: number) => { - const evaluation_scenario_id = rows[rowNumber].id - const outputVariantX = rows[rowNumber].columnData0 - - if (evaluation_scenario_id) { - const data = { - outputs: [{variant_id: variants[0].variantId, variant_output: outputVariantX}], - inputs: rows[rowNumber].inputs, - correct_answer: correctAnswer(rows[rowNumber].inputs), - open_ai_key: getApikeys(), - } - - try { - // Update evaluation scenario - const responseData = await updateEvaluationScenario( - evaluation.id, - evaluation_scenario_id, - data, - evaluation.evaluationType as EvaluationType, - ) - - // Call custom code evaluation - const result = await callCustomCodeHandler( - variants[0].variantId, - data.inputs, - data.outputs, - ) - if (result) { - // Update the evaluation scenario with the score - await updateEvaluationScenarioScore(evaluation_scenario_id, result) - } - setRowValue(rowNumber, "codeResult", result) - - setRowValue(rowNumber, "evaluationFlow", EvaluationFlow.EVALUATION_FINISHED) - setRowValue(rowNumber, "evaluation", responseData.evaluation) - } catch (err) { - console.log(err) - } - } - } - - const callCustomCodeHandler = async ( - variantId: string, - inputs: Array, - outputs: Array, - ) => { - const expectedTarget = correctAnswer(inputs) - const data = { - evaluation_id: customEvaluationId, - inputs, - outputs, - correct_answer: expectedTarget, - variant_id: variantId, - app_id: appId, - } - const response = await executeCustomEvaluationCode(data) - if (response.status === 200) { - return response.data - } - } - - const setRowValue = ( - rowIndex: number, - columnKey: keyof CustomCodeEvaluationTableRow, - value: any, - ) => { - const newRows = [...rows] - newRows[rowIndex][columnKey] = value as never - setRows(newRows) - } - - const dynamicColumns: ColumnType[] = Array.from( - {length: columnsCount}, - (_, i) => { - const columnKey = `columnData${i}` - - return { - title: ( -
- App Variant: - - {variants ? variants[i].variantName : ""} - -
- ), - dataIndex: columnKey, - key: columnKey, - width: "30%", - render: (text: any, record: CustomCodeEvaluationTableRow, rowIndex: number) => { - if (record.evaluationFlow === EvaluationFlow.COMPARISON_RUN_STARTED) { - return ( -
- -
- ) - } - if (record.outputs && record.outputs.length > 0) { - const outputValue = record.outputs.find( - (output: any) => output.variant_id === variants[i].variantId, - )?.variant_output - return
{outputValue}
- } - return text - }, - } - }, - ) - - const columns = [ - { - key: "1", - width: "30%", - title: ( -
-
- Inputs (Test set: - {evaluation.testset.name} - ) -
-
- ), - dataIndex: "inputs", - render: (text: any, record: CustomCodeEvaluationTableRow, rowIndex: number) => ( -
- {evaluation.testset.testsetChatColumn ? ( - evaluation.testset.csvdata[rowIndex][ - evaluation.testset.testsetChatColumn - ] || " - " - ) : ( - - handleInputChange(value, name, rowIndex) - } - inputParams={ - variantData[0].inputParams?.map((item) => ({ - ...item, - value: record.inputs.find((ip) => ip.input_name === item.name) - ?.input_value, - })) || [] - } - /> - )} -
- ), - }, - ...dynamicColumns, - { - title: "Correct Answer", - dataIndex: "correctAnswer", - key: "correctAnswer", - width: "30%", - - render: (text: any, record: any, rowIndex: number) => { - return
{correctAnswer(record.inputs)}
- }, - }, - { - title: "Result", - dataIndex: "codeResult", - key: "code_result", - width: 200, - align: "center" as "left" | "right" | "center", - render: (_: number, record: any, ix: number) => { - return ( - - {calcScenarioScore(ix)} - - ) - }, - }, - ] - - return ( -
- Custom Code Evaluation -
- -
- - - - exportCustomCodeEvaluationData( - evaluation, - rows.map((item, ix) => ({ - ...item, - score: calcScenarioScore(ix), - })), - ) - } - disabled={evaluationStatus !== EvaluationFlow.EVALUATION_FINISHED} - > - Export results - - - - - - - - - - - - - - - {customEvaluation?.python_code && ( - - )} - -
-
- - - setModalOpen(false)} - width={700} - > -
- -
-
- - ) -} - -export default CustomCodeRunEvaluationTable diff --git a/agenta-web/src/components/EvaluationTable/EvaluationTableWithChat.tsx b/agenta-web/src/components/EvaluationTable/EvaluationTableWithChat.tsx deleted file mode 100644 index 93f82467a6..0000000000 --- a/agenta-web/src/components/EvaluationTable/EvaluationTableWithChat.tsx +++ /dev/null @@ -1,153 +0,0 @@ -import {useState, useEffect, useRef} from "react" -import {Button, Dropdown, Input, Menu, Space, Table, Typography} from "antd" -import {AppVariant} from "@/lib/Types" -import type {ColumnType} from "antd/es/table" -import {DislikeOutlined, DownOutlined, LikeOutlined} from "@ant-design/icons" -import {createUseStyles} from "react-jss" - -interface EvaluationTableWithChatProps { - columnsCount: number - appVariants: AppVariant[] -} - -interface TableDataType { - key: React.Key - [key: string]: any -} - -const useStyles = createUseStyles({ - table: { - display: "flex", - justifyContent: "center", - marginBottom: 15, - }, - title: { - display: "flex", - justifyContent: "space-between", - }, -}) - -const EvaluationTableWithChat: React.FC = ({ - columnsCount, - appVariants, -}) => { - const classes = useStyles() - const [dataSource, setDataSource] = useState([]) - const [selectedItems, setSelectedItems] = useState( - Array(columnsCount).fill("Select a variant"), - ) - const [isSelected, setIsSelected] = useState(Array(columnsCount).fill(false)) - const [inputData, setInputData] = useState("") - const inputRef = useRef(null) - - const {Text} = Typography - - const handleMenuClick = - (columnIndex: number) => - ({key}: {key: string}) => { - setSelectedItems((prevState) => { - const newState = [...prevState] - newState[columnIndex] = key - return newState - }) - - setIsSelected((prevState) => { - const newState = [...prevState] - newState[columnIndex] = true - return newState - }) - const a = {modelOne: selectedItems[0], modelTwo: selectedItems[1]} - } - - const handleKeyPress = (e: React.KeyboardEvent) => { - if (e.key === "Enter" && inputData) { - setDataSource([ - ...dataSource, - { - key: `${dataSource.length}`, - ...dynamicColumns.reduce( - (acc, column) => ({...acc, [column.key as string]: inputData}), - {}, - ), - }, - ]) - - setInputData("") - } - } - - const handleInputChange = (e: React.ChangeEvent) => { - setInputData(e.target.value) - } - - const dynamicColumns: ColumnType[] = Array.from( - {length: columnsCount}, - (_, i) => { - const columnKey = `column${i}` - const menu = ( - - {appVariants.map((appVariant, index) => ( - {appVariant.name} - ))} - - ) - - return { - title: ( -
- App Variant: - 0 ? "button-animation" : "" - } - > - - -
- ), - dataIndex: columnKey, - key: columnKey, - width: "50%", - } - }, - ) - - const columns = [...dynamicColumns] - - return ( -
-
( -
-
- - - - - -
-
- -
-
- )} - rowClassName={() => "editable-row"} - /> - - ) -} - -export default EvaluationTableWithChat diff --git a/agenta-web/src/components/EvaluationTable/ExactMatchEvaluationTable.tsx b/agenta-web/src/components/EvaluationTable/ExactMatchEvaluationTable.tsx deleted file mode 100644 index 7f80ba22b4..0000000000 --- a/agenta-web/src/components/EvaluationTable/ExactMatchEvaluationTable.tsx +++ /dev/null @@ -1,438 +0,0 @@ -import {useState, useEffect} from "react" -import type {ColumnType} from "antd/es/table" -import {LineChartOutlined} from "@ant-design/icons" -import { - Button, - Card, - Col, - Input, - Row, - Space, - Spin, - Statistic, - Table, - Tag, - Typography, - message, -} from "antd" -import { - updateEvaluationScenario, - callVariant, - fetchEvaluationResults, - updateEvaluation, -} from "@/lib/services/api" -import {useVariants} from "@/lib/hooks/useVariant" -import {useRouter} from "next/router" -import {EvaluationFlow} from "@/lib/enums" -import {evaluateWithExactMatch} from "@/lib/services/evaluations" -import {createUseStyles} from "react-jss" -import {exportExactEvaluationData} from "@/lib/helpers/evaluate" -import SecondaryButton from "../SecondaryButton/SecondaryButton" -import {contentToChatMessageString, testsetRowToChatMessages} from "@/lib/helpers/testset" -import {Evaluation} from "@/lib/Types" -import ParamsForm from "../Playground/ParamsForm/ParamsForm" -import {batchExecute} from "@/lib/helpers/utils" - -const {Title} = Typography - -interface ExactMatchEvaluationTableProps { - evaluation: Evaluation - columnsCount: number - evaluationScenarios: ExactMatchEvaluationTableRow[] -} - -interface ExactMatchEvaluationTableRow { - id?: string - inputs: { - input_name: string - input_value: string - }[] - outputs: { - variant_id: string - variant_output: string - }[] - columnData0: string - correctAnswer: string - score: string - evaluationFlow: EvaluationFlow -} -/** - * - * @param evaluation - Evaluation object - * @param evaluationScenarios - Evaluation rows - * @param columnsCount - Number of variants to compare face to face (per default 2) - * @returns - */ - -const useStyles = createUseStyles({ - appVariant: { - backgroundColor: "rgb(201 255 216)", - color: "rgb(0 0 0)", - padding: 4, - borderRadius: 5, - }, - inputTestContainer: { - display: "flex", - justifyContent: "space-between", - }, - inputTest: { - backgroundColor: "rgb(201 255 216)", - color: "rgb(0 0 0)", - padding: 4, - borderRadius: 5, - }, - recordInput: { - marginBottom: 10, - }, - tag: { - fontSize: "14px", - }, - card: { - marginBottom: 20, - }, - statCorrect: { - "& .ant-statistic-content-value": { - color: "#3f8600", - }, - }, - statWrong: { - "& .ant-statistic-content-value": { - color: "#cf1322", - }, - }, -}) - -const ExactMatchEvaluationTable: React.FC = ({ - evaluation, - evaluationScenarios, - columnsCount, -}) => { - const classes = useStyles() - const router = useRouter() - const appId = router.query.app_id as string - const variants = evaluation.variants - - const variantData = useVariants(appId, variants) - - const [rows, setRows] = useState([]) - const [wrongAnswers, setWrongAnswers] = useState(0) - const [correctAnswers, setCorrectAnswers] = useState(0) - const [accuracy, setAccuracy] = useState(0) - const [evaluationStatus, setEvaluationStatus] = useState(evaluation.status) - - useEffect(() => { - if (evaluationScenarios) { - setRows(evaluationScenarios) - } - }, [evaluationScenarios]) - - useEffect(() => { - if (evaluationStatus === EvaluationFlow.EVALUATION_FINISHED) { - 
fetchEvaluationResults(evaluation.id) - .then(() => { - updateEvaluation(evaluation.id, {status: EvaluationFlow.EVALUATION_FINISHED}) - }) - .catch((err) => console.error("Failed to fetch results:", err)) - } - }, [evaluationStatus, evaluation.id]) - - useEffect(() => { - if (correctAnswers + wrongAnswers > 0) { - setAccuracy((correctAnswers / (correctAnswers + wrongAnswers)) * 100) - } else { - setAccuracy(0) - } - }, [correctAnswers, wrongAnswers]) - - useEffect(() => { - const correct = rows.filter((row) => row.score === "correct").length - const wrong = rows.filter((row) => row.score === "wrong").length - const accuracy = correct + wrong > 0 ? (correct / (correct + wrong)) * 100 : 0 - - setCorrectAnswers(correct) - setWrongAnswers(wrong) - setAccuracy(accuracy) - }, [rows]) - - const handleInputChange = (value: any, name: string, rowIndex: any) => { - const newRows = [...rows] - const ip = newRows[rowIndex].inputs.find((ip) => ip.input_name === name) - if (ip) ip.input_value = value - setRows(newRows) - } - - const runAllEvaluations = async () => { - setEvaluationStatus(EvaluationFlow.EVALUATION_STARTED) - - batchExecute(rows.map((_, rowIndex) => () => runEvaluation(rowIndex))) - .then(() => { - console.log("All functions finished.") - setEvaluationStatus(EvaluationFlow.EVALUATION_FINISHED) - }) - .catch((err) => console.error("An error occurred:", err)) - } - - const runEvaluation = async (rowIndex: number) => { - const inputParamsDict = rows[rowIndex].inputs.reduce((acc: {[key: string]: any}, item) => { - acc[item.input_name] = item.input_value - return acc - }, {}) - - const columnsDataNames = ["columnData0"] - columnsDataNames.forEach(async (columnName: any, idx: number) => { - setRowValue(rowIndex, columnName, "loading...") - try { - let result = await callVariant( - inputParamsDict, - variantData[idx].inputParams!, - variantData[idx].optParams!, - appId || "", - variants[idx].baseId || "", - variantData[idx].isChatVariant - ? testsetRowToChatMessages(evaluation.testset.csvdata[rowIndex], false) - : [], - ) - if (variantData[idx].isChatVariant) result = contentToChatMessageString(result) - - setRowValue(rowIndex, columnName, result) - setRowValue(rowIndex, "evaluationFlow", EvaluationFlow.COMPARISON_RUN_STARTED) - evaluate(rowIndex) - if (rowIndex === rows.length - 1) { - message.success("Evaluation Results Saved") - } - } catch (err) { - console.log("Error running evaluation:", err) - setRowValue(rowIndex, columnName, "") - } - }) - } - - /** - * - * @param rowNumber - * - * This method will: - * 1. perform an exact match evaluation for the given row number - * 2. update the evaluation row with the result - * 3. update the score column in the table - */ - const evaluate = (rowNumber: number) => { - const isCorrect = evaluateWithExactMatch( - rows[rowNumber].columnData0, - rows[rowNumber].correctAnswer, - ) - - const evaluation_scenario_id = rows[rowNumber].id - // TODO: we need to improve this and make it dynamic - const outputVariantX = rows[rowNumber].columnData0 - - if (evaluation_scenario_id) { - const data = { - score: isCorrect ? 
"correct" : "wrong", - outputs: [{variant_id: variants[0].variantId, variant_output: outputVariantX}], - } - - updateEvaluationScenario( - evaluation.id, - evaluation_scenario_id, - data, - evaluation.evaluationType, - ) - .then(() => { - setRowValue(rowNumber, "score", data.score) - if (isCorrect) { - setCorrectAnswers((prevCorrect) => prevCorrect + 1) - } else { - setWrongAnswers((prevWrong) => prevWrong + 1) - } - }) - .catch((err) => { - console.error(err) - }) - } - } - - const setRowValue = ( - rowIndex: number, - columnKey: keyof ExactMatchEvaluationTableRow, - value: any, - ) => { - const newRows = [...rows] - newRows[rowIndex][columnKey] = value as never - setRows(newRows) - } - - const dynamicColumns: ColumnType[] = Array.from( - {length: columnsCount}, - (_, i) => { - const columnKey = `columnData${i}` - - return { - title: ( -
- App Variant: - - {variants ? variants[i].variantName : ""} - -
- ), - dataIndex: columnKey, - key: columnKey, - width: "25%", - render: (text: any, record: ExactMatchEvaluationTableRow, rowIndex: number) => { - if (record.outputs && record.outputs.length > 0) { - const outputValue = record.outputs.find( - (output: any) => output.variant_id === variants[i].variantId, - )?.variant_output - return
{outputValue}
- } - return text - }, - } - }, - ) - - const columns = [ - { - key: "1", - width: "30%", - title: ( -
-
- Inputs (Test set: - {evaluation.testset.name} - ) -
-
- ), - dataIndex: "inputs", - render: (text: any, record: ExactMatchEvaluationTableRow, rowIndex: number) => ( -
- {evaluation.testset.testsetChatColumn ? ( - evaluation.testset.csvdata[rowIndex][ - evaluation.testset.testsetChatColumn - ] || " - " - ) : ( - - handleInputChange(value, name, rowIndex) - } - inputParams={ - variantData[0].inputParams?.map((item) => ({ - ...item, - value: record.inputs.find((ip) => ip.input_name === item.name) - ?.input_value, - })) || [] - } - /> - )} -
- ), - }, - ...dynamicColumns, - { - title: "Correct Answer", - dataIndex: "correctAnswer", - key: "correctAnswer", - width: "25%", - - render: (text: any, record: any, rowIndex: number) =>
{record.correctAnswer}
, - }, - { - title: "Evaluation", - dataIndex: "evaluation", - key: "evaluation", - width: 200, - align: "center" as "left" | "right" | "center", - render: (text: any, record: any, rowIndex: number) => { - let tagColor = "" - if (record.score === "correct") { - tagColor = "green" - } else if (record.score === "wrong") { - tagColor = "red" - } - return ( - - -
- {rows[rowIndex].score !== "" && ( - - {record.score} - - )} -
-
-
- ) - }, - }, - ] - - return ( -
- Exact match Evaluation -
- -
- - - exportExactEvaluationData(evaluation, rows)} - disabled={evaluationStatus !== EvaluationFlow.EVALUATION_FINISHED} - > - Export results - - - - - - - - - - - - - - - - - - - - - -
-
- - - ) -} - -export default ExactMatchEvaluationTable diff --git a/agenta-web/src/components/EvaluationTable/RegexEvaluationTable.tsx b/agenta-web/src/components/EvaluationTable/RegexEvaluationTable.tsx deleted file mode 100644 index baa31e5dd5..0000000000 --- a/agenta-web/src/components/EvaluationTable/RegexEvaluationTable.tsx +++ /dev/null @@ -1,505 +0,0 @@ -import {useState, useEffect, useRef} from "react" -import type {ColumnType} from "antd/es/table" -import {InfoCircleOutlined, LineChartOutlined} from "@ant-design/icons" -import { - Button, - Card, - Col, - Form, - Input, - Radio, - Row, - Space, - Spin, - Statistic, - Table, - Tag, - Tooltip, - message, - Typography, -} from "antd" -import {updateEvaluationScenario, callVariant, updateEvaluation} from "@/lib/services/api" -import {useVariants} from "@/lib/hooks/useVariant" -import {useRouter} from "next/router" -import {EvaluationFlow} from "@/lib/enums" -import {evaluateWithRegex} from "@/lib/services/evaluations" -import {createUseStyles} from "react-jss" -import Highlighter from "react-highlight-words" -import {globalErrorHandler} from "@/lib/helpers/errorHandler" -import SecondaryButton from "../SecondaryButton/SecondaryButton" -import {exportRegexEvaluationData} from "@/lib/helpers/evaluate" -import {isValidRegex} from "@/lib/helpers/validators" -import {contentToChatMessageString, testsetRowToChatMessages} from "@/lib/helpers/testset" -import ParamsForm from "../Playground/ParamsForm/ParamsForm" -import {batchExecute} from "@/lib/helpers/utils" - -const {Title} = Typography - -interface RegexEvaluationTableProps { - evaluation: any - columnsCount: number - evaluationScenarios: RegexEvaluationTableRow[] -} - -interface RegexEvaluationTableRow { - id?: string - inputs: { - input_name: string - input_value: string - }[] - outputs: { - variant_id: string - variant_output: string - }[] - columnData0: string - correctAnswer: string - score: string - isMatch: boolean - evaluationFlow: EvaluationFlow -} -/** - * - * @param evaluation - Evaluation object - * @param evaluationScenarios - Evaluation rows - * @param columnsCount - Number of variants to compare face to face (per default 2) - * @returns - */ - -const useStyles = createUseStyles({ - appVariant: { - backgroundColor: "rgb(201 255 216)", - color: "rgb(0 0 0)", - padding: 4, - borderRadius: 5, - }, - inputTestContainer: { - display: "flex", - justifyContent: "space-between", - }, - inputTest: { - backgroundColor: "rgb(201 255 216)", - color: "rgb(0 0 0)", - padding: 4, - borderRadius: 5, - }, - recordInput: { - marginBottom: 10, - }, - tag: { - fontSize: "14px", - }, - card: { - marginBottom: 20, - }, - statCorrect: { - "& .ant-statistic-content-value": { - color: "#3f8600", - }, - }, - statWrong: { - "& .ant-statistic-content-value": { - color: "#cf1322", - }, - }, - form: { - marginBottom: 20, - "& .ant-form-item-has-error": { - marginBottom: 0, - }, - }, - regexInput: { - minWidth: 240, - }, - infoLabel: { - display: "flex", - gap: 3, - alignItems: "center", - "& .anticon-info-circle": { - color: "#faad14", - marginTop: 2, - }, - }, -}) - -const RegexEvaluationTable: React.FC = ({ - evaluation, - evaluationScenarios, - columnsCount, -}) => { - const classes = useStyles() - const router = useRouter() - const appId = router.query.app_id as string - const variants = evaluation.variants - const variantData = useVariants(appId, variants) - - const [rows, setRows] = useState([]) - const [wrongAnswers, setWrongAnswers] = useState(0) - const [correctAnswers, setCorrectAnswers] = 
useState(0) - const [accuracy, setAccuracy] = useState(0) - const [settings, setSettings] = useState(evaluation.evaluationTypeSettings) - const [loading, setLoading] = useState([]) - const [form] = Form.useForm() - const showError = useRef(true) - - useEffect(() => { - if (evaluationScenarios) { - setRows(evaluationScenarios) - setLoading(Array(evaluationScenarios.length).fill(false)) - } - }, [evaluationScenarios]) - - useEffect(() => { - if (correctAnswers + wrongAnswers > 0) { - setAccuracy((correctAnswers / (correctAnswers + wrongAnswers)) * 100) - } else { - setAccuracy(0) - } - }, [correctAnswers, wrongAnswers]) - - useEffect(() => { - const correct = rows.filter((row) => row.score === "correct").length - const wrong = rows.filter((row) => row.score === "wrong").length - const accuracy = correct + wrong > 0 ? (correct / (correct + wrong)) * 100 : 0 - - setCorrectAnswers(correct) - setWrongAnswers(wrong) - setAccuracy(accuracy) - }, [rows]) - - const handleInputChange = (value: any, name: string, rowIndex: any) => { - const newRows = [...rows] - const ip = newRows[rowIndex].inputs.find((ip) => ip.input_name === name) - if (ip) ip.input_value = value - setRows(newRows) - } - - const runAllEvaluations = async () => { - //validate form - try { - await form.validateFields() - } catch { - return - } - showError.current = true - - const {regexPattern, regexShouldMatch} = form.getFieldsValue() - batchExecute(rows.map((_, rowIndex) => () => runEvaluation(rowIndex))) - .then(() => { - updateEvaluation(evaluation.id, { - evaluation_type_settings: { - regex_should_match: regexShouldMatch, - regex_pattern: regexPattern, - }, - status: EvaluationFlow.EVALUATION_FINISHED, - }).then(() => { - setSettings({regexShouldMatch, regexPattern}) - message.success("Evaluation Results Saved") - }) - }) - .catch(() => {}) - } - - const runEvaluation = async (rowIndex: number) => { - setLoading((prev) => prev.map((val, i) => (i === rowIndex ? true : val))) - const inputParamsDict = rows[rowIndex].inputs.reduce((acc: {[key: string]: any}, item) => { - acc[item.input_name] = item.input_value - return acc - }, {}) - - const columnsDataNames = ["columnData0"] - for (let idx = 0; idx < columnsDataNames.length; ++idx) { - const columnName = columnsDataNames[idx] as keyof RegexEvaluationTableRow - try { - let result = await callVariant( - inputParamsDict, - variantData[idx].inputParams!, - variantData[idx].optParams!, - appId || "", - variants[idx].baseId || "", - variantData[idx].isChatVariant - ? testsetRowToChatMessages(evaluation.testset.csvdata[rowIndex], false) - : [], - ) - if (variantData[idx].isChatVariant) result = contentToChatMessageString(result) - - const {regexPattern, regexShouldMatch} = form.getFieldsValue() - const isCorrect = evaluateWithRegex(result, regexPattern, regexShouldMatch) - const evaluationScenarioId = rows[rowIndex].id - const score = isCorrect ? 
"correct" : "wrong" - - if (evaluationScenarioId) { - await updateEvaluationScenario( - evaluation.id, - evaluationScenarioId, - { - score, - outputs: [{variant_id: variants[0].variantId, variant_output: result}], - }, - evaluation.evaluationType, - ) - } - - setRowValue(rowIndex, "score", score) - if (isCorrect) { - setCorrectAnswers((prevCorrect) => prevCorrect + 1) - } else { - setWrongAnswers((prevWrong) => prevWrong + 1) - } - setRowValue(rowIndex, columnName, result) - } catch (err) { - setRowValue(rowIndex, columnName, "") - if (showError.current) { - globalErrorHandler(err) - showError.current = false - } - throw err - } finally { - setLoading((prev) => prev.map((val, i) => (i === rowIndex ? false : val))) - } - } - } - - const setRowValue = ( - rowIndex: number, - columnKey: keyof RegexEvaluationTableRow, - value: any, - ) => { - const newRows: any = [...rows] - newRows[rowIndex][columnKey] = value - setRows(newRows) - } - - const dynamicColumns: ColumnType[] = Array.from( - {length: columnsCount}, - (_, i) => { - const columnKey = `columnData${i}` - - return { - title: ( -
- App Variant: - - {variants ? variants[i].variantName : ""} - -
- ), - dataIndex: columnKey, - key: columnKey, - width: "25%", - render: (value: any, record: RegexEvaluationTableRow, ix: number) => { - if (loading[ix]) return "Loading..." - - let outputValue = value - if (record.outputs && record.outputs.length > 0) { - outputValue = record.outputs.find( - (output: any) => output.variant_id === variants[i].variantId, - )?.variant_output - } - - return ( - - ) - }, - } - }, - ) - - const columns = [ - { - key: "1", - width: "30%", - title: ( -
-
- Inputs (Test set: - {evaluation.testset.name} - ) -
-
- ), - dataIndex: "inputs", - render: (_: any, record: RegexEvaluationTableRow, rowIndex: number) => ( -
- {evaluation.testset.testsetChatColumn ? ( - evaluation.testset.csvdata[rowIndex][ - evaluation.testset.testsetChatColumn - ] || " - " - ) : ( - - handleInputChange(value, name, rowIndex) - } - inputParams={ - variantData[0].inputParams?.map((item) => ({ - ...item, - value: record.inputs.find((ip) => ip.input_name === item.name) - ?.input_value, - })) || [] - } - /> - )} -
- ), - }, - ...dynamicColumns, - { - title: "Match / Mismatch", - dataIndex: "score", - key: "isMatch", - width: "25%", - render: (val: string, _: any, ix: number) => { - if (loading[ix]) return - - const isCorrect = val === "correct" - const isMatch = settings.regexShouldMatch ? isCorrect : !isCorrect - return settings.regexPattern ? ( -
- {isMatch ? "Match" : "Mismatch"} -
- ) : null - }, - }, - { - title: "Evaluation", - dataIndex: "score", - key: "evaluation", - width: 200, - align: "center" as "left" | "right" | "center", - render: (score: string, _: any, ix: number) => { - if (loading[ix]) return - return ( - - {score && ( - - {score} - - )} - - ) - }, - }, - ] - - return ( -
- Regex Match / Mismatch Evaluation -
- -
- - - - exportRegexEvaluationData(evaluation, rows, settings) - } - disabled={!rows?.[0]?.score} - > - Export results - - - - - - - - - - - - - - - - - - - - - - - {settings && ( -
- - new Promise((res, rej) => - isValidRegex(value) - ? res("") - : rej("Regex pattern is not valid"), - ), - }, - ]} - > - - - - Strategy - - - - - } - rules={[{required: true, message: "Please select strategy"}]} - name="regexShouldMatch" - > - - Match - Mismatch - - - - )} - -
-
- - - ) -} - -export default RegexEvaluationTable diff --git a/agenta-web/src/components/EvaluationTable/SimilarityMatchEvaluationTable.tsx b/agenta-web/src/components/EvaluationTable/SimilarityMatchEvaluationTable.tsx deleted file mode 100644 index 1e294efb83..0000000000 --- a/agenta-web/src/components/EvaluationTable/SimilarityMatchEvaluationTable.tsx +++ /dev/null @@ -1,471 +0,0 @@ -import {useState, useEffect} from "react" -import type {ColumnType} from "antd/es/table" -import {LineChartOutlined} from "@ant-design/icons" -import { - Button, - Card, - Col, - Form, - Row, - Slider, - Space, - Spin, - Statistic, - Table, - Tag, - message, -} from "antd" -import {updateEvaluationScenario, callVariant, updateEvaluation} from "@/lib/services/api" -import {useVariants} from "@/lib/hooks/useVariant" -import {useRouter} from "next/router" -import {EvaluationFlow} from "@/lib/enums" -import {evaluateWithSimilarityMatch} from "@/lib/services/evaluations" -import {Typography} from "antd" -import {createUseStyles} from "react-jss" -import {exportSimilarityEvaluationData} from "@/lib/helpers/evaluate" -import SecondaryButton from "../SecondaryButton/SecondaryButton" -import {contentToChatMessageString, testsetRowToChatMessages} from "@/lib/helpers/testset" -import ParamsForm from "../Playground/ParamsForm/ParamsForm" -import {batchExecute} from "@/lib/helpers/utils" - -const {Title} = Typography - -interface SimilarityMatchEvaluationTableProps { - evaluation: any - columnsCount: number - evaluationScenarios: SimilarityMatchEvaluationTableRow[] -} - -interface SimilarityMatchEvaluationTableRow { - id?: string - inputs: { - input_name: string - input_value: string - }[] - outputs: { - variant_id: string - variant_output: string - }[] - columnData0: string - correctAnswer: string - score: string - similarity: number - evaluationFlow: EvaluationFlow -} -/** - * - * @param evaluation - Evaluation object - * @param evaluationScenarios - Evaluation rows - * @param columnsCount - Number of variants to compare face to face (per default 2) - * @returns - */ - -const useStyles = createUseStyles({ - appVariant: { - backgroundColor: "rgb(201 255 216)", - color: "rgb(0 0 0)", - padding: 4, - borderRadius: 5, - }, - inputTestContainer: { - display: "flex", - justifyContent: "space-between", - }, - inputTest: { - backgroundColor: "rgb(201 255 216)", - color: "rgb(0 0 0)", - padding: 4, - borderRadius: 5, - }, - recordInput: { - marginBottom: 10, - }, - tag: { - fontSize: "14px", - }, - card: { - marginBottom: 20, - }, - div: { - marginBottom: 20, - }, - statCorrect: { - "& .ant-statistic-content-value": { - color: "#3f8600", - }, - }, - statWrong: { - "& .ant-statistic-content-value": { - color: "#cf1322", - }, - }, - form: { - marginBottom: 20, - "& .ant-form-item-has-error": { - marginBottom: 0, - }, - }, - slider: { - width: 200, - }, -}) - -const SimilarityMatchEvaluationTable: React.FC = ({ - evaluation, - evaluationScenarios, - columnsCount, -}) => { - const classes = useStyles() - const router = useRouter() - const appId = router.query.app_id as string - - const variants = evaluation.variants - - const variantData = useVariants(appId, variants) - - const [rows, setRows] = useState([]) - const [dissimilarAnswers, setDissimilarAnswers] = useState(0) - const [similarAnswers, setSimilarAnswers] = useState(0) - const [accuracy, setAccuracy] = useState(0) - const [settings, setSettings] = useState(evaluation.evaluationTypeSettings) - const [loading, setLoading] = useState([]) - const [form] = Form.useForm() - 
const {Text} = Typography - - useEffect(() => { - if (evaluationScenarios) { - setRows( - evaluationScenarios.map((item) => ({ - ...item, - similarity: item.outputs?.[0]?.variant_output - ? evaluateWithSimilarityMatch( - item.outputs[0].variant_output, - item.correctAnswer, - ) - : NaN, - })), - ) - setLoading(Array(evaluationScenarios.length).fill(false)) - } - }, [evaluationScenarios]) - - useEffect(() => { - if (similarAnswers + dissimilarAnswers > 0) { - setAccuracy((similarAnswers / (similarAnswers + dissimilarAnswers)) * 100) - } else { - setAccuracy(0) - } - }, [similarAnswers, dissimilarAnswers]) - - useEffect(() => { - const similar = rows.filter((row) => row.score === "true").length - const dissimilar = rows.filter((row) => row.score === "false").length - const accuracy = similar + dissimilar > 0 ? (similar / (similar + dissimilar)) * 100 : 0 - - setSimilarAnswers(similar) - setDissimilarAnswers(dissimilar) - setAccuracy(accuracy) - }, [rows]) - - const handleInputChange = (value: any, name: string, rowIndex: any) => { - const newRows = [...rows] - const ip = newRows[rowIndex].inputs.find((ip) => ip.input_name === name) - if (ip) ip.input_value = value - setRows(newRows) - } - - const runAllEvaluations = async () => { - //validate form - try { - await form.validateFields() - } catch { - return - } - - const {similarityThreshold} = form.getFieldsValue() - batchExecute(rows.map((_, rowIndex) => () => runEvaluation(rowIndex))).then(() => { - updateEvaluation(evaluation.id, { - evaluation_type_settings: { - similarity_threshold: similarityThreshold, - }, - status: EvaluationFlow.EVALUATION_FINISHED, - }).then(() => { - message.success("Evaluation Results Saved") - }) - }) - } - - const runEvaluation = async (rowIndex: number) => { - setLoading((prev) => prev.map((val, i) => (i === rowIndex ? true : val))) - const inputParamsDict = rows[rowIndex].inputs.reduce((acc: {[key: string]: any}, item) => { - acc[item.input_name] = item.input_value - return acc - }, {}) - - const columnsDataNames = ["columnData0"] - columnsDataNames.forEach(async (columnName: any, idx: number) => { - try { - let result = await callVariant( - inputParamsDict, - variantData[idx].inputParams!, - variantData[idx].optParams!, - appId || "", - variants[idx].baseId || "", - variantData[idx].isChatVariant - ? testsetRowToChatMessages(evaluation.testset.csvdata[rowIndex], false) - : [], - ) - if (variantData[idx].isChatVariant) result = contentToChatMessageString(result) - - const {similarityThreshold} = form.getFieldsValue() - const similarity = evaluateWithSimilarityMatch(result, rows[rowIndex].correctAnswer) - const evaluationScenarioId = rows[rowIndex].id - const isSimilar = similarity >= similarityThreshold ? "true" : "false" - - if (evaluationScenarioId) { - await updateEvaluationScenario( - evaluation.id, - evaluationScenarioId, - { - score: isSimilar, - outputs: [{variant_id: variants[0].variantId, variant_output: result}], - }, - evaluation.evaluationType, - ) - } - - setRowValue(rowIndex, "similarity", similarity) - setRowValue(rowIndex, "score", isSimilar) - if (isSimilar) { - setSimilarAnswers((prevSimilar) => prevSimilar + 1) - } else { - setDissimilarAnswers((prevDissimilar) => prevDissimilar + 1) - } - setRowValue(rowIndex, columnName, result) - } catch { - setRowValue(rowIndex, columnName, "") - } finally { - setLoading((prev) => prev.map((val, i) => (i === rowIndex ? 
false : val))) - } - }) - } - - const setRowValue = ( - rowIndex: number, - columnKey: keyof SimilarityMatchEvaluationTableRow, - value: any, - ) => { - const newRows = [...rows] - newRows[rowIndex][columnKey] = value as never - setRows(newRows) - } - - const dynamicColumns: ColumnType[] = Array.from( - {length: columnsCount}, - (_, i) => { - const columnKey = `columnData${i}` - - return { - title: ( -
- App Variant: - - {variants ? variants[i].variantName : ""} - -
- ), - dataIndex: columnKey, - key: columnKey, - width: "25%", - render: (text: any, record: SimilarityMatchEvaluationTableRow, ix: number) => { - if (loading[ix]) return "Loading..." - - if (record.outputs && record.outputs.length > 0) { - const outputValue = record.outputs.find( - (output: any) => output.variant_id === variants[i].variantId, - )?.variant_output - return
{outputValue}
- } - return text - }, - } - }, - ) - - const columns = [ - { - key: "1", - width: "30%", - title: ( -
-
- Inputs (Test set: - {evaluation.testset.name} - ) -
-
- ), - dataIndex: "inputs", - render: (text: any, record: SimilarityMatchEvaluationTableRow, rowIndex: number) => ( -
- {evaluation.testset.testsetChatColumn ? ( - evaluation.testset.csvdata[rowIndex][ - evaluation.testset.testsetChatColumn - ] || " - " - ) : ( - - handleInputChange(value, name, rowIndex) - } - inputParams={ - variantData[0].inputParams?.map((item) => ({ - ...item, - value: record.inputs.find((ip) => ip.input_name === item.name) - ?.input_value, - })) || [] - } - /> - )} -
- ), - }, - ...dynamicColumns, - { - title: "Correct Answer", - dataIndex: "correctAnswer", - key: "correctAnswer", - width: "25%", - }, - { - title: "Evaluation", - dataIndex: "score", - key: "evaluation", - width: 200, - align: "center" as "left" | "right" | "center", - render: (score: string, _: any, ix: number) => { - if (loading[ix]) return - return ( - - {score && ( - - {score} - - )} - - ) - }, - }, - { - title: "Similarity", - dataIndex: "similarity", - key: "similarity", - width: 200, - align: "center" as "left" | "right" | "center", - render: (similarity: number, record: any, ix: number) => { - if (loading[ix]) return - - const score = record.score - return ( - - {score && !isNaN(similarity) && ( - - {similarity.toFixed(2)} - - )} - - ) - }, - }, - ] - - return ( -
- - Similarity match Evaluation (Threshold: {settings.similarityThreshold}) - -
- - This evaluation type calculates similarity using the Jaccard similarity between the output and the correct answer. - -
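For context, a word-level Jaccard similarity of the kind described above could be sketched as follows; the helper name and the whitespace tokenization are illustrative assumptions, not necessarily the exact implementation used by the app.

def jaccard_similarity(output: str, correct_answer: str) -> float:
    # Split both strings into sets of words (illustrative tokenization)
    a, b = set(output.split()), set(correct_answer.split())
    if not a and not b:
        return 1.0  # treat two empty strings as identical
    # Jaccard index: size of the intersection divided by size of the union
    return len(a & b) / len(a | b)

# Example: jaccard_similarity("the cat sat", "the cat ran") == 0.5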
-
- -
- - - exportSimilarityEvaluationData(evaluation, rows)} - disabled={!rows?.[0]?.score} - > - Export results - - - - - - - - - - - - - - - - - - - - - - - {settings && ( -
- - setSettings({similarityThreshold: value})} - /> - - - )} - -
-
- - - ) -} - -export default SimilarityMatchEvaluationTable diff --git a/agenta-web/src/components/EvaluationTable/WebhookEvaluationTable.tsx b/agenta-web/src/components/EvaluationTable/WebhookEvaluationTable.tsx deleted file mode 100644 index cd3979f81c..0000000000 --- a/agenta-web/src/components/EvaluationTable/WebhookEvaluationTable.tsx +++ /dev/null @@ -1,460 +0,0 @@ -import {useState, useEffect, useRef} from "react" -import type {ColumnType} from "antd/es/table" -import {LineChartOutlined} from "@ant-design/icons" -import { - Alert, - Button, - Card, - Col, - Form, - Input, - Row, - Space, - Spin, - Statistic, - Table, - Tag, - Tooltip, - Typography, - message, -} from "antd" -import {updateEvaluationScenario, callVariant, updateEvaluation} from "@/lib/services/api" -import {useVariants} from "@/lib/hooks/useVariant" -import {useRouter} from "next/router" -import {EvaluationFlow} from "@/lib/enums" -import {evaluateWithWebhook} from "@/lib/services/evaluations" -import {createUseStyles} from "react-jss" -import {globalErrorHandler} from "@/lib/helpers/errorHandler" -import {isValidHttpUrl} from "@/lib/helpers/validators" -import SecondaryButton from "../SecondaryButton/SecondaryButton" -import {exportWebhookEvaluationData} from "@/lib/helpers/evaluate" -import {contentToChatMessageString, testsetRowToChatMessages} from "@/lib/helpers/testset" -import ParamsForm from "../Playground/ParamsForm/ParamsForm" -import {batchExecute} from "@/lib/helpers/utils" - -const {Title} = Typography - -interface WebhookEvaluationTableProps { - evaluation: any - columnsCount: number - evaluationScenarios: WebhookEvaluationTableRow[] -} - -interface WebhookEvaluationTableRow { - id?: string - inputs: { - input_name: string - input_value: string - }[] - outputs: { - variant_id: string - variant_output: string - }[] - columnData0: string - correctAnswer: string - score: string - isMatch: boolean - evaluationFlow: EvaluationFlow -} -/** - * - * @param evaluation - Evaluation object - * @param evaluationScenarios - Evaluation rows - * @param columnsCount - Number of variants to compare face to face (per default 2) - * @returns - */ - -const useStyles = createUseStyles({ - appVariant: { - backgroundColor: "rgb(201 255 216)", - color: "rgb(0 0 0)", - padding: 4, - borderRadius: 5, - }, - inputTestContainer: { - display: "flex", - justifyContent: "space-between", - }, - inputTest: { - backgroundColor: "rgb(201 255 216)", - color: "rgb(0 0 0)", - padding: 4, - borderRadius: 5, - }, - recordInput: { - marginBottom: 10, - }, - tag: { - fontSize: "14px", - }, - card: { - marginBottom: 20, - }, - infoBox: { - marginBottom: 20, - }, - pre: { - position: "relative", - overflow: "auto", - }, - statCorrect: { - "& .ant-statistic-content-value": { - color: "#3f8600", - }, - }, - statWrong: { - "& .ant-statistic-content-value": { - color: "#cf1322", - }, - }, - form: { - marginBottom: 20, - "& .ant-form-item-has-error": { - marginBottom: 0, - }, - }, -}) - -const WebhookEvaluationTable: React.FC = ({ - evaluation, - evaluationScenarios, - columnsCount, -}) => { - const classes = useStyles() - const router = useRouter() - const appId = router.query.app_id as string - const variants = evaluation.variants - const variantData = useVariants(appId, variants) - - const [rows, setRows] = useState([]) - const [accuracy, setAccuracy] = useState(0) - const [settings, setSettings] = useState(evaluation.evaluationTypeSettings) - const [loading, setLoading] = useState([]) - const [form] = Form.useForm() - const showError = useRef(true) 
- - useEffect(() => { - if (evaluationScenarios) { - setRows(evaluationScenarios) - setLoading(Array(evaluationScenarios.length).fill(false)) - } - }, [evaluationScenarios]) - - useEffect(() => { - const scores = rows.filter((item) => !isNaN(+item.score)).map((item) => +item.score) - const avg = scores.reduce((acc, val) => acc + val, 0) / (scores.length || 1) - setAccuracy(avg * 100) - }, [rows]) - - const handleInputChange = (value: any, name: string, rowIndex: any) => { - const newRows = [...rows] - const ip = newRows[rowIndex].inputs.find((ip) => ip.input_name === name) - if (ip) ip.input_value = value - setRows(newRows) - } - - const runAllEvaluations = async () => { - //validate form - try { - await form.validateFields() - } catch { - return - } - showError.current = true - - const {webhookUrl} = form.getFieldsValue() - batchExecute(rows.map((_, rowIndex) => () => runEvaluation(rowIndex))) - .then(() => { - updateEvaluation(evaluation.id, { - evaluation_type_settings: { - webhook_url: webhookUrl, - }, - status: EvaluationFlow.EVALUATION_FINISHED, - }).then(() => { - setSettings({webhookUrl}) - message.success("Evaluation Results Saved") - }) - }) - .catch(() => {}) - } - - const runEvaluation = async (rowIndex: number) => { - setLoading((prev) => prev.map((val, i) => (i === rowIndex ? true : val))) - const inputParamsDict = rows[rowIndex].inputs.reduce((acc: {[key: string]: any}, item) => { - acc[item.input_name] = item.input_value - return acc - }, {}) - - const columnsDataNames = ["columnData0"] - for (let idx = 0; idx < columnsDataNames.length; ++idx) { - const columnName = columnsDataNames[idx] as keyof WebhookEvaluationTableRow - try { - let result = await callVariant( - inputParamsDict, - variantData[idx].inputParams!, - variantData[idx].optParams!, - appId || "", - variants[idx].baseId || "", - variantData[idx].isChatVariant - ? testsetRowToChatMessages(evaluation.testset.csvdata[rowIndex], false) - : [], - ) - if (variantData[idx].isChatVariant) result = contentToChatMessageString(result) - - const {webhookUrl} = form.getFieldsValue() - const score = await evaluateWithWebhook(webhookUrl, { - input_vars: inputParamsDict, - output: result, - correct_answer: rows[rowIndex].correctAnswer || null, - }) - const evaluationScenarioId = rows[rowIndex].id - - if (evaluationScenarioId) { - await updateEvaluationScenario( - evaluation.id, - evaluationScenarioId, - { - score, - outputs: [{variant_id: variants[0].variantId, variant_output: result}], - }, - evaluation.evaluationType, - ) - } - - setRowValue(rowIndex, "score", score) - setRowValue(rowIndex, columnName, result) - } catch (err) { - setRowValue(rowIndex, columnName, "") - if (showError.current) { - globalErrorHandler(err) - showError.current = false - } - throw err - } finally { - setLoading((prev) => prev.map((val, i) => (i === rowIndex ? false : val))) - } - } - } - - const setRowValue = ( - rowIndex: number, - columnKey: keyof WebhookEvaluationTableRow, - value: any, - ) => { - const newRows: any = [...rows] - newRows[rowIndex][columnKey] = value - setRows(newRows) - } - - const dynamicColumns: ColumnType[] = Array.from( - {length: columnsCount}, - (_, i) => { - const columnKey = `columnData${i}` - - return { - title: ( -
- App Variant: - - {variants ? variants[i].variantName : ""} - -
- ), - dataIndex: columnKey, - key: columnKey, - width: "25%", - render: (value: any, record: WebhookEvaluationTableRow, ix: number) => { - if (loading[ix]) return "Loading..." - - let outputValue = value - if (record.outputs && record.outputs.length > 0) { - outputValue = record.outputs.find( - (output: any) => output.variant_id === variants[i].variantId, - )?.variant_output - } - - return outputValue - }, - } - }, - ) - - const columns = [ - { - key: "1", - width: "30%", - title: ( -
-
- Inputs (Test set: - {evaluation.testset.name} - ) -
-
- ), - dataIndex: "inputs", - render: (_: any, record: WebhookEvaluationTableRow, rowIndex: number) => ( -
- {evaluation.testset.testsetChatColumn ? ( - evaluation.testset.csvdata[rowIndex][ - evaluation.testset.testsetChatColumn - ] || " - " - ) : ( - - handleInputChange(value, name, rowIndex) - } - inputParams={ - variantData[0].inputParams?.map((item) => ({ - ...item, - value: record.inputs.find((ip) => ip.input_name === item.name) - ?.input_value, - })) || [] - } - /> - )} -
- ), - }, - ...dynamicColumns, - { - title: "Correct Answer", - dataIndex: "correctAnswer", - key: "correctAnswer", - width: "25%", - }, - { - title: "Evaluation", - dataIndex: "score", - key: "evaluation", - width: 200, - align: "center" as "left" | "right" | "center", - render: (score: string, _: any, ix: number) => { - if (loading[ix]) return - return ( - - {score && ( - - {(+score).toFixed(2)} - - )} - - ) - }, - }, - ] - - return ( -
- Webhook URL Evaluation - - The webhook URL you provide will be called with an HTTP POST request. - The request body will contain the following JSON object: -
-                            
-                                {`{
-    "input_vars": {                     // Key/value pairs for each variable in the Test Suite / Prompt
-	    "var_1": "value_1",
-	    "var_2": "value_2",
-	    ...
-    },
-    "output": string,                   // The LLM's output
-    "correct_answer": string | null     // The correct answer, if available
-}`}
-                            
-                        
- The response to this request should contain the following JSON object: -
-                            
-                                {`{
-    "score": number                     // Evaluation score between 0 and 1, 0 being "bad" and 1 being "good"
-}`}
-                            
-                        
-
- NOTE: Your webhook should allow CORS requests from our domain in - the response headers -
- - } - type="info" - showIcon - /> -
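To illustrate the request/response contract spelled out in the alert above, here is a minimal sketch of a webhook that accepts the POST body and returns a score between 0 and 1. Flask, the route path, the exact-match scoring rule, and the wildcard CORS header are all assumptions made for this example, not part of the product.

from flask import Flask, request, jsonify

app = Flask(__name__)

@app.route("/evaluate", methods=["POST"])
def evaluate_webhook():
    payload = request.get_json(force=True)
    output = payload.get("output", "")
    correct_answer = payload.get("correct_answer")  # may be null

    # Toy scoring rule: 1.0 on an exact match, 0.0 otherwise (replace with real logic)
    score = 1.0 if correct_answer is not None and output.strip() == correct_answer.strip() else 0.0

    response = jsonify({"score": score})
    # Allow cross-origin calls, as required by the CORS note above
    response.headers["Access-Control-Allow-Origin"] = "*"
    return response

if __name__ == "__main__":
    app.run(port=8000)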
- -
- - - exportWebhookEvaluationData(evaluation, rows)} - disabled={!rows?.[0]?.score} - > - Export results - - - - - - - - - - - - - - - {settings && ( - - - new Promise((res, rej) => - isValidHttpUrl(value) - ? res("") - : rej("Please enter a valid url"), - ), - }, - ]} - > - - - - )} - -
-
- - - ) -} - -export default WebhookEvaluationTable diff --git a/agenta-web/src/components/Evaluations/AutomaticEvaluationResult.tsx b/agenta-web/src/components/Evaluations/AutomaticEvaluationResult.tsx deleted file mode 100644 index f120eca967..0000000000 --- a/agenta-web/src/components/Evaluations/AutomaticEvaluationResult.tsx +++ /dev/null @@ -1,323 +0,0 @@ -import {deleteEvaluations, fetchEvaluationResults, loadEvaluations} from "@/lib/services/api" -import {Button, Collapse, Statistic, Table, Typography} from "antd" -import {useRouter} from "next/router" -import {useEffect, useState} from "react" -import {ColumnsType} from "antd/es/table" -import {Evaluation, GenericObject} from "@/lib/Types" -import {DeleteOutlined} from "@ant-design/icons" -import {EvaluationTypeLabels} from "@/lib/helpers/utils" -import {EvaluationFlow, EvaluationType} from "@/lib/enums" -import {createUseStyles} from "react-jss" -import {useAppTheme} from "../Layout/ThemeContextProvider" -import {calculateResultsDataAvg} from "@/lib/helpers/evaluate" - -interface EvaluationListTableDataType { - key: string - variants: string[] - testset: { - _id: string - name: string - } - evaluationType: string - status: EvaluationFlow - scoresData: { - nb_of_rows: number - wrong?: GenericObject[] - correct?: GenericObject[] - true?: GenericObject[] - false?: GenericObject[] - variant: string[] - } - avgScore: number - custom_code_eval_id: string - resultsData: {[key: string]: number} - createdAt: string -} - -type StyleProps = { - themeMode: "dark" | "light" -} - -const useStyles = createUseStyles({ - container: { - marginBottom: 20, - "& svg": { - color: "red", - }, - }, - collapse: ({themeMode}: StyleProps) => ({ - margin: "10px 0", - "& .ant-collapse-header": { - alignItems: "center !important", - padding: "0px 20px !important", - borderTopLeftRadius: "10px !important", - borderTopRightRadius: "10px !important", - background: themeMode === "dark" ? 
"#1d1d1d" : "#f8f8f8", - }, - }), - stat: { - "& .ant-statistic-content-value": { - fontSize: 20, - color: "#1677ff", - }, - "& .ant-statistic-content-suffix": { - fontSize: 20, - color: "#1677ff", - }, - }, -}) - -const {Title} = Typography - -export default function AutomaticEvaluationResult() { - const router = useRouter() - const [evaluationsList, setEvaluationsList] = useState([]) - const [selectedRowKeys, setSelectedRowKeys] = useState([]) - const [selectionType] = useState<"checkbox" | "radio">("checkbox") - const {appTheme} = useAppTheme() - const classes = useStyles({themeMode: appTheme} as StyleProps) - - const app_id = router.query.app_id?.toString() || "" - - useEffect(() => { - if (!app_id) { - return - } - - const fetchEvaluations = async () => { - try { - const evals: Evaluation[] = await loadEvaluations(app_id) - const results = await Promise.all(evals.map((e) => fetchEvaluationResults(e.id))) - const newEvals = results.map((result, ix) => { - const item = evals[ix] - if ( - [ - EvaluationType.auto_exact_match, - EvaluationType.auto_similarity_match, - EvaluationType.auto_regex_test, - EvaluationType.auto_ai_critique, - EvaluationType.custom_code_run, - EvaluationType.auto_webhook_test, - EvaluationType.single_model_test, - ].includes(item.evaluationType) - ) { - return { - key: item.id, - createdAt: item.createdAt, - variants: item.variants, - scoresData: result.scores_data, - evaluationType: item.evaluationType, - status: item.status, - testset: item.testset, - custom_code_eval_id: item.evaluationTypeSettings.customCodeEvaluationId, - resultsData: result.results_data, - avgScore: result.avg_score, - } - } - }) - - setEvaluationsList( - newEvals - .filter((evaluation) => evaluation !== undefined) - .filter( - (item: any) => - item.resultsData !== undefined || - !(Object.keys(item.scoresData || {}).length === 0) || - item.avgScore !== undefined, - ) as any, - ) - } catch (error) { - console.error(error) - } - } - - fetchEvaluations() - }, [app_id]) - - const onCompleteEvaluation = (evaluation: any) => { - // TODO: improve type - const evaluationType = - EvaluationType[evaluation.evaluationType as keyof typeof EvaluationType] - - if (evaluationType === EvaluationType.auto_exact_match) { - router.push(`/apps/${app_id}/evaluations/${evaluation.key}/auto_exact_match`) - } else if (evaluationType === EvaluationType.auto_similarity_match) { - router.push(`/apps/${app_id}/evaluations/${evaluation.key}/auto_similarity_match`) - } else if (evaluationType === EvaluationType.auto_regex_test) { - router.push(`/apps/${app_id}/evaluations/${evaluation.key}/auto_regex_test`) - } else if (evaluationType === EvaluationType.auto_webhook_test) { - router.push(`/apps/${app_id}/evaluations/${evaluation.key}/auto_webhook_test`) - } else if (evaluationType === EvaluationType.single_model_test) { - router.push(`/apps/${app_id}/evaluations/${evaluation.key}/single_model_test`) - } else if (evaluationType === EvaluationType.auto_ai_critique) { - router.push(`/apps/${app_id}/evaluations/${evaluation.key}/auto_ai_critique`) - } else if (evaluationType === EvaluationType.custom_code_run) { - router.push( - `/apps/${app_id}/evaluations/${evaluation.key}/custom_code_run?custom_eval_id=${evaluation.custom_code_eval_id}`, - ) - } - } - - const columns: ColumnsType = [ - { - title: "Variant", - dataIndex: "variants", - key: "variants", - render: (value) => { - return ( -
- {value[0].variantName} -
- ) - }, - }, - { - title: "Test set", - dataIndex: "testsetName", - key: "testsetName", - render: (value: any, record: EvaluationListTableDataType, index: number) => { - return {record.testset.name} - }, - }, - { - title: "Evaluation type", - dataIndex: "evaluationType", - key: "evaluationType", - width: "300", - render: (value: string) => { - const evaluationType = EvaluationType[value as keyof typeof EvaluationType] - const label = EvaluationTypeLabels[evaluationType] - return {label} - }, - }, - { - title: "Average score", - dataIndex: "averageScore", - key: "averageScore", - render: (value: any, record: EvaluationListTableDataType, index: number) => { - let score = 0 - if (record.scoresData) { - score = - ((record.scoresData.correct?.length || - record.scoresData.true?.length || - 0) / - record.scoresData.nb_of_rows) * - 100 - } else if (record.resultsData) { - const multiplier = { - [EvaluationType.auto_webhook_test]: 100, - [EvaluationType.single_model_test]: 1, - } - score = calculateResultsDataAvg( - record.resultsData, - multiplier[record.evaluationType as keyof typeof multiplier], - ) - score = isNaN(score) ? 0 : score - } else if (record.avgScore) { - score = record.avgScore * 100 - } - - return ( - - - - ) - }, - }, - { - title: "Created at", - dataIndex: "createdAt", - key: "createdAt", - width: "300", - }, - { - title: "Action", - dataIndex: "action", - key: "action", - render: (value: any, record: EvaluationListTableDataType, index: number) => { - let actionText = "View evaluation" - if (record.status !== EvaluationFlow.EVALUATION_FINISHED) { - actionText = "Continue evaluation" - } - return ( -
- -
- ) - }, - }, - ] - - const rowSelection = { - onChange: (selectedRowKeys: React.Key[], selectedRows: EvaluationListTableDataType[]) => { - setSelectedRowKeys(selectedRowKeys) - }, - } - - const onDelete = async () => { - const evaluationsIds = selectedRowKeys.map((key) => key.toString()) - try { - await deleteEvaluations(evaluationsIds) - setEvaluationsList((prevEvaluationsList) => - prevEvaluationsList.filter( - (evaluation) => !evaluationsIds.includes(evaluation.key), - ), - ) - - setSelectedRowKeys([]) - } catch { - } finally { - } - } - - const items = [ - { - key: "1", - label: ( -
- Evaluation Results -
- ), - children: ( -
-
- -
- -
- - ), - }, - ] - - return ( - - ) -} diff --git a/agenta-web/src/components/Evaluations/CustomPythonCode.tsx b/agenta-web/src/components/Evaluations/CustomPythonCode.tsx deleted file mode 100644 index 2e620dbff2..0000000000 --- a/agenta-web/src/components/Evaluations/CustomPythonCode.tsx +++ /dev/null @@ -1,231 +0,0 @@ -import React, {useState, useEffect} from "react" -import {useRouter} from "next/router" -import {Input, Form, Button, Row, Col, Typography, notification} from "antd" -import {CreateCustomEvaluationSuccessResponse} from "@/lib/Types" -import { - saveCustomCodeEvaluation, - fetchCustomEvaluationNames, - editCustomEvaluationDetail, -} from "@/lib/services/api" -import Editor from "@monaco-editor/react" - -interface ICustomPythonProps { - classes: any - appId: string - appTheme: string - editMode: boolean - editCode?: string - editName?: string - editId?: string -} - -interface ICustomEvalNames { - id: string - evaluation_name: string -} - -const CustomPythonCode: React.FC = ({ - classes, - appId, - appTheme, - editMode, - editCode = "", - editName = "", - editId = "", -}) => { - const {Title} = Typography - const [form] = Form.useForm() - const router = useRouter() - - const [submitting, setSubmittingData] = useState(false) - const [evalNames, setEvalNames] = useState([]) - const [evalNameExist, setEvalNameExist] = useState(false) - - let prevKey = "" - const showNotification = (config: Parameters[0]) => { - if (prevKey) notification.destroy(prevKey) - prevKey = (config.key || "") as string - notification.open(config) - } - - useEffect(() => { - const evaluationNames = async () => { - const response: any = await fetchCustomEvaluationNames(appId) - if (response.status === 200) { - setEvalNames(response.data) - } - } - - evaluationNames() - }, [appId]) - - const handlerToSubmitFormData = async (values: any) => { - setSubmittingData(true) - const data = { - evaluation_name: values.evaluationName, // TODO: Change to evaluation id! - python_code: values.pythonCode, - app_id: appId, - } - const response = editMode - ? await editCustomEvaluationDetail(editId, data) - : await saveCustomCodeEvaluation(data) - if (response.status === 200) { - const data: CreateCustomEvaluationSuccessResponse = response.data - - // Disable submitting form data - setSubmittingData(false) - showNotification({ - type: "success", - message: "Custom Evaluation", - description: data.message, - key: data.evaluation_id, - }) - - // Reset form fields and redirect user to evaluations page - form.resetFields() - router.push(`/apps/${appId}/evaluations/`) - } - } - - const isSaveButtonDisabled = () => { - return ( - evalNameExist || - !form.isFieldsTouched(true) || - form.getFieldsError().filter(({errors}) => errors.length).length > 0 - ) - } - - const isEditButtonDisabled = () => { - return evalNameExist - } - - const pythonDefaultEvalCode = () => { - if (editMode) { - return editCode - } else { - return `from typing import Dict - -def evaluate( - app_params: Dict[str, str], - inputs: Dict[str, str], - output: str, - correct_answer: str -) -> float: - # ... 
- return 0.75 # Replace with your calculated score` - } - } - - const switchEditorThemeBasedOnTheme = () => { - if (appTheme == "light") { - return "vs-light" - } else if (appTheme == "dark") { - return "vs-dark" - } - } - - const checkForEvaluationName = async () => { - const evalName = form.getFieldValue("evaluationName") - if (evalName === editName) { - return - } else if (evalNames.map((e) => e.evaluation_name).includes(evalName)) { - showNotification({ - type: "error", - message: "Custom Evaluation", - duration: 5, - description: "Evaluation name already exist. ", - }) - setEvalNameExist(true) - } else { - setEvalNameExist(false) - } - } - - return ( -
- - {editMode ? "Edit Python Code Evaluation" : "Save Python Code Evaluation"} - -
- -
- - - -
-

- Writing the custom evaluation code: -

- - The function name of your code evaluation must be "evaluate", and - must accept the following parameters: -
    -
-   • - The variant parameters (prompt, etc.) as a Dict[str,str] -
  • -
  • A list of inputs as List[str]
  • -
  • The output of the LLM app as a string
  • -
  • A target or correct answer as a string
  • -
- And return a float value indicating the score of the evaluation. -
-
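To make the signature described above concrete, a minimal custom evaluation could look like the sketch below. The parameter types mirror the default template shown earlier in this component; the word-overlap scoring is a placeholder assumption, not a recommended metric.

from typing import Dict

def evaluate(
    app_params: Dict[str, str],
    inputs: Dict[str, str],
    output: str,
    correct_answer: str,
) -> float:
    # Full credit if the correct answer appears verbatim in the output
    if correct_answer and correct_answer in output:
        return 1.0
    # Otherwise, score by the fraction of expected words that appear in the output
    expected = set(correct_answer.split())
    produced = set(output.split())
    return len(expected & produced) / len(expected) if expected else 0.0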
- - - - form.setFieldsValue({pythonCode: code})} - /> - - - - - {() => ( - - )} - - - - - - ) -} - -export default CustomPythonCode diff --git a/agenta-web/src/components/Evaluations/Evaluations.tsx b/agenta-web/src/components/Evaluations/Evaluations.tsx index 65a10bdac9..dc211b90b4 100644 --- a/agenta-web/src/components/Evaluations/Evaluations.tsx +++ b/agenta-web/src/components/Evaluations/Evaluations.tsx @@ -8,21 +8,14 @@ import { RadioChangeEvent, Row, Typography, - Select, message, ModalProps, - Tooltip, } from "antd" -import {DownOutlined, PlusOutlined, EditFilled} from "@ant-design/icons" -import { - createNewEvaluation, - fetchVariants, - useLoadTestsetsList, - fetchCustomEvaluations, -} from "@/lib/services/api" -import {dynamicComponent, getAgentaApiUrl, getApikeys, isDemo} from "@/lib/helpers/utils" +import {DownOutlined} from "@ant-design/icons" +import {createNewEvaluation, fetchVariants, useLoadTestsetsList} from "@/lib/services/api" +import {dynamicComponent, getApikeys, isDemo} from "@/lib/helpers/utils" import {useRouter} from "next/router" -import {Variant, Parameter, GenericObject, SingleCustomEvaluation} from "@/lib/Types" +import {Variant, Parameter, GenericObject} from "@/lib/Types" import {EvaluationType} from "@/lib/enums" import {EvaluationTypeLabels} from "@/lib/helpers/utils" import EvaluationErrorModal from "./EvaluationErrorModal" @@ -31,15 +24,8 @@ import {getAllVariantParameters} from "@/lib/helpers/variantHelper" import Image from "next/image" import abTesting from "@/media/testing.png" import singleModel from "@/media/score.png" -import exactMatch from "@/media/target.png" -import similarity from "@/media/transparency.png" -import regexIcon from "@/media/programming.png" -import webhookIcon from "@/media/link.png" -import ai from "@/media/artificial-intelligence.png" -import codeIcon from "@/media/browser.png" import {useAppTheme} from "../Layout/ThemeContextProvider" import {createUseStyles} from "react-jss" -import AutomaticEvaluationResult from "./AutomaticEvaluationResult" import HumanEvaluationResult from "./HumanEvaluationResult" import {getErrorMessage} from "@/lib/helpers/errorHandler" @@ -78,7 +64,6 @@ const useStyles = createUseStyles({ }, dropdownBtn: { marginRight: 10, - marginTop: 40, width: "100%", }, optionSelected: { @@ -142,7 +127,6 @@ export default function Evaluations() { const [isError, setIsError] = useState(false) const [variants, setVariants] = useState([]) const classes = useStyles({themeMode: appTheme} as StyleProps) - const {Option} = Select const [selectedTestset, setSelectedTestset] = useState<{ _id?: string @@ -168,11 +152,6 @@ export default function Evaluations() { const [error, setError] = useState({message: "", btnText: "", endpoint: ""}) - const [llmAppPromptTemplate, setLLMAppPromptTemplate] = useState("") - - const [customCodeEvaluationList, setCustomCodeEvaluationList] = - useState() - const [shareModalOpen, setShareModalOpen] = useState(false) const ShareEvaluationModal = dynamicComponent( @@ -349,24 +328,13 @@ export default function Evaluations() { } // 2. 
We create a new app evaluation - const evaluationTypeSettings: GenericObject = {} - //set default settings upon creation - if (selectedEvaluationType === EvaluationType.auto_similarity_match) { - evaluationTypeSettings.similarity_threshold = 0.3 - } else if (selectedEvaluationType === EvaluationType.auto_regex_test) { - evaluationTypeSettings.regex_pattern = "" - evaluationTypeSettings.regex_should_match = true - } else if (selectedEvaluationType === EvaluationType.auto_webhook_test) { - evaluationTypeSettings.webhook_url = `${getAgentaApiUrl()}/api/evaluations/webhook_example_fake` - } - const evaluationTableId = await createNewEvaluation({ variant_ids: selectedVariants.map((variant) => variant.variantId), appId, inputs: variantsInputs[selectedVariants[0].variantName], evaluationType: EvaluationType[selectedEvaluationType as keyof typeof EvaluationType], - evaluationTypeSettings, - llmAppPromptTemplate, + evaluationTypeSettings: {}, + llmAppPromptTemplate: "", selectedCustomEvaluationID, testsetId: selectedTestset._id!, }).catch((err) => { @@ -384,24 +352,10 @@ export default function Evaluations() { // 3 We set the variants setVariants(selectedVariants) - if (selectedEvaluationType === EvaluationType.auto_exact_match) { - router.push(`/apps/${appId}/evaluations/${evaluationTableId}/auto_exact_match`) - } else if (selectedEvaluationType === EvaluationType.human_a_b_testing) { - router.push(`/apps/${appId}/evaluations/${evaluationTableId}/human_a_b_testing`) - } else if (selectedEvaluationType === EvaluationType.auto_similarity_match) { - router.push(`/apps/${appId}/evaluations/${evaluationTableId}/similarity_match`) - } else if (selectedEvaluationType === EvaluationType.auto_regex_test) { - router.push(`/apps/${appId}/evaluations/${evaluationTableId}/auto_regex_test`) - } else if (selectedEvaluationType === EvaluationType.auto_webhook_test) { - router.push(`/apps/${appId}/evaluations/${evaluationTableId}/auto_webhook_test`) - } else if (selectedEvaluationType === EvaluationType.auto_ai_critique) { - router.push(`/apps/${appId}/evaluations/${evaluationTableId}/auto_ai_critique`) - } else if (selectedEvaluationType === EvaluationType.custom_code_run) { - router.push( - `/apps/${appId}/evaluations/${evaluationTableId}/custom_code_run?custom_eval_id=${selectedCustomEvaluationID}`, - ) + if (selectedEvaluationType === EvaluationType.human_a_b_testing) { + router.push(`/apps/${appId}/annotations/${evaluationTableId}/human_a_b_testing`) } else if (selectedEvaluationType === EvaluationType.single_model_test) { - router.push(`/apps/${appId}/evaluations/${evaluationTableId}/single_model_test`) + router.push(`/apps/${appId}/annotations/${evaluationTableId}/single_model_test`) } } @@ -424,27 +378,6 @@ export default function Evaluations() { ) } - useEffect(() => { - if (appId) - fetchCustomEvaluations(appId).then((res) => { - if (res.status === 200) { - setCustomCodeEvaluationList(res.data) - } - }) - }, [appId]) - - const handleCustomEvaluationOptionChange = (id: string) => { - if (id === "new") { - router.push(`/apps/${appId}/evaluations/create_custom_evaluation`) - } - setSelectedCustomEvaluationID(id) - setSelectedEvaluationType(EvaluationType.custom_code_run) - } - - const handleEditOption = (id: string) => { - router.push(`/apps/${appId}/evaluations/custom_evaluations/${id}`) - } - return (
@@ -452,10 +385,19 @@ export default function Evaluations() { {areAppVariantsLoading &&
loading variants...
}
- + +
+ 1. Start a new evaluation + + + 2. Which variants would you like to evaluate + + + 3. Which testset you want to use? + + + - 1. Select an evaluation type - Human evaluation onChangeEvaluationType(e)} className={classes.radioGroup} @@ -495,155 +437,15 @@ export default function Evaluations() { - - Automatic evaluation - - -
- - - - {EvaluationTypeLabels[EvaluationType.auto_exact_match]} - -
-
- -
- - - - {EvaluationTypeLabels[EvaluationType.auto_similarity_match]} - -
-
- -
- - - - {EvaluationTypeLabels[EvaluationType.auto_regex_test]} - -
-
- -
- - - - {EvaluationTypeLabels[EvaluationType.auto_webhook_test]} - -
-
- -
- - - - {EvaluationTypeLabels[EvaluationType.auto_ai_critique]} - -
-
- -
- - -
-
- 2. Which variants would you like to evaluate -
- {Array.from({length: numberOfVariants}).map((_, index) => ( -
- {" "} - 3. Which testset you want to use? -
- - - + {selectedEvaluationType === EvaluationType.human_a_b_testing && isDemo() && ( @@ -708,7 +503,6 @@ export default function Evaluations() { btnText={error.btnText} />
-
diff --git a/agenta-web/src/components/Evaluations/HumanEvaluationResult.tsx b/agenta-web/src/components/Evaluations/HumanEvaluationResult.tsx index 68f6fcc625..192280ace3 100644 --- a/agenta-web/src/components/Evaluations/HumanEvaluationResult.tsx +++ b/agenta-web/src/components/Evaluations/HumanEvaluationResult.tsx @@ -98,11 +98,11 @@ export default function HumanEvaluationResult() { } const fetchEvaluations = async () => { try { - fetchData(`${getAgentaApiUrl()}/api/evaluations/?app_id=${app_id}`) + fetchData(`${getAgentaApiUrl()}/api/human-evaluations/?app_id=${app_id}`) .then((response) => { const fetchPromises = response.map((item: EvaluationResponseType) => { return fetchData( - `${getAgentaApiUrl()}/api/evaluations/${item.id}/results/`, + `${getAgentaApiUrl()}/api/human-evaluations/${item.id}/results/`, ) .then((results) => { if (item.evaluation_type === EvaluationType.human_a_b_testing) { @@ -153,7 +153,7 @@ export default function HumanEvaluationResult() { EvaluationType[evaluation.evaluationType as keyof typeof EvaluationType] if (evaluationType === EvaluationType.human_a_b_testing) { - router.push(`/apps/${app_id}/evaluations/${evaluation.key}/human_a_b_testing`) + router.push(`/apps/${app_id}/annotations/${evaluation.key}/human_a_b_testing`) } } @@ -286,9 +286,7 @@ export default function HumanEvaluationResult() { key: "1", label: (
- - {EvaluationTypeLabels.human_a_b_testing} Evaluation Results - + Annotation Results
), children: ( diff --git a/agenta-web/src/components/Sidebar/Sidebar.tsx b/agenta-web/src/components/Sidebar/Sidebar.tsx index b1744feac6..e7eb13d41f 100644 --- a/agenta-web/src/components/Sidebar/Sidebar.tsx +++ b/agenta-web/src/components/Sidebar/Sidebar.tsx @@ -263,28 +263,6 @@ const Sidebar: React.FC = () => { data-cy="app-evaluations-link" href={getNavigationPath("evaluations")} className={classes.menuLinks} - > - {collapsed - ? "Perform 1-to-1 variant comparisons on testsets to identify superior options." - : "Evaluate"} - - - - - - }> - {collapsed ? "Perform 1-to-1 variant comparisons on testsets to identify superior options." @@ -293,7 +271,7 @@ const Sidebar: React.FC = () => { - {/* { : "Annotations"} - */} + = ({type = "auto"}) => { type="primary" onClick={() => router.push( - `/apps/${appId}/evaluations-new/compare/?evaluations=${selected + `/apps/${appId}/evaluations/compare/?evaluations=${selected .map((item) => item.id) .join(",")}`, ) @@ -437,7 +437,7 @@ const EvaluationResults: React.FC = ({type = "auto"}) => { getRowId={(params) => params.data.id} onRowClicked={(params) => EvaluationStatus.FINISHED === params.data?.status && - router.push(`/apps/${appId}/evaluations-new/${params.data?.id}`) + router.push(`/apps/${appId}/evaluations/${params.data?.id}`) } rowSelection="multiple" suppressRowClickSelection diff --git a/agenta-web/src/components/pages/evaluations/evaluationResults/mock.ts b/agenta-web/src/components/pages/evaluations/evaluationResults/mock.ts deleted file mode 100644 index 198645c723..0000000000 --- a/agenta-web/src/components/pages/evaluations/evaluationResults/mock.ts +++ /dev/null @@ -1,305 +0,0 @@ -import { - EvaluationSettingsTemplate, - EvaluationStatus, - Evaluator, - EvaluatorConfig, - Org, - TestSet, - User, - Variant, - _Evaluation, - _EvaluationScenario, -} from "@/lib/Types" -import exactMatchImg from "@/media/target.png" -import similarityImg from "@/media/transparency.png" -import regexImg from "@/media/programming.png" -import webhookImg from "@/media/link.png" -import aiImg from "@/media/artificial-intelligence.png" -import codeImg from "@/media/browser.png" -import {pickRandom, stringToNumberInRange} from "@/lib/helpers/utils" -import {getTagColors} from "@/lib/helpers/colors" - -const evaluatorIconsMap = { - auto_exact_match: exactMatchImg, - similarity: similarityImg, - auto_regex_test: regexImg, - auto_webhook_test: webhookImg, - auto_ai_critique: aiImg, - custom_code_run: codeImg, -} - -const organizations: Org[] = [ - { - id: "org1", - name: "Organization 1", - description: "This is the description of organization 1", - owner: "user1", - }, -] - -const users: User[] = [ - { - id: "user1", - uid: "user1", - username: "user1", - email: "user1@test.com", - }, -] - -const testsets: TestSet[] = [ - { - id: "testset1", - name: "Test Set 1", - created_at: "2021-01-01T00:00:00.000Z", - updated_at: "2021-01-01T00:00:00.000Z", - csvdata: [], - }, -] - -const variants: Variant[] = [ - { - variantName: "variant1", - templateVariantName: "variant1", - persistent: false, - parameters: {}, - previousVariantName: null, - variantId: "variant1", - baseId: "variant1", - baseName: "variant1", - configId: "config1", - configName: "config1", - }, -] - -const evaluatorSettinsTemplates: EvaluationSettingsTemplate[] = [ - { - type: "number", - default: 0.5, - description: "Threshold for similarity matching", - label: "Similarity Threshold", - }, - { - type: "text", - description: "Threshold for similarity matching", - label: "System Prompt", - }, - { - type: 
"code", - description: "Python code for evaluation", - label: "Code", - default: `from typing import Dict - - def evaluate( - app_params: Dict[str, str], - inputs: Dict[str, str], - output: str, - correct_answer: str - ) -> float: - # ... - return 0.75 # Replace with your calculated score`, - }, - { - type: "boolean", - default: false, - description: "Whether to use the default webhook", - label: "Use Default Webhook", - }, - { - type: "regex", - description: "Regex pattern ex: ^[0-9]{3}-[0-9]{3}-[0-9]{4}$", - label: "Regex", - }, - { - type: "string", - description: "URL of the webhook", - label: "Webhook URL", - }, -] - -const evaluators: Evaluator[] = [ - { - name: "Exact Match", - key: "auto_exact_match", - settings_template: {}, - }, - { - name: "Similarity", - key: "auto_similarity_match", - settings_template: { - similarity_threshold: evaluatorSettinsTemplates[0], - }, - }, - { - name: "Regex Test", - key: "auto_regex_test", - settings_template: { - regex_pattern: evaluatorSettinsTemplates[4], - regex_should_match: evaluatorSettinsTemplates[3], - }, - }, - { - name: "AI Critique", - key: "auto_ai_critique", - settings_template: { - llm_app_prompt_template: evaluatorSettinsTemplates[1], - }, - }, - { - name: "Code Evaluation", - key: "custom_code_run", - settings_template: { - custom_code_evaluation_id: evaluatorSettinsTemplates[2], - }, - }, - { - name: "Webhook Test", - key: "auto_webhook_test", - settings_template: { - webhook_url: evaluatorSettinsTemplates[5], - }, - }, -].map((item) => ({ - ...(item as Evaluator), - icon_url: evaluatorIconsMap[item.key as keyof typeof evaluatorIconsMap], - color: getTagColors()[stringToNumberInRange(item.key, 0, getTagColors().length - 1)], -})) - -const evaluatorConfigs: EvaluatorConfig[] = pickRandom(evaluators, 7).map((item, ix) => ({ - evaluator_key: item.key, - id: ix + "", - name: `Evaluator ${ix}`, - settings_values: {}, - created_at: new Date().toString(), -})) - -const evaluations: _Evaluation[] = [ - { - id: "evaluation1", - appId: "app1", - user: users[0], - testset: testsets[0], - status: EvaluationStatus.FINISHED, - variants: [variants[0]], - aggregated_results: [ - { - evaluator_config: evaluatorConfigs[0], - result: { - type: "number", - value: 32.5, - }, - }, - ], - created_at: "2021-01-01T00:00:00.000Z", - duration: 50000, - }, - { - id: "evaluation2", - appId: "app1", - user: users[0], - testset: testsets[0], - status: EvaluationStatus.INITIALIZED, - variants: [variants[0]], - aggregated_results: [ - { - evaluator_config: evaluatorConfigs[1], - result: { - type: "string", - value: "passed", - }, - }, - ], - created_at: "2022-01-01T00:00:00.000Z", - duration: 120000, - }, - { - id: "evaluation3", - appId: "app1", - user: users[0], - testset: testsets[0], - status: EvaluationStatus.STARTED, - variants: [variants[0]], - aggregated_results: [ - { - evaluator_config: evaluatorConfigs[2], - result: { - type: "string", - value: "valid", - }, - }, - ], - created_at: "2022-05-01T00:00:00.000Z", - duration: 120000, - }, - { - id: "evaluation4", - appId: "app1", - user: users[0], - testset: testsets[0], - status: EvaluationStatus.ERROR, - variants: [variants[0]], - aggregated_results: [ - { - evaluator_config: evaluatorConfigs[0], - result: { - type: "number", - value: 15, - }, - }, - ], - created_at: "2023-05-01T00:00:00.000Z", - duration: 2000, - }, -] - -const evaluationScenarios: _EvaluationScenario[] = [ - { - id: "evaluationScenario1", - user: users[0], - organization: organizations[0], - evaluation: evaluations[0], - inputs: [ - { 
- name: "country", - type: "text", - value: "Sample input text", - }, - ], - outputs: [ - { - type: "number", - value: 32.5, - }, - ], - correct_answer: { - type: "number", - value: 28, - }, - created_at: "2021-01-01T00:00:00.000Z", - updated_at: "2021-01-01T00:00:00.000Z", - is_pinned: false, - note: "This is a note", - evaluators_configs: [evaluatorConfigs[0]], - results: [ - { - evaluator: evaluators.find( - (item) => item.key === evaluatorConfigs[0].evaluator_key, - )!, - result: 12, - }, - ], - }, -] - -const Mock = { - organizations, - users, - testsets, - variants, - evaluatorSettinsTemplates, - evaluators, - evaluatorConfigs, - evaluations, - evaluationScenarios, -} - -export default Mock diff --git a/agenta-web/src/components/pages/evaluations/evaluationScenarios/EvaluationScenarios.tsx b/agenta-web/src/components/pages/evaluations/evaluationScenarios/EvaluationScenarios.tsx index 7208c411da..0352bcf831 100644 --- a/agenta-web/src/components/pages/evaluations/evaluationScenarios/EvaluationScenarios.tsx +++ b/agenta-web/src/components/pages/evaluations/evaluationScenarios/EvaluationScenarios.tsx @@ -142,7 +142,7 @@ const EvaluationScenarios: React.FC = () => { message: "Are you sure you want to delete this evaluation?", onOk: () => deleteEvaluations([evaluationId]) - .then(() => router.push(`/apps/${appId}/evaluations-new`)) + .then(() => router.push(`/apps/${appId}/evaluations`)) .catch(console.error), }) } diff --git a/agenta-web/src/lib/services/api.ts b/agenta-web/src/lib/services/api.ts index bc0af33976..d6207b2976 100644 --- a/agenta-web/src/lib/services/api.ts +++ b/agenta-web/src/lib/services/api.ts @@ -289,7 +289,7 @@ export const deleteTestsets = async (ids: string[]) => { export const loadEvaluations = async (appId: string) => { return await axios - .get(`${getAgentaApiUrl()}/api/evaluations/?app_id=${appId}`) + .get(`${getAgentaApiUrl()}/api/evaluations/human-evaluations/?app_id=${appId}`) .then((responseData) => { const evaluations = responseData.data.map((item: EvaluationResponseType) => { return fromEvaluationResponseToEvaluation(item) @@ -301,7 +301,7 @@ export const loadEvaluations = async (appId: string) => { export const loadEvaluation = async (evaluationId: string) => { return await axios - .get(`${getAgentaApiUrl()}/api/evaluations/${evaluationId}/`) + .get(`${getAgentaApiUrl()}/api/evaluations/human-evaluations/${evaluationId}/`) .then((responseData) => { return fromEvaluationResponseToEvaluation(responseData.data) }) @@ -310,7 +310,7 @@ export const loadEvaluation = async (evaluationId: string) => { export const deleteEvaluations = async (ids: string[]) => { const response = await axios({ method: "delete", - url: `${getAgentaApiUrl()}/api/evaluations/`, + url: `${getAgentaApiUrl()}/api/evaluations/human-evaluations/`, data: {evaluations_ids: ids}, }) return response.data @@ -321,7 +321,9 @@ export const loadEvaluationsScenarios = async ( evaluation: Evaluation, ) => { return await axios - .get(`${getAgentaApiUrl()}/api/evaluations/${evaluationTableId}/evaluation_scenarios/`) + .get( + `${getAgentaApiUrl()}/api/evaluations/human-evaluations/${evaluationTableId}/evaluation_scenarios/`, + ) .then((responseData) => { const evaluationsRows = responseData.data.map((item: any) => { return fromEvaluationScenarioResponseToEvaluationScenario(item, evaluation) @@ -367,14 +369,21 @@ export const createNewEvaluation = async ( status: EvaluationFlow.EVALUATION_INITIALIZED, } - const response = await axios.post(`${getAgentaApiUrl()}/api/evaluations/`, data, { - 
_ignoreError: ignoreAxiosError, - } as any) + const response = await axios.post( + `${getAgentaApiUrl()}/api/evaluations/human-evaluations/`, + data, + { + _ignoreError: ignoreAxiosError, + } as any, + ) return response.data.id } export const updateEvaluation = async (evaluationId: string, data: GenericObject) => { - const response = await axios.put(`${getAgentaApiUrl()}/api/evaluations/${evaluationId}/`, data) + const response = await axios.put( + `${getAgentaApiUrl()}/api/evaluations/human-evaluations/${evaluationId}/`, + data, + ) return response.data } @@ -385,7 +394,7 @@ export const updateEvaluationScenario = async ( evaluationType: EvaluationType, ) => { const response = await axios.put( - `${getAgentaApiUrl()}/api/evaluations/${evaluationTableId}/evaluation_scenario/${evaluationScenarioId}/${evaluationType}/`, + `${getAgentaApiUrl()}/api/evaluations/human-evaluations/${evaluationTableId}/evaluation_scenario/${evaluationScenarioId}/${evaluationType}/`, data, ) return response.data @@ -393,7 +402,7 @@ export const updateEvaluationScenario = async ( export const postEvaluationScenario = async (evaluationTableId: string, data: GenericObject) => { const response = await axios.post( - `${getAgentaApiUrl()}/api/evaluations/${evaluationTableId}/evaluation_scenario/`, + `${getAgentaApiUrl()}/api/evaluations/human-evaluations/${evaluationTableId}/evaluation_scenario/`, data, ) return response.data @@ -404,7 +413,7 @@ export const evaluateAICritiqueForEvalScenario = async ( ignoreAxiosError: boolean = false, ) => { const response = await axios.post( - `${getAgentaApiUrl()}/api/evaluations/evaluation_scenario/ai_critique/`, + `${getAgentaApiUrl()}/api/evaluations/human-evaluations/evaluation_scenario/ai_critique/`, data, {_ignoreError: ignoreAxiosError} as any, ) @@ -413,14 +422,14 @@ export const evaluateAICritiqueForEvalScenario = async ( export const fetchEvaluationResults = async (evaluationId: string) => { const response = await axios.get( - `${getAgentaApiUrl()}/api/evaluations/${evaluationId}/results/`, + `${getAgentaApiUrl()}/api/evaluations/human-evaluations/${evaluationId}/results/`, ) return response.data } export const fetchEvaluationScenarioResults = async (evaluation_scenario_id: string) => { const response = await axios.get( - `${getAgentaApiUrl()}/api/evaluations/evaluation_scenario/${evaluation_scenario_id}/score/`, + `${getAgentaApiUrl()}/api/evaluations/human-evaluations/evaluation_scenario/${evaluation_scenario_id}/score/`, ) return response } @@ -430,7 +439,7 @@ export const saveCustomCodeEvaluation = async ( ignoreAxiosError: boolean = false, ) => { const response = await axios.post( - `${getAgentaApiUrl()}/api/evaluations/custom_evaluation/`, + `${getAgentaApiUrl()}/api/evaluations/human-evaluations/custom_evaluation/`, payload, {_ignoreError: ignoreAxiosError} as any, ) @@ -443,7 +452,7 @@ export const editCustomEvaluationDetail = async ( ignoreAxiosError: boolean = false, ) => { const response = await axios.put( - `${getAgentaApiUrl()}/api/evaluations/custom_evaluation/${id}`, + `${getAgentaApiUrl()}/api/evaluations/human-evaluations/custom_evaluation/${id}`, payload, {_ignoreError: ignoreAxiosError} as any, ) @@ -452,7 +461,7 @@ export const editCustomEvaluationDetail = async ( export const fetchCustomEvaluations = async (app_id: string, ignoreAxiosError: boolean = false) => { const response = await axios.get( - `${getAgentaApiUrl()}/api/evaluations/custom_evaluation/list/${app_id}/`, + 
`${getAgentaApiUrl()}/api/evaluations/human-evaluations/custom_evaluation/list/${app_id}/`, {_ignoreError: ignoreAxiosError} as any, ) return response @@ -463,7 +472,7 @@ export const fetchCustomEvaluationDetail = async ( ignoreAxiosError: boolean = false, ) => { const response = await axios.get( - `${getAgentaApiUrl()}/api/evaluations/custom_evaluation/${id}/`, + `${getAgentaApiUrl()}/api/evaluations/human-evaluations/custom_evaluation/${id}/`, {_ignoreError: ignoreAxiosError} as any, ) return response.data @@ -474,7 +483,7 @@ export const fetchCustomEvaluationNames = async ( ignoreAxiosError: boolean = false, ) => { const response = await axios.get( - `${getAgentaApiUrl()}/api/evaluations/custom_evaluation/${app_id}/names/`, + `${getAgentaApiUrl()}/api/evaluations/human-evaluations/custom_evaluation/${app_id}/names/`, {_ignoreError: ignoreAxiosError} as any, ) return response @@ -485,7 +494,9 @@ export const executeCustomEvaluationCode = async ( ignoreAxiosError: boolean = false, ) => { const response = await axios.post( - `${getAgentaApiUrl()}/api/evaluations/custom_evaluation/execute/${payload.evaluation_id}/`, + `${getAgentaApiUrl()}/api/evaluations/human-evaluations/custom_evaluation/execute/${ + payload.evaluation_id + }/`, payload, {_ignoreError: ignoreAxiosError} as any, ) @@ -498,7 +509,7 @@ export const updateEvaluationScenarioScore = async ( ignoreAxiosError: boolean = false, ) => { const response = await axios.put( - `${getAgentaApiUrl()}/api/evaluations/evaluation_scenario/${evaluation_scenario_id}/score/`, + `${getAgentaApiUrl()}/api/evaluations/human-evaluations/evaluation_scenario/${evaluation_scenario_id}/score/`, {score}, {_ignoreError: ignoreAxiosError} as any, ) diff --git a/agenta-web/src/pages/apps/[app_id]/annotations/[annotation_id]/index.tsx b/agenta-web/src/pages/apps/[app_id]/annotations/[annotation_id]/index.tsx deleted file mode 100644 index d4a63b599a..0000000000 --- a/agenta-web/src/pages/apps/[app_id]/annotations/[annotation_id]/index.tsx +++ /dev/null @@ -1,9 +0,0 @@ -import React from "react" - -interface Props {} - -const AnnotationScenarios: React.FC = () => { - return
-} - -export default AnnotationScenarios diff --git a/agenta-web/src/pages/apps/[app_id]/annotations/index.tsx b/agenta-web/src/pages/apps/[app_id]/annotations/index.tsx index 68b1be8ff8..e0aa2e3565 100644 --- a/agenta-web/src/pages/apps/[app_id]/annotations/index.tsx +++ b/agenta-web/src/pages/apps/[app_id]/annotations/index.tsx @@ -1,10 +1,5 @@ -import EvaluationResults from "@/components/pages/evaluations/evaluationResults/EvaluationResults" -import React from "react" +import Evaluations from "@/components/Evaluations/Evaluations" -interface Props {} - -const Annotations: React.FC = () => { - return +export default function Evaluation() { + return } - -export default Annotations diff --git a/agenta-web/src/pages/apps/[app_id]/evaluations-new/index.tsx b/agenta-web/src/pages/apps/[app_id]/evaluations-new/index.tsx deleted file mode 100644 index 5abc7a7811..0000000000 --- a/agenta-web/src/pages/apps/[app_id]/evaluations-new/index.tsx +++ /dev/null @@ -1,69 +0,0 @@ -import EvaluationResults from "@/components/pages/evaluations/evaluationResults/EvaluationResults" -import Evaluators from "@/components/pages/evaluations/evaluators/Evaluators" -import {useAppId} from "@/hooks/useAppId" -import {useQueryParam} from "@/hooks/useQuery" -import {JSSTheme} from "@/lib/Types" -import {evaluatorConfigsAtom, evaluatorsAtom} from "@/lib/atoms/evaluation" -import {fetchAllEvaluatorConfigs, fetchAllEvaluators} from "@/services/evaluations" -import {SlidersOutlined, UnorderedListOutlined} from "@ant-design/icons" -import {Tabs} from "antd" -import {useAtom} from "jotai" -import React, {useEffect} from "react" -import {createUseStyles} from "react-jss" - -const useStyles = createUseStyles((theme: JSSTheme) => ({ - root: { - "& .ant-tabs-nav": { - position: "sticky", - top: 0, - zIndex: 1, - background: theme.colorBgContainer, - marginBottom: 0, - }, - }, -})) - -interface Props {} - -const Evaluations: React.FC = () => { - const [tab, setTab] = useQueryParam("tab", "results") - const appId = useAppId() - const classes = useStyles() - const setEvaluators = useAtom(evaluatorsAtom)[1] - const setEvaluatorConfigs = useAtom(evaluatorConfigsAtom)[1] - - useEffect(() => { - Promise.all([fetchAllEvaluators(), fetchAllEvaluatorConfigs(appId)]).then( - ([evaluators, configs]) => { - setEvaluators(evaluators) - setEvaluatorConfigs(configs) - }, - ) - }, [appId]) - - return ( -
- , - children: , - }, - { - key: "evaluators", - label: "Evaluators", - icon: , - children: , - }, - ]} - onChange={setTab} - /> -
- ) -} - -export default Evaluations diff --git a/agenta-web/src/pages/apps/[app_id]/evaluations/[evaluation_id]/auto_ai_critique.tsx b/agenta-web/src/pages/apps/[app_id]/evaluations/[evaluation_id]/auto_ai_critique.tsx deleted file mode 100644 index b61234e9ad..0000000000 --- a/agenta-web/src/pages/apps/[app_id]/evaluations/[evaluation_id]/auto_ai_critique.tsx +++ /dev/null @@ -1,69 +0,0 @@ -import AICritiqueEvaluationTable from "../../../../../components/EvaluationTable/AICritiqueEvaluationTable" -import {Evaluation} from "@/lib/Types" -import {loadEvaluation, loadEvaluationsScenarios, loadTestset} from "@/lib/services/api" -import {useRouter} from "next/router" -import {useEffect, useState} from "react" -import {fetchVariants} from "@/lib/services/api" -import {getTestsetChatColumn} from "@/lib/helpers/testset" - -export default function Evaluation() { - const router = useRouter() - const evaluationTableId = router.query.evaluation_id - ? router.query.evaluation_id.toString() - : "" - const [evaluationScenarios, setEvaluationScenarios] = useState([]) - const [evaluation, setEvaluation] = useState() - const appId = router.query.app_id as string - const columnsCount = 1 - - useEffect(() => { - if (!evaluation) { - return - } - const init = async () => { - const data = await loadEvaluationsScenarios(evaluationTableId, evaluation) - setEvaluationScenarios(data) - } - init() - }, [evaluation]) - - useEffect(() => { - if (!evaluationTableId) { - return - } - const init = async () => { - const evaluation: Evaluation = await loadEvaluation(evaluationTableId) - const backendVariants = await fetchVariants(appId) - const testset = await loadTestset(evaluation.testset._id) - // Create a map for faster access to first array elements - let backendVariantsMap = new Map() - backendVariants.forEach((obj) => backendVariantsMap.set(obj.variantId, obj)) - - // Update variants in second object - evaluation.variants = evaluation.variants.map((variant) => { - let backendVariant = backendVariantsMap.get(variant.variantId) - return backendVariant ? backendVariant : variant - }) - evaluation.testset = { - ...evaluation.testset, - ...testset, - testsetChatColumn: getTestsetChatColumn(testset.csvdata), - } - setEvaluation(evaluation) - } - - init() - }, [evaluationTableId]) - - return ( -
- {evaluationTableId && evaluationScenarios && evaluation && ( - - )} -
- ) -} diff --git a/agenta-web/src/pages/apps/[app_id]/evaluations/[evaluation_id]/auto_exact_match.tsx b/agenta-web/src/pages/apps/[app_id]/evaluations/[evaluation_id]/auto_exact_match.tsx deleted file mode 100644 index 83585e6ca0..0000000000 --- a/agenta-web/src/pages/apps/[app_id]/evaluations/[evaluation_id]/auto_exact_match.tsx +++ /dev/null @@ -1,69 +0,0 @@ -import ExactMatchEvaluationTable from "../../../../../components/EvaluationTable/ExactMatchEvaluationTable" -import {Evaluation} from "@/lib/Types" -import {loadEvaluation, loadEvaluationsScenarios, loadTestset} from "@/lib/services/api" -import {useRouter} from "next/router" -import {useEffect, useState} from "react" -import {fetchVariants} from "@/lib/services/api" -import {getTestsetChatColumn} from "@/lib/helpers/testset" - -export default function Evaluation() { - const router = useRouter() - const evaluationTableId = router.query.evaluation_id - ? router.query.evaluation_id.toString() - : "" - const [evaluationScenarios, setEvaluationScenarios] = useState([]) - const [evaluation, setEvaluation] = useState() - const appId = router.query.app_id as string - const columnsCount = 1 - - useEffect(() => { - if (!evaluation) { - return - } - const init = async () => { - const data = await loadEvaluationsScenarios(evaluationTableId, evaluation) - setEvaluationScenarios(data) - } - init() - }, [evaluation]) - - useEffect(() => { - if (!evaluationTableId) { - return - } - const init = async () => { - const evaluation: Evaluation = await loadEvaluation(evaluationTableId) - const backendVariants = await fetchVariants(appId) - const testset = await loadTestset(evaluation.testset._id) - // Create a map for faster access to first array elements - let backendVariantsMap = new Map() - backendVariants.forEach((obj) => backendVariantsMap.set(obj.variantId, obj)) - - // Update variants in second object - evaluation.variants = evaluation.variants.map((variant) => { - let backendVariant = backendVariantsMap.get(variant.variantId) - return backendVariant ? backendVariant : variant - }) - evaluation.testset = { - ...evaluation.testset, - ...testset, - testsetChatColumn: getTestsetChatColumn(testset.csvdata), - } - setEvaluation(evaluation) - } - - init() - }, [evaluationTableId]) - - return ( -
- {evaluationTableId && evaluationScenarios && evaluation && ( - - )} -
- ) -} diff --git a/agenta-web/src/pages/apps/[app_id]/evaluations/[evaluation_id]/auto_regex_test.tsx b/agenta-web/src/pages/apps/[app_id]/evaluations/[evaluation_id]/auto_regex_test.tsx deleted file mode 100644 index bd05371213..0000000000 --- a/agenta-web/src/pages/apps/[app_id]/evaluations/[evaluation_id]/auto_regex_test.tsx +++ /dev/null @@ -1,69 +0,0 @@ -import {Evaluation} from "@/lib/Types" -import {loadEvaluation, loadEvaluationsScenarios, loadTestset} from "@/lib/services/api" -import {useRouter} from "next/router" -import {useEffect, useState} from "react" -import {fetchVariants} from "@/lib/services/api" -import RegexEvaluationTable from "@/components/EvaluationTable/RegexEvaluationTable" -import {getTestsetChatColumn} from "@/lib/helpers/testset" - -export default function Evaluation() { - const router = useRouter() - const evaluationTableId = router.query.evaluation_id - ? router.query.evaluation_id.toString() - : "" - const [evaluationScenarios, setEvaluationScenarios] = useState([]) - const [evaluation, setEvaluation] = useState() - const appId = router.query.app_id as string - const columnsCount = 1 - - useEffect(() => { - if (!evaluation) { - return - } - const init = async () => { - const data = await loadEvaluationsScenarios(evaluationTableId, evaluation) - setEvaluationScenarios(data) - } - init() - }, [evaluation]) - - useEffect(() => { - if (!evaluationTableId) { - return - } - const init = async () => { - const evaluation: Evaluation = await loadEvaluation(evaluationTableId) - const backendVariants = await fetchVariants(appId) - const testset = await loadTestset(evaluation.testset._id) - // Create a map for faster access to first array elements - let backendVariantsMap = new Map() - backendVariants.forEach((obj) => backendVariantsMap.set(obj.variantId, obj)) - - // Update variants in second object - evaluation.variants = evaluation.variants.map((variant) => { - let backendVariant = backendVariantsMap.get(variant.variantId) - return backendVariant ? backendVariant : variant - }) - evaluation.testset = { - ...evaluation.testset, - ...testset, - testsetChatColumn: getTestsetChatColumn(testset.csvdata), - } - setEvaluation(evaluation) - } - - init() - }, [evaluationTableId]) - - return ( -
- {evaluationTableId && evaluationScenarios && evaluation && ( - - )} -
- ) -} diff --git a/agenta-web/src/pages/apps/[app_id]/evaluations/[evaluation_id]/auto_similarity_match.tsx b/agenta-web/src/pages/apps/[app_id]/evaluations/[evaluation_id]/auto_similarity_match.tsx deleted file mode 100644 index 9065baed6f..0000000000 --- a/agenta-web/src/pages/apps/[app_id]/evaluations/[evaluation_id]/auto_similarity_match.tsx +++ /dev/null @@ -1,69 +0,0 @@ -import SimilarityMatchEvaluationTable from "../../../../../components/EvaluationTable/SimilarityMatchEvaluationTable" -import {Evaluation} from "@/lib/Types" -import {loadEvaluation, loadEvaluationsScenarios, loadTestset} from "@/lib/services/api" -import {useRouter} from "next/router" -import {useEffect, useState} from "react" -import {fetchVariants} from "@/lib/services/api" -import {getTestsetChatColumn} from "@/lib/helpers/testset" - -export default function Evaluation() { - const router = useRouter() - const evaluationTableId = router.query.evaluation_id - ? router.query.evaluation_id.toString() - : "" - const [evaluationScenarios, setEvaluationScenarios] = useState([]) - const [evaluation, setEvaluation] = useState() - const appId = router.query.app_id as string - const columnsCount = 1 - - useEffect(() => { - if (!evaluation) { - return - } - const init = async () => { - const data = await loadEvaluationsScenarios(evaluationTableId, evaluation) - setEvaluationScenarios(data) - } - init() - }, [evaluation]) - - useEffect(() => { - if (!evaluationTableId) { - return - } - const init = async () => { - const evaluation: Evaluation = await loadEvaluation(evaluationTableId) - const backendVariants = await fetchVariants(appId) - const testset = await loadTestset(evaluation.testset._id) - // Create a map for faster access to first array elements - let backendVariantsMap = new Map() - backendVariants.forEach((obj) => backendVariantsMap.set(obj.variantId, obj)) - - // Update variants in second object - evaluation.variants = evaluation.variants.map((variant) => { - let backendVariant = backendVariantsMap.get(variant.variantId) - return backendVariant ? backendVariant : variant - }) - evaluation.testset = { - ...evaluation.testset, - ...testset, - testsetChatColumn: getTestsetChatColumn(testset.csvdata), - } - setEvaluation(evaluation) - } - - init() - }, [evaluationTableId]) - - return ( -
- {evaluationTableId && evaluationScenarios && evaluation && ( - - )} -
- ) -} diff --git a/agenta-web/src/pages/apps/[app_id]/evaluations/[evaluation_id]/auto_webhook_test.tsx b/agenta-web/src/pages/apps/[app_id]/evaluations/[evaluation_id]/auto_webhook_test.tsx deleted file mode 100644 index d9866d7c37..0000000000 --- a/agenta-web/src/pages/apps/[app_id]/evaluations/[evaluation_id]/auto_webhook_test.tsx +++ /dev/null @@ -1,69 +0,0 @@ -import {Evaluation} from "@/lib/Types" -import {loadEvaluation, loadEvaluationsScenarios, loadTestset} from "@/lib/services/api" -import {useRouter} from "next/router" -import {useEffect, useState} from "react" -import {fetchVariants} from "@/lib/services/api" -import WebhookEvaluationTable from "@/components/EvaluationTable/WebhookEvaluationTable" -import {getTestsetChatColumn} from "@/lib/helpers/testset" - -export default function Evaluation() { - const router = useRouter() - const evaluationTableId = router.query.evaluation_id - ? router.query.evaluation_id.toString() - : "" - const [evaluationScenarios, setEvaluationScenarios] = useState([]) - const [evaluation, setEvaluation] = useState() - const appId = router.query.app_id as string - const columnsCount = 1 - - useEffect(() => { - if (!evaluation) { - return - } - const init = async () => { - const data = await loadEvaluationsScenarios(evaluationTableId, evaluation) - setEvaluationScenarios(data) - } - init() - }, [evaluation]) - - useEffect(() => { - if (!evaluationTableId) { - return - } - const init = async () => { - const evaluation: Evaluation = await loadEvaluation(evaluationTableId) - const backendVariants = await fetchVariants(appId) - const testset = await loadTestset(evaluation.testset._id) - // Create a map for faster access to first array elements - let backendVariantsMap = new Map() - backendVariants.forEach((obj) => backendVariantsMap.set(obj.variantId, obj)) - - // Update variants in second object - evaluation.variants = evaluation.variants.map((variant) => { - let backendVariant = backendVariantsMap.get(variant.variantId) - return backendVariant ? backendVariant : variant - }) - evaluation.testset = { - ...evaluation.testset, - ...testset, - testsetChatColumn: getTestsetChatColumn(testset.csvdata), - } - setEvaluation(evaluation) - } - - init() - }, [evaluationTableId]) - - return ( -
- {evaluationTableId && evaluationScenarios && evaluation && ( - - )} -
- ) -} diff --git a/agenta-web/src/pages/apps/[app_id]/evaluations/[evaluation_id]/custom_code_run.tsx b/agenta-web/src/pages/apps/[app_id]/evaluations/[evaluation_id]/custom_code_run.tsx deleted file mode 100644 index aaff77477b..0000000000 --- a/agenta-web/src/pages/apps/[app_id]/evaluations/[evaluation_id]/custom_code_run.tsx +++ /dev/null @@ -1,71 +0,0 @@ -import CustomCodeRunEvaluationTable from "../../../../../components/EvaluationTable/CustomCodeRunEvaluationTable" -import {Evaluation} from "@/lib/Types" -import {loadEvaluation, loadEvaluationsScenarios, loadTestset} from "@/lib/services/api" -import {useRouter} from "next/router" -import {useEffect, useState} from "react" -import {fetchVariants} from "@/lib/services/api" -import {getTestsetChatColumn} from "@/lib/helpers/testset" - -export default function Evaluation() { - const router = useRouter() - const evaluationTableId = router.query.evaluation_id - ? router.query.evaluation_id.toString() - : "" - const customEvaluationId = router.query.custom_eval_id as string - const [evaluationScenarios, setEvaluationScenarios] = useState([]) - const [evaluation, setEvaluation] = useState() - const appId = router.query.app_id as string - const columnsCount = 1 - - useEffect(() => { - if (!evaluation) { - return - } - const init = async () => { - const data = await loadEvaluationsScenarios(evaluationTableId, evaluation) - setEvaluationScenarios(data) - } - init() - }, [evaluation, evaluationTableId]) - - useEffect(() => { - if (!evaluationTableId) { - return - } - const init = async () => { - const evaluation: Evaluation = await loadEvaluation(evaluationTableId) - const backendVariants = await fetchVariants(appId) - const testset = await loadTestset(evaluation.testset._id) - // Create a map for faster access to first array elements - let backendVariantsMap = new Map() - backendVariants.forEach((obj) => backendVariantsMap.set(obj.variantId, obj)) - - // Update variants in second object - evaluation.variants = evaluation.variants.map((variant) => { - let backendVariant = backendVariantsMap.get(variant.variantId) - return backendVariant ? backendVariant : variant - }) - evaluation.testset = { - ...evaluation.testset, - ...testset, - testsetChatColumn: getTestsetChatColumn(testset.csvdata), - } - setEvaluation(evaluation) - } - - init() - }, [evaluationTableId, appId]) - - return ( -
- {evaluationTableId && evaluationScenarios && evaluation && customEvaluationId && ( - - )} -
- ) -} diff --git a/agenta-web/src/pages/apps/[app_id]/evaluations/[evaluation_id]/human_a_b_testing.tsx b/agenta-web/src/pages/apps/[app_id]/evaluations/[evaluation_id]/human_a_b_testing.tsx deleted file mode 100644 index 3b96ab1166..0000000000 --- a/agenta-web/src/pages/apps/[app_id]/evaluations/[evaluation_id]/human_a_b_testing.tsx +++ /dev/null @@ -1,71 +0,0 @@ -import ABTestingEvaluationTable from "@/components/EvaluationTable/ABTestingEvaluationTable" -import {Evaluation} from "@/lib/Types" -import {loadEvaluation, loadEvaluationsScenarios, loadTestset} from "@/lib/services/api" -import {useRouter} from "next/router" -import {useEffect} from "react" -import {fetchVariants} from "@/lib/services/api" -import {useAtom} from "jotai" -import {evaluationAtom, evaluationScenariosAtom} from "@/lib/atoms/evaluation" -import {getTestsetChatColumn} from "@/lib/helpers/testset" - -export default function Evaluation() { - const router = useRouter() - const evaluationTableId = router.query.evaluation_id - ? router.query.evaluation_id.toString() - : "" - const [evaluationScenarios, setEvaluationScenarios] = useAtom(evaluationScenariosAtom) - const [evaluation, setEvaluation] = useAtom(evaluationAtom) - const appId = router.query.app_id as string - const columnsCount = 2 - - useEffect(() => { - if (!evaluation) { - return - } - const init = async () => { - const data = await loadEvaluationsScenarios(evaluationTableId, evaluation) - setEvaluationScenarios(data) - } - init() - }, [evaluation]) - - useEffect(() => { - if (!evaluationTableId) { - return - } - const init = async () => { - const evaluation: Evaluation = await loadEvaluation(evaluationTableId) - const backendVariants = await fetchVariants(appId) - const testset = await loadTestset(evaluation.testset._id) - // Create a map for faster access to first array elements - let backendVariantsMap = new Map() - backendVariants.forEach((obj) => backendVariantsMap.set(obj.variantId, obj)) - - // Update variants in second object - evaluation.variants = evaluation.variants.map((variant) => { - let backendVariant = backendVariantsMap.get(variant.variantId) - return backendVariant ? backendVariant : variant - }) - evaluation.testset = { - ...evaluation.testset, - ...testset, - testsetChatColumn: getTestsetChatColumn(testset.csvdata), - } - setEvaluation(evaluation) - } - - init() - }, [evaluationTableId]) - - return ( -
- {evaluationTableId && evaluationScenarios && evaluation && ( - - )} -
- ) -} diff --git a/agenta-web/src/pages/apps/[app_id]/evaluations-new/[evaluation_id]/index.tsx b/agenta-web/src/pages/apps/[app_id]/evaluations/[evaluation_id]/index.tsx similarity index 100% rename from agenta-web/src/pages/apps/[app_id]/evaluations-new/[evaluation_id]/index.tsx rename to agenta-web/src/pages/apps/[app_id]/evaluations/[evaluation_id]/index.tsx diff --git a/agenta-web/src/pages/apps/[app_id]/evaluations/[evaluation_id]/similarity_match.tsx b/agenta-web/src/pages/apps/[app_id]/evaluations/[evaluation_id]/similarity_match.tsx deleted file mode 100644 index 9065baed6f..0000000000 --- a/agenta-web/src/pages/apps/[app_id]/evaluations/[evaluation_id]/similarity_match.tsx +++ /dev/null @@ -1,69 +0,0 @@ -import SimilarityMatchEvaluationTable from "../../../../../components/EvaluationTable/SimilarityMatchEvaluationTable" -import {Evaluation} from "@/lib/Types" -import {loadEvaluation, loadEvaluationsScenarios, loadTestset} from "@/lib/services/api" -import {useRouter} from "next/router" -import {useEffect, useState} from "react" -import {fetchVariants} from "@/lib/services/api" -import {getTestsetChatColumn} from "@/lib/helpers/testset" - -export default function Evaluation() { - const router = useRouter() - const evaluationTableId = router.query.evaluation_id - ? router.query.evaluation_id.toString() - : "" - const [evaluationScenarios, setEvaluationScenarios] = useState([]) - const [evaluation, setEvaluation] = useState() - const appId = router.query.app_id as string - const columnsCount = 1 - - useEffect(() => { - if (!evaluation) { - return - } - const init = async () => { - const data = await loadEvaluationsScenarios(evaluationTableId, evaluation) - setEvaluationScenarios(data) - } - init() - }, [evaluation]) - - useEffect(() => { - if (!evaluationTableId) { - return - } - const init = async () => { - const evaluation: Evaluation = await loadEvaluation(evaluationTableId) - const backendVariants = await fetchVariants(appId) - const testset = await loadTestset(evaluation.testset._id) - // Create a map for faster access to first array elements - let backendVariantsMap = new Map() - backendVariants.forEach((obj) => backendVariantsMap.set(obj.variantId, obj)) - - // Update variants in second object - evaluation.variants = evaluation.variants.map((variant) => { - let backendVariant = backendVariantsMap.get(variant.variantId) - return backendVariant ? backendVariant : variant - }) - evaluation.testset = { - ...evaluation.testset, - ...testset, - testsetChatColumn: getTestsetChatColumn(testset.csvdata), - } - setEvaluation(evaluation) - } - - init() - }, [evaluationTableId]) - - return ( -
- {evaluationTableId && evaluationScenarios && evaluation && ( - - )} -
- ) -} diff --git a/agenta-web/src/pages/apps/[app_id]/evaluations/[evaluation_id]/single_model_test.tsx b/agenta-web/src/pages/apps/[app_id]/evaluations/[evaluation_id]/single_model_test.tsx deleted file mode 100644 index fa522c02d6..0000000000 --- a/agenta-web/src/pages/apps/[app_id]/evaluations/[evaluation_id]/single_model_test.tsx +++ /dev/null @@ -1,72 +0,0 @@ -import {Evaluation, EvaluationScenario, GenericObject} from "@/lib/Types" -import {loadEvaluation, loadEvaluationsScenarios, loadTestset} from "@/lib/services/api" -import {useRouter} from "next/router" -import {useEffect, useState} from "react" -import {fetchVariants} from "@/lib/services/api" -import {getTestsetChatColumn} from "@/lib/helpers/testset" -import SingleModelEvaluationTable from "@/components/EvaluationTable/SingleModelEvaluationTable" - -export default function Evaluation() { - const router = useRouter() - const evaluationTableId = router.query.evaluation_id - ? router.query.evaluation_id.toString() - : "" - const [evaluationScenarios, setEvaluationScenarios] = useState([]) - const [evaluation, setEvaluation] = useState() - const appId = router.query.app_id as string - - useEffect(() => { - if (!evaluation) { - return - } - const init = async () => { - const data = await loadEvaluationsScenarios(evaluationTableId, evaluation) - setEvaluationScenarios( - data.map((item: GenericObject) => { - const numericScore = parseInt(item.score) - return {...item, score: isNaN(numericScore) ? null : numericScore} - }), - ) - } - init() - }, [evaluation]) - - useEffect(() => { - if (!evaluationTableId) { - return - } - const init = async () => { - const evaluation: Evaluation = await loadEvaluation(evaluationTableId) - const backendVariants = await fetchVariants(appId) - const testset = await loadTestset(evaluation.testset._id) - // Create a map for faster access to first array elements - let backendVariantsMap = new Map() - backendVariants.forEach((obj) => backendVariantsMap.set(obj.variantId, obj)) - - // Update variants in second object - evaluation.variants = evaluation.variants.map((variant) => { - let backendVariant = backendVariantsMap.get(variant.variantId) - return backendVariant ? backendVariant : variant - }) - evaluation.testset = { - ...evaluation.testset, - ...testset, - testsetChatColumn: getTestsetChatColumn(testset.csvdata), - } - setEvaluation(evaluation) - } - - init() - }, [evaluationTableId]) - - return ( -
- {evaluationTableId && evaluationScenarios && evaluation && ( - - )} -
- ) -} diff --git a/agenta-web/src/pages/apps/[app_id]/evaluations-new/compare/index.tsx b/agenta-web/src/pages/apps/[app_id]/evaluations/compare/index.tsx similarity index 100% rename from agenta-web/src/pages/apps/[app_id]/evaluations-new/compare/index.tsx rename to agenta-web/src/pages/apps/[app_id]/evaluations/compare/index.tsx diff --git a/agenta-web/src/pages/apps/[app_id]/evaluations/create_custom_evaluation.tsx b/agenta-web/src/pages/apps/[app_id]/evaluations/create_custom_evaluation.tsx deleted file mode 100644 index dac017d4d8..0000000000 --- a/agenta-web/src/pages/apps/[app_id]/evaluations/create_custom_evaluation.tsx +++ /dev/null @@ -1,50 +0,0 @@ -import {useRouter} from "next/router" -import {createUseStyles} from "react-jss" -import {useAppTheme} from "@/components/Layout/ThemeContextProvider" -import CustomPythonCode from "@/components/Evaluations/CustomPythonCode" - -type StyleProps = { - themeMode: "dark" | "light" -} - -const useStyles = createUseStyles({ - evaluationContainer: { - border: "1px solid lightgrey", - padding: "20px", - borderRadius: "14px", - marginBottom: 50, - }, - evaluationImg: ({themeMode}: StyleProps) => ({ - width: 24, - height: 24, - marginRight: "8px", - filter: themeMode === "dark" ? "invert(1)" : "none", - }), - customTitle: { - marginBottom: "30px !important", - }, - submitBtn: { - marginTop: "30px", - width: "250px", - }, - levelFourHeading: { - marginBottom: "15px", - }, - copyBtn: { - marginLeft: "15px", - }, - modalError: { - color: "red", - marginLeft: "0px", - }, -}) - -export default function CreateCustomEvaluation() { - const router = useRouter() - const appId = router.query.app_id?.toString() || "" - - const {appTheme} = useAppTheme() - const classes = useStyles({themeMode: appTheme} as StyleProps) - - return -} diff --git a/agenta-web/src/pages/apps/[app_id]/evaluations/custom_evaluations/[id].tsx b/agenta-web/src/pages/apps/[app_id]/evaluations/custom_evaluations/[id].tsx deleted file mode 100644 index 135c06d628..0000000000 --- a/agenta-web/src/pages/apps/[app_id]/evaluations/custom_evaluations/[id].tsx +++ /dev/null @@ -1,86 +0,0 @@ -import {useRouter} from "next/router" -import {createUseStyles} from "react-jss" -import {useAppTheme} from "@/components/Layout/ThemeContextProvider" -import {fetchCustomEvaluationDetail} from "@/lib/services/api" -import {useEffect, useState} from "react" -import CustomPythonCode from "@/components/Evaluations/CustomPythonCode" - -type StyleProps = { - themeMode: "dark" | "light" -} - -interface ICustomEvalDetails { - id: string - evaluation_name: string - app_name: string - python_code: string - created_at: string - updated_at: string -} - -const useStyles = createUseStyles({ - evaluationContainer: { - border: "1px solid lightgrey", - padding: "20px", - borderRadius: "14px", - marginBottom: 50, - }, - evaluationImg: ({themeMode}: StyleProps) => ({ - width: 24, - height: 24, - marginRight: "8px", - filter: themeMode === "dark" ? 
"invert(1)" : "none", - }), - customTitle: { - marginBottom: "30px !important", - }, - submitBtn: { - marginTop: "30px", - width: "250px", - }, - levelFourHeading: { - marginBottom: "15px", - }, - copyBtn: { - marginLeft: "15px", - }, - modalError: { - color: "red", - marginLeft: "0px", - }, -}) - -export default function EditCustomEvaluation() { - const router = useRouter() - const appId = router.query.app_id?.toString() || "" - const id: string = router.query.id?.toString() || "" - - const {appTheme} = useAppTheme() - const classes = useStyles({themeMode: appTheme} as StyleProps) - const [evalDetail, setEvalDetail] = useState() - - useEffect(() => { - const evaluationDetails = async () => { - const response: any = await fetchCustomEvaluationDetail(id) - setEvalDetail(response) - } - evaluationDetails() - // eslint-disable-next-line react-hooks/exhaustive-deps - }, []) - - return ( - <> - {evalDetail?.evaluation_name !== undefined && evalDetail?.evaluation_name !== "" && ( - - )} - - ) -} diff --git a/agenta-web/src/pages/apps/[app_id]/evaluations/index.tsx b/agenta-web/src/pages/apps/[app_id]/evaluations/index.tsx index e0aa2e3565..5abc7a7811 100644 --- a/agenta-web/src/pages/apps/[app_id]/evaluations/index.tsx +++ b/agenta-web/src/pages/apps/[app_id]/evaluations/index.tsx @@ -1,5 +1,69 @@ -import Evaluations from "@/components/Evaluations/Evaluations" +import EvaluationResults from "@/components/pages/evaluations/evaluationResults/EvaluationResults" +import Evaluators from "@/components/pages/evaluations/evaluators/Evaluators" +import {useAppId} from "@/hooks/useAppId" +import {useQueryParam} from "@/hooks/useQuery" +import {JSSTheme} from "@/lib/Types" +import {evaluatorConfigsAtom, evaluatorsAtom} from "@/lib/atoms/evaluation" +import {fetchAllEvaluatorConfigs, fetchAllEvaluators} from "@/services/evaluations" +import {SlidersOutlined, UnorderedListOutlined} from "@ant-design/icons" +import {Tabs} from "antd" +import {useAtom} from "jotai" +import React, {useEffect} from "react" +import {createUseStyles} from "react-jss" -export default function Evaluation() { - return +const useStyles = createUseStyles((theme: JSSTheme) => ({ + root: { + "& .ant-tabs-nav": { + position: "sticky", + top: 0, + zIndex: 1, + background: theme.colorBgContainer, + marginBottom: 0, + }, + }, +})) + +interface Props {} + +const Evaluations: React.FC = () => { + const [tab, setTab] = useQueryParam("tab", "results") + const appId = useAppId() + const classes = useStyles() + const setEvaluators = useAtom(evaluatorsAtom)[1] + const setEvaluatorConfigs = useAtom(evaluatorConfigsAtom)[1] + + useEffect(() => { + Promise.all([fetchAllEvaluators(), fetchAllEvaluatorConfigs(appId)]).then( + ([evaluators, configs]) => { + setEvaluators(evaluators) + setEvaluatorConfigs(configs) + }, + ) + }, [appId]) + + return ( +
+ , + children: , + }, + { + key: "evaluators", + label: "Evaluators", + icon: , + children: , + }, + ]} + onChange={setTab} + /> +
+ ) } + +export default Evaluations From 812cfe89f700554196aa322eb982b72f2957f48a Mon Sep 17 00:00:00 2001 From: Akrem Abayed Date: Tue, 2 Jan 2024 12:12:38 +0100 Subject: [PATCH 209/414] update human evaluation scenario --- .../models/api/evaluation_model.py | 20 +++++++++++++++++++ .../agenta_backend/models/db_models.py | 4 ++-- .../routers/evaluation_router.py | 7 ++++--- .../agenta_backend/services/db_manager.py | 18 +++++++++++++++++ .../services/evaluation_service.py | 19 +++++++++--------- 5 files changed, 54 insertions(+), 14 deletions(-) diff --git a/agenta-backend/agenta_backend/models/api/evaluation_model.py b/agenta-backend/agenta_backend/models/api/evaluation_model.py index 1262120eb7..dfbe8c4c05 100644 --- a/agenta-backend/agenta_backend/models/api/evaluation_model.py +++ b/agenta-backend/agenta_backend/models/api/evaluation_model.py @@ -110,6 +110,26 @@ class EvaluationScenarioOutput(BaseModel): value: Any +class HumanEvaluationScenarioInput(BaseModel): + input_name: str + input_value: str + + +class HumanEvaluationScenarioOutput(BaseModel): + variant_id: str + variant_output: str + + +class HumanEvaluationScenarioUpdate(BaseModel): + vote: Optional[str] + score: Optional[Union[str, int]] + correct_answer: Optional[str] # will be used when running custom code evaluation + outputs: Optional[List[HumanEvaluationScenarioOutput]] + inputs: Optional[List[HumanEvaluationScenarioInput]] + is_pinned: Optional[bool] + note: Optional[str] + + class EvaluationScenario(BaseModel): id: Optional[str] evaluation_id: str diff --git a/agenta-backend/agenta_backend/models/db_models.py b/agenta-backend/agenta_backend/models/db_models.py index 0ed516cfd9..9324bb8719 100644 --- a/agenta-backend/agenta_backend/models/db_models.py +++ b/agenta-backend/agenta_backend/models/db_models.py @@ -270,7 +270,7 @@ class HumanEvaluationDB(Model): updated_at: Optional[datetime] = Field(default=datetime.utcnow()) class Config: - collection = "evaluations" + collection = "human_evaluations" class HumanEvaluationScenarioDB(Model): @@ -288,7 +288,7 @@ class HumanEvaluationScenarioDB(Model): note: Optional[str] class Config: - collection = "evaluation_scenarios" + collection = "human_evaluations_scenarios" class EvaluationDB(Model): diff --git a/agenta-backend/agenta_backend/routers/evaluation_router.py b/agenta-backend/agenta_backend/routers/evaluation_router.py index 3fc0b6a670..23eae40098 100644 --- a/agenta-backend/agenta_backend/routers/evaluation_router.py +++ b/agenta-backend/agenta_backend/routers/evaluation_router.py @@ -17,6 +17,7 @@ EvaluationScenarioScoreUpdate, EvaluationScenarioUpdate, ExecuteCustomEvaluationCode, + HumanEvaluationScenarioUpdate, NewEvaluation, DeleteEvaluation, EvaluationType, @@ -32,12 +33,12 @@ fetch_custom_evaluations, fetch_custom_evaluation_detail, get_evaluation_scenario_score, - update_evaluation_scenario, update_evaluation_scenario_score, update_evaluation, create_custom_code_evaluation, update_custom_code_evaluation, execute_custom_code_evaluation, + update_human_evaluation_scenario, ) from agenta_backend.services import evaluation_service from agenta_backend.utils.common import check_access_to_app @@ -287,7 +288,7 @@ async def update_evaluation_scenario_router( evaluation_id: str, evaluation_scenario_id: str, evaluation_type: EvaluationType, - evaluation_scenario: EvaluationScenarioUpdate, + evaluation_scenario: HumanEvaluationScenarioUpdate, request: Request, ): """Updates an evaluation scenario's vote or score based on its type. 
@@ -300,7 +301,7 @@ async def update_evaluation_scenario_router( """ user_org_data = await get_user_and_org_id(request.state.user_id) try: - await update_evaluation_scenario( + await update_human_evaluation_scenario( evaluation_scenario_id, evaluation_scenario, evaluation_type, diff --git a/agenta-backend/agenta_backend/services/db_manager.py b/agenta-backend/agenta_backend/services/db_manager.py index 0e60598b20..f85c779522 100644 --- a/agenta-backend/agenta_backend/services/db_manager.py +++ b/agenta-backend/agenta_backend/services/db_manager.py @@ -22,6 +22,7 @@ from agenta_backend.models.db_models import ( AnnotationsDB, AnnotationsScenariosDB, + HumanEvaluationScenarioDB, Result, AggregatedResult, AppDB, @@ -1304,6 +1305,23 @@ async def fetch_evaluation_scenario_by_id( return evaluation_scenario +async def fetch_human_evaluation_scenario_by_id( + evaluation_scenario_id: str, +) -> Optional[HumanEvaluationScenarioDB]: + """Fetches and evaluation scenario by its ID. + Args: + evaluation_scenario_id (str): The ID of the evaluation scenario to fetch. + Returns: + EvaluationScenarioDB: The fetched evaluation scenario, or None if no evaluation scenario was found. + """ + assert evaluation_scenario_id is not None, "evaluation_scenario_id cannot be None" + evaluation_scenario = await engine.find_one( + HumanEvaluationScenarioDB, + HumanEvaluationScenarioDB.id == ObjectId(evaluation_scenario_id), + ) + return evaluation_scenario + + async def find_previous_variant_from_base_id( base_id: str, ) -> Optional[AppVariantDB]: diff --git a/agenta-backend/agenta_backend/services/evaluation_service.py b/agenta-backend/agenta_backend/services/evaluation_service.py index 0474ace172..e2c115724d 100644 --- a/agenta-backend/agenta_backend/services/evaluation_service.py +++ b/agenta-backend/agenta_backend/services/evaluation_service.py @@ -32,6 +32,7 @@ HumanEvaluationDB, HumanEvaluationScenarioDB, HumanEvaluationScenarioInput, + HumanEvaluationScenarioOutput, UserDB, AppDB, EvaluationScenarioInputDB, @@ -78,11 +79,11 @@ async def _fetch_evaluation_and_check_access( return evaluation -async def _fetch_evaluation_scenario_and_check_access( +async def _fetch_human_evaluation_scenario_and_check_access( evaluation_scenario_id: str, **user_org_data: dict -) -> EvaluationDB: +) -> HumanEvaluationDB: # Fetch the evaluation by ID - evaluation_scenario = await db_manager.fetch_evaluation_scenario_by_id( + evaluation_scenario = await db_manager.fetch_human_evaluation_scenario_by_id( evaluation_scenario_id=evaluation_scenario_id ) if evaluation_scenario is None: @@ -296,7 +297,7 @@ async def fetch_evaluation_scenarios_for_evaluation( return eval_scenarios -async def update_evaluation_scenario( +async def update_human_evaluation_scenario( evaluation_scenario_id: str, evaluation_scenario_data: EvaluationScenarioUpdate, evaluation_type: EvaluationType, @@ -314,7 +315,7 @@ async def update_evaluation_scenario( Raises: HTTPException: If evaluation scenario not found or access denied. 
""" - eval_scenario = await _fetch_evaluation_scenario_and_check_access( + eval_scenario = await _fetch_human_evaluation_scenario_and_check_access( evaluation_scenario_id=evaluation_scenario_id, **user_org_data, ) @@ -342,7 +343,7 @@ async def update_evaluation_scenario( if updated_data["outputs"] is not None: new_outputs = [ - EvaluationScenarioOutputDB( + HumanEvaluationScenarioOutput( variant_id=output["variant_id"], variant_output=output["variant_output"], ).dict() @@ -352,7 +353,7 @@ async def update_evaluation_scenario( if updated_data["inputs"] is not None: new_inputs = [ - EvaluationScenarioInputDB( + HumanEvaluationScenarioInput( input_name=input_item["input_name"], input_value=input_item["input_value"], ).dict() @@ -387,7 +388,7 @@ async def update_evaluation_scenario_score( Raises: HTTPException: If evaluation scenario not found or access denied. """ - eval_scenario = await _fetch_evaluation_scenario_and_check_access( + eval_scenario = await _fetch_human_evaluation_scenario_and_check_access( evaluation_scenario_id, **user_org_data ) eval_scenario.score = score @@ -409,7 +410,7 @@ async def get_evaluation_scenario_score( Returns: Dictionary with 'scenario_id' and 'score' keys. """ - evaluation_scenario = await _fetch_evaluation_scenario_and_check_access( + evaluation_scenario = await _fetch_human_evaluation_scenario_and_check_access( evaluation_scenario_id, **user_org_data ) return { From b97907537ab6579d53f1f9dbecd5f39bf7ac1f0e Mon Sep 17 00:00:00 2001 From: MohammedMaaz Date: Tue, 2 Jan 2024 16:21:40 +0500 Subject: [PATCH 210/414] fixed api path | revert dev.dockerFile changes --- agenta-web/dev.Dockerfile | 28 +++++++++---------- .../Evaluations/HumanEvaluationResult.tsx | 8 ++++-- agenta-web/src/components/Sidebar/Sidebar.tsx | 2 +- 3 files changed, 21 insertions(+), 17 deletions(-) diff --git a/agenta-web/dev.Dockerfile b/agenta-web/dev.Dockerfile index 92f6ea5bbc..6f86dbd847 100644 --- a/agenta-web/dev.Dockerfile +++ b/agenta-web/dev.Dockerfile @@ -3,14 +3,14 @@ FROM node:18-alpine WORKDIR /app # Install dependencies based on the preferred package manager -# COPY package.json yarn.lock* package-lock.json* pnpm-lock.yaml* ./ -# RUN \ -# if [ -f yarn.lock ]; then yarn --frozen-lockfile; \ -# elif [ -f package-lock.json ]; then npm i; \ -# elif [ -f pnpm-lock.yaml ]; then yarn global add pnpm && pnpm i; \ -# # Allow install without lockfile, so example works even without Node.js installed locally -# else echo "Warning: Lockfile not found. It is recommended to commit lockfiles to version control." && yarn install; \ -# fi +COPY package.json yarn.lock* package-lock.json* pnpm-lock.yaml* ./ +RUN \ + if [ -f yarn.lock ]; then yarn --frozen-lockfile; \ + elif [ -f package-lock.json ]; then npm i; \ + elif [ -f pnpm-lock.yaml ]; then yarn global add pnpm && pnpm i; \ + # Allow install without lockfile, so example works even without Node.js installed locally + else echo "Warning: Lockfile not found. It is recommended to commit lockfiles to version control." && yarn install; \ + fi COPY src ./src COPY public ./public @@ -28,10 +28,10 @@ COPY sentry.* . 
# Note: Don't expose ports here, Compose will handle that for us # Start Next.js in development mode based on the preferred package manager -# CMD \ -# if [ -f yarn.lock ]; then yarn dev; \ -# elif [ -f package-lock.json ]; then npm run dev; \ -# elif [ -f pnpm-lock.yaml ]; then pnpm dev; \ -# else yarn dev; \ -# fi +CMD \ + if [ -f yarn.lock ]; then yarn dev; \ + elif [ -f package-lock.json ]; then npm run dev; \ + elif [ -f pnpm-lock.yaml ]; then pnpm dev; \ + else yarn dev; \ + fi diff --git a/agenta-web/src/components/Evaluations/HumanEvaluationResult.tsx b/agenta-web/src/components/Evaluations/HumanEvaluationResult.tsx index 192280ace3..7d6aa7b50b 100644 --- a/agenta-web/src/components/Evaluations/HumanEvaluationResult.tsx +++ b/agenta-web/src/components/Evaluations/HumanEvaluationResult.tsx @@ -98,11 +98,15 @@ export default function HumanEvaluationResult() { } const fetchEvaluations = async () => { try { - fetchData(`${getAgentaApiUrl()}/api/human-evaluations/?app_id=${app_id}`) + fetchData( + `${getAgentaApiUrl()}/api/evaluations/human-evaluations/?app_id=${app_id}`, + ) .then((response) => { const fetchPromises = response.map((item: EvaluationResponseType) => { return fetchData( - `${getAgentaApiUrl()}/api/human-evaluations/${item.id}/results/`, + `${getAgentaApiUrl()}/api/evaluations/human-evaluations/${ + item.id + }/results/`, ) .then((results) => { if (item.evaluation_type === EvaluationType.human_a_b_testing) { diff --git a/agenta-web/src/components/Sidebar/Sidebar.tsx b/agenta-web/src/components/Sidebar/Sidebar.tsx index e7eb13d41f..63fdb2492b 100644 --- a/agenta-web/src/components/Sidebar/Sidebar.tsx +++ b/agenta-web/src/components/Sidebar/Sidebar.tsx @@ -266,7 +266,7 @@ const Sidebar: React.FC = () => { > {collapsed ? "Perform 1-to-1 variant comparisons on testsets to identify superior options." - : "Evaluate New"} + : "Evaluate"}
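For reference, a minimal sketch of exercising the human-evaluation endpoints that the web app targets after this patch. It assumes a local Agenta deployment reachable at http://localhost with no authentication middleware; the base URL, app id, and response handling are illustrative placeholders rather than part of the patch, and a later patch in this series mounts these routes under a dedicated /api/human-evaluations/ prefix instead.

import asyncio
import httpx

# Assumption: default local deployment; swap in your own host or auth headers as needed.
BASE_URL = "http://localhost/api"


async def fetch_human_evaluation_results(app_id: str) -> list:
    async with httpx.AsyncClient(base_url=BASE_URL) as client:
        # List human evaluations for an app, mirroring the call in HumanEvaluationResult.tsx.
        evaluations = (
            await client.get(
                "/evaluations/human-evaluations/", params={"app_id": app_id}
            )
        ).json()
        results = []
        for evaluation in evaluations:
            # Fetch the aggregated results for each evaluation.
            resp = await client.get(
                f"/evaluations/human-evaluations/{evaluation['id']}/results/"
            )
            results.append(resp.json())
        return results


if __name__ == "__main__":
    # "my-app-id" is a placeholder; use a real app id from your deployment.
    print(asyncio.run(fetch_human_evaluation_results("my-app-id")))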
From a334553449f72eb71c659ec945c14c41a2afe97b Mon Sep 17 00:00:00 2001 From: MohammedMaaz Date: Tue, 2 Jan 2024 16:25:45 +0500 Subject: [PATCH 211/414] cleanup evaluate.ts from old functions --- agenta-web/src/lib/helpers/evaluate.ts | 142 ------------------------- 1 file changed, 142 deletions(-) diff --git a/agenta-web/src/lib/helpers/evaluate.ts b/agenta-web/src/lib/helpers/evaluate.ts index 5311b1e7c2..1e462b7081 100644 --- a/agenta-web/src/lib/helpers/evaluate.ts +++ b/agenta-web/src/lib/helpers/evaluate.ts @@ -2,67 +2,6 @@ import {HumanEvaluationListTableDataType} from "@/components/Evaluations/HumanEv import {Evaluation, GenericObject, Variant} from "../Types" import {convertToCsv, downloadCsv} from "./fileManipulations" -export const exportExactEvaluationData = (evaluation: Evaluation, rows: GenericObject[]) => { - const exportRow = rows.map((data, ix) => { - return { - ["Inputs"]: - evaluation.testset.csvdata[ix]?.[evaluation.testset.testsetChatColumn] || - data.inputs[0].input_value, - [`App Variant ${evaluation.variants[0].variantName} Output`]: data?.columnData0 - ? data?.columnData0 - : data.outputs[0]?.variant_output, - ["Correct answer"]: data.correctAnswer, - ["Evaluation"]: data.score, - } - }) - const exportCol = Object.keys(exportRow[0]) - - const csvData = convertToCsv(exportRow, exportCol) - const filename = `${evaluation.appName}_${evaluation.variants[0].variantName}_${evaluation.evaluationType}.csv` - downloadCsv(csvData, filename) -} - -export const exportSimilarityEvaluationData = (evaluation: Evaluation, rows: GenericObject[]) => { - const exportRow = rows.map((data, ix) => { - return { - ["Inputs"]: - evaluation.testset.csvdata[ix]?.[evaluation.testset.testsetChatColumn] || - data.inputs[0].input_value, - [`App Variant ${evaluation.variants[0].variantName} Output`]: data?.columnData0 - ? data?.columnData0 - : data.outputs[0]?.variant_output, - ["Correct answer"]: data.correctAnswer, - ["Score"]: data.score, - ["Evaluation"]: data.similarity, - } - }) - const exportCol = Object.keys(exportRow[0]) - - const csvData = convertToCsv(exportRow, exportCol) - const filename = `${evaluation.appName}_${evaluation.variants[0].variantName}_${evaluation.evaluationType}.csv` - downloadCsv(csvData, filename) -} - -export const exportAICritiqueEvaluationData = (evaluation: Evaluation, rows: GenericObject[]) => { - const exportRow = rows.map((data, ix) => { - return { - ["Inputs"]: - evaluation.testset.csvdata[ix]?.[evaluation.testset.testsetChatColumn] || - data.inputs[0].input_value, - [`App Variant ${evaluation.variants[0].variantName} Output`]: data?.columnData0 - ? 
data?.columnData0 - : data.outputs[0]?.variant_output, - ["Correct answer"]: data.correctAnswer, - ["Score"]: data.score, - } - }) - const exportCol = Object.keys(exportRow[0]) - - const csvData = convertToCsv(exportRow, exportCol) - const filename = `${evaluation.appName}_${evaluation.variants[0].variantName}_${evaluation.evaluationType}.csv` - downloadCsv(csvData, filename) -} - export const exportABTestingEvaluationData = (evaluation: Evaluation, rows: GenericObject[]) => { const exportRow = rows.map((data, ix) => { return { @@ -107,87 +46,6 @@ export const exportSingleModelEvaluationData = (evaluation: Evaluation, rows: Ge downloadCsv(csvData, filename) } -export const exportRegexEvaluationData = ( - evaluation: Evaluation, - rows: GenericObject[], - settings: GenericObject, -) => { - const exportRow = rows.map((data, ix) => { - const isCorrect = data.score === "correct" - const isMatch = settings.regexShouldMatch ? isCorrect : !isCorrect - - return { - ["Inputs"]: - evaluation.testset.csvdata[ix]?.[evaluation.testset.testsetChatColumn] || - data.inputs[0].input_value, - [`App Variant ${evaluation.variants[0].variantName} Output`]: data?.columnData0 - ? data?.columnData0 - : data.outputs[0]?.variant_output, - ["Match / Mismatch"]: isMatch ? "Match" : "Mismatch", - ["Evaluation"]: data.score, - } - }) - const exportCol = Object.keys(exportRow[0]) - - const csvData = convertToCsv(exportRow, exportCol) - const filename = `${evaluation.appName}_${evaluation.variants[0].variantName}_${evaluation.evaluationType}.csv` - downloadCsv(csvData, filename) -} - -export const exportWebhookEvaluationData = (evaluation: Evaluation, rows: GenericObject[]) => { - const exportRow = rows.map((data, ix) => { - return { - ["Inputs"]: - evaluation.testset.csvdata[ix]?.[evaluation.testset.testsetChatColumn] || - data.inputs[0].input_value, - [`App Variant ${evaluation.variants[0].variantName} Output`]: data?.columnData0 - ? data?.columnData0 - : data.outputs[0]?.variant_output, - ["Correct answer"]: data.correctAnswer, - ["Score"]: data.score, - } - }) - const exportCol = Object.keys(exportRow[0]) - - const csvData = convertToCsv(exportRow, exportCol) - const filename = `${evaluation.appName}_${evaluation.variants[0].variantName}_${evaluation.evaluationType}.csv` - downloadCsv(csvData, filename) -} - -export const exportCustomCodeEvaluationData = (evaluation: Evaluation, rows: GenericObject[]) => { - const exportRow = rows.map((data, ix) => { - return { - ["Inputs"]: - evaluation.testset.csvdata[ix]?.[evaluation.testset.testsetChatColumn] || - data.inputs[0].input_value, - [`App Variant ${evaluation.variants[0].variantName} Output`]: data?.columnData0 - ? 
data?.columnData0 - : data.outputs[0]?.variant_output, - ["Correct answer"]: data.correctAnswer, - ["Score"]: data.score, - } - }) - const exportCol = Object.keys(exportRow[0]) - - const csvData = convertToCsv(exportRow, exportCol) - const filename = `${evaluation.appName}_${evaluation.variants[0].variantName}_${evaluation.evaluationType}.csv` - downloadCsv(csvData, filename) -} - -export const calculateResultsDataAvg = ( - resultsData: Record, - multiplier: number = 10, -) => { - const obj = {...resultsData} - Object.keys(obj).forEach((key) => { - if (isNaN(+key)) delete obj[key] - }) - - const count = Object.values(obj).reduce((acc, value) => acc + +value, 0) - const sum = Object.keys(obj).reduce((acc, key) => acc + (parseFloat(key) || 0) * +obj[key], 0) - return (sum / count) * multiplier -} - export const getVotesPercentage = (record: HumanEvaluationListTableDataType, index: number) => { const variant = record.votesData.variants[index] return record.votesData.variants_votes_data[variant]?.percentage From 8e3b50fb41d89fc428a842b201b2c8bb9b7f63a7 Mon Sep 17 00:00:00 2001 From: Akrem Abayed Date: Tue, 2 Jan 2024 12:37:59 +0100 Subject: [PATCH 212/414] more human evals resources --- .../models/api/evaluation_model.py | 15 ++++++ .../routers/evaluation_router.py | 41 ++++++++++++++-- .../services/evaluation_service.py | 48 +++++++++++++++++++ 3 files changed, 101 insertions(+), 3 deletions(-) diff --git a/agenta-backend/agenta_backend/models/api/evaluation_model.py b/agenta-backend/agenta_backend/models/api/evaluation_model.py index dfbe8c4c05..13b13522e5 100644 --- a/agenta-backend/agenta_backend/models/api/evaluation_model.py +++ b/agenta-backend/agenta_backend/models/api/evaluation_model.py @@ -109,6 +109,21 @@ class EvaluationScenarioOutput(BaseModel): type: str value: Any +class HumanEvaluation(BaseModel): + id: str + app_id: str + user_id: str + user_username: str + evaluation_type: EvaluationType + evaluation_type_settings: Optional[EvaluationTypeSettings] + variant_ids: List[str] + variant_names: List[str] + testset_id: str + testset_name: str + status: str + created_at: datetime + updated_at: datetime + class HumanEvaluationScenarioInput(BaseModel): input_name: str diff --git a/agenta-backend/agenta_backend/routers/evaluation_router.py b/agenta-backend/agenta_backend/routers/evaluation_router.py index 23eae40098..537a65f6d8 100644 --- a/agenta-backend/agenta_backend/routers/evaluation_router.py +++ b/agenta-backend/agenta_backend/routers/evaluation_router.py @@ -282,7 +282,7 @@ async def create_evaluation_scenario( @router.put( - "/{evaluation_id}/evaluation_scenario/{evaluation_scenario_id}/{evaluation_type}/" + "/human-evaluations/{evaluation_id}/evaluation_scenario/{evaluation_scenario_id}/{evaluation_type}/" ) async def update_evaluation_scenario_router( evaluation_id: str, @@ -349,7 +349,7 @@ async def evaluate_ai_critique( raise HTTPException(400, f"Failed to evaluate AI critique: {str(e)}") -@router.get("/evaluation_scenario/{evaluation_scenario_id}/score/") +@router.get("/human-evaluations/evaluation_scenario/{evaluation_scenario_id}/score/") async def get_evaluation_scenario_score_router( evaluation_scenario_id: str, request: Request, @@ -368,7 +368,7 @@ async def get_evaluation_scenario_score_router( return await get_evaluation_scenario_score(evaluation_scenario_id, **user_org_data) -@router.put("/evaluation_scenario/{evaluation_scenario_id}/score/") +@router.put("/human-evaluations/evaluation_scenario/{evaluation_scenario_id}/score/") async def 
update_evaluation_scenario_score_router( evaluation_scenario_id: str, payload: EvaluationScenarioScoreUpdate, @@ -428,6 +428,41 @@ async def fetch_evaluation( return await evaluation_service.fetch_evaluation(evaluation_id, **user_org_data) +@router.get("/human-evaluations/", response_model=List[Evaluation]) +async def fetch_list_evaluations( + app_id: str, + request: Request, +): + """Fetches a list of evaluations, optionally filtered by an app ID. + + Args: + app_id (Optional[str]): An optional app ID to filter the evaluations. + + Returns: + List[Evaluation]: A list of evaluations. + """ + user_org_data = await get_user_and_org_id(request.state.user_id) + return await evaluation_service.fetch_list_evaluations( + app_id=app_id, **user_org_data + ) + +@router.get("/human-evaluations/{evaluation_id}/", response_model=Evaluation) +async def fetch_evaluation( + evaluation_id: str, + request: Request, +): + """Fetches a single evaluation based on its ID. + + Args: + evaluation_id (str): The ID of the evaluation to fetch. + + Returns: + Evaluation: The fetched evaluation. + """ + user_org_data = await get_user_and_org_id(request.state.user_id) + return await evaluation_service.fetch_human_evaluation(evaluation_id, **user_org_data) + + @router.delete("/", response_model=List[str]) async def delete_evaluations( delete_evaluations: DeleteEvaluation, diff --git a/agenta-backend/agenta_backend/services/evaluation_service.py b/agenta-backend/agenta_backend/services/evaluation_service.py index e2c115724d..6a0babb368 100644 --- a/agenta-backend/agenta_backend/services/evaluation_service.py +++ b/agenta-backend/agenta_backend/services/evaluation_service.py @@ -14,6 +14,7 @@ CustomEvaluationDetail, EvaluationScenarioInput, EvaluationType, + HumanEvaluation, NewEvaluation, EvaluationScenarioUpdate, CreateCustomEvaluation, @@ -497,6 +498,53 @@ async def fetch_evaluation(evaluation_id: str, **user_org_data: dict) -> Evaluat return await converters.evaluation_db_to_pydantic(evaluation) +async def fetch_list_human_evaluations( + app_id: str, + **user_org_data: dict, +) -> List[HumanEvaluation]: + """ + Fetches a list of evaluations based on the provided filtering criteria. + + Args: + app_id (Optional[str]): An optional app ID to filter the evaluations. + user_org_data (dict): User and organization data. + + Returns: + List[Evaluation]: A list of evaluations. + """ + access = await check_access_to_app(user_org_data=user_org_data, app_id=app_id) + if not access: + raise HTTPException( + status_code=403, + detail=f"You do not have access to this app: {app_id}", + ) + + evaluations_db = await engine.find( + HumanEvaluationDB, HumanEvaluationDB.app == ObjectId(app_id) + ) + return [ + await converters.evaluation_db_to_pydantic(evaluation) + for evaluation in evaluations_db + ] + + +async def fetch_human_evaluation(evaluation_id: str, **user_org_data: dict) -> HumanEvaluation: + """ + Fetches a single evaluation based on its ID. + + Args: + evaluation_id (str): The ID of the evaluation. + user_org_data (dict): User and organization data. + + Returns: + Evaluation: The fetched evaluation. + """ + evaluation = await _fetch_human_evaluation_scenario_and_check_access( + evaluation_id=evaluation_id, **user_org_data + ) + return await converters.evaluation_db_to_pydantic(evaluation) + + async def delete_evaluations(evaluation_ids: List[str], **user_org_data: dict) -> None: """ Delete evaluations by their IDs. 
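For reference, a minimal sketch of the payload shape the new /human-evaluations/ endpoints return, built from the HumanEvaluation model added in this patch. Every field value below is an invented placeholder, and the imports assume the agenta_backend package is importable as it is inside the backend service.

from datetime import datetime

from agenta_backend.models.api.evaluation_model import (
    EvaluationType,
    HumanEvaluation,
)

# Placeholder ObjectId strings and names; a real response carries values resolved
# from the evaluation, variant, and testset documents in MongoDB.
example = HumanEvaluation(
    id="6594c1f0aaaaaaaaaaaaaaaa",
    app_id="6594b0e1bbbbbbbbbbbbbbbb",
    user_id="6594a9d2cccccccccccccccc",
    user_username="demo-user",
    evaluation_type=EvaluationType.human_a_b_testing,
    evaluation_type_settings=None,
    variant_ids=["6594d2a3dddddddddddddddd", "6594d2a4eeeeeeeeeeeeeeee"],
    variant_names=["app.default", "app.variant-2"],
    testset_id="6594e4b5ffffffffffffffff",
    testset_name="capitals-testset",
    status="EVALUATION_INITIALIZED",  # placeholder status string
    created_at=datetime.utcnow(),
    updated_at=datetime.utcnow(),
)

print(example.json(indent=2))  # pydantic v1 serialization, as used elsewhere in the backend

The variant_names and testset_name fields mirror the referenced variant and testset documents, so API clients only ever supply the corresponding IDs.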
From cdb1c500c9ccaa8556ebba9e53437e9c8cc3c4c5 Mon Sep 17 00:00:00 2001 From: Akrem Abayed Date: Tue, 2 Jan 2024 13:52:42 +0100 Subject: [PATCH 213/414] more resources for human evaluation --- agenta-backend/agenta_backend/main.py | 2 + .../agenta_backend/models/converters.py | 27 +++ .../routers/evaluation_router.py | 150 +------------- .../routers/human_evaluation_router.py | 183 ++++++++++++++++++ .../services/evaluation_service.py | 4 +- .../Evaluations/HumanEvaluationResult.tsx | 4 +- agenta-web/src/lib/services/api.ts | 36 ++-- 7 files changed, 235 insertions(+), 171 deletions(-) create mode 100644 agenta-backend/agenta_backend/routers/human_evaluation_router.py diff --git a/agenta-backend/agenta_backend/main.py b/agenta-backend/agenta_backend/main.py index 2820174dbb..1b59e6b8df 100644 --- a/agenta-backend/agenta_backend/main.py +++ b/agenta-backend/agenta_backend/main.py @@ -10,6 +10,7 @@ environment_router, annotations_router, evaluation_router, + human_evaluation_router, evaluators_router, observability_router, organization_router, @@ -80,6 +81,7 @@ async def lifespan(application: FastAPI, cache=True): app.include_router(variants_router.router, prefix="/variants") app.include_router(annotations_router.router, prefix="/annotations") app.include_router(evaluation_router.router, prefix="/evaluations") +app.include_router(human_evaluation_router.router, prefix="/human-evaluations") app.include_router(evaluators_router.router, prefix="/evaluators") app.include_router(testset_router.router, prefix="/testsets") app.include_router(container_router.router, prefix="/containers") diff --git a/agenta-backend/agenta_backend/models/converters.py b/agenta-backend/agenta_backend/models/converters.py index 08a214198f..721ce2d041 100644 --- a/agenta-backend/agenta_backend/models/converters.py +++ b/agenta-backend/agenta_backend/models/converters.py @@ -11,6 +11,7 @@ AppVariantDB, EvaluationScenarioResult, EvaluatorConfigDB, + HumanEvaluationDB, ImageDB, TemplateDB, AppDB, @@ -42,6 +43,7 @@ Feedback as FeedbackOutput, ) from agenta_backend.models.api.evaluation_model import ( + HumanEvaluation, SimpleEvaluationOutput, EvaluationScenario, Evaluation, @@ -102,6 +104,31 @@ async def evaluation_db_to_pydantic( ) +async def human_evaluation_db_to_pydantic( + evaluation_db: HumanEvaluationDB, +) -> HumanEvaluation: + variant_names = [] + for variant_id in evaluation_db.variants: + variant = await db_manager.get_app_variant_instance_by_id(str(variant_id)) + variant_name = variant.variant_name if variant else str(variant_id) + variant_names.append(str(variant_name)) + + return HumanEvaluation( + id=str(evaluation_db.id), + app_id=str(evaluation_db.app.id), + user_id=str(evaluation_db.user.id), + user_username=evaluation_db.user.username or "", + status=evaluation_db.status, + evaluation_type=evaluation_db.evaluation_type, + variant_ids=[str(variant) for variant in evaluation_db.variants], + variant_names=variant_names, + testset_id=str(evaluation_db.testset.id), + testset_name=evaluation_db.testset.name, + created_at=evaluation_db.created_at, + updated_at=evaluation_db.updated_at, + ) + + async def aggregated_result_to_pydantic(results: List[AggregatedResult]) -> List[dict]: transformed_results = [] for result in results: diff --git a/agenta-backend/agenta_backend/routers/evaluation_router.py b/agenta-backend/agenta_backend/routers/evaluation_router.py index 537a65f6d8..452f8e015d 100644 --- a/agenta-backend/agenta_backend/routers/evaluation_router.py +++ 
b/agenta-backend/agenta_backend/routers/evaluation_router.py @@ -17,6 +17,7 @@ EvaluationScenarioScoreUpdate, EvaluationScenarioUpdate, ExecuteCustomEvaluationCode, + HumanEvaluation, HumanEvaluationScenarioUpdate, NewEvaluation, DeleteEvaluation, @@ -58,49 +59,6 @@ router = APIRouter() -@router.post( - "/human-evaluations/", response_model=SimpleEvaluationOutput, operation_id="create_evaluation" -) -async def create_evaluation( - payload: NewHumanEvaluation, - request: Request, -): - """Creates a new comparison table document - Raises: - HTTPException: _description_ - Returns: - _description_ - """ - try: - user_org_data: dict = await get_user_and_org_id(request.state.user_id) - access_app = await check_access_to_app( - user_org_data=user_org_data, - app_id=payload.app_id, - check_owner=False, - ) - if not access_app: - error_msg = f"You do not have access to this app: {payload.app_id}" - return JSONResponse( - {"detail": error_msg}, - status_code=400, - ) - app = await db_manager.fetch_app_by_id(app_id=payload.app_id) - - if app is None: - raise HTTPException(status_code=404, detail="App not found") - - new_evaluation_db = await evaluation_service.create_new_human_evaluation( - payload, **user_org_data - ) - print(new_evaluation_db) - return converters.evaluation_db_to_simple_evaluation_output(new_evaluation_db) - except KeyError: - raise HTTPException( - status_code=400, - detail="columns in the test set should match the names of the inputs in the variant", - ) - - @router.post("/") async def create_evaluation( payload: NewEvaluation, @@ -281,35 +239,7 @@ async def create_evaluation_scenario( return Response(status_code=status.HTTP_204_NO_CONTENT) -@router.put( - "/human-evaluations/{evaluation_id}/evaluation_scenario/{evaluation_scenario_id}/{evaluation_type}/" -) -async def update_evaluation_scenario_router( - evaluation_id: str, - evaluation_scenario_id: str, - evaluation_type: EvaluationType, - evaluation_scenario: HumanEvaluationScenarioUpdate, - request: Request, -): - """Updates an evaluation scenario's vote or score based on its type. - - Raises: - HTTPException: If update fails or unauthorized. - Returns: - None: 204 No Content status code upon successful update. - """ - user_org_data = await get_user_and_org_id(request.state.user_id) - try: - await update_human_evaluation_scenario( - evaluation_scenario_id, - evaluation_scenario, - evaluation_type, - **user_org_data, - ) - return Response(status_code=status.HTTP_204_NO_CONTENT) - except UpdateEvaluationScenarioError as e: - raise HTTPException(status_code=500, detail=str(e)) from e @router.post("/evaluation_scenario/ai_critique/", response_model=str) @@ -349,49 +279,6 @@ async def evaluate_ai_critique( raise HTTPException(400, f"Failed to evaluate AI critique: {str(e)}") -@router.get("/human-evaluations/evaluation_scenario/{evaluation_scenario_id}/score/") -async def get_evaluation_scenario_score_router( - evaluation_scenario_id: str, - request: Request, -) -> Dict[str, str]: - """ - Fetch the score of a specific evaluation scenario. - - Args: - evaluation_scenario_id: The ID of the evaluation scenario to fetch. - stoken_session: Session data, verified by `verify_session`. - - Returns: - Dictionary containing the scenario ID and its score. 
- """ - user_org_data = await get_user_and_org_id(request.state.user_id) - return await get_evaluation_scenario_score(evaluation_scenario_id, **user_org_data) - - -@router.put("/human-evaluations/evaluation_scenario/{evaluation_scenario_id}/score/") -async def update_evaluation_scenario_score_router( - evaluation_scenario_id: str, - payload: EvaluationScenarioScoreUpdate, - request: Request, -): - """Updates the score of an evaluation scenario. - - Raises: - HTTPException: Server error if the evaluation update fails. - - Returns: - None: 204 No Content status code upon successful update. - """ - user_org_data = await get_user_and_org_id(request.state.user_id) - try: - await update_evaluation_scenario_score( - evaluation_scenario_id, payload.score, **user_org_data - ) - return Response(status_code=status.HTTP_204_NO_CONTENT) - except Exception as e: - raise HTTPException(status_code=500, detail=str(e)) from e - - @router.get("/", response_model=List[Evaluation]) async def fetch_list_evaluations( app_id: str, @@ -428,41 +315,6 @@ async def fetch_evaluation( return await evaluation_service.fetch_evaluation(evaluation_id, **user_org_data) -@router.get("/human-evaluations/", response_model=List[Evaluation]) -async def fetch_list_evaluations( - app_id: str, - request: Request, -): - """Fetches a list of evaluations, optionally filtered by an app ID. - - Args: - app_id (Optional[str]): An optional app ID to filter the evaluations. - - Returns: - List[Evaluation]: A list of evaluations. - """ - user_org_data = await get_user_and_org_id(request.state.user_id) - return await evaluation_service.fetch_list_evaluations( - app_id=app_id, **user_org_data - ) - -@router.get("/human-evaluations/{evaluation_id}/", response_model=Evaluation) -async def fetch_evaluation( - evaluation_id: str, - request: Request, -): - """Fetches a single evaluation based on its ID. - - Args: - evaluation_id (str): The ID of the evaluation to fetch. - - Returns: - Evaluation: The fetched evaluation. 
- """ - user_org_data = await get_user_and_org_id(request.state.user_id) - return await evaluation_service.fetch_human_evaluation(evaluation_id, **user_org_data) - - @router.delete("/", response_model=List[str]) async def delete_evaluations( delete_evaluations: DeleteEvaluation, diff --git a/agenta-backend/agenta_backend/routers/human_evaluation_router.py b/agenta-backend/agenta_backend/routers/human_evaluation_router.py new file mode 100644 index 0000000000..de05a46683 --- /dev/null +++ b/agenta-backend/agenta_backend/routers/human_evaluation_router.py @@ -0,0 +1,183 @@ +import os +import secrets +from typing import List, Dict + +from fastapi.responses import JSONResponse +from fastapi.encoders import jsonable_encoder +from fastapi import HTTPException, APIRouter, Body, Request, status, Response + +from agenta_backend.models.api.evaluation_model import ( + EvaluationScenarioScoreUpdate, + HumanEvaluation, + HumanEvaluationScenarioUpdate, + EvaluationType, + NewHumanEvaluation, + SimpleEvaluationOutput, +) + +from agenta_backend.services import evaluation_service +from agenta_backend.utils.common import check_access_to_app +from agenta_backend.services import db_manager +from agenta_backend.models import converters +from agenta_backend.services import results_service +from agenta_backend.tasks.evaluations import evaluate + + +if os.environ["FEATURE_FLAG"] in ["cloud", "ee"]: + from agenta_backend.commons.services.selectors import ( # noqa pylint: disable-all + get_user_and_org_id, + ) +else: + from agenta_backend.services.selectors import get_user_and_org_id + +router = APIRouter() + + +@router.post( + "/", response_model=SimpleEvaluationOutput, operation_id="create_evaluation" +) +async def create_evaluation( + payload: NewHumanEvaluation, + request: Request, +): + """Creates a new comparison table document + Raises: + HTTPException: _description_ + Returns: + _description_ + """ + try: + user_org_data: dict = await get_user_and_org_id(request.state.user_id) + access_app = await check_access_to_app( + user_org_data=user_org_data, + app_id=payload.app_id, + check_owner=False, + ) + if not access_app: + error_msg = f"You do not have access to this app: {payload.app_id}" + return JSONResponse( + {"detail": error_msg}, + status_code=400, + ) + app = await db_manager.fetch_app_by_id(app_id=payload.app_id) + + if app is None: + raise HTTPException(status_code=404, detail="App not found") + + new_evaluation_db = await evaluation_service.create_new_human_evaluation( + payload, **user_org_data + ) + print(new_evaluation_db) + return converters.evaluation_db_to_simple_evaluation_output(new_evaluation_db) + except KeyError: + raise HTTPException( + status_code=400, + detail="columns in the test set should match the names of the inputs in the variant", + ) + +@router.get("/", response_model=List[HumanEvaluation]) +async def fetch_list_human_evaluations( + app_id: str, + request: Request, +): + """Fetches a list of evaluations, optionally filtered by an app ID. + + Args: + app_id (Optional[str]): An optional app ID to filter the evaluations. + + Returns: + List[HumanEvaluation]: A list of evaluations. + """ + user_org_data = await get_user_and_org_id(request.state.user_id) + return await evaluation_service.fetch_list_human_evaluations( + app_id=app_id, **user_org_data + ) + +@router.get("/{evaluation_id}/", response_model=HumanEvaluation) +async def fetch_human_evaluation( + evaluation_id: str, + request: Request, +): + """Fetches a single evaluation based on its ID. 
+ + Args: + evaluation_id (str): The ID of the evaluation to fetch. + + Returns: + HumanEvaluation: The fetched evaluation. + """ + user_org_data = await get_user_and_org_id(request.state.user_id) + return await evaluation_service.fetch_human_evaluation(evaluation_id, **user_org_data) + +@router.put( + "/{evaluation_id}/evaluation_scenario/{evaluation_scenario_id}/{evaluation_type}/" +) +async def update_evaluation_scenario_router( + evaluation_id: str, + evaluation_scenario_id: str, + evaluation_type: EvaluationType, + evaluation_scenario: HumanEvaluationScenarioUpdate, + request: Request, +): + """Updates an evaluation scenario's vote or score based on its type. + + Raises: + HTTPException: If update fails or unauthorized. + + Returns: + None: 204 No Content status code upon successful update. + """ + user_org_data = await get_user_and_org_id(request.state.user_id) + try: + await update_human_evaluation_scenario( + evaluation_scenario_id, + evaluation_scenario, + evaluation_type, + **user_org_data, + ) + return Response(status_code=status.HTTP_204_NO_CONTENT) + except UpdateEvaluationScenarioError as e: + raise HTTPException(status_code=500, detail=str(e)) from e + + +@router.get("/evaluation_scenario/{evaluation_scenario_id}/score/") +async def get_evaluation_scenario_score_router( + evaluation_scenario_id: str, + request: Request, +) -> Dict[str, str]: + """ + Fetch the score of a specific evaluation scenario. + + Args: + evaluation_scenario_id: The ID of the evaluation scenario to fetch. + stoken_session: Session data, verified by `verify_session`. + + Returns: + Dictionary containing the scenario ID and its score. + """ + user_org_data = await get_user_and_org_id(request.state.user_id) + return await get_evaluation_scenario_score(evaluation_scenario_id, **user_org_data) + + +@router.put("/evaluation_scenario/{evaluation_scenario_id}/score/") +async def update_evaluation_scenario_score_router( + evaluation_scenario_id: str, + payload: EvaluationScenarioScoreUpdate, + request: Request, +): + """Updates the score of an evaluation scenario. + + Raises: + HTTPException: Server error if the evaluation update fails. + + Returns: + None: 204 No Content status code upon successful update. 
+ """ + user_org_data = await get_user_and_org_id(request.state.user_id) + try: + await update_evaluation_scenario_score( + evaluation_scenario_id, payload.score, **user_org_data + ) + return Response(status_code=status.HTTP_204_NO_CONTENT) + except Exception as e: + raise HTTPException(status_code=500, detail=str(e)) from e \ No newline at end of file diff --git a/agenta-backend/agenta_backend/services/evaluation_service.py b/agenta-backend/agenta_backend/services/evaluation_service.py index 6a0babb368..ee99a1f2e4 100644 --- a/agenta-backend/agenta_backend/services/evaluation_service.py +++ b/agenta-backend/agenta_backend/services/evaluation_service.py @@ -523,7 +523,7 @@ async def fetch_list_human_evaluations( HumanEvaluationDB, HumanEvaluationDB.app == ObjectId(app_id) ) return [ - await converters.evaluation_db_to_pydantic(evaluation) + await converters.human_evaluation_db_to_pydantic(evaluation) for evaluation in evaluations_db ] @@ -542,7 +542,7 @@ async def fetch_human_evaluation(evaluation_id: str, **user_org_data: dict) -> H evaluation = await _fetch_human_evaluation_scenario_and_check_access( evaluation_id=evaluation_id, **user_org_data ) - return await converters.evaluation_db_to_pydantic(evaluation) + return await converters.human_evaluation_db_to_pydantic(evaluation) async def delete_evaluations(evaluation_ids: List[str], **user_org_data: dict) -> None: diff --git a/agenta-web/src/components/Evaluations/HumanEvaluationResult.tsx b/agenta-web/src/components/Evaluations/HumanEvaluationResult.tsx index 7d6aa7b50b..80e8905946 100644 --- a/agenta-web/src/components/Evaluations/HumanEvaluationResult.tsx +++ b/agenta-web/src/components/Evaluations/HumanEvaluationResult.tsx @@ -99,12 +99,12 @@ export default function HumanEvaluationResult() { const fetchEvaluations = async () => { try { fetchData( - `${getAgentaApiUrl()}/api/evaluations/human-evaluations/?app_id=${app_id}`, + `${getAgentaApiUrl()}/api/human-evaluations/?app_id=${app_id}`, ) .then((response) => { const fetchPromises = response.map((item: EvaluationResponseType) => { return fetchData( - `${getAgentaApiUrl()}/api/evaluations/human-evaluations/${ + `${getAgentaApiUrl()}/api/human-evaluations/${ item.id }/results/`, ) diff --git a/agenta-web/src/lib/services/api.ts b/agenta-web/src/lib/services/api.ts index d6207b2976..0ff9fc5859 100644 --- a/agenta-web/src/lib/services/api.ts +++ b/agenta-web/src/lib/services/api.ts @@ -289,7 +289,7 @@ export const deleteTestsets = async (ids: string[]) => { export const loadEvaluations = async (appId: string) => { return await axios - .get(`${getAgentaApiUrl()}/api/evaluations/human-evaluations/?app_id=${appId}`) + .get(`${getAgentaApiUrl()}/api/human-evaluations/?app_id=${appId}`) .then((responseData) => { const evaluations = responseData.data.map((item: EvaluationResponseType) => { return fromEvaluationResponseToEvaluation(item) @@ -301,7 +301,7 @@ export const loadEvaluations = async (appId: string) => { export const loadEvaluation = async (evaluationId: string) => { return await axios - .get(`${getAgentaApiUrl()}/api/evaluations/human-evaluations/${evaluationId}/`) + .get(`${getAgentaApiUrl()}/api/human-evaluations/${evaluationId}/`) .then((responseData) => { return fromEvaluationResponseToEvaluation(responseData.data) }) @@ -310,7 +310,7 @@ export const loadEvaluation = async (evaluationId: string) => { export const deleteEvaluations = async (ids: string[]) => { const response = await axios({ method: "delete", - url: `${getAgentaApiUrl()}/api/evaluations/human-evaluations/`, + url: 
`${getAgentaApiUrl()}/api/human-evaluations/`, data: {evaluations_ids: ids}, }) return response.data @@ -322,7 +322,7 @@ export const loadEvaluationsScenarios = async ( ) => { return await axios .get( - `${getAgentaApiUrl()}/api/evaluations/human-evaluations/${evaluationTableId}/evaluation_scenarios/`, + `${getAgentaApiUrl()}/api/human-evaluations/${evaluationTableId}/evaluation_scenarios/`, ) .then((responseData) => { const evaluationsRows = responseData.data.map((item: any) => { @@ -370,7 +370,7 @@ export const createNewEvaluation = async ( } const response = await axios.post( - `${getAgentaApiUrl()}/api/evaluations/human-evaluations/`, + `${getAgentaApiUrl()}/api/human-evaluations/`, data, { _ignoreError: ignoreAxiosError, @@ -381,7 +381,7 @@ export const createNewEvaluation = async ( export const updateEvaluation = async (evaluationId: string, data: GenericObject) => { const response = await axios.put( - `${getAgentaApiUrl()}/api/evaluations/human-evaluations/${evaluationId}/`, + `${getAgentaApiUrl()}/api/human-evaluations/${evaluationId}/`, data, ) return response.data @@ -394,7 +394,7 @@ export const updateEvaluationScenario = async ( evaluationType: EvaluationType, ) => { const response = await axios.put( - `${getAgentaApiUrl()}/api/evaluations/human-evaluations/${evaluationTableId}/evaluation_scenario/${evaluationScenarioId}/${evaluationType}/`, + `${getAgentaApiUrl()}/api/human-evaluations/${evaluationTableId}/evaluation_scenario/${evaluationScenarioId}/${evaluationType}/`, data, ) return response.data @@ -402,7 +402,7 @@ export const updateEvaluationScenario = async ( export const postEvaluationScenario = async (evaluationTableId: string, data: GenericObject) => { const response = await axios.post( - `${getAgentaApiUrl()}/api/evaluations/human-evaluations/${evaluationTableId}/evaluation_scenario/`, + `${getAgentaApiUrl()}/api/human-evaluations/${evaluationTableId}/evaluation_scenario/`, data, ) return response.data @@ -413,7 +413,7 @@ export const evaluateAICritiqueForEvalScenario = async ( ignoreAxiosError: boolean = false, ) => { const response = await axios.post( - `${getAgentaApiUrl()}/api/evaluations/human-evaluations/evaluation_scenario/ai_critique/`, + `${getAgentaApiUrl()}/api/human-evaluations/evaluation_scenario/ai_critique/`, data, {_ignoreError: ignoreAxiosError} as any, ) @@ -422,14 +422,14 @@ export const evaluateAICritiqueForEvalScenario = async ( export const fetchEvaluationResults = async (evaluationId: string) => { const response = await axios.get( - `${getAgentaApiUrl()}/api/evaluations/human-evaluations/${evaluationId}/results/`, + `${getAgentaApiUrl()}/api/human-evaluations/${evaluationId}/results/`, ) return response.data } export const fetchEvaluationScenarioResults = async (evaluation_scenario_id: string) => { const response = await axios.get( - `${getAgentaApiUrl()}/api/evaluations/human-evaluations/evaluation_scenario/${evaluation_scenario_id}/score/`, + `${getAgentaApiUrl()}/api/human-evaluations/evaluation_scenario/${evaluation_scenario_id}/score/`, ) return response } @@ -439,7 +439,7 @@ export const saveCustomCodeEvaluation = async ( ignoreAxiosError: boolean = false, ) => { const response = await axios.post( - `${getAgentaApiUrl()}/api/evaluations/human-evaluations/custom_evaluation/`, + `${getAgentaApiUrl()}/api/human-evaluations/custom_evaluation/`, payload, {_ignoreError: ignoreAxiosError} as any, ) @@ -452,7 +452,7 @@ export const editCustomEvaluationDetail = async ( ignoreAxiosError: boolean = false, ) => { const response = await axios.put( - 
`${getAgentaApiUrl()}/api/evaluations/human-evaluations/custom_evaluation/${id}`, + `${getAgentaApiUrl()}/api/human-evaluations/custom_evaluation/${id}`, payload, {_ignoreError: ignoreAxiosError} as any, ) @@ -461,7 +461,7 @@ export const editCustomEvaluationDetail = async ( export const fetchCustomEvaluations = async (app_id: string, ignoreAxiosError: boolean = false) => { const response = await axios.get( - `${getAgentaApiUrl()}/api/evaluations/human-evaluations/custom_evaluation/list/${app_id}/`, + `${getAgentaApiUrl()}/api/human-evaluations/custom_evaluation/list/${app_id}/`, {_ignoreError: ignoreAxiosError} as any, ) return response @@ -472,7 +472,7 @@ export const fetchCustomEvaluationDetail = async ( ignoreAxiosError: boolean = false, ) => { const response = await axios.get( - `${getAgentaApiUrl()}/api/evaluations/human-evaluations/custom_evaluation/${id}/`, + `${getAgentaApiUrl()}/api/human-evaluations/custom_evaluation/${id}/`, {_ignoreError: ignoreAxiosError} as any, ) return response.data @@ -483,7 +483,7 @@ export const fetchCustomEvaluationNames = async ( ignoreAxiosError: boolean = false, ) => { const response = await axios.get( - `${getAgentaApiUrl()}/api/evaluations/human-evaluations/custom_evaluation/${app_id}/names/`, + `${getAgentaApiUrl()}/api/human-evaluations/custom_evaluation/${app_id}/names/`, {_ignoreError: ignoreAxiosError} as any, ) return response @@ -494,7 +494,7 @@ export const executeCustomEvaluationCode = async ( ignoreAxiosError: boolean = false, ) => { const response = await axios.post( - `${getAgentaApiUrl()}/api/evaluations/human-evaluations/custom_evaluation/execute/${ + `${getAgentaApiUrl()}/api/human-evaluations/custom_evaluation/execute/${ payload.evaluation_id }/`, payload, @@ -509,7 +509,7 @@ export const updateEvaluationScenarioScore = async ( ignoreAxiosError: boolean = false, ) => { const response = await axios.put( - `${getAgentaApiUrl()}/api/evaluations/human-evaluations/evaluation_scenario/${evaluation_scenario_id}/score/`, + `${getAgentaApiUrl()}/api/human-evaluations/evaluation_scenario/${evaluation_scenario_id}/score/`, {score}, {_ignoreError: ignoreAxiosError} as any, ) From d83650611235d5abaeb472880cf66c1ece8c702e Mon Sep 17 00:00:00 2001 From: Akrem Abayed Date: Tue, 2 Jan 2024 14:46:35 +0100 Subject: [PATCH 214/414] more human evals resources --- .../routers/evaluation_router.py | 63 ------------------- .../routers/human_evaluation_router.py | 58 ++++++++++++++++- .../services/evaluation_service.py | 3 - .../services/results_service.py | 25 ++------ 4 files changed, 61 insertions(+), 88 deletions(-) diff --git a/agenta-backend/agenta_backend/routers/evaluation_router.py b/agenta-backend/agenta_backend/routers/evaluation_router.py index 452f8e015d..49acc45c2b 100644 --- a/agenta-backend/agenta_backend/routers/evaluation_router.py +++ b/agenta-backend/agenta_backend/routers/evaluation_router.py @@ -239,9 +239,6 @@ async def create_evaluation_scenario( return Response(status_code=status.HTTP_204_NO_CONTENT) - - - @router.post("/evaluation_scenario/ai_critique/", response_model=str) async def evaluate_ai_critique( payload: AICritiqueCreate, @@ -338,66 +335,6 @@ async def delete_evaluations( return Response(status_code=status.HTTP_204_NO_CONTENT) -@router.get("/{evaluation_id}/results/") -async def fetch_results( - evaluation_id: str, - request: Request, -): - """Fetch all the results for one the comparison table - - Arguments: - evaluation_id -- _description_ - - Returns: - _description_ - """ - - # Get user and organization id - 
user_org_data: dict = await get_user_and_org_id(request.state.user_id) - evaluation = await evaluation_service._fetch_evaluation_and_check_access( - evaluation_id, **user_org_data - ) - if evaluation.evaluation_type == EvaluationType.human_a_b_testing: - results = await results_service.fetch_results_for_evaluation(evaluation) - return {"votes_data": results} - - elif evaluation.evaluation_type == EvaluationType.auto_exact_match: - results = await results_service.fetch_results_for_evaluation(evaluation) - return {"scores_data": results} - - elif evaluation.evaluation_type == EvaluationType.auto_similarity_match: - results = await results_service.fetch_results_for_evaluation(evaluation) - return {"scores_data": results} - - elif evaluation.evaluation_type == EvaluationType.auto_regex_test: - results = await results_service.fetch_results_for_evaluation(evaluation) - return {"scores_data": results} - - elif evaluation.evaluation_type == EvaluationType.auto_webhook_test: - results = await results_service.fetch_results_for_auto_ai_critique( - evaluation_id - ) - return {"results_data": results} - - elif evaluation.evaluation_type == EvaluationType.single_model_test: - results = await results_service.fetch_results_for_auto_ai_critique( - evaluation_id - ) - return {"results_data": results} - - elif evaluation.evaluation_type == EvaluationType.auto_ai_critique: - results = await results_service.fetch_results_for_auto_ai_critique( - evaluation_id - ) - return {"results_data": results} - - elif evaluation.evaluation_type == EvaluationType.custom_code_run: - results = await results_service.fetch_average_score_for_custom_code_run( - evaluation_id - ) - return {"avg_score": results} - - @router.post("/custom_evaluation/") async def create_custom_evaluation( custom_evaluation_payload: CreateCustomEvaluation, diff --git a/agenta-backend/agenta_backend/routers/human_evaluation_router.py b/agenta-backend/agenta_backend/routers/human_evaluation_router.py index de05a46683..37195627f7 100644 --- a/agenta-backend/agenta_backend/routers/human_evaluation_router.py +++ b/agenta-backend/agenta_backend/routers/human_evaluation_router.py @@ -7,6 +7,7 @@ from fastapi import HTTPException, APIRouter, Body, Request, status, Response from agenta_backend.models.api.evaluation_model import ( + DeleteEvaluation, EvaluationScenarioScoreUpdate, HumanEvaluation, HumanEvaluationScenarioUpdate, @@ -180,4 +181,59 @@ async def update_evaluation_scenario_score_router( ) return Response(status_code=status.HTTP_204_NO_CONTENT) except Exception as e: - raise HTTPException(status_code=500, detail=str(e)) from e \ No newline at end of file + raise HTTPException(status_code=500, detail=str(e)) from e + + +@router.get("/{evaluation_id}/results/", operation_id="fetch_results") +async def fetch_results( + evaluation_id: str, + request: Request, +): + """Fetch all the results for one the comparison table + + Arguments: + evaluation_id -- _description_ + + Returns: + _description_ + """ + + # Get user and organization id + print("are we here") + user_org_data: dict = await get_user_and_org_id(request.state.user_id) + evaluation = await evaluation_service._fetch_human_evaluation_scenario_and_check_access( + evaluation_id, **user_org_data + ) + print("really???") + if evaluation.evaluation_type == EvaluationType.human_a_b_testing: + results = await results_service.fetch_results_for_evaluation(evaluation) + return {"votes_data": results} + + elif evaluation.evaluation_type == EvaluationType.single_model_test: + results = await 
results_service.fetch_results_for_single_model_test( + evaluation_id + ) + return {"results_data": results} + + +@router.delete("/", response_model=List[str]) +async def delete_evaluations( + delete_evaluations: DeleteEvaluation, + request: Request, +): + """ + Delete specific comparison tables based on their unique IDs. + + Args: + delete_evaluations (List[str]): The unique identifiers of the comparison tables to delete. + + Returns: + A list of the deleted comparison tables' IDs. + """ + + # Get user and organization id + user_org_data: dict = await get_user_and_org_id(request.state.user_id) + await evaluation_service.delete_evaluations( + delete_evaluations.evaluations_ids, **user_org_data + ) + return Response(status_code=status.HTTP_204_NO_CONTENT) \ No newline at end of file diff --git a/agenta-backend/agenta_backend/services/evaluation_service.py b/agenta-backend/agenta_backend/services/evaluation_service.py index ee99a1f2e4..f22a02961e 100644 --- a/agenta-backend/agenta_backend/services/evaluation_service.py +++ b/agenta-backend/agenta_backend/services/evaluation_service.py @@ -934,9 +934,6 @@ async def create_new_evaluation( return await converters.evaluation_db_to_pydantic(evaluation_db) - - - async def retrieve_evaluation_results( evaluation_id: str, **user_org_data: dict ) -> List[dict]: diff --git a/agenta-backend/agenta_backend/services/results_service.py b/agenta-backend/agenta_backend/services/results_service.py index 28b2b0ec2b..88695ef87c 100644 --- a/agenta-backend/agenta_backend/services/results_service.py +++ b/agenta-backend/agenta_backend/services/results_service.py @@ -1,15 +1,15 @@ from agenta_backend.utils.common import engine from agenta_backend.services.db_manager import query -from agenta_backend.models.db_models import EvaluationScenarioDB, EvaluationDB +from agenta_backend.models.db_models import EvaluationScenarioDB, EvaluationDB, HumanEvaluationDB, HumanEvaluationScenarioDB from agenta_backend.services import evaluation_service from agenta_backend.services import db_manager from agenta_backend.models.api.evaluation_model import EvaluationType from bson import ObjectId -async def fetch_results_for_evaluation(evaluation: EvaluationDB): +async def fetch_results_for_evaluation(evaluation: HumanEvaluationDB): evaluation_scenarios = await engine.find( - EvaluationScenarioDB, EvaluationScenarioDB.evaluation == ObjectId(evaluation.id) + HumanEvaluationScenarioDB, HumanEvaluationScenarioDB.evaluation == ObjectId(evaluation.id) ) results = {} @@ -28,24 +28,7 @@ async def fetch_results_for_evaluation(evaluation: EvaluationDB): results.update( await _compute_stats_for_human_a_b_testing_evaluation(evaluation_scenarios) ) - elif evaluation.evaluation_type == EvaluationType.auto_exact_match: - results.update( - await _compute_stats_for_evaluation( - evaluation_scenarios, classes=["correct", "wrong"] - ) - ) - elif evaluation.evaluation_type == EvaluationType.auto_similarity_match: - results.update( - await _compute_stats_for_evaluation( - evaluation_scenarios, classes=["true", "false"] - ) - ) - elif evaluation.evaluation_type == EvaluationType.auto_regex_test: - results.update( - await _compute_stats_for_evaluation( - evaluation_scenarios, classes=["correct", "wrong"] - ) - ) + return results From 7b55fb04d5bb9a67352b01546e565c0f94734263 Mon Sep 17 00:00:00 2001 From: Akrem Abayed Date: Tue, 2 Jan 2024 14:47:42 +0100 Subject: [PATCH 215/414] format --- .../models/api/annotation_models.py | 1 - .../agenta_backend/models/api/evaluation_model.py | 1 + 
.../agenta_backend/models/converters.py | 7 ++++--- .../agenta_backend/routers/annotations_router.py | 8 ++++++-- .../agenta_backend/routers/evaluation_router.py | 2 +- .../routers/human_evaluation_router.py | 15 +++++++++++---- .../agenta_backend/services/annotation_manager.py | 4 +++- .../agenta_backend/services/evaluation_service.py | 4 +++- .../agenta_backend/services/results_service.py | 10 ++++++++-- .../agenta_backend/tasks/evaluations.py | 9 ++------- 10 files changed, 39 insertions(+), 22 deletions(-) diff --git a/agenta-backend/agenta_backend/models/api/annotation_models.py b/agenta-backend/agenta_backend/models/api/annotation_models.py index 0744fa6d97..5238cbee5c 100644 --- a/agenta-backend/agenta_backend/models/api/annotation_models.py +++ b/agenta-backend/agenta_backend/models/api/annotation_models.py @@ -55,4 +55,3 @@ class AnnotationScenario(BaseModel): is_pinned: Optional[bool] note: Optional[str] result: AnnotationScenarioResult - diff --git a/agenta-backend/agenta_backend/models/api/evaluation_model.py b/agenta-backend/agenta_backend/models/api/evaluation_model.py index 13b13522e5..b8716c6d33 100644 --- a/agenta-backend/agenta_backend/models/api/evaluation_model.py +++ b/agenta-backend/agenta_backend/models/api/evaluation_model.py @@ -109,6 +109,7 @@ class EvaluationScenarioOutput(BaseModel): type: str value: Any + class HumanEvaluation(BaseModel): id: str app_id: str diff --git a/agenta-backend/agenta_backend/models/converters.py b/agenta-backend/agenta_backend/models/converters.py index 721ce2d041..a814cbf932 100644 --- a/agenta-backend/agenta_backend/models/converters.py +++ b/agenta-backend/agenta_backend/models/converters.py @@ -399,15 +399,16 @@ def annotation_scenario_db_to_pydantic( id=str(annotation_scenario_db.id), annotation_id=str(annotation_scenario_db.annotation_id), inputs=[ - AnnotationScenarioInput(**input_dict.dict()) for input_dict in annotation_scenario_db.inputs + AnnotationScenarioInput(**input_dict.dict()) + for input_dict in annotation_scenario_db.inputs ], outputs=[ - AnnotationScenarioOutput(**output_dict.dict()) for output_dict in annotation_scenario_db.outputs + AnnotationScenarioOutput(**output_dict.dict()) + for output_dict in annotation_scenario_db.outputs ], is_pinned=annotation_scenario_db.is_pinned, note=annotation_scenario_db.note, result=AnnotationScenarioResult(**annotation_scenario_db.result.dict()), - created_at=annotation_scenario_db.created_at, updated_at=annotation_scenario_db.updated_at, ) diff --git a/agenta-backend/agenta_backend/routers/annotations_router.py b/agenta-backend/agenta_backend/routers/annotations_router.py index 24a423c42c..5a3ae24964 100644 --- a/agenta-backend/agenta_backend/routers/annotations_router.py +++ b/agenta-backend/agenta_backend/routers/annotations_router.py @@ -114,7 +114,9 @@ async def fetch_annotation( return await annotation_manager.fetch_annotation(annotation_id, **user_org_data) -@router.get("/{annotation_id}/annotations_scenarios/", response_model=List[AnnotationScenario]) +@router.get( + "/{annotation_id}/annotations_scenarios/", response_model=List[AnnotationScenario] +) async def fetch_annotations_scenarios( annotation_id: str, request: Request, @@ -128,7 +130,9 @@ async def fetch_annotations_scenarios( Annotation: The fetched annotation. 
""" user_org_data = await get_user_and_org_id(request.state.user_id) - return await annotation_manager.fetch_annotations_scenarios(annotation_id, **user_org_data) + return await annotation_manager.fetch_annotations_scenarios( + annotation_id, **user_org_data + ) @router.put("/{annotation_id}/annotations_scenarios/{annotation_scenario_id}/") diff --git a/agenta-backend/agenta_backend/routers/evaluation_router.py b/agenta-backend/agenta_backend/routers/evaluation_router.py index 49acc45c2b..ce38e767f9 100644 --- a/agenta-backend/agenta_backend/routers/evaluation_router.py +++ b/agenta-backend/agenta_backend/routers/evaluation_router.py @@ -95,7 +95,7 @@ async def create_evaluation( "app_id": payload.app_id, "variant_ids": [variant_id], # Only this variant ID "evaluators_configs": payload.evaluators_configs, - "testset_id": payload.testset_id + "testset_id": payload.testset_id, } evaluation = await evaluation_service.create_new_evaluation( diff --git a/agenta-backend/agenta_backend/routers/human_evaluation_router.py b/agenta-backend/agenta_backend/routers/human_evaluation_router.py index 37195627f7..545f12a276 100644 --- a/agenta-backend/agenta_backend/routers/human_evaluation_router.py +++ b/agenta-backend/agenta_backend/routers/human_evaluation_router.py @@ -76,6 +76,7 @@ async def create_evaluation( detail="columns in the test set should match the names of the inputs in the variant", ) + @router.get("/", response_model=List[HumanEvaluation]) async def fetch_list_human_evaluations( app_id: str, @@ -94,6 +95,7 @@ async def fetch_list_human_evaluations( app_id=app_id, **user_org_data ) + @router.get("/{evaluation_id}/", response_model=HumanEvaluation) async def fetch_human_evaluation( evaluation_id: str, @@ -108,7 +110,10 @@ async def fetch_human_evaluation( HumanEvaluation: The fetched evaluation. 
""" user_org_data = await get_user_and_org_id(request.state.user_id) - return await evaluation_service.fetch_human_evaluation(evaluation_id, **user_org_data) + return await evaluation_service.fetch_human_evaluation( + evaluation_id, **user_org_data + ) + @router.put( "/{evaluation_id}/evaluation_scenario/{evaluation_scenario_id}/{evaluation_type}/" @@ -201,8 +206,10 @@ async def fetch_results( # Get user and organization id print("are we here") user_org_data: dict = await get_user_and_org_id(request.state.user_id) - evaluation = await evaluation_service._fetch_human_evaluation_scenario_and_check_access( - evaluation_id, **user_org_data + evaluation = ( + await evaluation_service._fetch_human_evaluation_scenario_and_check_access( + evaluation_id, **user_org_data + ) ) print("really???") if evaluation.evaluation_type == EvaluationType.human_a_b_testing: @@ -236,4 +243,4 @@ async def delete_evaluations( await evaluation_service.delete_evaluations( delete_evaluations.evaluations_ids, **user_org_data ) - return Response(status_code=status.HTTP_204_NO_CONTENT) \ No newline at end of file + return Response(status_code=status.HTTP_204_NO_CONTENT) diff --git a/agenta-backend/agenta_backend/services/annotation_manager.py b/agenta-backend/agenta_backend/services/annotation_manager.py index 3ea525101d..35d73dbd31 100644 --- a/agenta-backend/agenta_backend/services/annotation_manager.py +++ b/agenta-backend/agenta_backend/services/annotation_manager.py @@ -197,7 +197,9 @@ async def create_annotation_scenario( await engine.save(new_annotation_scenario) -async def fetch_annotations_scenarios(annotation_id: str, **user_org_data: dict) -> [AnnotationScenario]: +async def fetch_annotations_scenarios( + annotation_id: str, **user_org_data: dict +) -> [AnnotationScenario]: """ Fetches a single annotation based on its ID. diff --git a/agenta-backend/agenta_backend/services/evaluation_service.py b/agenta-backend/agenta_backend/services/evaluation_service.py index f22a02961e..eab542ebdb 100644 --- a/agenta-backend/agenta_backend/services/evaluation_service.py +++ b/agenta-backend/agenta_backend/services/evaluation_service.py @@ -528,7 +528,9 @@ async def fetch_list_human_evaluations( ] -async def fetch_human_evaluation(evaluation_id: str, **user_org_data: dict) -> HumanEvaluation: +async def fetch_human_evaluation( + evaluation_id: str, **user_org_data: dict +) -> HumanEvaluation: """ Fetches a single evaluation based on its ID. 
diff --git a/agenta-backend/agenta_backend/services/results_service.py b/agenta-backend/agenta_backend/services/results_service.py index 88695ef87c..9876c6ac29 100644 --- a/agenta-backend/agenta_backend/services/results_service.py +++ b/agenta-backend/agenta_backend/services/results_service.py @@ -1,6 +1,11 @@ from agenta_backend.utils.common import engine from agenta_backend.services.db_manager import query -from agenta_backend.models.db_models import EvaluationScenarioDB, EvaluationDB, HumanEvaluationDB, HumanEvaluationScenarioDB +from agenta_backend.models.db_models import ( + EvaluationScenarioDB, + EvaluationDB, + HumanEvaluationDB, + HumanEvaluationScenarioDB, +) from agenta_backend.services import evaluation_service from agenta_backend.services import db_manager from agenta_backend.models.api.evaluation_model import EvaluationType @@ -9,7 +14,8 @@ async def fetch_results_for_evaluation(evaluation: HumanEvaluationDB): evaluation_scenarios = await engine.find( - HumanEvaluationScenarioDB, HumanEvaluationScenarioDB.evaluation == ObjectId(evaluation.id) + HumanEvaluationScenarioDB, + HumanEvaluationScenarioDB.evaluation == ObjectId(evaluation.id), ) results = {} diff --git a/agenta-backend/agenta_backend/tasks/evaluations.py b/agenta-backend/agenta_backend/tasks/evaluations.py index 7c84ac3dae..ceb92493cc 100644 --- a/agenta-backend/agenta_backend/tasks/evaluations.py +++ b/agenta-backend/agenta_backend/tasks/evaluations.py @@ -39,7 +39,6 @@ def evaluate( new_evaluation_db = loop.run_until_complete(fetch_evaluation_by_id(evaluation_id)) evaluators_aggregated_data = defaultdict(list) - variant_id = str(evaluation.variant_ids[0]) app_variant_db = loop.run_until_complete(fetch_app_variant_by_id(variant_id)) @@ -99,9 +98,7 @@ def evaluate( result=result, ) evaluators_results.append(result_object) - evaluators_aggregated_data[evaluator_config.evaluator_key].append( - result - ) + evaluators_aggregated_data[evaluator_config.evaluator_key].append(result) # 4. 
We create a new evaluation scenario evaluation_scenario = loop.run_until_complete( @@ -115,9 +112,7 @@ def evaluate( is_pinned=False, note="", correct_answer=data_point["correct_answer"], - outputs=[ - EvaluationScenarioOutputDB(type="text", value=variant_output) - ], + outputs=[EvaluationScenarioOutputDB(type="text", value=variant_output)], results=evaluators_results, ) ) From f04a995fad1dc8370f47c782345d8cb4700d0230 Mon Sep 17 00:00:00 2001 From: Abram Date: Tue, 2 Jan 2024 13:54:51 +0100 Subject: [PATCH 216/414] Cleanup - added extra-hosts to celery_worker compose service --- agenta-backend/agenta_backend/tasks/annotations.py | 2 -- docker-compose.yml | 2 ++ 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/agenta-backend/agenta_backend/tasks/annotations.py b/agenta-backend/agenta_backend/tasks/annotations.py index 37a81da0e7..a5c4be6a34 100644 --- a/agenta-backend/agenta_backend/tasks/annotations.py +++ b/agenta-backend/agenta_backend/tasks/annotations.py @@ -35,8 +35,6 @@ def prepare_scenarios( new_annotation_db = loop.run_until_complete(fetch_annotation_by_id(annotation_id)) for variant_id in annotation.variants_ids: - variant_id = str(variant_id) - app_variant_db = loop.run_until_complete(fetch_app_variant_by_id(variant_id)) deployment = loop.run_until_complete( get_deployment_by_objectid(app_variant_db.base.deployment) diff --git a/docker-compose.yml b/docker-compose.yml index 42cc4eefc6..13e7f93d72 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -145,6 +145,8 @@ services: depends_on: - rabbitmq - redis + extra_hosts: + - "host.docker.internal:host-gateway" networks: - agenta-network From 76b2326622bd09a1ffb0b0d99e12b654612eda3e Mon Sep 17 00:00:00 2001 From: Akrem Abayed Date: Tue, 2 Jan 2024 15:07:03 +0100 Subject: [PATCH 217/414] remove annotations --- .../agenta_backend/celery_config.py | 5 - agenta-backend/agenta_backend/main.py | 2 - .../models/api/annotation_models.py | 57 ---- .../agenta_backend/models/converters.py | 43 --- .../agenta_backend/models/db_models.py | 53 ---- .../routers/annotations_router.py | 160 ----------- .../services/annotation_manager.py | 248 ------------------ .../agenta_backend/services/db_manager.py | 175 ------------ .../agenta_backend/tasks/annotations.py | 83 ------ 9 files changed, 826 deletions(-) delete mode 100644 agenta-backend/agenta_backend/models/api/annotation_models.py delete mode 100644 agenta-backend/agenta_backend/routers/annotations_router.py delete mode 100644 agenta-backend/agenta_backend/services/annotation_manager.py delete mode 100644 agenta-backend/agenta_backend/tasks/annotations.py diff --git a/agenta-backend/agenta_backend/celery_config.py b/agenta-backend/agenta_backend/celery_config.py index 1b643786a9..df11dad091 100644 --- a/agenta-backend/agenta_backend/celery_config.py +++ b/agenta-backend/agenta_backend/celery_config.py @@ -14,9 +14,4 @@ Exchange("agenta_backend.tasks.evaluations.evaluate"), routing_key="agenta_backend.tasks.evaluations.evaluate", ), - Queue( - "agenta_backend.tasks.annotations.prepare_scenarios", - Exchange("agenta_backend.tasks.annotations.prepare_scenarios"), - routing_key="agenta_backend.tasks.annotations.prepare_scenarios", - ), ) diff --git a/agenta-backend/agenta_backend/main.py b/agenta-backend/agenta_backend/main.py index 1b59e6b8df..568895aa10 100644 --- a/agenta-backend/agenta_backend/main.py +++ b/agenta-backend/agenta_backend/main.py @@ -8,7 +8,6 @@ app_router, container_router, environment_router, - annotations_router, evaluation_router, human_evaluation_router, 
evaluators_router, @@ -79,7 +78,6 @@ async def lifespan(application: FastAPI, cache=True): app.include_router(user_profile.router, prefix="/profile") app.include_router(app_router.router, prefix="/apps") app.include_router(variants_router.router, prefix="/variants") -app.include_router(annotations_router.router, prefix="/annotations") app.include_router(evaluation_router.router, prefix="/evaluations") app.include_router(human_evaluation_router.router, prefix="/human-evaluations") app.include_router(evaluators_router.router, prefix="/evaluators") diff --git a/agenta-backend/agenta_backend/models/api/annotation_models.py b/agenta-backend/agenta_backend/models/api/annotation_models.py deleted file mode 100644 index 5238cbee5c..0000000000 --- a/agenta-backend/agenta_backend/models/api/annotation_models.py +++ /dev/null @@ -1,57 +0,0 @@ -from pydantic import BaseModel -from typing import Optional, List, Any -from enum import Enum -from agenta_backend.models.api.api_models import Result - - -class AnnotationStatusEnum(str, Enum): - ANNOTATION_INITIALIZED = "ANNOTATION_INITIALIZED" - ANNOTATION_STARTED = "ANNOTATION_STARTED" - ANNOTATION_FINISHED = "ANNOTATION_FINISHED" - ANNOTATION_ERROR = "ANNOTATION_ERROR" - - -class Annotation(BaseModel): - id: str - app_id: str - variants_ids: List[str] - annotation_name: str - testset_id: str - aggregated_results: List - - -class NewAnnotation(BaseModel): - app_id: str - variants_ids: List[str] - annotation_name: str - testset_id: str - - -class AnnotationScenarioUpdate(BaseModel): - result: Result - - -class AnnotationScenarioInput(BaseModel): - name: str - type: str - value: Any - - -class AnnotationScenarioOutput(BaseModel): - type: str - value: Any - - -class AnnotationScenarioResult(BaseModel): - variant_id: str - result: Result - - -class AnnotationScenario(BaseModel): - id: Optional[str] - annotation_id: str - inputs: List[AnnotationScenarioInput] - outputs: List[AnnotationScenarioOutput] - is_pinned: Optional[bool] - note: Optional[str] - result: AnnotationScenarioResult diff --git a/agenta-backend/agenta_backend/models/converters.py b/agenta-backend/agenta_backend/models/converters.py index a814cbf932..ab41e9ce56 100644 --- a/agenta-backend/agenta_backend/models/converters.py +++ b/agenta-backend/agenta_backend/models/converters.py @@ -5,9 +5,6 @@ from agenta_backend.services import db_manager from agenta_backend.models.api.user_models import User from agenta_backend.models.db_models import ( - AnnotationScenarioResult, - AnnotationsDB, - AnnotationsScenariosDB, AppVariantDB, EvaluationScenarioResult, EvaluatorConfigDB, @@ -52,13 +49,6 @@ EvaluationScenarioOutput, ) -from agenta_backend.models.api.annotation_models import ( - Annotation, - AnnotationScenario, - AnnotationScenarioInput, - AnnotationScenarioOutput, -) - import logging logger = logging.getLogger(__name__) @@ -379,36 +369,3 @@ def evaluator_config_db_to_pydantic(evaluator_config: EvaluatorConfigDB): evaluator_key=evaluator_config.evaluator_key, settings_values=evaluator_config.settings_values, ) - - -def annotation_db_to_pydantic(annotation_db: AnnotationsDB): - return Annotation( - id=str(annotation_db.id), - app_id=str(annotation_db.app.id), - annotation_name=annotation_db.annotation_name, - variants_ids=[str(variants_id) for variants_id in annotation_db.variants_ids], - testset_id=str(annotation_db.testset_id), - aggregated_results=annotation_db.aggregated_results, - ) - - -def annotation_scenario_db_to_pydantic( - annotation_scenario_db: AnnotationsScenariosDB, -) -> 
AnnotationScenario: - return AnnotationScenario( - id=str(annotation_scenario_db.id), - annotation_id=str(annotation_scenario_db.annotation_id), - inputs=[ - AnnotationScenarioInput(**input_dict.dict()) - for input_dict in annotation_scenario_db.inputs - ], - outputs=[ - AnnotationScenarioOutput(**output_dict.dict()) - for output_dict in annotation_scenario_db.outputs - ], - is_pinned=annotation_scenario_db.is_pinned, - note=annotation_scenario_db.note, - result=AnnotationScenarioResult(**annotation_scenario_db.result.dict()), - created_at=annotation_scenario_db.created_at, - updated_at=annotation_scenario_db.updated_at, - ) diff --git a/agenta-backend/agenta_backend/models/db_models.py b/agenta-backend/agenta_backend/models/db_models.py index 9324bb8719..90e56781cc 100644 --- a/agenta-backend/agenta_backend/models/db_models.py +++ b/agenta-backend/agenta_backend/models/db_models.py @@ -374,56 +374,3 @@ class TraceDB(Model): class Config: collection = "traces" - - -class AnnotationScenarioInputDB(EmbeddedModel): - name: str - type: str - value: str - - -class AnnotationScenarioOutputDB(EmbeddedModel): - type: str - value: Any - - -class AnnoationResult(EmbeddedModel): - variant_id: str - result: Result - - -class AnnotationScenarioResult(EmbeddedModel): - result: Result - - -class AnnotationsDB(Model): - app: AppDB = Reference(key_name="app") - organization: OrganizationDB = Reference(key_name="organization") - user: UserDB = Reference(key_name="user") - variants_ids: List[ObjectId] - testset: TestSetDB = Reference() - status: str = Field(default="ANNOTATION_INITIALIZED") - annotation_name: str - aggregated_results: List[AnnoationResult] - created_at: datetime = Field(default=datetime.utcnow()) - updated_at: datetime = Field(default=datetime.utcnow()) - - class Config: - collection = "annotations" - - -class AnnotationsScenariosDB(Model): - app: AppDB = Reference(key_name="app") - organization: OrganizationDB = Reference(key_name="organization") - user: UserDB = Reference(key_name="user") - annotation_id: ObjectId - inputs: List[AnnotationScenarioInputDB] - outputs: List[AnnotationScenarioOutputDB] - is_pinned: Optional[bool] - note: Optional[str] - result: Optional[Union[dict, Result]] = Field(default=None) - created_at: datetime = Field(default=datetime.utcnow()) - updated_at: datetime = Field(default=datetime.utcnow()) - - class Config: - collection = "annotations_scenarios" diff --git a/agenta-backend/agenta_backend/routers/annotations_router.py b/agenta-backend/agenta_backend/routers/annotations_router.py deleted file mode 100644 index 5a3ae24964..0000000000 --- a/agenta-backend/agenta_backend/routers/annotations_router.py +++ /dev/null @@ -1,160 +0,0 @@ -import os -import secrets -from typing import List, Dict - -from fastapi.responses import JSONResponse -from fastapi.encoders import jsonable_encoder -from fastapi import HTTPException, APIRouter, Body, Request, status, Response - -from agenta_backend.models.api.annotation_models import ( - Annotation, - AnnotationScenario, - NewAnnotation, - AnnotationScenarioUpdate, -) - -from agenta_backend.services.annotation_manager import update_annotation_scenario -from agenta_backend.tasks.evaluations import evaluate - -from agenta_backend.utils.common import check_access_to_app -from agenta_backend.services import db_manager, annotation_manager - -from agenta_backend.tasks.annotations import prepare_scenarios - -if os.environ["FEATURE_FLAG"] in ["cloud", "ee"]: - from agenta_backend.commons.services.selectors import ( # noqa pylint: 
disable-all - get_user_and_org_id, - ) -else: - from agenta_backend.services.selectors import get_user_and_org_id - -router = APIRouter() - - -@router.post("/") -async def create_annotation( - payload: NewAnnotation, - request: Request, -) -> Annotation: - """Creates a new annotation document - Raises: - HTTPException: _description_ - Returns: - _description_ - """ - try: - user_org_data: dict = await get_user_and_org_id(request.state.user_id) - access_app = await check_access_to_app( - user_org_data=user_org_data, - app_id=payload.app_id, - check_owner=False, - ) - if not access_app: - error_msg = f"You do not have access to this app: {payload.app_id}" - return JSONResponse( - {"detail": error_msg}, - status_code=400, - ) - app = await db_manager.fetch_app_by_id(app_id=payload.app_id) - if app is None: - raise HTTPException(status_code=404, detail="App not found") - - app_data = jsonable_encoder(app) - new_annotation_data = payload.dict() - annotation = await annotation_manager.create_new_annotation( - app_data=app_data, - new_annotation_data=new_annotation_data, - ) - - prepare_scenarios.delay( - app_data, new_annotation_data, annotation.id, annotation.testset_id - ) - - return annotation - except KeyError: - raise HTTPException( - status_code=400, - detail="columns in the annotation set should match the names of the inputs in the variant", - ) - - -@router.get("/", response_model=List[Annotation]) -async def fetch_list_annotations( - app_id: str, - request: Request, -): - """Fetches a list of annotations, optionally filtered by an app ID. - - Args: - app_id (Optional[str]): An optional app ID to filter the annotations. - - Returns: - List[Annotation]: A list of annotations. - """ - user_org_data = await get_user_and_org_id(request.state.user_id) - return await annotation_manager.fetch_list_annotations( - app_id=app_id, **user_org_data - ) - - -@router.get("/{annotation_id}/", response_model=Annotation) -async def fetch_annotation( - annotation_id: str, - request: Request, -): - """Fetches a single annotation based on its ID. - - Args: - annotation_id (str): The ID of the annotation to fetch. - - Returns: - Annotation: The fetched annotation. - """ - user_org_data = await get_user_and_org_id(request.state.user_id) - return await annotation_manager.fetch_annotation(annotation_id, **user_org_data) - - -@router.get( - "/{annotation_id}/annotations_scenarios/", response_model=List[AnnotationScenario] -) -async def fetch_annotations_scenarios( - annotation_id: str, - request: Request, -): - """Fetches a single annotation based on its ID. - - Args: - annotation_id (str): The ID of the annotation to fetch. - - Returns: - Annotation: The fetched annotation. - """ - user_org_data = await get_user_and_org_id(request.state.user_id) - return await annotation_manager.fetch_annotations_scenarios( - annotation_id, **user_org_data - ) - - -@router.put("/{annotation_id}/annotations_scenarios/{annotation_scenario_id}/") -async def update_annotation_scenario_router( - annotation_id: str, - annotation_scenario_id: str, - annotation_scenario: AnnotationScenarioUpdate, - request: Request, -): - """Updates an annotation scenario's data. - - Raises: - HTTPException: If update fails or unauthorized. - - Returns: - None: 204 No Content status code upon successful update. 
- """ - user_org_data = await get_user_and_org_id(request.state.user_id) - - await update_annotation_scenario( - annotation_scenario_id, - annotation_scenario, - **user_org_data, - ) - return Response(status_code=status.HTTP_204_NO_CONTENT) diff --git a/agenta-backend/agenta_backend/services/annotation_manager.py b/agenta-backend/agenta_backend/services/annotation_manager.py deleted file mode 100644 index 35d73dbd31..0000000000 --- a/agenta-backend/agenta_backend/services/annotation_manager.py +++ /dev/null @@ -1,248 +0,0 @@ -import datetime -import os -import secrets -from typing import Any, List, Dict - -from bson import ObjectId -from fastapi import HTTPException - -from agenta_backend.services import db_manager -from agenta_backend.models import converters -from agenta_backend.models.api.annotation_models import ( - Annotation, - AnnotationScenario, - AnnotationScenarioInput, - AnnotationStatusEnum, - NewAnnotation, - AnnotationScenarioUpdate, -) - - -from agenta_backend.models.db_models import ( - AnnotationsDB, - AnnotationsScenariosDB, - AppDB, -) - -from agenta_backend.utils.common import engine, check_access_to_app - - -async def _fetch_annotation_and_check_access( - annotation_id: str, **user_org_data: dict -) -> AnnotationsDB: - annotation = await db_manager.fetch_annotation_by_id(annotation_id=annotation_id) - - if annotation is None: - raise HTTPException( - status_code=404, - detail=f"Annotation with id {annotation_id} not found", - ) - - access = await check_access_to_app( - user_org_data=user_org_data, app_id=annotation.app.id - ) - if not access: - raise HTTPException( - status_code=403, - detail=f"You do not have access to this app: {str(annotation.app.id)}", - ) - return annotation - - -async def _fetch_annotation_scenario_and_check_access( - annotation_scenario_id: str, **user_org_data: dict -) -> AnnotationsScenariosDB: - # Fetch the annotation scenario by ID - annotation_scenario = await db_manager.fetch_annotation_scenario_by_id( - annotation_scenario_id=annotation_scenario_id - ) - if annotation_scenario is None: - raise HTTPException( - status_code=404, - detail=f"Annotation scenario with id {annotation_scenario_id} not found", - ) - annotation = annotation_scenario.annotation - - # Check if the annotation exists - if annotation is None: - raise HTTPException( - status_code=404, - detail=f"Annotation scenario for annotation scenario with id {annotation_scenario_id} not found", - ) - - # Check for access rights - access = await check_access_to_app( - user_org_data=user_org_data, app_id=annotation.app.id - ) - if not access: - raise HTTPException( - status_code=403, - detail=f"You do not have access to this app: {str(annotation.app.id)}", - ) - return annotation_scenario - - -async def fetch_list_annotations( - app_id: str, - **user_org_data: dict, -) -> List[Annotation]: - """ - Fetches a list of annotations based on the provided filtering criteria. - - Args: - app_id (str): The app ID to filter the annotations. - user_org_data (dict): User and organization data. - - Returns: - List[Annotation]: A list of annotations. 
- """ - - access = await check_access_to_app(user_org_data=user_org_data, app_id=app_id) - if not access: - raise HTTPException( - status_code=403, - detail=f"You do not have access to this app: {app_id}", - ) - - annotations_db = await db_manager.fetch_annotations_by_app_id(app_id=app_id) - - return [ - converters.annotation_db_to_pydantic(annotation) - for annotation in annotations_db - ] - - -async def fetch_annotation(annotation_id: str, **user_org_data: dict) -> Annotation: - """ - Fetches a single annotation based on its ID. - - Args: - annotation_id (str): The ID of the annotation. - user_org_data (dict): User and organization data. - - Returns: - Annotation: The fetched annotation. - """ - annotation = await _fetch_annotation_and_check_access( - annotation_id=annotation_id, **user_org_data - ) - return converters.annotation_db_to_pydantic(annotation) - - -async def create_new_annotation( - app_data: dict, new_annotation_data: dict -) -> Annotation: - """ - Create a new annotation. - - Args: - app_data (dict): Required app data - new_annotation_data (dict): Required new annotation data - - Returns: - Annotation - """ - - new_annotation = NewAnnotation(**new_annotation_data) - app = AppDB(**app_data) - - annotation_db = await db_manager.create_new_annotation( - app=app, - organization=app.organization, - user=app.user, - annotation_name=new_annotation.annotation_name, - testset_id=new_annotation.testset_id, - status=AnnotationStatusEnum.ANNOTATION_STARTED, - variants_ids=new_annotation.variants_ids, - ) - - return converters.annotation_db_to_pydantic(annotation_db) - - -async def create_annotation_scenario( - annotation_id: str, payload: AnnotationScenario, **user_org_data: dict -) -> None: - """ - Create a new annotation scenario. - - Args: - annotation_id (str): The ID of the annotation. - payload (AnnotationScenario): Annotation scenario data. - user_org_data (dict): User and organization data. - - Raises: - HTTPException: If annotation not found or access denied. - """ - - scenario_inputs = [ - AnnotationScenarioInput( - input_name=input_item.input_name, - input_value=input_item.input_value, - ) - for input_item in payload.inputs - ] - - new_annotation_scenario = AnnotationsScenariosDB( - user=new_annotation_scenario.user, - organization=new_annotation_scenario.organization, - annotation_id=annotation_id, - inputs=scenario_inputs, - outputs=[], - is_pinned=False, - note="", - created_at=datetime.utcnow(), - updated_at=datetime.utcnow(), - ) - - await engine.save(new_annotation_scenario) - - -async def fetch_annotations_scenarios( - annotation_id: str, **user_org_data: dict -) -> [AnnotationScenario]: - """ - Fetches a single annotation based on its ID. - - Args: - annotation_id (str): The ID of the annotation. - user_org_data (dict): User and organization data. - - Returns: - Annotation: The fetched annotation. - """ - annotation = await _fetch_annotation_and_check_access( - annotation_id=annotation_id, - **user_org_data, - ) - scenarios = await engine.find( - AnnotationsScenariosDB, - AnnotationsScenariosDB.annotation_id == ObjectId(annotation_id), - ) - annotations_scenarios = [ - converters.annotation_scenario_db_to_pydantic(scenario) - for scenario in scenarios - ] - return annotations_scenarios - - -async def update_annotation_scenario( - annotation_scenario_id: str, - updates: Dict[str, Any], - **user_org_data, -) -> AnnotationScenario: - """ - Edit an existing annotation scenario. - - Args: - annotation_scenario_id (str): The ID of the annotation scenario to be updated. 
- updates (Dict[str, Any]): A dictionary containing the updates. - - Returns: - AnnotationScenario: The updated annotation scenario object. - """ - - annotation_scenario = await db_manager.update_annotation_scenario( - annotation_scenario_id, updates - ) - print(annotation_scenario) - return converters.annotation_scenario_db_to_pydantic(annotation_scenario) diff --git a/agenta-backend/agenta_backend/services/db_manager.py b/agenta-backend/agenta_backend/services/db_manager.py index f85c779522..8f15afb470 100644 --- a/agenta-backend/agenta_backend/services/db_manager.py +++ b/agenta-backend/agenta_backend/services/db_manager.py @@ -20,8 +20,6 @@ ) from agenta_backend.services.json_importer_helper import get_json from agenta_backend.models.db_models import ( - AnnotationsDB, - AnnotationsScenariosDB, HumanEvaluationScenarioDB, Result, AggregatedResult, @@ -1838,176 +1836,3 @@ async def delete_evaluator_config(evaluator_config_id: str) -> bool: return delete_result is not None except Exception as e: raise e - - -async def fetch_annotations_by_app_id(app_id: str) -> List[AnnotationsDB]: - """ - Fetches annotations from the database based on the provided app ID. - - Args: - app_id (str): The app ID to filter the annotations. - - Returns: - List[AnnotationsDB]: A list of annotation database objects. - """ - annotations_db = await engine.find( - AnnotationsDB, AnnotationsDB.app == ObjectId(app_id) - ) - return annotations_db - - -async def create_new_annotation( - app: AppDB, - organization: OrganizationDB, - user: UserDB, - testset_id: str, - status: str, - variants_ids: [str], - annotation_name: str, -) -> AnnotationsDB: - """Create a new annotation scenario. - Returns: - Annotation: The created annotation scenario. - """ - annotation = AnnotationsDB( - app=app, - organization=organization, - user=user, - testset_id=testset_id, - variants_ids=variants_ids, - annotation_name=annotation_name, - status=status, - aggregated_results=[], - created_at=datetime.now().isoformat(), - updated_at=datetime.now().isoformat(), - ) - new_annotation = await engine.save(annotation) - return new_annotation - - -async def fetch_annotation_by_id(annotation_id: str) -> Optional[AnnotationsDB]: - """ - Fetches an annotation from the database based on its ID. - - Args: - annotation_id (str): The unique identifier of the annotation. - - Returns: - Optional[AnnotationsDB]: The annotation database object if found, otherwise None. - """ - - annotation = await engine.find_one( - AnnotationsDB, AnnotationsDB.id == ObjectId(annotation_id) - ) - return annotation - - -async def fetch_annotation_scenario_by_id( - annotation_id: str, -) -> Optional[AnnotationsScenariosDB]: - """ - Fetches an annotation from the database based on its ID. - - Args: - annotation_id (str): The unique identifier of the annotation. - - Returns: - Optional[AnnotationsDB]: The annotation database object if found, otherwise None. - """ - - annotation = await engine.find_one( - AnnotationsScenariosDB, AnnotationsScenariosDB.id == ObjectId(annotation_id) - ) - return annotation - - -async def create_new_annotation_scenario( - app: AppDB, - organization: OrganizationDB, - user: UserDB, - annotation_id: str, - inputs: List[dict], - outputs: List[dict], - isPinned: bool, - note: str, -) -> AnnotationsScenariosDB: - """ - Create a new annotation scenario in the database. - - Args: - annotation (AnnotationsDB): The annotation to which the scenario belongs. - scenario_inputs (List[dict]): List of inputs for the annotation scenario. 
- user (UserDB): User information. - organization (OrganizationDB): Organization information. - - Returns: - AnnotationsScenariosDB: The created annotation scenario. - """ - new_annotation_scenario = AnnotationsScenariosDB( - app=app, - user=user, - organization=organization, - annotation_id=annotation_id, - inputs=inputs, - outputs=outputs, - is_pinned=isPinned, - note=note, - result=None, - created_at=datetime.utcnow(), - updated_at=datetime.utcnow(), - ) - await engine.save(new_annotation_scenario) - return new_annotation_scenario - - -def insert_many_documents_using_driver(documents: list, collection_name: str) -> None: - """ - Inserts multiple documents into a MongoDB collection using the pymongo driver. - - Args: - documents (list): A list of dictionaries, each representing a document to insert. - collection_name (str): The name of the MongoDB collection where documents will be inserted. - """ - client = pymongo.MongoClient(os.environ["MONGODB_URI"]) - db = client.get_database("agenta_v2") - - collection = db.get_collection(collection_name) - - for document in documents: - if "_id" in document and isinstance(document["_id"], str): - document["_id"] = ObjectId(document["_id"]) - - inserted = collection.insert_many(documents) - print( - f"Inserted {len(inserted.inserted_ids)} documents into {collection_name} collection. Acknowledged: {inserted.acknowledged}" - ) - - -async def update_annotation_scenario( - annotation_scenario_id: str, updates: Dict[str, Any] -) -> AnnotationsScenariosDB: - """ - Update an annotation scenario in the database with the provided id. - - Arguments: - annotation_scenario_id (str): The ID of the annotation scenario to be updated. - updates (Dict[str, Any]): The updates to apply to the annotation scenario. - - Returns: - AnnotationsScenariosDB: The updated annotation scenario object. 
- """ - - annotation_scenario = await engine.find_one( - AnnotationsScenariosDB, - AnnotationsScenariosDB.id == ObjectId(annotation_scenario_id), - ) - - if not annotation_scenario: - raise HTTPException(status_code=404, detail="Annotation scenario not found") - - for key, value in updates.items(): - if hasattr(annotation_scenario, key): - setattr(annotation_scenario, key, value) - - await engine.save(annotation_scenario) - return annotation_scenario diff --git a/agenta-backend/agenta_backend/tasks/annotations.py b/agenta-backend/agenta_backend/tasks/annotations.py deleted file mode 100644 index 37a81da0e7..0000000000 --- a/agenta-backend/agenta_backend/tasks/annotations.py +++ /dev/null @@ -1,83 +0,0 @@ -import asyncio -from typing import List -from bson import ObjectId -from celery import shared_task -from collections import defaultdict - -from agenta_backend.services import llm_apps_service -from agenta_backend.services.db_manager import ( - fetch_annotation_by_id, - fetch_app_variant_by_id, - get_deployment_by_objectid, - fetch_testset_by_id, - create_new_annotation_scenario, -) -from agenta_backend.models.db_models import ( - AppDB, - AnnotationScenarioInputDB, - AnnotationScenarioOutputDB, - AnnotationScenarioInputDB, - AnnotationScenarioResult, -) - -from agenta_backend.models.api.annotation_models import NewAnnotation - - -@shared_task(queue="agenta_backend.tasks.annotations.prepare_scenarios") -def prepare_scenarios( - app_data: dict, new_annotation_data: dict, annotation_id: str, testset_id: str -): - loop = asyncio.get_event_loop() - app = AppDB(**app_data) - annotation = NewAnnotation(**new_annotation_data) - - testset = loop.run_until_complete(fetch_testset_by_id(testset_id)) - new_annotation_db = loop.run_until_complete(fetch_annotation_by_id(annotation_id)) - - for variant_id in annotation.variants_ids: - variant_id = str(variant_id) - - app_variant_db = loop.run_until_complete(fetch_app_variant_by_id(variant_id)) - deployment = loop.run_until_complete( - get_deployment_by_objectid(app_variant_db.base.deployment) - ) - - uri = deployment.uri.replace("http://localhost", "http://host.docker.internal") - - for data_point in testset.csvdata: - # 1. We prepare the inputs - raw_inputs = ( - app_variant_db.parameters.get("inputs", []) - if app_variant_db.parameters - else [] - ) - inputs = [] - if raw_inputs: - inputs = [ - AnnotationScenarioInputDB( - name=input_item["name"], - type="text", - value=data_point[input_item["name"]], - ) - for input_item in raw_inputs - ] - - # 2. We get the output from the llm app - # TODO: make outputs for all variants - variant_output = llm_apps_service.get_llm_app_output(uri, data_point) - - # 3. 
We create a new annotation scenario - annotation_scenario = loop.run_until_complete( - create_new_annotation_scenario( - app=app, - user=app.user, - organization=app.organization, - annotation_id=new_annotation_db.id, - inputs=inputs, - outputs=[ - AnnotationScenarioOutputDB(type="text", value=variant_output) - ], - isPinned=False, - note="", - ) - ) From 07f05877be6f9a742c8ed6f4d033ebedc4e52bbd Mon Sep 17 00:00:00 2001 From: Abram Date: Tue, 2 Jan 2024 15:46:58 +0100 Subject: [PATCH 218/414] Update - modified test_create_evaluation --- .../tests/variants_evaluators_router/test_evaluators_router.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/agenta-backend/agenta_backend/tests/variants_evaluators_router/test_evaluators_router.py b/agenta-backend/agenta_backend/tests/variants_evaluators_router/test_evaluators_router.py index 92434b1b6c..4ed641d42c 100644 --- a/agenta-backend/agenta_backend/tests/variants_evaluators_router/test_evaluators_router.py +++ b/agenta-backend/agenta_backend/tests/variants_evaluators_router/test_evaluators_router.py @@ -180,7 +180,7 @@ async def test_create_evaluation(): response = await test_client.post( f"{BACKEND_API_HOST}/evaluations/", json=payload, timeout=timeout ) - response_data = response.json() + response_data = response.json()[0] assert response.status_code == 200 assert response_data["app_id"] == payload["app_id"] From 809bfd25c604da91ea5096672fa3d5371c5d7d94 Mon Sep 17 00:00:00 2001 From: Abram Date: Tue, 2 Jan 2024 13:54:51 +0100 Subject: [PATCH 219/414] Cleanup - added extra-hosts to celery_worker compose service --- docker-compose.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docker-compose.yml b/docker-compose.yml index 42cc4eefc6..13e7f93d72 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -145,6 +145,8 @@ services: depends_on: - rabbitmq - redis + extra_hosts: + - "host.docker.internal:host-gateway" networks: - agenta-network From 107548aa18147686d7055d901b6cc59ea6904448 Mon Sep 17 00:00:00 2001 From: Akrem Abayed Date: Tue, 2 Jan 2024 15:54:57 +0100 Subject: [PATCH 220/414] more fixes --- .../routers/human_evaluation_router.py | 8 ++++- .../agenta_backend/services/db_manager.py | 17 +++++++++++ .../services/evaluation_service.py | 29 ++++++++++++++++++- 3 files changed, 52 insertions(+), 2 deletions(-) diff --git a/agenta-backend/agenta_backend/routers/human_evaluation_router.py b/agenta-backend/agenta_backend/routers/human_evaluation_router.py index 545f12a276..6fadff5e3f 100644 --- a/agenta-backend/agenta_backend/routers/human_evaluation_router.py +++ b/agenta-backend/agenta_backend/routers/human_evaluation_router.py @@ -21,7 +21,13 @@ from agenta_backend.services import db_manager from agenta_backend.models import converters from agenta_backend.services import results_service -from agenta_backend.tasks.evaluations import evaluate + +from agenta_backend.services.evaluation_service import ( + UpdateEvaluationScenarioError, + get_evaluation_scenario_score, + update_evaluation_scenario_score, + update_human_evaluation_scenario, +) if os.environ["FEATURE_FLAG"] in ["cloud", "ee"]: diff --git a/agenta-backend/agenta_backend/services/db_manager.py b/agenta-backend/agenta_backend/services/db_manager.py index 8f15afb470..70a497c25e 100644 --- a/agenta-backend/agenta_backend/services/db_manager.py +++ b/agenta-backend/agenta_backend/services/db_manager.py @@ -20,6 +20,7 @@ ) from agenta_backend.services.json_importer_helper import get_json from agenta_backend.models.db_models import ( + HumanEvaluationDB, 
HumanEvaluationScenarioDB, Result, AggregatedResult, @@ -1286,6 +1287,22 @@ async def fetch_evaluation_by_id(evaluation_id: str) -> Optional[EvaluationDB]: return evaluation +async def fetch_human_evaluation_by_id( + evaluation_id: str, +) -> Optional[HumanEvaluationDB]: + """Fetches an evaluation by its ID. + Args: + evaluation_id (str): The ID of the evaluation to fetch. + Returns: + HumanEvaluationDB: The fetched human evaluation, or None if no evaluation was found. + """ + assert evaluation_id is not None, "evaluation_id cannot be None" + evaluation = await engine.find_one( + HumanEvaluationDB, HumanEvaluationDB.id == ObjectId(evaluation_id) + ) + return evaluation + + async def fetch_evaluation_scenario_by_id( evaluation_scenario_id: str, ) -> Optional[EvaluationScenarioDB]: diff --git a/agenta-backend/agenta_backend/services/evaluation_service.py b/agenta-backend/agenta_backend/services/evaluation_service.py index eab542ebdb..ea471a4c1b 100644 --- a/agenta-backend/agenta_backend/services/evaluation_service.py +++ b/agenta-backend/agenta_backend/services/evaluation_service.py @@ -80,6 +80,33 @@ async def _fetch_evaluation_and_check_access( return evaluation +async def _fetch_human_evaluation_and_check_access( + evaluation_id: str, **user_org_data: dict +) -> HumanEvaluationDB: + # Fetch the evaluation by ID + evaluation = await db_manager.fetch_human_evaluation_by_id( + evaluation_id=evaluation_id + ) + + # Check if the evaluation exists + if evaluation is None: + raise HTTPException( + status_code=404, + detail=f"Evaluation with id {evaluation_id} not found", + ) + + # Check for access rights + access = await check_access_to_app( + user_org_data=user_org_data, app_id=evaluation.app.id + ) + if not access: + raise HTTPException( + status_code=403, + detail=f"You do not have access to this app: {str(evaluation.app.id)}", + ) + return evaluation + + async def _fetch_human_evaluation_scenario_and_check_access( evaluation_scenario_id: str, **user_org_data: dict ) -> HumanEvaluationDB: @@ -541,7 +568,7 @@ async def fetch_human_evaluation( Returns: Evaluation: The fetched evaluation.
""" - evaluation = await _fetch_human_evaluation_scenario_and_check_access( + evaluation = await _fetch_human_evaluation_and_check_access( evaluation_id=evaluation_id, **user_org_data ) return await converters.human_evaluation_db_to_pydantic(evaluation) From 26294fd50716b80e5230e1bc44982bec212aeb7c Mon Sep 17 00:00:00 2001 From: Abram Date: Tue, 2 Jan 2024 16:09:05 +0100 Subject: [PATCH 221/414] :art: Format - ran black --- agenta-backend/agenta_backend/tasks/evaluations.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/agenta-backend/agenta_backend/tasks/evaluations.py b/agenta-backend/agenta_backend/tasks/evaluations.py index f650ec2b40..81a86ca5d0 100644 --- a/agenta-backend/agenta_backend/tasks/evaluations.py +++ b/agenta-backend/agenta_backend/tasks/evaluations.py @@ -98,7 +98,7 @@ def evaluate( variant_output, data_point["correct_answer"], evaluator_config.settings_values, - **additional_kwargs + **additional_kwargs, ) result_object = EvaluationScenarioResult( From bd636eeba9ea21f800cfdbf9855aec94234e736c Mon Sep 17 00:00:00 2001 From: MohammedMaaz Date: Tue, 2 Jan 2024 20:19:51 +0500 Subject: [PATCH 222/414] fixed missing pages in annotations --- agenta-web/src/lib/transformers.ts | 2 +- .../[evaluation_id]/human_a_b_testing.tsx | 71 ++++++++++++++++++ .../[evaluation_id]/single_model_test.tsx | 72 +++++++++++++++++++ 3 files changed, 144 insertions(+), 1 deletion(-) create mode 100644 agenta-web/src/pages/apps/[app_id]/annotations/[evaluation_id]/human_a_b_testing.tsx create mode 100644 agenta-web/src/pages/apps/[app_id]/annotations/[evaluation_id]/single_model_test.tsx diff --git a/agenta-web/src/lib/transformers.ts b/agenta-web/src/lib/transformers.ts index 09e5ee1ba5..607d37183e 100644 --- a/agenta-web/src/lib/transformers.ts +++ b/agenta-web/src/lib/transformers.ts @@ -40,7 +40,7 @@ export const fromEvaluationResponseToEvaluation = (item: EvaluationResponseType) status: item.status, evaluationType: item.evaluation_type, evaluationTypeSettings, - llmAppPromptTemplate: item.evaluation_type_settings.llm_app_prompt_template, + llmAppPromptTemplate: item.evaluation_type_settings?.llm_app_prompt_template, } as Evaluation } diff --git a/agenta-web/src/pages/apps/[app_id]/annotations/[evaluation_id]/human_a_b_testing.tsx b/agenta-web/src/pages/apps/[app_id]/annotations/[evaluation_id]/human_a_b_testing.tsx new file mode 100644 index 0000000000..3b96ab1166 --- /dev/null +++ b/agenta-web/src/pages/apps/[app_id]/annotations/[evaluation_id]/human_a_b_testing.tsx @@ -0,0 +1,71 @@ +import ABTestingEvaluationTable from "@/components/EvaluationTable/ABTestingEvaluationTable" +import {Evaluation} from "@/lib/Types" +import {loadEvaluation, loadEvaluationsScenarios, loadTestset} from "@/lib/services/api" +import {useRouter} from "next/router" +import {useEffect} from "react" +import {fetchVariants} from "@/lib/services/api" +import {useAtom} from "jotai" +import {evaluationAtom, evaluationScenariosAtom} from "@/lib/atoms/evaluation" +import {getTestsetChatColumn} from "@/lib/helpers/testset" + +export default function Evaluation() { + const router = useRouter() + const evaluationTableId = router.query.evaluation_id + ? 
router.query.evaluation_id.toString() + : "" + const [evaluationScenarios, setEvaluationScenarios] = useAtom(evaluationScenariosAtom) + const [evaluation, setEvaluation] = useAtom(evaluationAtom) + const appId = router.query.app_id as string + const columnsCount = 2 + + useEffect(() => { + if (!evaluation) { + return + } + const init = async () => { + const data = await loadEvaluationsScenarios(evaluationTableId, evaluation) + setEvaluationScenarios(data) + } + init() + }, [evaluation]) + + useEffect(() => { + if (!evaluationTableId) { + return + } + const init = async () => { + const evaluation: Evaluation = await loadEvaluation(evaluationTableId) + const backendVariants = await fetchVariants(appId) + const testset = await loadTestset(evaluation.testset._id) + // Create a map for faster access to first array elements + let backendVariantsMap = new Map() + backendVariants.forEach((obj) => backendVariantsMap.set(obj.variantId, obj)) + + // Update variants in second object + evaluation.variants = evaluation.variants.map((variant) => { + let backendVariant = backendVariantsMap.get(variant.variantId) + return backendVariant ? backendVariant : variant + }) + evaluation.testset = { + ...evaluation.testset, + ...testset, + testsetChatColumn: getTestsetChatColumn(testset.csvdata), + } + setEvaluation(evaluation) + } + + init() + }, [evaluationTableId]) + + return ( +
+ {evaluationTableId && evaluationScenarios && evaluation && ( + + )} +
+ ) +} diff --git a/agenta-web/src/pages/apps/[app_id]/annotations/[evaluation_id]/single_model_test.tsx b/agenta-web/src/pages/apps/[app_id]/annotations/[evaluation_id]/single_model_test.tsx new file mode 100644 index 0000000000..fa522c02d6 --- /dev/null +++ b/agenta-web/src/pages/apps/[app_id]/annotations/[evaluation_id]/single_model_test.tsx @@ -0,0 +1,72 @@ +import {Evaluation, EvaluationScenario, GenericObject} from "@/lib/Types" +import {loadEvaluation, loadEvaluationsScenarios, loadTestset} from "@/lib/services/api" +import {useRouter} from "next/router" +import {useEffect, useState} from "react" +import {fetchVariants} from "@/lib/services/api" +import {getTestsetChatColumn} from "@/lib/helpers/testset" +import SingleModelEvaluationTable from "@/components/EvaluationTable/SingleModelEvaluationTable" + +export default function Evaluation() { + const router = useRouter() + const evaluationTableId = router.query.evaluation_id + ? router.query.evaluation_id.toString() + : "" + const [evaluationScenarios, setEvaluationScenarios] = useState([]) + const [evaluation, setEvaluation] = useState() + const appId = router.query.app_id as string + + useEffect(() => { + if (!evaluation) { + return + } + const init = async () => { + const data = await loadEvaluationsScenarios(evaluationTableId, evaluation) + setEvaluationScenarios( + data.map((item: GenericObject) => { + const numericScore = parseInt(item.score) + return {...item, score: isNaN(numericScore) ? null : numericScore} + }), + ) + } + init() + }, [evaluation]) + + useEffect(() => { + if (!evaluationTableId) { + return + } + const init = async () => { + const evaluation: Evaluation = await loadEvaluation(evaluationTableId) + const backendVariants = await fetchVariants(appId) + const testset = await loadTestset(evaluation.testset._id) + // Create a map for faster access to first array elements + let backendVariantsMap = new Map() + backendVariants.forEach((obj) => backendVariantsMap.set(obj.variantId, obj)) + + // Update variants in second object + evaluation.variants = evaluation.variants.map((variant) => { + let backendVariant = backendVariantsMap.get(variant.variantId) + return backendVariant ? backendVariant : variant + }) + evaluation.testset = { + ...evaluation.testset, + ...testset, + testsetChatColumn: getTestsetChatColumn(testset.csvdata), + } + setEvaluation(evaluation) + } + + init() + }, [evaluationTableId]) + + return ( +
+ {evaluationTableId && evaluationScenarios && evaluation && ( + + )} +
+ ) +} From 613609928c5d26624a0541dac56bc0ef527f0e3c Mon Sep 17 00:00:00 2001 From: Akrem Abayed Date: Tue, 2 Jan 2024 16:44:06 +0100 Subject: [PATCH 223/414] fetch human evaluations scenarios --- .../models/api/evaluation_model.py | 29 +++++++++++----- .../agenta_backend/models/converters.py | 18 ++++++++++ .../routers/human_evaluation_router.py | 32 +++++++++++++++++ .../services/evaluation_service.py | 34 +++++++++++++++++++ 4 files changed, 105 insertions(+), 8 deletions(-) diff --git a/agenta-backend/agenta_backend/models/api/evaluation_model.py b/agenta-backend/agenta_backend/models/api/evaluation_model.py index b8716c6d33..70e31cb6a7 100644 --- a/agenta-backend/agenta_backend/models/api/evaluation_model.py +++ b/agenta-backend/agenta_backend/models/api/evaluation_model.py @@ -110,6 +110,16 @@ class EvaluationScenarioOutput(BaseModel): value: Any +class HumanEvaluationScenarioInput(BaseModel): + input_name: str + input_value: str + + +class HumanEvaluationScenarioOutput(BaseModel): + variant_id: str + variant_output: str + + class HumanEvaluation(BaseModel): id: str app_id: str @@ -126,14 +136,17 @@ class HumanEvaluation(BaseModel): updated_at: datetime -class HumanEvaluationScenarioInput(BaseModel): - input_name: str - input_value: str - - -class HumanEvaluationScenarioOutput(BaseModel): - variant_id: str - variant_output: str +class HumanEvaluationScenario(BaseModel): + id: Optional[str] + evaluation_id: str + inputs: List[HumanEvaluationScenarioInput] + outputs: List[HumanEvaluationScenarioOutput] + vote: Optional[str] + score: Optional[Union[str, int]] + evaluation: Optional[str] + correct_answer: Optional[str] + is_pinned: Optional[bool] + note: Optional[str] class HumanEvaluationScenarioUpdate(BaseModel): diff --git a/agenta-backend/agenta_backend/models/converters.py b/agenta-backend/agenta_backend/models/converters.py index ab41e9ce56..9ccf33ad0e 100644 --- a/agenta-backend/agenta_backend/models/converters.py +++ b/agenta-backend/agenta_backend/models/converters.py @@ -9,6 +9,7 @@ EvaluationScenarioResult, EvaluatorConfigDB, HumanEvaluationDB, + HumanEvaluationScenarioDB, ImageDB, TemplateDB, AppDB, @@ -41,6 +42,7 @@ ) from agenta_backend.models.api.evaluation_model import ( HumanEvaluation, + HumanEvaluationScenario, SimpleEvaluationOutput, EvaluationScenario, Evaluation, @@ -119,6 +121,22 @@ async def human_evaluation_db_to_pydantic( ) +def human_evaluation_scenario_db_to_pydantic( + evaluation_scenario_db: HumanEvaluationScenarioDB, +) -> HumanEvaluationScenario: + return HumanEvaluationScenario( + id=str(evaluation_scenario_db.id), + evaluation_id=str(evaluation_scenario_db.evaluation.id), + inputs=evaluation_scenario_db.inputs, + outputs=evaluation_scenario_db.outputs, + vote=evaluation_scenario_db.vote, + score=evaluation_scenario_db.score, + correct_answer=evaluation_scenario_db.correct_answer, + is_pinned=evaluation_scenario_db.is_pinned or False, + note=evaluation_scenario_db.note or "", + ) + + async def aggregated_result_to_pydantic(results: List[AggregatedResult]) -> List[dict]: transformed_results = [] for result in results: diff --git a/agenta-backend/agenta_backend/routers/human_evaluation_router.py b/agenta-backend/agenta_backend/routers/human_evaluation_router.py index 6fadff5e3f..7397b9ed12 100644 --- a/agenta-backend/agenta_backend/routers/human_evaluation_router.py +++ b/agenta-backend/agenta_backend/routers/human_evaluation_router.py @@ -10,6 +10,7 @@ DeleteEvaluation, EvaluationScenarioScoreUpdate, HumanEvaluation, + HumanEvaluationScenario, 
HumanEvaluationScenarioUpdate, EvaluationType, NewHumanEvaluation, @@ -121,6 +122,37 @@ async def fetch_human_evaluation( ) +@router.get( + "/{evaluation_id}/evaluation_scenarios/", + response_model=List[HumanEvaluationScenario], + operation_id="fetch_evaluation_scenarios", +) +async def fetch_evaluation_scenarios( + evaluation_id: str, + request: Request, +): + """Fetches evaluation scenarios for a given evaluation ID. + + Arguments: + evaluation_id (str): The ID of the evaluation for which to fetch scenarios. + + Raises: + HTTPException: If the evaluation is not found or access is denied. + + Returns: + List[EvaluationScenario]: A list of evaluation scenarios. + """ + + user_org_data: dict = await get_user_and_org_id(request.state.user_id) + eval_scenarios = ( + await evaluation_service.fetch_human_evaluation_scenarios_for_evaluation( + evaluation_id, **user_org_data + ) + ) + + return eval_scenarios + + @router.put( "/{evaluation_id}/evaluation_scenario/{evaluation_scenario_id}/{evaluation_type}/" ) diff --git a/agenta-backend/agenta_backend/services/evaluation_service.py b/agenta-backend/agenta_backend/services/evaluation_service.py index ea471a4c1b..507dd1860b 100644 --- a/agenta-backend/agenta_backend/services/evaluation_service.py +++ b/agenta-backend/agenta_backend/services/evaluation_service.py @@ -15,6 +15,7 @@ EvaluationScenarioInput, EvaluationType, HumanEvaluation, + HumanEvaluationScenario, NewEvaluation, EvaluationScenarioUpdate, CreateCustomEvaluation, @@ -325,6 +326,39 @@ async def fetch_evaluation_scenarios_for_evaluation( return eval_scenarios +async def fetch_human_evaluation_scenarios_for_evaluation( + evaluation_id: str, **user_org_data: dict +) -> List[HumanEvaluationScenario]: + """ + Fetch evaluation scenarios for a given evaluation ID. + + Args: + evaluation_id (str): The ID of the evaluation. + user_org_data (dict): User and organization data. + + Raises: + HTTPException: If the evaluation is not found or access is denied. + + Returns: + List[EvaluationScenario]: A list of evaluation scenarios. 
+ """ + evaluation = await _fetch_human_evaluation_and_check_access( + evaluation_id=evaluation_id, + **user_org_data, + ) + print("$$$$$$ evaluation") + print(evaluation) + scenarios = await engine.find( + HumanEvaluationScenarioDB, + HumanEvaluationScenarioDB.evaluation == ObjectId(evaluation.id), + ) + eval_scenarios = [ + converters.human_evaluation_scenario_db_to_pydantic(scenario) + for scenario in scenarios + ] + return eval_scenarios + + async def update_human_evaluation_scenario( evaluation_scenario_id: str, evaluation_scenario_data: EvaluationScenarioUpdate, From 92e5c14ffb70419c38c340d8fcce778833b93c2a Mon Sep 17 00:00:00 2001 From: Akrem Abayed Date: Tue, 2 Jan 2024 17:19:48 +0100 Subject: [PATCH 224/414] fix results and update evaluation --- .../models/api/evaluation_model.py | 2 +- .../routers/evaluation_router.py | 35 ------------------ .../routers/human_evaluation_router.py | 37 +++++++++++++++++-- .../services/evaluation_service.py | 14 ++++--- 4 files changed, 42 insertions(+), 46 deletions(-) diff --git a/agenta-backend/agenta_backend/models/api/evaluation_model.py b/agenta-backend/agenta_backend/models/api/evaluation_model.py index 70e31cb6a7..88c3755900 100644 --- a/agenta-backend/agenta_backend/models/api/evaluation_model.py +++ b/agenta-backend/agenta_backend/models/api/evaluation_model.py @@ -89,7 +89,7 @@ class SimpleEvaluationOutput(BaseModel): evaluation_type: EvaluationType -class EvaluationUpdate(BaseModel): +class HumanEvaluationUpdate(BaseModel): status: Optional[EvaluationStatusEnum] evaluation_type_settings: Optional[EvaluationTypeSettings] diff --git a/agenta-backend/agenta_backend/routers/evaluation_router.py b/agenta-backend/agenta_backend/routers/evaluation_router.py index ce38e767f9..3cf1b45494 100644 --- a/agenta-backend/agenta_backend/routers/evaluation_router.py +++ b/agenta-backend/agenta_backend/routers/evaluation_router.py @@ -14,18 +14,12 @@ EvaluationScenario, CustomEvaluationOutput, CustomEvaluationDetail, - EvaluationScenarioScoreUpdate, - EvaluationScenarioUpdate, ExecuteCustomEvaluationCode, - HumanEvaluation, - HumanEvaluationScenarioUpdate, NewEvaluation, DeleteEvaluation, EvaluationType, CreateCustomEvaluation, - EvaluationUpdate, EvaluationWebhook, - NewHumanEvaluation, SimpleEvaluationOutput, ) from agenta_backend.services.evaluation_service import ( @@ -35,11 +29,9 @@ fetch_custom_evaluation_detail, get_evaluation_scenario_score, update_evaluation_scenario_score, - update_evaluation, create_custom_code_evaluation, update_custom_code_evaluation, execute_custom_code_evaluation, - update_human_evaluation_scenario, ) from agenta_backend.services import evaluation_service from agenta_backend.utils.common import check_access_to_app @@ -163,33 +155,6 @@ async def fetch_evaluation_results(evaluation_id: str, request: Request): raise HTTPException(status_code=500, detail=str(exc)) -@router.put("/{evaluation_id}/") -async def update_evaluation_router( - request: Request, - evaluation_id: str, - update_data: EvaluationUpdate = Body(...), -): - """Updates an evaluation's status. - - Raises: - HTTPException: If the columns in the test set do not match with the inputs in the variant. - - Returns: - None: A 204 No Content status code, indicating that the update was successful. 
- """ - try: - # Get user and organization id - user_org_data: dict = await get_user_and_org_id(request.state.user_id) - await update_evaluation(evaluation_id, update_data, **user_org_data) - return Response(status_code=status.HTTP_204_NO_CONTENT) - - except KeyError: - raise HTTPException( - status_code=400, - detail="columns in the test set should match the names of the inputs in the variant", - ) - - @router.get( "/{evaluation_id}/evaluation_scenarios/", response_model=List[EvaluationScenario], diff --git a/agenta-backend/agenta_backend/routers/human_evaluation_router.py b/agenta-backend/agenta_backend/routers/human_evaluation_router.py index 7397b9ed12..3bd37b7c67 100644 --- a/agenta-backend/agenta_backend/routers/human_evaluation_router.py +++ b/agenta-backend/agenta_backend/routers/human_evaluation_router.py @@ -13,6 +13,7 @@ HumanEvaluationScenario, HumanEvaluationScenarioUpdate, EvaluationType, + HumanEvaluationUpdate, NewHumanEvaluation, SimpleEvaluationOutput, ) @@ -28,6 +29,7 @@ get_evaluation_scenario_score, update_evaluation_scenario_score, update_human_evaluation_scenario, + update_human_evaluation_service, ) @@ -153,6 +155,35 @@ async def fetch_evaluation_scenarios( return eval_scenarios +@router.put("/{evaluation_id}/", operation_id="update_evaluation") +async def update_evaluation( + request: Request, + evaluation_id: str, + update_data: HumanEvaluationUpdate = Body(...), +): + """Updates an evaluation's status. + + Raises: + HTTPException: If the columns in the test set do not match with the inputs in the variant. + + Returns: + None: A 204 No Content status code, indicating that the update was successful. + """ + try: + # Get user and organization id + user_org_data: dict = await get_user_and_org_id(request.state.user_id) + await update_human_evaluation_service( + evaluation_id, update_data, **user_org_data + ) + return Response(status_code=status.HTTP_204_NO_CONTENT) + + except KeyError: + raise HTTPException( + status_code=400, + detail="columns in the test set should match the names of the inputs in the variant", + ) + + @router.put( "/{evaluation_id}/evaluation_scenario/{evaluation_scenario_id}/{evaluation_type}/" ) @@ -244,10 +275,8 @@ async def fetch_results( # Get user and organization id print("are we here") user_org_data: dict = await get_user_and_org_id(request.state.user_id) - evaluation = ( - await evaluation_service._fetch_human_evaluation_scenario_and_check_access( - evaluation_id, **user_org_data - ) + evaluation = await evaluation_service._fetch_human_evaluation_and_check_access( + evaluation_id, **user_org_data ) print("really???") if evaluation.evaluation_type == EvaluationType.human_a_b_testing: diff --git a/agenta-backend/agenta_backend/services/evaluation_service.py b/agenta-backend/agenta_backend/services/evaluation_service.py index 507dd1860b..a36d133858 100644 --- a/agenta-backend/agenta_backend/services/evaluation_service.py +++ b/agenta-backend/agenta_backend/services/evaluation_service.py @@ -14,12 +14,13 @@ CustomEvaluationDetail, EvaluationScenarioInput, EvaluationType, + EvaluationTypeSettings, HumanEvaluation, HumanEvaluationScenario, + HumanEvaluationUpdate, NewEvaluation, EvaluationScenarioUpdate, CreateCustomEvaluation, - EvaluationUpdate, EvaluationStatusEnum, NewHumanEvaluation, ) @@ -37,8 +38,6 @@ HumanEvaluationScenarioOutput, UserDB, AppDB, - EvaluationScenarioInputDB, - EvaluationScenarioOutputDB, CustomEvaluationDB, ) @@ -115,6 +114,9 @@ async def _fetch_human_evaluation_scenario_and_check_access( evaluation_scenario = await 
db_manager.fetch_human_evaluation_scenario_by_id( evaluation_scenario_id=evaluation_scenario_id ) + + print("evaluation_scenario") + print(evaluation_scenario) if evaluation_scenario is None: raise HTTPException( status_code=404, @@ -251,8 +253,8 @@ async def create_evaluation_scenario( await engine.save(new_eval_scenario) -async def update_evaluation( - evaluation_id: str, update_payload: EvaluationUpdate, **user_org_data: dict +async def update_human_evaluation_service( + evaluation_id: str, update_payload: HumanEvaluationUpdate, **user_org_data: dict ) -> None: """ Update an existing evaluation based on the provided payload. @@ -265,7 +267,7 @@ async def update_evaluation( HTTPException: If the evaluation is not found or access is denied. """ # Fetch the evaluation by ID - evaluation = await _fetch_evaluation_and_check_access( + evaluation = await _fetch_human_evaluation_and_check_access( evaluation_id=evaluation_id, **user_org_data, ) From 7422743ae4a14e0e6322ec67a60795049a35561d Mon Sep 17 00:00:00 2001 From: MohammedMaaz Date: Tue, 2 Jan 2024 21:21:08 +0500 Subject: [PATCH 225/414] UI improvements: empty component | duration counter | compare table header --- agenta-web/src/components/Sidebar/Sidebar.tsx | 5 +- .../evaluationCompare/EvaluationCompare.tsx | 7 +-- .../evaluationResults/EvaluationResults.tsx | 24 +++++---- .../evaluations/evaluators/Evaluators.tsx | 32 +++++++----- agenta-web/src/lib/Types.ts | 1 + agenta-web/src/services/evaluations/index.ts | 51 ++++++++++--------- 6 files changed, 67 insertions(+), 53 deletions(-) diff --git a/agenta-web/src/components/Sidebar/Sidebar.tsx b/agenta-web/src/components/Sidebar/Sidebar.tsx index 63fdb2492b..5ad3ba8d0d 100644 --- a/agenta-web/src/components/Sidebar/Sidebar.tsx +++ b/agenta-web/src/components/Sidebar/Sidebar.tsx @@ -11,6 +11,7 @@ import { SettingOutlined, LogoutOutlined, ApartmentOutlined, + FormOutlined, } from "@ant-design/icons" import {Layout, Menu, Space, Tooltip, theme, Avatar} from "antd" @@ -266,7 +267,7 @@ const Sidebar: React.FC = () => { > {collapsed ? "Perform 1-to-1 variant comparisons on testsets to identify superior options." 
- : "Evaluate"} + : "Evaluations"} @@ -280,7 +281,7 @@ const Sidebar: React.FC = () => { } key="annotations" > - }> + }> = () => { ?.inputs.forEach((input, index) => { colDefs.push({ headerComponent: () => ( - + Input: {input.name} {variant.variantName} @@ -92,7 +92,7 @@ const EvaluationCompareMode: React.FC = () => { }) colDefs.push({ headerComponent: () => ( - + Output {variant.variantName} @@ -110,7 +110,7 @@ const EvaluationCompareMode: React.FC = () => { colDefs.push({ flex: 1, headerComponent: () => ( - + Evaluator: {config.name} {variant.variantName} @@ -220,6 +220,7 @@ const EvaluationCompareMode: React.FC = () => { rowData={scenarios} columnDefs={colDefs} getRowId={(params) => params.data.id} + headerHeight={64} /> diff --git a/agenta-web/src/components/pages/evaluations/evaluationResults/EvaluationResults.tsx b/agenta-web/src/components/pages/evaluations/evaluationResults/EvaluationResults.tsx index dcc5ae6ae4..5b7d627acf 100644 --- a/agenta-web/src/components/pages/evaluations/evaluationResults/EvaluationResults.tsx +++ b/agenta-web/src/components/pages/evaluations/evaluationResults/EvaluationResults.tsx @@ -218,11 +218,17 @@ const StatusRenderer = React.memo( (prev, next) => prev.value === next.value && prev.data?.duration === next.data?.duration, ) -interface Props { - type?: "auto" | "human" +const runningStatuses = [EvaluationStatus.INITIALIZED, EvaluationStatus.STARTED] + +export const calcEvalDuration = (evaluation: _Evaluation) => { + return dayjs( + runningStatuses.includes(evaluation.status) ? Date.now() : evaluation.updated_at, + ).diff(dayjs(evaluation.created_at), "milliseconds") } -const EvaluationResults: React.FC = ({type = "auto"}) => { +interface Props {} + +const EvaluationResults: React.FC = () => { const {appTheme} = useAppTheme() const classes = useStyles() const appId = useAppId() @@ -237,9 +243,7 @@ const EvaluationResults: React.FC = ({type = "auto"}) => { const runningEvaluationIds = useMemo( () => evaluations - .filter((item) => - [EvaluationStatus.INITIALIZED, EvaluationStatus.STARTED].includes(item.status), - ) + .filter((item) => runningStatuses.includes(item.status)) .map((item) => item.id), [evaluations], ) @@ -282,8 +286,11 @@ const EvaluationResults: React.FC = ({type = "auto"}) => { const index = newEvals.findIndex((e) => e.id === id) if (index !== -1) { newEvals[index].status = res[ix].status + newEvals[index].duration = calcEvalDuration(newEvals[index]) } }) + if (res.some((item) => !runningStatuses.includes(item.status))) + fetcher() return newEvals }) }) @@ -400,9 +407,8 @@ const EvaluationResults: React.FC = ({type = "auto"}) => { selected.length < 2 || selected.some( (item) => - [EvaluationStatus.INITIALIZED, EvaluationStatus.STARTED].includes( - item.status, - ) || item.testset.id !== selected[0].testset.id, + runningStatuses.includes(item.status) || + item.testset.id !== selected[0].testset.id, ) } icon={} diff --git a/agenta-web/src/components/pages/evaluations/evaluators/Evaluators.tsx b/agenta-web/src/components/pages/evaluations/evaluators/Evaluators.tsx index c1e26a8b18..3c676e038a 100644 --- a/agenta-web/src/components/pages/evaluations/evaluators/Evaluators.tsx +++ b/agenta-web/src/components/pages/evaluations/evaluators/Evaluators.tsx @@ -1,7 +1,7 @@ import React, {useMemo, useState} from "react" import {createUseStyles} from "react-jss" import EvaluatorCard from "./EvaluatorCard" -import {Button, Input, Space, Spin} from "antd" +import {Button, Empty, Input, Space, Spin} from "antd" import {PlusCircleOutlined} from 
"@ant-design/icons" import NewEvaluatorModal from "./NewEvaluatorModal" import {useAppId} from "@/hooks/useAppId" @@ -78,19 +78,23 @@ const Evaluators: React.FC = () => { -
- {filtered.map((item, ix) => ( - { - setEditIndex(ix) - setNewEvalModalOpen(true) - }} - onSuccessDelete={fetcher} - /> - ))} -
+ {!fetching && !evaluatorConfigs.length ? ( + + ) : ( +
+ {filtered.map((item, ix) => ( + { + setEditIndex(ix) + setNewEvalModalOpen(true) + }} + onSuccessDelete={fetcher} + /> + ))} +
+ )}
{ } // Evaluations -const evaluationTransformer = (item: any) => ({ - id: item.id, - appId: item.app_id, - created_at: item.created_at, - updated_at: item.updated_at, - duration: dayjs( - [EvaluationStatus.STARTED, EvaluationStatus.INITIALIZED].includes(item.status) - ? Date.now() - : item.updated_at, - ).diff(dayjs(item.created_at), "milliseconds"), - status: item.status, - testset: { - id: item.testset_id, - name: item.testset_name, - }, - user: { - id: item.user_id, - username: item.user_username, - }, - variants: item.variant_ids.map((id: string, ix: number) => ({ - variantId: id, - variantName: item.variant_names[ix], - })), - aggregated_results: item.aggregated_results || [], -}) +const evaluationTransformer = (item: any) => { + const res = { + id: item.id, + appId: item.app_id, + created_at: item.created_at, + updated_at: item.updated_at, + status: item.status, + testset: { + id: item.testset_id, + name: item.testset_name, + }, + user: { + id: item.user_id, + username: item.user_username, + }, + variants: item.variant_ids.map((id: string, ix: number) => ({ + variantId: id, + variantName: item.variant_names[ix], + })), + aggregated_results: item.aggregated_results || [], + } + + ;(res as _Evaluation).duration = calcEvalDuration(res) + return res +} export const fetchAllEvaluations = async (appId: string) => { const response = await axios.get(`/api/evaluations/`, {params: {app_id: appId}}) From e4b0eab4fea85a5399df54032d00da3a172f69b8 Mon Sep 17 00:00:00 2001 From: Akrem Abayed Date: Tue, 2 Jan 2024 17:46:45 +0100 Subject: [PATCH 226/414] fix delete human eval --- .../routers/human_evaluation_router.py | 2 +- .../services/evaluation_service.py | 20 +++++++++++++++++++ 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/agenta-backend/agenta_backend/routers/human_evaluation_router.py b/agenta-backend/agenta_backend/routers/human_evaluation_router.py index 3bd37b7c67..9fc2e2dacb 100644 --- a/agenta-backend/agenta_backend/routers/human_evaluation_router.py +++ b/agenta-backend/agenta_backend/routers/human_evaluation_router.py @@ -307,7 +307,7 @@ async def delete_evaluations( # Get user and organization id user_org_data: dict = await get_user_and_org_id(request.state.user_id) - await evaluation_service.delete_evaluations( + await evaluation_service.delete_human_evaluations( delete_evaluations.evaluations_ids, **user_org_data ) return Response(status_code=status.HTTP_204_NO_CONTENT) diff --git a/agenta-backend/agenta_backend/services/evaluation_service.py b/agenta-backend/agenta_backend/services/evaluation_service.py index a36d133858..93250002a7 100644 --- a/agenta-backend/agenta_backend/services/evaluation_service.py +++ b/agenta-backend/agenta_backend/services/evaluation_service.py @@ -610,6 +610,26 @@ async def fetch_human_evaluation( return await converters.human_evaluation_db_to_pydantic(evaluation) +async def delete_human_evaluations( + evaluation_ids: List[str], **user_org_data: dict +) -> None: + """ + Delete evaluations by their IDs. + + Args: + evaluation_ids (List[str]): A list of evaluation IDs. + user_org_data (dict): User and organization data. + + Raises: + HTTPException: If evaluation not found or access denied. + """ + for evaluation_id in evaluation_ids: + evaluation = await _fetch_human_evaluation_and_check_access( + evaluation_id=evaluation_id, **user_org_data + ) + await engine.delete(evaluation) + + async def delete_evaluations(evaluation_ids: List[str], **user_org_data: dict) -> None: """ Delete evaluations by their IDs. 
From 42f84922d8645e1360ba785198614b7f2c1bee33 Mon Sep 17 00:00:00 2001 From: Akrem Abayed Date: Tue, 2 Jan 2024 17:59:10 +0100 Subject: [PATCH 227/414] fix 500 on single model result --- .../agenta_backend/routers/human_evaluation_router.py | 2 -- agenta-backend/agenta_backend/services/results_service.py | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/agenta-backend/agenta_backend/routers/human_evaluation_router.py b/agenta-backend/agenta_backend/routers/human_evaluation_router.py index 9fc2e2dacb..caa3d3373e 100644 --- a/agenta-backend/agenta_backend/routers/human_evaluation_router.py +++ b/agenta-backend/agenta_backend/routers/human_evaluation_router.py @@ -273,12 +273,10 @@ async def fetch_results( """ # Get user and organization id - print("are we here") user_org_data: dict = await get_user_and_org_id(request.state.user_id) evaluation = await evaluation_service._fetch_human_evaluation_and_check_access( evaluation_id, **user_org_data ) - print("really???") if evaluation.evaluation_type == EvaluationType.human_a_b_testing: results = await results_service.fetch_results_for_evaluation(evaluation) return {"votes_data": results} diff --git a/agenta-backend/agenta_backend/services/results_service.py b/agenta-backend/agenta_backend/services/results_service.py index 9876c6ac29..101b56fb1e 100644 --- a/agenta-backend/agenta_backend/services/results_service.py +++ b/agenta-backend/agenta_backend/services/results_service.py @@ -74,7 +74,7 @@ async def _compute_stats_for_human_a_b_testing_evaluation(evaluation_scenarios: return results -async def fetch_results_for_auto_ai_critique(evaluation_id: str): +async def fetch_results_for_single_model_test(evaluation_id: str): pipeline = [ {"$match": {"evaluations": ObjectId(evaluation_id)}}, {"$group": {"_id": "$score", "count": {"$sum": 1}}}, From 3ff30b210b12a8ea6ad79cec49e9672194ea091c Mon Sep 17 00:00:00 2001 From: Akrem Abayed Date: Tue, 2 Jan 2024 18:29:07 +0100 Subject: [PATCH 228/414] fix results for single model --- agenta-backend/agenta_backend/services/results_service.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/agenta-backend/agenta_backend/services/results_service.py b/agenta-backend/agenta_backend/services/results_service.py index 101b56fb1e..a0d34eea5c 100644 --- a/agenta-backend/agenta_backend/services/results_service.py +++ b/agenta-backend/agenta_backend/services/results_service.py @@ -81,7 +81,7 @@ async def fetch_results_for_single_model_test(evaluation_id: str): ] results = {} - collection = engine.get_collection(EvaluationScenarioDB) + collection = engine.get_collection(HumanEvaluationScenarioDB) aggregation_cursor = await collection.aggregate(pipeline).to_list(length=None) for doc in aggregation_cursor: results[doc["_id"]] = doc["count"] From 1c6ced490e5b20366e208b4a680f12e62fcb73f5 Mon Sep 17 00:00:00 2001 From: Akrem Abayed Date: Tue, 2 Jan 2024 21:54:41 +0100 Subject: [PATCH 229/414] add a direct-use attribute for evaluators --- .../agenta_backend/models/api/evaluation_model.py | 1 + .../agenta_backend/resources/evaluators/evaluators.json | 8 ++++++++ 2 files changed, 9 insertions(+) diff --git a/agenta-backend/agenta_backend/models/api/evaluation_model.py b/agenta-backend/agenta_backend/models/api/evaluation_model.py index 88c3755900..b701e85250 100644 --- a/agenta-backend/agenta_backend/models/api/evaluation_model.py +++ b/agenta-backend/agenta_backend/models/api/evaluation_model.py @@ -8,6 +8,7 @@ class Evaluator(BaseModel): name: str key: str + direct_use: bool settings_template: 
dict diff --git a/agenta-backend/agenta_backend/resources/evaluators/evaluators.json b/agenta-backend/agenta_backend/resources/evaluators/evaluators.json index 82f1e2b42a..80d6bc28a1 100644 --- a/agenta-backend/agenta_backend/resources/evaluators/evaluators.json +++ b/agenta-backend/agenta_backend/resources/evaluators/evaluators.json @@ -2,6 +2,7 @@ { "name": "Exact Match", "key": "auto_exact_match", + "direct_use": true, "settings_template": { "label": "Exact Match Settings", "description": "Settings for the Exact Match evaluator" @@ -10,6 +11,7 @@ { "name": "Similarity Match", "key": "auto_similarity_match", + "direct_use": false, "settings_template": { "similarity_threshold": { "label": "Similarity Threshold", @@ -22,6 +24,7 @@ { "name": "Regex Test", "key": "auto_regex_test", + "direct_use": false, "settings_template": { "regex_pattern": { "label": "Regex Pattern", @@ -40,6 +43,7 @@ { "name": "AI Critique", "key": "auto_ai_critique", + "direct_use": false, "settings_template": { "prompt_template": { "label": "Prompt Template", @@ -52,6 +56,7 @@ { "name": "Code Evaluation", "key": "auto_custom_code_run", + "direct_use": false, "settings_template": { "code": { "label": "Evaluation Code", @@ -64,6 +69,7 @@ { "name": "Webhook test", "key": "auto_webhook_test", + "direct_use": false, "settings_template": { "webhook_url": { "label": "Webhook URL", @@ -76,6 +82,7 @@ { "name": "A/B Test", "key": "human_a_b_testing", + "direct_use": false, "settings_template": { "label": "A/B Testing Settings", "description": "Settings for A/B testing configurations" @@ -84,6 +91,7 @@ { "name": "Single Model Test", "key": "human_single_model_test", + "direct_use": false, "settings_template": { "label": "Single Model Testing Settings", "description": "Settings for single model testing configurations" From 7a32360308eab2cdd3c02f8a7f65c6cf44c770a4 Mon Sep 17 00:00:00 2001 From: Akrem Abayed Date: Tue, 2 Jan 2024 22:31:01 +0100 Subject: [PATCH 230/414] add logic to failed evals --- .../models/api/evaluation_model.py | 2 +- .../agenta_backend/services/db_manager.py | 24 +++ .../agenta_backend/tasks/evaluations.py | 199 ++++++++++-------- agenta-web/src/lib/Types.ts | 2 +- 4 files changed, 137 insertions(+), 90 deletions(-) diff --git a/agenta-backend/agenta_backend/models/api/evaluation_model.py b/agenta-backend/agenta_backend/models/api/evaluation_model.py index b701e85250..f4138d8ebb 100644 --- a/agenta-backend/agenta_backend/models/api/evaluation_model.py +++ b/agenta-backend/agenta_backend/models/api/evaluation_model.py @@ -45,7 +45,7 @@ class EvaluationStatusEnum(str, Enum): EVALUATION_INITIALIZED = "EVALUATION_INITIALIZED" EVALUATION_STARTED = "EVALUATION_STARTED" EVALUATION_FINISHED = "EVALUATION_FINISHED" - EVALUATION_ERROR = "EVALUATION_ERROR" + EVALUATION_FAILED = "EVALUATION_FAILED" class EvaluationScenarioStatusEnum(str, Enum): diff --git a/agenta-backend/agenta_backend/services/db_manager.py b/agenta-backend/agenta_backend/services/db_manager.py index 172ce580a4..ad4cb1c014 100644 --- a/agenta-backend/agenta_backend/services/db_manager.py +++ b/agenta-backend/agenta_backend/services/db_manager.py @@ -1855,3 +1855,27 @@ async def delete_evaluator_config(evaluator_config_id: str) -> bool: ) # checking if delete_result is None (has been deleted) except Exception as e: raise e + + +async def update_evaluation( + evaluation_id: str, updates: Dict[str, Any] +) -> EvaluationDB: + """ + Update an evaluator configuration in the database with the provided id. 
+ + Arguments: + evaluation_id (str): The ID of the evaluator configuration to be updated. + updates (Dict[str, Any]): The updates to apply to the evaluator configuration. + + Returns: + EvaluatorConfigDB: The updated evaluator configuration object. + """ + evaluation = await engine.find_one( + EvaluationDB, EvaluationDB.id == ObjectId(evaluation_id) + ) + + for key, value in updates.items(): + if key in evaluation.__fields__: + setattr(evaluation, key, value) + await engine.save(evaluation) + return evaluation diff --git a/agenta-backend/agenta_backend/tasks/evaluations.py b/agenta-backend/agenta_backend/tasks/evaluations.py index 81a86ca5d0..35ba7244a0 100644 --- a/agenta-backend/agenta_backend/tasks/evaluations.py +++ b/agenta-backend/agenta_backend/tasks/evaluations.py @@ -13,6 +13,7 @@ fetch_testset_by_id, create_new_evaluation_scenario, fetch_evaluator_config_by_appId, + update_evaluation, update_evaluation_with_aggregated_results, ) from agenta_backend.models.db_models import ( @@ -32,107 +33,129 @@ def evaluate( app_data: dict, new_evaluation_data: dict, evaluation_id: str, testset_id: str ): loop = asyncio.get_event_loop() - app = AppDB(**app_data) - evaluation = NewEvaluation(**new_evaluation_data) - testset = loop.run_until_complete(fetch_testset_by_id(testset_id)) - new_evaluation_db = loop.run_until_complete(fetch_evaluation_by_id(evaluation_id)) - evaluators_aggregated_data = defaultdict(list) + try: + app = AppDB(**app_data) + evaluation = NewEvaluation(**new_evaluation_data) - variant_id = str(evaluation.variant_ids[0]) - - app_variant_db = loop.run_until_complete(fetch_app_variant_by_id(variant_id)) - deployment = loop.run_until_complete( - get_deployment_by_objectid(app_variant_db.base.deployment) - ) + testset = loop.run_until_complete(fetch_testset_by_id(testset_id)) + new_evaluation_db = loop.run_until_complete( + fetch_evaluation_by_id(evaluation_id) + ) + evaluators_aggregated_data = defaultdict(list) - # TODO: remove if abraham's fix is working - uri = deployment.uri.replace("http://localhost", "http://host.docker.internal") + variant_id = str(evaluation.variant_ids[0]) - for data_point in testset.csvdata: - # 1. We prepare the inputs - raw_inputs = ( - app_variant_db.parameters.get("inputs", []) - if app_variant_db.parameters - else [] + app_variant_db = loop.run_until_complete(fetch_app_variant_by_id(variant_id)) + deployment = loop.run_until_complete( + get_deployment_by_objectid(app_variant_db.base.deployment) ) - inputs = [] - if raw_inputs: - inputs = [ - EvaluationScenarioInputDB( - name=input_item["name"], - type="text", - value=data_point[input_item["name"]], - ) - for input_item in raw_inputs - ] - - #!NOTE: do not remove! this will be used in github workflow! - backend_environment = os.environ.get("ENVIRONMENT") - if backend_environment is not None and backend_environment == "github": - uri = f"http://{deployment.container_name}" - else: - uri = deployment.uri.replace( - "http://localhost", "http://host.docker.internal" - ) - # 2. We get the output from the llm app - variant_output = llm_apps_service.get_llm_app_output(uri, data_point) - - # 3. 
We evaluate - evaluators_results: [EvaluationScenarioResult] = [] - for evaluator_config_id in evaluation.evaluators_configs: - evaluator_config = loop.run_until_complete( - fetch_evaluator_config(evaluator_config_id) - ) - additional_kwargs = ( - { - "app_params": app_variant_db.config.parameters, - "inputs": data_point, # TODO: fetch input from config parameters when #1102 has been fixed - } - if evaluator_config.evaluator_key == "custom_code_run" - else {} - ) - result = evaluators_service.evaluate( - evaluator_config.evaluator_key, - variant_output, - data_point["correct_answer"], - evaluator_config.settings_values, - **additional_kwargs, + # TODO: remove if abraham's fix is working + uri = deployment.uri.replace("http://localhost", "http://host.docker.internal") + + for data_point in testset.csvdata: + # 1. We prepare the inputs + raw_inputs = ( + app_variant_db.parameters.get("inputs", []) + if app_variant_db.parameters + else [] ) + inputs = [] + if raw_inputs: + inputs = [ + EvaluationScenarioInputDB( + name=input_item["name"], + type="text", + value=data_point[input_item["name"]], + ) + for input_item in raw_inputs + ] + + #!NOTE: do not remove! this will be used in github workflow! + backend_environment = os.environ.get("ENVIRONMENT") + if backend_environment is not None and backend_environment == "github": + uri = f"http://{deployment.container_name}" + else: + uri = deployment.uri.replace( + "http://localhost", "http://host.docker.internal" + ) + + # 2. We get the output from the llm app + try: + variant_output = llm_apps_service.get_llm_app_output(uri, data_point) + except Exception as e: + print(f"Error getting variant output: {e}") + loop.run_until_complete( + update_evaluation(evaluation_id, {"status": "EVALUATION_FAILED"}) + ) + return - result_object = EvaluationScenarioResult( - evaluator_config=evaluator_config.id, - result=result, + # 3. We evaluate + evaluators_results: [EvaluationScenarioResult] = [] + for evaluator_config_id in evaluation.evaluators_configs: + evaluator_config = loop.run_until_complete( + fetch_evaluator_config(evaluator_config_id) + ) + + additional_kwargs = ( + { + "app_params": app_variant_db.config.parameters, + "inputs": data_point, # TODO: fetch input from config parameters when #1102 has been fixed + } + if evaluator_config.evaluator_key == "custom_code_run" + else {} + ) + result = evaluators_service.evaluate( + evaluator_config.evaluator_key, + variant_output, + data_point["correct_answer"], + evaluator_config.settings_values, + **additional_kwargs, + ) + + result_object = EvaluationScenarioResult( + evaluator_config=evaluator_config.id, + result=result, + ) + evaluators_results.append(result_object) + evaluators_aggregated_data[evaluator_config.evaluator_key].append( + result + ) + + # 4. We create a new evaluation scenario + evaluation_scenario = loop.run_until_complete( + create_new_evaluation_scenario( + user=app.user, + organization=app.organization, + evaluation=new_evaluation_db, + variant_id=variant_id, + evaluators_configs=new_evaluation_db.evaluators_configs, + inputs=inputs, + is_pinned=False, + note="", + correct_answer=data_point["correct_answer"], + outputs=[ + EvaluationScenarioOutputDB(type="text", value=variant_output) + ], + results=evaluators_results, + ) ) - evaluators_results.append(result_object) - evaluators_aggregated_data[evaluator_config.evaluator_key].append(result) - - # 4. 
We create a new evaluation scenario - evaluation_scenario = loop.run_until_complete( - create_new_evaluation_scenario( - user=app.user, - organization=app.organization, - evaluation=new_evaluation_db, - variant_id=variant_id, - evaluators_configs=new_evaluation_db.evaluators_configs, - inputs=inputs, - is_pinned=False, - note="", - correct_answer=data_point["correct_answer"], - outputs=[EvaluationScenarioOutputDB(type="text", value=variant_output)], - results=evaluators_results, + + aggregated_results = loop.run_until_complete( + aggregate_evaluator_results(app, evaluators_aggregated_data) + ) + updated_evaluation = loop.run_until_complete( + update_evaluation_with_aggregated_results( + new_evaluation_db.id, aggregated_results ) ) - aggregated_results = loop.run_until_complete( - aggregate_evaluator_results(app, evaluators_aggregated_data) - ) - updated_evaluation = loop.run_until_complete( - update_evaluation_with_aggregated_results( - new_evaluation_db.id, aggregated_results + except Exception as e: + print(f"An error occurred during evaluation: {e}") + loop.run_until_complete( + update_evaluation(evaluation_id, {"status": "EVALUATION_FAILED"}) ) - ) async def aggregate_evaluator_results( diff --git a/agenta-web/src/lib/Types.ts b/agenta-web/src/lib/Types.ts index 2f33533ee0..c59222d394 100644 --- a/agenta-web/src/lib/Types.ts +++ b/agenta-web/src/lib/Types.ts @@ -334,7 +334,7 @@ export enum EvaluationStatus { INITIALIZED = "EVALUATION_INITIALIZED", STARTED = "EVALUATION_STARTED", FINISHED = "EVALUATION_FINISHED", - ERROR = "EVALUATION_ERROR", + ERROR = "EVALUATION_FAILED", } export interface _Evaluation { From c76cc3d2598d031235cba04601e22cd353abcc7d Mon Sep 17 00:00:00 2001 From: MohammedMaaz Date: Wed, 3 Jan 2024 02:31:59 +0500 Subject: [PATCH 231/414] ui fixes | imporvements | refactoring --- .../Evaluations/AutomaticEvaluationResult.tsx | 290 ++++++++++++++++++ .../components/Evaluations/Evaluations.tsx | 19 +- .../Evaluations/HumanEvaluationResult.tsx | 21 +- .../cellRenderers/cellRenderers.tsx | 149 +++++++++ .../evaluationCompare/EvaluationCompare.tsx | 126 ++++---- .../evaluationResults/EvaluationResults.tsx | 245 ++++----------- .../EvaluationScenarios.tsx | 21 +- .../evaluations/evaluators/EvaluatorCard.tsx | 12 +- .../evaluators/NewEvaluatorModal.tsx | 22 +- agenta-web/src/lib/Types.ts | 1 + agenta-web/src/lib/helpers/dateTimeHelper.ts | 19 +- agenta-web/src/lib/helpers/evaluate.ts | 14 + agenta-web/src/lib/services/api.ts | 36 +-- agenta-web/src/services/evaluations/index.ts | 2 - 14 files changed, 665 insertions(+), 312 deletions(-) create mode 100644 agenta-web/src/components/Evaluations/AutomaticEvaluationResult.tsx create mode 100644 agenta-web/src/components/pages/evaluations/cellRenderers/cellRenderers.tsx diff --git a/agenta-web/src/components/Evaluations/AutomaticEvaluationResult.tsx b/agenta-web/src/components/Evaluations/AutomaticEvaluationResult.tsx new file mode 100644 index 0000000000..3248322f78 --- /dev/null +++ b/agenta-web/src/components/Evaluations/AutomaticEvaluationResult.tsx @@ -0,0 +1,290 @@ +import {deleteEvaluations, fetchEvaluationResults, loadEvaluations} from "@/lib/services/api" +import {Button, Collapse, Statistic, Table, Typography} from "antd" +import {useRouter} from "next/router" +import {useEffect, useState} from "react" +import {ColumnsType} from "antd/es/table" +import {Evaluation, GenericObject} from "@/lib/Types" +import {DeleteOutlined} from "@ant-design/icons" +import {EvaluationFlow, EvaluationType} from "@/lib/enums" +import 
{createUseStyles} from "react-jss" +import {useAppTheme} from "../Layout/ThemeContextProvider" +import {calculateResultsDataAvg} from "@/lib/helpers/evaluate" +import {fromEvaluationResponseToEvaluation} from "@/lib/transformers" + +interface EvaluationListTableDataType { + key: string + variants: string[] + testset: { + _id: string + name: string + } + evaluationType: string + status: EvaluationFlow + scoresData: { + nb_of_rows: number + wrong?: GenericObject[] + correct?: GenericObject[] + true?: GenericObject[] + false?: GenericObject[] + variant: string[] + } + avgScore: number + custom_code_eval_id: string + resultsData: {[key: string]: number} + createdAt: string +} + +type StyleProps = { + themeMode: "dark" | "light" +} + +const useStyles = createUseStyles({ + container: { + marginBottom: 20, + "& svg": { + color: "red", + }, + }, + collapse: ({themeMode}: StyleProps) => ({ + margin: "10px 0", + "& .ant-collapse-header": { + alignItems: "center !important", + padding: "0px 20px !important", + borderTopLeftRadius: "10px !important", + borderTopRightRadius: "10px !important", + background: themeMode === "dark" ? "#1d1d1d" : "#f8f8f8", + }, + }), + stat: { + "& .ant-statistic-content-value": { + fontSize: 20, + color: "#1677ff", + }, + "& .ant-statistic-content-suffix": { + fontSize: 20, + color: "#1677ff", + }, + }, +}) + +const {Title} = Typography + +export default function AutomaticEvaluationResult() { + const router = useRouter() + const [evaluationsList, setEvaluationsList] = useState([]) + const [selectedRowKeys, setSelectedRowKeys] = useState([]) + const [selectionType] = useState<"checkbox" | "radio">("checkbox") + const {appTheme} = useAppTheme() + const classes = useStyles({themeMode: appTheme} as StyleProps) + + const app_id = router.query.app_id?.toString() || "" + + useEffect(() => { + if (!app_id) { + return + } + + const fetchEvaluations = async () => { + try { + const evals: Evaluation[] = (await loadEvaluations(app_id)).map( + fromEvaluationResponseToEvaluation, + ) + const results = await Promise.all(evals.map((e) => fetchEvaluationResults(e.id))) + const newEvals = results.map((result, ix) => { + const item = evals[ix] + if ([EvaluationType.single_model_test].includes(item.evaluationType)) { + return { + key: item.id, + createdAt: item.createdAt, + variants: item.variants, + scoresData: result.scores_data, + evaluationType: item.evaluationType, + status: item.status, + testset: item.testset, + custom_code_eval_id: item.evaluationTypeSettings.customCodeEvaluationId, + resultsData: result.results_data, + avgScore: result.avg_score, + } + } + }) + + setEvaluationsList( + newEvals + .filter((evaluation) => evaluation !== undefined) + .filter( + (item: any) => + item.resultsData !== undefined || + !(Object.keys(item.scoresData || {}).length === 0) || + item.avgScore !== undefined, + ) as any, + ) + } catch (error) { + console.error(error) + } + } + + fetchEvaluations() + }, [app_id]) + + const onCompleteEvaluation = (evaluation: any) => { + // TODO: improve type + const evaluationType = + EvaluationType[evaluation.evaluationType as keyof typeof EvaluationType] + + if (evaluationType === EvaluationType.single_model_test) { + router.push(`/apps/${app_id}/evaluations/${evaluation.key}/single_model_test`) + } + } + + const columns: ColumnsType = [ + { + title: "Variant", + dataIndex: "variants", + key: "variants", + render: (value) => { + return ( +
+ {value[0].variantName} +
+ ) + }, + }, + { + title: "Test set", + dataIndex: "testsetName", + key: "testsetName", + render: (value: any, record: EvaluationListTableDataType, index: number) => { + return {record.testset.name} + }, + }, + { + title: "Average score", + dataIndex: "averageScore", + key: "averageScore", + render: (value: any, record: EvaluationListTableDataType, index: number) => { + let score = 0 + if (record.scoresData) { + score = + ((record.scoresData.correct?.length || + record.scoresData.true?.length || + 0) / + record.scoresData.nb_of_rows) * + 100 + } else if (record.resultsData) { + const multiplier = { + [EvaluationType.auto_webhook_test]: 100, + [EvaluationType.single_model_test]: 1, + } + score = calculateResultsDataAvg( + record.resultsData, + multiplier[record.evaluationType as keyof typeof multiplier], + ) + score = isNaN(score) ? 0 : score + } else if (record.avgScore) { + score = record.avgScore * 100 + } + + return ( + + + + ) + }, + }, + { + title: "Created at", + dataIndex: "createdAt", + key: "createdAt", + width: "300", + }, + { + title: "Action", + dataIndex: "action", + key: "action", + render: (value: any, record: EvaluationListTableDataType, index: number) => { + let actionText = "View evaluation" + if (record.status !== EvaluationFlow.EVALUATION_FINISHED) { + actionText = "Continue evaluation" + } + return ( +
+ +
+ ) + }, + }, + ] + + const rowSelection = { + onChange: (selectedRowKeys: React.Key[], selectedRows: EvaluationListTableDataType[]) => { + setSelectedRowKeys(selectedRowKeys) + }, + } + + const onDelete = async () => { + const evaluationsIds = selectedRowKeys.map((key) => key.toString()) + try { + await deleteEvaluations(evaluationsIds) + setEvaluationsList((prevEvaluationsList) => + prevEvaluationsList.filter( + (evaluation) => !evaluationsIds.includes(evaluation.key), + ), + ) + + setSelectedRowKeys([]) + } catch { + } finally { + } + } + + const items = [ + { + key: "1", + label: ( +
+ Single Model Test Results +
+ ), + children: ( +
+
+ +
+ +
+ + ), + }, + ] + + return ( + + ) +} diff --git a/agenta-web/src/components/Evaluations/Evaluations.tsx b/agenta-web/src/components/Evaluations/Evaluations.tsx index dc211b90b4..841627e53c 100644 --- a/agenta-web/src/components/Evaluations/Evaluations.tsx +++ b/agenta-web/src/components/Evaluations/Evaluations.tsx @@ -15,7 +15,7 @@ import {DownOutlined} from "@ant-design/icons" import {createNewEvaluation, fetchVariants, useLoadTestsetsList} from "@/lib/services/api" import {dynamicComponent, getApikeys, isDemo} from "@/lib/helpers/utils" import {useRouter} from "next/router" -import {Variant, Parameter, GenericObject} from "@/lib/Types" +import {Variant, Parameter, GenericObject, JSSTheme} from "@/lib/Types" import {EvaluationType} from "@/lib/enums" import {EvaluationTypeLabels} from "@/lib/helpers/utils" import EvaluationErrorModal from "./EvaluationErrorModal" @@ -28,12 +28,13 @@ import {useAppTheme} from "../Layout/ThemeContextProvider" import {createUseStyles} from "react-jss" import HumanEvaluationResult from "./HumanEvaluationResult" import {getErrorMessage} from "@/lib/helpers/errorHandler" +import AutomaticEvaluationResult from "./AutomaticEvaluationResult" type StyleProps = { themeMode: "dark" | "light" } -const useStyles = createUseStyles({ +const useStyles = createUseStyles((theme: JSSTheme) => ({ evaluationContainer: { border: "1px solid lightgrey", padding: "20px", @@ -74,6 +75,17 @@ const useStyles = createUseStyles({ }, radioGroup: { width: "100%", + "& .ant-radio-button-wrapper": { + marginBottom: "0.5rem", + borderRadius: theme.borderRadius, + borderLeft: `1px solid ${theme.colorBorder}`, + "&::before": { + display: "none", + }, + }, + "& .ant-radio-button-wrapper-checked ": { + borderLeft: `1px solid ${theme.colorPrimary}`, + }, }, radioBtn: { display: "block", @@ -117,7 +129,7 @@ const useStyles = createUseStyles({ alignItems: "center", justifyContent: "space-between", }, -}) +})) const {Title} = Typography export default function Evaluations() { @@ -503,6 +515,7 @@ export default function Evaluations() { btnText={error.btnText} />
+
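The Average score column in AutomaticEvaluationResult.tsx above either computes a percentage from `scoresData` (correct or true answers over `nb_of_rows`, times 100) or delegates to `calculateResultsDataAvg` with a per-evaluation-type multiplier. That helper lives in `@/lib/helpers/evaluate` and is not shown in this patch series; a hypothetical sketch of the behaviour the column assumes:

    // Hypothetical sketch, not the actual helper: average the numeric values in
    // resultsData and scale by the multiplier (100 for auto_webhook_test, 1 for
    // single_model_test, matching the column's multiplier map).
    const calculateResultsDataAvgSketch = (
        resultsData: {[key: string]: number},
        multiplier: number = 1,
    ): number => {
        const values = Object.values(resultsData)
        if (!values.length) return NaN
        const avg = values.reduce((sum, v) => sum + v, 0) / values.length
        return avg * multiplier
    }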
diff --git a/agenta-web/src/components/Evaluations/HumanEvaluationResult.tsx b/agenta-web/src/components/Evaluations/HumanEvaluationResult.tsx index 80e8905946..dd4b7f1168 100644 --- a/agenta-web/src/components/Evaluations/HumanEvaluationResult.tsx +++ b/agenta-web/src/components/Evaluations/HumanEvaluationResult.tsx @@ -1,4 +1,9 @@ -import {deleteEvaluations, fetchData} from "@/lib/services/api" +import { + deleteEvaluations, + fetchData, + fetchEvaluationResults, + loadEvaluations, +} from "@/lib/services/api" import {Button, Collapse, Statistic, Table, Typography} from "antd" import {useRouter} from "next/router" import {useEffect, useState} from "react" @@ -10,7 +15,7 @@ import {createUseStyles} from "react-jss" import {formatDate} from "@/lib/helpers/dateTimeHelper" import {useAppTheme} from "../Layout/ThemeContextProvider" import {getVotesPercentage} from "@/lib/helpers/evaluate" -import {EvaluationTypeLabels, getAgentaApiUrl, isDemo} from "@/lib/helpers/utils" +import {getAgentaApiUrl, isDemo} from "@/lib/helpers/utils" interface VariantVotesData { number_of_votes: number @@ -98,16 +103,10 @@ export default function HumanEvaluationResult() { } const fetchEvaluations = async () => { try { - fetchData( - `${getAgentaApiUrl()}/api/human-evaluations/?app_id=${app_id}`, - ) + loadEvaluations(app_id, true) .then((response) => { const fetchPromises = response.map((item: EvaluationResponseType) => { - return fetchData( - `${getAgentaApiUrl()}/api/human-evaluations/${ - item.id - }/results/`, - ) + return fetchEvaluationResults(item.id, true) .then((results) => { if (item.evaluation_type === EvaluationType.human_a_b_testing) { if (Object.keys(results.votes_data).length > 0) { @@ -290,7 +289,7 @@ export default function HumanEvaluationResult() { key: "1", label: (
- Annotation Results + A/B Test Results
), children: ( diff --git a/agenta-web/src/components/pages/evaluations/cellRenderers/cellRenderers.tsx b/agenta-web/src/components/pages/evaluations/cellRenderers/cellRenderers.tsx new file mode 100644 index 0000000000..f2263c3cbb --- /dev/null +++ b/agenta-web/src/components/pages/evaluations/cellRenderers/cellRenderers.tsx @@ -0,0 +1,149 @@ +import {useDurationCounter} from "@/hooks/useDurationCounter" +import {EvaluationStatus, JSSTheme, _Evaluation} from "@/lib/Types" +import {CopyOutlined, FullscreenExitOutlined, FullscreenOutlined} from "@ant-design/icons" +import {ICellRendererParams} from "ag-grid-community" +import {GlobalToken, Space, Typography, message, theme} from "antd" +import Link from "next/link" +import React, {useCallback, useEffect, useState} from "react" +import {createUseStyles} from "react-jss" + +const useStyles = createUseStyles((theme: JSSTheme) => ({ + statusCell: { + display: "flex", + alignItems: "center", + gap: "0.25rem", + height: "100%", + marginBottom: 0, + + "& > div:nth-of-type(1)": { + height: 6, + aspectRatio: 1 / 1, + borderRadius: "50%", + }, + }, + dot: { + height: 3, + aspectRatio: 1 / 1, + borderRadius: "50%", + backgroundColor: "#8c8c8c", + marginTop: 2, + }, + date: { + color: "#8c8c8c", + }, + longCell: { + height: "100%", + position: "relative", + overflow: "hidden", + textOverflow: "ellipsis", + whiteSpace: "nowrap", + "& .ant-space": { + position: "absolute", + bottom: 2, + right: 0, + height: 35, + backgroundColor: theme.colorBgContainer, + padding: "0.5rem", + borderRadius: theme.borderRadius, + border: `1px solid ${theme.colorBorder}`, + display: "none", + }, + "&:hover .ant-space": { + display: "inline-flex", + }, + }, +})) + +export function LongTextCellRenderer(params: ICellRendererParams) { + const {value, api, node} = params + const [expanded, setExpanded] = useState( + node.rowHeight !== api.getSizesForCurrentTheme().rowHeight, + ) + const classes = useStyles() + + const onCopy = useCallback(() => { + navigator.clipboard + .writeText(value as string) + .then(() => { + message.success("Copied to clipboard") + }) + .catch(console.error) + }, []) + + const onExpand = useCallback(() => { + node.setRowHeight(api.getSizesForCurrentTheme().rowHeight * (expanded ? 1 : 5)) + api.onRowHeightChanged() + }, [expanded]) + + useEffect(() => { + node.addEventListener("heightChanged", () => { + setExpanded(node.rowHeight !== api.getSizesForCurrentTheme().rowHeight) + }) + }, []) + + return ( +
+ {value} + + {expanded ? ( + + ) : ( + + )} + + +
+ ) +} + +export const runningStatuses = [EvaluationStatus.INITIALIZED, EvaluationStatus.STARTED] +export const statusMapper = (token: GlobalToken) => ({ + [EvaluationStatus.INITIALIZED]: { + label: "Queued", + color: token.colorTextSecondary, + }, + [EvaluationStatus.STARTED]: { + label: "Running", + color: token.colorWarning, + }, + [EvaluationStatus.FINISHED]: { + label: "Completed", + color: token.colorSuccess, + }, + [EvaluationStatus.ERROR]: { + label: "Failed", + color: token.colorError, + }, +}) +export const StatusRenderer = React.memo( + (params: ICellRendererParams<_Evaluation>) => { + const classes = useStyles() + const {token} = theme.useToken() + const duration = useDurationCounter( + params.data?.duration || 0, + runningStatuses.includes(params.value), + ) + const {label, color} = statusMapper(token)[params.value as EvaluationStatus] + + return ( + +
+ {label} + + {duration} + + ) + }, + (prev, next) => prev.value === next.value && prev.data?.duration === next.data?.duration, +) + +export const LinkCellRenderer = React.memo( + (params: ICellRendererParams & {href: string}) => { + const {value, href} = params + return {value} + }, + (prev, next) => prev.value === next.value && prev.href === next.href, +) diff --git a/agenta-web/src/components/pages/evaluations/evaluationCompare/EvaluationCompare.tsx b/agenta-web/src/components/pages/evaluations/evaluationCompare/EvaluationCompare.tsx index e09f252d3e..d51ad74b0b 100644 --- a/agenta-web/src/components/pages/evaluations/evaluationCompare/EvaluationCompare.tsx +++ b/agenta-web/src/components/pages/evaluations/evaluationCompare/EvaluationCompare.tsx @@ -7,16 +7,13 @@ import {AgGridReact} from "ag-grid-react" import {Space, Spin, Tag, Tooltip, Typography} from "antd" import React, {useEffect, useMemo, useRef, useState} from "react" import {createUseStyles} from "react-jss" -import { - LongTextCellRenderer, - getFilterParams, - getTypedValue, -} from "../evaluationResults/EvaluationResults" +import {getFilterParams, getTypedValue} from "../evaluationResults/EvaluationResults" import {uniqBy} from "lodash" import {getTagColors} from "@/lib/helpers/colors" import {DownloadOutlined} from "@ant-design/icons" import {getAppValues} from "@/contexts/app.context" import {useQueryParam} from "@/hooks/useQuery" +import {LongTextCellRenderer} from "../cellRenderers/cellRenderers" const useStyles = createUseStyles((theme: JSSTheme) => ({ table: { @@ -42,11 +39,16 @@ const EvaluationCompareMode: React.FC = () => { const [fetching, setFetching] = useState(false) const gridRef = useRef>() - const evalautions = useMemo(() => { - return uniqBy( + const [evalautions, variants] = useMemo(() => { + const evalautions = uniqBy( scenarios.map((scenario) => scenario.evaluation), "id", ) + const variants = uniqBy( + evalautions.map((evaluation) => ({...evaluation.variants[0], evaluation})).flat(1), + "variantId", + ) + return [evalautions, variants] }, [scenarios]) const colors = useMemo(() => getTagColors(), [evalautions]) @@ -68,63 +70,61 @@ const EvaluationCompareMode: React.FC = () => { cellRenderer: LongTextCellRenderer, }) - evalautions.forEach((evalaution, vi) => { - evalaution?.variants.forEach((variant, index) => { - scenarios - .find((scenario) => scenario.evaluation.id === evalaution.id) - ?.inputs.forEach((input, index) => { - colDefs.push({ - headerComponent: () => ( - - Input: {input.name} - {variant.variantName} - - ), - minWidth: 200, - flex: 1, - field: `inputs.${index}`, - ...getFilterParams(input.type === "number" ? "number" : "text"), - valueGetter: (params) => { - return getTypedValue(params.data?.inputs[index]) - }, - cellRenderer: LongTextCellRenderer, - }) + variants.forEach((variant, vi) => { + const evalaution = (variant as any).evaluation as _Evaluation + scenarios + .find((scenario) => scenario.evaluation.id === evalaution.id) + ?.inputs.forEach((input, index) => { + colDefs.push({ + headerComponent: () => ( + + Input: {input.name} + {variant.variantName} + + ), + minWidth: 200, + flex: 1, + field: `inputs.${index}`, + ...getFilterParams(input.type === "number" ? 
"number" : "text"), + valueGetter: (params) => { + return getTypedValue(params.data?.inputs[index]) + }, + cellRenderer: LongTextCellRenderer, }) + }) + colDefs.push({ + headerComponent: () => ( + + Output + {variant.variantName} + + ), + minWidth: 280, + flex: 1, + field: `outputs.0`, + ...getFilterParams("text"), + valueGetter: (params) => { + return getTypedValue(params.data?.outputs[0]) + }, + cellRenderer: LongTextCellRenderer, + }) + evalaution.aggregated_results.forEach(({evaluator_config: config}) => { colDefs.push({ + flex: 1, headerComponent: () => ( - Output + Evaluator: {config.name} {variant.variantName} ), - minWidth: 280, - flex: 1, - field: `outputs.${index}`, + field: "results", ...getFilterParams("text"), valueGetter: (params) => { - return getTypedValue(params.data?.outputs[index]) + return getTypedValue( + params.data?.results.find((item) => item.evaluator_config === config.id) + ?.result, + ) }, - cellRenderer: LongTextCellRenderer, - }) - evalaution.aggregated_results.forEach(({evaluator_config: config}) => { - colDefs.push({ - flex: 1, - headerComponent: () => ( - - Evaluator: {config.name} - {variant.variantName} - - ), - field: "results", - ...getFilterParams("text"), - valueGetter: (params) => { - return getTypedValue( - params.data?.results.find( - (item) => item.evaluator_config === config.id, - )?.result, - ) - }, - }) }) }) }) @@ -139,8 +139,8 @@ const EvaluationCompareMode: React.FC = () => { fetchAllEvaluationScenarios(appId, evalId), ), ) - .then((scenariosNest) => { - setScenarios(scenariosNest.flat(1)) + .then((scenariosNest: _EvaluationScenario[][]) => { + setScenarios(uniqBy(scenariosNest.flat(1), "id")) setTimeout(() => { if (!gridRef.current) return @@ -186,19 +186,23 @@ const EvaluationCompareMode: React.FC = () => { Testset: - {evalautions[0]?.testset.name || ""} + + {evalautions[0]?.testset.name || ""} + Variants:
- {evalautions?.map(({variants, id}, vi) => ( + {variants?.map((v, vi) => ( handleDeleteVariant(id)} + onClose={() => handleDeleteVariant((v as any).evaluation.id)} closable > - {variants[0].variantName} + {v.variantName} ))}
diff --git a/agenta-web/src/components/pages/evaluations/evaluationResults/EvaluationResults.tsx b/agenta-web/src/components/pages/evaluations/evaluationResults/EvaluationResults.tsx index 5b7d627acf..d80e57aabd 100644 --- a/agenta-web/src/components/pages/evaluations/evaluationResults/EvaluationResults.tsx +++ b/agenta-web/src/components/pages/evaluations/evaluationResults/EvaluationResults.tsx @@ -1,18 +1,10 @@ -import React, {useCallback, useEffect, useMemo, useRef, useState} from "react" +import React, {useEffect, useMemo, useRef, useState} from "react" import {AgGridReact} from "ag-grid-react" import {useAppTheme} from "@/components/Layout/ThemeContextProvider" -import {ColDef, ICellRendererParams} from "ag-grid-community" +import {ColDef} from "ag-grid-community" import {createUseStyles} from "react-jss" -import {Button, GlobalToken, Space, Spin, Typography, message, theme} from "antd" -import { - CopyOutlined, - DeleteOutlined, - FullscreenExitOutlined, - FullscreenOutlined, - PlusCircleOutlined, - SlidersOutlined, - SwapOutlined, -} from "@ant-design/icons" +import {Button, Space, Spin, Tooltip, theme} from "antd" +import {DeleteOutlined, PlusCircleOutlined, SlidersOutlined, SwapOutlined} from "@ant-design/icons" import {EvaluationStatus, GenericObject, JSSTheme, TypedValue, _Evaluation} from "@/lib/Types" import {capitalize, round, uniqBy} from "lodash" import dayjs from "dayjs" @@ -25,7 +17,12 @@ import {useRouter} from "next/router" import {useUpdateEffect} from "usehooks-ts" import {shortPoll} from "@/lib/helpers/utils" import AlertPopup from "@/components/AlertPopup/AlertPopup" -import {useDurationCounter} from "@/hooks/useDurationCounter" +import { + LinkCellRenderer, + StatusRenderer, + runningStatuses, + statusMapper, +} from "../cellRenderers/cellRenderers" dayjs.extend(relativeTime) dayjs.extend(duration) @@ -42,71 +39,8 @@ const useStyles = createUseStyles((theme: JSSTheme) => ({ marginTop: "1rem", alignSelf: "flex-end", }, - statusCell: { - display: "flex", - alignItems: "center", - gap: "0.25rem", - height: "100%", - marginBottom: 0, - - "& > div:nth-of-type(1)": { - height: 6, - aspectRatio: 1 / 1, - borderRadius: "50%", - }, - }, - dot: { - height: 3, - aspectRatio: 1 / 1, - borderRadius: "50%", - backgroundColor: "#8c8c8c", - marginTop: 2, - }, - date: { - color: "#8c8c8c", - }, - longCell: { - height: "100%", - position: "relative", - overflow: "hidden", - textOverflow: "ellipsis", - whiteSpace: "nowrap", - "& .ant-space": { - position: "absolute", - bottom: 2, - right: 0, - height: 35, - backgroundColor: theme.colorBgContainer, - padding: "0.5rem", - borderRadius: theme.borderRadius, - border: `1px solid ${theme.colorBorder}`, - display: "none", - }, - "&:hover .ant-space": { - display: "inline-flex", - }, - }, })) -const statusMapper = (token: GlobalToken) => ({ - [EvaluationStatus.INITIALIZED]: { - label: "Queued", - color: token.colorTextSecondary, - }, - [EvaluationStatus.STARTED]: { - label: "Running", - color: token.colorWarning, - }, - [EvaluationStatus.FINISHED]: { - label: "Completed", - color: token.colorSuccess, - }, - [EvaluationStatus.ERROR]: { - label: "Failed", - color: token.colorError, - }, -}) - export function getTypedValue(res?: TypedValue) { const {value, type} = res || {} return type === "number" @@ -151,75 +85,6 @@ export function getFilterParams(type: "number" | "text" | "date") { } } -export function LongTextCellRenderer(params: ICellRendererParams) { - const {value, api, node} = params - const [expanded, setExpanded] = useState( - 
node.rowHeight !== api.getSizesForCurrentTheme().rowHeight, - ) - const classes = useStyles() - - const onCopy = useCallback(() => { - navigator.clipboard - .writeText(value as string) - .then(() => { - message.success("Copied to clipboard") - }) - .catch(console.error) - }, []) - - const onExpand = useCallback(() => { - node.setRowHeight(api.getSizesForCurrentTheme().rowHeight * (expanded ? 1 : 5)) - api.onRowHeightChanged() - }, [expanded]) - - useEffect(() => { - node.addEventListener("heightChanged", () => { - setExpanded(node.rowHeight !== api.getSizesForCurrentTheme().rowHeight) - }) - }, []) - - return ( -
- {value} - - {expanded ? ( - - ) : ( - - )} - - -
- ) -} - -const StatusRenderer = React.memo( - (params: ICellRendererParams<_Evaluation>) => { - const classes = useStyles() - const {token} = theme.useToken() - const duration = useDurationCounter( - params.data?.duration || 0, - [EvaluationStatus.STARTED, EvaluationStatus.INITIALIZED].includes(params.value), - ) - const {label, color} = statusMapper(token)[params.value as EvaluationStatus] - - return ( - -
- {label} - - {duration} - - ) - }, - (prev, next) => prev.value === next.value && prev.data?.duration === next.data?.duration, -) - -const runningStatuses = [EvaluationStatus.INITIALIZED, EvaluationStatus.STARTED] - export const calcEvalDuration = (evaluation: _Evaluation) => { return dayjs( runningStatuses.includes(evaluation.status) ? Date.now() : evaluation.updated_at, @@ -318,33 +183,37 @@ const EvaluationResults: React.FC = () => { const colDefs = useMemo(() => { const colDefs: ColDef<_Evaluation>[] = [ { - minWidth: 280, - field: "id", + field: "variants", flex: 1, + minWidth: 160, + pinned: "left", headerCheckboxSelection: true, checkboxSelection: true, showDisabledCheckboxes: true, + cellRenderer: (params: any) => ( + + ), + valueGetter: (params) => params.data?.variants[0].variantName, + headerName: "Variant", + tooltipValueGetter: (params) => params.data?.variants[0].variantName, ...getFilterParams("text"), - pinned: "left", }, { field: "testset.name", + cellRenderer: (params: any) => ( + + ), flex: 1, minWidth: 160, tooltipValueGetter: (params) => params.value, ...getFilterParams("text"), }, - { - field: "variants", - flex: 1, - minWidth: 160, - valueGetter: (params) => - params.data?.variants.map((item) => item.variantName).join(","), - headerName: "Variant(s)", - tooltipValueGetter: (params) => - params.data?.variants.map((item) => item.variantName).join(","), - ...getFilterParams("text"), - }, ...evaluatorConfigs.map( (config) => ({ @@ -390,6 +259,34 @@ const EvaluationResults: React.FC = () => { return colDefs }, [evaluatorConfigs]) + const compareDisabled = useMemo( + () => + selected.length < 2 || + selected.some( + (item) => + runningStatuses.includes(item.status) || + item.testset.id !== selected[0].testset.id, + ), + [selected], + ) + + const compareBtnNode = ( + + ) + return (
@@ -402,27 +299,13 @@ const EvaluationResults: React.FC = () => { > Delete - + {compareDisabled ? ( + + {compareBtnNode} + + ) : ( + compareBtnNode + )} -
-
- ), - }, - ...dynamicColumns, - { - title: "Correct Answer", - dataIndex: "correctAnswer", - key: "correctAnswer", - width: "30%", - - render: (text: any, record: any, rowIndex: number) =>
{record.correctAnswer}
, - }, - { - title: "Evaluation", - dataIndex: "evaluation", - key: "score", - width: 200, - align: "center" as "left" | "right" | "center", - render: (score: string, record: any) => { - if ( - record.evaluationFlow === EvaluationFlow.COMPARISON_RUN_STARTED && - evaluationStatus === EvaluationFlow.EVALUATION_STARTED - ) { - return - } - if ( - record.evaluationFlow === EvaluationFlow.COMPARISON_RUN_STARTED && - evaluationStatus === EvaluationFlow.EVALUATION_FAILED - ) { - return - } - let tagColor = "" - - return ( - - -
- {score !== "" && ( - - {record.score} - - )} -
-
-
- ) - }, - }, - ] - - const onChangeEvaluationPromptTemplate = (e: any) => { - setEvaluationPromptTemplate(e.target.value) - } - - return ( -
- AI Critique Evaluation -
-
- - - -
- -
- - - exportAICritiqueEvaluationData(evaluation, rows)} - disabled={evaluationStatus !== EvaluationFlow.EVALUATION_FINISHED} - > - Export results - - - - - -
-
- {evaluationStatus === EvaluationFlow.EVALUATION_FAILED && ( -
Failed to run evaluation
- )} - - {evaluationStatus === EvaluationFlow.EVALUATION_INITIALIZED && ( -
Run evaluation to see results!
- )} - - {evaluationStatus === EvaluationFlow.EVALUATION_STARTED && } - - {evaluationStatus === EvaluationFlow.EVALUATION_FINISHED && - evaluationResults && - evaluationResults.results_data && ( -
-

Results Data:

- - {Object.entries(evaluationResults.results_data).map( - ([key, value], index) => ( -
- - - - - ), - )} - - - )} - - -
-
- - - ) -} - -export default AICritiqueEvaluationTable From 9eb28bcec858adbf8ced664056afdac867aa0163 Mon Sep 17 00:00:00 2001 From: Abram Date: Wed, 3 Jan 2024 15:14:32 +0100 Subject: [PATCH 240/414] Feat - created llm run rate limit type --- agenta-web/src/lib/Types.ts | 7 +++++++ agenta-web/src/services/evaluations/index.ts | 2 ++ 2 files changed, 9 insertions(+) diff --git a/agenta-web/src/lib/Types.ts b/agenta-web/src/lib/Types.ts index 2f33533ee0..bf40d6f7cc 100644 --- a/agenta-web/src/lib/Types.ts +++ b/agenta-web/src/lib/Types.ts @@ -50,6 +50,13 @@ export interface PlaygroundTabsItem { closable: boolean } +export interface LLMRunRateLimit { + batch_size: number + max_retries: number + retry_delay: number + delay_between_batches: number +} + export interface Evaluation { id: string createdAt: string diff --git a/agenta-web/src/services/evaluations/index.ts b/agenta-web/src/services/evaluations/index.ts index a2cc6fc55e..b2ee464db0 100644 --- a/agenta-web/src/services/evaluations/index.ts +++ b/agenta-web/src/services/evaluations/index.ts @@ -5,6 +5,7 @@ import { EvaluationStatus, Evaluator, EvaluatorConfig, + LLMRunRateLimit, TypedValue, _Evaluation, _EvaluationScenario, @@ -117,6 +118,7 @@ export type CreateEvaluationData = { testset_id: string variant_ids: string[] evaluators_configs: string[] + rate_limit: LLMRunRateLimit } export const createEvalutaiton = async (appId: string, evaluation: CreateEvaluationData) => { return axios.post(`/api/evaluations/`, {...evaluation, app_id: appId}) From a127ec65c6a729f51b441321eb6db5e302d7a225 Mon Sep 17 00:00:00 2001 From: Abram Date: Wed, 3 Jan 2024 15:16:05 +0100 Subject: [PATCH 241/414] Update - remove default values in llm run rate limit api model --- .../agenta_backend/models/api/evaluation_model.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/agenta-backend/agenta_backend/models/api/evaluation_model.py b/agenta-backend/agenta_backend/models/api/evaluation_model.py index 6ac326a290..04340d02a2 100644 --- a/agenta-backend/agenta_backend/models/api/evaluation_model.py +++ b/agenta-backend/agenta_backend/models/api/evaluation_model.py @@ -236,10 +236,10 @@ class EvaluationSettingsTemplate(BaseModel): class LLMRunRateLimit(BaseModel): - batch_size: int = Field(default=10) - max_retries: int = Field(default=3) - retry_delay: int = Field(default=3) - delay_between_batches: int = Field(default=5) + batch_size: int + max_retries: int + retry_delay: int + delay_between_batches: int class NewEvaluation(BaseModel): From 6f35a30364321189bef74c48cf9ace96efffb06c Mon Sep 17 00:00:00 2001 From: Abram Date: Wed, 3 Jan 2024 15:18:43 +0100 Subject: [PATCH 242/414] Update - modified batch_invoke function --- .../services/llm_apps_service.py | 35 +++++++++++++------ 1 file changed, 25 insertions(+), 10 deletions(-) diff --git a/agenta-backend/agenta_backend/services/llm_apps_service.py b/agenta-backend/agenta_backend/services/llm_apps_service.py index ddcde3f940..8aebc83e46 100644 --- a/agenta-backend/agenta_backend/services/llm_apps_service.py +++ b/agenta-backend/agenta_backend/services/llm_apps_service.py @@ -37,34 +37,49 @@ async def get_llm_app_output(uri: str, input: Any) -> AppOutput: async def run_with_retry( uri: str, input_data: Any, max_retry_count: int, retry_delay: int ) -> AppOutput: + retries = 0 + last_exception = None while retries < max_retry_count: try: result = await get_llm_app_output(uri, input_data) return result except (httpx.TimeoutException, httpx.ConnectTimeout, httpx.ConnectError) as e: + 
last_exception = e print(f"Error in evaluation. Retrying in {retry_delay} seconds:", e) await asyncio.sleep(retry_delay) retries += 1 - # If max retries reached, raise the last exception - raise e + # If max retries reached, return the last exception + return AppOutput(output=None, status=str(last_exception)) + +async def batch_invoke( + uri: str, testset_data: List[dict], rate_limit_config: dict +) -> List[AppOutput]: + batch_size = rate_limit_config[ + "batch_size" + ] # Number of testset to make in each batch + max_retries = rate_limit_config[ + "max_retries" + ] # Maximum number of times to retry the failed llm call + retry_delay = rate_limit_config[ + "retry_delay" + ] # Delay before retrying the failed llm call (in seconds) + delay_between_batches = rate_limit_config[ + "delay_between_batches" + ] # Delay between batches (in seconds) -async def batch_invoke(uri: str, testset_data: List[dict]) -> List[AppOutput]: - batch_size = 10 # Number of evaluations to make in each batch - max_retries = 3 # Maximum number of times to retry a failed evaluation - retry_delay = 3 # Delay before retrying a failed evaluation (in seconds) - delay_between_batches = 5 # Delay between batches (in seconds) - list_of_app_outputs: List[AppOutput] = [] # Outputs after running all batches + list_of_app_outputs: List[AppOutput] = [] # Outputs after running all batches async def run_batch(start_idx: int): print(f"Preparing {start_idx} batch...") end_idx = min(start_idx + batch_size, len(testset_data)) for index in range(start_idx, end_idx): - print(f"Running datapoint(s) in {start_idx} batch...") try: - batch_output: AppOutput = await run_with_retry(uri, testset_data[index], max_retries, retry_delay) + batch_output: AppOutput = await run_with_retry( + uri, testset_data[index], max_retries, retry_delay + ) list_of_app_outputs.append(batch_output) print(f"Adding outputs to batch {start_idx}") except Exception as exc: From a28be4852d8283751e986d4ad3f311f51a859ad0 Mon Sep 17 00:00:00 2001 From: Abram Date: Wed, 3 Jan 2024 15:19:11 +0100 Subject: [PATCH 243/414] Update - added extra inputs for evaluation run rate limit --- .../evaluationResults/NewEvaluationModal.tsx | 117 +++++++++++++++++- 1 file changed, 113 insertions(+), 4 deletions(-) diff --git a/agenta-web/src/components/pages/evaluations/evaluationResults/NewEvaluationModal.tsx b/agenta-web/src/components/pages/evaluations/evaluationResults/NewEvaluationModal.tsx index 1b34e34c5a..5ea6dcd61c 100644 --- a/agenta-web/src/components/pages/evaluations/evaluationResults/NewEvaluationModal.tsx +++ b/agenta-web/src/components/pages/evaluations/evaluationResults/NewEvaluationModal.tsx @@ -1,10 +1,10 @@ import {useAppId} from "@/hooks/useAppId" -import {JSSTheme, Variant, testset} from "@/lib/Types" +import {JSSTheme, Variant, LLMRunRateLimit, testset} from "@/lib/Types" import {evaluatorConfigsAtom, evaluatorsAtom} from "@/lib/atoms/evaluation" import {fetchTestsets, fetchVariants} from "@/lib/services/api" import {CreateEvaluationData, createEvalutaiton} from "@/services/evaluations" -import {PlusOutlined} from "@ant-design/icons" -import {Divider, Form, Modal, Select, Spin, Tag, Typography} from "antd" +import {PlusOutlined, QuestionCircleOutlined} from "@ant-design/icons" +import {Divider, Form, Modal, Select, Spin, Tag, Typography, InputNumber, Row, Col, Tooltip} from "antd" import dayjs from "dayjs" import {useAtom} from "jotai" import Image from "next/image" @@ -75,9 +75,20 @@ const NewEvaluationModal: React.FC = ({onSuccess, ...props}) => { .finally(() => 
setFetching(false)) }, [props.open, appId]) + const [rateLimitValues, setRateLimitValues] = useState({ + batch_size: 10, + max_retries: 3, + retry_delay: 3, + delay_between_batches: 5, + }) + const onRateLimitInputChange = (field: keyof LLMRunRateLimit, value: number) => { + setRateLimitValues((prevValues: any) => ({ ...prevValues, [field]: value })); + } + const onSubmit = (values: CreateEvaluationData) => { setSubmitLoading(true) - createEvalutaiton(appId, values) + const EvaluationRateLimit: LLMRunRateLimit = rateLimitValues + createEvalutaiton(appId, {...values, rate_limit: EvaluationRateLimit}) .then(onSuccess) .catch(console.error) .finally(() => setSubmitLoading(false)) @@ -176,6 +187,104 @@ const NewEvaluationModal: React.FC = ({onSuccess, ...props}) => { })} + + + + + + + Batch Size  + + + + + } + name="batch_size" + style={{ marginBottom: '0' }} + rules={[{required: true, message: "This field is required"}]} + > + + onRateLimitInputChange('batch_size', value) + } + style={{ width: '100%' }} + /> + + + + + Max Retries  + + + + + } + name="max_retries" + rules={[{required: true, message: "This field is required"}]} + > + + onRateLimitInputChange('max_retries', value) + } + style={{ width: '100%' }} + /> + + + + + Retry Delay  + + + + + } + name="retry_delay" + rules={[{required: true, message: "This field is required"}]} + > + + onRateLimitInputChange('retry_delay', value) + } + style={{ width: '100%' }} + /> + + + + + Delay Between Batches  + + + + + } + name="delay_between_batches" + rules={[{required: true, message: "This field is required"}]} + > + + onRateLimitInputChange('delay_between_batches', value) + } + style={{ width: '100%' }} + /> + + + + From 73a4653de6844f57fb3621b420704b6b41fefc2c Mon Sep 17 00:00:00 2001 From: Abram Date: Wed, 3 Jan 2024 15:19:58 +0100 Subject: [PATCH 244/414] Update - pass in rate_limit api model dict --- .../routers/evaluation_router.py | 6 +- .../agenta_backend/tasks/evaluations.py | 72 ++++++++++--------- 2 files changed, 44 insertions(+), 34 deletions(-) diff --git a/agenta-backend/agenta_backend/routers/evaluation_router.py b/agenta-backend/agenta_backend/routers/evaluation_router.py index ce38e767f9..b5e0b143d6 100644 --- a/agenta-backend/agenta_backend/routers/evaluation_router.py +++ b/agenta-backend/agenta_backend/routers/evaluation_router.py @@ -96,6 +96,7 @@ async def create_evaluation( "variant_ids": [variant_id], # Only this variant ID "evaluators_configs": payload.evaluators_configs, "testset_id": payload.testset_id, + "rate_limit": payload.rate_limit.dict() } evaluation = await evaluation_service.create_new_evaluation( @@ -105,7 +106,10 @@ async def create_evaluation( ) evaluate.delay( - app_data, new_evaluation_data, evaluation.id, evaluation.testset_id + app_data, + new_evaluation_data, + evaluation.id, + evaluation.testset_id, ) evaluations.append(evaluation) diff --git a/agenta-backend/agenta_backend/tasks/evaluations.py b/agenta-backend/agenta_backend/tasks/evaluations.py index 81a86ca5d0..c6237b85d6 100644 --- a/agenta-backend/agenta_backend/tasks/evaluations.py +++ b/agenta-backend/agenta_backend/tasks/evaluations.py @@ -24,12 +24,15 @@ Result, ) from agenta_backend.services import evaluators_service -from agenta_backend.models.api.evaluation_model import NewEvaluation +from agenta_backend.models.api.evaluation_model import NewEvaluation, AppOutput @shared_task(queue="agenta_backend.tasks.evaluations.evaluate") def evaluate( - app_data: dict, new_evaluation_data: dict, evaluation_id: str, testset_id: str + app_data: dict, + 
new_evaluation_data: dict, + evaluation_id: str, + testset_id: str, ): loop = asyncio.get_event_loop() app = AppDB(**app_data) @@ -46,11 +49,25 @@ def evaluate( get_deployment_by_objectid(app_variant_db.base.deployment) ) - # TODO: remove if abraham's fix is working - uri = deployment.uri.replace("http://localhost", "http://host.docker.internal") + #!NOTE: do not remove! this will be used in github workflow! + backend_environment = os.environ.get("ENVIRONMENT") + if backend_environment is not None and backend_environment == "github": + uri = f"http://{deployment.container_name}" + else: + uri = deployment.uri.replace("http://localhost", "http://host.docker.internal") + + # 2. We get the output from the llm app + app_outputs: List[AppOutput] = loop.run_until_complete( + llm_apps_service.batch_invoke( + uri, testset.csvdata, evaluation.rate_limit.dict() + ) + ) + for data_point, app_output in zip(testset.csvdata, app_outputs): + if len(testset.csvdata) != len(app_outputs): + # TODO: properly handle error in the case where the length are not the same + break - for data_point in testset.csvdata: - # 1. We prepare the inputs + # 2. We prepare the inputs raw_inputs = ( app_variant_db.parameters.get("inputs", []) if app_variant_db.parameters @@ -67,17 +84,6 @@ def evaluate( for input_item in raw_inputs ] - #!NOTE: do not remove! this will be used in github workflow! - backend_environment = os.environ.get("ENVIRONMENT") - if backend_environment is not None and backend_environment == "github": - uri = f"http://{deployment.container_name}" - else: - uri = deployment.uri.replace( - "http://localhost", "http://host.docker.internal" - ) - # 2. We get the output from the llm app - variant_output = llm_apps_service.get_llm_app_output(uri, data_point) - # 3. We evaluate evaluators_results: [EvaluationScenarioResult] = [] for evaluator_config_id in evaluation.evaluators_configs: @@ -95,7 +101,7 @@ def evaluate( ) result = evaluators_service.evaluate( evaluator_config.evaluator_key, - variant_output, + app_output.output, data_point["correct_answer"], evaluator_config.settings_values, **additional_kwargs, @@ -108,22 +114,22 @@ def evaluate( evaluators_results.append(result_object) evaluators_aggregated_data[evaluator_config.evaluator_key].append(result) - # 4. We create a new evaluation scenario - evaluation_scenario = loop.run_until_complete( - create_new_evaluation_scenario( - user=app.user, - organization=app.organization, - evaluation=new_evaluation_db, - variant_id=variant_id, - evaluators_configs=new_evaluation_db.evaluators_configs, - inputs=inputs, - is_pinned=False, - note="", - correct_answer=data_point["correct_answer"], - outputs=[EvaluationScenarioOutputDB(type="text", value=variant_output)], - results=evaluators_results, - ) + # 4. 
We create a new evaluation scenario + evaluation_scenario = loop.run_until_complete( + create_new_evaluation_scenario( + user=app.user, + organization=app.organization, + evaluation=new_evaluation_db, + variant_id=variant_id, + evaluators_configs=new_evaluation_db.evaluators_configs, + inputs=inputs, + is_pinned=False, + note="", + correct_answer=data_point["correct_answer"], + outputs=[EvaluationScenarioOutputDB(type="text", value=app_output.output)], + results=evaluators_results, ) + ) aggregated_results = loop.run_until_complete( aggregate_evaluator_results(app, evaluators_aggregated_data) From 38b593a75fdb7a30fd5beafaff3743f4b787be25 Mon Sep 17 00:00:00 2001 From: Abram Date: Wed, 3 Jan 2024 15:21:16 +0100 Subject: [PATCH 245/414] :art: Format - ran black and prettier --- .../models/api/evaluation_model.py | 4 +- .../routers/evaluation_router.py | 2 +- .../services/llm_apps_service.py | 1 - .../modals/CreateAppStatusModal.tsx | 4 +- .../Evaluations/HumanEvaluationResult.tsx | 8 +--- .../evaluationResults/EvaluationResults.tsx | 8 ++-- .../evaluationResults/NewEvaluationModal.tsx | 39 ++++++++++++------- agenta-web/src/lib/services/api.ts | 10 ++--- 8 files changed, 38 insertions(+), 38 deletions(-) diff --git a/agenta-backend/agenta_backend/models/api/evaluation_model.py b/agenta-backend/agenta_backend/models/api/evaluation_model.py index 04340d02a2..1a6fab63b3 100644 --- a/agenta-backend/agenta_backend/models/api/evaluation_model.py +++ b/agenta-backend/agenta_backend/models/api/evaluation_model.py @@ -67,8 +67,8 @@ class NewHumanEvaluation(BaseModel): class AppOutput(BaseModel): - output: Any - status: str + output: Any + status: str class Evaluation(BaseModel): diff --git a/agenta-backend/agenta_backend/routers/evaluation_router.py b/agenta-backend/agenta_backend/routers/evaluation_router.py index b5e0b143d6..5cc4367c9c 100644 --- a/agenta-backend/agenta_backend/routers/evaluation_router.py +++ b/agenta-backend/agenta_backend/routers/evaluation_router.py @@ -96,7 +96,7 @@ async def create_evaluation( "variant_ids": [variant_id], # Only this variant ID "evaluators_configs": payload.evaluators_configs, "testset_id": payload.testset_id, - "rate_limit": payload.rate_limit.dict() + "rate_limit": payload.rate_limit.dict(), } evaluation = await evaluation_service.create_new_evaluation( diff --git a/agenta-backend/agenta_backend/services/llm_apps_service.py b/agenta-backend/agenta_backend/services/llm_apps_service.py index 8aebc83e46..a749cbc418 100644 --- a/agenta-backend/agenta_backend/services/llm_apps_service.py +++ b/agenta-backend/agenta_backend/services/llm_apps_service.py @@ -37,7 +37,6 @@ async def get_llm_app_output(uri: str, input: Any) -> AppOutput: async def run_with_retry( uri: str, input_data: Any, max_retry_count: int, retry_delay: int ) -> AppOutput: - retries = 0 last_exception = None while retries < max_retry_count: diff --git a/agenta-web/src/components/AppSelector/modals/CreateAppStatusModal.tsx b/agenta-web/src/components/AppSelector/modals/CreateAppStatusModal.tsx index 3a7d655407..cb165d4b04 100644 --- a/agenta-web/src/components/AppSelector/modals/CreateAppStatusModal.tsx +++ b/agenta-web/src/components/AppSelector/modals/CreateAppStatusModal.tsx @@ -196,8 +196,8 @@ const CreateAppStatusModal: React.FC> type === "success" ? "success" : type === "error" - ? "danger" - : "secondary" + ? 
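The `run_with_retry` hunk above only shows the top of the loop. A hedged sketch of how such a loop typically completes is given below; only the signature and the first three lines come from the patch, the rest is an assumption about one plausible continuation.

```python
# Assumption-labelled sketch: `retries = 0`, `last_exception = None` and the
# `while retries < max_retry_count:` header are from the patch; the body below
# is illustrative. `get_llm_app_output` is the helper defined in llm_apps_service.
import asyncio
from typing import Any


async def run_with_retry_sketch(
    uri: str, input_data: Any, max_retry_count: int, retry_delay: int
):
    retries = 0
    last_exception = None
    while retries < max_retry_count:
        try:
            return await get_llm_app_output(uri, input_data)
        except Exception as exc:
            last_exception = exc
            retries += 1
            await asyncio.sleep(retry_delay)
    # all attempts exhausted: surface the last error (the real code may instead
    # return an AppOutput carrying an error status)
    raise last_exception
```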
"danger" + : "secondary" } strong={Object.keys(messages)[ix] === "success"} > diff --git a/agenta-web/src/components/Evaluations/HumanEvaluationResult.tsx b/agenta-web/src/components/Evaluations/HumanEvaluationResult.tsx index 80e8905946..192280ace3 100644 --- a/agenta-web/src/components/Evaluations/HumanEvaluationResult.tsx +++ b/agenta-web/src/components/Evaluations/HumanEvaluationResult.tsx @@ -98,15 +98,11 @@ export default function HumanEvaluationResult() { } const fetchEvaluations = async () => { try { - fetchData( - `${getAgentaApiUrl()}/api/human-evaluations/?app_id=${app_id}`, - ) + fetchData(`${getAgentaApiUrl()}/api/human-evaluations/?app_id=${app_id}`) .then((response) => { const fetchPromises = response.map((item: EvaluationResponseType) => { return fetchData( - `${getAgentaApiUrl()}/api/human-evaluations/${ - item.id - }/results/`, + `${getAgentaApiUrl()}/api/human-evaluations/${item.id}/results/`, ) .then((results) => { if (item.evaluation_type === EvaluationType.human_a_b_testing) { diff --git a/agenta-web/src/components/pages/evaluations/evaluationResults/EvaluationResults.tsx b/agenta-web/src/components/pages/evaluations/evaluationResults/EvaluationResults.tsx index dcc5ae6ae4..10a1f937ac 100644 --- a/agenta-web/src/components/pages/evaluations/evaluationResults/EvaluationResults.tsx +++ b/agenta-web/src/components/pages/evaluations/evaluationResults/EvaluationResults.tsx @@ -112,8 +112,8 @@ export function getTypedValue(res?: TypedValue) { return type === "number" ? round(Number(value), 2) : ["boolean", "bool"].includes(type as string) - ? capitalize(value?.toString()) - : value?.toString() + ? capitalize(value?.toString()) + : value?.toString() } export function getFilterParams(type: "number" | "text" | "date") { @@ -144,8 +144,8 @@ export function getFilterParams(type: "number" | "text" | "date") { type === "number" ? "agNumberColumnFilter" : type === "date" - ? "agDateColumnFilter" - : "agTextColumnFilter", + ? 
"agDateColumnFilter" + : "agTextColumnFilter", cellDataType: type, filterParams, } diff --git a/agenta-web/src/components/pages/evaluations/evaluationResults/NewEvaluationModal.tsx b/agenta-web/src/components/pages/evaluations/evaluationResults/NewEvaluationModal.tsx index 5ea6dcd61c..5aed01d43f 100644 --- a/agenta-web/src/components/pages/evaluations/evaluationResults/NewEvaluationModal.tsx +++ b/agenta-web/src/components/pages/evaluations/evaluationResults/NewEvaluationModal.tsx @@ -4,7 +4,19 @@ import {evaluatorConfigsAtom, evaluatorsAtom} from "@/lib/atoms/evaluation" import {fetchTestsets, fetchVariants} from "@/lib/services/api" import {CreateEvaluationData, createEvalutaiton} from "@/services/evaluations" import {PlusOutlined, QuestionCircleOutlined} from "@ant-design/icons" -import {Divider, Form, Modal, Select, Spin, Tag, Typography, InputNumber, Row, Col, Tooltip} from "antd" +import { + Divider, + Form, + Modal, + Select, + Spin, + Tag, + Typography, + InputNumber, + Row, + Col, + Tooltip, +} from "antd" import dayjs from "dayjs" import {useAtom} from "jotai" import Image from "next/image" @@ -82,7 +94,7 @@ const NewEvaluationModal: React.FC = ({onSuccess, ...props}) => { delay_between_batches: 5, }) const onRateLimitInputChange = (field: keyof LLMRunRateLimit, value: number) => { - setRateLimitValues((prevValues: any) => ({ ...prevValues, [field]: value })); + setRateLimitValues((prevValues: any) => ({...prevValues, [field]: value})) } const onSubmit = (values: CreateEvaluationData) => { @@ -189,10 +201,7 @@ const NewEvaluationModal: React.FC = ({onSuccess, ...props}) => { - + = ({onSuccess, ...props}) => { } name="batch_size" - style={{ marginBottom: '0' }} + style={{marginBottom: "0"}} rules={[{required: true, message: "This field is required"}]} > - onRateLimitInputChange('batch_size', value) + onRateLimitInputChange("batch_size", value) } - style={{ width: '100%' }} + style={{width: "100%"}} /> @@ -233,9 +242,9 @@ const NewEvaluationModal: React.FC = ({onSuccess, ...props}) => { - onRateLimitInputChange('max_retries', value) + onRateLimitInputChange("max_retries", value) } - style={{ width: '100%' }} + style={{width: "100%"}} /> @@ -255,9 +264,9 @@ const NewEvaluationModal: React.FC = ({onSuccess, ...props}) => { - onRateLimitInputChange('retry_delay', value) + onRateLimitInputChange("retry_delay", value) } - style={{ width: '100%' }} + style={{width: "100%"}} /> @@ -277,9 +286,9 @@ const NewEvaluationModal: React.FC = ({onSuccess, ...props}) => { - onRateLimitInputChange('delay_between_batches', value) + onRateLimitInputChange("delay_between_batches", value) } - style={{ width: '100%' }} + style={{width: "100%"}} /> diff --git a/agenta-web/src/lib/services/api.ts b/agenta-web/src/lib/services/api.ts index 0ff9fc5859..0b95afd04a 100644 --- a/agenta-web/src/lib/services/api.ts +++ b/agenta-web/src/lib/services/api.ts @@ -369,13 +369,9 @@ export const createNewEvaluation = async ( status: EvaluationFlow.EVALUATION_INITIALIZED, } - const response = await axios.post( - `${getAgentaApiUrl()}/api/human-evaluations/`, - data, - { - _ignoreError: ignoreAxiosError, - } as any, - ) + const response = await axios.post(`${getAgentaApiUrl()}/api/human-evaluations/`, data, { + _ignoreError: ignoreAxiosError, + } as any) return response.data.id } From 6429d7ffa7b3b05387af74c6c6a8fe915ea97207 Mon Sep 17 00:00:00 2001 From: Abram Date: Wed, 3 Jan 2024 15:54:06 +0100 Subject: [PATCH 246/414] Update - wrapped entire logic in try-except block --- .../agenta_backend/tasks/evaluations.py | 173 
+++++++++--------- 1 file changed, 90 insertions(+), 83 deletions(-) diff --git a/agenta-backend/agenta_backend/tasks/evaluations.py b/agenta-backend/agenta_backend/tasks/evaluations.py index a3579ccb99..799de79dc2 100644 --- a/agenta-backend/agenta_backend/tasks/evaluations.py +++ b/agenta-backend/agenta_backend/tasks/evaluations.py @@ -10,6 +10,7 @@ fetch_app_variant_by_id, fetch_evaluator_config, get_deployment_by_objectid, + update_evaluation, fetch_testset_by_id, create_new_evaluation_scenario, fetch_evaluator_config_by_appId, @@ -38,98 +39,104 @@ def evaluate( app = AppDB(**app_data) evaluation = NewEvaluation(**new_evaluation_data) - testset = loop.run_until_complete(fetch_testset_by_id(testset_id)) - new_evaluation_db = loop.run_until_complete(fetch_evaluation_by_id(evaluation_id)) - evaluators_aggregated_data = defaultdict(list) + try: + testset = loop.run_until_complete(fetch_testset_by_id(testset_id)) + new_evaluation_db = loop.run_until_complete(fetch_evaluation_by_id(evaluation_id)) + evaluators_aggregated_data = defaultdict(list) - variant_id = str(evaluation.variant_ids[0]) + variant_id = str(evaluation.variant_ids[0]) - app_variant_db = loop.run_until_complete(fetch_app_variant_by_id(variant_id)) - deployment = loop.run_until_complete( - get_deployment_by_objectid(app_variant_db.base.deployment) - ) - - #!NOTE: do not remove! this will be used in github workflow! - backend_environment = os.environ.get("ENVIRONMENT") - if backend_environment is not None and backend_environment == "github": - uri = f"http://{deployment.container_name}" - else: - uri = deployment.uri.replace("http://localhost", "http://host.docker.internal") - - # 1. We get the output from the llm app - app_outputs: List[AppOutput] = loop.run_until_complete( - llm_apps_service.batch_invoke( - uri, testset.csvdata, evaluation.rate_limit.dict() - ) - ) - for data_point, app_output in zip(testset.csvdata, app_outputs): - if len(testset.csvdata) != len(app_outputs): - # TODO: properly handle error in the case where the length are not the same - break - - # 2. We prepare the inputs - raw_inputs = ( - app_variant_db.parameters.get("inputs", []) - if app_variant_db.parameters - else [] + app_variant_db = loop.run_until_complete(fetch_app_variant_by_id(variant_id)) + deployment = loop.run_until_complete( + get_deployment_by_objectid(app_variant_db.base.deployment) ) - inputs = [] - if raw_inputs: - inputs = [ - EvaluationScenarioInputDB( - name=input_item["name"], - type="text", - value=data_point[input_item["name"]], - ) - for input_item in raw_inputs - ] - - # 3. We evaluate - evaluators_results: [EvaluationScenarioResult] = [] - for evaluator_config_id in evaluation.evaluators_configs: - evaluator_config = loop.run_until_complete( - fetch_evaluator_config(evaluator_config_id) - ) - additional_kwargs = ( - { - "app_params": app_variant_db.config.parameters, - "inputs": data_point, # TODO: fetch input from config parameters when #1102 has been fixed - } - if evaluator_config.evaluator_key == "custom_code_run" - else {} + #!NOTE: do not remove! this will be used in github workflow! + backend_environment = os.environ.get("ENVIRONMENT") + if backend_environment is not None and backend_environment == "github": + uri = f"http://{deployment.container_name}" + else: + uri = deployment.uri.replace("http://localhost", "http://host.docker.internal") + + # 1. 
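The environment-dependent URI selection recurs in several hunks of this series. Extracted as a standalone helper it reads as follows; this is a refactoring sketch, not code from the repository.

```python
# Refactoring sketch only; the series keeps this logic inline in the Celery task.
import os


def resolve_app_uri(deployment_uri: str, container_name: str) -> str:
    """Pick the URI the evaluation task should call, depending on where the backend runs."""
    if os.environ.get("ENVIRONMENT") == "github":
        # in the GitHub workflow the app is reachable by its container name
        return f"http://{container_name}"
    # locally the backend runs inside Docker, so localhost must be rewritten
    return deployment_uri.replace("http://localhost", "http://host.docker.internal")
```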
We get the output from the llm app + app_outputs: List[AppOutput] = loop.run_until_complete( + llm_apps_service.batch_invoke( + uri, testset.csvdata, evaluation.rate_limit.dict() ) - result = evaluators_service.evaluate( - evaluator_config.evaluator_key, - app_output.output, - data_point["correct_answer"], - evaluator_config.settings_values, - **additional_kwargs, + ) + for data_point, app_output in zip(testset.csvdata, app_outputs): + if len(testset.csvdata) != len(app_outputs): + # TODO: properly handle error in the case where the length are not the same + break + + # 2. We prepare the inputs + raw_inputs = ( + app_variant_db.parameters.get("inputs", []) + if app_variant_db.parameters + else [] ) + inputs = [] + if raw_inputs: + inputs = [ + EvaluationScenarioInputDB( + name=input_item["name"], + type="text", + value=data_point[input_item["name"]], + ) + for input_item in raw_inputs + ] + + # 3. We evaluate + evaluators_results: [EvaluationScenarioResult] = [] + for evaluator_config_id in evaluation.evaluators_configs: + evaluator_config = loop.run_until_complete( + fetch_evaluator_config(evaluator_config_id) + ) + + additional_kwargs = ( + { + "app_params": app_variant_db.config.parameters, + "inputs": data_point, # TODO: fetch input from config parameters when #1102 has been fixed + } + if evaluator_config.evaluator_key == "custom_code_run" + else {} + ) + result = evaluators_service.evaluate( + evaluator_config.evaluator_key, + app_output.output, + data_point["correct_answer"], + evaluator_config.settings_values, + **additional_kwargs, + ) - result_object = EvaluationScenarioResult( - evaluator_config=evaluator_config.id, - result=result, + result_object = EvaluationScenarioResult( + evaluator_config=evaluator_config.id, + result=result, + ) + evaluators_results.append(result_object) + evaluators_aggregated_data[evaluator_config.evaluator_key].append(result) + + # 4. We create a new evaluation scenario + evaluation_scenario = loop.run_until_complete( + create_new_evaluation_scenario( + user=app.user, + organization=app.organization, + evaluation=new_evaluation_db, + variant_id=variant_id, + evaluators_configs=new_evaluation_db.evaluators_configs, + inputs=inputs, + is_pinned=False, + note="", + correct_answer=data_point["correct_answer"], + outputs=[EvaluationScenarioOutputDB(type="text", value=app_output.output)], + results=evaluators_results, ) - evaluators_results.append(result_object) - evaluators_aggregated_data[evaluator_config.evaluator_key].append(result) - - # 4. 
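Each evaluator in the loop above is addressed purely by its `evaluator_key` string. A minimal sketch of this kind of key-based dispatch is shown below as a pattern illustration; it is not the project's `evaluators_service`, and the evaluator names used here are hypothetical.

```python
# Pattern illustration only; evaluator keys and signatures below are hypothetical.
from typing import Any, Callable, Dict

EVALUATORS: Dict[str, Callable[[str, str, dict], float]] = {
    "exact_match": lambda output, expected, settings: float(output == expected),
    "contains_answer": lambda output, expected, settings: float(expected in output),
}


def dispatch_evaluator(evaluator_key: str, output: str, expected: str, settings: dict) -> Any:
    try:
        return EVALUATORS[evaluator_key](output, expected, settings)
    except KeyError:
        raise ValueError(f"Unknown evaluator key: {evaluator_key}")
```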
We create a new evaluation scenario - evaluation_scenario = loop.run_until_complete( - create_new_evaluation_scenario( - user=app.user, - organization=app.organization, - evaluation=new_evaluation_db, - variant_id=variant_id, - evaluators_configs=new_evaluation_db.evaluators_configs, - inputs=inputs, - is_pinned=False, - note="", - correct_answer=data_point["correct_answer"], - outputs=[EvaluationScenarioOutputDB(type="text", value=app_output.output)], - results=evaluators_results, ) - ) + except Exception as e: + print(f"An error occurred during evaluation: {e}") + loop.run_until_complete( + update_evaluation(evaluation_id, {"status": "EVALUATION_FAILED"}) + ) aggregated_results = loop.run_until_complete( aggregate_evaluator_results(app, evaluators_aggregated_data) From 06f5379f0f50a1080f633489a7b27d87fbe56cb5 Mon Sep 17 00:00:00 2001 From: Abram Date: Wed, 3 Jan 2024 15:58:34 +0100 Subject: [PATCH 247/414] Update - include rate_limit data in create_evaluation testcase --- .../agenta_backend/tasks/evaluations.py | 16 ++++++++++++---- .../test_evaluators_router.py | 6 ++++++ 2 files changed, 18 insertions(+), 4 deletions(-) diff --git a/agenta-backend/agenta_backend/tasks/evaluations.py b/agenta-backend/agenta_backend/tasks/evaluations.py index 799de79dc2..28f5ca7c84 100644 --- a/agenta-backend/agenta_backend/tasks/evaluations.py +++ b/agenta-backend/agenta_backend/tasks/evaluations.py @@ -41,7 +41,9 @@ def evaluate( try: testset = loop.run_until_complete(fetch_testset_by_id(testset_id)) - new_evaluation_db = loop.run_until_complete(fetch_evaluation_by_id(evaluation_id)) + new_evaluation_db = loop.run_until_complete( + fetch_evaluation_by_id(evaluation_id) + ) evaluators_aggregated_data = defaultdict(list) variant_id = str(evaluation.variant_ids[0]) @@ -56,7 +58,9 @@ def evaluate( if backend_environment is not None and backend_environment == "github": uri = f"http://{deployment.container_name}" else: - uri = deployment.uri.replace("http://localhost", "http://host.docker.internal") + uri = deployment.uri.replace( + "http://localhost", "http://host.docker.internal" + ) # 1. We get the output from the llm app app_outputs: List[AppOutput] = loop.run_until_complete( @@ -114,7 +118,9 @@ def evaluate( result=result, ) evaluators_results.append(result_object) - evaluators_aggregated_data[evaluator_config.evaluator_key].append(result) + evaluators_aggregated_data[evaluator_config.evaluator_key].append( + result + ) # 4. 
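`aggregate_evaluator_results` is not included in this series, so the following is only a sketch of the reduction it implies: collapse every evaluator's per-scenario results into one aggregate per evaluator key, assuming each collected result exposes a numeric `value`.

```python
# Sketch under stated assumptions: each entry in evaluators_aggregated_data is a
# list of result objects with a numeric `.value`; non-numeric results are skipped.
from typing import Any, Dict, List, Optional


def average_per_evaluator(
    evaluators_aggregated_data: Dict[str, List[Any]]
) -> Dict[str, Optional[float]]:
    averages: Dict[str, Optional[float]] = {}
    for evaluator_key, results in evaluators_aggregated_data.items():
        numeric = [
            r.value
            for r in results
            if isinstance(getattr(r, "value", None), (int, float))
        ]
        averages[evaluator_key] = sum(numeric) / len(numeric) if numeric else None
    return averages
```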
We create a new evaluation scenario evaluation_scenario = loop.run_until_complete( @@ -128,7 +134,9 @@ def evaluate( is_pinned=False, note="", correct_answer=data_point["correct_answer"], - outputs=[EvaluationScenarioOutputDB(type="text", value=app_output.output)], + outputs=[ + EvaluationScenarioOutputDB(type="text", value=app_output.output) + ], results=evaluators_results, ) ) diff --git a/agenta-backend/agenta_backend/tests/variants_evaluators_router/test_evaluators_router.py b/agenta-backend/agenta_backend/tests/variants_evaluators_router/test_evaluators_router.py index b399e37fd4..c86201d146 100644 --- a/agenta-backend/agenta_backend/tests/variants_evaluators_router/test_evaluators_router.py +++ b/agenta-backend/agenta_backend/tests/variants_evaluators_router/test_evaluators_router.py @@ -161,6 +161,12 @@ async def test_create_evaluation(): "variant_ids": [str(app_variant.id)], "evaluators_configs": [], "testset_id": str(testset.id), + "rate_limit": { + "batch_size": 10, + "max_retries": 3, + "retry_delay": 3, + "delay_between_batches": 5, + }, } # Fetch evaluator configs From d732b064ca50293c13c22ca6feb70f5a03b0af54 Mon Sep 17 00:00:00 2001 From: Nehemiah Onyekachukwu Emmanuel Date: Wed, 3 Jan 2024 15:59:04 +0100 Subject: [PATCH 248/414] installed beanie --- agenta-backend/poetry.lock | 42 +++++++++++++++++++++++++++++++++-- agenta-backend/pyproject.toml | 1 + 2 files changed, 41 insertions(+), 2 deletions(-) diff --git a/agenta-backend/poetry.lock b/agenta-backend/poetry.lock index fdf7b77937..f64a6eca82 100644 --- a/agenta-backend/poetry.lock +++ b/agenta-backend/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.7.1 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.6.1 and should not be changed by hand. 
[[package]] name = "aiodocker" @@ -246,6 +246,30 @@ files = [ {file = "backoff-2.2.1.tar.gz", hash = "sha256:03f829f5bb1923180821643f8753b0502c3b682293992485b0eef2807afa5cba"}, ] +[[package]] +name = "beanie" +version = "1.24.0" +description = "Asynchronous Python ODM for MongoDB" +optional = false +python-versions = ">=3.7,<4.0" +files = [ + {file = "beanie-1.24.0-py3-none-any.whl", hash = "sha256:d48b047c9640d8b4312b781254fcf808ad03262321f1793cf1228972953e3649"}, + {file = "beanie-1.24.0.tar.gz", hash = "sha256:2328f0f745ea8b4626a818c79ed71021a5c4362fc2c0adc48a59e6b40c5ce54b"}, +] + +[package.dependencies] +click = ">=7" +lazy-model = "0.2.0" +motor = ">=2.5.0,<4.0.0" +pydantic = ">=1.10,<3.0" +toml = "*" +typing-extensions = {version = ">=4.7", markers = "python_version < \"3.11\""} + +[package.extras] +doc = ["Markdown (>=3.3)", "Pygments (>=2.8.0)", "jinja2 (>=3.0.3)", "mkdocs (>=1.4)", "mkdocs-material (>=9.0)", "pydoc-markdown (>=4.8)"] +queue = ["beanie-batteries-queue (>=0.2)"] +test = ["asgi-lifespan (>=1.0.1)", "dnspython (>=2.1.0)", "fastapi (>=0.100)", "flake8 (>=3)", "httpx (>=0.23.0)", "pre-commit (>=2.3.0)", "pydantic-extra-types (>=2)", "pydantic-settings (>=2)", "pydantic[email]", "pyright (>=0)", "pytest (>=6.0.0)", "pytest-asyncio (>=0.21.0)", "pytest-cov (>=2.8.1)"] + [[package]] name = "billiard" version = "4.2.0" @@ -1148,6 +1172,20 @@ files = [ pydantic = ">=1,<3" requests = ">=2,<3" +[[package]] +name = "lazy-model" +version = "0.2.0" +description = "" +optional = false +python-versions = ">=3.7,<4.0" +files = [ + {file = "lazy-model-0.2.0.tar.gz", hash = "sha256:57c0e91e171530c4fca7aebc3ac05a163a85cddd941bf7527cc46c0ddafca47c"}, + {file = "lazy_model-0.2.0-py3-none-any.whl", hash = "sha256:5a3241775c253e36d9069d236be8378288a93d4fc53805211fd152e04cc9c342"}, +] + +[package.dependencies] +pydantic = ">=1.9.0" + [[package]] name = "marshmallow" version = "3.20.1" @@ -2733,4 +2771,4 @@ multidict = ">=4.0" [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "337e72d73feb6823a141ab0fd71374044a6210ce971fe4409edb685f959abbeb" +content-hash = "007f83104a7bff7addf4a6e982d210a74152ae446065a179e85000842c4a471e" diff --git a/agenta-backend/pyproject.toml b/agenta-backend/pyproject.toml index df3025c7e5..d14f74b1e1 100644 --- a/agenta-backend/pyproject.toml +++ b/agenta-backend/pyproject.toml @@ -33,6 +33,7 @@ sentry-sdk = {extras = ["fastapi"], version = "^1.34.0"} kubernetes = "^28.1.0" celery = "^5.3.6" watchdog = {extras = ["watchmedo"], version = "^3.0.0"} +beanie = "^1.24.0" [tool.poetry.group.dev.dependencies] pytest = "^7.3.1" From bc61f0a89b9246de97f3d45879e3d44b1c66a84b Mon Sep 17 00:00:00 2001 From: MohammedMaaz Date: Wed, 3 Jan 2024 20:11:08 +0500 Subject: [PATCH 249/414] fixed prettier whitespace issue --- .../evaluations/evaluationResults/EvaluationResults.tsx | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/agenta-web/src/components/pages/evaluations/evaluationResults/EvaluationResults.tsx b/agenta-web/src/components/pages/evaluations/evaluationResults/EvaluationResults.tsx index c00d00bf84..d80e57aabd 100644 --- a/agenta-web/src/components/pages/evaluations/evaluationResults/EvaluationResults.tsx +++ b/agenta-web/src/components/pages/evaluations/evaluationResults/EvaluationResults.tsx @@ -46,8 +46,8 @@ export function getTypedValue(res?: TypedValue) { return type === "number" ? round(Number(value), 2) : ["boolean", "bool"].includes(type as string) - ? capitalize(value?.toString()) - : value?.toString() + ? 
capitalize(value?.toString()) + : value?.toString() } export function getFilterParams(type: "number" | "text" | "date") { @@ -78,8 +78,8 @@ export function getFilterParams(type: "number" | "text" | "date") { type === "number" ? "agNumberColumnFilter" : type === "date" - ? "agDateColumnFilter" - : "agTextColumnFilter", + ? "agDateColumnFilter" + : "agTextColumnFilter", cellDataType: type, filterParams, } From 04ca8bf85d6f262835a6a8a81f307589af55eccd Mon Sep 17 00:00:00 2001 From: Nehemiah Onyekachukwu Emmanuel Date: Wed, 3 Jan 2024 20:40:41 +0100 Subject: [PATCH 250/414] refactor models to use beanie --- .../agenta_backend/models/db_models.py | 219 +++++++++--------- 1 file changed, 109 insertions(+), 110 deletions(-) diff --git a/agenta-backend/agenta_backend/models/db_models.py b/agenta-backend/agenta_backend/models/db_models.py index 90e56781cc..913fb38900 100644 --- a/agenta-backend/agenta_backend/models/db_models.py +++ b/agenta-backend/agenta_backend/models/db_models.py @@ -2,11 +2,11 @@ from typing import Any, Dict, List, Optional, Union from uuid import uuid4 -from bson import ObjectId -from odmantic import EmbeddedModel, Field, Model, Reference +from beanie import Document, Link, PydanticObjectId +from pydantic import BaseModel, Field -class APIKeyDB(Model): +class APIKeyDB(Document): prefix: str hashed_key: str user_id: str @@ -16,44 +16,43 @@ class APIKeyDB(Model): created_at: Optional[datetime] = datetime.utcnow() updated_at: Optional[datetime] - class Config: + class Settings: collection = "api_keys" - -class InvitationDB(EmbeddedModel): +class InvitationDB(BaseModel): token: str = Field(unique=True) email: str expiration_date: datetime = Field(default="0") used: bool = False -class OrganizationDB(Model): +class OrganizationDB(Document): name: str = Field(default="agenta") description: str = Field(default="") type: Optional[str] owner: str # user id - members: Optional[List[ObjectId]] + members: Optional[List[PydanticObjectId]] invitations: Optional[List[InvitationDB]] = [] created_at: Optional[datetime] = Field(default=datetime.utcnow()) updated_at: Optional[datetime] = Field(default=datetime.utcnow()) - class Config: + class Settings: collection = "organizations" -class UserDB(Model): +class UserDB(Document): uid: str = Field(default="0", unique=True, index=True) username: str = Field(default="agenta") email: str = Field(default="demo@agenta.ai", unique=True) - organizations: Optional[List[ObjectId]] = [] + organizations: Optional[List[PydanticObjectId]] = [] created_at: Optional[datetime] = Field(default=datetime.utcnow()) updated_at: Optional[datetime] = Field(default=datetime.utcnow()) - class Config: + class Settings: collection = "users" -class ImageDB(Model): +class ImageDB(Document): """Defines the info needed to get an image and connect it to the app variant""" type: Optional[str] = Field(default="image") @@ -61,28 +60,28 @@ class ImageDB(Model): docker_id: Optional[str] = Field(index=True) tags: Optional[str] deletable: bool = Field(default=True) - user: UserDB = Reference(key_name="user") - organization: OrganizationDB = Reference(key_name="organization") + user: UserDB = UserDB + organization: OrganizationDB = OrganizationDB created_at: Optional[datetime] = Field(default=datetime.utcnow()) updated_at: Optional[datetime] = Field(default=datetime.utcnow()) deletable: bool = Field(default=True) - class Config: + class Settings: collection = "docker_images" -class AppDB(Model): +class AppDB(Document): app_name: str - organization: OrganizationDB = 
Reference(key_name="organization") - user: UserDB = Reference(key_name="user") + organization: OrganizationDB = OrganizationDB + user: UserDB = UserDB created_at: Optional[datetime] = Field(default=datetime.utcnow()) updated_at: Optional[datetime] = Field(default=datetime.utcnow()) -class DeploymentDB(Model): - app: AppDB = Reference(key_name="app") - organization: OrganizationDB = Reference(key_name="organization") - user: UserDB = Reference(key_name="user") +class DeploymentDB(Document): + app: AppDB = AppDB + organization: OrganizationDB = OrganizationDB + user: UserDB = UserDB container_name: Optional[str] container_id: Optional[str] uri: Optional[str] @@ -90,32 +89,32 @@ class DeploymentDB(Model): created_at: Optional[datetime] = Field(default=datetime.utcnow()) updated_at: Optional[datetime] = Field(default=datetime.utcnow()) - class Config: + class Settings: collection = "deployments" -class VariantBaseDB(Model): - app: AppDB = Reference(key_name="app") - organization: OrganizationDB = Reference(key_name="organization") - user: UserDB = Reference(key_name="user") +class VariantBaseDB(Document): + app: AppDB = AppDB + organization: OrganizationDB = OrganizationDB + user: UserDB = UserDB base_name: str - image: ImageDB = Reference(key_name="image") - deployment: Optional[ObjectId] # Reference to deployment + image: ImageDB = ImageDB + deployment: Optional[PydanticObjectId] # Link to deployment created_at: Optional[datetime] = Field(default=datetime.utcnow()) updated_at: Optional[datetime] = Field(default=datetime.utcnow()) - class Config: + class Settings: collection = "bases" -class ConfigVersionDB(EmbeddedModel): +class ConfigVersionDB(BaseModel): version: int parameters: Dict[str, Any] created_at: Optional[datetime] = Field(default=datetime.utcnow()) updated_at: Optional[datetime] = Field(default=datetime.utcnow()) -class ConfigDB(Model): +class ConfigDB(Document): config_name: str current_version: int = Field(default=1) parameters: Dict[str, Any] = Field(default=dict) @@ -123,22 +122,22 @@ class ConfigDB(Model): created_at: Optional[datetime] = Field(default=datetime.utcnow()) updated_at: Optional[datetime] = Field(default=datetime.utcnow()) - class Config: + class Settings: collection = "configs" -class AppVariantDB(Model): - app: AppDB = Reference(key_name="app") +class AppVariantDB(Document): + app: AppDB = AppDB variant_name: str - image: ImageDB = Reference(key_name="image") - user: UserDB = Reference(key_name="user") - organization: OrganizationDB = Reference(key_name="organization") + image: ImageDB = ImageDB + user: UserDB = UserDB + organization: OrganizationDB = OrganizationDB parameters: Dict[str, Any] = Field(default=dict) # TODO: deprecated. remove previous_variant_name: Optional[str] # TODO: deprecated. 
remove base_name: Optional[str] - base: VariantBaseDB = Reference(key_name="bases") + base: VariantBaseDB = VariantBaseDB config_name: Optional[str] - config: ConfigDB = Reference(key_name="configs") + config: ConfigDB = ConfigDB created_at: Optional[datetime] = Field(default=datetime.utcnow()) updated_at: Optional[datetime] = Field(default=datetime.utcnow()) @@ -146,21 +145,21 @@ class AppVariantDB(Model): default=False ) # soft deletion for using the template variants - class Config: + class Settings: collection = "app_variants" -class AppEnvironmentDB(Model): - app: AppDB = Reference(key_name="app") +class AppEnvironmentDB(Document): + app: AppDB = AppDB name: str - user: UserDB = Reference(key_name="user") - organization: OrganizationDB = Reference(key_name="organization") - deployed_app_variant: Optional[ObjectId] - deployment: Optional[ObjectId] # reference to deployment + user: UserDB = UserDB + organization: OrganizationDB = OrganizationDB + deployed_app_variant: Optional[PydanticObjectId] + deployment: Optional[PydanticObjectId] # reference to deployment created_at: Optional[datetime] = Field(default=datetime.utcnow()) -class TemplateDB(Model): +class TemplateDB(Document): type: Optional[str] = Field(default="image") template_uri: Optional[str] tag_id: Optional[int] @@ -172,111 +171,111 @@ class TemplateDB(Model): digest: Optional[str] # sha256 hash of image digest last_pushed: Optional[datetime] - class Config: - collection = "templates" + class Settings: + name = "templates" -class TestSetDB(Model): +class TestSetDB(Document): name: str - app: AppDB = Reference(key_name="app") + app: AppDB = AppDB csvdata: List[Dict[str, str]] - user: UserDB = Reference(key_name="user") - organization: OrganizationDB = Reference(key_name="organization") + user: UserDB = UserDB + organization: OrganizationDB = OrganizationDB created_at: Optional[datetime] = Field(default=datetime.utcnow()) updated_at: Optional[datetime] = Field(default=datetime.utcnow()) - class Config: + class Settings: collection = "testsets" -class CustomEvaluationDB(Model): +class CustomEvaluationDB(Document): evaluation_name: str python_code: str - app: AppDB = Reference(key_name="app") - user: UserDB = Reference(key_name="user") - organization: OrganizationDB = Reference(key_name="organization") + app: AppDB = AppDB + user: UserDB = UserDB + organization: OrganizationDB = OrganizationDB created_at: Optional[datetime] = Field(default=datetime.utcnow()) updated_at: Optional[datetime] = Field(default=datetime.utcnow()) - class Config: + class Settings: collection = "custom_evaluations" -class EvaluationSettingsTemplate(EmbeddedModel): +class EvaluationSettingsTemplate(BaseModel): type: str default: str description: str -class EvaluatorConfigDB(Model): - app: AppDB = Reference(key_name="app") - organization: OrganizationDB = Reference(key_name="organization") - user: UserDB = Reference(key_name="user") +class EvaluatorConfigDB(Document): + app: AppDB = AppDB + organization: OrganizationDB = OrganizationDB + user: UserDB = UserDB name: str evaluator_key: str settings_values: Optional[Dict[str, Any]] = None created_at: datetime = Field(default=datetime.utcnow()) updated_at: datetime = Field(default=datetime.utcnow()) - class Config: + class Settings: collection = "evaluators_configs" -class Result(EmbeddedModel): +class Result(BaseModel): type: str value: Any -class EvaluationScenarioResult(EmbeddedModel): - evaluator_config: ObjectId +class EvaluationScenarioResult(BaseModel): + evaluator_config: PydanticObjectId result: Result 
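Beanie also ships a dedicated `Link` type for relations between documents (it is already imported at the top of this file). For reference, a linked document is declared and resolved as sketched below; the two document classes here are illustrative only and do not come from the codebase.

```python
# Reference sketch of Beanie's link mechanism; class and field names are illustrative.
from typing import Optional

from beanie import Document, Link


class OwnerDoc(Document):
    name: str


class ProjectDoc(Document):
    title: str
    owner: Optional[Link[OwnerDoc]] = None


async def load_project_with_owner(title: str) -> Optional[ProjectDoc]:
    # fetch_links=True makes Beanie resolve linked documents in the same query
    return await ProjectDoc.find_one(ProjectDoc.title == title, fetch_links=True)
```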
-class AggregatedResult(EmbeddedModel): - evaluator_config: ObjectId +class AggregatedResult(BaseModel): + evaluator_config: PydanticObjectId result: Result -class EvaluationScenarioInputDB(EmbeddedModel): +class EvaluationScenarioInputDB(BaseModel): name: str type: str value: str -class EvaluationScenarioOutputDB(EmbeddedModel): +class EvaluationScenarioOutputDB(BaseModel): type: str value: Any -class HumanEvaluationScenarioInput(EmbeddedModel): +class HumanEvaluationScenarioInput(BaseModel): input_name: str input_value: str -class HumanEvaluationScenarioOutput(EmbeddedModel): +class HumanEvaluationScenarioOutput(BaseModel): variant_id: str variant_output: str -class HumanEvaluationDB(Model): - app: AppDB = Reference(key_name="app") - organization: OrganizationDB = Reference(key_name="organization") - user: UserDB = Reference(key_name="user") +class HumanEvaluationDB(Document): + app: AppDB = AppDB + organization: OrganizationDB = OrganizationDB + user: UserDB = UserDB status: str evaluation_type: str - variants: List[ObjectId] - testset: TestSetDB = Reference(key_name="testsets") + variants: List[PydanticObjectId] + testset: TestSetDB = TestSetDB created_at: Optional[datetime] = Field(default=datetime.utcnow()) updated_at: Optional[datetime] = Field(default=datetime.utcnow()) - class Config: + class Settings: collection = "human_evaluations" -class HumanEvaluationScenarioDB(Model): - user: UserDB = Reference(key_name="user") - organization: OrganizationDB = Reference(key_name="organization") - evaluation: HumanEvaluationDB = Reference(key_name="evaluations") +class HumanEvaluationScenarioDB(Document): + user: UserDB = UserDB + organization: OrganizationDB = OrganizationDB + evaluation: HumanEvaluationDB = HumanEvaluationDB inputs: List[HumanEvaluationScenarioInput] outputs: List[HumanEvaluationScenarioOutput] vote: Optional[str] @@ -287,46 +286,46 @@ class HumanEvaluationScenarioDB(Model): is_pinned: Optional[bool] note: Optional[str] - class Config: + class Settings: collection = "human_evaluations_scenarios" -class EvaluationDB(Model): - app: AppDB = Reference(key_name="app") - organization: OrganizationDB = Reference(key_name="organization") - user: UserDB = Reference(key_name="user") +class EvaluationDB(Document): + app: AppDB = AppDB + organization: OrganizationDB = OrganizationDB + user: UserDB = UserDB status: str = Field(default="EVALUATION_INITIALIZED") - testset: TestSetDB = Reference() - variants: List[ObjectId] - evaluators_configs: List[ObjectId] + testset: TestSetDB = TestSetDB + variants: List[PydanticObjectId] + evaluators_configs: List[PydanticObjectId] aggregated_results: List[AggregatedResult] created_at: datetime = Field(default=datetime.utcnow()) updated_at: datetime = Field(default=datetime.utcnow()) - class Config: + class Settings: collection = "evaluations" -class EvaluationScenarioDB(Model): - user: UserDB = Reference(key_name="user") - organization: OrganizationDB = Reference(key_name="organization") - evaluation: EvaluationDB = Reference(key_name="evaluations") - variant_id: ObjectId +class EvaluationScenarioDB(Document): + user: UserDB = UserDB + organization: OrganizationDB = OrganizationDB + evaluation: EvaluationDB = EvaluationDB + variant_id: PydanticObjectId inputs: List[EvaluationScenarioInputDB] outputs: List[EvaluationScenarioOutputDB] correct_answer: Optional[str] is_pinned: Optional[bool] note: Optional[str] - evaluators_configs: List[ObjectId] + evaluators_configs: List[PydanticObjectId] results: List[EvaluationScenarioResult] created_at: datetime 
= Field(default=datetime.utcnow()) updated_at: datetime = Field(default=datetime.utcnow()) - class Config: + class Settings: collection = "evaluation_scenarios" -class SpanDB(Model): +class SpanDB(Document): parent_span_id: Optional[str] meta: Optional[Dict[str, Any]] event_name: str # Function or execution name @@ -344,11 +343,11 @@ class SpanDB(Model): cost: Optional[float] tags: Optional[List[str]] - class Config: + class Settings: collection = "spans" -class Feedback(EmbeddedModel): +class Feedback(BaseModel): uid: str = Field(default=str(uuid4())) user_id: str feedback: Optional[str] @@ -358,19 +357,19 @@ class Feedback(EmbeddedModel): updated_at: datetime = Field(default=datetime.utcnow()) -class TraceDB(Model): +class TraceDB(Document): app_id: Optional[str] variant_id: str - spans: List[ObjectId] + spans: List[PydanticObjectId] start_time: datetime end_time: datetime = Field(default=datetime.utcnow()) cost: Optional[float] latency: float status: str # initiated, completed, stopped, cancelled, failed token_consumption: Optional[int] - user: UserDB = Reference() + user: UserDB = UserDB tags: Optional[List[str]] feedbacks: Optional[List[Feedback]] - class Config: + class Settings: collection = "traces" From f706856d2835acfb8183445f754bbb5283782c96 Mon Sep 17 00:00:00 2001 From: Nehemiah Onyekachukwu Emmanuel Date: Wed, 3 Jan 2024 20:41:32 +0100 Subject: [PATCH 251/414] set engine to use beanie ODM --- .../agenta_backend/models/db_engine.py | 100 +++++++++++++----- 1 file changed, 73 insertions(+), 27 deletions(-) diff --git a/agenta-backend/agenta_backend/models/db_engine.py b/agenta-backend/agenta_backend/models/db_engine.py index 8e79c9cf0c..a96402fc9b 100644 --- a/agenta-backend/agenta_backend/models/db_engine.py +++ b/agenta-backend/agenta_backend/models/db_engine.py @@ -2,14 +2,58 @@ import logging from odmantic import AIOEngine +from beanie import init_beanie from pymongo import MongoClient from motor.motor_asyncio import AsyncIOMotorClient - +from agenta_backend.models.db_models import ( + APIKeyDB, + OrganizationDB, + UserDB, + ImageDB, + AppDB, + DeploymentDB, + VariantBaseDB, + ConfigDB, + AppVariantDB, + TemplateDB, + TestSetDB, + CustomEvaluationDB, + EvaluatorConfigDB, + HumanEvaluationDB, + HumanEvaluationScenarioDB, + EvaluationDB, + EvaluationScenarioDB, + SpanDB, + TraceDB +) # Configure and set logging level logger = logging.getLogger(__name__) logger.setLevel(logging.INFO) +# Define Document Models +document_models = [ + APIKeyDB, + OrganizationDB, + UserDB, + ImageDB, + AppDB, + DeploymentDB, + VariantBaseDB, + ConfigDB, + AppVariantDB, + TemplateDB, + TestSetDB, + CustomEvaluationDB, + EvaluatorConfigDB, + HumanEvaluationDB, + HumanEvaluationScenarioDB, + EvaluationDB, + EvaluationScenarioDB, + SpanDB, + TraceDB +] + class DBEngine(object): """ @@ -20,44 +64,44 @@ def __init__(self) -> None: self.mode = os.environ.get("DATABASE_MODE", "v2") self.db_url = os.environ["MONGODB_URI"] - @property - def initialize_client(self) -> AsyncIOMotorClient: - """ - Returns an instance of `AsyncIOMotorClient` initialized \ - with the provided `db_url`. - """ - - client = AsyncIOMotorClient(self.db_url) - return client - def engine(self) -> AIOEngine: + return True + + async def init_db(self) -> AIOEngine: """ - Returns an AIOEngine object with a specified database name based on the mode. + Initialize the database based on the mode. 
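Stripped of the mode handling, the initialization pattern the engine moves to is small. A self-contained sketch follows; the connection URI, database name and document list are placeholders, not values from the project.

```python
# Minimal, self-contained illustration of the init_beanie pattern; all names and
# the connection string are placeholders.
import asyncio

from beanie import Document, init_beanie
from motor.motor_asyncio import AsyncIOMotorClient


class ExampleDoc(Document):
    name: str

    class Settings:
        name = "examples"


async def init_example_db() -> None:
    client = AsyncIOMotorClient("mongodb://localhost:27017")
    await init_beanie(database=client["agenta_example"], document_models=[ExampleDoc])
    await ExampleDoc(name="hello").create()


if __name__ == "__main__":
    asyncio.run(init_example_db())
```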
""" + client = AsyncIOMotorClient(os.environ["MONGODB_URI"]) + db_mode = os.environ.get("DATABASE_MODE", "v2") - if self.mode == "test": - aio_engine = AIOEngine( - client=self.initialize_client, database="agenta_test" + if db_mode == "test": + await init_beanie( + database=client["agenta_test"], + document_models=document_models ) logger.info("Using test database...") - return aio_engine - elif self.mode == "default": - aio_engine = AIOEngine(client=self.initialize_client, database="agenta") + elif db_mode == "default": + await init_beanie( + database=client["agenta"], + document_models=document_models + ) logger.info("Using default database...") - return aio_engine - elif self.mode == "v2": - aio_engine = AIOEngine(client=self.initialize_client, database="agenta_v2") + elif db_mode == "v2": + await init_beanie( + database=client["agenta_v2"], + document_models=document_models + ) logger.info("Using v2 database...") - return aio_engine else: # make sure that self.mode does only contain alphanumeric characters - if not self.mode.isalnum(): + if not db_mode.isalnum(): raise ValueError("Mode of database needs to be alphanumeric.") - aio_engine = AIOEngine( - client=self.initialize_client, database=f"agenta_{self.mode}" + await init_beanie( + database=client[f"agenta_{db_mode}"], + document_models=document_models ) - logger.info(f"Using {self.mode} database...") - return aio_engine + logger.info(f"Using {db_mode} database...") + def remove_db(self) -> None: """ @@ -71,3 +115,5 @@ def remove_db(self) -> None: client.drop_database("agenta_v2") elif self.mode == "test": client.drop_database("agenta_test") + else: + client.drop_database(f"agenta_{self.mode}") \ No newline at end of file From 47dbd6537a04c35685279663d70843df23292492 Mon Sep 17 00:00:00 2001 From: Nehemiah Onyekachukwu Emmanuel Date: Wed, 3 Jan 2024 20:42:23 +0100 Subject: [PATCH 252/414] invoked databse on application startup event --- agenta-backend/agenta_backend/main.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/agenta-backend/agenta_backend/main.py b/agenta-backend/agenta_backend/main.py index f16f7e8a50..783be4242d 100644 --- a/agenta-backend/agenta_backend/main.py +++ b/agenta-backend/agenta_backend/main.py @@ -29,6 +29,8 @@ from fastapi import FastAPI from fastapi.middleware.cors import CORSMiddleware +from agenta_backend.models.db_engine import DBEngine + origins = [ "http://localhost:3000", "http://localhost:3001", @@ -48,6 +50,9 @@ async def lifespan(application: FastAPI, cache=True): application: FastAPI application. cache: A boolean value that indicates whether to use the cached data or not. """ + # first initialize the database + await DBEngine().init_db() + await templates_manager.update_and_sync_templates(cache=cache) yield From 392a7de83914057e261cf0c46ea3bd50e578e397 Mon Sep 17 00:00:00 2001 From: Nehemiah Onyekachukwu Emmanuel Date: Wed, 3 Jan 2024 20:43:14 +0100 Subject: [PATCH 253/414] first refcator update template to work with beanie odm --- agenta-backend/agenta_backend/services/db_manager.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/agenta-backend/agenta_backend/services/db_manager.py b/agenta-backend/agenta_backend/services/db_manager.py index ad4cb1c014..f8a0b00ad3 100644 --- a/agenta-backend/agenta_backend/services/db_manager.py +++ b/agenta-backend/agenta_backend/services/db_manager.py @@ -1373,12 +1373,12 @@ async def add_template(**kwargs: dict) -> str: Returns: template_id (Str): The Id of the created template. 
""" - existing_template = await engine.find_one( - TemplateDB, TemplateDB.tag_id == kwargs["tag_id"] + existing_template = await TemplateDB.find_one( + TemplateDB.tag_id == kwargs["tag_id"] ) if existing_template is None: db_template = TemplateDB(**kwargs) - await engine.save(db_template) + await db_template.create() return str(db_template.id) @@ -1452,14 +1452,14 @@ async def remove_old_template_from_db(tag_ids: list) -> None: templates_to_delete = [] try: - templates: List[TemplateDB] = await engine.find(TemplateDB) + templates: List[TemplateDB] = await TemplateDB.find().to_list() for temp in templates: if temp.tag_id not in tag_ids: templates_to_delete.append(temp) for template in templates_to_delete: - await engine.delete(template) + await template.delete() except DocumentParsingError as exc: remove_document_using_driver(str(exc.primary_value), "templates") From fb143e21daa2143012692a7f2e6c2e2445071fd3 Mon Sep 17 00:00:00 2001 From: Abram Date: Wed, 3 Jan 2024 22:11:46 +0100 Subject: [PATCH 254/414] Update - added switch form item to show advanced rate-limit configuration --- .../evaluationResults/NewEvaluationModal.tsx | 196 ++++++++++-------- 1 file changed, 106 insertions(+), 90 deletions(-) diff --git a/agenta-web/src/components/pages/evaluations/evaluationResults/NewEvaluationModal.tsx b/agenta-web/src/components/pages/evaluations/evaluationResults/NewEvaluationModal.tsx index 5aed01d43f..dff2b706c8 100644 --- a/agenta-web/src/components/pages/evaluations/evaluationResults/NewEvaluationModal.tsx +++ b/agenta-web/src/components/pages/evaluations/evaluationResults/NewEvaluationModal.tsx @@ -15,6 +15,7 @@ import { InputNumber, Row, Col, + Switch, Tooltip, } from "antd" import dayjs from "dayjs" @@ -73,6 +74,7 @@ const NewEvaluationModal: React.FC = ({onSuccess, ...props}) => { const [evaluatorConfigs] = useAtom(evaluatorConfigsAtom) const [evaluators] = useAtom(evaluatorsAtom) const [submitLoading, setSubmitLoading] = useState(false) + const [showRateLimitInputs, setShowRateLimitInputs] = useState(false) const [form] = Form.useForm() useEffect(() => { @@ -96,6 +98,10 @@ const NewEvaluationModal: React.FC = ({onSuccess, ...props}) => { const onRateLimitInputChange = (field: keyof LLMRunRateLimit, value: number) => { setRateLimitValues((prevValues: any) => ({...prevValues, [field]: value})) } + const onRateLimitSwitchChange = (checked: boolean) => { + setShowRateLimitInputs(checked) + } + const onSubmit = (values: CreateEvaluationData) => { setSubmitLoading(true) @@ -199,101 +205,111 @@ const NewEvaluationModal: React.FC = ({onSuccess, ...props}) => { })} - + + + - - - - - Batch Size  - - - - - } - name="batch_size" - style={{marginBottom: "0"}} - rules={[{required: true, message: "This field is required"}]} - > - - onRateLimitInputChange("batch_size", value) + {showRateLimitInputs && ( + + + + + + Batch Size  + + + + } - style={{width: "100%"}} - /> - - - - - Max Retries  - - - - - } - name="max_retries" - rules={[{required: true, message: "This field is required"}]} - > - - onRateLimitInputChange("max_retries", value) + name="batch_size" + style={{marginBottom: "0"}} + rules={[{required: true, message: "This field is required"}]} + > + + onRateLimitInputChange("batch_size", value) + } + style={{width: "100%"}} + /> + + + + + Max Retries  + + + + } - style={{width: "100%"}} - /> - - - - - Retry Delay  - - - - - } - name="retry_delay" - rules={[{required: true, message: "This field is required"}]} - > - - onRateLimitInputChange("retry_delay", value) + name="max_retries" + 
rules={[{required: true, message: "This field is required"}]} + > + + onRateLimitInputChange("max_retries", value) + } + style={{width: "100%"}} + /> + + + + + Retry Delay  + + + + } - style={{width: "100%"}} - /> - - - - - Delay Between Batches  - - - - - } - name="delay_between_batches" - rules={[{required: true, message: "This field is required"}]} - > - - onRateLimitInputChange("delay_between_batches", value) + style={{ marginBottom: '0' }} + name="retry_delay" + rules={[{required: true, message: "This field is required"}]} + > + + onRateLimitInputChange("retry_delay", value) + } + style={{width: "100%"}} + /> + + + + + Delay Between Batches  + + + + } - style={{width: "100%"}} - /> - - - - + name="delay_between_batches" + style={{ marginBottom: '0' }} + rules={[{required: true, message: "This field is required"}]} + > + + onRateLimitInputChange("delay_between_batches", value) + } + style={{width: "100%"}} + /> + + + + + )} From 15846084f24143fd2dd9168fd0f6e107bf9babf5 Mon Sep 17 00:00:00 2001 From: Abram Date: Wed, 3 Jan 2024 22:12:35 +0100 Subject: [PATCH 255/414] :art: Format - ran prettier --- .../evaluationResults/NewEvaluationModal.tsx | 28 +++++++++++++------ 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/agenta-web/src/components/pages/evaluations/evaluationResults/NewEvaluationModal.tsx b/agenta-web/src/components/pages/evaluations/evaluationResults/NewEvaluationModal.tsx index dff2b706c8..e881685aff 100644 --- a/agenta-web/src/components/pages/evaluations/evaluationResults/NewEvaluationModal.tsx +++ b/agenta-web/src/components/pages/evaluations/evaluationResults/NewEvaluationModal.tsx @@ -102,7 +102,6 @@ const NewEvaluationModal: React.FC = ({onSuccess, ...props}) => { setShowRateLimitInputs(checked) } - const onSubmit = (values: CreateEvaluationData) => { setSubmitLoading(true) const EvaluationRateLimit: LLMRunRateLimit = rateLimitValues @@ -207,7 +206,7 @@ const NewEvaluationModal: React.FC = ({onSuccess, ...props}) => { @@ -228,7 +227,9 @@ const NewEvaluationModal: React.FC = ({onSuccess, ...props}) => { } name="batch_size" style={{marginBottom: "0"}} - rules={[{required: true, message: "This field is required"}]} + rules={[ + {required: true, message: "This field is required"}, + ]} > = ({onSuccess, ...props}) => { } name="max_retries" - rules={[{required: true, message: "This field is required"}]} + rules={[ + {required: true, message: "This field is required"}, + ]} > = ({onSuccess, ...props}) => { } - style={{ marginBottom: '0' }} + style={{marginBottom: "0"}} name="retry_delay" - rules={[{required: true, message: "This field is required"}]} + rules={[ + {required: true, message: "This field is required"}, + ]} > = ({onSuccess, ...props}) => { } name="delay_between_batches" - style={{ marginBottom: '0' }} - rules={[{required: true, message: "This field is required"}]} + style={{marginBottom: "0"}} + rules={[ + {required: true, message: "This field is required"}, + ]} > - onRateLimitInputChange("delay_between_batches", value) + onRateLimitInputChange( + "delay_between_batches", + value, + ) } style={{width: "100%"}} /> From 7ff22970732429f968ea01de9bbca9c90ea74cd3 Mon Sep 17 00:00:00 2001 From: Akrem Abayed Date: Thu, 4 Jan 2024 07:34:09 +0100 Subject: [PATCH 256/414] comparison logic --- .../routers/evaluation_router.py | 30 +++++++++ .../services/evaluation_service.py | 62 ++++++++++++++++++- 2 files changed, 91 insertions(+), 1 deletion(-) diff --git a/agenta-backend/agenta_backend/routers/evaluation_router.py 
b/agenta-backend/agenta_backend/routers/evaluation_router.py index c28c0aad22..57d7c228c9 100644 --- a/agenta-backend/agenta_backend/routers/evaluation_router.py +++ b/agenta-backend/agenta_backend/routers/evaluation_router.py @@ -242,3 +242,33 @@ async def webhook_example_fake(): random_generator = secrets.SystemRandom() random_number = random_generator.random() return {"score": random_number} + + +@router.get( + "/evaluation_scenarios/comparison-results/", + response_model=List, +) +async def fetch_evaluation_scenarios( + evaluations_ids: str, + testset_id: str, + app_variant_id: str, + request: Request, +): + """Fetches evaluation scenarios for a given evaluation ID. + + Arguments: + evaluation_id (str): The ID of the evaluation for which to fetch scenarios. + + Raises: + HTTPException: If the evaluation is not found or access is denied. + + Returns: + List[EvaluationScenario]: A list of evaluation scenarios. + """ + evaluations_ids_list = evaluations_ids.split(',') + user_org_data: dict = await get_user_and_org_id(request.state.user_id) + eval_scenarios = await evaluation_service.compare_evaluations_scenarios( + evaluations_ids_list, testset_id, app_variant_id, **user_org_data + ) + + return eval_scenarios \ No newline at end of file diff --git a/agenta-backend/agenta_backend/services/evaluation_service.py b/agenta-backend/agenta_backend/services/evaluation_service.py index 91e69f42e1..2055d7f230 100644 --- a/agenta-backend/agenta_backend/services/evaluation_service.py +++ b/agenta-backend/agenta_backend/services/evaluation_service.py @@ -25,7 +25,7 @@ ) from agenta_backend.models import converters from agenta_backend.services import db_manager -from agenta_backend.services.db_manager import query, get_user +from agenta_backend.services.db_manager import fetch_app_variant_by_id, query, get_user from agenta_backend.utils.common import engine, check_access_to_app from agenta_backend.services.security.sandbox import execute_code_safely from agenta_backend.models.db_models import ( @@ -1042,3 +1042,63 @@ async def retrieve_evaluation_results( detail=f"You do not have access to this app: {str(evaluation.app.id)}", ) return await converters.aggregated_result_to_pydantic(evaluation.aggregated_results) + + + +async def compare_evaluations_scenarios(evaluations_ids: List[str], testset_id: str, app_variant_id: str, **user_org_data: dict): + all_scenarios = [] + grouped_scenarios = {} + for evaluation_id in evaluations_ids: + eval_scenarios = await fetch_evaluation_scenarios_for_evaluation( + evaluation_id, **user_org_data + ) + all_scenarios.append(eval_scenarios) + + app_variant_db = await fetch_app_variant_by_id(app_variant_id) + testset = await db_manager.fetch_testset_by_id(testset_id=testset_id) + + inputs = app_variant_db.parameters.get("inputs", []) + # inputs: [{'name': 'country'}] + formatted_inputs = extract_inputs_values_from_tesetset(inputs, testset.csvdata) + # formatted_inputs: [{'input_name': 'country', 'input_values': ['Nauru', 'Tuvalu'...]}] + + print(formatted_inputs) + groupped_scenarios_by_inputs=find_scenarios_by_input(formatted_inputs, all_scenarios) + print(groupped_scenarios_by_inputs) + return groupped_scenarios_by_inputs + + +def extract_inputs_values_from_tesetset(inputs, testset): + extracted_values = [] + + for input_item in inputs: + key_name = input_item['name'] + values = [entry[key_name] for entry in testset if key_name in entry] + + # Create a dictionary for each input with its values + input_dict = {'input_name': key_name, 'input_values': values} + 
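A tiny worked example of what the two comparison helpers produce, using made-up data; the shapes match the inline comments in `compare_evaluations_scenarios`.

```python
# Made-up data, for illustration only.
inputs = [{"name": "country"}]
csvdata = [
    {"country": "Nauru", "correct_answer": "Yaren"},
    {"country": "Tuvalu", "correct_answer": "Funafuti"},
]

# extract_inputs_values_from_tesetset(inputs, csvdata) returns:
# [{"input_name": "country", "input_values": ["Nauru", "Tuvalu"]}]
#
# find_scenarios_by_input then yields one group per (input_name, input_value) pair,
# each carrying every scenario, across the selected evaluations, whose inputs match:
# {"input_name": "country", "input_value": "Nauru", "scenarios": [...]}
```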
extracted_values.append(input_dict) + + return extracted_values + + +def find_scenarios_by_input(formatted_inputs, all_scenarios): + results = [] + flattened_scenarios = [scenario for sublist in all_scenarios for scenario in sublist] + + for formatted_input in formatted_inputs: + input_name = formatted_input['input_name'] + for input_value in formatted_input['input_values']: + matching_scenarios = [ + scenario for scenario in flattened_scenarios + if any(input_item.name == input_name and input_item.value == input_value + for input_item in scenario.inputs) + ] + + results.append({ + 'input_name': input_name, + 'input_value': input_value, + 'scenarios': matching_scenarios + }) + + return results \ No newline at end of file From 29fa6daabfcea26ec9a3fd3c277450914111351c Mon Sep 17 00:00:00 2001 From: Akrem Abayed Date: Thu, 4 Jan 2024 07:37:02 +0100 Subject: [PATCH 257/414] black --- .../routers/evaluation_router.py | 4 +- .../services/evaluation_service.py | 47 ++++++++++++------- 2 files changed, 32 insertions(+), 19 deletions(-) diff --git a/agenta-backend/agenta_backend/routers/evaluation_router.py b/agenta-backend/agenta_backend/routers/evaluation_router.py index 57d7c228c9..caadbf6a04 100644 --- a/agenta-backend/agenta_backend/routers/evaluation_router.py +++ b/agenta-backend/agenta_backend/routers/evaluation_router.py @@ -265,10 +265,10 @@ async def fetch_evaluation_scenarios( Returns: List[EvaluationScenario]: A list of evaluation scenarios. """ - evaluations_ids_list = evaluations_ids.split(',') + evaluations_ids_list = evaluations_ids.split(",") user_org_data: dict = await get_user_and_org_id(request.state.user_id) eval_scenarios = await evaluation_service.compare_evaluations_scenarios( evaluations_ids_list, testset_id, app_variant_id, **user_org_data ) - return eval_scenarios \ No newline at end of file + return eval_scenarios diff --git a/agenta-backend/agenta_backend/services/evaluation_service.py b/agenta-backend/agenta_backend/services/evaluation_service.py index 2055d7f230..315ef0e342 100644 --- a/agenta-backend/agenta_backend/services/evaluation_service.py +++ b/agenta-backend/agenta_backend/services/evaluation_service.py @@ -1044,8 +1044,12 @@ async def retrieve_evaluation_results( return await converters.aggregated_result_to_pydantic(evaluation.aggregated_results) - -async def compare_evaluations_scenarios(evaluations_ids: List[str], testset_id: str, app_variant_id: str, **user_org_data: dict): +async def compare_evaluations_scenarios( + evaluations_ids: List[str], + testset_id: str, + app_variant_id: str, + **user_org_data: dict, +): all_scenarios = [] grouped_scenarios = {} for evaluation_id in evaluations_ids: @@ -1063,7 +1067,9 @@ async def compare_evaluations_scenarios(evaluations_ids: List[str], testset_id: # formatted_inputs: [{'input_name': 'country', 'input_values': ['Nauru', 'Tuvalu'...]}] print(formatted_inputs) - groupped_scenarios_by_inputs=find_scenarios_by_input(formatted_inputs, all_scenarios) + groupped_scenarios_by_inputs = find_scenarios_by_input( + formatted_inputs, all_scenarios + ) print(groupped_scenarios_by_inputs) return groupped_scenarios_by_inputs @@ -1072,11 +1078,11 @@ def extract_inputs_values_from_tesetset(inputs, testset): extracted_values = [] for input_item in inputs: - key_name = input_item['name'] + key_name = input_item["name"] values = [entry[key_name] for entry in testset if key_name in entry] # Create a dictionary for each input with its values - input_dict = {'input_name': key_name, 'input_values': values} + input_dict = 
{"input_name": key_name, "input_values": values} extracted_values.append(input_dict) return extracted_values @@ -1084,21 +1090,28 @@ def extract_inputs_values_from_tesetset(inputs, testset): def find_scenarios_by_input(formatted_inputs, all_scenarios): results = [] - flattened_scenarios = [scenario for sublist in all_scenarios for scenario in sublist] + flattened_scenarios = [ + scenario for sublist in all_scenarios for scenario in sublist + ] for formatted_input in formatted_inputs: - input_name = formatted_input['input_name'] - for input_value in formatted_input['input_values']: + input_name = formatted_input["input_name"] + for input_value in formatted_input["input_values"]: matching_scenarios = [ - scenario for scenario in flattened_scenarios - if any(input_item.name == input_name and input_item.value == input_value - for input_item in scenario.inputs) + scenario + for scenario in flattened_scenarios + if any( + input_item.name == input_name and input_item.value == input_value + for input_item in scenario.inputs + ) ] - results.append({ - 'input_name': input_name, - 'input_value': input_value, - 'scenarios': matching_scenarios - }) + results.append( + { + "input_name": input_name, + "input_value": input_value, + "scenarios": matching_scenarios, + } + ) - return results \ No newline at end of file + return results From cbd89ba4a533e1b0965b62ebc064b30bcd9ebf23 Mon Sep 17 00:00:00 2001 From: Nehemiah Onyekachukwu Emmanuel Date: Thu, 4 Jan 2024 09:15:41 +0100 Subject: [PATCH 258/414] refactor db_engine to return engine --- .../agenta_backend/models/db_engine.py | 71 +++++++++---------- 1 file changed, 35 insertions(+), 36 deletions(-) diff --git a/agenta-backend/agenta_backend/models/db_engine.py b/agenta-backend/agenta_backend/models/db_engine.py index a96402fc9b..1258cc24e0 100644 --- a/agenta-backend/agenta_backend/models/db_engine.py +++ b/agenta-backend/agenta_backend/models/db_engine.py @@ -55,53 +55,52 @@ ] -class DBEngine(object): +class DBEngine: """ - Database engine to initialize client and return engine based on mode + Database engine to initialize Beanie and return the engine based on mode. """ def __init__(self) -> None: self.mode = os.environ.get("DATABASE_MODE", "v2") self.db_url = os.environ["MONGODB_URI"] - - def engine(self) -> AIOEngine: - return True + self._engine: AIOEngine = None # Store the engine for reuse async def init_db(self) -> AIOEngine: """ - Initialize the database based on the mode. + Initialize Beanie based on the mode and store the engine. 
""" - client = AsyncIOMotorClient(os.environ["MONGODB_URI"]) - db_mode = os.environ.get("DATABASE_MODE", "v2") + if self._engine is not None: + return self._engine # Return the existing engine if already initialized - if db_mode == "test": - await init_beanie( - database=client["agenta_test"], - document_models=document_models - ) - logger.info("Using test database...") - elif db_mode == "default": - await init_beanie( - database=client["agenta"], - document_models=document_models - ) - logger.info("Using default database...") - elif db_mode == "v2": - await init_beanie( - database=client["agenta_v2"], - document_models=document_models - ) - logger.info("Using v2 database...") - else: - # make sure that self.mode does only contain alphanumeric characters - if not db_mode.isalnum(): - raise ValueError("Mode of database needs to be alphanumeric.") - await init_beanie( - database=client[f"agenta_{db_mode}"], - document_models=document_models - ) - logger.info(f"Using {db_mode} database...") - + client = AsyncIOMotorClient(self.db_url) + db_name = self._get_database_name(self.mode) + + self._engine = await init_beanie( + database=client[db_name], + document_models=document_models + ) + + logger.info(f"Using {db_name} database...") + return self._engine + + def _get_database_name(self, mode: str) -> str: + """ + Determine the appropriate database name based on the mode. + """ + if mode in ("test", "default", "v2"): + return f"agenta_{mode}" + + if not mode.isalnum(): + raise ValueError("Mode of database needs to be alphanumeric.") + return f"agenta_{mode}" + + def engine(self) -> AIOEngine: + """ + Return the initialized Beanie engine. + """ + if self._engine is None: + raise RuntimeError("Database engine has not been initialized yet.") + return self._engine def remove_db(self) -> None: """ From 10e39e3669b570bf561b436ab51768c04866fceb Mon Sep 17 00:00:00 2001 From: Nehemiah Onyekachukwu Emmanuel Date: Thu, 4 Jan 2024 09:16:24 +0100 Subject: [PATCH 259/414] refactor db_manager to use beanie odm --- .../agenta_backend/services/db_manager.py | 369 +++++++++--------- 1 file changed, 176 insertions(+), 193 deletions(-) diff --git a/agenta-backend/agenta_backend/services/db_manager.py b/agenta-backend/agenta_backend/services/db_manager.py index f8a0b00ad3..3f85ff94a1 100644 --- a/agenta-backend/agenta_backend/services/db_manager.py +++ b/agenta-backend/agenta_backend/services/db_manager.py @@ -4,7 +4,9 @@ from pathlib import Path from bson import ObjectId from datetime import datetime +from beanie.operators import In from urllib.parse import urlparse +from beanie import PydanticObjectId, Query from typing import Any, Dict, List, Optional from agenta_backend.models.api.api_models import ( @@ -44,7 +46,7 @@ UserDB, ) -from agenta_backend.utils.common import check_user_org_access, engine +from agenta_backend.utils.common import check_user_org_access from agenta_backend.models.api.evaluation_model import EvaluationStatusEnum @@ -98,7 +100,7 @@ async def add_testset_to_app_variant( testset_db = TestSetDB( **testset, app=app_db, user=user_db, organization=org_db ) - await engine.save(testset_db) + await testset_db.create() except Exception as e: print(f"An error occurred in adding the default testset: {e}") @@ -115,16 +117,19 @@ async def get_image(app_variant: AppVariant, **kwargs: dict) -> ImageExtended: """ # Build the query expression for the two conditions - query_expression = ( - query.eq(AppVariantDB.app, ObjectId(app_variant.app_id)) - & query.eq(AppVariantDB.variant_name, 
app_variant.variant_name) - & query.eq(AppVariantDB.organization, ObjectId(app_variant.organization)) + query = Query(AppVariantDB) # Create a query for AppVariantDB + query = query.filter( + AppVariantDB.app == app_variant.app_id + ).filter( + AppVariantDB.variant_name == app_variant.variant_name + ).filter( + AppVariantDB.organization == app_variant.organization ) - db_app_variant: AppVariantDB = await engine.find_one(AppVariantDB, query_expression) + db_app_variant: AppVariantDB = await query.get() if db_app_variant: - image_db: ImageDB = await engine.find_one( - ImageDB, ImageDB.id == ObjectId(db_app_variant.image.id) + image_db: ImageDB = await ImageDB.find_one( + ImageDB.id == db_app_variant.image.id ) return image_db_to_pydantic(image_db) else: @@ -141,7 +146,7 @@ async def get_image_by_id(image_id: str) -> ImageDB: ImageDB: instance of image object """ - image = await engine.find_one(ImageDB, ImageDB.id == ObjectId(image_id)) + image = await ImageDB.find_one(ImageDB.id == image_id) return image @@ -152,7 +157,7 @@ async def fetch_app_by_id(app_id: str, **kwargs: dict) -> AppDB: app_id: _description_ """ assert app_id is not None, "app_id cannot be None" - app = await engine.find_one(AppDB, AppDB.id == ObjectId(app_id)) + app = await AppDB.find_one(AppDB.id == app_id) return app @@ -167,15 +172,15 @@ async def fetch_app_by_name( Returns: AppDB: the instance of the app """ + query = Query(AppDB) # Create a base query for AppDB + if not organization_id: user = await get_user(user_uid=user_org_data["uid"]) - query_expression = (AppDB.app_name == app_name) & (AppDB.user == user.id) - app = await engine.find_one(AppDB, query_expression) + query = query.filter(AppDB.app_name == app_name).filter(AppDB.user == user.id) else: - query_expression = (AppDB.app_name == app_name) & ( - AppDB.organization == ObjectId(organization_id) - ) - app = await engine.find_one(AppDB, query_expression) + query = query.filter(AppDB.app_name == app_name).filter(AppDB.organization == organization_id) + + app = await query.get() return app @@ -192,8 +197,8 @@ async def fetch_app_variant_by_id( AppVariantDB: The fetched app variant, or None if no app variant was found. 
""" assert app_variant_id is not None, "app_variant_id cannot be None" - app_variant = await engine.find_one( - AppVariantDB, AppVariantDB.id == ObjectId(app_variant_id) + app_variant = await AppVariantDB.find_one( + AppVariantDB.id == app_variant_id ) return app_variant @@ -211,7 +216,7 @@ async def fetch_base_by_id( """ if base_id is None: raise Exception("No base_id provided") - base = await engine.find_one(VariantBaseDB, VariantBaseDB.id == ObjectId(base_id)) + base = await VariantBaseDB.find_one(VariantBaseDB.id == base_id) if base is None: logger.error("Base not found") return False @@ -238,10 +243,11 @@ async def fetch_app_variant_by_name_and_appid( AppVariantDB: the instance of the app variant """ - query_expression = (AppVariantDB.variant_name == variant_name) & ( - AppVariantDB.app == ObjectId(app_id) + query = Query(AppVariantDB) + query = query.filter(AppVariantDB.variant_name == variant_name).filter( + AppVariantDB.app == app_id ) - app_variant_db = await engine.find_one(AppVariantDB, query_expression) + app_variant_db = await query.get() return app_variant_db @@ -267,7 +273,7 @@ async def create_new_variant_base( base_name=base_name, image=image, ) - await engine.save(base) + await base.create() return base @@ -292,7 +298,7 @@ async def create_new_config( ) ], ) - await engine.save(config_db) + await config_db.create() return config_db @@ -329,7 +335,7 @@ async def create_new_app_variant( config_name=config_name, parameters=parameters, ) - await engine.save(variant) + await variant.create() return variant @@ -385,7 +391,7 @@ async def create_image( user=user, organization=organization, ) - await engine.save(image) + await image.create() return image @@ -419,7 +425,7 @@ async def create_deployment( uri=uri, status=status, ) - await engine.save(deployment) + await deployment.create() return deployment @@ -452,7 +458,7 @@ async def create_app_and_envs( organization=organization_db, user=user_instance, ) - await engine.save(app) + await app.create() await initialize_environments(app, **user_org_data) return app @@ -467,9 +473,9 @@ async def create_user_organization(user_uid: str) -> OrganizationDB: OrganizationDB: Instance of OrganizationDB """ - user = await engine.find_one(UserDB, UserDB.uid == user_uid) + user = await UserDB.find_one(UserDB.uid == user_uid) org_db = OrganizationDB(owner=str(user.id), type="default") - await engine.save(org_db) + await org_db.create() return org_db @@ -485,7 +491,7 @@ async def get_deployment_by_objectid( DeploymentDB: instance of deployment object """ - deployment = await engine.find_one(DeploymentDB, DeploymentDB.id == deployment_id) + deployment = await DeploymentDB.find_one(DeploymentDB.id == deployment_id) logger.debug(f"deployment: {deployment}") return deployment @@ -500,8 +506,8 @@ async def get_organization_object(organization_id: str) -> OrganizationDB: Returns: OrganizationDB: The fetched organization. """ - organization = await engine.find_one( - OrganizationDB, OrganizationDB.id == ObjectId(organization_id) + organization = await OrganizationDB.find_one( + OrganizationDB.id == organization_id ) return organization @@ -517,9 +523,9 @@ async def get_organizations_by_list_ids(organization_ids: List) -> List: List: A list of dictionaries representing the retrieved organizations. 
""" - organizations_db: List[OrganizationDB] = await engine.find( - OrganizationDB, OrganizationDB.id.in_(organization_ids) - ) + organizations_db: List[OrganizationDB] = await OrganizationDB.find( + In(OrganizationDB.id, organization_ids) + ).to_list() return organizations_db @@ -535,12 +541,8 @@ async def list_app_variants_for_app_id( List[AppVariant]: List of AppVariant objects """ assert app_id is not None, "app_id cannot be None" - query_expression = AppVariantDB.app == ObjectId(app_id) - app_variants_db: List[AppVariantDB] = await engine.find( - AppVariantDB, - query_expression, - sort=(AppVariantDB.variant_name), - ) + query = Query(AppVariantDB).filter(AppVariantDB.app == app_id) + app_variants_db: List[AppVariantDB] = await query.sort("variant_name").find() return app_variants_db @@ -549,16 +551,10 @@ async def list_bases_for_app_id( app_id: str, base_name: Optional[str] = None, **kwargs: dict ) -> List[VariantBaseDB]: assert app_id is not None, "app_id cannot be None" - query_expression = VariantBaseDB.app == ObjectId(app_id) + query = Query(VariantBaseDB).filter(VariantBaseDB.app == app_id) if base_name: - query_expression = query_expression & query.eq( - VariantBaseDB.base_name, base_name - ) - bases_db: List[VariantBaseDB] = await engine.find( - VariantBaseDB, - query_expression, - sort=(VariantBaseDB.base_name), - ) + query = query.filter(VariantBaseDB.base_name == base_name) + bases_db: List[VariantBaseDB] = await query.sort("base_name").find() return bases_db @@ -573,12 +569,8 @@ async def list_variants_for_base( List[AppVariant]: List of AppVariant objects """ assert base is not None, "base cannot be None" - query_expression = AppVariantDB.base == ObjectId(base.id) - app_variants_db: List[AppVariantDB] = await engine.find( - AppVariantDB, - query_expression, - sort=(AppVariantDB.variant_name), - ) + query = Query(AppVariantDB).filter(AppVariantDB.base == base.id) + app_variants_db: List[AppVariantDB] = await query.sort("variant_name").find() return app_variants_db @@ -593,18 +585,17 @@ async def get_user(user_uid: str) -> UserDB: UserDB: instance of user """ - user = await engine.find_one(UserDB, UserDB.uid == user_uid) + user = await UserDB.find_one(UserDB.uid == user_uid) if user is None: if os.environ["FEATURE_FLAG"] not in ["cloud", "ee"]: create_user = UserDB(uid="0") - await engine.save(create_user) + await create_user.insert() - org = OrganizationDB(type="default", owner=str(create_user.id)) - await engine.save(org) + org = OrganizationDB(type="default", owner=create_user.id) + await org.insert() create_user.organizations.append(org.id) - await engine.save(create_user) - await engine.save(org) + await create_user.update() return create_user else: @@ -627,7 +618,7 @@ async def get_user_with_id(user_id: ObjectId): Exception: If an error occurs while getting the user from the database. """ try: - user = await engine.find_one(UserDB, UserDB.id == user_id) + user = await UserDB.find_one(UserDB.id == user_id) return user except Exception as e: logger.error(f"Failed to get user with id: {e}") @@ -655,7 +646,7 @@ async def get_user_with_email(email: str): raise Exception("Please provide a valid email address") try: - user = await engine.find_one(UserDB, UserDB.email == email) + user = await UserDB.find_one(UserDB.email == email) return user except Exception as e: logger.error(f"Failed to get user with email address: {e}") @@ -673,7 +664,7 @@ async def get_users_by_ids(user_ids: List) -> List: List: A list of dictionaries representing the retrieved users. 
""" - users_db: List[UserDB] = await engine.find(UserDB, UserDB.id.in_(user_ids)) + users_db: List[UserDB] = await UserDB.find(In(UserDB.id, user_ids)).to_list() return users_db @@ -691,10 +682,8 @@ async def get_orga_image_instance_by_docker_id( ImageDB: instance of image object """ - query_expression = (ImageDB.organization == ObjectId(organization_id)) & query.eq( - ImageDB.docker_id, docker_id - ) - image = await engine.find_one(ImageDB, query_expression) + query = Query(ImageDB).filter(ImageDB.docker_id == docker_id).filter(ImageDB.organization == organization_id) + image = await query.get() return image @@ -715,10 +704,8 @@ async def get_orga_image_instance_by_uri( if not parsed_url.scheme and not parsed_url.netloc: raise ValueError(f"Invalid URL: {template_uri}") - query_expression = (ImageDB.organization == ObjectId(organization_id)) & query.eq( - ImageDB.template_uri, template_uri - ) - image = await engine.find_one(ImageDB, query_expression) + query = Query(ImageDB).filter(ImageDB.template_uri == template_uri).filter(ImageDB.organization == organization_id) + image = await query.get() return image @@ -732,7 +719,7 @@ async def get_app_instance_by_id(app_id: str) -> AppDB: AppDB: instance of app object """ - app = await engine.find_one(AppDB, AppDB.id == ObjectId(app_id)) + app = await AppDB.find_one(AppDB.id == app_id) return app @@ -778,7 +765,7 @@ async def add_variant_from_base_and_config( ) ], ) - await engine.save(config_db) + await config_db.create() db_app_variant = AppVariantDB( app=previous_app_variant_db.app, variant_name=new_variant_name, @@ -793,7 +780,7 @@ async def add_variant_from_base_and_config( config=config_db, is_deleted=False, ) - await engine.save(db_app_variant) + await db_app_variant.create() return db_app_variant @@ -819,9 +806,9 @@ async def list_apps( elif org_id is not None: organization_access = await check_user_org_access(user_org_data, org_id) if organization_access: - apps: List[AppDB] = await engine.find( - AppDB, AppDB.organization == ObjectId(org_id) - ) + apps: List[AppDB] = await AppDB.find( + AppDB.organization == org_id + ).to_list() return [app_db_to_pydantic(app) for app in apps] else: @@ -831,7 +818,7 @@ async def list_apps( ) else: - apps: List[AppVariantDB] = await engine.find(AppDB, AppDB.user == user.id) + apps: List[AppVariantDB] = await AppDB.find(AppDB.user == user.id).to_list() return [app_db_to_pydantic(app) for app in apps] @@ -846,19 +833,18 @@ async def list_app_variants(app_id: str = None, **kwargs: dict) -> List[AppVaria # Construct query expressions logger.debug("app_id: %s", app_id) - query_filters = query.QueryExpression() + query = Query(AppVariantDB) if app_id is not None: - query_filters = query_filters & (AppVariantDB.app == ObjectId(app_id)) - logger.debug("query_filters: %s", query_filters) - app_variants_db: List[AppVariantDB] = await engine.find(AppVariantDB, query_filters) + query = query.filter(AppVariantDB.app == app_id) + + logger.debug("query: %s", query) + app_variants_db: List[AppVariantDB] = await query.find() # Include previous variant name return app_variants_db -async def check_is_last_variant_for_image( - db_app_variant: AppVariantDB, -) -> bool: +async def check_is_last_variant_for_image(db_app_variant: AppVariantDB) -> bool: """Checks whether the input variant is the sole variant that uses its linked image This is a helpful function to determine whether to delete the image when removing a variant Usually many variants will use the same image (these variants would have been created using the UI) @@ 
-870,16 +856,15 @@ async def check_is_last_variant_for_image( true if it's the last variant, false otherwise """ - # Build the query expression for the two conditions - logger.debug("db_app_variant: %s", db_app_variant) - query_expression = ( - AppVariantDB.organization == ObjectId(db_app_variant.organization.id) - ) & (AppVariantDB.base == ObjectId(db_app_variant.base.id)) - # Count the number of variants that match the query expression - count_variants = await engine.count(AppVariantDB, query_expression) + query = Query(AppVariantDB) + query = query.filter( + AppVariantDB.organization == db_app_variant.organization.id + ).filter( + AppVariantDB.base == db_app_variant.base.id + ) + count_variants = await query.count() - # If it's the only variant left that uses the image, delete the image - return bool(count_variants == 1) + return count_variants == 1 async def remove_deployment(deployment_db: DeploymentDB, **kwargs: dict): @@ -891,7 +876,7 @@ async def remove_deployment(deployment_db: DeploymentDB, **kwargs: dict): logger.debug("Removing deployment") assert deployment_db is not None, "deployment_db is missing" - await engine.delete(deployment_db) + await deployment_db.delete() async def remove_app_variant_from_db(app_variant_db: AppVariantDB, **kwargs: dict): @@ -912,12 +897,12 @@ async def remove_app_variant_from_db(app_variant_db: AppVariantDB, **kwargs: dic ) for environment in environments: environment.deployed_app_variant = None - await engine.save(environment) + await environment.create() # removing the config config = app_variant_db.config - await engine.delete(config) + await config.delete() - await engine.delete(app_variant_db) + await app_variant_db.delete() async def deploy_to_environment(environment_name: str, variant_id: str, **kwargs: dict): @@ -941,22 +926,22 @@ async def deploy_to_environment(environment_name: str, variant_id: str, **kwargs raise ValueError("App variant not found") # Find the environment for the given app name and user - query_filters = ( - AppEnvironmentDB.app == ObjectId(app_variant_db.app.id) - ) & query.eq(AppEnvironmentDB.name, environment_name) - environment_db: AppEnvironmentDB = await engine.find_one( - AppEnvironmentDB, query_filters - ) + query = Query(AppEnvironmentDB) + query = query.filter(AppEnvironmentDB.app == app_variant_db.app.id) + query = query.filter(AppEnvironmentDB.name == environment_name) + + environment_db: AppEnvironmentDB = await query.get() + if environment_db is None: raise ValueError(f"Environment {environment_name} not found") - if environment_db.deployed_app_variant == app_variant_db: + if environment_db.deployed_app_variant == app_variant_db.id: raise ValueError( f"Variant {app_variant_db.app.app_name}/{app_variant_db.variant_name} is already deployed to the environment {environment_name}" ) # Update the environment with the new variant name environment_db.deployed_app_variant = app_variant_db.id - await engine.save(environment_db) + await environment_db.update() async def list_environments(app_id: str, **kwargs: dict) -> List[AppEnvironmentDB]: @@ -976,9 +961,9 @@ async def list_environments(app_id: str, **kwargs: dict) -> List[AppEnvironmentD logging.error(f"App with id {app_id} not found") raise ValueError("App not found") - environments_db: List[AppEnvironmentDB] = await engine.find( - AppEnvironmentDB, AppEnvironmentDB.app == ObjectId(app_id) - ) + environments_db: List[AppEnvironmentDB] = await AppEnvironmentDB.find( + AppEnvironmentDB.app == app_id + ).to_list() return environments_db @@ -1023,7 +1008,7 @@ async 
def create_environment( user=app_db.user, organization=app_db.organization, ) - await engine.save(environment_db) + await environment_db.create() return environment_db @@ -1041,10 +1026,9 @@ async def list_environments_by_variant( List[AppEnvironmentDB]: A list of AppEnvironmentDB objects. """ - environments_db: List[AppEnvironmentDB] = await engine.find( - AppEnvironmentDB, - (AppEnvironmentDB.app == ObjectId(app_variant.app.id)), - ) + environments_db: List[AppEnvironmentDB] = await AppEnvironmentDB.find( + AppEnvironmentDB.app == app_variant.app.id, + ).to_list() return environments_db @@ -1065,7 +1049,7 @@ async def remove_image(image: ImageDB, **kwargs: dict): """ if image is None: raise ValueError("Image is None") - await engine.delete(image) + await image.delete() async def remove_environment(environment_db: AppEnvironmentDB, **kwargs: dict): @@ -1083,7 +1067,7 @@ async def remove_environment(environment_db: AppEnvironmentDB, **kwargs: dict): None """ assert environment_db is not None, "environment_db is missing" - await engine.delete(environment_db) + await environment_db.delete() async def remove_app_testsets(app_id: str, **kwargs): @@ -1101,12 +1085,12 @@ async def remove_app_testsets(app_id: str, **kwargs): deleted_count: int = 0 # Build query expression - testsets = await engine.find(TestSetDB, TestSetDB.app == ObjectId(app_id)) + testsets = await TestSetDB.find(TestSetDB.app == app_id).to_list() # Perform deletion if there are testsets to delete if testsets is not None: for testset in testsets: - await engine.delete(testset) + await testset.delete() deleted_count += 1 logger.info(f"{deleted_count} testset(s) deleted for app {app_id}") return deleted_count @@ -1131,7 +1115,7 @@ async def remove_base_from_db(base: VariantBaseDB, **kwargs): """ if base is None: raise ValueError("Base is None") - await engine.delete(base) + await base.delete() async def remove_app_by_id(app_id: str, **kwargs): @@ -1150,7 +1134,7 @@ async def remove_app_by_id(app_id: str, **kwargs): assert app_id is not None, "app_id cannot be None" app_instance = await fetch_app_by_id(app_id=app_id) assert app_instance is not None, f"app instance for {app_id} could not be found" - await engine.delete(app_instance) + await app_instance.delete() async def update_variant_parameters( @@ -1187,7 +1171,7 @@ async def update_variant_parameters( config_db.parameters = parameters # Save updated ConfigDB - await engine.save(config_db) + await config_db.create() except Exception as e: logging.error(f"Issue updating variant parameters: {e}") @@ -1210,14 +1194,13 @@ async def get_app_variant_by_app_name_and_environment( """ # Get the environment # Construct query filters for finding the environment in the database - query_filters_for_environment = query.eq(AppEnvironmentDB.name, environment) & ( - AppEnvironmentDB.app == ObjectId(app_id) + query = Query(AppEnvironmentDB) + query = query.filter(AppEnvironmentDB.name == environment).filter( + AppEnvironmentDB.app == app_id ) # Perform the database query to find the environment - environment_db = await engine.find_one( - AppEnvironmentDB, query_filters_for_environment - ) + environment_db = await query.get() if not environment_db: logger.info(f"Environment {environment} not found") @@ -1243,8 +1226,8 @@ async def get_app_variant_instance_by_id(variant_id: str): AppVariantDB: instance of app variant object """ - app_variant_db = await engine.find_one( - AppVariantDB, AppVariantDB.id == ObjectId(variant_id) + app_variant_db = await AppVariantDB.find_one( + AppVariantDB.id == 
variant_id ) return app_variant_db @@ -1257,7 +1240,7 @@ async def fetch_testset_by_id(testset_id: str) -> Optional[TestSetDB]: TestSetDB: The fetched testset, or None if no testset was found. """ assert testset_id is not None, "testset_id cannot be None" - testset = await engine.find_one(TestSetDB, TestSetDB.id == ObjectId(testset_id)) + testset = await TestSetDB.find_one(TestSetDB.id == testset_id) return testset @@ -1269,7 +1252,7 @@ async def fetch_testsets_by_app_id(app_id: str) -> List[TestSetDB]: List[TestSetDB]: The fetched testsets. """ assert app_id is not None, "app_id cannot be None" - testsets = await engine.find(TestSetDB, TestSetDB.app == ObjectId(app_id)) + testsets = await TestSetDB.find(TestSetDB.app == app_id).to_list() return testsets @@ -1281,8 +1264,8 @@ async def fetch_evaluation_by_id(evaluation_id: str) -> Optional[EvaluationDB]: EvaluationDB: The fetched evaluation, or None if no evaluation was found. """ assert evaluation_id is not None, "evaluation_id cannot be None" - evaluation = await engine.find_one( - EvaluationDB, EvaluationDB.id == ObjectId(evaluation_id) + evaluation = await EvaluationDB.find_one( + EvaluationDB.id == evaluation_id ) return evaluation @@ -1297,8 +1280,8 @@ async def fetch_human_evaluation_by_id( EvaluationDB: The fetched evaluation, or None if no evaluation was found. """ assert evaluation_id is not None, "evaluation_id cannot be None" - evaluation = await engine.find_one( - HumanEvaluationDB, HumanEvaluationDB.id == ObjectId(evaluation_id) + evaluation = await HumanEvaluationDB.find_one( + HumanEvaluationDB.id == evaluation_id ) return evaluation @@ -1313,9 +1296,8 @@ async def fetch_evaluation_scenario_by_id( EvaluationScenarioDB: The fetched evaluation scenario, or None if no evaluation scenario was found. """ assert evaluation_scenario_id is not None, "evaluation_scenario_id cannot be None" - evaluation_scenario = await engine.find_one( - EvaluationScenarioDB, - EvaluationScenarioDB.id == ObjectId(evaluation_scenario_id), + evaluation_scenario = await EvaluationScenarioDB.find_one( + EvaluationScenarioDB.id == evaluation_scenario_id ) return evaluation_scenario @@ -1330,9 +1312,8 @@ async def fetch_human_evaluation_scenario_by_id( EvaluationScenarioDB: The fetched evaluation scenario, or None if no evaluation scenario was found. """ assert evaluation_scenario_id is not None, "evaluation_scenario_id cannot be None" - evaluation_scenario = await engine.find_one( - HumanEvaluationScenarioDB, - HumanEvaluationScenarioDB.id == ObjectId(evaluation_scenario_id), + evaluation_scenario = await HumanEvaluationScenarioDB.find_one( + HumanEvaluationScenarioDB.id == evaluation_scenario_id, ) return evaluation_scenario @@ -1349,9 +1330,9 @@ async def find_previous_variant_from_base_id( Optional[AppVariantDB]: The previous variant, or None if no previous variant was found. """ assert base_id is not None, "base_id cannot be None" - previous_variants = await engine.find( - AppVariantDB, AppVariantDB.base == ObjectId(base_id) - ) + previous_variants = await AppVariantDB.find( + AppVariantDB.base == base_id + ).to_list() logger.debug("previous_variants: %s", previous_variants) if len(list(previous_variants)) == 0: return None @@ -1393,7 +1374,7 @@ async def add_zip_template(key, value): Returns: template_id (Str): The Id of the created template. 
""" - existing_template = await engine.find_one(TemplateDB, TemplateDB.name == key) + existing_template = await TemplateDB.find_one(TemplateDB.name == key) if existing_template: # Compare existing values with new values @@ -1406,7 +1387,7 @@ async def add_zip_template(key, value): return str(existing_template.id) else: # Values are changed, delete existing template - await engine.delete(existing_template) + await existing_template.delete() # Create a new template template_name = key @@ -1421,7 +1402,7 @@ async def add_zip_template(key, value): description=description, template_uri=template_uri, ) - await engine.save(template_db_instance) + await template_db_instance.create() return str(template_db_instance.id) @@ -1437,8 +1418,8 @@ async def get_template(template_id: str) -> TemplateDB: """ assert template_id is not None, "template_id cannot be None" - template_db = await engine.find_one( - TemplateDB, TemplateDB.id == ObjectId(template_id) + template_db = await TemplateDB.find_one( + TemplateDB.id == template_id ) return template_db @@ -1471,14 +1452,14 @@ def remove_document_using_driver(document_id: str, collection_name: str) -> None db = client.get_database("agenta_v2") collection = db.get_collection(collection_name) - deleted = collection.delete_one({"_id": ObjectId(document_id)}) + deleted = collection.delete_one({"_id": PydanticObjectId(document_id)}) print( f"Deleted documents in {collection_name} collection. Acknowledged: {deleted.acknowledged}" ) async def get_templates() -> List[Template]: - templates = await engine.find(TemplateDB) + templates = await TemplateDB.find_all() return templates_db_to_pydantic(templates) @@ -1492,7 +1473,8 @@ async def count_apps(**user_org_data: dict) -> int: if user is None: return 0 - no_of_apps = await engine.count(AppVariantDB, AppVariantDB.user == user.id) + query = Query(AppVariantDB).filter(AppVariantDB.user == user.id) + no_of_apps = await query.count() return no_of_apps @@ -1505,10 +1487,12 @@ async def update_base( Arguments: base (VariantBaseDB): The base object to update. """ + for key, value in kwargs.items(): - if key in base.__fields__: + if hasattr(base, key): setattr(base, key, value) - await engine.save(base) + + await base.update() return base @@ -1522,9 +1506,10 @@ async def update_app_variant( app_variant (AppVariantDB): The app variant object to update. """ for key, value in kwargs.items(): - if key in app_variant.__fields__: - setattr(app_variant, key, value) - await engine.save(app_variant) + if hasattr(app_variant, key): + setattr(app_variant, key, value) + + await app_variant.update() return app_variant @@ -1548,7 +1533,7 @@ async def fetch_base_and_check_access( """ if base_id is None: raise Exception("No base_id provided") - base = await engine.find_one(VariantBaseDB, VariantBaseDB.id == ObjectId(base_id)) + base = await VariantBaseDB.find_one(VariantBaseDB.id == base_id) if base is None: logger.error("Base not found") raise HTTPException(status_code=404, detail="Base not found") @@ -1579,7 +1564,7 @@ async def fetch_app_and_check_access( Raises: HTTPException: If the app is not found or the user does not have access to it. """ - app = await engine.find_one(AppDB, AppDB.id == ObjectId(app_id)) + app = await AppDB.find_one(AppDB.id == app_id) if app is None: logger.error("App not found") raise HTTPException @@ -1612,8 +1597,8 @@ async def fetch_app_variant_and_check_access( Raises: HTTPException: If the app variant is not found or the user does not have access to it. 
""" - app_variant = await engine.find_one( - AppVariantDB, AppVariantDB.id == ObjectId(app_variant_id) + app_variant = await AppVariantDB.find_one( + AppVariantDB.id == app_variant_id ) if app_variant is None: logger.error("App variant not found") @@ -1643,10 +1628,11 @@ async def fetch_app_by_name_and_organization( AppDB: the instance of the app """ - query_expression = (AppDB.app_name == app_name) & ( - AppDB.organization == ObjectId(organization_id) + query = Query(AppDB) + query = query.filter(AppDB.app_name == app_name).filter( + AppDB.organization == organization_id ) - app_db = await engine.find_one(AppDB, query_expression) + app_db = await query.get() return app_db @@ -1675,7 +1661,7 @@ async def create_new_evaluation( created_at=datetime.now().isoformat(), updated_at=datetime.now().isoformat(), ) - await engine.save(evaluation) + await evaluation.create() return evaluation @@ -1700,7 +1686,7 @@ async def create_new_evaluation_scenario( user=user, organization=organization, evaluation=evaluation, - variant_id=ObjectId(variant_id), + variant_id=PydanticObjectId(variant_id), inputs=inputs, outputs=outputs, correct_answer=correct_answer, @@ -1711,14 +1697,14 @@ async def create_new_evaluation_scenario( created_at=datetime.utcnow(), updated_at=datetime.utcnow(), ) - await engine.save(evaluation_scenario) + await evaluation_scenario.create() return evaluation_scenario async def update_evaluation_with_aggregated_results( evaluation_id: ObjectId, aggregated_results: List[AggregatedResult] ) -> EvaluationDB: - evaluation = await engine.find_one(EvaluationDB, EvaluationDB.id == evaluation_id) + evaluation = await EvaluationDB.find_one(EvaluationDB.id == evaluation_id) if not evaluation: raise ValueError("Evaluation not found") @@ -1727,7 +1713,7 @@ async def update_evaluation_with_aggregated_results( evaluation.aggregated_results = aggregated_results evaluation.updated_at = datetime.utcnow().isoformat() - await engine.save(evaluation) + await evaluation.update() return evaluation @@ -1740,10 +1726,9 @@ async def fetch_evaluators_configs(app_id: str): assert app_id is not None, "evaluation_id cannot be None" try: - query_expression = query.eq(EvaluatorConfigDB.app, ObjectId(app_id)) - evaluators_configs: [EvaluatorConfigDB] = await engine.find( - EvaluatorConfigDB, query_expression - ) + query = Query(EvaluatorConfigDB) + query = query.filter(EvaluatorConfigDB.app == app_id) + evaluators_configs = await query.get() return evaluators_configs except Exception as e: raise e @@ -1757,9 +1742,8 @@ async def fetch_evaluator_config(evaluator_config_id: str): """ try: - query_expression = query.eq(EvaluatorConfigDB.id, ObjectId(evaluator_config_id)) - evaluator_config: EvaluatorConfigDB = await engine.find_one( - EvaluatorConfigDB, query_expression + evaluator_config: EvaluatorConfigDB = await EvaluatorConfigDB.find_one( + EvaluatorConfigDB.id == evaluator_config_id ) return evaluator_config except Exception as e: @@ -1780,12 +1764,11 @@ async def fetch_evaluator_config_by_appId( """ try: - query_expression = query.eq(EvaluatorConfigDB.app, ObjectId(app_id)) & query.eq( - EvaluatorConfigDB.evaluator_key, evaluator_name - ) - evaluator_config: EvaluatorConfigDB = await engine.find_one( - EvaluatorConfigDB, query_expression + query = Query(EvaluatorConfigDB) + query = query.filter(EvaluatorConfigDB.app == app_id).filter( + EvaluatorConfigDB.evaluator_key == evaluator_name ) + evaluator_config = await query.get() return evaluator_config except Exception as e: raise e @@ -1811,7 +1794,7 @@ async def 
create_evaluator_config( ) try: - await engine.save(new_evaluator_config) + await new_evaluator_config.create() return new_evaluator_config except Exception as e: raise e @@ -1830,15 +1813,15 @@ async def update_evaluator_config( Returns: EvaluatorConfigDB: The updated evaluator configuration object. """ - evaluator_config = await engine.find_one( - EvaluatorConfigDB, EvaluatorConfigDB.id == ObjectId(evaluator_config_id) + evaluator_config = await EvaluatorConfigDB.find_one( + EvaluatorConfigDB.id == evaluator_config_id ) updates_dict = updates.dict(exclude_unset=True) for key, value in updates_dict.items(): - if key in evaluator_config.__fields__: + if hasattr(evaluator_config, key): setattr(evaluator_config, key, value) - await engine.save(evaluator_config) + await evaluator_config.update() return evaluator_config @@ -1870,12 +1853,12 @@ async def update_evaluation( Returns: EvaluatorConfigDB: The updated evaluator configuration object. """ - evaluation = await engine.find_one( - EvaluationDB, EvaluationDB.id == ObjectId(evaluation_id) + evaluation = await EvaluationDB.find_one( + EvaluationDB.id == evaluation_id ) for key, value in updates.items(): - if key in evaluation.__fields__: + if hasattr(evaluation, key): setattr(evaluation, key, value) - await engine.save(evaluation) + await evaluation.update() return evaluation From 17926096926ee5ce79595acaa7eafe5537aaa497 Mon Sep 17 00:00:00 2001 From: Nehemiah Onyekachukwu Emmanuel Date: Thu, 4 Jan 2024 09:16:34 +0100 Subject: [PATCH 260/414] refactor commons to use beanie odm --- agenta-backend/agenta_backend/utils/common.py | 36 ++++++++----------- 1 file changed, 15 insertions(+), 21 deletions(-) diff --git a/agenta-backend/agenta_backend/utils/common.py b/agenta-backend/agenta_backend/utils/common.py index 25c382d952..5bb19ede89 100644 --- a/agenta-backend/agenta_backend/utils/common.py +++ b/agenta-backend/agenta_backend/utils/common.py @@ -1,6 +1,5 @@ import logging -from bson import ObjectId -from odmantic import query +from beanie import Query from fastapi.types import DecoratedCallable from fastapi import APIRouter as FastAPIRouter from agenta_backend.models.db_engine import DBEngine @@ -13,7 +12,6 @@ VariantBaseDB, ) -engine = DBEngine().engine() logger = logging.getLogger(__name__) logger.setLevel(logging.DEBUG) @@ -60,7 +58,7 @@ def decorator(func: DecoratedCallable) -> DecoratedCallable: async def get_organization(org_id: str) -> OrganizationDB: - org = await engine.find_one(OrganizationDB, OrganizationDB.id == ObjectId(org_id)) + org = await OrganizationDB.find_one(OrganizationDB.id == org_id) if org is not None: return org else: @@ -70,22 +68,18 @@ async def get_organization(org_id: str) -> OrganizationDB: async def get_app_instance( app_id: str, variant_name: str = None, show_deleted: bool = False ) -> AppVariantDB: + query = Query(AppVariantDB) + query = query.filter(AppVariantDB.is_deleted == show_deleted) + query = query.filter(AppVariantDB.app == app_id) + if variant_name is not None: - query_expression = ( - query.eq(AppVariantDB.is_deleted, show_deleted) - & query.eq(AppVariantDB.app, ObjectId(app_id)) - & query.eq(AppVariantDB.variant_name, variant_name) - ) - else: - query_expression = query.eq(AppVariantDB.is_deleted, show_deleted) & query.eq( - AppVariantDB.app_name, ObjectId(app_id) - ) + query = query.filter(AppVariantDB.variant_name == variant_name) - print("query_expression: " + str(query_expression)) + print("query_expression:", query) - app_instance = await engine.find_one(AppVariantDB, query_expression) + 
app_instance = await query.get() - print("app_instance: " + str(app_instance)) + print("app_instance:", app_instance) return app_instance @@ -93,7 +87,7 @@ async def check_user_org_access( kwargs: dict, organization_id: str, check_owner=False ) -> bool: if check_owner: # Check that the user is the owner of the organization - user = await engine.find_one(UserDB, UserDB.uid == kwargs["uid"]) + user = await UserDB.find_one(UserDB.uid == kwargs["uid"]) organization = await get_organization(organization_id) if not organization: logger.error("Organization not found") @@ -135,7 +129,7 @@ async def check_access_to_app( # Fetch the app if only app_id is provided. if app is None: - app = await engine.find_one(AppDB, AppDB.id == ObjectId(app_id)) + app = await AppDB.find_one(AppDB.id == app_id) if app is None: logger.error("App not found") return False @@ -152,8 +146,8 @@ async def check_access_to_variant( ) -> bool: if variant_id is None: raise Exception("No variant_id provided") - variant = await engine.find_one( - AppVariantDB, AppVariantDB.id == ObjectId(variant_id) + variant = await AppVariantDB.find_one( + AppVariantDB.id == variant_id ) if variant is None: logger.error("Variant not found") @@ -169,7 +163,7 @@ async def check_access_to_base( ) -> bool: if base_id is None: raise Exception("No base_id provided") - base = await engine.find_one(VariantBaseDB, VariantBaseDB.id == ObjectId(base_id)) + base = await VariantBaseDB.find_one(VariantBaseDB.id == base_id) if base is None: logger.error("Base not found") return False From 7b3b446f424c7bea36775fbe4afdffbdaa06e9a0 Mon Sep 17 00:00:00 2001 From: Abram Date: Thu, 4 Jan 2024 12:31:05 +0100 Subject: [PATCH 261/414] Refactor - migrate selectors service from odmantic to beanie --- .../agenta_backend/services/selectors.py | 22 ++++++------------- 1 file changed, 7 insertions(+), 15 deletions(-) diff --git a/agenta-backend/agenta_backend/services/selectors.py b/agenta-backend/agenta_backend/services/selectors.py index 20c3e38b6f..141c16e4cd 100644 --- a/agenta-backend/agenta_backend/services/selectors.py +++ b/agenta-backend/agenta_backend/services/selectors.py @@ -1,14 +1,10 @@ -from bson import ObjectId from typing import Tuple, Dict, List + from agenta_backend.models.db_models import ( UserDB, OrganizationDB, ) -from odmantic import query -from agenta_backend.utils.common import engine -from agenta_backend.models.api.organization_models import Organization - async def get_user_and_org_id(user_uid_id) -> Dict[str, List]: """Retrieves the user ID and organization ID based on the logged-in session. @@ -35,15 +31,13 @@ async def get_user_objectid(user_uid: str) -> Tuple[str, List]: of the user's organization_ids. 
""" - user = await engine.find_one(UserDB, UserDB.uid == user_uid) - + user = await UserDB.find_one(UserDB.uid == user_uid) if user is not None: user_id = str(user.uid) organization_ids: List = ( [org for org in user.organizations] if user.organizations else [] ) return user_id, organization_ids - return None, [] @@ -57,16 +51,14 @@ async def get_user_own_org(user_uid: str) -> OrganizationDB: Organization: Instance of OrganizationDB """ - user = await engine.find_one(UserDB, UserDB.uid == user_uid) + user = await UserDB.find_one(UserDB.uid == user_uid) # Build the query expression for the two conditions - query_expression = query.eq(OrganizationDB.owner, str(user.id)) & query.eq( - OrganizationDB.type, "default" + query_expression = ( + OrganizationDB.owner == str(user.id), + OrganizationDB.type == "default", ) - - # get the organization - org: OrganizationDB = await engine.find_one(OrganizationDB, query_expression) - + org: OrganizationDB = await OrganizationDB.find_one(query_expression) if org is not None: return org else: From 7fdb8c649d6145ab731d3653dbf667cd630fc624 Mon Sep 17 00:00:00 2001 From: Abram Date: Thu, 4 Jan 2024 16:58:16 +0100 Subject: [PATCH 262/414] Refactor - switch to beanie odm and referenced documents using link --- .../agenta_backend/models/db_models.py | 108 +++++++++--------- 1 file changed, 57 insertions(+), 51 deletions(-) diff --git a/agenta-backend/agenta_backend/models/db_models.py b/agenta-backend/agenta_backend/models/db_models.py index 913fb38900..ae0eefb6c2 100644 --- a/agenta-backend/agenta_backend/models/db_models.py +++ b/agenta-backend/agenta_backend/models/db_models.py @@ -1,9 +1,9 @@ +from uuid import uuid4 from datetime import datetime from typing import Any, Dict, List, Optional, Union -from uuid import uuid4 -from beanie import Document, Link, PydanticObjectId from pydantic import BaseModel, Field +from beanie import Document, Link, PydanticObjectId class APIKeyDB(Document): @@ -60,8 +60,8 @@ class ImageDB(Document): docker_id: Optional[str] = Field(index=True) tags: Optional[str] deletable: bool = Field(default=True) - user: UserDB = UserDB - organization: OrganizationDB = OrganizationDB + user: Link[UserDB] + organization: Link[OrganizationDB] created_at: Optional[datetime] = Field(default=datetime.utcnow()) updated_at: Optional[datetime] = Field(default=datetime.utcnow()) deletable: bool = Field(default=True) @@ -72,16 +72,19 @@ class Settings: class AppDB(Document): app_name: str - organization: OrganizationDB = OrganizationDB - user: UserDB = UserDB + organization: Link[OrganizationDB] + user: Link[UserDB] created_at: Optional[datetime] = Field(default=datetime.utcnow()) updated_at: Optional[datetime] = Field(default=datetime.utcnow()) + class Settings: + collection = "app_db" + class DeploymentDB(Document): - app: AppDB = AppDB - organization: OrganizationDB = OrganizationDB - user: UserDB = UserDB + app: Link[AppDB] + organization: Link[OrganizationDB] + user: Link[UserDB] container_name: Optional[str] container_id: Optional[str] uri: Optional[str] @@ -94,11 +97,11 @@ class Settings: class VariantBaseDB(Document): - app: AppDB = AppDB - organization: OrganizationDB = OrganizationDB - user: UserDB = UserDB + app: Link[AppDB] + organization: Link[OrganizationDB] + user: Link[UserDB] base_name: str - image: ImageDB = ImageDB + image: Link[ImageDB] deployment: Optional[PydanticObjectId] # Link to deployment created_at: Optional[datetime] = Field(default=datetime.utcnow()) updated_at: Optional[datetime] = Field(default=datetime.utcnow()) @@ 
-118,7 +121,7 @@ class ConfigDB(Document): config_name: str current_version: int = Field(default=1) parameters: Dict[str, Any] = Field(default=dict) - version_history: List[ConfigVersionDB] = Field(default=[]) + version_history: List[Link[ConfigVersionDB]] = Field(default=[]) created_at: Optional[datetime] = Field(default=datetime.utcnow()) updated_at: Optional[datetime] = Field(default=datetime.utcnow()) @@ -127,17 +130,17 @@ class Settings: class AppVariantDB(Document): - app: AppDB = AppDB + app: Link[AppDB] variant_name: str - image: ImageDB = ImageDB - user: UserDB = UserDB - organization: OrganizationDB = OrganizationDB + image: Link[ImageDB] + user: Link[UserDB] + organization: Link[OrganizationDB] parameters: Dict[str, Any] = Field(default=dict) # TODO: deprecated. remove previous_variant_name: Optional[str] # TODO: deprecated. remove base_name: Optional[str] - base: VariantBaseDB = VariantBaseDB + base: Link[VariantBaseDB] config_name: Optional[str] - config: ConfigDB = ConfigDB + config: Link[ConfigDB] created_at: Optional[datetime] = Field(default=datetime.utcnow()) updated_at: Optional[datetime] = Field(default=datetime.utcnow()) @@ -150,14 +153,17 @@ class Settings: class AppEnvironmentDB(Document): - app: AppDB = AppDB + app: Link[AppDB] name: str - user: UserDB = UserDB - organization: OrganizationDB = OrganizationDB + user: Link[UserDB] + organization: Link[OrganizationDB] deployed_app_variant: Optional[PydanticObjectId] deployment: Optional[PydanticObjectId] # reference to deployment created_at: Optional[datetime] = Field(default=datetime.utcnow()) + class Settings: + collection = "app_environment_db" + class TemplateDB(Document): type: Optional[str] = Field(default="image") @@ -177,10 +183,10 @@ class Settings: class TestSetDB(Document): name: str - app: AppDB = AppDB + app: Link[AppDB] csvdata: List[Dict[str, str]] - user: UserDB = UserDB - organization: OrganizationDB = OrganizationDB + user: Link[UserDB] + organization: Link[OrganizationDB] created_at: Optional[datetime] = Field(default=datetime.utcnow()) updated_at: Optional[datetime] = Field(default=datetime.utcnow()) @@ -191,9 +197,9 @@ class Settings: class CustomEvaluationDB(Document): evaluation_name: str python_code: str - app: AppDB = AppDB - user: UserDB = UserDB - organization: OrganizationDB = OrganizationDB + app: Link[AppDB] + user: Link[UserDB] + organization: Link[OrganizationDB] created_at: Optional[datetime] = Field(default=datetime.utcnow()) updated_at: Optional[datetime] = Field(default=datetime.utcnow()) @@ -208,9 +214,9 @@ class EvaluationSettingsTemplate(BaseModel): class EvaluatorConfigDB(Document): - app: AppDB = AppDB - organization: OrganizationDB = OrganizationDB - user: UserDB = UserDB + app: Link[AppDB] + organization: Link[OrganizationDB] + user: Link[UserDB] name: str evaluator_key: str settings_values: Optional[Dict[str, Any]] = None @@ -258,13 +264,13 @@ class HumanEvaluationScenarioOutput(BaseModel): class HumanEvaluationDB(Document): - app: AppDB = AppDB - organization: OrganizationDB = OrganizationDB - user: UserDB = UserDB + app: Link[AppDB] + organization: Link[OrganizationDB] + user: Link[UserDB] status: str evaluation_type: str variants: List[PydanticObjectId] - testset: TestSetDB = TestSetDB + testset: Link[TestSetDB] created_at: Optional[datetime] = Field(default=datetime.utcnow()) updated_at: Optional[datetime] = Field(default=datetime.utcnow()) @@ -273,11 +279,11 @@ class Settings: class HumanEvaluationScenarioDB(Document): - user: UserDB = UserDB - organization: 
OrganizationDB = OrganizationDB - evaluation: HumanEvaluationDB = HumanEvaluationDB - inputs: List[HumanEvaluationScenarioInput] - outputs: List[HumanEvaluationScenarioOutput] + user: Link[UserDB] + organization: Link[OrganizationDB] + evaluation: Link[HumanEvaluationDB] + inputs: List[Link[HumanEvaluationScenarioInput]] + outputs: List[Link[HumanEvaluationScenarioOutput]] vote: Optional[str] score: Optional[Union[str, int]] correct_answer: Optional[str] @@ -291,11 +297,11 @@ class Settings: class EvaluationDB(Document): - app: AppDB = AppDB - organization: OrganizationDB = OrganizationDB - user: UserDB = UserDB + app: Link[AppDB] + organization: Link[OrganizationDB] + user: Link[UserDB] status: str = Field(default="EVALUATION_INITIALIZED") - testset: TestSetDB = TestSetDB + testset: Link[TestSetDB] variants: List[PydanticObjectId] evaluators_configs: List[PydanticObjectId] aggregated_results: List[AggregatedResult] @@ -307,12 +313,12 @@ class Settings: class EvaluationScenarioDB(Document): - user: UserDB = UserDB - organization: OrganizationDB = OrganizationDB - evaluation: EvaluationDB = EvaluationDB + user: Link[UserDB] + organization: Link[OrganizationDB] + evaluation: Link[EvaluationDB] variant_id: PydanticObjectId - inputs: List[EvaluationScenarioInputDB] - outputs: List[EvaluationScenarioOutputDB] + inputs: List[Link[EvaluationScenarioInputDB]] + outputs: List[Link[EvaluationScenarioOutputDB]] correct_answer: Optional[str] is_pinned: Optional[bool] note: Optional[str] @@ -367,7 +373,7 @@ class TraceDB(Document): latency: float status: str # initiated, completed, stopped, cancelled, failed token_consumption: Optional[int] - user: UserDB = UserDB + user: Link[UserDB] tags: Optional[List[str]] feedbacks: Optional[List[Feedback]] From 134556aea9045011dc6554a26e340b3e51d0c77d Mon Sep 17 00:00:00 2001 From: Akrem Abayed Date: Thu, 4 Jan 2024 18:13:12 +0100 Subject: [PATCH 263/414] comparison results --- .../routers/evaluation_router.py | 8 +- .../services/evaluation_service.py | 88 +++++++++++-------- 2 files changed, 52 insertions(+), 44 deletions(-) diff --git a/agenta-backend/agenta_backend/routers/evaluation_router.py b/agenta-backend/agenta_backend/routers/evaluation_router.py index fb4feb0041..a4852598f2 100644 --- a/agenta-backend/agenta_backend/routers/evaluation_router.py +++ b/agenta-backend/agenta_backend/routers/evaluation_router.py @@ -1,6 +1,6 @@ import os import secrets -from typing import List +from typing import Any, List from fastapi.responses import JSONResponse from fastapi.encoders import jsonable_encoder @@ -250,12 +250,10 @@ async def webhook_example_fake(): @router.get( "/evaluation_scenarios/comparison-results/", - response_model=List, + response_model=Any, ) async def fetch_evaluation_scenarios( evaluations_ids: str, - testset_id: str, - app_variant_id: str, request: Request, ): """Fetches evaluation scenarios for a given evaluation ID. 
@@ -272,7 +270,7 @@ async def fetch_evaluation_scenarios( evaluations_ids_list = evaluations_ids.split(",") user_org_data: dict = await get_user_and_org_id(request.state.user_id) eval_scenarios = await evaluation_service.compare_evaluations_scenarios( - evaluations_ids_list, testset_id, app_variant_id, **user_org_data + evaluations_ids_list, **user_org_data ) return eval_scenarios diff --git a/agenta-backend/agenta_backend/services/evaluation_service.py b/agenta-backend/agenta_backend/services/evaluation_service.py index 315ef0e342..fd91ab1fa5 100644 --- a/agenta-backend/agenta_backend/services/evaluation_service.py +++ b/agenta-backend/agenta_backend/services/evaluation_service.py @@ -1046,72 +1046,82 @@ async def retrieve_evaluation_results( async def compare_evaluations_scenarios( evaluations_ids: List[str], - testset_id: str, - app_variant_id: str, **user_org_data: dict, ): + evaluation = await db_manager.fetch_evaluation_by_id(evaluations_ids[0]) + testset = evaluation.testset + unique_tesetset_datapoints = remove_duplicates(testset.csvdata) + formatted_inputs = extract_inputs_values_from_testset(unique_tesetset_datapoints) + # # formatted_inputs: [{'input_name': 'country', 'input_value': 'Nauru'}] + all_scenarios = [] - grouped_scenarios = {} + for evaluation_id in evaluations_ids: eval_scenarios = await fetch_evaluation_scenarios_for_evaluation( evaluation_id, **user_org_data ) all_scenarios.append(eval_scenarios) - app_variant_db = await fetch_app_variant_by_id(app_variant_id) - testset = await db_manager.fetch_testset_by_id(testset_id=testset_id) - inputs = app_variant_db.parameters.get("inputs", []) - # inputs: [{'name': 'country'}] - formatted_inputs = extract_inputs_values_from_tesetset(inputs, testset.csvdata) - # formatted_inputs: [{'input_name': 'country', 'input_values': ['Nauru', 'Tuvalu'...]}] - - print(formatted_inputs) groupped_scenarios_by_inputs = find_scenarios_by_input( formatted_inputs, all_scenarios ) - print(groupped_scenarios_by_inputs) + return groupped_scenarios_by_inputs -def extract_inputs_values_from_tesetset(inputs, testset): +def extract_inputs_values_from_testset(testset): extracted_values = [] - for input_item in inputs: - key_name = input_item["name"] - values = [entry[key_name] for entry in testset if key_name in entry] + input_keys = testset[0].keys() - # Create a dictionary for each input with its values - input_dict = {"input_name": key_name, "input_values": values} - extracted_values.append(input_dict) + for entry in testset: + for key in input_keys: + if key != 'correct_answer': + extracted_values.append({"input_name": key, "input_value": entry[key]}) return extracted_values def find_scenarios_by_input(formatted_inputs, all_scenarios): results = [] - flattened_scenarios = [ - scenario for sublist in all_scenarios for scenario in sublist - ] + flattened_scenarios = [scenario for sublist in all_scenarios for scenario in sublist] for formatted_input in formatted_inputs: input_name = formatted_input["input_name"] - for input_value in formatted_input["input_values"]: - matching_scenarios = [ - scenario - for scenario in flattened_scenarios - if any( - input_item.name == input_name and input_item.value == input_value - for input_item in scenario.inputs - ) - ] - - results.append( - { - "input_name": input_name, - "input_value": input_value, - "scenarios": matching_scenarios, - } + input_value = formatted_input["input_value"] + + matching_scenarios = [ + scenario + for scenario in flattened_scenarios + if any( + input_item.name == input_name and 
input_item.value == input_value + for input_item in scenario.inputs ) + ] + + results.append( + { + "input_name": input_name, + "input_value": input_value, + "scenarios": matching_scenarios, + } + ) + + return { + "inputs": formatted_inputs, + "data" : results, + } + + +def remove_duplicates(csvdata): + unique_data = set() + unique_entries = [] + + for entry in csvdata: + entry_tuple = tuple(entry.items()) + if entry_tuple not in unique_data: + unique_data.add(entry_tuple) + unique_entries.append(entry) - return results + return unique_entries \ No newline at end of file From 9ea844a87ab266297d34aff19152c403f07b92e7 Mon Sep 17 00:00:00 2001 From: Akrem Abayed Date: Thu, 4 Jan 2024 18:30:54 +0100 Subject: [PATCH 264/414] black --- .../agenta_backend/services/evaluation_service.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/agenta-backend/agenta_backend/services/evaluation_service.py b/agenta-backend/agenta_backend/services/evaluation_service.py index fd91ab1fa5..5db996c432 100644 --- a/agenta-backend/agenta_backend/services/evaluation_service.py +++ b/agenta-backend/agenta_backend/services/evaluation_service.py @@ -1062,7 +1062,6 @@ async def compare_evaluations_scenarios( ) all_scenarios.append(eval_scenarios) - groupped_scenarios_by_inputs = find_scenarios_by_input( formatted_inputs, all_scenarios ) @@ -1077,7 +1076,7 @@ def extract_inputs_values_from_testset(testset): for entry in testset: for key in input_keys: - if key != 'correct_answer': + if key != "correct_answer": extracted_values.append({"input_name": key, "input_value": entry[key]}) return extracted_values @@ -1085,7 +1084,9 @@ def extract_inputs_values_from_testset(testset): def find_scenarios_by_input(formatted_inputs, all_scenarios): results = [] - flattened_scenarios = [scenario for sublist in all_scenarios for scenario in sublist] + flattened_scenarios = [ + scenario for sublist in all_scenarios for scenario in sublist + ] for formatted_input in formatted_inputs: input_name = formatted_input["input_name"] @@ -1110,7 +1111,7 @@ def find_scenarios_by_input(formatted_inputs, all_scenarios): return { "inputs": formatted_inputs, - "data" : results, + "data": results, } @@ -1124,4 +1125,4 @@ def remove_duplicates(csvdata): unique_data.add(entry_tuple) unique_entries.append(entry) - return unique_entries \ No newline at end of file + return unique_entries From 636c6c401b6d80326a0ee1d67dfbc97be87571a6 Mon Sep 17 00:00:00 2001 From: Akrem Abayed Date: Thu, 4 Jan 2024 18:31:05 +0100 Subject: [PATCH 265/414] fix evaluations scenarios --- .../agenta_backend/tasks/evaluations.py | 35 ++++++++++--------- 1 file changed, 18 insertions(+), 17 deletions(-) diff --git a/agenta-backend/agenta_backend/tasks/evaluations.py b/agenta-backend/agenta_backend/tasks/evaluations.py index 28f5ca7c84..a5a3970ecc 100644 --- a/agenta-backend/agenta_backend/tasks/evaluations.py +++ b/agenta-backend/agenta_backend/tasks/evaluations.py @@ -122,24 +122,25 @@ def evaluate( result ) - # 4. We create a new evaluation scenario - evaluation_scenario = loop.run_until_complete( - create_new_evaluation_scenario( - user=app.user, - organization=app.organization, - evaluation=new_evaluation_db, - variant_id=variant_id, - evaluators_configs=new_evaluation_db.evaluators_configs, - inputs=inputs, - is_pinned=False, - note="", - correct_answer=data_point["correct_answer"], - outputs=[ - EvaluationScenarioOutputDB(type="text", value=app_output.output) - ], - results=evaluators_results, + # 4. 
We create a new evaluation scenario + evaluation_scenario = loop.run_until_complete( + create_new_evaluation_scenario( + user=app.user, + organization=app.organization, + evaluation=new_evaluation_db, + variant_id=variant_id, + evaluators_configs=new_evaluation_db.evaluators_configs, + inputs=inputs, + is_pinned=False, + note="", + correct_answer=data_point["correct_answer"], + outputs=[ + EvaluationScenarioOutputDB(type="text", value=app_output.output) + ], + results=evaluators_results, + ) ) - ) + except Exception as e: print(f"An error occurred during evaluation: {e}") loop.run_until_complete( From d11b5673628e3c8e8ee4264a9980c5d0d82c25e9 Mon Sep 17 00:00:00 2001 From: Abram Date: Thu, 4 Jan 2024 19:49:05 +0100 Subject: [PATCH 266/414] Refactor - migrate to beanie and fix bugs that arose in: creating app from template, playground and testsets --- .../agenta_backend/models/db_engine.py | 31 +- .../agenta_backend/models/db_models.py | 2 +- .../agenta_backend/routers/app_router.py | 2 + .../routers/container_router.py | 24 +- .../agenta_backend/routers/testset_router.py | 23 +- .../agenta_backend/routers/variants_router.py | 7 +- .../agenta_backend/services/db_manager.py | 299 ++++++++---------- .../services/evaluation_service.py | 277 ++-------------- .../services/event_db_manager.py | 68 ++-- .../services/results_service.py | 17 +- agenta-backend/agenta_backend/utils/common.py | 33 +- 11 files changed, 252 insertions(+), 531 deletions(-) diff --git a/agenta-backend/agenta_backend/models/db_engine.py b/agenta-backend/agenta_backend/models/db_engine.py index 1258cc24e0..4f759e6df6 100644 --- a/agenta-backend/agenta_backend/models/db_engine.py +++ b/agenta-backend/agenta_backend/models/db_engine.py @@ -1,12 +1,14 @@ import os import logging +from typing import List -from odmantic import AIOEngine -from beanie import init_beanie from pymongo import MongoClient +from beanie import init_beanie, Document from motor.motor_asyncio import AsyncIOMotorClient + from agenta_backend.models.db_models import ( APIKeyDB, + AppEnvironmentDB, OrganizationDB, UserDB, ImageDB, @@ -32,8 +34,9 @@ logger.setLevel(logging.INFO) # Define Document Models -document_models = [ +document_models: List[Document] = [ APIKeyDB, + AppEnvironmentDB, OrganizationDB, UserDB, ImageDB, @@ -63,25 +66,23 @@ class DBEngine: def __init__(self) -> None: self.mode = os.environ.get("DATABASE_MODE", "v2") self.db_url = os.environ["MONGODB_URI"] - self._engine: AIOEngine = None # Store the engine for reuse - async def init_db(self) -> AIOEngine: + async def initialize_client(self): + return AsyncIOMotorClient(self.db_url) + + async def init_db(self): """ Initialize Beanie based on the mode and store the engine. """ - if self._engine is not None: - return self._engine # Return the existing engine if already initialized - client = AsyncIOMotorClient(self.db_url) + client = await self.initialize_client() db_name = self._get_database_name(self.mode) - self._engine = await init_beanie( + await init_beanie( database=client[db_name], document_models=document_models ) - logger.info(f"Using {db_name} database...") - return self._engine def _get_database_name(self, mode: str) -> str: """ @@ -94,14 +95,6 @@ def _get_database_name(self, mode: str) -> str: raise ValueError("Mode of database needs to be alphanumeric.") return f"agenta_{mode}" - def engine(self) -> AIOEngine: - """ - Return the initialized Beanie engine. 
- """ - if self._engine is None: - raise RuntimeError("Database engine has not been initialized yet.") - return self._engine - def remove_db(self) -> None: """ Remove the database based on the mode. diff --git a/agenta-backend/agenta_backend/models/db_models.py b/agenta-backend/agenta_backend/models/db_models.py index ae0eefb6c2..8606111c8d 100644 --- a/agenta-backend/agenta_backend/models/db_models.py +++ b/agenta-backend/agenta_backend/models/db_models.py @@ -121,7 +121,7 @@ class ConfigDB(Document): config_name: str current_version: int = Field(default=1) parameters: Dict[str, Any] = Field(default=dict) - version_history: List[Link[ConfigVersionDB]] = Field(default=[]) + version_history: List[ConfigVersionDB] = Field(default=[]) created_at: Optional[datetime] = Field(default=datetime.utcnow()) updated_at: Optional[datetime] = Field(default=datetime.utcnow()) diff --git a/agenta-backend/agenta_backend/routers/app_router.py b/agenta-backend/agenta_backend/routers/app_router.py index 857e4ce181..008ff298b9 100644 --- a/agenta-backend/agenta_backend/routers/app_router.py +++ b/agenta-backend/agenta_backend/routers/app_router.py @@ -306,6 +306,8 @@ async def remove_app(app_id: str, request: Request): detail = f"Docker error while trying to remove the app: {str(e)}" raise HTTPException(status_code=500, detail=detail) except Exception as e: + import traceback + traceback.print_exc() detail = f"Unexpected error while trying to remove the app: {str(e)}" raise HTTPException(status_code=500, detail=detail) diff --git a/agenta-backend/agenta_backend/routers/container_router.py b/agenta-backend/agenta_backend/routers/container_router.py index f0fc073154..2a4b8c5254 100644 --- a/agenta-backend/agenta_backend/routers/container_router.py +++ b/agenta-backend/agenta_backend/routers/container_router.py @@ -1,15 +1,9 @@ import os +import logging from typing import List, Optional, Union -from agenta_backend.models.api.api_models import ( - URI, - Image, - RestartAppContainer, - Template, -) -from agenta_backend.services import db_manager -from fastapi import Request, UploadFile, HTTPException -from agenta_backend.utils.common import APIRouter + from fastapi.responses import JSONResponse +from fastapi import Request, UploadFile, HTTPException if os.environ["FEATURE_FLAG"] in ["cloud", "ee"]: from agenta_backend.commons.services.selectors import ( @@ -26,16 +20,24 @@ else: from agenta_backend.services import container_manager -import logging +from agenta_backend.models.api.api_models import ( + URI, + Image, + RestartAppContainer, + Template, +) +from agenta_backend.services import db_manager +from agenta_backend.utils.common import APIRouter -logger = logging.getLogger(__name__) +logger = logging.getLogger(__name__) logger.setLevel(logging.DEBUG) router = APIRouter() # TODO: We need to improve this to use the introduced abstraction to also use start and stop service +#* Edit: someone remind me (abram) to work on this. 
@router.post("/build_image/", operation_id="build_image") async def build_image( app_id: str, diff --git a/agenta-backend/agenta_backend/routers/testset_router.py b/agenta-backend/agenta_backend/routers/testset_router.py index ec69df388a..fca1f2b89b 100644 --- a/agenta-backend/agenta_backend/routers/testset_router.py +++ b/agenta-backend/agenta_backend/routers/testset_router.py @@ -6,11 +6,10 @@ from bson import ObjectId from datetime import datetime from typing import Optional, List +from pydantic import ValidationError -from fastapi import HTTPException, UploadFile, File, Form, Request -from agenta_backend.utils.common import APIRouter from fastapi.responses import JSONResponse -from pydantic import ValidationError +from fastapi import HTTPException, UploadFile, File, Form, Request from agenta_backend.models.api.testset_model import ( TestSetSimpleResponse, @@ -18,15 +17,15 @@ NewTestset, TestSetOutputResponse, ) -from agenta_backend.utils.common import engine, check_access_to_app +from agenta_backend.services import db_manager from agenta_backend.models.db_models import TestSetDB from agenta_backend.services.db_manager import get_user -from agenta_backend.services import db_manager from agenta_backend.models.converters import testset_db_to_pydantic +from agenta_backend.utils.common import APIRouter, check_access_to_app -upload_folder = "./path/to/upload/folder" router = APIRouter() +upload_folder = "./path/to/upload/folder" if os.environ["FEATURE_FLAG"] in ["cloud", "ee"]: @@ -107,7 +106,7 @@ async def upload_file( testset_instance = TestSetDB(**document, user=user) except ValidationError as e: raise HTTPException(status_code=403, detail=e.errors()) - result = await engine.save(testset_instance) + result = await testset_instance.create() if isinstance(result.id, ObjectId): return TestSetSimpleResponse( @@ -172,7 +171,7 @@ async def import_testset( user = await get_user(user_uid=user_org_data["uid"]) testset_instance = TestSetDB(**document, user=user) - result = await engine.save(testset_instance) + result = await testset_instance.create() if isinstance(result.id, ObjectId): return TestSetSimpleResponse( @@ -239,7 +238,7 @@ async def create_testset( try: testset_instance = TestSetDB(**testset) - await engine.save(testset_instance) + await testset_instance.create() if testset_instance is not None: return TestSetSimpleResponse( @@ -288,9 +287,7 @@ async def update_testset( status_code=400, ) try: - test_set.update(testset_update) - await engine.save(test_set) - + await test_set.update({"$set": testset_update}) if isinstance(test_set.id, ObjectId): return { "status": "success", @@ -407,7 +404,7 @@ async def delete_testsets( {"detail": error_msg}, status_code=400, ) - await engine.delete(test_set) + await test_set.delete() deleted_ids.append(testset_id) return deleted_ids diff --git a/agenta-backend/agenta_backend/routers/variants_router.py b/agenta-backend/agenta_backend/routers/variants_router.py index e20668c59f..0553437377 100644 --- a/agenta-backend/agenta_backend/routers/variants_router.py +++ b/agenta-backend/agenta_backend/routers/variants_router.py @@ -64,7 +64,6 @@ async def add_variant_from_base_and_config( ) # Find the previous variant in the database - db_app_variant = await db_manager.add_variant_from_base_and_config( base_db=base_db, new_config_name=payload.new_config_name, @@ -72,9 +71,12 @@ async def add_variant_from_base_and_config( **user_org_data, ) logger.debug(f"Successfully added new variant: {db_app_variant}") - return await 
converters.app_variant_db_to_output(db_app_variant) + app_variant_db = await db_manager.get_app_variant_instance_by_id(str(db_app_variant.id)) + return await converters.app_variant_db_to_output(app_variant_db) except Exception as e: + import traceback + traceback.print_exc() logger.error(f"An exception occurred while adding the new variant: {e}") raise HTTPException(status_code=500, detail=str(e)) @@ -97,7 +99,6 @@ async def remove_variant( user_org_data: dict = await get_user_and_org_id(request.state.user_id) # Check app access - access_app = await check_access_to_variant( user_org_data, variant_id=variant_id, check_owner=True ) diff --git a/agenta-backend/agenta_backend/services/db_manager.py b/agenta-backend/agenta_backend/services/db_manager.py index 3f85ff94a1..684d91410a 100644 --- a/agenta-backend/agenta_backend/services/db_manager.py +++ b/agenta-backend/agenta_backend/services/db_manager.py @@ -1,12 +1,8 @@ import os import logging -import pymongo from pathlib import Path -from bson import ObjectId from datetime import datetime -from beanie.operators import In from urllib.parse import urlparse -from beanie import PydanticObjectId, Query from typing import Any, Dict, List, Optional from agenta_backend.models.api.api_models import ( @@ -49,12 +45,11 @@ from agenta_backend.utils.common import check_user_org_access from agenta_backend.models.api.evaluation_model import EvaluationStatusEnum - from fastapi import HTTPException from fastapi.responses import JSONResponse -from odmantic import query -from odmantic.exceptions import DocumentParsingError +from beanie.operators import In +from beanie import PydanticObjectId as ObjectId # Define logger @@ -117,20 +112,15 @@ async def get_image(app_variant: AppVariant, **kwargs: dict) -> ImageExtended: """ # Build the query expression for the two conditions - query = Query(AppVariantDB) # Create a query for AppVariantDB - query = query.filter( - AppVariantDB.app == app_variant.app_id - ).filter( - AppVariantDB.variant_name == app_variant.variant_name - ).filter( - AppVariantDB.organization == app_variant.organization + query_expression = ( + AppVariantDB.app.id == app_variant.app_id, + AppVariantDB.variant_name == app_variant.variant_name, + AppVariantDB.organization.id == app_variant.organization, ) - db_app_variant: AppVariantDB = await query.get() + db_app_variant = await AppVariantDB.find_one(query_expression) if db_app_variant: - image_db: ImageDB = await ImageDB.find_one( - ImageDB.id == db_app_variant.image.id - ) + image_db = await ImageDB.find_one(ImageDB.id == db_app_variant.image.id) return image_db_to_pydantic(image_db) else: raise Exception("App variant not found") @@ -157,7 +147,7 @@ async def fetch_app_by_id(app_id: str, **kwargs: dict) -> AppDB: app_id: _description_ """ assert app_id is not None, "app_id cannot be None" - app = await AppDB.find_one(AppDB.id == app_id) + app = await AppDB.find_one(AppDB.id == ObjectId(app_id), fetch_links=True) return app @@ -172,15 +162,17 @@ async def fetch_app_by_name( Returns: AppDB: the instance of the app """ - query = Query(AppDB) # Create a base query for AppDB if not organization_id: user = await get_user(user_uid=user_org_data["uid"]) - query = query.filter(AppDB.app_name == app_name).filter(AppDB.user == user.id) + query_expressions = {"app_name": app_name, "user": user.id} else: - query = query.filter(AppDB.app_name == app_name).filter(AppDB.organization == organization_id) + query_expressions = { + "app_name": app_name, + "organization": ObjectId(organization_id), + } - app = await 
query.get() + app = await AppDB.find_one(query_expressions) return app @@ -198,7 +190,7 @@ async def fetch_app_variant_by_id( """ assert app_variant_id is not None, "app_variant_id cannot be None" app_variant = await AppVariantDB.find_one( - AppVariantDB.id == app_variant_id + AppVariantDB.id == ObjectId(app_variant_id), fetch_links=True ) return app_variant @@ -216,7 +208,7 @@ async def fetch_base_by_id( """ if base_id is None: raise Exception("No base_id provided") - base = await VariantBaseDB.find_one(VariantBaseDB.id == base_id) + base = await VariantBaseDB.find_one(VariantBaseDB.id == ObjectId(base_id)) if base is None: logger.error("Base not found") return False @@ -243,11 +235,11 @@ async def fetch_app_variant_by_name_and_appid( AppVariantDB: the instance of the app variant """ - query = Query(AppVariantDB) - query = query.filter(AppVariantDB.variant_name == variant_name).filter( - AppVariantDB.app == app_id + query_expressions = ( + AppVariantDB.variant_name == variant_name, + AppVariantDB.app.id == ObjectId(app_id), ) - app_variant_db = await query.get() + app_variant_db = await AppVariantDB.find_one(query_expressions) return app_variant_db @@ -491,7 +483,9 @@ async def get_deployment_by_objectid( DeploymentDB: instance of deployment object """ - deployment = await DeploymentDB.find_one(DeploymentDB.id == deployment_id) + deployment = await DeploymentDB.find_one( + DeploymentDB.id == ObjectId(deployment_id), fetch_links=True + ) logger.debug(f"deployment: {deployment}") return deployment @@ -507,7 +501,7 @@ async def get_organization_object(organization_id: str) -> OrganizationDB: OrganizationDB: The fetched organization. """ organization = await OrganizationDB.find_one( - OrganizationDB.id == organization_id + OrganizationDB.id == ObjectId(organization_id) ) return organization @@ -523,10 +517,9 @@ async def get_organizations_by_list_ids(organization_ids: List) -> List: List: A list of dictionaries representing the retrieved organizations. 
""" - organizations_db: List[OrganizationDB] = await OrganizationDB.find( + organizations_db = await OrganizationDB.find( In(OrganizationDB.id, organization_ids) ).to_list() - return organizations_db @@ -541,9 +534,10 @@ async def list_app_variants_for_app_id( List[AppVariant]: List of AppVariant objects """ assert app_id is not None, "app_id cannot be None" - query = Query(AppVariantDB).filter(AppVariantDB.app == app_id) - app_variants_db: List[AppVariantDB] = await query.sort("variant_name").find() - + query_expressions = AppVariantDB.app.id == ObjectId(app_id) + app_variants_db = await AppVariantDB.find( + query_expressions, fetch_links=True + ).to_list() return app_variants_db @@ -551,10 +545,10 @@ async def list_bases_for_app_id( app_id: str, base_name: Optional[str] = None, **kwargs: dict ) -> List[VariantBaseDB]: assert app_id is not None, "app_id cannot be None" - query = Query(VariantBaseDB).filter(VariantBaseDB.app == app_id) + query_expressions = VariantBaseDB.app.id == ObjectId(app_id) if base_name: - query = query.filter(VariantBaseDB.base_name == base_name) - bases_db: List[VariantBaseDB] = await query.sort("base_name").find() + query_expressions += query_expressions(VariantBaseDB.base_name == base_name) + bases_db = await VariantBaseDB.find(query_expressions).sort("base_name").to_list() return bases_db @@ -569,9 +563,12 @@ async def list_variants_for_base( List[AppVariant]: List of AppVariant objects """ assert base is not None, "base cannot be None" - query = Query(AppVariantDB).filter(AppVariantDB.base == base.id) - app_variants_db: List[AppVariantDB] = await query.sort("variant_name").find() - + query_expressions = AppVariantDB.base.id == ObjectId(base.id) + app_variants_db = ( + await AppVariantDB.find(query_expressions, fetch_links=True) + .sort("variant_name") + .to_list() + ) return app_variants_db @@ -588,20 +585,18 @@ async def get_user(user_uid: str) -> UserDB: user = await UserDB.find_one(UserDB.uid == user_uid) if user is None: if os.environ["FEATURE_FLAG"] not in ["cloud", "ee"]: - create_user = UserDB(uid="0") - await create_user.insert() + user_db = UserDB(uid="0") + user = await user_db.create() - org = OrganizationDB(type="default", owner=create_user.id) - await org.insert() + org_db = OrganizationDB(type="default", owner=str(user.id)) + org = await org_db.create() - create_user.organizations.append(org.id) - await create_user.update() + user_db.organizations.append(org.id) + await user_db.save() - return create_user - else: - raise Exception("Please login or signup") - else: - return user + return user + raise Exception("Please login or signup") + return user async def get_user_with_id(user_id: ObjectId): @@ -664,8 +659,7 @@ async def get_users_by_ids(user_ids: List) -> List: List: A list of dictionaries representing the retrieved users. 
""" - users_db: List[UserDB] = await UserDB.find(In(UserDB.id, user_ids)).to_list() - + users_db = await UserDB.find(In(UserDB.id, user_ids)).to_list() return users_db @@ -682,8 +676,11 @@ async def get_orga_image_instance_by_docker_id( ImageDB: instance of image object """ - query = Query(ImageDB).filter(ImageDB.docker_id == docker_id).filter(ImageDB.organization == organization_id) - image = await query.get() + query_expressions = { + "docker_id": docker_id, + "organization": ObjectId(organization_id), + } + image = await ImageDB.find_one(query_expressions) return image @@ -704,8 +701,11 @@ async def get_orga_image_instance_by_uri( if not parsed_url.scheme and not parsed_url.netloc: raise ValueError(f"Invalid URL: {template_uri}") - query = Query(ImageDB).filter(ImageDB.template_uri == template_uri).filter(ImageDB.organization == organization_id) - image = await query.get() + query_expressions = ( + ImageDB.template_uri == template_uri, + ImageDB.organization == organization_id, + ) + image = await ImageDB.fine_one(query_expressions) return image @@ -719,7 +719,7 @@ async def get_app_instance_by_id(app_id: str) -> AppDB: AppDB: instance of app object """ - app = await AppDB.find_one(AppDB.id == app_id) + app = await AppDB.find_one(AppDB.id == ObjectId(app_id)) return app @@ -807,7 +807,7 @@ async def list_apps( organization_access = await check_user_org_access(user_org_data, org_id) if organization_access: apps: List[AppDB] = await AppDB.find( - AppDB.organization == org_id + AppDB.organization.id == ObjectId(org_id) ).to_list() return [app_db_to_pydantic(app) for app in apps] @@ -818,7 +818,7 @@ async def list_apps( ) else: - apps: List[AppVariantDB] = await AppDB.find(AppDB.user == user.id).to_list() + apps = await AppDB.find(AppDB.user.id == user.id).to_list() return [app_db_to_pydantic(app) for app in apps] @@ -832,15 +832,9 @@ async def list_app_variants(app_id: str = None, **kwargs: dict) -> List[AppVaria """ # Construct query expressions - logger.debug("app_id: %s", app_id) - query = Query(AppVariantDB) - if app_id is not None: - query = query.filter(AppVariantDB.app == app_id) - - logger.debug("query: %s", query) - app_variants_db: List[AppVariantDB] = await query.find() - - # Include previous variant name + app_variants_db = await AppVariantDB.find( + AppVariantDB.app.id == ObjectId(app_id), fetch_links=True + ).to_list() return app_variants_db @@ -856,14 +850,10 @@ async def check_is_last_variant_for_image(db_app_variant: AppVariantDB) -> bool: true if it's the last variant, false otherwise """ - query = Query(AppVariantDB) - query = query.filter( - AppVariantDB.organization == db_app_variant.organization.id - ).filter( - AppVariantDB.base == db_app_variant.base.id - ) - count_variants = await query.count() - + count_variants = await AppVariantDB.find( + AppVariantDB.organization.id == db_app_variant.organization.id, + AppVariantDB.base.id == db_app_variant.base.id, + ).count() return count_variants == 1 @@ -926,11 +916,10 @@ async def deploy_to_environment(environment_name: str, variant_id: str, **kwargs raise ValueError("App variant not found") # Find the environment for the given app name and user - query = Query(AppEnvironmentDB) - query = query.filter(AppEnvironmentDB.app == app_variant_db.app.id) - query = query.filter(AppEnvironmentDB.name == environment_name) - - environment_db: AppEnvironmentDB = await query.get() + environment_db = await AppEnvironmentDB.find_one( + AppEnvironmentDB.app.id == app_variant_db.app.id, + AppEnvironmentDB.name == environment_name, + ) if 
environment_db is None: raise ValueError(f"Environment {environment_name} not found") @@ -941,7 +930,7 @@ async def deploy_to_environment(environment_name: str, variant_id: str, **kwargs # Update the environment with the new variant name environment_db.deployed_app_variant = app_variant_db.id - await environment_db.update() + await environment_db.save() async def list_environments(app_id: str, **kwargs: dict) -> List[AppEnvironmentDB]: @@ -961,10 +950,9 @@ async def list_environments(app_id: str, **kwargs: dict) -> List[AppEnvironmentD logging.error(f"App with id {app_id} not found") raise ValueError("App not found") - environments_db: List[AppEnvironmentDB] = await AppEnvironmentDB.find( - AppEnvironmentDB.app == app_id + environments_db = await AppEnvironmentDB.find( + AppEnvironmentDB.app.id == ObjectId(app_id), fetch_links=True ).to_list() - return environments_db @@ -1026,10 +1014,9 @@ async def list_environments_by_variant( List[AppEnvironmentDB]: A list of AppEnvironmentDB objects. """ - environments_db: List[AppEnvironmentDB] = await AppEnvironmentDB.find( - AppEnvironmentDB.app == app_variant.app.id, + environments_db = await AppEnvironmentDB.find( + AppEnvironmentDB.app == app_variant.app.id, fetch_links=True ).to_list() - return environments_db @@ -1085,7 +1072,9 @@ async def remove_app_testsets(app_id: str, **kwargs): deleted_count: int = 0 # Build query expression - testsets = await TestSetDB.find(TestSetDB.app == app_id).to_list() + testsets = await TestSetDB.find( + TestSetDB.app.id == ObjectId(app_id), fetch_links=True + ).to_list() # Perform deletion if there are testsets to delete if testsets is not None: @@ -1171,7 +1160,7 @@ async def update_variant_parameters( config_db.parameters = parameters # Save updated ConfigDB - await config_db.create() + await config_db.save() except Exception as e: logging.error(f"Issue updating variant parameters: {e}") @@ -1192,15 +1181,15 @@ async def get_app_variant_by_app_name_and_environment( Returns: Optional[AppVariantDB]: The deployed app variant for the given app and environment, or None if not found. """ - # Get the environment + # Construct query filters for finding the environment in the database - query = Query(AppEnvironmentDB) - query = query.filter(AppEnvironmentDB.name == environment).filter( - AppEnvironmentDB.app == app_id + query_expressions = ( + AppEnvironmentDB.name == environment, + AppEnvironmentDB.app.id == ObjectId(app_id), ) # Perform the database query to find the environment - environment_db = await query.get() + environment_db = await AppEnvironmentDB.find_one(query_expressions) if not environment_db: logger.info(f"Environment {environment} not found") @@ -1212,7 +1201,6 @@ async def get_app_variant_by_app_name_and_environment( app_variant_db = await get_app_variant_instance_by_id( str(environment_db.deployed_app_variant) ) - return app_variant_db @@ -1227,7 +1215,7 @@ async def get_app_variant_instance_by_id(variant_id: str): """ app_variant_db = await AppVariantDB.find_one( - AppVariantDB.id == variant_id + AppVariantDB.id == ObjectId(variant_id), fetch_links=True ) return app_variant_db @@ -1240,7 +1228,9 @@ async def fetch_testset_by_id(testset_id: str) -> Optional[TestSetDB]: TestSetDB: The fetched testset, or None if no testset was found. 
""" assert testset_id is not None, "testset_id cannot be None" - testset = await TestSetDB.find_one(TestSetDB.id == testset_id) + testset = await TestSetDB.find_one( + TestSetDB.id == ObjectId(testset_id), fetch_links=True + ) return testset @@ -1252,7 +1242,7 @@ async def fetch_testsets_by_app_id(app_id: str) -> List[TestSetDB]: List[TestSetDB]: The fetched testsets. """ assert app_id is not None, "app_id cannot be None" - testsets = await TestSetDB.find(TestSetDB.app == app_id).to_list() + testsets = await TestSetDB.find(TestSetDB.app.id == ObjectId(app_id)).to_list() return testsets @@ -1264,9 +1254,7 @@ async def fetch_evaluation_by_id(evaluation_id: str) -> Optional[EvaluationDB]: EvaluationDB: The fetched evaluation, or None if no evaluation was found. """ assert evaluation_id is not None, "evaluation_id cannot be None" - evaluation = await EvaluationDB.find_one( - EvaluationDB.id == evaluation_id - ) + evaluation = await EvaluationDB.find_one(EvaluationDB.id == ObjectId(evaluation_id)) return evaluation @@ -1281,7 +1269,7 @@ async def fetch_human_evaluation_by_id( """ assert evaluation_id is not None, "evaluation_id cannot be None" evaluation = await HumanEvaluationDB.find_one( - HumanEvaluationDB.id == evaluation_id + HumanEvaluationDB.id == ObjectId(evaluation_id) ) return evaluation @@ -1297,7 +1285,7 @@ async def fetch_evaluation_scenario_by_id( """ assert evaluation_scenario_id is not None, "evaluation_scenario_id cannot be None" evaluation_scenario = await EvaluationScenarioDB.find_one( - EvaluationScenarioDB.id == evaluation_scenario_id + EvaluationScenarioDB.id == ObjectId(evaluation_scenario_id) ) return evaluation_scenario @@ -1313,7 +1301,7 @@ async def fetch_human_evaluation_scenario_by_id( """ assert evaluation_scenario_id is not None, "evaluation_scenario_id cannot be None" evaluation_scenario = await HumanEvaluationScenarioDB.find_one( - HumanEvaluationScenarioDB.id == evaluation_scenario_id, + HumanEvaluationScenarioDB.id == ObjectId(evaluation_scenario_id), ) return evaluation_scenario @@ -1331,7 +1319,7 @@ async def find_previous_variant_from_base_id( """ assert base_id is not None, "base_id cannot be None" previous_variants = await AppVariantDB.find( - AppVariantDB.base == base_id + AppVariantDB.base.id == ObjectId(base_id) ).to_list() logger.debug("previous_variants: %s", previous_variants) if len(list(previous_variants)) == 0: @@ -1354,9 +1342,7 @@ async def add_template(**kwargs: dict) -> str: Returns: template_id (Str): The Id of the created template. 
""" - existing_template = await TemplateDB.find_one( - TemplateDB.tag_id == kwargs["tag_id"] - ) + existing_template = await TemplateDB.find_one(TemplateDB.tag_id == kwargs["tag_id"]) if existing_template is None: db_template = TemplateDB(**kwargs) await db_template.create() @@ -1418,9 +1404,7 @@ async def get_template(template_id: str) -> TemplateDB: """ assert template_id is not None, "template_id cannot be None" - template_db = await TemplateDB.find_one( - TemplateDB.id == template_id - ) + template_db = await TemplateDB.find_one(TemplateDB.id == ObjectId(template_id)) return template_db @@ -1432,34 +1416,18 @@ async def remove_old_template_from_db(tag_ids: list) -> None: """ templates_to_delete = [] - try: - templates: List[TemplateDB] = await TemplateDB.find().to_list() - - for temp in templates: - if temp.tag_id not in tag_ids: - templates_to_delete.append(temp) - - for template in templates_to_delete: - await template.delete() - except DocumentParsingError as exc: - remove_document_using_driver(str(exc.primary_value), "templates") - + templates: List[TemplateDB] = await TemplateDB.find().to_list() -def remove_document_using_driver(document_id: str, collection_name: str) -> None: - """Deletes document from using pymongo driver""" + for temp in templates: + if temp.tag_id not in tag_ids: + templates_to_delete.append(temp) - client = pymongo.MongoClient(os.environ["MONGODB_URI"]) - db = client.get_database("agenta_v2") - - collection = db.get_collection(collection_name) - deleted = collection.delete_one({"_id": PydanticObjectId(document_id)}) - print( - f"Deleted documents in {collection_name} collection. Acknowledged: {deleted.acknowledged}" - ) + for template in templates_to_delete: + await template.delete() async def get_templates() -> List[Template]: - templates = await TemplateDB.find_all() + templates = await TemplateDB.find().to_list() return templates_db_to_pydantic(templates) @@ -1473,8 +1441,8 @@ async def count_apps(**user_org_data: dict) -> int: if user is None: return 0 - query = Query(AppVariantDB).filter(AppVariantDB.user == user.id) - no_of_apps = await query.count() + query_expressions = AppVariantDB.user.id == user.id + no_of_apps = await AppVariantDB.find(query_expressions).count() return no_of_apps @@ -1492,7 +1460,7 @@ async def update_base( if hasattr(base, key): setattr(base, key, value) - await base.update() + await base.save() return base @@ -1507,7 +1475,7 @@ async def update_app_variant( """ for key, value in kwargs.items(): if hasattr(app_variant, key): - setattr(app_variant, key, value) + setattr(app_variant, key, value) await app_variant.update() return app_variant @@ -1533,7 +1501,9 @@ async def fetch_base_and_check_access( """ if base_id is None: raise Exception("No base_id provided") - base = await VariantBaseDB.find_one(VariantBaseDB.id == base_id) + base = await VariantBaseDB.find_one( + VariantBaseDB.id == ObjectId(base_id), fetch_links=True + ) if base is None: logger.error("Base not found") raise HTTPException(status_code=404, detail="Base not found") @@ -1564,7 +1534,7 @@ async def fetch_app_and_check_access( Raises: HTTPException: If the app is not found or the user does not have access to it. """ - app = await AppDB.find_one(AppDB.id == app_id) + app = await AppDB.find_one(AppDB.id == ObjectId(app_id)) if app is None: logger.error("App not found") raise HTTPException @@ -1598,7 +1568,7 @@ async def fetch_app_variant_and_check_access( HTTPException: If the app variant is not found or the user does not have access to it. 
""" app_variant = await AppVariantDB.find_one( - AppVariantDB.id == app_variant_id + AppVariantDB.id == ObjectId(app_variant_id) ) if app_variant is None: logger.error("App variant not found") @@ -1628,11 +1598,9 @@ async def fetch_app_by_name_and_organization( AppDB: the instance of the app """ - query = Query(AppDB) - query = query.filter(AppDB.app_name == app_name).filter( - AppDB.organization == organization_id + app_db = await AppDB.find_one( + {"app_name": app_name, "organization": ObjectId(organization_id)} ) - app_db = await query.get() return app_db @@ -1686,7 +1654,7 @@ async def create_new_evaluation_scenario( user=user, organization=organization, evaluation=evaluation, - variant_id=PydanticObjectId(variant_id), + variant_id=ObjectId(variant_id), inputs=inputs, outputs=outputs, correct_answer=correct_answer, @@ -1704,7 +1672,7 @@ async def create_new_evaluation_scenario( async def update_evaluation_with_aggregated_results( evaluation_id: ObjectId, aggregated_results: List[AggregatedResult] ) -> EvaluationDB: - evaluation = await EvaluationDB.find_one(EvaluationDB.id == evaluation_id) + evaluation = await EvaluationDB.find_one(EvaluationDB.id == ObjectId(evaluation_id)) if not evaluation: raise ValueError("Evaluation not found") @@ -1726,9 +1694,8 @@ async def fetch_evaluators_configs(app_id: str): assert app_id is not None, "evaluation_id cannot be None" try: - query = Query(EvaluatorConfigDB) - query = query.filter(EvaluatorConfigDB.app == app_id) - evaluators_configs = await query.get() + query_expressions = EvaluatorConfigDB.app.id == ObjectId(app_id) + evaluators_configs = await EvaluatorConfigDB.find_one(query_expressions) return evaluators_configs except Exception as e: raise e @@ -1743,7 +1710,7 @@ async def fetch_evaluator_config(evaluator_config_id: str): try: evaluator_config: EvaluatorConfigDB = await EvaluatorConfigDB.find_one( - EvaluatorConfigDB.id == evaluator_config_id + EvaluatorConfigDB.id == ObjectId(evaluator_config_id) ) return evaluator_config except Exception as e: @@ -1764,11 +1731,11 @@ async def fetch_evaluator_config_by_appId( """ try: - query = Query(EvaluatorConfigDB) - query = query.filter(EvaluatorConfigDB.app == app_id).filter( - EvaluatorConfigDB.evaluator_key == evaluator_name + query_expressions = ( + EvaluatorConfigDB.app.id == ObjectId(app_id), + EvaluatorConfigDB.evaluator_key == evaluator_name, ) - evaluator_config = await query.get() + evaluator_config = await EvaluatorConfigDB.find_one(query_expressions) return evaluator_config except Exception as e: raise e @@ -1813,15 +1780,16 @@ async def update_evaluator_config( Returns: EvaluatorConfigDB: The updated evaluator configuration object. 
""" + evaluator_config = await EvaluatorConfigDB.find_one( - EvaluatorConfigDB.id == evaluator_config_id + EvaluatorConfigDB.id == ObjectId(evaluator_config_id) ) updates_dict = updates.dict(exclude_unset=True) for key, value in updates_dict.items(): if hasattr(evaluator_config, key): setattr(evaluator_config, key, value) - await evaluator_config.update() + await evaluator_config.save() return evaluator_config @@ -1830,9 +1798,10 @@ async def delete_evaluator_config(evaluator_config_id: str) -> bool: assert evaluator_config_id is not None, "Evaluator Config ID cannot be None" try: - delete_result = remove_document_using_driver( - str(evaluator_config_id), "evaluators_configs" + evaluator_config = await EvaluatorConfigDB.find_one( + EvaluatorConfigDB.id == ObjectId(evaluator_config_id) ) + delete_result = evaluator_config.delete() return ( delete_result is None ) # checking if delete_result is None (has been deleted) @@ -1853,9 +1822,7 @@ async def update_evaluation( Returns: EvaluatorConfigDB: The updated evaluator configuration object. """ - evaluation = await EvaluationDB.find_one( - EvaluationDB.id == evaluation_id - ) + evaluation = await EvaluationDB.find_one(EvaluationDB.id == ObjectId(evaluation_id)) for key, value in updates.items(): if hasattr(evaluation, key): diff --git a/agenta-backend/agenta_backend/services/evaluation_service.py b/agenta-backend/agenta_backend/services/evaluation_service.py index 91e69f42e1..78129e0bed 100644 --- a/agenta-backend/agenta_backend/services/evaluation_service.py +++ b/agenta-backend/agenta_backend/services/evaluation_service.py @@ -1,16 +1,12 @@ import logging -from bson import ObjectId from datetime import datetime from typing import Dict, List, Any from fastapi import HTTPException from agenta_backend.models.api.evaluation_model import ( - CustomEvaluationNames, Evaluation, EvaluationScenario, - CustomEvaluationOutput, - CustomEvaluationDetail, EvaluationScenarioInput, EvaluationType, EvaluationTypeSettings, @@ -25,9 +21,8 @@ ) from agenta_backend.models import converters from agenta_backend.services import db_manager -from agenta_backend.services.db_manager import query, get_user -from agenta_backend.utils.common import engine, check_access_to_app -from agenta_backend.services.security.sandbox import execute_code_safely +from agenta_backend.services.db_manager import get_user +from agenta_backend.utils.common import check_access_to_app from agenta_backend.models.db_models import ( AppVariantDB, EvaluationDB, @@ -41,9 +36,8 @@ CustomEvaluationDB, ) -from langchain.chains import LLMChain -from langchain.llms import OpenAI -from langchain.prompts import PromptTemplate +from beanie import PydanticObjectId as ObjectId + logger = logging.getLogger(__name__) logger.setLevel(logging.DEBUG) @@ -172,7 +166,7 @@ async def prepare_csvdata_and_create_evaluation_scenario( for name in payload_inputs ] except KeyError: - await engine.delete(new_evaluation) + await new_evaluation.delete() msg = f""" Columns in the test set should match the names of the inputs in the variant. 
Inputs names in variant are: {[variant_input for variant_input in payload_inputs]} while @@ -208,7 +202,7 @@ async def prepare_csvdata_and_create_evaluation_scenario( inputs=list_of_scenario_input, outputs=[], ) - await engine.save(eval_scenario_instance) + await eval_scenario_instance.create() async def create_evaluation_scenario( @@ -250,7 +244,7 @@ async def create_evaluation_scenario( updated_at=datetime.utcnow(), ) - await engine.save(new_eval_scenario) + await new_eval_scenario.create() async def update_human_evaluation_service( @@ -294,7 +288,7 @@ async def update_human_evaluation_service( # Update the evaluation evaluation.update(updates) - await engine.save(evaluation) + await evaluation.create() async def fetch_evaluation_scenarios_for_evaluation( @@ -317,10 +311,9 @@ async def fetch_evaluation_scenarios_for_evaluation( evaluation_id=evaluation_id, **user_org_data, ) - scenarios = await engine.find( - EvaluationScenarioDB, - EvaluationScenarioDB.evaluation == ObjectId(evaluation.id), - ) + scenarios = await EvaluationScenarioDB.find( + EvaluationScenarioDB.evaluation == ObjectId(evaluation.id) + ).to_list() eval_scenarios = [ converters.evaluation_scenario_db_to_pydantic(scenario) for scenario in scenarios @@ -348,12 +341,9 @@ async def fetch_human_evaluation_scenarios_for_evaluation( evaluation_id=evaluation_id, **user_org_data, ) - print("$$$$$$ evaluation") - print(evaluation) - scenarios = await engine.find( - HumanEvaluationScenarioDB, - HumanEvaluationScenarioDB.evaluation == ObjectId(evaluation.id), - ) + scenarios = await HumanEvaluationScenarioDB.find( + HumanEvaluationScenarioDB.evaluation == ObjectId(evaluation.id) + ).to_list() eval_scenarios = [ converters.human_evaluation_scenario_db_to_pydantic(scenario) for scenario in scenarios @@ -434,8 +424,7 @@ async def update_human_evaluation_scenario( if updated_data["correct_answer"] is not None: new_eval_set["correct_answer"] = updated_data["correct_answer"] - eval_scenario.update(new_eval_set) - await engine.save(eval_scenario) + await eval_scenario.update(new_eval_set) async def update_evaluation_scenario_score_service( @@ -458,7 +447,7 @@ async def update_evaluation_scenario_score_service( eval_scenario.score = score # Save the updated evaluation scenario - await engine.save(eval_scenario) + await eval_scenario.create() async def get_evaluation_scenario_score_service( @@ -535,9 +524,7 @@ async def fetch_list_evaluations( detail=f"You do not have access to this app: {app_id}", ) - evaluations_db = await engine.find( - EvaluationDB, EvaluationDB.app == ObjectId(app_id) - ) + evaluations_db = await EvaluationDB.find(EvaluationDB.app == ObjectId(app_id)).to_list() return [ await converters.evaluation_db_to_pydantic(evaluation) for evaluation in evaluations_db @@ -582,9 +569,9 @@ async def fetch_list_human_evaluations( detail=f"You do not have access to this app: {app_id}", ) - evaluations_db = await engine.find( - HumanEvaluationDB, HumanEvaluationDB.app == ObjectId(app_id) - ) + evaluations_db = await HumanEvaluationDB.find( + HumanEvaluationDB.app == ObjectId(app_id) + ).to_list() return [ await converters.human_evaluation_db_to_pydantic(evaluation) for evaluation in evaluations_db @@ -627,7 +614,7 @@ async def delete_human_evaluations( evaluation = await _fetch_human_evaluation_and_check_access( evaluation_id=evaluation_id, **user_org_data ) - await engine.delete(evaluation) + await evaluation.delete() async def delete_evaluations(evaluation_ids: List[str], **user_org_data: dict) -> None: @@ -645,7 +632,7 @@ async def 
delete_evaluations(evaluation_ids: List[str], **user_org_data: dict) - evaluation = await _fetch_evaluation_and_check_access( evaluation_id=evaluation_id, **user_org_data ) - await engine.delete(evaluation) + await evaluation.delete() async def create_custom_code_evaluation( @@ -680,7 +667,7 @@ async def create_custom_code_evaluation( updated_at=datetime.utcnow(), ) - await engine.save(custom_eval) + await custom_eval.create() return str(custom_eval.id) @@ -699,12 +686,13 @@ async def update_custom_code_evaluation( user = await get_user(user_uid=kwargs["uid"]) # Build query expression - query_expression = query.eq(CustomEvaluationDB.user, user.id) & query.eq( - CustomEvaluationDB.id, ObjectId(id) + query_expression = ( + CustomEvaluationDB.user == user.id, + CustomEvaluationDB.id == ObjectId(id), ) # Get custom evaluation - custom_eval = await engine.find_one(CustomEvaluationDB, query_expression) + custom_eval = await CustomEvaluationDB.find_one(query_expression) if not custom_eval: raise HTTPException(status_code=404, detail="Custom evaluation not found") @@ -714,216 +702,10 @@ async def update_custom_code_evaluation( custom_eval.updated_at = datetime.utcnow() # Save the updated custom evaluation - await engine.save(custom_eval) - + await custom_eval.create() return str(custom_eval.id) -async def execute_custom_code_evaluation( - evaluation_id: str, - app_id: str, - output: str, - correct_answer: str, - variant_id: str, - inputs: Dict[str, Any], - **user_org_data: dict, -): - """Execute the custom evaluation code. - - Args: - evaluation_id (str): the custom evaluation id - app_id (str): the ID of the app - output (str): required by the custom code - correct_answer (str): required by the custom code - variant_id (str): required by the custom code - inputs (Dict[str, Any]): required by the custom code - - Raises: - HTTPException: Evaluation not found - HTTPException: You do not have access to this app: {app_id} - HTTPException: App variant not found - HTTPException: Failed to execute custom code evaluation - - Returns: - result: The result of the executed custom code - """ - logger.debug( - f"evaluation_id {evaluation_id} | app_id {app_id} | variant_id {variant_id} | inputs {inputs} | output {output} | correct_answer {correct_answer}" - ) - # Get user object - user = await get_user(user_uid=user_org_data["uid"]) - - # Build query expression - query_expression = query.eq( - CustomEvaluationDB.id, ObjectId(evaluation_id) - ) & query.eq(CustomEvaluationDB.user, user.id) - - # Get custom evaluation - custom_eval = await engine.find_one(CustomEvaluationDB, query_expression) - if not custom_eval: - raise HTTPException(status_code=404, detail="Evaluation not found") - - # Check if user has app access - access = await check_access_to_app(user_org_data=user_org_data, app_id=app_id) - if not access: - raise HTTPException( - status_code=403, - detail=f"You do not have access to this app: {app_id}", - ) - - # Retrieve app from database - app = await db_manager.fetch_app_by_id(app_id=app_id) - - # Build query expression for app variant - appvar_query_expression = query.eq(AppVariantDB.app, app.id) & query.eq( - AppVariantDB.id, ObjectId(variant_id) - ) - - # Get app variant object - app_variant = await engine.find_one(AppVariantDB, appvar_query_expression) - if not app_variant: - raise HTTPException(status_code=404, detail="App variant not found") - - # Execute the Python code with the provided inputs - try: - result = execute_code_safely( - app_variant.config.parameters, - inputs, - output, - 
correct_answer, - custom_eval.python_code, - ) - except Exception as e: - raise HTTPException( - status_code=500, - detail=f"Failed to execute custom code evaluation: {str(e)}", - ) - return result - - -async def fetch_custom_evaluations( - app_id: str, **user_org_data: dict -) -> List[CustomEvaluationOutput]: - """Fetch a list of custom evaluations from the database. - - Args: - app_name (str): the name of the app - - Returns: - List[CustomEvaluationOutput]: ls=ist of custom evaluations - """ - # Get user object - access = await check_access_to_app(user_org_data=user_org_data, app_id=app_id) - if not access: - raise HTTPException( - status_code=403, - detail=f"You do not have access to this app: {app_id}", - ) - - # Retrieve app from database - app = await db_manager.fetch_app_by_id(app_id=app_id) - - # Get custom evaluations - custom_evals = await engine.find( - CustomEvaluationDB, CustomEvaluationDB.app == ObjectId(app.id) - ) - if not custom_evals: - return [] - - # Convert custom evaluations to evaluations - evaluations = [] - for custom_eval in custom_evals: - evaluations.append( - CustomEvaluationOutput( - id=str(custom_eval.id), - app_id=str(custom_eval.app.id), - evaluation_name=custom_eval.evaluation_name, - created_at=custom_eval.created_at, - ) - ) - return evaluations - - -async def fetch_custom_evaluation_detail( - id: str, **user_org_data: dict -) -> CustomEvaluationDetail: - """Fetch the detail of custom evaluation from the database. - - Args: - id (str): the id of the custom evaluation - - Returns: - CustomEvaluationDetail: Detail of the custom evaluation - """ - - # Get user object - user = await get_user(user_uid=user_org_data["uid"]) - - # Build query expression - query_expression = query.eq(CustomEvaluationDB.user, user.id) & query.eq( - CustomEvaluationDB.id, ObjectId(id) - ) - - # Get custom evaluation - custom_eval = await engine.find_one(CustomEvaluationDB, query_expression) - if not custom_eval: - raise HTTPException(status_code=404, detail="Custom evaluation not found") - - return CustomEvaluationDetail( - id=str(custom_eval.id), - app_id=str(custom_eval.app.id), - python_code=custom_eval.python_code, - evaluation_name=custom_eval.evaluation_name, - created_at=custom_eval.created_at, - updated_at=custom_eval.updated_at, - ) - - -async def fetch_custom_evaluation_names( - app_id: str, **user_org_data: dict -) -> List[CustomEvaluationNames]: - """Fetch the names of custom evaluation from the database. 
- - Args: - id (str): the name of the app the evaluation belongs to - - Returns: - List[CustomEvaluationNames]: the list of name of custom evaluations - """ - - # Get user object - user = await get_user(user_uid=user_org_data["uid"]) - - # Check if user has app access - access = await check_access_to_app(user_org_data=user_org_data, app_id=app_id) - if not access: - raise HTTPException( - status_code=403, - detail=f"You do not have access to this app: {app_id}", - ) - - # Retrieve app from database - app = await db_manager.fetch_app_by_id(app_id=app_id) - - # Build query expression - query_expression = query.eq(CustomEvaluationDB.user, user.id) & query.eq( - CustomEvaluationDB.app, app.id - ) - - # Get custom evaluation - custom_evals = await engine.find(CustomEvaluationDB, query_expression) - - list_of_custom_eval_names = [] - for custom_eval in custom_evals: - list_of_custom_eval_names.append( - CustomEvaluationNames( - id=str(custom_eval.id), - evaluation_name=custom_eval.evaluation_name, - ) - ) - return list_of_custom_eval_names - - async def create_new_human_evaluation( payload: NewHumanEvaluation, **user_org_data: dict ) -> EvaluationDB: @@ -969,8 +751,7 @@ async def create_new_human_evaluation( created_at=current_time, updated_at=current_time, ) - newEvaluation = await engine.save(eval_instance) - + newEvaluation = await eval_instance.create() if newEvaluation is None: raise HTTPException( status_code=500, detail="Failed to create evaluation_scenario" diff --git a/agenta-backend/agenta_backend/services/event_db_manager.py b/agenta-backend/agenta_backend/services/event_db_manager.py index cbc2971644..ba7441883e 100644 --- a/agenta-backend/agenta_backend/services/event_db_manager.py +++ b/agenta-backend/agenta_backend/services/event_db_manager.py @@ -1,6 +1,5 @@ import logging from typing import List -from bson import ObjectId from datetime import datetime from fastapi import HTTPException @@ -26,14 +25,10 @@ Feedback as FeedbackDB, SpanDB, ) -from agenta_backend.models.db_engine import DBEngine -from odmantic import query +from beanie import PydanticObjectId as ObjectId -# Initialize database engine -engine = DBEngine().engine() - logger = logging.getLogger(__name__) logger.setLevel(logging.INFO) @@ -52,13 +47,8 @@ async def get_variant_traces( """ user = await db_manager.get_user(user_uid=kwargs["uid"]) - query_expressions = ( - query.eq(TraceDB.user, user.id) - & query.eq(TraceDB.app_id, app_id) - & query.eq(TraceDB.variant_id, variant_id) - ) - - traces = await engine.find(TraceDB, query_expressions) + query_expressions = (TraceDB.user == user.id, TraceDB.app_id == app_id, TraceDB.variant_id == variant_id) + traces = await TraceDB.find(query_expressions).to_list() return [trace_db_to_pydantic(trace) for trace in traces] @@ -76,12 +66,12 @@ async def create_app_trace(payload: CreateTrace, **kwargs: dict) -> str: # Ensure spans exists in the db for span in payload.spans: - span_db = await engine.find_one(SpanDB, SpanDB.id == ObjectId(span)) + span_db = await SpanDB.find_one(SpanDB.id == ObjectId(span)) if span_db is None: raise HTTPException(404, detail=f"Span {span} does not exist") trace = TraceDB(**payload.dict(), user=user) - await engine.save(trace) + await trace.create() return trace_db_to_pydantic(trace)["trace_id"] @@ -96,12 +86,12 @@ async def get_trace_single(trace_id: str, **kwargs: dict) -> Trace: """ user = await db_manager.get_user(user_uid=kwargs["uid"]) - query_expressions = query.eq(TraceDB.id, ObjectId(trace_id)) & query.eq( - TraceDB.user, user.id + 
query_expressions = (TraceDB.id == ObjectId(trace_id), + TraceDB.user == user.id ) # Get trace - trace = await engine.find_one(TraceDB, query_expressions) + trace = await TraceDB.find_one(query_expressions) return trace_db_to_pydantic(trace) @@ -119,16 +109,16 @@ async def trace_status_update( """ user = await db_manager.get_user(user_uid=kwargs["uid"]) - query_expressions = query.eq(TraceDB.id, ObjectId(trace_id)) & query.eq( - TraceDB.user, user.id + query_expressions = (TraceDB.id == ObjectId(trace_id), + TraceDB.user == user.id ) # Get trace - trace = await engine.find_one(TraceDB, query_expressions) + trace = await TraceDB.find_one(query_expressions) # Update and save trace trace.status = payload.status - await engine.save(trace) + await trace.create() return True @@ -142,10 +132,8 @@ async def create_trace_span(payload: CreateSpan, **kwargs: dict) -> str: str: the created span id """ - user = await db_manager.get_user(user_uid=kwargs["uid"]) - span_db = SpanDB(**payload.dict()) - await engine.save(span_db) + await span_db.create() return str(span_db.id) @@ -160,12 +148,12 @@ async def get_trace_spans(trace_id: str, **kwargs: dict) -> List[Span]: """ user = await db_manager.get_user(user_uid=kwargs["uid"]) - query_expressions = query.eq(TraceDB.id, ObjectId(trace_id)) & query.eq( - TraceDB.user, user.id + query_expressions = (TraceDB.id == ObjectId(trace_id), + TraceDB.user == user.id ) # Get trace - trace = await engine.find_one(TraceDB, query_expressions) + trace = await TraceDB.find_one(query_expressions) # Get trace spans spans = spans_db_to_pydantic(trace.spans) @@ -193,14 +181,14 @@ async def add_feedback_to_trace( created_at=datetime.utcnow(), ) - trace = await engine.find_one(TraceDB, TraceDB.id == ObjectId(trace_id)) + trace = await TraceDB.find_one(TraceDB.id == ObjectId(trace_id)) if trace.feedbacks is None: trace.feedbacks = [feedback] else: trace.feedbacks.append(feedback) # Update trace - await engine.save(trace) + await trace.create() return feedback.uid @@ -217,12 +205,12 @@ async def get_trace_feedbacks(trace_id: str, **kwargs: dict) -> List[Feedback]: user = await db_manager.get_user(user_uid=kwargs["uid"]) # Build query expressions - query_expressions = query.eq(TraceDB.id, ObjectId(trace_id)) & query.eq( - TraceDB.user, user.id + query_expressions = (TraceDB.id == ObjectId(trace_id), + TraceDB.user == user.id ) # Get feedbacks in trace - trace = await engine.find_one(TraceDB, query_expressions) + trace = await TraceDB.find_one(query_expressions) feedbacks = [feedback_db_to_pydantic(feedback) for feedback in trace.feedbacks] return feedbacks @@ -243,12 +231,12 @@ async def get_feedback_detail( user = await db_manager.get_user(user_uid=kwargs["uid"]) # Build query expressions - query_expressions = query.eq(TraceDB.id, ObjectId(trace_id)) & query.eq( - TraceDB.user, user.id + query_expressions = (TraceDB.id == ObjectId(trace_id), + TraceDB.user == user.id ) # Get trace - trace = await engine.find_one(TraceDB, query_expressions) + trace = await TraceDB.find_one(query_expressions) # Get feedback feedback = [ @@ -276,12 +264,12 @@ async def update_trace_feedback( user = await db_manager.get_user(user_uid=kwargs["uid"]) # Build query expressions - query_expressions = query.eq(TraceDB.id, ObjectId(trace_id)) & query.eq( - TraceDB.user, user.id + query_expressions = (TraceDB.id == ObjectId(trace_id), + TraceDB.user == user.id ) # Get trace - trace = await engine.find_one(TraceDB, query_expressions) + trace = await TraceDB.find_one(query_expressions) # update feedback 
feedback_json = {} @@ -292,7 +280,7 @@ async def update_trace_feedback( break # Save feedback in trace and return a copy - await engine.save(trace) + await trace.create() # Replace key and transform into a pydantic representation feedback_json["feedback_id"] = feedback_json.pop("uid") diff --git a/agenta-backend/agenta_backend/services/results_service.py b/agenta-backend/agenta_backend/services/results_service.py index a0d34eea5c..dba7c8f4d4 100644 --- a/agenta-backend/agenta_backend/services/results_service.py +++ b/agenta-backend/agenta_backend/services/results_service.py @@ -1,5 +1,3 @@ -from agenta_backend.utils.common import engine -from agenta_backend.services.db_manager import query from agenta_backend.models.db_models import ( EvaluationScenarioDB, EvaluationDB, @@ -13,10 +11,9 @@ async def fetch_results_for_evaluation(evaluation: HumanEvaluationDB): - evaluation_scenarios = await engine.find( - HumanEvaluationScenarioDB, + evaluation_scenarios = await HumanEvaluationScenarioDB.find( HumanEvaluationScenarioDB.evaluation == ObjectId(evaluation.id), - ) + ).to_list() results = {} if len(evaluation_scenarios) == 0: @@ -80,17 +77,13 @@ async def fetch_results_for_single_model_test(evaluation_id: str): {"$group": {"_id": "$score", "count": {"$sum": 1}}}, ] - results = {} - collection = engine.get_collection(HumanEvaluationScenarioDB) - aggregation_cursor = await collection.aggregate(pipeline).to_list(length=None) - for doc in aggregation_cursor: - results[doc["_id"]] = doc["count"] - return results + results = await HumanEvaluationScenarioDB.aggregate(pipeline).to_list(length=None) + return {result._id: result.count for result in results} async def fetch_average_score_for_custom_code_run(evaluation_id: str) -> float: query_exp = EvaluationScenarioDB.evaluation == ObjectId(evaluation_id) - eval_scenarios = await engine.find(EvaluationScenarioDB, query_exp) + eval_scenarios = await EvaluationScenarioDB.find(query_exp).to_list() list_of_scores = [] for scenario in eval_scenarios: diff --git a/agenta-backend/agenta_backend/utils/common.py b/agenta-backend/agenta_backend/utils/common.py index 5bb19ede89..f149b05680 100644 --- a/agenta-backend/agenta_backend/utils/common.py +++ b/agenta-backend/agenta_backend/utils/common.py @@ -1,9 +1,9 @@ import logging -from beanie import Query +from typing import Dict, List, Union, Optional, Any, Callable + from fastapi.types import DecoratedCallable from fastapi import APIRouter as FastAPIRouter -from agenta_backend.models.db_engine import DBEngine -from typing import Dict, List, Union, Optional, Any, Callable + from agenta_backend.models.db_models import ( UserDB, AppVariantDB, @@ -12,6 +12,9 @@ VariantBaseDB, ) +from beanie import PydanticObjectId as ObjectId + + logger = logging.getLogger(__name__) logger.setLevel(logging.DEBUG) @@ -58,7 +61,7 @@ def decorator(func: DecoratedCallable) -> DecoratedCallable: async def get_organization(org_id: str) -> OrganizationDB: - org = await OrganizationDB.find_one(OrganizationDB.id == org_id) + org = await OrganizationDB.find_one(OrganizationDB.id == ObjectId(org_id)) if org is not None: return org else: @@ -68,18 +71,11 @@ async def get_organization(org_id: str) -> OrganizationDB: async def get_app_instance( app_id: str, variant_name: str = None, show_deleted: bool = False ) -> AppVariantDB: - query = Query(AppVariantDB) - query = query.filter(AppVariantDB.is_deleted == show_deleted) - query = query.filter(AppVariantDB.app == app_id) - + queries = (AppVariantDB.is_deleted == show_deleted, AppVariantDB.app == 
app_id) if variant_name is not None: - query = query.filter(AppVariantDB.variant_name == variant_name) - - print("query_expression:", query) - - app_instance = await query.get() + queries += (AppVariantDB.variant_name == variant_name) - print("app_instance:", app_instance) + app_instance = await AppVariantDB.find_one(*queries) return app_instance @@ -99,7 +95,8 @@ async def check_user_org_access( logger.debug( f"object_organization_id: {object_organization_id}, user_organizations: {user_organizations}" ) - return object_organization_id in user_organizations + user_exists_in_organizations = object_organization_id in user_organizations + return user_exists_in_organizations async def check_access_to_app( @@ -129,7 +126,7 @@ async def check_access_to_app( # Fetch the app if only app_id is provided. if app is None: - app = await AppDB.find_one(AppDB.id == app_id) + app = await AppDB.find_one(AppDB.id == ObjectId(app_id), fetch_links=True) if app is None: logger.error("App not found") return False @@ -147,7 +144,7 @@ async def check_access_to_variant( if variant_id is None: raise Exception("No variant_id provided") variant = await AppVariantDB.find_one( - AppVariantDB.id == variant_id + AppVariantDB.id == ObjectId(variant_id), fetch_links=True ) if variant is None: logger.error("Variant not found") @@ -163,7 +160,7 @@ async def check_access_to_base( ) -> bool: if base_id is None: raise Exception("No base_id provided") - base = await VariantBaseDB.find_one(VariantBaseDB.id == base_id) + base = await VariantBaseDB.find_one(VariantBaseDB.id == base_id, fetch_links=True) if base is None: logger.error("Base not found") return False From dfa918f146a28394dbb1e0dcb30b2fa72fc9ec52 Mon Sep 17 00:00:00 2001 From: Abram Date: Fri, 5 Jan 2024 06:51:17 +0100 Subject: [PATCH 267/414] Update - modified webhook test evaluator settings template --- .../agenta_backend/resources/evaluators/evaluators.json | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/agenta-backend/agenta_backend/resources/evaluators/evaluators.json b/agenta-backend/agenta_backend/resources/evaluators/evaluators.json index 80d6bc28a1..99a394d20c 100644 --- a/agenta-backend/agenta_backend/resources/evaluators/evaluators.json +++ b/agenta-backend/agenta_backend/resources/evaluators/evaluators.json @@ -76,6 +76,12 @@ "type": "string", "default": "https://cloud.agenta.ai/api/evaluations/webhook_example_fake", "description": "URL for the webhook test" + }, + "webhook_body": { + "label": "Webhook Body", + "type": "object", + "default": "{}", + "description": "Request body for webhook URL" } } }, From 6daaa048df97cb710ed8e728994158699bd077df Mon Sep 17 00:00:00 2001 From: Abram Date: Fri, 5 Jan 2024 07:11:53 +0100 Subject: [PATCH 268/414] Refactor - migrate to beanie and fix bugs that arose in: evaluators, beanie intialization in celery worker, testsets, variant update/delete, and evaluations --- .../agenta_backend/models/db_models.py | 4 +-- .../routers/evaluators_router.py | 2 +- .../agenta_backend/services/db_manager.py | 25 ++++++---------- .../services/evaluation_service.py | 4 +-- .../services/evaluators_service.py | 29 ++++++++++--------- .../agenta_backend/tasks/evaluations.py | 7 +++-- .../evaluators/NewEvaluatorModal.tsx | 5 ++-- 7 files changed, 38 insertions(+), 38 deletions(-) diff --git a/agenta-backend/agenta_backend/models/db_models.py b/agenta-backend/agenta_backend/models/db_models.py index 8606111c8d..57b1faa5b8 100644 --- a/agenta-backend/agenta_backend/models/db_models.py +++ 
b/agenta-backend/agenta_backend/models/db_models.py @@ -317,8 +317,8 @@ class EvaluationScenarioDB(Document): organization: Link[OrganizationDB] evaluation: Link[EvaluationDB] variant_id: PydanticObjectId - inputs: List[Link[EvaluationScenarioInputDB]] - outputs: List[Link[EvaluationScenarioOutputDB]] + inputs: List[EvaluationScenarioInputDB] + outputs: List[EvaluationScenarioOutputDB] correct_answer: Optional[str] is_pinned: Optional[bool] note: Optional[str] diff --git a/agenta-backend/agenta_backend/routers/evaluators_router.py b/agenta-backend/agenta_backend/routers/evaluators_router.py index 71dafc5c43..0002f4c13c 100644 --- a/agenta-backend/agenta_backend/routers/evaluators_router.py +++ b/agenta-backend/agenta_backend/routers/evaluators_router.py @@ -57,7 +57,7 @@ async def get_evaluators(): @router.get("/configs/", response_model=List[EvaluatorConfig]) -async def get_evaluator_configs(app_id: str = Query()): +async def get_evaluator_configs(app_id: str): """Endpoint to fetch evaluator configurations for a specific app. Args: diff --git a/agenta-backend/agenta_backend/services/db_manager.py b/agenta-backend/agenta_backend/services/db_manager.py index 684d91410a..7ac5c8b5b7 100644 --- a/agenta-backend/agenta_backend/services/db_manager.py +++ b/agenta-backend/agenta_backend/services/db_manager.py @@ -41,7 +41,6 @@ TestSetDB, UserDB, ) - from agenta_backend.utils.common import check_user_org_access from agenta_backend.models.api.evaluation_model import EvaluationStatusEnum @@ -1254,7 +1253,7 @@ async def fetch_evaluation_by_id(evaluation_id: str) -> Optional[EvaluationDB]: EvaluationDB: The fetched evaluation, or None if no evaluation was found. """ assert evaluation_id is not None, "evaluation_id cannot be None" - evaluation = await EvaluationDB.find_one(EvaluationDB.id == ObjectId(evaluation_id)) + evaluation = await EvaluationDB.find_one(EvaluationDB.id == ObjectId(evaluation_id), fetch_links=True) return evaluation @@ -1681,7 +1680,7 @@ async def update_evaluation_with_aggregated_results( evaluation.aggregated_results = aggregated_results evaluation.updated_at = datetime.utcnow().isoformat() - await evaluation.update() + await evaluation.save() return evaluation @@ -1694,8 +1693,7 @@ async def fetch_evaluators_configs(app_id: str): assert app_id is not None, "evaluation_id cannot be None" try: - query_expressions = EvaluatorConfigDB.app.id == ObjectId(app_id) - evaluators_configs = await EvaluatorConfigDB.find_one(query_expressions) + evaluators_configs = await EvaluatorConfigDB.find(EvaluatorConfigDB.app.id == ObjectId(app_id)).to_list() return evaluators_configs except Exception as e: raise e @@ -1731,11 +1729,8 @@ async def fetch_evaluator_config_by_appId( """ try: - query_expressions = ( - EvaluatorConfigDB.app.id == ObjectId(app_id), - EvaluatorConfigDB.evaluator_key == evaluator_name, - ) - evaluator_config = await EvaluatorConfigDB.find_one(query_expressions) + evaluator_config = await EvaluatorConfigDB.find_one(EvaluatorConfigDB.app.id == ObjectId(app_id), + EvaluatorConfigDB.evaluator_key == evaluator_name) return evaluator_config except Exception as e: raise e @@ -1801,10 +1796,8 @@ async def delete_evaluator_config(evaluator_config_id: str) -> bool: evaluator_config = await EvaluatorConfigDB.find_one( EvaluatorConfigDB.id == ObjectId(evaluator_config_id) ) - delete_result = evaluator_config.delete() - return ( - delete_result is None - ) # checking if delete_result is None (has been deleted) + delete_result = await evaluator_config.delete() + return 
delete_result.acknowledged except Exception as e: raise e @@ -1822,10 +1815,10 @@ async def update_evaluation( Returns: EvaluatorConfigDB: The updated evaluator configuration object. """ - evaluation = await EvaluationDB.find_one(EvaluationDB.id == ObjectId(evaluation_id)) + evaluation = await EvaluationDB.get(ObjectId(evaluation_id)) for key, value in updates.items(): if hasattr(evaluation, key): setattr(evaluation, key, value) - await evaluation.update() + await evaluation.save() return evaluation diff --git a/agenta-backend/agenta_backend/services/evaluation_service.py b/agenta-backend/agenta_backend/services/evaluation_service.py index 78129e0bed..023d80aa78 100644 --- a/agenta-backend/agenta_backend/services/evaluation_service.py +++ b/agenta-backend/agenta_backend/services/evaluation_service.py @@ -524,7 +524,7 @@ async def fetch_list_evaluations( detail=f"You do not have access to this app: {app_id}", ) - evaluations_db = await EvaluationDB.find(EvaluationDB.app == ObjectId(app_id)).to_list() + evaluations_db = await EvaluationDB.find(EvaluationDB.app.id == ObjectId(app_id), fetch_links=True).to_list() return [ await converters.evaluation_db_to_pydantic(evaluation) for evaluation in evaluations_db @@ -570,7 +570,7 @@ async def fetch_list_human_evaluations( ) evaluations_db = await HumanEvaluationDB.find( - HumanEvaluationDB.app == ObjectId(app_id) + HumanEvaluationDB.app.id == ObjectId(app_id) ).to_list() return [ await converters.human_evaluation_db_to_pydantic(evaluation) diff --git a/agenta-backend/agenta_backend/services/evaluators_service.py b/agenta-backend/agenta_backend/services/evaluators_service.py index 94d9a3abe8..9f08d62dd1 100644 --- a/agenta-backend/agenta_backend/services/evaluators_service.py +++ b/agenta-backend/agenta_backend/services/evaluators_service.py @@ -1,4 +1,5 @@ import re +import json import httpx from typing import Any, Dict, Tuple @@ -48,8 +49,11 @@ def auto_webhook_test( ) -> Result: try: with httpx.Client() as client: + request_body = json.loads(settings_values.get("webhook_body", None)) + payload = request_body if request_body else {} response = client.post( - url=settings_values["webhook_url"], json=settings_values["webhook_body"] + url=settings_values["webhook_url"], + json=payload ) response.raise_for_status() response_data = response.json() @@ -67,7 +71,7 @@ def auto_webhook_test( print(f"An error occurred: {e}") -def custom_code_run( +def auto_custom_code_run( variant_output: str, correct_answer: str, settings_values: Dict[str, Any], @@ -79,7 +83,7 @@ def custom_code_run( inputs=kwargs["inputs"], output=variant_output, correct_answer=correct_answer, - code=settings_values["python_code"], + code=settings_values["code"], ) return Result(type="number", value=result) except Exception as exc: @@ -160,14 +164,13 @@ def evaluate( *additional_args: Tuple[Any], **additional_kwargs: Dict[str, Any], ) -> Result: - try: - evaluation_function = globals()[evaluator_name] - return evaluation_function( - correct_answer, - variant_output, - settings_values, - *additional_args, - **additional_kwargs, - ) - except KeyError: + evaluation_function = globals().get(evaluator_name, None) + if not evaluation_function: raise ValueError(f"Evaluation method '{evaluator_name}' not found.") + return evaluation_function( + correct_answer, + variant_output, + settings_values, + *additional_args, + **additional_kwargs, + ) \ No newline at end of file diff --git a/agenta-backend/agenta_backend/tasks/evaluations.py b/agenta-backend/agenta_backend/tasks/evaluations.py index 
acb145cb5f..aaf92ca291 100644 --- a/agenta-backend/agenta_backend/tasks/evaluations.py +++ b/agenta-backend/agenta_backend/tasks/evaluations.py @@ -9,6 +9,7 @@ fetch_evaluation_by_id, fetch_app_variant_by_id, fetch_evaluator_config, + fetch_app_by_id, get_deployment_by_objectid, fetch_testset_by_id, create_new_evaluation_scenario, @@ -24,6 +25,7 @@ AggregatedResult, Result, ) +from agenta_backend.models.db_engine import DBEngine from agenta_backend.services import evaluators_service from agenta_backend.models.api.evaluation_model import NewEvaluation @@ -35,7 +37,8 @@ def evaluate( loop = asyncio.get_event_loop() try: - app = AppDB(**app_data) + loop.run_until_complete(DBEngine().init_db()) + app = loop.run_until_complete(fetch_app_by_id(app_data["_id"])) evaluation = NewEvaluation(**new_evaluation_data) testset = loop.run_until_complete(fetch_testset_by_id(testset_id)) @@ -100,7 +103,7 @@ def evaluate( "app_params": app_variant_db.config.parameters, "inputs": data_point, # TODO: fetch input from config parameters when #1102 has been fixed } - if evaluator_config.evaluator_key == "custom_code_run" + if evaluator_config.evaluator_key == "auto_custom_code_run" else {} ) result = evaluators_service.evaluate( diff --git a/agenta-web/src/components/pages/evaluations/evaluators/NewEvaluatorModal.tsx b/agenta-web/src/components/pages/evaluations/evaluators/NewEvaluatorModal.tsx index 3bd7bd779c..754666bb8b 100644 --- a/agenta-web/src/components/pages/evaluations/evaluators/NewEvaluatorModal.tsx +++ b/agenta-web/src/components/pages/evaluations/evaluators/NewEvaluatorModal.tsx @@ -76,7 +76,6 @@ const DynamicFormField: React.FC = ({ isValidRegex(value) ? res("") : rej("Regex pattern is not valid"), ), }) - return ( = ({ ) : type === "text" ? ( + ) : type === "object" ? ( + ) : type === "code" ? 
( ) : null} @@ -160,7 +161,7 @@ const NewEvaluatorModal: React.FC = ({ return ( Date: Fri, 5 Jan 2024 08:09:18 +0100 Subject: [PATCH 269/414] fix evaluations getting same output from llm --- .../services/llm_apps_service.py | 39 ++++++++++++------- .../agenta_backend/tasks/evaluations.py | 14 +++++-- 2 files changed, 36 insertions(+), 17 deletions(-) diff --git a/agenta-backend/agenta_backend/services/llm_apps_service.py b/agenta-backend/agenta_backend/services/llm_apps_service.py index a749cbc418..ecd660874d 100644 --- a/agenta-backend/agenta_backend/services/llm_apps_service.py +++ b/agenta-backend/agenta_backend/services/llm_apps_service.py @@ -12,18 +12,25 @@ logger.setLevel(logging.DEBUG) -async def get_llm_app_output(uri: str, input: Any) -> AppOutput: +async def get_llm_app_output(uri: str, datapoint: Any, parameters: dict) -> AppOutput: + prompt_user = replace_placeholders(parameters["prompt_user"], datapoint) + prompt_system = replace_placeholders(parameters["prompt_system"], datapoint) + url = f"{uri}/generate" - # TODO: adjust these hardcoded values in this payload payload = { - "temperature": 1, - "model": "gpt-3.5-turbo", - "max_tokens": -1, - "prompt_system": "You are an expert in geography.", - "prompt_user": f"What is the capital of {input}?", - "top_p": 1, - "inputs": {"country": input}, + "temperature": parameters["temperature"], + "model": parameters["model"], + "max_tokens": parameters["max_tokens"], + "prompt_system": prompt_system, + "prompt_user": prompt_user, + "top_p": parameters["top_p"], + "frequence_penalty": parameters["frequence_penalty"], + "presence_penalty": parameters["presence_penalty"], + "inputs": { + input_item["name"]: datapoint.get(input_item["name"], "") + for input_item in parameters["inputs"] + }, } async with httpx.AsyncClient() as client: @@ -35,13 +42,13 @@ async def get_llm_app_output(uri: str, input: Any) -> AppOutput: async def run_with_retry( - uri: str, input_data: Any, max_retry_count: int, retry_delay: int + uri: str, input_data: Any, parameters: dict, max_retry_count: int, retry_delay: int ) -> AppOutput: retries = 0 last_exception = None while retries < max_retry_count: try: - result = await get_llm_app_output(uri, input_data) + result = await get_llm_app_output(uri, input_data, parameters) return result except (httpx.TimeoutException, httpx.ConnectTimeout, httpx.ConnectError) as e: last_exception = e @@ -54,7 +61,7 @@ async def run_with_retry( async def batch_invoke( - uri: str, testset_data: List[dict], rate_limit_config: dict + uri: str, testset_data: List[dict], parameters: dict, rate_limit_config: dict ) -> List[AppOutput]: batch_size = rate_limit_config[ "batch_size" @@ -77,7 +84,7 @@ async def run_batch(start_idx: int): for index in range(start_idx, end_idx): try: batch_output: AppOutput = await run_with_retry( - uri, testset_data[index], max_retries, retry_delay + uri, testset_data[index], parameters, max_retries, retry_delay ) list_of_app_outputs.append(batch_output) print(f"Adding outputs to batch {start_idx}") @@ -95,3 +102,9 @@ async def run_batch(start_idx: int): # Start the first batch await run_batch(0) return list_of_app_outputs + + +def replace_placeholders(text, data): + for key, value in data.items(): + text = text.replace(f"{{{key}}}", value) + return text diff --git a/agenta-backend/agenta_backend/tasks/evaluations.py b/agenta-backend/agenta_backend/tasks/evaluations.py index a5a3970ecc..b4e91289af 100644 --- a/agenta-backend/agenta_backend/tasks/evaluations.py +++ 
b/agenta-backend/agenta_backend/tasks/evaluations.py @@ -40,15 +40,18 @@ def evaluate( evaluation = NewEvaluation(**new_evaluation_data) try: + variant_id = str(evaluation.variant_ids[0]) + app_variant_db = loop.run_until_complete(fetch_app_variant_by_id(variant_id)) + app_variant_parameters = app_variant_db.config.parameters + + # TODO: we need fail evaluation if parameters are empty + testset = loop.run_until_complete(fetch_testset_by_id(testset_id)) new_evaluation_db = loop.run_until_complete( fetch_evaluation_by_id(evaluation_id) ) evaluators_aggregated_data = defaultdict(list) - variant_id = str(evaluation.variant_ids[0]) - - app_variant_db = loop.run_until_complete(fetch_app_variant_by_id(variant_id)) deployment = loop.run_until_complete( get_deployment_by_objectid(app_variant_db.base.deployment) ) @@ -65,7 +68,10 @@ def evaluate( # 1. We get the output from the llm app app_outputs: List[AppOutput] = loop.run_until_complete( llm_apps_service.batch_invoke( - uri, testset.csvdata, evaluation.rate_limit.dict() + uri, + testset.csvdata, + app_variant_parameters, + evaluation.rate_limit.dict(), ) ) for data_point, app_output in zip(testset.csvdata, app_outputs): From 523f1932e19f6b138c7276d680e8eb68a9f19504 Mon Sep 17 00:00:00 2001 From: Akrem Abayed Date: Fri, 5 Jan 2024 08:19:57 +0100 Subject: [PATCH 270/414] fail evaluation in case empty params or empty inputs --- agenta-backend/agenta_backend/tasks/evaluations.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/agenta-backend/agenta_backend/tasks/evaluations.py b/agenta-backend/agenta_backend/tasks/evaluations.py index b4e91289af..b620b707cb 100644 --- a/agenta-backend/agenta_backend/tasks/evaluations.py +++ b/agenta-backend/agenta_backend/tasks/evaluations.py @@ -44,7 +44,15 @@ def evaluate( app_variant_db = loop.run_until_complete(fetch_app_variant_by_id(variant_id)) app_variant_parameters = app_variant_db.config.parameters - # TODO: we need fail evaluation if parameters are empty + if ( + not app_variant_db.config.parameters + or "inputs" not in app_variant_db.config.parameters + or not app_variant_db.config.parameters["inputs"] + ): + loop.run_until_complete( + update_evaluation(evaluation_id, {"status": "EVALUATION_FAILED"}) + ) + return testset = loop.run_until_complete(fetch_testset_by_id(testset_id)) new_evaluation_db = loop.run_until_complete( From 8afe25009c489415acb79b025b9baf560aad8bbc Mon Sep 17 00:00:00 2001 From: Akrem Abayed Date: Fri, 5 Jan 2024 08:48:45 +0100 Subject: [PATCH 271/414] fix build --- .../evaluationResults/NewEvaluationModal.tsx | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/agenta-web/src/components/pages/evaluations/evaluationResults/NewEvaluationModal.tsx b/agenta-web/src/components/pages/evaluations/evaluationResults/NewEvaluationModal.tsx index e881685aff..9fe27ce0c8 100644 --- a/agenta-web/src/components/pages/evaluations/evaluationResults/NewEvaluationModal.tsx +++ b/agenta-web/src/components/pages/evaluations/evaluationResults/NewEvaluationModal.tsx @@ -233,7 +233,8 @@ const NewEvaluationModal: React.FC = ({onSuccess, ...props}) => { > + onChange={(value: number | null) => + value !== null && onRateLimitInputChange("batch_size", value) } style={{width: "100%"}} @@ -257,7 +258,8 @@ const NewEvaluationModal: React.FC = ({onSuccess, ...props}) => { > + onChange={(value: number | null) => + value !== null && onRateLimitInputChange("max_retries", value) } style={{width: "100%"}} @@ -282,7 +284,8 @@ const NewEvaluationModal: React.FC = 
({onSuccess, ...props}) => { > + onChange={(value: number | null) => + value !== null && onRateLimitInputChange("retry_delay", value) } style={{width: "100%"}} @@ -307,7 +310,8 @@ const NewEvaluationModal: React.FC = ({onSuccess, ...props}) => { > + onChange={(value: number | null) => + value !== null && onRateLimitInputChange( "delay_between_batches", value, From f81f1a55e41276a53bb3df3e0a3999cb2d5e9a1b Mon Sep 17 00:00:00 2001 From: Abram Date: Fri, 5 Jan 2024 14:41:02 +0100 Subject: [PATCH 272/414] Update - bump packages in agenta-cli poetry.lock --- agenta-cli/poetry.lock | 53 +++++++++++++++++++++--------------------- 1 file changed, 27 insertions(+), 26 deletions(-) diff --git a/agenta-cli/poetry.lock b/agenta-cli/poetry.lock index 2d6fb497ac..792f318e38 100644 --- a/agenta-cli/poetry.lock +++ b/agenta-cli/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.6.1 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.7.1 and should not be changed by hand. [[package]] name = "annotated-types" @@ -13,24 +13,25 @@ files = [ [[package]] name = "anyio" -version = "3.7.1" +version = "4.2.0" description = "High level compatibility layer for multiple asynchronous event loop implementations" optional = false -python-versions = ">=3.7" +python-versions = ">=3.8" files = [ - {file = "anyio-3.7.1-py3-none-any.whl", hash = "sha256:91dee416e570e92c64041bd18b900d1d6fa78dff7048769ce5ac5ddad004fbb5"}, - {file = "anyio-3.7.1.tar.gz", hash = "sha256:44a3c9aba0f5defa43261a8b3efb97891f2bd7d804e0e1f56419befa1adfc780"}, + {file = "anyio-4.2.0-py3-none-any.whl", hash = "sha256:745843b39e829e108e518c489b31dc757de7d2131d53fac32bd8df268227bfee"}, + {file = "anyio-4.2.0.tar.gz", hash = "sha256:e1875bb4b4e2de1669f4bc7869b6d3f54231cdced71605e6e64c9be77e3be50f"}, ] [package.dependencies] -exceptiongroup = {version = "*", markers = "python_version < \"3.11\""} +exceptiongroup = {version = ">=1.0.2", markers = "python_version < \"3.11\""} idna = ">=2.8" sniffio = ">=1.1" +typing-extensions = {version = ">=4.1", markers = "python_version < \"3.11\""} [package.extras] -doc = ["Sphinx", "packaging", "sphinx-autodoc-typehints (>=1.2.0)", "sphinx-rtd-theme (>=1.2.2)", "sphinxcontrib-jquery"] -test = ["anyio[trio]", "coverage[toml] (>=4.5)", "hypothesis (>=4.0)", "mock (>=4)", "psutil (>=5.9)", "pytest (>=7.0)", "pytest-mock (>=3.6.1)", "trustme", "uvloop (>=0.17)"] -trio = ["trio (<0.22)"] +doc = ["Sphinx (>=7)", "packaging", "sphinx-autodoc-typehints (>=1.2.0)", "sphinx-rtd-theme"] +test = ["anyio[trio]", "coverage[toml] (>=7)", "exceptiongroup (>=1.2.0)", "hypothesis (>=4.0)", "psutil (>=5.9)", "pytest (>=7.0)", "pytest-mock (>=3.6.1)", "trustme", "uvloop (>=0.17)"] +trio = ["trio (>=0.23)"] [[package]] name = "asttokens" @@ -62,21 +63,22 @@ files = [ [[package]] name = "attrs" -version = "23.1.0" +version = "23.2.0" description = "Classes Without Boilerplate" optional = false python-versions = ">=3.7" files = [ - {file = "attrs-23.1.0-py3-none-any.whl", hash = "sha256:1f28b4522cdc2fb4256ac1a020c78acf9cba2c6b461ccd2c126f3aa8e8335d04"}, - {file = "attrs-23.1.0.tar.gz", hash = "sha256:6279836d581513a26f1bf235f9acd333bc9115683f14f7e8fae46c98fc50e015"}, + {file = "attrs-23.2.0-py3-none-any.whl", hash = "sha256:99b87a485a5820b23b879f04c2305b44b951b502fd64be915879d77a7e8fc6f1"}, + {file = "attrs-23.2.0.tar.gz", hash = "sha256:935dc3b529c262f6cf76e50877d35a4bd3c1de194fd41f47a2b7ae8f19971f30"}, ] [package.extras] cov = ["attrs[tests]", "coverage[toml] (>=5.3)"] -dev = 
["attrs[docs,tests]", "pre-commit"] +dev = ["attrs[tests]", "pre-commit"] docs = ["furo", "myst-parser", "sphinx", "sphinx-notfound-page", "sphinxcontrib-towncrier", "towncrier", "zope-interface"] tests = ["attrs[tests-no-zope]", "zope-interface"] -tests-no-zope = ["cloudpickle", "hypothesis", "mypy (>=1.1.1)", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "pytest-xdist[psutil]"] +tests-mypy = ["mypy (>=1.6)", "pytest-mypy-plugins"] +tests-no-zope = ["attrs[tests-mypy]", "cloudpickle", "hypothesis", "pympler", "pytest (>=4.3.0)", "pytest-xdist[psutil]"] [[package]] name = "backoff" @@ -286,19 +288,18 @@ tests = ["asttokens (>=2.1.0)", "coverage", "coverage-enable-subprocess", "ipyth [[package]] name = "fastapi" -version = "0.105.0" +version = "0.108.0" description = "FastAPI framework, high performance, easy to learn, fast to code, ready for production" optional = false python-versions = ">=3.8" files = [ - {file = "fastapi-0.105.0-py3-none-any.whl", hash = "sha256:f19ebf6fdc82a3281d10f2cb4774bdfa90238e3b40af3525a0c09fd08ad1c480"}, - {file = "fastapi-0.105.0.tar.gz", hash = "sha256:4d12838819aa52af244580675825e750ad67c9df4614f557a769606af902cf22"}, + {file = "fastapi-0.108.0-py3-none-any.whl", hash = "sha256:8c7bc6d315da963ee4cdb605557827071a9a7f95aeb8fcdd3bde48cdc8764dd7"}, + {file = "fastapi-0.108.0.tar.gz", hash = "sha256:5056e504ac6395bf68493d71fcfc5352fdbd5fda6f88c21f6420d80d81163296"}, ] [package.dependencies] -anyio = ">=3.7.1,<4.0.0" pydantic = ">=1.7.4,<1.8 || >1.8,<1.8.1 || >1.8.1,<2.0.0 || >2.0.0,<2.0.1 || >2.0.1,<2.1.0 || >2.1.0,<3.0.0" -starlette = ">=0.27.0,<0.28.0" +starlette = ">=0.29.0,<0.33.0" typing-extensions = ">=4.8.0" [package.extras] @@ -891,13 +892,13 @@ tests = ["cython", "littleutils", "pygments", "pytest", "typeguard"] [[package]] name = "starlette" -version = "0.27.0" +version = "0.32.0.post1" description = "The little ASGI library that shines." 
optional = false -python-versions = ">=3.7" +python-versions = ">=3.8" files = [ - {file = "starlette-0.27.0-py3-none-any.whl", hash = "sha256:918416370e846586541235ccd38a474c08b80443ed31c578a418e2209b3eef91"}, - {file = "starlette-0.27.0.tar.gz", hash = "sha256:6a6b0d042acb8d469a01eba54e9cda6cbd24ac602c4cd016723117d6a7e73b75"}, + {file = "starlette-0.32.0.post1-py3-none-any.whl", hash = "sha256:cd0cb10ddb49313f609cedfac62c8c12e56c7314b66d89bb077ba228bada1b09"}, + {file = "starlette-0.32.0.post1.tar.gz", hash = "sha256:e54e2b7e2fb06dff9eac40133583f10dfa05913f5a85bf26f427c7a40a9a3d02"}, ] [package.dependencies] @@ -931,13 +932,13 @@ files = [ [[package]] name = "traitlets" -version = "5.14.0" +version = "5.14.1" description = "Traitlets Python configuration system" optional = false python-versions = ">=3.8" files = [ - {file = "traitlets-5.14.0-py3-none-any.whl", hash = "sha256:f14949d23829023013c47df20b4a76ccd1a85effb786dc060f34de7948361b33"}, - {file = "traitlets-5.14.0.tar.gz", hash = "sha256:fcdaa8ac49c04dfa0ed3ee3384ef6dfdb5d6f3741502be247279407679296772"}, + {file = "traitlets-5.14.1-py3-none-any.whl", hash = "sha256:2e5a030e6eff91737c643231bfcf04a65b0132078dad75e4936700b213652e74"}, + {file = "traitlets-5.14.1.tar.gz", hash = "sha256:8585105b371a04b8316a43d5ce29c098575c2e477850b62b848b964f1444527e"}, ] [package.extras] From 836e6aaba8de328c7bbaefa517b109308e32c2d9 Mon Sep 17 00:00:00 2001 From: Abram Date: Fri, 5 Jan 2024 14:42:11 +0100 Subject: [PATCH 273/414] Refactor - migrate to beanie and fix bugs that arose in human evaluations, variant deployment and create app from cli --- agenta-backend/agenta_backend/main.py | 11 +++-- .../models/api/evaluation_model.py | 2 +- .../agenta_backend/models/db_models.py | 6 +-- .../agenta_backend/routers/app_router.py | 5 +- .../routers/human_evaluation_router.py | 4 +- .../agenta_backend/services/db_manager.py | 47 +++++++++--------- .../services/evaluation_service.py | 49 ++----------------- .../services/results_service.py | 7 +-- .../agenta_backend/services/selectors.py | 8 +-- 9 files changed, 50 insertions(+), 89 deletions(-) diff --git a/agenta-backend/agenta_backend/main.py b/agenta-backend/agenta_backend/main.py index 783be4242d..f028919c00 100644 --- a/agenta-backend/agenta_backend/main.py +++ b/agenta-backend/agenta_backend/main.py @@ -1,5 +1,5 @@ import os -from celery import Celery +import asyncio from contextlib import asynccontextmanager from agenta_backend.config import settings @@ -20,6 +20,7 @@ configs_router, health_router, ) +from agenta_backend.models.db_engine import DBEngine if os.environ["FEATURE_FLAG"] in ["cloud", "ee"]: from agenta_backend.commons.services import templates_manager @@ -29,7 +30,8 @@ from fastapi import FastAPI from fastapi.middleware.cors import CORSMiddleware -from agenta_backend.models.db_engine import DBEngine +from celery import Celery + origins = [ "http://localhost:3000", @@ -50,10 +52,9 @@ async def lifespan(application: FastAPI, cache=True): application: FastAPI application. cache: A boolean value that indicates whether to use the cached data or not. 
""" - # first initialize the database + # initialize the database await DBEngine().init_db() - - await templates_manager.update_and_sync_templates(cache=cache) + # await templates_manager.update_and_sync_templates(cache=cache) yield diff --git a/agenta-backend/agenta_backend/models/api/evaluation_model.py b/agenta-backend/agenta_backend/models/api/evaluation_model.py index f1ace267b8..f43da7282c 100644 --- a/agenta-backend/agenta_backend/models/api/evaluation_model.py +++ b/agenta-backend/agenta_backend/models/api/evaluation_model.py @@ -185,7 +185,7 @@ class AICritiqueCreate(BaseModel): class EvaluationScenarioUpdate(BaseModel): vote: Optional[str] - score: Optional[Union[str, int]] + score: Optional[Any] correct_answer: Optional[str] # will be used when running custom code evaluation outputs: Optional[List[EvaluationScenarioOutput]] inputs: Optional[List[EvaluationScenarioInput]] diff --git a/agenta-backend/agenta_backend/models/db_models.py b/agenta-backend/agenta_backend/models/db_models.py index 57b1faa5b8..4645c6d875 100644 --- a/agenta-backend/agenta_backend/models/db_models.py +++ b/agenta-backend/agenta_backend/models/db_models.py @@ -282,10 +282,10 @@ class HumanEvaluationScenarioDB(Document): user: Link[UserDB] organization: Link[OrganizationDB] evaluation: Link[HumanEvaluationDB] - inputs: List[Link[HumanEvaluationScenarioInput]] - outputs: List[Link[HumanEvaluationScenarioOutput]] + inputs: List[HumanEvaluationScenarioInput] + outputs: List[HumanEvaluationScenarioOutput] vote: Optional[str] - score: Optional[Union[str, int]] + score: Optional[Any] correct_answer: Optional[str] created_at: Optional[datetime] = Field(default=datetime.utcnow()) updated_at: Optional[datetime] = Field(default=datetime.utcnow()) diff --git a/agenta-backend/agenta_backend/routers/app_router.py b/agenta-backend/agenta_backend/routers/app_router.py index 008ff298b9..ee80bd02f7 100644 --- a/agenta-backend/agenta_backend/routers/app_router.py +++ b/agenta-backend/agenta_backend/routers/app_router.py @@ -186,6 +186,8 @@ async def create_app( ) return CreateAppOutput(app_id=str(app_db.id), app_name=str(app_db.app_name)) except Exception as e: + import traceback + traceback.print_exc() raise HTTPException(status_code=500, detail=str(e)) @@ -265,7 +267,7 @@ async def add_variant_from_image( ) app = await db_manager.fetch_app_by_id(app_id) - app_variant_db = await app_manager.add_variant_based_on_image( + variant_db = await app_manager.add_variant_based_on_image( app=app, variant_name=payload.variant_name, docker_id_or_template_uri=payload.docker_id, @@ -275,6 +277,7 @@ async def add_variant_from_image( is_template_image=False, **user_org_data, ) + app_variant_db = await db_manager.fetch_app_variant_by_id(str(variant_db.id)) return await converters.app_variant_db_to_output(app_variant_db) except Exception as e: raise HTTPException(status_code=500, detail=str(e)) diff --git a/agenta-backend/agenta_backend/routers/human_evaluation_router.py b/agenta-backend/agenta_backend/routers/human_evaluation_router.py index 2f6b136935..2353411951 100644 --- a/agenta-backend/agenta_backend/routers/human_evaluation_router.py +++ b/agenta-backend/agenta_backend/routers/human_evaluation_router.py @@ -155,8 +155,8 @@ async def fetch_evaluation_scenarios( return eval_scenarios -@router.put("/{evaluation_id}/", operation_id="update_evaluation") -async def update_evaluation( +@router.put("/{evaluation_id}/", operation_id="update_human_evaluation") +async def update_human_evaluation( request: Request, evaluation_id: str, 
update_data: HumanEvaluationUpdate = Body(...), diff --git a/agenta-backend/agenta_backend/services/db_manager.py b/agenta-backend/agenta_backend/services/db_manager.py index 7ac5c8b5b7..ad33b78ff1 100644 --- a/agenta-backend/agenta_backend/services/db_manager.py +++ b/agenta-backend/agenta_backend/services/db_manager.py @@ -164,14 +164,9 @@ async def fetch_app_by_name( if not organization_id: user = await get_user(user_uid=user_org_data["uid"]) - query_expressions = {"app_name": app_name, "user": user.id} + app = await AppDB.find_one(AppDB.app_name == app_name, AppDB.user.id == user.id) else: - query_expressions = { - "app_name": app_name, - "organization": ObjectId(organization_id), - } - - app = await AppDB.find_one(query_expressions) + app = await AppDB.find_one(AppDB.app_name == app_name, AppDB.organization.id == ObjectId(organization_id)) return app @@ -533,9 +528,8 @@ async def list_app_variants_for_app_id( List[AppVariant]: List of AppVariant objects """ assert app_id is not None, "app_id cannot be None" - query_expressions = AppVariantDB.app.id == ObjectId(app_id) app_variants_db = await AppVariantDB.find( - query_expressions, fetch_links=True + AppVariantDB.app.id == ObjectId(app_id), fetch_links=True ).to_list() return app_variants_db @@ -675,11 +669,10 @@ async def get_orga_image_instance_by_docker_id( ImageDB: instance of image object """ - query_expressions = { - "docker_id": docker_id, - "organization": ObjectId(organization_id), - } - image = await ImageDB.find_one(query_expressions) + image = await ImageDB.find_one( + ImageDB.docker_id == docker_id, + ImageDB.organization.id == ObjectId(organization_id), + ) return image @@ -700,11 +693,10 @@ async def get_orga_image_instance_by_uri( if not parsed_url.scheme and not parsed_url.netloc: raise ValueError(f"Invalid URL: {template_uri}") - query_expressions = ( + image = await ImageDB.fine_one( ImageDB.template_uri == template_uri, - ImageDB.organization == organization_id, + ImageDB.organization.id == ObjectId(organization_id), ) - image = await ImageDB.fine_one(query_expressions) return image @@ -1253,7 +1245,9 @@ async def fetch_evaluation_by_id(evaluation_id: str) -> Optional[EvaluationDB]: EvaluationDB: The fetched evaluation, or None if no evaluation was found. """ assert evaluation_id is not None, "evaluation_id cannot be None" - evaluation = await EvaluationDB.find_one(EvaluationDB.id == ObjectId(evaluation_id), fetch_links=True) + evaluation = await EvaluationDB.find_one( + EvaluationDB.id == ObjectId(evaluation_id), fetch_links=True + ) return evaluation @@ -1268,7 +1262,7 @@ async def fetch_human_evaluation_by_id( """ assert evaluation_id is not None, "evaluation_id cannot be None" evaluation = await HumanEvaluationDB.find_one( - HumanEvaluationDB.id == ObjectId(evaluation_id) + HumanEvaluationDB.id == ObjectId(evaluation_id), fetch_links=True ) return evaluation @@ -1301,6 +1295,7 @@ async def fetch_human_evaluation_scenario_by_id( assert evaluation_scenario_id is not None, "evaluation_scenario_id cannot be None" evaluation_scenario = await HumanEvaluationScenarioDB.find_one( HumanEvaluationScenarioDB.id == ObjectId(evaluation_scenario_id), + fetch_links=True, ) return evaluation_scenario @@ -1533,7 +1528,7 @@ async def fetch_app_and_check_access( Raises: HTTPException: If the app is not found or the user does not have access to it. 
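The fetch_links=True flag these lookups gain tells Beanie to resolve Link-typed references in the same query instead of returning unresolved link proxies. A minimal sketch with stand-in models (not the real AppDB/OrganizationDB definitions), assuming init_beanie has already registered them:

from typing import Optional
from beanie import Document, Link, PydanticObjectId

class OrgSketch(Document):
    name: str

class AppSketch(Document):
    app_name: str
    organization: Link[OrgSketch]

async def get_app_with_org(app_id: PydanticObjectId) -> Optional[AppSketch]:
    # without fetch_links=True, app.organization stays an unresolved Link proxy;
    # with it, the linked OrgSketch document is fetched in the same query
    return await AppSketch.find_one(AppSketch.id == app_id, fetch_links=True)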
""" - app = await AppDB.find_one(AppDB.id == ObjectId(app_id)) + app = await AppDB.find_one(AppDB.id == ObjectId(app_id), fetch_links=True) if app is None: logger.error("App not found") raise HTTPException @@ -1567,7 +1562,7 @@ async def fetch_app_variant_and_check_access( HTTPException: If the app variant is not found or the user does not have access to it. """ app_variant = await AppVariantDB.find_one( - AppVariantDB.id == ObjectId(app_variant_id) + AppVariantDB.id == ObjectId(app_variant_id), fetch_links=True ) if app_variant is None: logger.error("App variant not found") @@ -1693,7 +1688,9 @@ async def fetch_evaluators_configs(app_id: str): assert app_id is not None, "evaluation_id cannot be None" try: - evaluators_configs = await EvaluatorConfigDB.find(EvaluatorConfigDB.app.id == ObjectId(app_id)).to_list() + evaluators_configs = await EvaluatorConfigDB.find( + EvaluatorConfigDB.app.id == ObjectId(app_id) + ).to_list() return evaluators_configs except Exception as e: raise e @@ -1729,8 +1726,10 @@ async def fetch_evaluator_config_by_appId( """ try: - evaluator_config = await EvaluatorConfigDB.find_one(EvaluatorConfigDB.app.id == ObjectId(app_id), - EvaluatorConfigDB.evaluator_key == evaluator_name) + evaluator_config = await EvaluatorConfigDB.find_one( + EvaluatorConfigDB.app.id == ObjectId(app_id), + EvaluatorConfigDB.evaluator_key == evaluator_name, + ) return evaluator_config except Exception as e: raise e diff --git a/agenta-backend/agenta_backend/services/evaluation_service.py b/agenta-backend/agenta_backend/services/evaluation_service.py index 023d80aa78..c15f636aea 100644 --- a/agenta-backend/agenta_backend/services/evaluation_service.py +++ b/agenta-backend/agenta_backend/services/evaluation_service.py @@ -108,9 +108,6 @@ async def _fetch_human_evaluation_scenario_and_check_access( evaluation_scenario = await db_manager.fetch_human_evaluation_scenario_by_id( evaluation_scenario_id=evaluation_scenario_id ) - - print("evaluation_scenario") - print(evaluation_scenario) if evaluation_scenario is None: raise HTTPException( status_code=404, @@ -287,8 +284,7 @@ async def update_human_evaluation_service( updates["evaluation_type_settings"] = current_settings # Update the evaluation - evaluation.update(updates) - await evaluation.create() + await evaluation.update({"$set": updates}) async def fetch_evaluation_scenarios_for_evaluation( @@ -342,7 +338,7 @@ async def fetch_human_evaluation_scenarios_for_evaluation( **user_org_data, ) scenarios = await HumanEvaluationScenarioDB.find( - HumanEvaluationScenarioDB.evaluation == ObjectId(evaluation.id) + HumanEvaluationScenarioDB.evaluation.id == ObjectId(evaluation.id), fetch_links=True ).to_list() eval_scenarios = [ converters.human_evaluation_scenario_db_to_pydantic(scenario) @@ -424,7 +420,7 @@ async def update_human_evaluation_scenario( if updated_data["correct_answer"] is not None: new_eval_set["correct_answer"] = updated_data["correct_answer"] - await eval_scenario.update(new_eval_set) + await eval_scenario.update({"$set": new_eval_set}) async def update_evaluation_scenario_score_service( @@ -447,7 +443,7 @@ async def update_evaluation_scenario_score_service( eval_scenario.score = score # Save the updated evaluation scenario - await eval_scenario.create() + await eval_scenario.save() async def get_evaluation_scenario_score_service( @@ -570,7 +566,7 @@ async def fetch_list_human_evaluations( ) evaluations_db = await HumanEvaluationDB.find( - HumanEvaluationDB.app.id == ObjectId(app_id) + HumanEvaluationDB.app.id == ObjectId(app_id), 
fetch_links=True ).to_list() return [ await converters.human_evaluation_db_to_pydantic(evaluation) @@ -671,41 +667,6 @@ async def create_custom_code_evaluation( return str(custom_eval.id) -async def update_custom_code_evaluation( - id: str, payload: CreateCustomEvaluation, **kwargs: dict -) -> str: - """Update a custom code evaluation in the database. - Args: - id (str): the ID of the custom evaluation to update - payload (CreateCustomEvaluation): the payload with updated data - Returns: - str: the ID of the updated custom evaluation - """ - - # Get user object - user = await get_user(user_uid=kwargs["uid"]) - - # Build query expression - query_expression = ( - CustomEvaluationDB.user == user.id, - CustomEvaluationDB.id == ObjectId(id), - ) - - # Get custom evaluation - custom_eval = await CustomEvaluationDB.find_one(query_expression) - if not custom_eval: - raise HTTPException(status_code=404, detail="Custom evaluation not found") - - # Update the custom evaluation fields - custom_eval.evaluation_name = payload.evaluation_name - custom_eval.python_code = payload.python_code - custom_eval.updated_at = datetime.utcnow() - - # Save the updated custom evaluation - await custom_eval.create() - return str(custom_eval.id) - - async def create_new_human_evaluation( payload: NewHumanEvaluation, **user_org_data: dict ) -> EvaluationDB: diff --git a/agenta-backend/agenta_backend/services/results_service.py b/agenta-backend/agenta_backend/services/results_service.py index dba7c8f4d4..63293d9d86 100644 --- a/agenta-backend/agenta_backend/services/results_service.py +++ b/agenta-backend/agenta_backend/services/results_service.py @@ -12,7 +12,7 @@ async def fetch_results_for_evaluation(evaluation: HumanEvaluationDB): evaluation_scenarios = await HumanEvaluationScenarioDB.find( - HumanEvaluationScenarioDB.evaluation == ObjectId(evaluation.id), + HumanEvaluationScenarioDB.evaluation.id == ObjectId(evaluation.id), ).to_list() results = {} @@ -82,8 +82,9 @@ async def fetch_results_for_single_model_test(evaluation_id: str): async def fetch_average_score_for_custom_code_run(evaluation_id: str) -> float: - query_exp = EvaluationScenarioDB.evaluation == ObjectId(evaluation_id) - eval_scenarios = await EvaluationScenarioDB.find(query_exp).to_list() + eval_scenarios = await EvaluationScenarioDB.find( + EvaluationScenarioDB.evaluation.id == ObjectId(evaluation_id) + ).to_list() list_of_scores = [] for scenario in eval_scenarios: diff --git a/agenta-backend/agenta_backend/services/selectors.py b/agenta-backend/agenta_backend/services/selectors.py index 141c16e4cd..bae85cc8b5 100644 --- a/agenta-backend/agenta_backend/services/selectors.py +++ b/agenta-backend/agenta_backend/services/selectors.py @@ -52,13 +52,9 @@ async def get_user_own_org(user_uid: str) -> OrganizationDB: """ user = await UserDB.find_one(UserDB.uid == user_uid) - - # Build the query expression for the two conditions - query_expression = ( - OrganizationDB.owner == str(user.id), - OrganizationDB.type == "default", + org: OrganizationDB = await OrganizationDB.find_one( + OrganizationDB.owner == str(user.id), OrganizationDB.type == "default" ) - org: OrganizationDB = await OrganizationDB.find_one(query_expression) if org is not None: return org else: From 89fca56fdd5310c241041d50c280b8d547c7223e Mon Sep 17 00:00:00 2001 From: Abram Date: Fri, 5 Jan 2024 14:44:08 +0100 Subject: [PATCH 274/414] :art: Format - ran black --- .../agenta_backend/models/db_engine.py | 11 +++---- .../agenta_backend/models/db_models.py | 1 + 
.../agenta_backend/routers/app_router.py | 2 ++ .../routers/container_router.py | 2 +- .../agenta_backend/routers/variants_router.py | 5 +++- .../agenta_backend/services/db_manager.py | 5 +++- .../services/evaluation_service.py | 7 +++-- .../services/evaluators_service.py | 7 ++--- .../services/event_db_manager.py | 30 +++++++------------ agenta-backend/agenta_backend/utils/common.py | 2 +- 10 files changed, 35 insertions(+), 37 deletions(-) diff --git a/agenta-backend/agenta_backend/models/db_engine.py b/agenta-backend/agenta_backend/models/db_engine.py index 4f759e6df6..5a912fbd85 100644 --- a/agenta-backend/agenta_backend/models/db_engine.py +++ b/agenta-backend/agenta_backend/models/db_engine.py @@ -26,7 +26,7 @@ EvaluationDB, EvaluationScenarioDB, SpanDB, - TraceDB + TraceDB, ) # Configure and set logging level @@ -54,7 +54,7 @@ EvaluationDB, EvaluationScenarioDB, SpanDB, - TraceDB + TraceDB, ] @@ -78,10 +78,7 @@ async def init_db(self): client = await self.initialize_client() db_name = self._get_database_name(self.mode) - await init_beanie( - database=client[db_name], - document_models=document_models - ) + await init_beanie(database=client[db_name], document_models=document_models) logger.info(f"Using {db_name} database...") def _get_database_name(self, mode: str) -> str: @@ -108,4 +105,4 @@ def remove_db(self) -> None: elif self.mode == "test": client.drop_database("agenta_test") else: - client.drop_database(f"agenta_{self.mode}") \ No newline at end of file + client.drop_database(f"agenta_{self.mode}") diff --git a/agenta-backend/agenta_backend/models/db_models.py b/agenta-backend/agenta_backend/models/db_models.py index 4645c6d875..84e39af9d7 100644 --- a/agenta-backend/agenta_backend/models/db_models.py +++ b/agenta-backend/agenta_backend/models/db_models.py @@ -19,6 +19,7 @@ class APIKeyDB(Document): class Settings: collection = "api_keys" + class InvitationDB(BaseModel): token: str = Field(unique=True) email: str diff --git a/agenta-backend/agenta_backend/routers/app_router.py b/agenta-backend/agenta_backend/routers/app_router.py index ee80bd02f7..483bcc289f 100644 --- a/agenta-backend/agenta_backend/routers/app_router.py +++ b/agenta-backend/agenta_backend/routers/app_router.py @@ -187,6 +187,7 @@ async def create_app( return CreateAppOutput(app_id=str(app_db.id), app_name=str(app_db.app_name)) except Exception as e: import traceback + traceback.print_exc() raise HTTPException(status_code=500, detail=str(e)) @@ -310,6 +311,7 @@ async def remove_app(app_id: str, request: Request): raise HTTPException(status_code=500, detail=detail) except Exception as e: import traceback + traceback.print_exc() detail = f"Unexpected error while trying to remove the app: {str(e)}" raise HTTPException(status_code=500, detail=detail) diff --git a/agenta-backend/agenta_backend/routers/container_router.py b/agenta-backend/agenta_backend/routers/container_router.py index 2a4b8c5254..0f71250fa0 100644 --- a/agenta-backend/agenta_backend/routers/container_router.py +++ b/agenta-backend/agenta_backend/routers/container_router.py @@ -37,7 +37,7 @@ # TODO: We need to improve this to use the introduced abstraction to also use start and stop service -#* Edit: someone remind me (abram) to work on this. +# * Edit: someone remind me (abram) to work on this. 
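Because Celery task bodies are synchronous, the Beanie setup done by DBEngine().init_db() has to be driven through an event loop before any document query, which is the pattern the evaluate task adopted above. A compact sketch of that wiring; the Celery app name, broker URL, and task body are simplified placeholders around the calls shown in the diffs:

import asyncio
from celery import Celery

from agenta_backend.models.db_engine import DBEngine
from agenta_backend.services.db_manager import fetch_evaluation_by_id

celery_app = Celery("sketch", broker="amqp://guest@localhost//")  # broker URL is illustrative

@celery_app.task
def evaluate_sketch(evaluation_id: str):
    # sync task body: run the async Beanie initialization and the document
    # fetch on the worker's event loop before touching any models
    loop = asyncio.get_event_loop()
    loop.run_until_complete(DBEngine().init_db())
    evaluation = loop.run_until_complete(fetch_evaluation_by_id(evaluation_id))
    return str(evaluation.id) if evaluation else None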
@router.post("/build_image/", operation_id="build_image") async def build_image( app_id: str, diff --git a/agenta-backend/agenta_backend/routers/variants_router.py b/agenta-backend/agenta_backend/routers/variants_router.py index 0553437377..5cf9f8908f 100644 --- a/agenta-backend/agenta_backend/routers/variants_router.py +++ b/agenta-backend/agenta_backend/routers/variants_router.py @@ -71,11 +71,14 @@ async def add_variant_from_base_and_config( **user_org_data, ) logger.debug(f"Successfully added new variant: {db_app_variant}") - app_variant_db = await db_manager.get_app_variant_instance_by_id(str(db_app_variant.id)) + app_variant_db = await db_manager.get_app_variant_instance_by_id( + str(db_app_variant.id) + ) return await converters.app_variant_db_to_output(app_variant_db) except Exception as e: import traceback + traceback.print_exc() logger.error(f"An exception occurred while adding the new variant: {e}") raise HTTPException(status_code=500, detail=str(e)) diff --git a/agenta-backend/agenta_backend/services/db_manager.py b/agenta-backend/agenta_backend/services/db_manager.py index ad33b78ff1..72b49e06b6 100644 --- a/agenta-backend/agenta_backend/services/db_manager.py +++ b/agenta-backend/agenta_backend/services/db_manager.py @@ -166,7 +166,10 @@ async def fetch_app_by_name( user = await get_user(user_uid=user_org_data["uid"]) app = await AppDB.find_one(AppDB.app_name == app_name, AppDB.user.id == user.id) else: - app = await AppDB.find_one(AppDB.app_name == app_name, AppDB.organization.id == ObjectId(organization_id)) + app = await AppDB.find_one( + AppDB.app_name == app_name, + AppDB.organization.id == ObjectId(organization_id), + ) return app diff --git a/agenta-backend/agenta_backend/services/evaluation_service.py b/agenta-backend/agenta_backend/services/evaluation_service.py index c15f636aea..62dafd0fad 100644 --- a/agenta-backend/agenta_backend/services/evaluation_service.py +++ b/agenta-backend/agenta_backend/services/evaluation_service.py @@ -338,7 +338,8 @@ async def fetch_human_evaluation_scenarios_for_evaluation( **user_org_data, ) scenarios = await HumanEvaluationScenarioDB.find( - HumanEvaluationScenarioDB.evaluation.id == ObjectId(evaluation.id), fetch_links=True + HumanEvaluationScenarioDB.evaluation.id == ObjectId(evaluation.id), + fetch_links=True, ).to_list() eval_scenarios = [ converters.human_evaluation_scenario_db_to_pydantic(scenario) @@ -520,7 +521,9 @@ async def fetch_list_evaluations( detail=f"You do not have access to this app: {app_id}", ) - evaluations_db = await EvaluationDB.find(EvaluationDB.app.id == ObjectId(app_id), fetch_links=True).to_list() + evaluations_db = await EvaluationDB.find( + EvaluationDB.app.id == ObjectId(app_id), fetch_links=True + ).to_list() return [ await converters.evaluation_db_to_pydantic(evaluation) for evaluation in evaluations_db diff --git a/agenta-backend/agenta_backend/services/evaluators_service.py b/agenta-backend/agenta_backend/services/evaluators_service.py index 9f08d62dd1..82d834e5e7 100644 --- a/agenta-backend/agenta_backend/services/evaluators_service.py +++ b/agenta-backend/agenta_backend/services/evaluators_service.py @@ -51,10 +51,7 @@ def auto_webhook_test( with httpx.Client() as client: request_body = json.loads(settings_values.get("webhook_body", None)) payload = request_body if request_body else {} - response = client.post( - url=settings_values["webhook_url"], - json=payload - ) + response = client.post(url=settings_values["webhook_url"], json=payload) response.raise_for_status() response_data = 
response.json() score = response_data.get("score", None) @@ -173,4 +170,4 @@ def evaluate( settings_values, *additional_args, **additional_kwargs, - ) \ No newline at end of file + ) diff --git a/agenta-backend/agenta_backend/services/event_db_manager.py b/agenta-backend/agenta_backend/services/event_db_manager.py index ba7441883e..43e5e6d407 100644 --- a/agenta-backend/agenta_backend/services/event_db_manager.py +++ b/agenta-backend/agenta_backend/services/event_db_manager.py @@ -47,7 +47,11 @@ async def get_variant_traces( """ user = await db_manager.get_user(user_uid=kwargs["uid"]) - query_expressions = (TraceDB.user == user.id, TraceDB.app_id == app_id, TraceDB.variant_id == variant_id) + query_expressions = ( + TraceDB.user == user.id, + TraceDB.app_id == app_id, + TraceDB.variant_id == variant_id, + ) traces = await TraceDB.find(query_expressions).to_list() return [trace_db_to_pydantic(trace) for trace in traces] @@ -86,9 +90,7 @@ async def get_trace_single(trace_id: str, **kwargs: dict) -> Trace: """ user = await db_manager.get_user(user_uid=kwargs["uid"]) - query_expressions = (TraceDB.id == ObjectId(trace_id), - TraceDB.user == user.id - ) + query_expressions = (TraceDB.id == ObjectId(trace_id), TraceDB.user == user.id) # Get trace trace = await TraceDB.find_one(query_expressions) @@ -109,9 +111,7 @@ async def trace_status_update( """ user = await db_manager.get_user(user_uid=kwargs["uid"]) - query_expressions = (TraceDB.id == ObjectId(trace_id), - TraceDB.user == user.id - ) + query_expressions = (TraceDB.id == ObjectId(trace_id), TraceDB.user == user.id) # Get trace trace = await TraceDB.find_one(query_expressions) @@ -148,9 +148,7 @@ async def get_trace_spans(trace_id: str, **kwargs: dict) -> List[Span]: """ user = await db_manager.get_user(user_uid=kwargs["uid"]) - query_expressions = (TraceDB.id == ObjectId(trace_id), - TraceDB.user == user.id - ) + query_expressions = (TraceDB.id == ObjectId(trace_id), TraceDB.user == user.id) # Get trace trace = await TraceDB.find_one(query_expressions) @@ -205,9 +203,7 @@ async def get_trace_feedbacks(trace_id: str, **kwargs: dict) -> List[Feedback]: user = await db_manager.get_user(user_uid=kwargs["uid"]) # Build query expressions - query_expressions = (TraceDB.id == ObjectId(trace_id), - TraceDB.user == user.id - ) + query_expressions = (TraceDB.id == ObjectId(trace_id), TraceDB.user == user.id) # Get feedbacks in trace trace = await TraceDB.find_one(query_expressions) @@ -231,9 +227,7 @@ async def get_feedback_detail( user = await db_manager.get_user(user_uid=kwargs["uid"]) # Build query expressions - query_expressions = (TraceDB.id == ObjectId(trace_id), - TraceDB.user == user.id - ) + query_expressions = (TraceDB.id == ObjectId(trace_id), TraceDB.user == user.id) # Get trace trace = await TraceDB.find_one(query_expressions) @@ -264,9 +258,7 @@ async def update_trace_feedback( user = await db_manager.get_user(user_uid=kwargs["uid"]) # Build query expressions - query_expressions = (TraceDB.id == ObjectId(trace_id), - TraceDB.user == user.id - ) + query_expressions = (TraceDB.id == ObjectId(trace_id), TraceDB.user == user.id) # Get trace trace = await TraceDB.find_one(query_expressions) diff --git a/agenta-backend/agenta_backend/utils/common.py b/agenta-backend/agenta_backend/utils/common.py index f149b05680..4ed0f23c82 100644 --- a/agenta-backend/agenta_backend/utils/common.py +++ b/agenta-backend/agenta_backend/utils/common.py @@ -73,7 +73,7 @@ async def get_app_instance( ) -> AppVariantDB: queries = (AppVariantDB.is_deleted == 
show_deleted, AppVariantDB.app == app_id) if variant_name is not None: - queries += (AppVariantDB.variant_name == variant_name) + queries += AppVariantDB.variant_name == variant_name app_instance = await AppVariantDB.find_one(*queries) return app_instance From 3ff3c20cb7e20067c913b5c92fc3755905517fe2 Mon Sep 17 00:00:00 2001 From: Abram Date: Fri, 5 Jan 2024 19:05:37 +0100 Subject: [PATCH 275/414] Refactor - migrate odmantic to beanie in backend testcases --- .../tests/observability_router/conftest.py | 7 --- .../test_observability_router.py | 60 +++++++++---------- .../test_organization_router.py | 8 +-- .../testset_router/test_testset_router.py | 20 +++---- .../user_profile_router/test_user_profile.py | 6 +- .../variants_evaluators_router/conftest.py | 8 +-- .../test_evaluators_router.py | 44 ++++++++------ .../tests/variants_router/conftest.py | 33 +++++----- .../test_app_variant_router.py | 26 ++++---- 9 files changed, 91 insertions(+), 121 deletions(-) diff --git a/agenta-backend/agenta_backend/tests/observability_router/conftest.py b/agenta-backend/agenta_backend/tests/observability_router/conftest.py index 5c4a38c8cd..92523e645e 100644 --- a/agenta-backend/agenta_backend/tests/observability_router/conftest.py +++ b/agenta-backend/agenta_backend/tests/observability_router/conftest.py @@ -1,13 +1,6 @@ import pytest from datetime import datetime -from agenta_backend.models.db_engine import DBEngine -from agenta_backend.models.db_models import OrganizationDB - - -# Initialize database engine -engine = DBEngine().engine() - @pytest.fixture() def spans_db_data(): diff --git a/agenta-backend/agenta_backend/tests/observability_router/test_observability_router.py b/agenta-backend/agenta_backend/tests/observability_router/test_observability_router.py index d37e8e683f..2f25a48a75 100644 --- a/agenta-backend/agenta_backend/tests/observability_router/test_observability_router.py +++ b/agenta-backend/agenta_backend/tests/observability_router/test_observability_router.py @@ -14,15 +14,11 @@ AppVariantDB, VariantBaseDB, ) -from agenta_backend.models.db_engine import DBEngine from agenta_backend.services import selectors import httpx -# Initialize database engine -engine = DBEngine().engine() - # Initialize http client test_client = httpx.AsyncClient() timeout = httpx.Timeout(timeout=5, read=None, write=5) @@ -48,13 +44,13 @@ async def test_create_spans_endpoint(spans_db_data): @pytest.mark.asyncio async def test_create_user_and_org(user_create_data, organization_create_data): user_db = UserDB(**user_create_data) - await engine.save(user_db) + await user_db.create() org_db = OrganizationDB(**organization_create_data, owner=str(user_db.id)) - await engine.save(org_db) + await org_db.create() user_db.organizations = [org_db.id] - await engine.save(user_db) + await user_db.save() assert org_db.name == "Agenta" assert user_db.username == "agenta" @@ -63,25 +59,25 @@ async def test_create_user_and_org(user_create_data, organization_create_data): @pytest.mark.asyncio async def test_create_organization(organization_create_data): - user_db = await engine.find_one(UserDB, UserDB.uid == "0") + user_db = await UserDB.find_one(UserDB.uid == "0") organization = OrganizationDB( **organization_create_data, type="default", owner=str(user_db.id), members=[user_db.id], ) - await engine.save(organization) + await organization.create() @pytest.mark.asyncio async def test_create_image_in_db(image_create_data): - user_db = await engine.find_one(UserDB, UserDB.uid == "0") - organization_db = await engine.find_one( - 
OrganizationDB, OrganizationDB.owner == str(user_db.id) + user_db = await UserDB.find_one(UserDB.uid == "0") + organization_db = await OrganizationDB.find_one( + OrganizationDB.owner == str(user_db.id) ) image_db = ImageDB(**image_create_data, user=user_db, organization=organization_db) - await engine.save(image_db) + await image_db.create() assert image_db.user.id == user_db.id assert image_db.tags == image_create_data["tags"] @@ -89,22 +85,22 @@ async def test_create_image_in_db(image_create_data): @pytest.mark.asyncio async def test_create_appvariant_in_db(app_variant_create_data): - user_db = await engine.find_one(UserDB, UserDB.uid == "0") + user_db = await UserDB.find_one(UserDB.uid == "0") organization_db = await selectors.get_user_own_org(user_db.uid) - image_db = await engine.find_one(ImageDB, ImageDB.user == user_db.id) + image_db = await ImageDB.find_one(ImageDB.user == user_db.id) app = AppDB( app_name="test_app", organization=organization_db, user=user_db, ) - await engine.save(app) + await app.create() db_config = ConfigDB( config_name="default", parameters={}, ) - await engine.save(db_config) + await db_config.create() db_base = VariantBaseDB( app=app, @@ -113,7 +109,7 @@ async def test_create_appvariant_in_db(app_variant_create_data): base_name="app", image=image_db, ) - await engine.save(db_base) + await db_base.create() app_variant_db = AppVariantDB( **app_variant_create_data, @@ -126,7 +122,7 @@ async def test_create_appvariant_in_db(app_variant_create_data): base=db_base, config=db_config, ) - await engine.save(app_variant_db) + await app_variant_db.create() assert app_variant_db.image.id == image_db.id assert app_variant_db.user.id == user_db.id @@ -147,7 +143,7 @@ async def test_create_spans_in_db(spans_db_data): # In this case, we are getting the first span id that was # created in the first test and updating the previous_span_id with it if previous_span_id is None and not first_span_id_used: - first_span = await engine.find_one(SpanDB) + first_span = await SpanDB.find_one() previous_span_id = str(first_span.id) # Create a new span instance @@ -159,7 +155,7 @@ async def test_create_spans_in_db(spans_db_data): # Save the span instance and set the first_span_id_used # to True to avoid reusing it - await engine.save(span_db) + await span_db.create() first_span_id_used = True # Check if the previous span id exists and that first_span_id_used is True @@ -172,15 +168,15 @@ async def test_create_spans_in_db(spans_db_data): @pytest.mark.asyncio async def fetch_spans_id(): - spans = await engine.find(SpanDB) + spans = await SpanDB.find().to_list() assert type(spans) == List[SpanDB] assert len(spans) == 3 @pytest.mark.asyncio async def test_create_trace_endpoint(trace_create_data): - spans = await engine.find(SpanDB) - variants = await engine.find(AppVariantDB) + spans = await SpanDB.find().to_list() + variants = await AppVariantDB.find(fetch_links=True).to_list() # Prepare required data spans_id = [str(span.id) for span in spans] @@ -202,7 +198,7 @@ async def test_create_trace_endpoint(trace_create_data): @pytest.mark.asyncio async def test_get_traces_endpoint(): - variants = await engine.find(AppVariantDB) + variants = await AppVariantDB.find(fetch_links=True).to_list() app_id, variant_id = variants[0].app.id, variants[0].id response = await test_client.get( @@ -214,9 +210,9 @@ async def test_get_traces_endpoint(): @pytest.mark.asyncio async def test_get_trace_endpoint(): - traces = await engine.find(TraceDB) + traces = await TraceDB.find().to_list() - variants = await 
engine.find(AppVariantDB) + variants = await AppVariantDB.find(fetch_links=True).to_list() app_id, variant_id = variants[0].app.id, variants[0].id response = await test_client.get( @@ -234,7 +230,7 @@ async def test_update_trace_status_endpoint(): "status": random.choice(["initiated", "completed", "stopped", "cancelled"]) } - traces = await engine.find(TraceDB) + traces = await TraceDB.find().to_list() response = await test_client.put( f"{BACKEND_API_HOST}/observability/traces/{str(traces[0].id)}/", json=payload, @@ -245,7 +241,7 @@ async def test_update_trace_status_endpoint(): @pytest.mark.asyncio async def test_create_feedback_endpoint(feedbacks_create_data): - traces = await engine.find(TraceDB) + traces = await TraceDB.find().to_list() for feedback_data in feedbacks_create_data: response = await test_client.post( f"{BACKEND_API_HOST}/observability/feedbacks/{str(traces[0].id)}/", @@ -257,7 +253,7 @@ async def test_create_feedback_endpoint(feedbacks_create_data): @pytest.mark.asyncio async def test_get_trace_feedbacks_endpoint(): - traces = await engine.find(TraceDB) + traces = await TraceDB.find().to_list() response = await test_client.get( f"{BACKEND_API_HOST}/observability/feedbacks/{str(traces[0].id)}/" ) @@ -267,7 +263,7 @@ async def test_get_trace_feedbacks_endpoint(): @pytest.mark.asyncio async def test_get_feedback_endpoint(): - traces = await engine.find(TraceDB) + traces = await TraceDB.find().to_list() feedback_id = traces[0].feedbacks[0].uid response = await test_client.get( f"{BACKEND_API_HOST}/observability/feedbacks/{str(traces[0].id)}/{feedback_id}/" @@ -278,7 +274,7 @@ async def test_get_feedback_endpoint(): @pytest.mark.asyncio async def test_update_feedback_endpoint(): - traces = await engine.find(TraceDB) + traces = await TraceDB.find(fetch_links=True).to_list() feedbacks_ids = [feedback.uid for feedback in traces[0].feedbacks] for feedback_id in feedbacks_ids: diff --git a/agenta-backend/agenta_backend/tests/organization_router/test_organization_router.py b/agenta-backend/agenta_backend/tests/organization_router/test_organization_router.py index c19d32711e..222c9dbe9c 100644 --- a/agenta-backend/agenta_backend/tests/organization_router/test_organization_router.py +++ b/agenta-backend/agenta_backend/tests/organization_router/test_organization_router.py @@ -2,16 +2,12 @@ from agenta_backend.services import selectors from agenta_backend.models.db_models import UserDB -from agenta_backend.models.db_engine import DBEngine from agenta_backend.models.api.organization_models import OrganizationOutput import httpx import pytest -# Initialize database engine -engine = DBEngine().engine() - # Initialize http client test_client = httpx.AsyncClient() timeout = httpx.Timeout(timeout=5, read=None, write=5) @@ -34,7 +30,7 @@ async def test_list_organizations(): @pytest.mark.asyncio async def test_get_user_organization(): - user = await engine.find_one(UserDB, UserDB.uid == "0") + user = await UserDB.find_one(UserDB.uid == "0") user_org = await selectors.get_user_own_org(user.uid) response = await test_client.get(f"{BACKEND_API_HOST}/organizations/own/") @@ -48,7 +44,7 @@ async def test_get_user_organization(): @pytest.mark.asyncio async def test_user_does_not_have_an_organization(): user = UserDB(uid="0123", username="john_doe", email="johndoe@email.com") - await engine.save(user) + await user.create() user_org = await selectors.get_user_own_org(user.uid) assert user_org == None diff --git a/agenta-backend/agenta_backend/tests/testset_router/test_testset_router.py 
b/agenta-backend/agenta_backend/tests/testset_router/test_testset_router.py index 0d6fe4cf1e..561fb8daba 100644 --- a/agenta-backend/agenta_backend/tests/testset_router/test_testset_router.py +++ b/agenta-backend/agenta_backend/tests/testset_router/test_testset_router.py @@ -1,7 +1,6 @@ import os from pathlib import Path -from agenta_backend.models.db_engine import DBEngine from agenta_backend.models.db_models import ( AppDB, TestSetDB, @@ -11,9 +10,6 @@ import pytest -# Initialize database engine -engine = DBEngine().engine() - # Initialize http client test_client = httpx.AsyncClient() timeout = httpx.Timeout(timeout=5, read=None, write=5) @@ -32,7 +28,7 @@ @pytest.mark.asyncio async def test_create_testset(): - app = await engine.find_one(AppDB, AppDB.app_name == "test_app") + app = await AppDB.find_one(AppDB.app_name == "test_app") payload = { "name": "create_testset_main", @@ -61,8 +57,8 @@ async def test_create_testset(): @pytest.mark.asyncio async def test_update_testset(): - app = await engine.find_one(AppDB, AppDB.app_name == "test_app") - testset = await engine.find_one(TestSetDB, TestSetDB.app == app.id) + app = await AppDB.find_one(AppDB.app_name == "test_app") + testset = await AppDB.find_one(TestSetDB.app.id == app.id) payload = { "name": "update_testset", @@ -93,7 +89,7 @@ async def test_update_testset(): @pytest.mark.asyncio async def test_get_testsets(): - app = await engine.find_one(AppDB, AppDB.app_name == "test_app") + app = await AppDB.find_one(AppDB.app_name == "test_app") response = await test_client.get( f"{BACKEND_API_HOST}/testsets/?app_id={str(app.id)}" ) @@ -104,8 +100,8 @@ async def test_get_testsets(): @pytest.mark.asyncio() async def test_get_testset(): - app = await engine.find_one(AppDB, AppDB.app_name == "test_app") - testset = await engine.find_one(TestSetDB, TestSetDB.app == app.id) + app = await AppDB.find_one(AppDB.app_name == "test_app") + testset = await TestSetDB.find_one(TestSetDB.app.id == app.id) response = await test_client.get(f"{BACKEND_API_HOST}/testsets/{str(testset.id)}/") @@ -116,8 +112,8 @@ async def test_get_testset(): @pytest.mark.asyncio async def test_delete_testsets(): - app = await engine.find_one(AppDB, AppDB.app_name == "test_app") - testsets = await engine.find(TestSetDB, TestSetDB.app == app.id) + app = await AppDB.find_one(AppDB.app_name == "test_app") + testsets = await TestSetDB.find(TestSetDB.app.id == app.id) testset_ids = [str(testset.id) for testset in testsets] payload = {"testset_ids": testset_ids} diff --git a/agenta-backend/agenta_backend/tests/user_profile_router/test_user_profile.py b/agenta-backend/agenta_backend/tests/user_profile_router/test_user_profile.py index 182ce64e93..a1df5dd30c 100644 --- a/agenta-backend/agenta_backend/tests/user_profile_router/test_user_profile.py +++ b/agenta-backend/agenta_backend/tests/user_profile_router/test_user_profile.py @@ -3,13 +3,9 @@ import pytest from agenta_backend.models.db_models import UserDB -from agenta_backend.models.db_engine import DBEngine from agenta_backend.models.api.user_models import User -# Initialize database engine -engine = DBEngine().engine() - # Initialize http client test_client = httpx.AsyncClient() timeout = httpx.Timeout(timeout=5, read=None, write=5) @@ -24,7 +20,7 @@ @pytest.mark.asyncio async def test_user_profile(): - user_db = await engine.find_one(UserDB, UserDB.uid == "0") + user_db = await UserDB.find_one(UserDB.uid == "0") user_db_dict = User( id=str(user_db.id), uid=str(user_db.uid), diff --git 
a/agenta-backend/agenta_backend/tests/variants_evaluators_router/conftest.py b/agenta-backend/agenta_backend/tests/variants_evaluators_router/conftest.py index 53abc772a7..2f05c10290 100644 --- a/agenta-backend/agenta_backend/tests/variants_evaluators_router/conftest.py +++ b/agenta-backend/agenta_backend/tests/variants_evaluators_router/conftest.py @@ -2,16 +2,12 @@ import httpx import pytest -from agenta_backend.models.db_engine import DBEngine from agenta_backend.models.db_models import ( UserDB, OrganizationDB, ) -# Initialize database engine -engine = DBEngine().engine() - # Set global variables ENVIRONMENT = os.environ.get("ENVIRONMENT") OPEN_AI_KEY = os.environ.get("OPENAI_API_KEY") @@ -40,7 +36,7 @@ def fetch_single_prompt_template(fetch_templates): @pytest.fixture() async def fetch_user_organization(): - organization = await engine.find(OrganizationDB) + organization = await OrganizationDB.find().to_list() return {"org_id": str(organization[0].id)} @@ -56,7 +52,7 @@ def app_from_template(): @pytest.fixture(scope="session") async def fetch_user(): - user = await engine.find_one(UserDB, UserDB.uid == "0") + user = await UserDB.find_one(UserDB.uid == "0", fetch_links=True) return user diff --git a/agenta-backend/agenta_backend/tests/variants_evaluators_router/test_evaluators_router.py b/agenta-backend/agenta_backend/tests/variants_evaluators_router/test_evaluators_router.py index c86201d146..fc8be57818 100644 --- a/agenta-backend/agenta_backend/tests/variants_evaluators_router/test_evaluators_router.py +++ b/agenta-backend/agenta_backend/tests/variants_evaluators_router/test_evaluators_router.py @@ -3,20 +3,17 @@ import pytest import asyncio -from agenta_backend.models.db_engine import DBEngine from agenta_backend.models.api.evaluation_model import EvaluationStatusEnum from agenta_backend.models.db_models import ( - EvaluationDB, AppDB, TestSetDB, + EvaluationDB, AppVariantDB, DeploymentDB, + EvaluationScenarioDB, ) -# Initialize database engine -engine = DBEngine().engine() - # Initialize http client test_client = httpx.AsyncClient() timeout = httpx.Timeout(timeout=5, read=None, write=5) @@ -60,7 +57,7 @@ async def test_get_evaluators_endpoint(): async def test_create_auto_exact_match_evaluator_config( auto_exact_match_evaluator_config, ): - app = await engine.find_one(AppDB, AppDB.app_name == APP_NAME) + app = await AppDB.find_one(AppDB.app_name == APP_NAME) payload = auto_exact_match_evaluator_config payload["app_id"] = str(app.id) @@ -76,7 +73,7 @@ async def test_create_auto_exact_match_evaluator_config( async def test_create_auto_similarity_match_evaluator_config( auto_similarity_match_evaluator_config, ): - app = await engine.find_one(AppDB, AppDB.app_name == APP_NAME) + app = await AppDB.find_one(AppDB.app_name == APP_NAME) payload = auto_similarity_match_evaluator_config payload["app_id"] = str(app.id) @@ -92,7 +89,7 @@ async def test_create_auto_similarity_match_evaluator_config( async def test_create_auto_regex_test_evaluator_config( auto_regex_test_evaluator_config, ): - app = await engine.find_one(AppDB, AppDB.app_name == APP_NAME) + app = await AppDB.find_one(AppDB.app_name == APP_NAME) payload = auto_regex_test_evaluator_config payload["app_id"] = str(app.id) payload["settings_values"]["regex_pattern"] = "^ig\\d{3}$" @@ -109,7 +106,7 @@ async def test_create_auto_regex_test_evaluator_config( async def test_create_auto_webhook_test_evaluator_config( auto_webhook_test_evaluator_config, ): - app = await engine.find_one(AppDB, AppDB.app_name == APP_NAME) + app = await 
AppDB.find_one(AppDB.app_name == APP_NAME) payload = auto_webhook_test_evaluator_config payload["app_id"] = str(app.id) @@ -125,7 +122,7 @@ async def test_create_auto_webhook_test_evaluator_config( async def test_create_auto_ai_critique_evaluator_config( auto_ai_critique_evaluator_config, ): - app = await engine.find_one(AppDB, AppDB.app_name == APP_NAME) + app = await AppDB.find_one(AppDB.app_name == APP_NAME) payload = auto_ai_critique_evaluator_config payload["app_id"] = str(app.id) @@ -139,7 +136,7 @@ async def test_create_auto_ai_critique_evaluator_config( @pytest.mark.asyncio async def test_get_evaluator_configs(): - app = await engine.find_one(AppDB, AppDB.app_name == APP_NAME) + app = await AppDB.find_one(AppDB.app_name == APP_NAME) response = await test_client.get( f"{BACKEND_API_HOST}/evaluators/configs/?app_id={str(app.id)}", timeout=timeout, @@ -151,9 +148,9 @@ async def test_get_evaluator_configs(): @pytest.mark.asyncio async def test_create_evaluation(): # Fetch app, app_variant and testset - app = await engine.find_one(AppDB, AppDB.app_name == APP_NAME) - app_variant = await engine.find_one(AppVariantDB, AppVariantDB.app == app.id) - testset = await engine.find_one(TestSetDB, TestSetDB.app == app.id) + app = await AppDB.find_one(AppDB.app_name == APP_NAME) + app_variant = await AppVariantDB.find_one(AppVariantDB.app.id == app.id) + testset = await TestSetDB.find_one(TestSetDB.app.id == app.id) # Prepare payload payload = { @@ -196,7 +193,7 @@ async def test_create_evaluation(): @pytest.mark.asyncio async def test_fetch_evaluation_status(): - evaluations = await engine.find(EvaluationDB) # will return only one in this case + evaluations = await EvaluationDB.find().to_list() # will return only one in this case evaluation = evaluations[0] # Prepare and start short-polling request @@ -220,7 +217,7 @@ async def test_fetch_evaluation_status(): @pytest.mark.asyncio async def test_fetch_evaluation_results(): - evaluations = await engine.find(EvaluationDB) # will return only one in this case + evaluations = await EvaluationDB.find().to_list() # will return only one in this case evaluation = evaluations[0] response = await test_client.get( @@ -235,7 +232,7 @@ async def test_fetch_evaluation_results(): @pytest.mark.asyncio async def test_delete_evaluator_config(): - app = await engine.find_one(AppDB, AppDB.app_name == APP_NAME) + app = await AppDB.find_one(AppDB.app_name == APP_NAME) response = await test_client.get( f"{BACKEND_API_HOST}/evaluators/configs/?app_id={str(app.id)}", timeout=timeout, @@ -253,14 +250,23 @@ async def test_delete_evaluator_config(): assert len(evaluator_configs) == count_of_deleted_configs +@pytest.mark.asyncio +async def test_evaluation_scenario_match_evaluation_testset_length(): + evaluations = await EvaluationDB.find(fetch_links=True).to_list() # will return only one in this case + evaluation = evaluations[0] + evaluation_scenario_count = await EvaluationScenarioDB.find(EvaluationScenarioDB.evaluation.id == evaluation.id).count() + + assert evaluation_scenario_count == len(evaluation.testset.csvdata) + + @pytest.mark.asyncio async def test_remove_running_template_app_container(): import docker # Connect to the Docker daemon client = docker.from_env() - app = await engine.find_one(AppDB, AppDB.app_name == APP_NAME) - deployment = await engine.find_one(DeploymentDB, DeploymentDB.app == app.id) + app = await AppDB.find_one(AppDB.app_name == APP_NAME) + deployment = await DeploymentDB.find_one(DeploymentDB.app.id == app.id) try: # Retrieve container container 
= client.containers.get(deployment.container_name) diff --git a/agenta-backend/agenta_backend/tests/variants_router/conftest.py b/agenta-backend/agenta_backend/tests/variants_router/conftest.py index 8b7917b7c0..fc5f9895c8 100644 --- a/agenta-backend/agenta_backend/tests/variants_router/conftest.py +++ b/agenta-backend/agenta_backend/tests/variants_router/conftest.py @@ -1,7 +1,6 @@ import pytest import logging -from agenta_backend.models.db_engine import DBEngine from agenta_backend.models.db_models import ( AppDB, UserDB, @@ -14,9 +13,7 @@ from agenta_backend.services import selectors -# Initialize database engine -engine = DBEngine().engine() - +# Initialize logger logger = logging.getLogger(__name__) logger.setLevel(logging.DEBUG) @@ -25,17 +22,16 @@ async def get_first_user_object(): """Get the user object from the database or create a new one if not found.""" - user = await engine.find_one(UserDB, UserDB.uid == "0") + user = await UserDB.find_one(UserDB.uid == "0") if user is None: create_user = UserDB(uid="0") - await engine.save(create_user) + await create_user.create() org = OrganizationDB(type="default", owner=str(create_user.id)) - await engine.save(org) + await org.create() create_user.organizations.append(org.id) - await engine.save(create_user) - await engine.save(org) + await create_user.save() return create_user return user @@ -45,19 +41,18 @@ async def get_first_user_object(): async def get_second_user_object(): """Create a second user object.""" - user = await engine.find_one(UserDB, UserDB.uid == "1") + user = await UserDB.find_one(UserDB.uid == "1") if user is None: create_user = UserDB( uid="1", username="test_user1", email="test_user1@email.com" ) - await engine.save(create_user) + await create_user.create() org = OrganizationDB(type="default", owner=str(create_user.id)) - await engine.save(org) + await org.create() create_user.organizations.append(org.id) - await engine.save(create_user) - await engine.save(org) + await create_user.save() return create_user return user @@ -69,7 +64,7 @@ async def get_first_user_app(get_first_user_object): organization = await selectors.get_user_own_org(user.uid) app = AppDB(app_name="myapp", organization=organization, user=user) - await engine.save(app) + await app.create() db_image = ImageDB( docker_id="sha256:xxxxxxxxxxxxxxxxxxxxxxxxxxxxxx", @@ -77,18 +72,18 @@ async def get_first_user_app(get_first_user_object): user=user, organization=organization, ) - await engine.save(db_image) + await db_image.create() db_config = ConfigDB( config_name="default", parameters={}, ) - await engine.save(db_config) + await db_config.create() db_base = VariantBaseDB( base_name="app", image=db_image, organization=organization, user=user, app=app ) - await engine.save(db_base) + await db_base.create() appvariant = AppVariantDB( app=app, @@ -102,6 +97,6 @@ async def get_first_user_app(get_first_user_object): base=db_base, config=db_config, ) - await engine.save(appvariant) + await appvariant.create() return appvariant, user, organization, app, db_image, db_config, db_base diff --git a/agenta-backend/agenta_backend/tests/variants_router/test_app_variant_router.py b/agenta-backend/agenta_backend/tests/variants_router/test_app_variant_router.py index 65df2ea317..d7e6a43a51 100644 --- a/agenta-backend/agenta_backend/tests/variants_router/test_app_variant_router.py +++ b/agenta-backend/agenta_backend/tests/variants_router/test_app_variant_router.py @@ -5,7 +5,6 @@ from bson import ObjectId from agenta_backend.routers import app_router -from 
agenta_backend.models.db_engine import DBEngine from agenta_backend.services import selectors, db_manager from agenta_backend.models.db_models import ( AppDB, @@ -16,9 +15,6 @@ ) -# Initialize database engine -engine = DBEngine().engine() - # Initialize http client test_client = httpx.AsyncClient() timeout = httpx.Timeout(timeout=5, read=None, write=5) @@ -66,7 +62,7 @@ async def test_list_apps(): async def test_create_app_variant(get_first_user_object): user = await get_first_user_object organization = await selectors.get_user_own_org(user.uid) - app = await engine.find_one(AppDB, AppDB.app_name == "app_variant_test") + app = await AppDB.find_one(AppDB.app_name == "app_variant_test") db_image = ImageDB( docker_id="sha256:xxxxxxxxxxxxxxxxxxxxxxxxxxxxxx", @@ -74,13 +70,13 @@ async def test_create_app_variant(get_first_user_object): user=user, organization=organization, ) - await engine.save(db_image) + await db_image.create() db_config = ConfigDB( config_name="default", parameters={}, ) - await engine.save(db_config) + await db_config.create() db_base = VariantBaseDB( base_name="app", @@ -89,7 +85,7 @@ async def test_create_app_variant(get_first_user_object): user=user, image=db_image, ) - await engine.save(db_base) + await db_base.create() appvariant = AppVariantDB( app=app, @@ -103,7 +99,7 @@ async def test_create_app_variant(get_first_user_object): base=db_base, config=db_config, ) - await engine.save(appvariant) + await appvariant.create() response = await test_client.get(f"{BACKEND_API_HOST}/apps/{str(app.id)}/variants/") assert response.status_code == 200 @@ -112,7 +108,7 @@ async def test_create_app_variant(get_first_user_object): @pytest.mark.asyncio async def test_list_app_variants(): - app_db = await engine.find_one(AppDB, AppDB.app_name == "app_variant_test") + app_db = await AppDB.find_one(AppDB.app_name == "app_variant_test") response = await test_client.get( f"{BACKEND_API_HOST}/apps/{str(app_db.id)}/variants/" ) @@ -131,7 +127,7 @@ async def test_delete_app_without_permission(get_second_user_object): organization=user2_organization, user=user2, ) - await engine.save(user2_app) + await user2_app.create() response = await test_client.delete( f"{BACKEND_API_HOST}/apps/{str(user2_app.id)}/", @@ -142,7 +138,7 @@ async def test_delete_app_without_permission(get_second_user_object): @pytest.mark.asyncio async def test_list_environments(): - app = await engine.find_one(AppDB, AppDB.app_name == "app_variant_test") + app = await AppDB.find_one(AppDB.app_name == "app_variant_test") response = await test_client.get( f"{BACKEND_API_HOST}/apps/{str(app.id)}/environments/" ) @@ -165,8 +161,8 @@ async def test_get_variant_by_env(get_first_user_app): @pytest.mark.asyncio async def test_remove_app(): - app = await engine.find_one(AppDB, AppDB.app_name == "app_variant_test") - await engine.delete(app) + app = await AppDB.find_one(AppDB.app_name == "app_variant_test") + await app.delete() - app = await engine.find_one(AppDB, AppDB.app_name == "app_variant_test") + app = await AppDB.find_one(AppDB, AppDB.app_name == "app_variant_test") assert app == None From 9d8549b232bccfa2ca9a0f4dc1d7b3191c3c2917 Mon Sep 17 00:00:00 2001 From: Akrem Abayed Date: Fri, 5 Jan 2024 20:42:14 +0100 Subject: [PATCH 276/414] add direct use in evaluator config --- agenta-backend/agenta_backend/models/api/evaluation_model.py | 2 ++ agenta-backend/agenta_backend/models/converters.py | 1 + agenta-backend/agenta_backend/models/db_models.py | 1 + agenta-backend/agenta_backend/routers/evaluators_router.py | 1 + 
agenta-backend/agenta_backend/services/db_manager.py | 2 ++ agenta-backend/agenta_backend/services/evaluator_manager.py | 2 ++ 6 files changed, 9 insertions(+) diff --git a/agenta-backend/agenta_backend/models/api/evaluation_model.py b/agenta-backend/agenta_backend/models/api/evaluation_model.py index a1c3571fc6..3b748f3286 100644 --- a/agenta-backend/agenta_backend/models/api/evaluation_model.py +++ b/agenta-backend/agenta_backend/models/api/evaluation_model.py @@ -16,6 +16,7 @@ class EvaluatorConfig(BaseModel): id: str name: str evaluator_key: str + direct_use: bool settings_values: Optional[Dict[str, Any]] created_at: datetime updated_at: datetime @@ -270,6 +271,7 @@ class NewEvaluatorConfig(BaseModel): app_id: str name: str evaluator_key: str + direct_use: bool settings_values: dict diff --git a/agenta-backend/agenta_backend/models/converters.py b/agenta-backend/agenta_backend/models/converters.py index a34bdf1931..514d5c47ec 100644 --- a/agenta-backend/agenta_backend/models/converters.py +++ b/agenta-backend/agenta_backend/models/converters.py @@ -385,6 +385,7 @@ def evaluator_config_db_to_pydantic(evaluator_config: EvaluatorConfigDB): id=str(evaluator_config.id), name=evaluator_config.name, evaluator_key=evaluator_config.evaluator_key, + direct_use=evaluator_config.direct_use, settings_values=evaluator_config.settings_values, created_at=evaluator_config.created_at, updated_at=evaluator_config.updated_at, diff --git a/agenta-backend/agenta_backend/models/db_models.py b/agenta-backend/agenta_backend/models/db_models.py index 90e56781cc..8871bc26df 100644 --- a/agenta-backend/agenta_backend/models/db_models.py +++ b/agenta-backend/agenta_backend/models/db_models.py @@ -214,6 +214,7 @@ class EvaluatorConfigDB(Model): user: UserDB = Reference(key_name="user") name: str evaluator_key: str + direct_use: bool settings_values: Optional[Dict[str, Any]] = None created_at: datetime = Field(default=datetime.utcnow()) updated_at: datetime = Field(default=datetime.utcnow()) diff --git a/agenta-backend/agenta_backend/routers/evaluators_router.py b/agenta-backend/agenta_backend/routers/evaluators_router.py index 71dafc5c43..10073433f5 100644 --- a/agenta-backend/agenta_backend/routers/evaluators_router.py +++ b/agenta-backend/agenta_backend/routers/evaluators_router.py @@ -102,6 +102,7 @@ async def create_new_evaluator_config( app_id=payload.app_id, name=payload.name, evaluator_key=payload.evaluator_key, + direct_use=payload.direct_use, settings_values=payload.settings_values, ) return evaluator_config diff --git a/agenta-backend/agenta_backend/services/db_manager.py b/agenta-backend/agenta_backend/services/db_manager.py index ad4cb1c014..a980383648 100644 --- a/agenta-backend/agenta_backend/services/db_manager.py +++ b/agenta-backend/agenta_backend/services/db_manager.py @@ -1797,6 +1797,7 @@ async def create_evaluator_config( organization: OrganizationDB, name: str, evaluator_key: str, + direct_use: bool, settings_values: Optional[Dict[str, Any]] = None, ) -> EvaluatorConfigDB: """Create a new evaluator configuration in the database.""" @@ -1807,6 +1808,7 @@ async def create_evaluator_config( organization=organization, name=name, evaluator_key=evaluator_key, + direct_use=direct_use, settings_values=settings_values, ) diff --git a/agenta-backend/agenta_backend/services/evaluator_manager.py b/agenta-backend/agenta_backend/services/evaluator_manager.py index faceface00..738678916e 100644 --- a/agenta-backend/agenta_backend/services/evaluator_manager.py +++ 
b/agenta-backend/agenta_backend/services/evaluator_manager.py @@ -43,6 +43,7 @@ async def create_evaluator_config( app_id: str, name: str, evaluator_key: str, + direct_use: bool, settings_values: Optional[Dict[str, Any]] = None, ) -> EvaluatorConfig: """ @@ -64,6 +65,7 @@ async def create_evaluator_config( user=app.user, name=name, evaluator_key=evaluator_key, + direct_use=direct_use, settings_values=settings_values, ) return evaluator_config_db_to_pydantic(evaluator_config=evaluator_config) From 23aa54843ebf7ebabcf7cb7aed5799f238e03948 Mon Sep 17 00:00:00 2001 From: MohammedMaaz Date: Sat, 6 Jan 2024 00:47:26 +0500 Subject: [PATCH 277/414] comparisonResult type --- agenta-web/dev.Dockerfile | 46 ++++++++++---------- agenta-web/src/lib/Types.ts | 12 +++++ agenta-web/src/services/evaluations/index.ts | 11 +++++ 3 files changed, 46 insertions(+), 23 deletions(-) diff --git a/agenta-web/dev.Dockerfile b/agenta-web/dev.Dockerfile index 6f86dbd847..9c4ce68303 100644 --- a/agenta-web/dev.Dockerfile +++ b/agenta-web/dev.Dockerfile @@ -3,24 +3,24 @@ FROM node:18-alpine WORKDIR /app # Install dependencies based on the preferred package manager -COPY package.json yarn.lock* package-lock.json* pnpm-lock.yaml* ./ -RUN \ - if [ -f yarn.lock ]; then yarn --frozen-lockfile; \ - elif [ -f package-lock.json ]; then npm i; \ - elif [ -f pnpm-lock.yaml ]; then yarn global add pnpm && pnpm i; \ - # Allow install without lockfile, so example works even without Node.js installed locally - else echo "Warning: Lockfile not found. It is recommended to commit lockfiles to version control." && yarn install; \ - fi +# COPY package.json yarn.lock* package-lock.json* pnpm-lock.yaml* ./ +# RUN \ +# if [ -f yarn.lock ]; then yarn --frozen-lockfile; \ +# elif [ -f package-lock.json ]; then npm i; \ +# elif [ -f pnpm-lock.yaml ]; then yarn global add pnpm && pnpm i; \ +# # Allow install without lockfile, so example works even without Node.js installed locally +# else echo "Warning: Lockfile not found. It is recommended to commit lockfiles to version control." && yarn install; \ +# fi -COPY src ./src -COPY public ./public -COPY next.config.js . -COPY tsconfig.json . -COPY postcss.config.js . -COPY .env . -RUN if [ -f .env.local ]; then cp .env.local .; fi -# used in cloud -COPY sentry.* . +# COPY src ./src +# COPY public ./public +# COPY next.config.js . +# COPY tsconfig.json . +# COPY postcss.config.js . +# COPY .env . +# RUN if [ -f .env.local ]; then cp .env.local .; fi +# # used in cloud +# COPY sentry.* . # Next.js collects completely anonymous telemetry data about general usage. Learn more here: https://nextjs.org/telemetry # Uncomment the following line to disable telemetry at run time # ENV NEXT_TELEMETRY_DISABLED 1 @@ -28,10 +28,10 @@ COPY sentry.* . 
# Note: Don't expose ports here, Compose will handle that for us # Start Next.js in development mode based on the preferred package manager -CMD \ - if [ -f yarn.lock ]; then yarn dev; \ - elif [ -f package-lock.json ]; then npm run dev; \ - elif [ -f pnpm-lock.yaml ]; then pnpm dev; \ - else yarn dev; \ - fi +# CMD \ +# if [ -f yarn.lock ]; then yarn dev; \ +# elif [ -f package-lock.json ]; then npm run dev; \ +# elif [ -f pnpm-lock.yaml ]; then pnpm dev; \ +# else yarn dev; \ +# fi diff --git a/agenta-web/src/lib/Types.ts b/agenta-web/src/lib/Types.ts index 12c424f3bf..1e307f2878 100644 --- a/agenta-web/src/lib/Types.ts +++ b/agenta-web/src/lib/Types.ts @@ -395,3 +395,15 @@ export interface AnnotationScenario { note?: string result: TypedValue } + +export interface ComparisonResult { + inputs: {name: string; value: string}[] + correct_answer: string + variants: { + variant_id: string + variant_name: string + evaluation_id: string + evaluator_configs: EvaluatorConfig[] + }[] + data: _EvaluationScenario[] +} diff --git a/agenta-web/src/services/evaluations/index.ts b/agenta-web/src/services/evaluations/index.ts index c2044387fa..2270330be8 100644 --- a/agenta-web/src/services/evaluations/index.ts +++ b/agenta-web/src/services/evaluations/index.ts @@ -2,6 +2,7 @@ import axios from "@/lib//helpers/axiosConfig" import { Annotation, AnnotationScenario, + ComparisonResult, EvaluationStatus, Evaluator, EvaluatorConfig, @@ -196,3 +197,13 @@ export const updateAnnotationScenario = async ( data, ) } + +// Comparison +export const fetchAllComparisonResults = async (evaluationIds: string[]) => { + const response = await axios.get(`/api/evaluations/evaluation_scenarios/comparison-results`, { + params: { + evaluations_ids: evaluationIds.join(","), + }, + }) + return response.data as ComparisonResult[] +} From 8e4fe8f93c0d738c4be2b00ae4858d9395af012d Mon Sep 17 00:00:00 2001 From: Akrem Abayed Date: Fri, 5 Jan 2024 21:39:38 +0100 Subject: [PATCH 278/414] remove direct-use --- agenta-backend/agenta_backend/models/api/evaluation_model.py | 2 -- agenta-backend/agenta_backend/models/converters.py | 1 - agenta-backend/agenta_backend/models/db_models.py | 1 - agenta-backend/agenta_backend/services/db_manager.py | 2 -- 4 files changed, 6 deletions(-) diff --git a/agenta-backend/agenta_backend/models/api/evaluation_model.py b/agenta-backend/agenta_backend/models/api/evaluation_model.py index 3b748f3286..a1c3571fc6 100644 --- a/agenta-backend/agenta_backend/models/api/evaluation_model.py +++ b/agenta-backend/agenta_backend/models/api/evaluation_model.py @@ -16,7 +16,6 @@ class EvaluatorConfig(BaseModel): id: str name: str evaluator_key: str - direct_use: bool settings_values: Optional[Dict[str, Any]] created_at: datetime updated_at: datetime @@ -271,7 +270,6 @@ class NewEvaluatorConfig(BaseModel): app_id: str name: str evaluator_key: str - direct_use: bool settings_values: dict diff --git a/agenta-backend/agenta_backend/models/converters.py b/agenta-backend/agenta_backend/models/converters.py index 514d5c47ec..a34bdf1931 100644 --- a/agenta-backend/agenta_backend/models/converters.py +++ b/agenta-backend/agenta_backend/models/converters.py @@ -385,7 +385,6 @@ def evaluator_config_db_to_pydantic(evaluator_config: EvaluatorConfigDB): id=str(evaluator_config.id), name=evaluator_config.name, evaluator_key=evaluator_config.evaluator_key, - direct_use=evaluator_config.direct_use, settings_values=evaluator_config.settings_values, created_at=evaluator_config.created_at, updated_at=evaluator_config.updated_at, diff --git 
a/agenta-backend/agenta_backend/models/db_models.py b/agenta-backend/agenta_backend/models/db_models.py index 8871bc26df..90e56781cc 100644 --- a/agenta-backend/agenta_backend/models/db_models.py +++ b/agenta-backend/agenta_backend/models/db_models.py @@ -214,7 +214,6 @@ class EvaluatorConfigDB(Model): user: UserDB = Reference(key_name="user") name: str evaluator_key: str - direct_use: bool settings_values: Optional[Dict[str, Any]] = None created_at: datetime = Field(default=datetime.utcnow()) updated_at: datetime = Field(default=datetime.utcnow()) diff --git a/agenta-backend/agenta_backend/services/db_manager.py b/agenta-backend/agenta_backend/services/db_manager.py index a980383648..ad4cb1c014 100644 --- a/agenta-backend/agenta_backend/services/db_manager.py +++ b/agenta-backend/agenta_backend/services/db_manager.py @@ -1797,7 +1797,6 @@ async def create_evaluator_config( organization: OrganizationDB, name: str, evaluator_key: str, - direct_use: bool, settings_values: Optional[Dict[str, Any]] = None, ) -> EvaluatorConfigDB: """Create a new evaluator configuration in the database.""" @@ -1808,7 +1807,6 @@ async def create_evaluator_config( organization=organization, name=name, evaluator_key=evaluator_key, - direct_use=direct_use, settings_values=settings_values, ) From 90277bc08e47342a38d102f9a5ce1856a931fd5c Mon Sep 17 00:00:00 2001 From: Akrem Abayed Date: Fri, 5 Jan 2024 21:40:25 +0100 Subject: [PATCH 279/414] small method refactor --- .../routers/evaluators_router.py | 21 ++++++--------- .../services/evaluator_manager.py | 27 ++++++++++++++++--- 2 files changed, 32 insertions(+), 16 deletions(-) diff --git a/agenta-backend/agenta_backend/routers/evaluators_router.py b/agenta-backend/agenta_backend/routers/evaluators_router.py index 10073433f5..998bacd281 100644 --- a/agenta-backend/agenta_backend/routers/evaluators_router.py +++ b/agenta-backend/agenta_backend/routers/evaluators_router.py @@ -33,25 +33,21 @@ @router.get("/", response_model=List[Evaluator]) -async def get_evaluators(): - """Fetches a list of evaluators from the hardcoded JSON file. +async def get_evaluators_endpoint(): + """ + Endpoint to fetch a list of evaluators. Returns: List[Evaluator]: A list of evaluator objects. 
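The refactor in this hunk moves the JSON loading out of the route handler and into evaluator_manager.get_evaluators(); the router (in the hunk body below) keeps only the HTTP mapping, turning a None result into a 500 and an empty list into a 404, and the diff for the new helper follows further down. A later patch in this series (the ready-to-use evaluators change in app_router.py) filters the same list on its direct_use flag when an app is created from a template. A minimal sketch of that flow is given here; the sample JSON shape is an assumption for illustration, since only the name, key and direct_use fields are visible in these diffs:

import json

# Hypothetical stand-in for agenta_backend/resources/evaluators/evaluators.json.
# Only "name", "key" and "direct_use" are relied on by the code in this series;
# the "settings_template" field and the concrete values are illustrative.
SAMPLE_EVALUATORS_JSON = """
[
  {"name": "Exact Match", "key": "auto_exact_match", "direct_use": true,
   "settings_template": {}},
  {"name": "Similarity Match", "key": "auto_similarity_match", "direct_use": false,
   "settings_template": {"similarity_threshold": {"type": "number", "default": "0.5"}}}
]
"""


def load_evaluators(raw: str):
    # Mirrors get_evaluators(): parse the JSON and treat any failure as "no evaluators".
    try:
        return json.loads(raw)
    except Exception:
        return None


evaluators = load_evaluators(SAMPLE_EVALUATORS_JSON)
if evaluators is None:
    raise RuntimeError("Error processing evaluators file")  # the endpoint raises a 500 here
if not evaluators:
    raise RuntimeError("No evaluators found")  # the endpoint raises a 404 here

# app_router's create-from-template flow keeps only the ready-to-use entries.
direct_use_evaluators = [e for e in evaluators if e.get("direct_use")]
print([e["key"] for e in direct_use_evaluators])  # ['auto_exact_match']
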
""" - file_path = "agenta_backend/resources/evaluators/evaluators.json" + evaluators = evaluator_manager.get_evaluators() - if not os.path.exists(file_path): - raise HTTPException(status_code=404, detail="Evaluators file not found") + if evaluators is None: + raise HTTPException(status_code=500, detail="Error processing evaluators file") - try: - with open(file_path, "r") as file: - evaluators = json.load(file) - except Exception as e: - raise HTTPException( - status_code=500, detail=f"Error reading evaluators file: {str(e)}" - ) + if not evaluators: + raise HTTPException(status_code=404, detail="No evaluators found") return evaluators @@ -102,7 +98,6 @@ async def create_new_evaluator_config( app_id=payload.app_id, name=payload.name, evaluator_key=payload.evaluator_key, - direct_use=payload.direct_use, settings_values=payload.settings_values, ) return evaluator_config diff --git a/agenta-backend/agenta_backend/services/evaluator_manager.py b/agenta-backend/agenta_backend/services/evaluator_manager.py index 738678916e..31642aadc5 100644 --- a/agenta-backend/agenta_backend/services/evaluator_manager.py +++ b/agenta-backend/agenta_backend/services/evaluator_manager.py @@ -1,13 +1,36 @@ +import json +import os from typing import Any, Dict, Optional, List from agenta_backend.services import db_manager from agenta_backend.models.db_models import EvaluatorConfigDB -from agenta_backend.models.api.evaluation_model import EvaluatorConfig +from agenta_backend.models.api.evaluation_model import Evaluator, EvaluatorConfig from agenta_backend.models.converters import evaluator_config_db_to_pydantic +def get_evaluators() -> Optional[List[Evaluator]]: + """ + Fetches a list of evaluators from a JSON file. + + Returns: + Optional[List[Evaluator]]: A list of evaluator objects or None if an error occurs. + """ + + file_path = "agenta_backend/resources/evaluators/evaluators.json" + + if not os.path.exists(file_path): + return None + + try: + with open(file_path, "r") as file: + evaluators = json.load(file) + return evaluators + except Exception: + return None + + async def get_evaluators_configs(app_id: str) -> List[EvaluatorConfig]: """ Get evaluators configs by app_id. 
@@ -43,7 +66,6 @@ async def create_evaluator_config( app_id: str, name: str, evaluator_key: str, - direct_use: bool, settings_values: Optional[Dict[str, Any]] = None, ) -> EvaluatorConfig: """ @@ -65,7 +87,6 @@ async def create_evaluator_config( user=app.user, name=name, evaluator_key=evaluator_key, - direct_use=direct_use, settings_values=settings_values, ) return evaluator_config_db_to_pydantic(evaluator_config=evaluator_config) From 4a76e0a657e88987c944f6cf288b7cbc80376fbc Mon Sep 17 00:00:00 2001 From: Akrem Abayed Date: Fri, 5 Jan 2024 21:41:08 +0100 Subject: [PATCH 280/414] add ready to use evaluators --- .../agenta_backend/routers/app_router.py | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/agenta-backend/agenta_backend/routers/app_router.py b/agenta-backend/agenta_backend/routers/app_router.py index 857e4ce181..15a9339eea 100644 --- a/agenta-backend/agenta_backend/routers/app_router.py +++ b/agenta-backend/agenta_backend/routers/app_router.py @@ -10,6 +10,7 @@ from agenta_backend.services import ( app_manager, db_manager, + evaluator_manager, ) from agenta_backend.utils.common import ( check_access_to_app, @@ -393,7 +394,23 @@ async def create_app_and_variant_from_template( **user_org_data, ) - logger.debug("Step 8: Starting variant and injecting environment variables") + logger.debug("Step 8: We create ready-to use evaluators") + evaluators = evaluator_manager.get_evaluators() + direct_use_evaluators = [ + evaluator for evaluator in evaluators if evaluator.get("direct_use") + ] + + for evaluator in direct_use_evaluators: + await db_manager.create_evaluator_config( + app=app, + organization=app.organization, + user=app.user, + name=evaluator["name"], + evaluator_key=evaluator["key"], + settings_values={}, + ) + + logger.debug("Step 9: Starting variant and injecting environment variables") if os.environ["FEATURE_FLAG"] in ["cloud", "ee"]: if not os.environ["OPENAI_API_KEY"]: raise Exception( From 777fb497207d8db7e4fdbe9c6c7eeba31fa626e8 Mon Sep 17 00:00:00 2001 From: Abram Date: Sat, 6 Jan 2024 13:18:34 +0100 Subject: [PATCH 281/414] Update - modified logic to aggregate results for single model test --- .../agenta_backend/services/results_service.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/agenta-backend/agenta_backend/services/results_service.py b/agenta-backend/agenta_backend/services/results_service.py index 63293d9d86..dc9d9e8df2 100644 --- a/agenta-backend/agenta_backend/services/results_service.py +++ b/agenta-backend/agenta_backend/services/results_service.py @@ -72,13 +72,14 @@ async def _compute_stats_for_human_a_b_testing_evaluation(evaluation_scenarios: async def fetch_results_for_single_model_test(evaluation_id: str): - pipeline = [ - {"$match": {"evaluations": ObjectId(evaluation_id)}}, - {"$group": {"_id": "$score", "count": {"$sum": 1}}}, - ] - - results = await HumanEvaluationScenarioDB.aggregate(pipeline).to_list(length=None) - return {result._id: result.count for result in results} + results = await HumanEvaluationScenarioDB.find( + HumanEvaluationScenarioDB.evaluation.id == ObjectId(evaluation_id) + ).to_list() + scores_and_counts = {} + for result in results: + score = result.score + scores_and_counts[score] = scores_and_counts.get(score, 0) + 1 + return scores_and_counts async def fetch_average_score_for_custom_code_run(evaluation_id: str) -> float: From 8fda5d65acc1d74f47c470897ce1a1b39a25b572 Mon Sep 17 00:00:00 2001 From: Abram Date: Sat, 6 Jan 2024 13:22:56 +0100 Subject: [PATCH 
282/414] Update - change from evaluation to annotations --- .../src/components/Evaluations/AutomaticEvaluationResult.tsx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/agenta-web/src/components/Evaluations/AutomaticEvaluationResult.tsx b/agenta-web/src/components/Evaluations/AutomaticEvaluationResult.tsx index 3248322f78..b4d4379bb4 100644 --- a/agenta-web/src/components/Evaluations/AutomaticEvaluationResult.tsx +++ b/agenta-web/src/components/Evaluations/AutomaticEvaluationResult.tsx @@ -132,7 +132,7 @@ export default function AutomaticEvaluationResult() { EvaluationType[evaluation.evaluationType as keyof typeof EvaluationType] if (evaluationType === EvaluationType.single_model_test) { - router.push(`/apps/${app_id}/evaluations/${evaluation.key}/single_model_test`) + router.push(`/apps/${app_id}/annotations/${evaluation.key}/single_model_test`) } } From bb23323b62ebe24bb62d2eb5a6f1536ecbdc1f65 Mon Sep 17 00:00:00 2001 From: Abram Date: Sat, 6 Jan 2024 15:51:38 +0100 Subject: [PATCH 283/414] Update - save variant configuration and default variant parameters --- agenta-backend/agenta_backend/services/app_manager.py | 2 +- agenta-backend/agenta_backend/services/db_manager.py | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/agenta-backend/agenta_backend/services/app_manager.py b/agenta-backend/agenta_backend/services/app_manager.py index b15c335c4f..a5ebdcb000 100644 --- a/agenta-backend/agenta_backend/services/app_manager.py +++ b/agenta-backend/agenta_backend/services/app_manager.py @@ -461,7 +461,7 @@ async def add_variant_based_on_image( image=db_image, user=user_instance, organization=app.organization, - parameters={}, + parameters=config_db.parameters, base_name=base_name, config_name=config_name, base=db_base, diff --git a/agenta-backend/agenta_backend/services/db_manager.py b/agenta-backend/agenta_backend/services/db_manager.py index 72b49e06b6..b573cc4f90 100644 --- a/agenta-backend/agenta_backend/services/db_manager.py +++ b/agenta-backend/agenta_backend/services/db_manager.py @@ -1153,7 +1153,11 @@ async def update_variant_parameters( config_db.current_version = new_version config_db.parameters = parameters + # Update variant parameters + app_variant_db.parameters = config_db.parameters + # Save updated ConfigDB + await app_variant_db.save() await config_db.save() except Exception as e: From fa234eca57549ef0e60a3bae1b29f81e2fd2656a Mon Sep 17 00:00:00 2001 From: Abram Date: Sat, 6 Jan 2024 17:15:15 +0100 Subject: [PATCH 284/414] Update - refactor testcases --- .../agenta_backend/tests/conftest.py | 7 +- .../tests/observability_router/conftest.py | 120 ------- .../variants_evaluators_router/conftest.py | 118 ------- .../tests/variants_main_router/conftest.py | 330 ++++++++++++++++++ .../test_app_variant_router.py | 11 +- .../test_observability_router.py | 30 +- .../test_variant_evaluators_router.py} | 37 +- .../test_variant_testset_router.py} | 15 +- .../test_organization_router.py | 0 .../tests/variants_router/conftest.py | 102 ------ .../test_user_profile.py | 0 11 files changed, 376 insertions(+), 394 deletions(-) delete mode 100644 agenta-backend/agenta_backend/tests/observability_router/conftest.py delete mode 100644 agenta-backend/agenta_backend/tests/variants_evaluators_router/conftest.py create mode 100644 agenta-backend/agenta_backend/tests/variants_main_router/conftest.py rename agenta-backend/agenta_backend/tests/{variants_router => variants_main_router}/test_app_variant_router.py (93%) rename 
agenta-backend/agenta_backend/tests/{observability_router => variants_main_router}/test_observability_router.py (90%) rename agenta-backend/agenta_backend/tests/{variants_evaluators_router/test_evaluators_router.py => variants_main_router/test_variant_evaluators_router.py} (88%) rename agenta-backend/agenta_backend/tests/{testset_router/test_testset_router.py => variants_main_router/test_variant_testset_router.py} (86%) rename agenta-backend/agenta_backend/tests/{organization_router => variants_organization_router}/test_organization_router.py (100%) delete mode 100644 agenta-backend/agenta_backend/tests/variants_router/conftest.py rename agenta-backend/agenta_backend/tests/{user_profile_router => variants_user_profile_router}/test_user_profile.py (100%) diff --git a/agenta-backend/agenta_backend/tests/conftest.py b/agenta-backend/agenta_backend/tests/conftest.py index 0fb5fd275d..ac447d1c51 100644 --- a/agenta-backend/agenta_backend/tests/conftest.py +++ b/agenta-backend/agenta_backend/tests/conftest.py @@ -3,8 +3,6 @@ from agenta_backend.models.db_engine import DBEngine -engine = DBEngine().engine() - @pytest.fixture(scope="session", autouse=True) def event_loop(): @@ -14,7 +12,10 @@ def event_loop(): asyncio.set_event_loop(res) res._close = res.close + # Initialize beanie + res.run_until_complete(DBEngine().init_db()) + yield res res._close() # close event loop - DBEngine().remove_db() # drop database + # DBEngine().remove_db() # drop database diff --git a/agenta-backend/agenta_backend/tests/observability_router/conftest.py b/agenta-backend/agenta_backend/tests/observability_router/conftest.py deleted file mode 100644 index 92523e645e..0000000000 --- a/agenta-backend/agenta_backend/tests/observability_router/conftest.py +++ /dev/null @@ -1,120 +0,0 @@ -import pytest -from datetime import datetime - - -@pytest.fixture() -def spans_db_data(): - return [ - { - "parent_span_id": "string", - "meta": {}, - "event_name": "call", - "event_type": "fixture_call", - "start_time": str(datetime.utcnow()), - "duration": 8.30, - "status": "initiated", - "end_time": str(datetime.utcnow()), - "inputs": ["string"], - "outputs": ["string"], - "prompt_template": "string", - "tokens_input": 80, - "tokens_output": 25, - "token_total": 105, - "cost": 0.23, - "tags": ["string"], - }, - { - "parent_span_id": "string", - "meta": {}, - "event_name": "call", - "event_type": "fixture_call", - "start_time": str(datetime.utcnow()), - "duration": 13.30, - "status": "initiated", - "end_time": str(datetime.utcnow()), - "inputs": ["string"], - "outputs": ["string"], - "prompt_template": "string", - "tokens_input": 58, - "tokens_output": 65, - "token_total": 123, - "cost": 0.19, - "tags": ["string"], - }, - { - "parent_span_id": "string", - "meta": {}, - "event_name": "call", - "event_type": "fixture_call", - "start_time": str(datetime.utcnow()), - "duration": 18.30, - "status": "initiated", - "end_time": str(datetime.utcnow()), - "inputs": ["string"], - "outputs": ["string"], - "prompt_template": "string", - "tokens_input": 100, - "tokens_output": 35, - "token_total": 135, - "cost": 0.54, - "tags": ["string"], - }, - ] - - -@pytest.fixture() -def image_create_data(): - return { - "docker_id": "sha256:xxxxxxxxxxxxxxxxxxxxxxxxxxxxx", - "tags": "agentaai/templates_v2:local_test_prompt", - "created_at": datetime.utcnow(), - "updated_at": datetime.utcnow(), - } - - -@pytest.fixture() -def app_variant_create_data(): - return { - "variant_name": "v1", - "parameters": {}, - "created_at": datetime.utcnow(), - "updated_at": 
datetime.utcnow(), - } - - -@pytest.fixture() -def trace_create_data(): - return { - "cost": 0.782, - "latency": 20, - "status": "completed", - "token_consumption": 638, - "tags": ["string"], - "start_time": str(datetime.utcnow()), - "end_time": str(datetime.utcnow()), - } - - -@pytest.fixture() -def organization_create_data(): - return { - "name": "Agenta", - "description": "Agenta is a platform for building and deploying machine learning models.", - } - - -@pytest.fixture() -def user_create_data(): - return { - "uid": "0", - "username": "agenta", - "email": "demo@agenta.ai", - } - - -@pytest.fixture() -def feedbacks_create_data(): - return [ - {"feedback": "thumbs up", "score": 0, "meta": {}}, - {"feedback": "thumbs down", "score": 10, "meta": {}}, - ] diff --git a/agenta-backend/agenta_backend/tests/variants_evaluators_router/conftest.py b/agenta-backend/agenta_backend/tests/variants_evaluators_router/conftest.py deleted file mode 100644 index 2f05c10290..0000000000 --- a/agenta-backend/agenta_backend/tests/variants_evaluators_router/conftest.py +++ /dev/null @@ -1,118 +0,0 @@ -import os -import httpx -import pytest - -from agenta_backend.models.db_models import ( - UserDB, - OrganizationDB, -) - - -# Set global variables -ENVIRONMENT = os.environ.get("ENVIRONMENT") -OPEN_AI_KEY = os.environ.get("OPENAI_API_KEY") -if ENVIRONMENT == "development": - BACKEND_API_HOST = "http://host.docker.internal/api" -elif ENVIRONMENT == "github": - BACKEND_API_HOST = "http://agenta-backend-test:8000" - - -@pytest.fixture(scope="session") -def fetch_templates(): - response = httpx.get(f"{BACKEND_API_HOST}/containers/templates/") - response_data = response.json() - return response_data - - -@pytest.fixture(scope="session") -def use_open_ai_key(): - return OPEN_AI_KEY - - -@pytest.fixture(scope="session") -def fetch_single_prompt_template(fetch_templates): - return fetch_templates[1] - - -@pytest.fixture() -async def fetch_user_organization(): - organization = await OrganizationDB.find().to_list() - return {"org_id": str(organization[0].id)} - - -@pytest.fixture() -def app_from_template(): - return { - "app_name": "string", - "env_vars": {"OPENAI_API_KEY": OPEN_AI_KEY}, - "organization_id": "string", - "template_id": "string", - } - - -@pytest.fixture(scope="session") -async def fetch_user(): - user = await UserDB.find_one(UserDB.uid == "0", fetch_links=True) - return user - - -@pytest.fixture() -def auto_exact_match_evaluator_config(): - return { - "app_id": "string", - "name": "ExactMatchEvaluator", - "evaluator_key": "auto_exact_match", - "settings_values": {}, - } - - -@pytest.fixture() -def auto_similarity_match_evaluator_config(): - return { - "app_id": "string", - "name": "SimilarityMatchEvaluator", - "evaluator_key": "auto_similarity_match", - "settings_values": {"similarity_threshold": 0.3}, - } - - -@pytest.fixture() -def auto_regex_test_evaluator_config(): - return { - "app_id": "string", - "name": "RegexEvaluator", - "evaluator_key": "auto_regex_test", - "settings_values": { - "regex_pattern": "^value\\d{3}$", - "regex_should_match": False, - }, - } - - -@pytest.fixture() -def auto_webhook_test_evaluator_config(): - return { - "app_id": "string", - "name": "WebhookEvaluator", - "evaluator_key": "auto_webhook_test", - "settings_values": { - "webhook_url": f"{BACKEND_API_HOST}/evaluations/webhook_example_fake/", - "webhook_body": {}, - }, - } - - -@pytest.fixture() -def auto_ai_critique_evaluator_config(): - return { - "app_id": "string", - "name": "AICritique_Evaluator", - "evaluator_key": 
"auto_ai_critique", - "settings_values": { - "open_ai_key": OPEN_AI_KEY, - "temperature": 0.9, - "evaluation_prompt_template": "We have an LLM App that we want to evaluate its outputs. Based on the prompt and the parameters provided below evaluate the output based on the evaluation strategy below: Evaluation strategy: 0 to 10 0 is very bad and 10 is very good. Prompt: {llm_app_prompt_template} Inputs: country: {country} Correct Answer:{correct_answer} Evaluate this: {variant_output} Answer ONLY with one of the given grading or evaluation options.", - "llm_app_prompt_template": "", - "llm_app_inputs": [{"input_name": "country", "input_value": "tunisia"}], - }, - } diff --git a/agenta-backend/agenta_backend/tests/variants_main_router/conftest.py b/agenta-backend/agenta_backend/tests/variants_main_router/conftest.py new file mode 100644 index 0000000000..169cc8bf3c --- /dev/null +++ b/agenta-backend/agenta_backend/tests/variants_main_router/conftest.py @@ -0,0 +1,330 @@ +import os +import pytest +import logging +from datetime import datetime + +from agenta_backend.models.db_models import ( + AppDB, + UserDB, + VariantBaseDB, + ImageDB, + ConfigDB, + AppVariantDB, + OrganizationDB, +) +from agenta_backend.services import selectors + +import httpx + + +# Initialize logger +logger = logging.getLogger(__name__) +logger.setLevel(logging.DEBUG) + +# Set global variables +ENVIRONMENT = os.environ.get("ENVIRONMENT") +OPEN_AI_KEY = os.environ.get("OPENAI_API_KEY") +if ENVIRONMENT == "development": + BACKEND_API_HOST = "http://host.docker.internal/api" +elif ENVIRONMENT == "github": + BACKEND_API_HOST = "http://agenta-backend-test:8000" + + +@pytest.fixture() +async def get_first_user_object(): + """Get the user object from the database or create a new one if not found.""" + + user = await UserDB.find_one(UserDB.uid == "0") + if user is None: + create_user = UserDB(uid="0") + await create_user.create() + + org = OrganizationDB(type="default", owner=str(create_user.id)) + await org.create() + + create_user.organizations.append(org.id) + await create_user.save() + + return create_user + return user + + +@pytest.fixture() +async def get_second_user_object(): + """Create a second user object.""" + + user = await UserDB.find_one(UserDB.uid == "1") + if user is None: + create_user = UserDB( + uid="1", username="test_user1", email="test_user1@email.com" + ) + await create_user.create() + + org = OrganizationDB(type="default", owner=str(create_user.id)) + await org.create() + + create_user.organizations.append(org.id) + await create_user.save() + + return create_user + return user + + +@pytest.fixture() +async def get_first_user_app(get_first_user_object): + user = await get_first_user_object + organization = await selectors.get_user_own_org(user.uid) + + app = AppDB(app_name="myapp", organization=organization, user=user) + await app.create() + + db_image = ImageDB( + docker_id="sha256:xxxxxxxxxxxxxxxxxxxxxxxxxxxxxx", + tags="agentaai/templates_v2:local_test_prompt", + user=user, + organization=organization, + ) + await db_image.create() + + db_config = ConfigDB( + config_name="default", + parameters={}, + ) + await db_config.create() + + db_base = VariantBaseDB( + base_name="app", image=db_image, organization=organization, user=user, app=app + ) + await db_base.create() + + appvariant = AppVariantDB( + app=app, + variant_name="app", + image=db_image, + user=user, + organization=organization, + parameters={}, + base_name="app", + config_name="default", + base=db_base, + config=db_config, + ) + await 
appvariant.create() + return appvariant, user, organization, app, db_image, db_config, db_base + + +@pytest.fixture() +def spans_db_data(): + return [ + { + "parent_span_id": "string", + "meta": {}, + "event_name": "call", + "event_type": "fixture_call", + "start_time": str(datetime.utcnow()), + "duration": 8.30, + "status": "initiated", + "end_time": str(datetime.utcnow()), + "inputs": ["string"], + "outputs": ["string"], + "prompt_template": "string", + "tokens_input": 80, + "tokens_output": 25, + "token_total": 105, + "cost": 0.23, + "tags": ["string"], + }, + { + "parent_span_id": "string", + "meta": {}, + "event_name": "call", + "event_type": "fixture_call", + "start_time": str(datetime.utcnow()), + "duration": 13.30, + "status": "initiated", + "end_time": str(datetime.utcnow()), + "inputs": ["string"], + "outputs": ["string"], + "prompt_template": "string", + "tokens_input": 58, + "tokens_output": 65, + "token_total": 123, + "cost": 0.19, + "tags": ["string"], + }, + { + "parent_span_id": "string", + "meta": {}, + "event_name": "call", + "event_type": "fixture_call", + "start_time": str(datetime.utcnow()), + "duration": 18.30, + "status": "initiated", + "end_time": str(datetime.utcnow()), + "inputs": ["string"], + "outputs": ["string"], + "prompt_template": "string", + "tokens_input": 100, + "tokens_output": 35, + "token_total": 135, + "cost": 0.54, + "tags": ["string"], + }, + ] + + +@pytest.fixture() +def image_create_data(): + return { + "docker_id": "sha256:xxxxxxxxxxxxxxxxxxxxxxxxxxxxx", + "tags": "agentaai/templates_v2:local_test_prompt", + "created_at": datetime.utcnow(), + "updated_at": datetime.utcnow(), + } + + +@pytest.fixture() +def app_variant_create_data(): + return { + "variant_name": "v1", + "parameters": {}, + "created_at": datetime.utcnow(), + "updated_at": datetime.utcnow(), + } + + +@pytest.fixture() +def trace_create_data(): + return { + "cost": 0.782, + "latency": 20, + "status": "completed", + "token_consumption": 638, + "tags": ["string"], + "start_time": str(datetime.utcnow()), + "end_time": str(datetime.utcnow()), + } + + +@pytest.fixture() +def feedbacks_create_data(): + return [ + {"feedback": "thumbs up", "score": 0, "meta": {}}, + {"feedback": "thumbs down", "score": 10, "meta": {}}, + ] + + + +@pytest.fixture(scope="session") +def fetch_templates(): + response = httpx.get(f"{BACKEND_API_HOST}/containers/templates/") + response_data = response.json() + return response_data + + +@pytest.fixture(scope="session") +def use_open_ai_key(): + return OPEN_AI_KEY + + +@pytest.fixture(scope="session") +def fetch_single_prompt_template(fetch_templates): + return fetch_templates[1] + + +@pytest.fixture() +async def fetch_user_organization(): + organization = await OrganizationDB.find().to_list() + return {"org_id": str(organization[0].id)} + + +@pytest.fixture() +def app_from_template(): + return { + "app_name": "string", + "env_vars": {"OPENAI_API_KEY": OPEN_AI_KEY}, + "organization_id": "string", + "template_id": "string", + } + + +@pytest.fixture(scope="session") +async def fetch_user(): + user = await UserDB.find_one(UserDB.uid == "0", fetch_links=True) + return user + + +@pytest.fixture() +def update_app_variant_parameters(): + return { + "temperature": 1, + "model": "gpt-3.5-turbo", + "max_tokens": -1, + "prompt_system": "You are an expert in geography.", + "prompt_user": "What is the capital of {country}?", + "top_p": 1, + "frequence_penalty": 0, + "presence_penalty": 0 + } + + +@pytest.fixture() +def auto_exact_match_evaluator_config(): + return { + 
"app_id": "string", + "name": "ExactMatchEvaluator", + "evaluator_key": "auto_exact_match", + "settings_values": {}, + } + + +@pytest.fixture() +def auto_similarity_match_evaluator_config(): + return { + "app_id": "string", + "name": "SimilarityMatchEvaluator", + "evaluator_key": "auto_similarity_match", + "settings_values": {"similarity_threshold": 0.3}, + } + + +@pytest.fixture() +def auto_regex_test_evaluator_config(): + return { + "app_id": "string", + "name": "RegexEvaluator", + "evaluator_key": "auto_regex_test", + "settings_values": { + "regex_pattern": "^value\\d{3}$", + "regex_should_match": False, + }, + } + + +@pytest.fixture() +def auto_webhook_test_evaluator_config(): + return { + "app_id": "string", + "name": "WebhookEvaluator", + "evaluator_key": "auto_webhook_test", + "settings_values": { + "webhook_url": f"{BACKEND_API_HOST}/evaluations/webhook_example_fake/", + "webhook_body": {}, + }, + } + + +@pytest.fixture() +def auto_ai_critique_evaluator_config(): + return { + "app_id": "string", + "name": "AICritique_Evaluator", + "evaluator_key": "auto_ai_critique", + "settings_values": { + "open_ai_key": OPEN_AI_KEY, + "temperature": 0.9, + "evaluation_prompt_template": "We have an LLM App that we want to evaluate its outputs. Based on the prompt and the parameters provided below evaluate the output based on the evaluation strategy below: Evaluation strategy: 0 to 10 0 is very bad and 10 is very good. Prompt: {llm_app_prompt_template} Inputs: country: {country} Correct Answer:{correct_answer} Evaluate this: {variant_output} Answer ONLY with one of the given grading or evaluation options.", + "llm_app_prompt_template": "", + "llm_app_inputs": [{"input_name": "country", "input_value": "tunisia"}], + }, + } diff --git a/agenta-backend/agenta_backend/tests/variants_router/test_app_variant_router.py b/agenta-backend/agenta_backend/tests/variants_main_router/test_app_variant_router.py similarity index 93% rename from agenta-backend/agenta_backend/tests/variants_router/test_app_variant_router.py rename to agenta-backend/agenta_backend/tests/variants_main_router/test_app_variant_router.py index d7e6a43a51..f75a8c9b3d 100644 --- a/agenta-backend/agenta_backend/tests/variants_router/test_app_variant_router.py +++ b/agenta-backend/agenta_backend/tests/variants_main_router/test_app_variant_router.py @@ -55,7 +55,7 @@ async def test_list_apps(): response = await test_client.get(f"{BACKEND_API_HOST}/apps/") assert response.status_code == 200 - assert len(response.json()) == 3 + assert len(response.json()) == 1 @pytest.mark.asyncio @@ -157,12 +157,3 @@ async def test_get_variant_by_env(get_first_user_app): app_id=str(app.id), environment=environment.name ) assert response == [] - - -@pytest.mark.asyncio -async def test_remove_app(): - app = await AppDB.find_one(AppDB.app_name == "app_variant_test") - await app.delete() - - app = await AppDB.find_one(AppDB, AppDB.app_name == "app_variant_test") - assert app == None diff --git a/agenta-backend/agenta_backend/tests/observability_router/test_observability_router.py b/agenta-backend/agenta_backend/tests/variants_main_router/test_observability_router.py similarity index 90% rename from agenta-backend/agenta_backend/tests/observability_router/test_observability_router.py rename to agenta-backend/agenta_backend/tests/variants_main_router/test_observability_router.py index 2f25a48a75..1160e4d58c 100644 --- a/agenta-backend/agenta_backend/tests/observability_router/test_observability_router.py +++ 
b/agenta-backend/agenta_backend/tests/variants_main_router/test_observability_router.py @@ -41,34 +41,6 @@ async def test_create_spans_endpoint(spans_db_data): assert response.status_code == 200 -@pytest.mark.asyncio -async def test_create_user_and_org(user_create_data, organization_create_data): - user_db = UserDB(**user_create_data) - await user_db.create() - - org_db = OrganizationDB(**organization_create_data, owner=str(user_db.id)) - await org_db.create() - - user_db.organizations = [org_db.id] - await user_db.save() - - assert org_db.name == "Agenta" - assert user_db.username == "agenta" - assert user_db.organizations == [org_db.id] - - -@pytest.mark.asyncio -async def test_create_organization(organization_create_data): - user_db = await UserDB.find_one(UserDB.uid == "0") - organization = OrganizationDB( - **organization_create_data, - type="default", - owner=str(user_db.id), - members=[user_db.id], - ) - await organization.create() - - @pytest.mark.asyncio async def test_create_image_in_db(image_create_data): user_db = await UserDB.find_one(UserDB.uid == "0") @@ -87,7 +59,7 @@ async def test_create_image_in_db(image_create_data): async def test_create_appvariant_in_db(app_variant_create_data): user_db = await UserDB.find_one(UserDB.uid == "0") organization_db = await selectors.get_user_own_org(user_db.uid) - image_db = await ImageDB.find_one(ImageDB.user == user_db.id) + image_db = await ImageDB.find_one(ImageDB.user.id == user_db.id) app = AppDB( app_name="test_app", diff --git a/agenta-backend/agenta_backend/tests/variants_evaluators_router/test_evaluators_router.py b/agenta-backend/agenta_backend/tests/variants_main_router/test_variant_evaluators_router.py similarity index 88% rename from agenta-backend/agenta_backend/tests/variants_evaluators_router/test_evaluators_router.py rename to agenta-backend/agenta_backend/tests/variants_main_router/test_variant_evaluators_router.py index fc8be57818..f445be65ca 100644 --- a/agenta-backend/agenta_backend/tests/variants_evaluators_router/test_evaluators_router.py +++ b/agenta-backend/agenta_backend/tests/variants_main_router/test_variant_evaluators_router.py @@ -6,7 +6,9 @@ from agenta_backend.models.api.evaluation_model import EvaluationStatusEnum from agenta_backend.models.db_models import ( AppDB, + ConfigDB, TestSetDB, + AppVariantDB, EvaluationDB, AppVariantDB, DeploymentDB, @@ -145,6 +147,25 @@ async def test_get_evaluator_configs(): assert type(response.json()) == list +@pytest.mark.asyncio +async def test_update_app_variant_parameters(update_app_variant_parameters): + app = await AppDB.find_one(AppDB.app_name == APP_NAME) + testset = await TestSetDB.find_one(TestSetDB.app.id == app.id) + app_variant = await AppVariantDB.find_one( + AppVariantDB.app.id == app.id, + AppVariantDB.variant_name == "app.default" + ) + + parameters = update_app_variant_parameters + parameters["inputs"] = [{"name": list(testset.csvdata[0].keys())[0]}] + payload = {"parameters": parameters} + + response = await test_client.put( + f"{BACKEND_API_HOST}/variants/{str(app_variant.id)}/parameters/", json=payload + ) + assert response.status_code == 200 + + @pytest.mark.asyncio async def test_create_evaluation(): # Fetch app, app_variant and testset @@ -193,7 +214,9 @@ async def test_create_evaluation(): @pytest.mark.asyncio async def test_fetch_evaluation_status(): - evaluations = await EvaluationDB.find().to_list() # will return only one in this case + evaluations = ( + await EvaluationDB.find().to_list() + ) # will return only one in this case evaluation = 
evaluations[0] # Prepare and start short-polling request @@ -217,7 +240,9 @@ async def test_fetch_evaluation_status(): @pytest.mark.asyncio async def test_fetch_evaluation_results(): - evaluations = await EvaluationDB.find().to_list() # will return only one in this case + evaluations = ( + await EvaluationDB.find().to_list() + ) # will return only one in this case evaluation = evaluations[0] response = await test_client.get( @@ -252,9 +277,13 @@ async def test_delete_evaluator_config(): @pytest.mark.asyncio async def test_evaluation_scenario_match_evaluation_testset_length(): - evaluations = await EvaluationDB.find(fetch_links=True).to_list() # will return only one in this case + evaluations = await EvaluationDB.find( + fetch_links=True + ).to_list() # will return only one in this case evaluation = evaluations[0] - evaluation_scenario_count = await EvaluationScenarioDB.find(EvaluationScenarioDB.evaluation.id == evaluation.id).count() + evaluation_scenario_count = await EvaluationScenarioDB.find( + EvaluationScenarioDB.evaluation.id == evaluation.id + ).count() assert evaluation_scenario_count == len(evaluation.testset.csvdata) diff --git a/agenta-backend/agenta_backend/tests/testset_router/test_testset_router.py b/agenta-backend/agenta_backend/tests/variants_main_router/test_variant_testset_router.py similarity index 86% rename from agenta-backend/agenta_backend/tests/testset_router/test_testset_router.py rename to agenta-backend/agenta_backend/tests/variants_main_router/test_variant_testset_router.py index 561fb8daba..ce397ba346 100644 --- a/agenta-backend/agenta_backend/tests/testset_router/test_testset_router.py +++ b/agenta-backend/agenta_backend/tests/variants_main_router/test_variant_testset_router.py @@ -28,7 +28,7 @@ @pytest.mark.asyncio async def test_create_testset(): - app = await AppDB.find_one(AppDB.app_name == "test_app") + app = await AppDB.find_one(AppDB.app_name == "app_variant_test") payload = { "name": "create_testset_main", @@ -50,15 +50,14 @@ async def test_create_testset(): response = await test_client.post( f"{BACKEND_API_HOST}/testsets/{str(app.id)}/", json=payload ) - assert response.status_code == 200 assert response.json()["name"] == payload["name"] @pytest.mark.asyncio async def test_update_testset(): - app = await AppDB.find_one(AppDB.app_name == "test_app") - testset = await AppDB.find_one(TestSetDB.app.id == app.id) + app = await AppDB.find_one(AppDB.app_name == "app_variant_test") + testset = await TestSetDB.find_one(TestSetDB.app.id == app.id) payload = { "name": "update_testset", @@ -89,7 +88,7 @@ async def test_update_testset(): @pytest.mark.asyncio async def test_get_testsets(): - app = await AppDB.find_one(AppDB.app_name == "test_app") + app = await AppDB.find_one(AppDB.app_name == "app_variant_test") response = await test_client.get( f"{BACKEND_API_HOST}/testsets/?app_id={str(app.id)}" ) @@ -100,7 +99,7 @@ async def test_get_testsets(): @pytest.mark.asyncio() async def test_get_testset(): - app = await AppDB.find_one(AppDB.app_name == "test_app") + app = await AppDB.find_one(AppDB.app_name == "app_variant_test") testset = await TestSetDB.find_one(TestSetDB.app.id == app.id) response = await test_client.get(f"{BACKEND_API_HOST}/testsets/{str(testset.id)}/") @@ -112,8 +111,8 @@ async def test_get_testset(): @pytest.mark.asyncio async def test_delete_testsets(): - app = await AppDB.find_one(AppDB.app_name == "test_app") - testsets = await TestSetDB.find(TestSetDB.app.id == app.id) + app = await AppDB.find_one(AppDB.app_name == "app_variant_test") + 
testsets = await TestSetDB.find(TestSetDB.app.id == app.id).to_list() testset_ids = [str(testset.id) for testset in testsets] payload = {"testset_ids": testset_ids} diff --git a/agenta-backend/agenta_backend/tests/organization_router/test_organization_router.py b/agenta-backend/agenta_backend/tests/variants_organization_router/test_organization_router.py similarity index 100% rename from agenta-backend/agenta_backend/tests/organization_router/test_organization_router.py rename to agenta-backend/agenta_backend/tests/variants_organization_router/test_organization_router.py diff --git a/agenta-backend/agenta_backend/tests/variants_router/conftest.py b/agenta-backend/agenta_backend/tests/variants_router/conftest.py deleted file mode 100644 index fc5f9895c8..0000000000 --- a/agenta-backend/agenta_backend/tests/variants_router/conftest.py +++ /dev/null @@ -1,102 +0,0 @@ -import pytest -import logging - -from agenta_backend.models.db_models import ( - AppDB, - UserDB, - VariantBaseDB, - ImageDB, - ConfigDB, - AppVariantDB, - OrganizationDB, -) - -from agenta_backend.services import selectors - -# Initialize logger -logger = logging.getLogger(__name__) -logger.setLevel(logging.DEBUG) - - -@pytest.fixture() -async def get_first_user_object(): - """Get the user object from the database or create a new one if not found.""" - - user = await UserDB.find_one(UserDB.uid == "0") - if user is None: - create_user = UserDB(uid="0") - await create_user.create() - - org = OrganizationDB(type="default", owner=str(create_user.id)) - await org.create() - - create_user.organizations.append(org.id) - await create_user.save() - - return create_user - return user - - -@pytest.fixture() -async def get_second_user_object(): - """Create a second user object.""" - - user = await UserDB.find_one(UserDB.uid == "1") - if user is None: - create_user = UserDB( - uid="1", username="test_user1", email="test_user1@email.com" - ) - await create_user.create() - - org = OrganizationDB(type="default", owner=str(create_user.id)) - await org.create() - - create_user.organizations.append(org.id) - await create_user.save() - - return create_user - return user - - -@pytest.fixture() -async def get_first_user_app(get_first_user_object): - user = await get_first_user_object - organization = await selectors.get_user_own_org(user.uid) - - app = AppDB(app_name="myapp", organization=organization, user=user) - await app.create() - - db_image = ImageDB( - docker_id="sha256:xxxxxxxxxxxxxxxxxxxxxxxxxxxxxx", - tags="agentaai/templates_v2:local_test_prompt", - user=user, - organization=organization, - ) - await db_image.create() - - db_config = ConfigDB( - config_name="default", - parameters={}, - ) - await db_config.create() - - db_base = VariantBaseDB( - base_name="app", image=db_image, organization=organization, user=user, app=app - ) - await db_base.create() - - appvariant = AppVariantDB( - app=app, - variant_name="app", - image=db_image, - user=user, - organization=organization, - parameters={}, - base_name="app", - config_name="default", - base=db_base, - config=db_config, - ) - await appvariant.create() - - return appvariant, user, organization, app, db_image, db_config, db_base diff --git a/agenta-backend/agenta_backend/tests/user_profile_router/test_user_profile.py b/agenta-backend/agenta_backend/tests/variants_user_profile_router/test_user_profile.py similarity index 100% rename from agenta-backend/agenta_backend/tests/user_profile_router/test_user_profile.py rename to 
agenta-backend/agenta_backend/tests/variants_user_profile_router/test_user_profile.py From 12328d8927ca61ff7db04a7c822f515e5b26d2ae Mon Sep 17 00:00:00 2001 From: Abram Date: Sat, 6 Jan 2024 17:15:44 +0100 Subject: [PATCH 285/414] Update - clean up unused codes --- agenta-backend/agenta_backend/tasks/evaluations.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/agenta-backend/agenta_backend/tasks/evaluations.py b/agenta-backend/agenta_backend/tasks/evaluations.py index feced167d6..a439f5d424 100644 --- a/agenta-backend/agenta_backend/tasks/evaluations.py +++ b/agenta-backend/agenta_backend/tasks/evaluations.py @@ -38,8 +38,6 @@ def evaluate( testset_id: str, ): loop = asyncio.get_event_loop() - app = AppDB(**app_data) - evaluation = NewEvaluation(**new_evaluation_data) try: loop.run_until_complete(DBEngine().init_db()) @@ -50,7 +48,7 @@ def evaluate( app_variant_parameters = app_variant_db.config.parameters if ( - not app_variant_db.config.parameters + not app_variant_parameters or "inputs" not in app_variant_db.config.parameters or not app_variant_db.config.parameters["inputs"] ): From 17190f253b591ba1d90fe30f939a674ffe865a1c Mon Sep 17 00:00:00 2001 From: Abram Date: Sat, 6 Jan 2024 17:16:11 +0100 Subject: [PATCH 286/414] Update - modified event db manager and db engine --- .../agenta_backend/models/db_engine.py | 4 -- .../services/event_db_manager.py | 63 ++++++++++--------- 2 files changed, 32 insertions(+), 35 deletions(-) diff --git a/agenta-backend/agenta_backend/models/db_engine.py b/agenta-backend/agenta_backend/models/db_engine.py index 5a912fbd85..820e9f6893 100644 --- a/agenta-backend/agenta_backend/models/db_engine.py +++ b/agenta-backend/agenta_backend/models/db_engine.py @@ -100,9 +100,5 @@ def remove_db(self) -> None: client = MongoClient(self.db_url) if self.mode == "default": client.drop_database("agenta") - elif self.mode == "v2": - client.drop_database("agenta_v2") - elif self.mode == "test": - client.drop_database("agenta_test") else: client.drop_database(f"agenta_{self.mode}") diff --git a/agenta-backend/agenta_backend/services/event_db_manager.py b/agenta-backend/agenta_backend/services/event_db_manager.py index 43e5e6d407..cb6bf35f22 100644 --- a/agenta-backend/agenta_backend/services/event_db_manager.py +++ b/agenta-backend/agenta_backend/services/event_db_manager.py @@ -47,12 +47,12 @@ async def get_variant_traces( """ user = await db_manager.get_user(user_uid=kwargs["uid"]) - query_expressions = ( - TraceDB.user == user.id, + traces = await TraceDB.find( + TraceDB.user.id == user.id, TraceDB.app_id == app_id, TraceDB.variant_id == variant_id, - ) - traces = await TraceDB.find(query_expressions).to_list() + fetch_links=True + ).to_list() return [trace_db_to_pydantic(trace) for trace in traces] @@ -70,13 +70,13 @@ async def create_app_trace(payload: CreateTrace, **kwargs: dict) -> str: # Ensure spans exists in the db for span in payload.spans: - span_db = await SpanDB.find_one(SpanDB.id == ObjectId(span)) + span_db = await SpanDB.find_one(SpanDB.id == ObjectId(span), fetch_links=True) if span_db is None: raise HTTPException(404, detail=f"Span {span} does not exist") - trace = TraceDB(**payload.dict(), user=user) - await trace.create() - return trace_db_to_pydantic(trace)["trace_id"] + trace_db = TraceDB(**payload.dict(), user=user) + await trace_db.create() + return str(trace_db.id) async def get_trace_single(trace_id: str, **kwargs: dict) -> Trace: @@ -90,10 +90,11 @@ async def get_trace_single(trace_id: str, **kwargs: dict) -> Trace: """ 
user = await db_manager.get_user(user_uid=kwargs["uid"]) - query_expressions = (TraceDB.id == ObjectId(trace_id), TraceDB.user == user.id) # Get trace - trace = await TraceDB.find_one(query_expressions) + trace = await TraceDB.find_one( + TraceDB.id == ObjectId(trace_id), TraceDB.user.id == user.id, fetch_links=True + ) return trace_db_to_pydantic(trace) @@ -111,14 +112,15 @@ async def trace_status_update( """ user = await db_manager.get_user(user_uid=kwargs["uid"]) - query_expressions = (TraceDB.id == ObjectId(trace_id), TraceDB.user == user.id) # Get trace - trace = await TraceDB.find_one(query_expressions) + trace = await TraceDB.find_one( + TraceDB.id == ObjectId(trace_id), TraceDB.user.id == user.id + ) # Update and save trace trace.status = payload.status - await trace.create() + await trace.save() return True @@ -148,10 +150,11 @@ async def get_trace_spans(trace_id: str, **kwargs: dict) -> List[Span]: """ user = await db_manager.get_user(user_uid=kwargs["uid"]) - query_expressions = (TraceDB.id == ObjectId(trace_id), TraceDB.user == user.id) # Get trace - trace = await TraceDB.find_one(query_expressions) + trace = await TraceDB.find_one( + TraceDB.id == ObjectId(trace_id), TraceDB.user.id == user.id, fetch_links=True + ) # Get trace spans spans = spans_db_to_pydantic(trace.spans) @@ -179,14 +182,14 @@ async def add_feedback_to_trace( created_at=datetime.utcnow(), ) - trace = await TraceDB.find_one(TraceDB.id == ObjectId(trace_id)) + trace = await TraceDB.find_one(TraceDB.id == ObjectId(trace_id), fetch_links=True) if trace.feedbacks is None: trace.feedbacks = [feedback] else: trace.feedbacks.append(feedback) # Update trace - await trace.create() + await trace.save() return feedback.uid @@ -202,11 +205,10 @@ async def get_trace_feedbacks(trace_id: str, **kwargs: dict) -> List[Feedback]: user = await db_manager.get_user(user_uid=kwargs["uid"]) - # Build query expressions - query_expressions = (TraceDB.id == ObjectId(trace_id), TraceDB.user == user.id) - # Get feedbacks in trace - trace = await TraceDB.find_one(query_expressions) + trace = await TraceDB.find_one( + TraceDB.id == ObjectId(trace_id), TraceDB.user.id == user.id, fetch_links=True + ) feedbacks = [feedback_db_to_pydantic(feedback) for feedback in trace.feedbacks] return feedbacks @@ -226,11 +228,10 @@ async def get_feedback_detail( user = await db_manager.get_user(user_uid=kwargs["uid"]) - # Build query expressions - query_expressions = (TraceDB.id == ObjectId(trace_id), TraceDB.user == user.id) - # Get trace - trace = await TraceDB.find_one(query_expressions) + trace = await TraceDB.find_one( + TraceDB.id == ObjectId(trace_id), TraceDB.user.id == user.id, fetch_links=True + ) # Get feedback feedback = [ @@ -257,22 +258,22 @@ async def update_trace_feedback( user = await db_manager.get_user(user_uid=kwargs["uid"]) - # Build query expressions - query_expressions = (TraceDB.id == ObjectId(trace_id), TraceDB.user == user.id) - # Get trace - trace = await TraceDB.find_one(query_expressions) + trace = await TraceDB.find_one( + TraceDB.id == ObjectId(trace_id), TraceDB.user.id == user.id, fetch_links=True + ) # update feedback feedback_json = {} for feedback in trace.feedbacks: if feedback.uid == feedback_id: - feedback.update(payload.dict()) + for key, value in payload.dict(exclude_none=True).items(): + setattr(feedback, key, value) feedback_json = feedback.dict() break # Save feedback in trace and return a copy - await trace.create() + await trace.save() # Replace key and transform into a pydantic representation 
feedback_json["feedback_id"] = feedback_json.pop("uid") From ff82a366433478042bec359ce2ca8bdcc59aebc8 Mon Sep 17 00:00:00 2001 From: Abram Date: Sat, 6 Jan 2024 17:17:00 +0100 Subject: [PATCH 287/414] :art: Format - ran black --- .../services/event_db_manager.py | 2 +- .../agenta_backend/tasks/evaluations.py | 4 +++- .../tests/variants_main_router/conftest.py | 19 +++++++++---------- .../test_variant_evaluators_router.py | 3 +-- 4 files changed, 14 insertions(+), 14 deletions(-) diff --git a/agenta-backend/agenta_backend/services/event_db_manager.py b/agenta-backend/agenta_backend/services/event_db_manager.py index cb6bf35f22..7a5d35ffdb 100644 --- a/agenta-backend/agenta_backend/services/event_db_manager.py +++ b/agenta-backend/agenta_backend/services/event_db_manager.py @@ -51,7 +51,7 @@ async def get_variant_traces( TraceDB.user.id == user.id, TraceDB.app_id == app_id, TraceDB.variant_id == variant_id, - fetch_links=True + fetch_links=True, ).to_list() return [trace_db_to_pydantic(trace) for trace in traces] diff --git a/agenta-backend/agenta_backend/tasks/evaluations.py b/agenta-backend/agenta_backend/tasks/evaluations.py index a439f5d424..2f94e880f8 100644 --- a/agenta-backend/agenta_backend/tasks/evaluations.py +++ b/agenta-backend/agenta_backend/tasks/evaluations.py @@ -87,7 +87,9 @@ def evaluate( ) for data_point, app_output in zip(testset.csvdata, app_outputs): if len(testset.csvdata) != len(app_outputs): - raise ValueError("Length of testset.csvdata and app_outputs are not the same") + raise ValueError( + "Length of testset.csvdata and app_outputs are not the same" + ) # 2. We prepare the inputs raw_inputs = ( diff --git a/agenta-backend/agenta_backend/tests/variants_main_router/conftest.py b/agenta-backend/agenta_backend/tests/variants_main_router/conftest.py index 169cc8bf3c..2401f352b3 100644 --- a/agenta-backend/agenta_backend/tests/variants_main_router/conftest.py +++ b/agenta-backend/agenta_backend/tests/variants_main_router/conftest.py @@ -214,7 +214,6 @@ def feedbacks_create_data(): ] - @pytest.fixture(scope="session") def fetch_templates(): response = httpx.get(f"{BACKEND_API_HOST}/containers/templates/") @@ -257,15 +256,15 @@ async def fetch_user(): @pytest.fixture() def update_app_variant_parameters(): return { - "temperature": 1, - "model": "gpt-3.5-turbo", - "max_tokens": -1, - "prompt_system": "You are an expert in geography.", - "prompt_user": "What is the capital of {country}?", - "top_p": 1, - "frequence_penalty": 0, - "presence_penalty": 0 - } + "temperature": 1, + "model": "gpt-3.5-turbo", + "max_tokens": -1, + "prompt_system": "You are an expert in geography.", + "prompt_user": "What is the capital of {country}?", + "top_p": 1, + "frequence_penalty": 0, + "presence_penalty": 0, + } @pytest.fixture() diff --git a/agenta-backend/agenta_backend/tests/variants_main_router/test_variant_evaluators_router.py b/agenta-backend/agenta_backend/tests/variants_main_router/test_variant_evaluators_router.py index f445be65ca..d17603e211 100644 --- a/agenta-backend/agenta_backend/tests/variants_main_router/test_variant_evaluators_router.py +++ b/agenta-backend/agenta_backend/tests/variants_main_router/test_variant_evaluators_router.py @@ -152,8 +152,7 @@ async def test_update_app_variant_parameters(update_app_variant_parameters): app = await AppDB.find_one(AppDB.app_name == APP_NAME) testset = await TestSetDB.find_one(TestSetDB.app.id == app.id) app_variant = await AppVariantDB.find_one( - AppVariantDB.app.id == app.id, - AppVariantDB.variant_name == "app.default" + 
AppVariantDB.app.id == app.id, AppVariantDB.variant_name == "app.default" ) parameters = update_app_variant_parameters From fdf5eb21089c24033d539d03c7b6cb5c93371dcc Mon Sep 17 00:00:00 2001 From: Abram Date: Sat, 6 Jan 2024 17:19:42 +0100 Subject: [PATCH 288/414] Update- modified main lifespan --- agenta-backend/agenta_backend/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/agenta-backend/agenta_backend/main.py b/agenta-backend/agenta_backend/main.py index f028919c00..594703181a 100644 --- a/agenta-backend/agenta_backend/main.py +++ b/agenta-backend/agenta_backend/main.py @@ -54,7 +54,7 @@ async def lifespan(application: FastAPI, cache=True): """ # initialize the database await DBEngine().init_db() - # await templates_manager.update_and_sync_templates(cache=cache) + await templates_manager.update_and_sync_templates(cache=cache) yield From fd8559c781b990c6e1127eb9c311551b37ab144d Mon Sep 17 00:00:00 2001 From: Abram Date: Sun, 7 Jan 2024 02:07:55 +0100 Subject: [PATCH 289/414] Update - modified auto_webhook_test evaluator service --- .../services/evaluators_service.py | 34 +++++++++++++------ 1 file changed, 24 insertions(+), 10 deletions(-) diff --git a/agenta-backend/agenta_backend/services/evaluators_service.py b/agenta-backend/agenta_backend/services/evaluators_service.py index 82d834e5e7..fdaef49470 100644 --- a/agenta-backend/agenta_backend/services/evaluators_service.py +++ b/agenta-backend/agenta_backend/services/evaluators_service.py @@ -49,9 +49,16 @@ def auto_webhook_test( ) -> Result: try: with httpx.Client() as client: - request_body = json.loads(settings_values.get("webhook_body", None)) - payload = request_body if request_body else {} - response = client.post(url=settings_values["webhook_url"], json=payload) + webhook_body = settings_values.get("webhook_body", None) + if isinstance(webhook_body, str): + payload = json.loads(webhook_body) + if not webhook_body: + payload = {} + if isinstance(webhook_body, dict): + payload = webhook_body + response = client.post( + url=settings_values["webhook_url"], json=payload + ) response.raise_for_status() response_data = response.json() score = response_data.get("score", None) @@ -65,6 +72,8 @@ def auto_webhook_test( except httpx.HTTPError as e: print(f"An HTTP error occurred: {e}") except Exception as e: + import traceback + traceback.print_exc() print(f"An error occurred: {e}") @@ -164,10 +173,15 @@ def evaluate( evaluation_function = globals().get(evaluator_name, None) if not evaluation_function: raise ValueError(f"Evaluation method '{evaluator_name}' not found.") - return evaluation_function( - correct_answer, - variant_output, - settings_values, - *additional_args, - **additional_kwargs, - ) + try: + return evaluation_function( + correct_answer, + variant_output, + settings_values, + *additional_args, + **additional_kwargs, + ) + except Exception as exc: + raise RuntimeError( + f"Error occurred while running {evaluator_name} evaluation. 
Exception: {str(exc)}" + ) From d307e4de096cfca5eedba5dfeada2fbc6db7b1e6 Mon Sep 17 00:00:00 2001 From: Abram Date: Sun, 7 Jan 2024 02:10:24 +0100 Subject: [PATCH 290/414] Update - modified conftest --- agenta-backend/agenta_backend/tests/conftest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/agenta-backend/agenta_backend/tests/conftest.py b/agenta-backend/agenta_backend/tests/conftest.py index ac447d1c51..0c289c9894 100644 --- a/agenta-backend/agenta_backend/tests/conftest.py +++ b/agenta-backend/agenta_backend/tests/conftest.py @@ -18,4 +18,4 @@ def event_loop(): yield res res._close() # close event loop - # DBEngine().remove_db() # drop database + DBEngine().remove_db() # drop database From 2005016d88ae83000301517885ab6fce1a3bb270 Mon Sep 17 00:00:00 2001 From: Abram Date: Sun, 7 Jan 2024 02:10:57 +0100 Subject: [PATCH 291/414] :art: Format - ran black --- agenta-backend/agenta_backend/services/evaluators_service.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/agenta-backend/agenta_backend/services/evaluators_service.py b/agenta-backend/agenta_backend/services/evaluators_service.py index fdaef49470..8878cf8820 100644 --- a/agenta-backend/agenta_backend/services/evaluators_service.py +++ b/agenta-backend/agenta_backend/services/evaluators_service.py @@ -56,9 +56,7 @@ def auto_webhook_test( payload = {} if isinstance(webhook_body, dict): payload = webhook_body - response = client.post( - url=settings_values["webhook_url"], json=payload - ) + response = client.post(url=settings_values["webhook_url"], json=payload) response.raise_for_status() response_data = response.json() score = response_data.get("score", None) @@ -73,6 +71,7 @@ def auto_webhook_test( print(f"An HTTP error occurred: {e}") except Exception as e: import traceback + traceback.print_exc() print(f"An error occurred: {e}") From e5e48875d961f6dcce8237d253cabb8ae15758c7 Mon Sep 17 00:00:00 2001 From: Abram Date: Sun, 7 Jan 2024 02:26:00 +0100 Subject: [PATCH 292/414] Cleanup - remove todo note --- agenta-backend/agenta_backend/tasks/evaluations.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/agenta-backend/agenta_backend/tasks/evaluations.py b/agenta-backend/agenta_backend/tasks/evaluations.py index 2f94e880f8..23d8e6951c 100644 --- a/agenta-backend/agenta_backend/tasks/evaluations.py +++ b/agenta-backend/agenta_backend/tasks/evaluations.py @@ -118,7 +118,7 @@ def evaluate( additional_kwargs = ( { "app_params": app_variant_db.config.parameters, - "inputs": data_point, # TODO: fetch input from config parameters when #1102 has been fixed + "inputs": data_point, } if evaluator_config.evaluator_key == "auto_custom_code_run" else {} From 9512e962dfc9248c2af8014518d0967661058e89 Mon Sep 17 00:00:00 2001 From: Abram Date: Sun, 7 Jan 2024 13:03:57 +0100 Subject: [PATCH 293/414] Update - modified query for fetching evaluation scenarios --- agenta-backend/agenta_backend/services/evaluation_service.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/agenta-backend/agenta_backend/services/evaluation_service.py b/agenta-backend/agenta_backend/services/evaluation_service.py index 776bb2932a..5e8a0a84e1 100644 --- a/agenta-backend/agenta_backend/services/evaluation_service.py +++ b/agenta-backend/agenta_backend/services/evaluation_service.py @@ -308,7 +308,7 @@ async def fetch_evaluation_scenarios_for_evaluation( **user_org_data, ) scenarios = await EvaluationScenarioDB.find( - EvaluationScenarioDB.evaluation == ObjectId(evaluation.id) + 
EvaluationScenarioDB.evaluation.id == ObjectId(evaluation.id), fetch_links=True ).to_list() eval_scenarios = [ converters.evaluation_scenario_db_to_pydantic(scenario) From 53d1dcfa501c1a156c670efa563a3a14415a7fec Mon Sep 17 00:00:00 2001 From: Abram Date: Sun, 7 Jan 2024 14:11:59 +0100 Subject: [PATCH 294/414] Update - make use of app variant config parameters --- agenta-backend/agenta_backend/tasks/evaluations.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/agenta-backend/agenta_backend/tasks/evaluations.py b/agenta-backend/agenta_backend/tasks/evaluations.py index 23d8e6951c..597739298c 100644 --- a/agenta-backend/agenta_backend/tasks/evaluations.py +++ b/agenta-backend/agenta_backend/tasks/evaluations.py @@ -49,8 +49,8 @@ def evaluate( if ( not app_variant_parameters - or "inputs" not in app_variant_db.config.parameters - or not app_variant_db.config.parameters["inputs"] + or "inputs" not in app_variant_parameters + or not app_variant_parameters["inputs"] ): loop.run_until_complete( update_evaluation(evaluation_id, {"status": "EVALUATION_FAILED"}) @@ -93,8 +93,8 @@ def evaluate( # 2. We prepare the inputs raw_inputs = ( - app_variant_db.parameters.get("inputs", []) - if app_variant_db.parameters + app_variant_parameters.get("inputs", []) + if app_variant_parameters else [] ) inputs = [] @@ -117,7 +117,7 @@ def evaluate( additional_kwargs = ( { - "app_params": app_variant_db.config.parameters, + "app_params": app_variant_parameters, "inputs": data_point, } if evaluator_config.evaluator_key == "auto_custom_code_run" From 4636a4914680af310b3cce2de64f8e6aa674126d Mon Sep 17 00:00:00 2001 From: Abram Date: Sun, 7 Jan 2024 14:29:41 +0100 Subject: [PATCH 295/414] Refactor - renamed db settings collection to name --- .../agenta_backend/models/db_models.py | 38 +++++++++---------- .../agenta_backend/services/app_manager.py | 2 +- .../agenta_backend/services/db_manager.py | 4 -- 3 files changed, 20 insertions(+), 24 deletions(-) diff --git a/agenta-backend/agenta_backend/models/db_models.py b/agenta-backend/agenta_backend/models/db_models.py index 84e39af9d7..f9b5b28d00 100644 --- a/agenta-backend/agenta_backend/models/db_models.py +++ b/agenta-backend/agenta_backend/models/db_models.py @@ -17,7 +17,7 @@ class APIKeyDB(Document): updated_at: Optional[datetime] class Settings: - collection = "api_keys" + name = "api_keys" class InvitationDB(BaseModel): @@ -38,7 +38,7 @@ class OrganizationDB(Document): updated_at: Optional[datetime] = Field(default=datetime.utcnow()) class Settings: - collection = "organizations" + name = "organizations" class UserDB(Document): @@ -50,7 +50,7 @@ class UserDB(Document): updated_at: Optional[datetime] = Field(default=datetime.utcnow()) class Settings: - collection = "users" + name = "users" class ImageDB(Document): @@ -68,7 +68,7 @@ class ImageDB(Document): deletable: bool = Field(default=True) class Settings: - collection = "docker_images" + name = "docker_images" class AppDB(Document): @@ -79,7 +79,7 @@ class AppDB(Document): updated_at: Optional[datetime] = Field(default=datetime.utcnow()) class Settings: - collection = "app_db" + name = "app_db" class DeploymentDB(Document): @@ -94,7 +94,7 @@ class DeploymentDB(Document): updated_at: Optional[datetime] = Field(default=datetime.utcnow()) class Settings: - collection = "deployments" + name = "deployments" class VariantBaseDB(Document): @@ -108,7 +108,7 @@ class VariantBaseDB(Document): updated_at: Optional[datetime] = Field(default=datetime.utcnow()) class Settings: - collection = 
"bases" + name = "bases" class ConfigVersionDB(BaseModel): @@ -127,7 +127,7 @@ class ConfigDB(Document): updated_at: Optional[datetime] = Field(default=datetime.utcnow()) class Settings: - collection = "configs" + name = "configs" class AppVariantDB(Document): @@ -150,7 +150,7 @@ class AppVariantDB(Document): ) # soft deletion for using the template variants class Settings: - collection = "app_variants" + name = "app_variants" class AppEnvironmentDB(Document): @@ -163,7 +163,7 @@ class AppEnvironmentDB(Document): created_at: Optional[datetime] = Field(default=datetime.utcnow()) class Settings: - collection = "app_environment_db" + name = "app_environment_db" class TemplateDB(Document): @@ -192,7 +192,7 @@ class TestSetDB(Document): updated_at: Optional[datetime] = Field(default=datetime.utcnow()) class Settings: - collection = "testsets" + name = "testsets" class CustomEvaluationDB(Document): @@ -205,7 +205,7 @@ class CustomEvaluationDB(Document): updated_at: Optional[datetime] = Field(default=datetime.utcnow()) class Settings: - collection = "custom_evaluations" + name = "custom_evaluations" class EvaluationSettingsTemplate(BaseModel): @@ -225,7 +225,7 @@ class EvaluatorConfigDB(Document): updated_at: datetime = Field(default=datetime.utcnow()) class Settings: - collection = "evaluators_configs" + name = "evaluators_configs" class Result(BaseModel): @@ -276,7 +276,7 @@ class HumanEvaluationDB(Document): updated_at: Optional[datetime] = Field(default=datetime.utcnow()) class Settings: - collection = "human_evaluations" + name = "human_evaluations" class HumanEvaluationScenarioDB(Document): @@ -294,7 +294,7 @@ class HumanEvaluationScenarioDB(Document): note: Optional[str] class Settings: - collection = "human_evaluations_scenarios" + name = "human_evaluations_scenarios" class EvaluationDB(Document): @@ -310,7 +310,7 @@ class EvaluationDB(Document): updated_at: datetime = Field(default=datetime.utcnow()) class Settings: - collection = "evaluations" + name = "evaluations" class EvaluationScenarioDB(Document): @@ -329,7 +329,7 @@ class EvaluationScenarioDB(Document): updated_at: datetime = Field(default=datetime.utcnow()) class Settings: - collection = "evaluation_scenarios" + name = "evaluation_scenarios" class SpanDB(Document): @@ -351,7 +351,7 @@ class SpanDB(Document): tags: Optional[List[str]] class Settings: - collection = "spans" + name = "spans" class Feedback(BaseModel): @@ -379,4 +379,4 @@ class TraceDB(Document): feedbacks: Optional[List[Feedback]] class Settings: - collection = "traces" + name = "traces" diff --git a/agenta-backend/agenta_backend/services/app_manager.py b/agenta-backend/agenta_backend/services/app_manager.py index a5ebdcb000..b15c335c4f 100644 --- a/agenta-backend/agenta_backend/services/app_manager.py +++ b/agenta-backend/agenta_backend/services/app_manager.py @@ -461,7 +461,7 @@ async def add_variant_based_on_image( image=db_image, user=user_instance, organization=app.organization, - parameters=config_db.parameters, + parameters={}, base_name=base_name, config_name=config_name, base=db_base, diff --git a/agenta-backend/agenta_backend/services/db_manager.py b/agenta-backend/agenta_backend/services/db_manager.py index b573cc4f90..72b49e06b6 100644 --- a/agenta-backend/agenta_backend/services/db_manager.py +++ b/agenta-backend/agenta_backend/services/db_manager.py @@ -1153,11 +1153,7 @@ async def update_variant_parameters( config_db.current_version = new_version config_db.parameters = parameters - # Update variant parameters - app_variant_db.parameters = 
config_db.parameters - # Save updated ConfigDB - await app_variant_db.save() await config_db.save() except Exception as e: From 7188d976e7b8af040dd0bc4466e60e8b214f5267 Mon Sep 17 00:00:00 2001 From: MohammedMaaz Date: Mon, 8 Jan 2024 09:29:14 +0500 Subject: [PATCH 296/414] feature: evaluation comparison on frontend --- .../evaluationCompare/EvaluationCompare.tsx | 134 ++++++++---------- .../evaluationResults/EvaluationResults.tsx | 2 +- .../EvaluationScenarios.tsx | 2 +- agenta-web/src/lib/Types.ts | 18 ++- agenta-web/src/services/evaluations/index.ts | 73 ++++++++-- 5 files changed, 134 insertions(+), 95 deletions(-) diff --git a/agenta-web/src/components/pages/evaluations/evaluationCompare/EvaluationCompare.tsx b/agenta-web/src/components/pages/evaluations/evaluationCompare/EvaluationCompare.tsx index d51ad74b0b..6303fed986 100644 --- a/agenta-web/src/components/pages/evaluations/evaluationCompare/EvaluationCompare.tsx +++ b/agenta-web/src/components/pages/evaluations/evaluationCompare/EvaluationCompare.tsx @@ -1,14 +1,13 @@ import {useAppTheme} from "@/components/Layout/ThemeContextProvider" import {useAppId} from "@/hooks/useAppId" -import {JSSTheme, _Evaluation, _EvaluationScenario} from "@/lib/Types" -import {fetchAllEvaluationScenarios} from "@/services/evaluations" +import {ComparisonResultRow, JSSTheme, TestSet, _Evaluation, _EvaluationScenario} from "@/lib/Types" +import {fetchAllComparisonResults} from "@/services/evaluations" import {ColDef} from "ag-grid-community" import {AgGridReact} from "ag-grid-react" import {Space, Spin, Tag, Tooltip, Typography} from "antd" import React, {useEffect, useMemo, useRef, useState} from "react" import {createUseStyles} from "react-jss" import {getFilterParams, getTypedValue} from "../evaluationResults/EvaluationResults" -import {uniqBy} from "lodash" import {getTagColors} from "@/lib/helpers/colors" import {DownloadOutlined} from "@ant-design/icons" import {getAppValues} from "@/contexts/app.context" @@ -34,64 +33,52 @@ const EvaluationCompareMode: React.FC = () => { const appId = useAppId() const classes = useStyles() const {appTheme} = useAppTheme() - const [evaluationIds, setEvaluationIds] = useQueryParam("evaluations") - const [scenarios, setScenarios] = useState<_EvaluationScenario[]>([]) + const [evaluationIdsStr, setEvaluationIdsStr] = useQueryParam("evaluations") const [fetching, setFetching] = useState(false) + const [rows, setRows] = useState([]) + const [testset, setTestset] = useState() const gridRef = useRef>() - const [evalautions, variants] = useMemo(() => { - const evalautions = uniqBy( - scenarios.map((scenario) => scenario.evaluation), - "id", - ) - const variants = uniqBy( - evalautions.map((evaluation) => ({...evaluation.variants[0], evaluation})).flat(1), - "variantId", - ) - return [evalautions, variants] - }, [scenarios]) + const variants = useMemo(() => { + return rows[0]?.variants || [] + }, [rows]) - const colors = useMemo(() => getTagColors(), [evalautions]) + const colors = useMemo(() => getTagColors(), [variants]) + + const evaluationIds = useMemo( + () => evaluationIdsStr.split(",").filter((item) => !!item), + [evaluationIdsStr], + ) const colDefs = useMemo(() => { - const colDefs: ColDef<_EvaluationScenario>[] = [] - if (!scenarios.length || !evalautions.length) return colDefs + const colDefs: ColDef[] = [] + const {inputs, variants} = rows[0] || {} + + if (!rows.length || !variants.length) return [] + + inputs.forEach((ip, ix) => { + colDefs.push({ + headerName: `Input: ${ip.name}`, + minWidth: 200, + flex: 1, 
+ field: `inputs.${ix}.value` as any, + ...getFilterParams("text"), + pinned: "left", + cellRenderer: LongTextCellRenderer, + }) + }) colDefs.push({ headerName: "Expected Output", minWidth: 280, flex: 1, - field: "correct_answer", + field: "correctAnswer", ...getFilterParams("text"), - valueGetter: (params) => { - return params.data?.correct_answer?.toString() || "" - }, pinned: "left", cellRenderer: LongTextCellRenderer, }) variants.forEach((variant, vi) => { - const evalaution = (variant as any).evaluation as _Evaluation - scenarios - .find((scenario) => scenario.evaluation.id === evalaution.id) - ?.inputs.forEach((input, index) => { - colDefs.push({ - headerComponent: () => ( - - Input: {input.name} - {variant.variantName} - - ), - minWidth: 200, - flex: 1, - field: `inputs.${index}`, - ...getFilterParams(input.type === "number" ? "number" : "text"), - valueGetter: (params) => { - return getTypedValue(params.data?.inputs[index]) - }, - cellRenderer: LongTextCellRenderer, - }) - }) colDefs.push({ headerComponent: () => ( @@ -101,14 +88,18 @@ const EvaluationCompareMode: React.FC = () => { ), minWidth: 280, flex: 1, - field: `outputs.0`, + field: `variants.${vi}.output` as any, ...getFilterParams("text"), valueGetter: (params) => { - return getTypedValue(params.data?.outputs[0]) + return getTypedValue( + params.data?.variants.find( + (item) => item.evaluationId === variant.evaluationId, + )?.output, + ) }, cellRenderer: LongTextCellRenderer, }) - evalaution.aggregated_results.forEach(({evaluator_config: config}) => { + variant.evaluatorConfigs.forEach(({evaluatorConfig: config}, ix) => { colDefs.push({ flex: 1, headerComponent: () => ( @@ -117,12 +108,15 @@ const EvaluationCompareMode: React.FC = () => { {variant.variantName} ), - field: "results", + field: `variants.${vi}.evaluatorConfigs.${ix}.result` as any, ...getFilterParams("text"), valueGetter: (params) => { return getTypedValue( - params.data?.results.find((item) => item.evaluator_config === config.id) - ?.result, + params.data?.variants + .find((item) => item.evaluationId === variant.evaluationId) + ?.evaluatorConfigs.find( + (item) => item.evaluatorConfig.id === config.id, + )?.result, ) }, }) @@ -130,24 +124,21 @@ const EvaluationCompareMode: React.FC = () => { }) return colDefs - }, [scenarios]) + }, [rows]) const fetcher = () => { setFetching(true) - Promise.all( - (evaluationIds?.split(",") || []).map((evalId) => - fetchAllEvaluationScenarios(appId, evalId), - ), - ) - .then((scenariosNest: _EvaluationScenario[][]) => { - setScenarios(uniqBy(scenariosNest.flat(1), "id")) + fetchAllComparisonResults(evaluationIds) + .then(({rows, testset}) => { + setRows(rows) + setTestset(testset) setTimeout(() => { if (!gridRef.current) return const ids: string[] = gridRef.current.api .getColumns() - ?.filter((column) => column.getColDef().field?.startsWith("results")) + ?.filter((column) => column.getColDef().field?.endsWith("result")) ?.map((item) => item.getColId()) || [] gridRef.current.api.autoSizeColumns(ids, false) setFetching(false) @@ -158,23 +149,18 @@ const EvaluationCompareMode: React.FC = () => { useEffect(() => { fetcher() - }, [appId, evaluationIds]) + }, [appId, evaluationIdsStr]) const handleDeleteVariant = (evalId: string) => { - setEvaluationIds( - evaluationIds - ?.split(",") - .filter((item) => item !== evalId) - .join(","), - ) + setEvaluationIdsStr(evaluationIds.filter((item) => item !== evalId).join(",")) } const onExport = () => { if (!gridRef.current) return const {currentApp} = getAppValues() 
gridRef.current.api.exportDataAsCsv({ - fileName: `${currentApp?.app_name}_${evalautions - .map(({variants}) => variants[0].variantName) + fileName: `${currentApp?.app_name}_${variants + .map(({variantName}) => variantName) .join("_")}.csv`, }) } @@ -186,10 +172,8 @@ const EvaluationCompareMode: React.FC = () => { Testset: - - {evalautions[0]?.testset.name || ""} + + {testset?.name || ""} @@ -197,7 +181,7 @@ const EvaluationCompareMode: React.FC = () => {
{variants?.map((v, vi) => ( handleDeleteVariant((v as any).evaluation.id)} closable @@ -219,9 +203,9 @@ const EvaluationCompareMode: React.FC = () => { appTheme === "dark" ? "ag-theme-alpine-dark" : "ag-theme-alpine" } ${classes.table}`} > - + ref={gridRef as any} - rowData={scenarios} + rowData={rows} columnDefs={colDefs} getRowId={(params) => params.data.id} headerHeight={64} diff --git a/agenta-web/src/components/pages/evaluations/evaluationResults/EvaluationResults.tsx b/agenta-web/src/components/pages/evaluations/evaluationResults/EvaluationResults.tsx index d80e57aabd..0856d01432 100644 --- a/agenta-web/src/components/pages/evaluations/evaluationResults/EvaluationResults.tsx +++ b/agenta-web/src/components/pages/evaluations/evaluationResults/EvaluationResults.tsx @@ -264,7 +264,7 @@ const EvaluationResults: React.FC = () => { selected.length < 2 || selected.some( (item) => - runningStatuses.includes(item.status) || + item.status !== EvaluationStatus.FINISHED || item.testset.id !== selected[0].testset.id, ), [selected], diff --git a/agenta-web/src/components/pages/evaluations/evaluationScenarios/EvaluationScenarios.tsx b/agenta-web/src/components/pages/evaluations/evaluationScenarios/EvaluationScenarios.tsx index c63967af4e..8f74c22439 100644 --- a/agenta-web/src/components/pages/evaluations/evaluationScenarios/EvaluationScenarios.tsx +++ b/agenta-web/src/components/pages/evaluations/evaluationScenarios/EvaluationScenarios.tsx @@ -102,7 +102,7 @@ const EvaluationScenarios: React.FC = () => { const fetcher = () => { setFetching(true) - fetchAllEvaluationScenarios(appId, evaluationId) + fetchAllEvaluationScenarios(evaluationId) .then((scenarios) => { setScenarios(scenarios) setTimeout(() => { diff --git a/agenta-web/src/lib/Types.ts b/agenta-web/src/lib/Types.ts index 0c57991be2..e3634fbfbd 100644 --- a/agenta-web/src/lib/Types.ts +++ b/agenta-web/src/lib/Types.ts @@ -403,14 +403,18 @@ export interface AnnotationScenario { result: TypedValue } -export interface ComparisonResult { +export type ComparisonResultRow = { inputs: {name: string; value: string}[] - correct_answer: string + correctAnswer: string variants: { - variant_id: string - variant_name: string - evaluation_id: string - evaluator_configs: EvaluatorConfig[] + variantId: string + variantName: string + output: TypedValue + evaluationId: string + evaluatorConfigs: { + evaluatorConfig: EvaluatorConfig + result: TypedValue + }[] }[] - data: _EvaluationScenario[] + id: string } diff --git a/agenta-web/src/services/evaluations/index.ts b/agenta-web/src/services/evaluations/index.ts index b21b8eec55..9a19bb98a9 100644 --- a/agenta-web/src/services/evaluations/index.ts +++ b/agenta-web/src/services/evaluations/index.ts @@ -2,12 +2,12 @@ import axios from "@/lib//helpers/axiosConfig" import { Annotation, AnnotationScenario, - ComparisonResult, + ComparisonResultRow, EvaluationStatus, Evaluator, EvaluatorConfig, LLMRunRateLimit, - TypedValue, + TestSet, _Evaluation, _EvaluationScenario, } from "@/lib/Types" @@ -20,6 +20,7 @@ import webhookImg from "@/media/link.png" import aiImg from "@/media/artificial-intelligence.png" import codeImg from "@/media/browser.png" import dayjs from "dayjs" +import {loadTestset} from "@/lib/services/api" //Prefix convention: // - fetch: GET single entity from server @@ -130,11 +131,9 @@ export const deleteEvaluations = async (evaluationsIds: string[]) => { } // Evaluation Scenarios -export const fetchAllEvaluationScenarios = async (appId: string, evaluationId: string) => { +export const 
fetchAllEvaluationScenarios = async (evaluationId: string) => { const [{data: evaluationScenarios}, evaluation] = await Promise.all([ - axios.get(`/api/evaluations/${evaluationId}/evaluation_scenarios/`, { - params: {app_id: appId}, - }), + axios.get(`/api/evaluations/${evaluationId}/evaluation_scenarios/`), fetchEvaluation(evaluationId), ]) @@ -203,10 +202,62 @@ export const updateAnnotationScenario = async ( // Comparison export const fetchAllComparisonResults = async (evaluationIds: string[]) => { - const response = await axios.get(`/api/evaluations/evaluation_scenarios/comparison-results`, { - params: { - evaluations_ids: evaluationIds.join(","), - }, + const scenarioGroups = await Promise.all(evaluationIds.map(fetchAllEvaluationScenarios)) + const testset: TestSet = await loadTestset(scenarioGroups[0][0].evaluation.testset.id) + + const inputsNameSet = new Set() + scenarioGroups.forEach((group) => { + group.forEach((scenario) => { + scenario.inputs.forEach((input) => inputsNameSet.add(input.name)) + }) }) - return response.data as ComparisonResult[] + + const rows: ComparisonResultRow[] = [] + const inputNames = Array.from(inputsNameSet) + const inputValuesSet = new Set() + const variants = scenarioGroups.map((group) => group[0].evaluation.variants[0]) + for (const data of testset.csvdata) { + const inputValues = inputNames + .filter((name) => data[name] !== undefined) + .map((name) => ({name, value: data[name]})) + const inputValuesStr = inputValues.map((ip) => ip.value).join("") + if (inputValuesSet.has(inputValuesStr)) continue + else inputValuesSet.add(inputValuesStr) + + rows.push({ + id: inputValuesStr, + inputs: inputNames + .map((name) => ({name, value: data[name]})) + .filter((ip) => ip.value !== undefined), + correctAnswer: data.correct_answer || "", + variants: variants.map((variant, ix) => { + const group = scenarioGroups[ix] + const scenario = group.find((scenario) => + scenario.inputs.every((input) => + inputValues.some( + (ip) => ip.name === input.name && ip.value === input.value, + ), + ), + ) + return { + variantId: variant.variantId, + variantName: variant.variantName, + output: scenario?.outputs[0] || {type: "string", value: ""}, + evaluationId: scenario?.evaluation.id || "", + evaluatorConfigs: (scenario?.evaluators_configs || []).map((config) => ({ + evaluatorConfig: config, + result: scenario?.results.find( + (result) => result.evaluator_config === config.id, + )?.result || {type: "string", value: ""}, + })), + } + }), + }) + } + + return { + rows, + testset, + evaluations: scenarioGroups.map((group) => group[0].evaluation), + } } From 321a41f9d849643ba4a5d2557b841655e4205987 Mon Sep 17 00:00:00 2001 From: MohammedMaaz Date: Mon, 8 Jan 2024 09:54:56 +0500 Subject: [PATCH 297/414] bug fix in compare - remove variant | null checks in compare page --- .../evaluationCompare/EvaluationCompare.tsx | 57 +++++++++++++------ 1 file changed, 40 insertions(+), 17 deletions(-) diff --git a/agenta-web/src/components/pages/evaluations/evaluationCompare/EvaluationCompare.tsx b/agenta-web/src/components/pages/evaluations/evaluationCompare/EvaluationCompare.tsx index 6303fed986..85c002ef60 100644 --- a/agenta-web/src/components/pages/evaluations/evaluationCompare/EvaluationCompare.tsx +++ b/agenta-web/src/components/pages/evaluations/evaluationCompare/EvaluationCompare.tsx @@ -13,6 +13,8 @@ import {DownloadOutlined} from "@ant-design/icons" import {getAppValues} from "@/contexts/app.context" import {useQueryParam} from "@/hooks/useQuery" import {LongTextCellRenderer} from 
"../cellRenderers/cellRenderers" +import {stringToNumberInRange} from "@/lib/helpers/utils" +import Link from "next/link" const useStyles = createUseStyles((theme: JSSTheme) => ({ table: { @@ -25,6 +27,15 @@ const useStyles = createUseStyles((theme: JSSTheme) => ({ alignItems: "center", justifyContent: "space-between", }, + tag: { + "& a": { + color: "inherit", + "&:hover": { + color: "inherit", + textDecoration: "underline", + }, + }, + }, })) interface Props {} @@ -33,7 +44,7 @@ const EvaluationCompareMode: React.FC = () => { const appId = useAppId() const classes = useStyles() const {appTheme} = useAppTheme() - const [evaluationIdsStr, setEvaluationIdsStr] = useQueryParam("evaluations") + const [evaluationIdsStr = "", setEvaluationIdsStr] = useQueryParam("evaluations") const [fetching, setFetching] = useState(false) const [rows, setRows] = useState([]) const [testset, setTestset] = useState() @@ -43,7 +54,12 @@ const EvaluationCompareMode: React.FC = () => { return rows[0]?.variants || [] }, [rows]) - const colors = useMemo(() => getTagColors(), [variants]) + const colors = useMemo(() => { + const colors = getTagColors() + return variants.map( + (v) => colors[stringToNumberInRange(v.evaluationId, 0, colors.length - 1)], + ) + }, [variants]) const evaluationIds = useMemo( () => evaluationIdsStr.split(",").filter((item) => !!item), @@ -176,21 +192,28 @@ const EvaluationCompareMode: React.FC = () => { {testset?.name || ""} - - Variants: -
- {variants?.map((v, vi) => ( - handleDeleteVariant((v as any).evaluation.id)} - closable - > - {v.variantName} - - ))} -
-
+ + + Variants: +
+ {variants?.map((v, vi) => ( + handleDeleteVariant(v.evaluationId)} + closable={evaluationIds.length > 1} + className={classes.tag} + > + + {v.variantName} + + + ))} +
+
+
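The two frontend patches above (296 and 297) build the comparison table by joining scenarios from several evaluations on their test-set input values, matching on inputs rather than on scenario ids. A minimal TypeScript sketch of that join follows; the simplified types, the rowKey helper, and joinByInputs are illustrative names and not part of the patches themselves.

    // Simplified shapes for illustration only.
    type ScenarioInput = {name: string; value: string}
    type Scenario = {inputs: ScenarioInput[]; outputs: {value: string}[]}

    // Two scenarios belong to the same comparison row when their inputs match,
    // so the row key is built from the (sorted) input name/value pairs.
    const rowKey = (inputs: ScenarioInput[]) =>
        inputs
            .slice()
            .sort((a, b) => a.name.localeCompare(b.name))
            .map((ip) => `${ip.name}=${ip.value}`)
            .join("|")

    // One scenario group per evaluation; the result maps a row key to one
    // output per evaluation ("" when an evaluation has no matching scenario).
    function joinByInputs(groups: Scenario[][]): Map<string, string[]> {
        const rows = new Map<string, string[]>()
        groups.forEach((group, gi) => {
            for (const scenario of group) {
                const key = rowKey(scenario.inputs)
                if (!rows.has(key)) rows.set(key, Array(groups.length).fill(""))
                rows.get(key)![gi] = scenario.outputs[0]?.value ?? ""
            }
        })
        return rows
    }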
From 26de2c99090b904621f913f3958238afab6ec58b Mon Sep 17 00:00:00 2001 From: Abram Date: Mon, 8 Jan 2024 10:04:59 +0100 Subject: [PATCH 298/414] Update - remove traceback debug --- agenta-backend/agenta_backend/routers/app_router.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/agenta-backend/agenta_backend/routers/app_router.py b/agenta-backend/agenta_backend/routers/app_router.py index 483bcc289f..41cf049b13 100644 --- a/agenta-backend/agenta_backend/routers/app_router.py +++ b/agenta-backend/agenta_backend/routers/app_router.py @@ -186,9 +186,6 @@ async def create_app( ) return CreateAppOutput(app_id=str(app_db.id), app_name=str(app_db.app_name)) except Exception as e: - import traceback - - traceback.print_exc() raise HTTPException(status_code=500, detail=str(e)) @@ -310,9 +307,6 @@ async def remove_app(app_id: str, request: Request): detail = f"Docker error while trying to remove the app: {str(e)}" raise HTTPException(status_code=500, detail=detail) except Exception as e: - import traceback - - traceback.print_exc() detail = f"Unexpected error while trying to remove the app: {str(e)}" raise HTTPException(status_code=500, detail=detail) From 03e7efade1c10b64e0ae133b94ab1c61e51806d9 Mon Sep 17 00:00:00 2001 From: Abram Date: Mon, 8 Jan 2024 11:59:32 +0100 Subject: [PATCH 299/414] Cleanup - remove redundant db/api models and code --- .../models/api/evaluation_model.py | 35 -------- .../agenta_backend/models/db_engine.py | 2 - .../agenta_backend/models/db_models.py | 19 ----- .../services/evaluation_service.py | 81 +------------------ .../services/results_service.py | 7 +- 5 files changed, 6 insertions(+), 138 deletions(-) diff --git a/agenta-backend/agenta_backend/models/api/evaluation_model.py b/agenta-backend/agenta_backend/models/api/evaluation_model.py index 04ff3000e7..5e1dc7c6cf 100644 --- a/agenta-backend/agenta_backend/models/api/evaluation_model.py +++ b/agenta-backend/agenta_backend/models/api/evaluation_model.py @@ -21,25 +21,8 @@ class EvaluatorConfig(BaseModel): updated_at: datetime -class EvaluationTypeSettings(BaseModel): - similarity_threshold: Optional[float] - regex_pattern: Optional[str] - regex_should_match: Optional[bool] - webhook_url: Optional[str] - custom_code_evaluation_id: Optional[str] - llm_app_prompt_template: Optional[str] - evaluation_prompt_template: Optional[str] - - class EvaluationType(str, Enum): - auto_exact_match = "auto_exact_match" - auto_similarity_match = "auto_similarity_match" - auto_regex_test = "auto_regex_test" - auto_webhook_test = "auto_webhook_test" - auto_ai_critique = "auto_ai_critique" human_a_b_testing = "human_a_b_testing" - human_scoring = "human_scoring" - custom_code_run = "custom_code_run" single_model_test = "single_model_test" @@ -63,7 +46,6 @@ class NewHumanEvaluation(BaseModel): app_id: str variant_ids: List[str] evaluation_type: EvaluationType - evaluation_type_settings: Optional[EvaluationTypeSettings] inputs: List[str] testset_id: str status: str @@ -99,7 +81,6 @@ class SimpleEvaluationOutput(BaseModel): class HumanEvaluationUpdate(BaseModel): status: Optional[EvaluationStatusEnum] - evaluation_type_settings: Optional[EvaluationTypeSettings] class EvaluationScenarioResult(BaseModel): @@ -134,7 +115,6 @@ class HumanEvaluation(BaseModel): user_id: str user_username: str evaluation_type: EvaluationType - evaluation_type_settings: Optional[EvaluationTypeSettings] variant_ids: List[str] variant_names: List[str] testset_id: str @@ -179,15 +159,6 @@ class EvaluationScenario(BaseModel): results: 
List[EvaluationScenarioResult] -class AICritiqueCreate(BaseModel): - correct_answer: str - llm_app_prompt_template: Optional[str] - inputs: List[EvaluationScenarioInput] - outputs: List[EvaluationScenarioOutput] - evaluation_prompt_template: Optional[str] - open_ai_key: Optional[str] - - class EvaluationScenarioUpdate(BaseModel): vote: Optional[str] score: Optional[Any] @@ -245,12 +216,6 @@ class EvaluationWebhook(BaseModel): score: float -class EvaluationSettingsTemplate(BaseModel): - type: str - default: str - description: str - - class LLMRunRateLimit(BaseModel): batch_size: int max_retries: int diff --git a/agenta-backend/agenta_backend/models/db_engine.py b/agenta-backend/agenta_backend/models/db_engine.py index 820e9f6893..b951be23a0 100644 --- a/agenta-backend/agenta_backend/models/db_engine.py +++ b/agenta-backend/agenta_backend/models/db_engine.py @@ -19,7 +19,6 @@ AppVariantDB, TemplateDB, TestSetDB, - CustomEvaluationDB, EvaluatorConfigDB, HumanEvaluationDB, HumanEvaluationScenarioDB, @@ -47,7 +46,6 @@ AppVariantDB, TemplateDB, TestSetDB, - CustomEvaluationDB, EvaluatorConfigDB, HumanEvaluationDB, HumanEvaluationScenarioDB, diff --git a/agenta-backend/agenta_backend/models/db_models.py b/agenta-backend/agenta_backend/models/db_models.py index f9b5b28d00..ea579f45f9 100644 --- a/agenta-backend/agenta_backend/models/db_models.py +++ b/agenta-backend/agenta_backend/models/db_models.py @@ -195,25 +195,6 @@ class Settings: name = "testsets" -class CustomEvaluationDB(Document): - evaluation_name: str - python_code: str - app: Link[AppDB] - user: Link[UserDB] - organization: Link[OrganizationDB] - created_at: Optional[datetime] = Field(default=datetime.utcnow()) - updated_at: Optional[datetime] = Field(default=datetime.utcnow()) - - class Settings: - name = "custom_evaluations" - - -class EvaluationSettingsTemplate(BaseModel): - type: str - default: str - description: str - - class EvaluatorConfigDB(Document): app: Link[AppDB] organization: Link[OrganizationDB] diff --git a/agenta-backend/agenta_backend/services/evaluation_service.py b/agenta-backend/agenta_backend/services/evaluation_service.py index 5e8a0a84e1..1c6daf7c83 100644 --- a/agenta-backend/agenta_backend/services/evaluation_service.py +++ b/agenta-backend/agenta_backend/services/evaluation_service.py @@ -9,13 +9,11 @@ EvaluationScenario, EvaluationScenarioInput, EvaluationType, - EvaluationTypeSettings, HumanEvaluation, HumanEvaluationScenario, HumanEvaluationUpdate, NewEvaluation, EvaluationScenarioUpdate, - CreateCustomEvaluation, EvaluationStatusEnum, NewHumanEvaluation, ) @@ -33,7 +31,6 @@ HumanEvaluationScenarioOutput, UserDB, AppDB, - CustomEvaluationDB, ) from beanie import PydanticObjectId as ObjectId @@ -268,21 +265,6 @@ async def update_human_evaluation_service( if update_payload.status is not None: updates["status"] = update_payload.status - if update_payload.evaluation_type_settings is not None: - current_settings = evaluation.evaluation_type_settings - new_settings = update_payload.evaluation_type_settings - - # Update only the fields that are explicitly set in the payload - for field in EvaluationTypeSettings.__annotations__.keys(): - setattr( - current_settings, - field, - getattr(new_settings, field, None) - or getattr(current_settings, field, None), - ) - - updates["evaluation_type_settings"] = current_settings - # Update the evaluation await evaluation.update({"$set": updates}) @@ -376,11 +358,6 @@ async def update_human_evaluation_scenario( new_eval_set = {} if updated_data["score"] is not None and 
evaluation_type in [ - EvaluationType.auto_exact_match, - EvaluationType.auto_similarity_match, - EvaluationType.auto_regex_test, - EvaluationType.auto_webhook_test, - EvaluationType.auto_ai_critique, EvaluationType.single_model_test, ]: new_eval_set["score"] = updated_data["score"] @@ -389,8 +366,6 @@ async def update_human_evaluation_scenario( and evaluation_type == EvaluationType.human_a_b_testing ): new_eval_set["vote"] = updated_data["vote"] - elif evaluation_type == EvaluationType.custom_code_run: - new_eval_set["correct_answer"] = updated_data["correct_answer"] if updated_data["outputs"] is not None: new_outputs = [ @@ -471,14 +446,7 @@ async def get_evaluation_scenario_score_service( def _extend_with_evaluation(evaluation_type: EvaluationType): evaluation = {} - if ( - evaluation_type == EvaluationType.auto_exact_match - or evaluation_type == EvaluationType.auto_similarity_match - or evaluation_type == EvaluationType.auto_regex_test - or evaluation_type == EvaluationType.auto_webhook_test - or evaluation_type == EvaluationType.single_model_test - or EvaluationType.auto_ai_critique - ): + if evaluation_type == EvaluationType.single_model_test: evaluation["score"] = "" if evaluation_type == EvaluationType.human_a_b_testing: @@ -488,15 +456,8 @@ def _extend_with_evaluation(evaluation_type: EvaluationType): def _extend_with_correct_answer(evaluation_type: EvaluationType, row: dict): correct_answer = {} - if ( - evaluation_type == EvaluationType.auto_exact_match - or evaluation_type == EvaluationType.auto_similarity_match - or evaluation_type == EvaluationType.auto_regex_test - or evaluation_type == EvaluationType.auto_ai_critique - or evaluation_type == EvaluationType.auto_webhook_test - ): - if row["correct_answer"]: - correct_answer["correct_answer"] = row["correct_answer"] + if row["correct_answer"]: + correct_answer["correct_answer"] = row["correct_answer"] return correct_answer @@ -634,42 +595,6 @@ async def delete_evaluations(evaluation_ids: List[str], **user_org_data: dict) - await evaluation.delete() -async def create_custom_code_evaluation( - payload: CreateCustomEvaluation, **user_org_data: dict -) -> str: - """Save the custom evaluation code in the database. 
- - Args: - payload (CreateCustomEvaluation): the required payload - - Returns: - str: the custom evaluation id - """ - - # Initialize custom evaluation instance - access = await check_access_to_app( - user_org_data=user_org_data, app_id=payload.app_id - ) - if not access: - raise HTTPException( - status_code=403, - detail=f"You do not have access to this app: {payload.app_id}", - ) - app = await db_manager.fetch_app_by_id(app_id=payload.app_id) - custom_eval = CustomEvaluationDB( - evaluation_name=payload.evaluation_name, - user=app.user, - organization=app.organization, - app=app, - python_code=payload.python_code, - created_at=datetime.utcnow(), - updated_at=datetime.utcnow(), - ) - - await custom_eval.create() - return str(custom_eval.id) - - async def create_new_human_evaluation( payload: NewHumanEvaluation, **user_org_data: dict ) -> EvaluationDB: diff --git a/agenta-backend/agenta_backend/services/results_service.py b/agenta-backend/agenta_backend/services/results_service.py index dc9d9e8df2..d33a1c419f 100644 --- a/agenta-backend/agenta_backend/services/results_service.py +++ b/agenta-backend/agenta_backend/services/results_service.py @@ -1,13 +1,12 @@ from agenta_backend.models.db_models import ( - EvaluationScenarioDB, - EvaluationDB, HumanEvaluationDB, + EvaluationScenarioDB, HumanEvaluationScenarioDB, ) -from agenta_backend.services import evaluation_service from agenta_backend.services import db_manager from agenta_backend.models.api.evaluation_model import EvaluationType -from bson import ObjectId + +from beanie import PydanticObjectId as ObjectId async def fetch_results_for_evaluation(evaluation: HumanEvaluationDB): From 0b753a5866bd21541ccec049e73e41669b139521 Mon Sep 17 00:00:00 2001 From: Abram Date: Mon, 8 Jan 2024 14:29:54 +0100 Subject: [PATCH 300/414] Cleanup - refactor user_service to make use of beanie odm --- agenta-backend/agenta_backend/services/user_service.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/agenta-backend/agenta_backend/services/user_service.py b/agenta-backend/agenta_backend/services/user_service.py index 1cf0dfc323..b1761598fd 100644 --- a/agenta-backend/agenta_backend/services/user_service.py +++ b/agenta-backend/agenta_backend/services/user_service.py @@ -1,4 +1,3 @@ -from agenta_backend.utils.common import engine from agenta_backend.models.db_models import UserDB from agenta_backend.models.api.user_models import User, UserUpdate @@ -9,17 +8,16 @@ async def create_new_user(payload: User) -> UserDB: username=payload.username, email=payload.email, ) - user = await engine.save(user_instance) + user = await user_instance.create() return user async def update_user(user_uid: str, payload: UserUpdate) -> UserDB: - user = await engine.find_one(UserDB, UserDB.uid == user_uid) + user = await UserDB.find_one(UserDB.uid == user_uid, fetch_links=True) if user is not None: values_to_update = {key: value for key, value in payload.dict()} - updated_user = user.update(values_to_update) - await engine.save(updated_user) + await user.update({"$set": values_to_update}) return user raise NotFound("Credentials not found. 
Please try again!") From c16f0c4aaa11280bb78d413b843a67ec6e759f2d Mon Sep 17 00:00:00 2001 From: MohammedMaaz Date: Mon, 8 Jan 2024 19:44:03 +0500 Subject: [PATCH 301/414] fixed bug in evaluation_config not read correctly in aggregated_results --- .../services/evaluation_service.py | 2 -- .../agenta_backend/tasks/evaluations.py | 19 +++++++++++++++---- 2 files changed, 15 insertions(+), 6 deletions(-) diff --git a/agenta-backend/agenta_backend/services/evaluation_service.py b/agenta-backend/agenta_backend/services/evaluation_service.py index 5db996c432..2711aab903 100644 --- a/agenta-backend/agenta_backend/services/evaluation_service.py +++ b/agenta-backend/agenta_backend/services/evaluation_service.py @@ -115,8 +115,6 @@ async def _fetch_human_evaluation_scenario_and_check_access( evaluation_scenario_id=evaluation_scenario_id ) - print("evaluation_scenario") - print(evaluation_scenario) if evaluation_scenario is None: raise HTTPException( status_code=404, diff --git a/agenta-backend/agenta_backend/tasks/evaluations.py b/agenta-backend/agenta_backend/tasks/evaluations.py index b620b707cb..fdf2fa5503 100644 --- a/agenta-backend/agenta_backend/tasks/evaluations.py +++ b/agenta-backend/agenta_backend/tasks/evaluations.py @@ -58,7 +58,9 @@ def evaluate( new_evaluation_db = loop.run_until_complete( fetch_evaluation_by_id(evaluation_id) ) - evaluators_aggregated_data = defaultdict(list) + evaluators_aggregated_data = defaultdict( + lambda: {"evaluator_key": "", "results": list()} + ) deployment = loop.run_until_complete( get_deployment_by_objectid(app_variant_db.base.deployment) @@ -132,7 +134,14 @@ def evaluate( result=result, ) evaluators_results.append(result_object) - evaluators_aggregated_data[evaluator_config.evaluator_key].append( + if ( + evaluators_aggregated_data[evaluator_config_id]["evaluator_key"] + == "" + ): + evaluators_aggregated_data[evaluator_config_id][ + "evaluator_key" + ] = evaluator_config.evaluator_key + evaluators_aggregated_data[evaluator_config_id]["results"].append( result ) @@ -175,7 +184,9 @@ async def aggregate_evaluator_results( app: AppDB, evaluators_aggregated_data: dict ) -> List[AggregatedResult]: aggregated_results = [] - for evaluator_key, results in evaluators_aggregated_data.items(): + for config_id, val in evaluators_aggregated_data.items(): + evaluator_key = val["evaluator_key"] or "" + results = val["results"] or [] if evaluator_key != "auto_ai_critique": average_value = ( sum([result.value for result in results]) / len(results) @@ -198,7 +209,7 @@ async def aggregate_evaluator_results( ) except TypeError: average_value = None - evaluator_config = await fetch_evaluator_config_by_appId(app.id, evaluator_key) + evaluator_config = await fetch_evaluator_config(config_id) aggregated_result = AggregatedResult( evaluator_config=evaluator_config.id, result=Result(type="number", value=average_value), From 41b4f7693e19de7530709b3b187b373c13d9f864 Mon Sep 17 00:00:00 2001 From: MohammedMaaz Date: Mon, 8 Jan 2024 22:15:13 +0500 Subject: [PATCH 302/414] fixed TS error in DynamicFormField --- .../pages/evaluations/evaluators/NewEvaluatorModal.tsx | 2 -- 1 file changed, 2 deletions(-) diff --git a/agenta-web/src/components/pages/evaluations/evaluators/NewEvaluatorModal.tsx b/agenta-web/src/components/pages/evaluations/evaluators/NewEvaluatorModal.tsx index 754666bb8b..37db39c56e 100644 --- a/agenta-web/src/components/pages/evaluations/evaluators/NewEvaluatorModal.tsx +++ b/agenta-web/src/components/pages/evaluations/evaluators/NewEvaluatorModal.tsx @@ -100,8 +100,6 
@@ const DynamicFormField: React.FC = ({ ) : type === "text" ? ( - ) : type === "object" ? ( - ) : type === "code" ? ( ) : null} From a191f3f6bef70fd53009b0ee251e6661783601e7 Mon Sep 17 00:00:00 2001 From: MohammedMaaz Date: Mon, 8 Jan 2024 22:21:45 +0500 Subject: [PATCH 303/414] reverted dev.dockerFile code suppression --- agenta-web/dev.Dockerfile | 46 +++++++++++++++++++-------------------- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/agenta-web/dev.Dockerfile b/agenta-web/dev.Dockerfile index 9c4ce68303..6f86dbd847 100644 --- a/agenta-web/dev.Dockerfile +++ b/agenta-web/dev.Dockerfile @@ -3,24 +3,24 @@ FROM node:18-alpine WORKDIR /app # Install dependencies based on the preferred package manager -# COPY package.json yarn.lock* package-lock.json* pnpm-lock.yaml* ./ -# RUN \ -# if [ -f yarn.lock ]; then yarn --frozen-lockfile; \ -# elif [ -f package-lock.json ]; then npm i; \ -# elif [ -f pnpm-lock.yaml ]; then yarn global add pnpm && pnpm i; \ -# # Allow install without lockfile, so example works even without Node.js installed locally -# else echo "Warning: Lockfile not found. It is recommended to commit lockfiles to version control." && yarn install; \ -# fi +COPY package.json yarn.lock* package-lock.json* pnpm-lock.yaml* ./ +RUN \ + if [ -f yarn.lock ]; then yarn --frozen-lockfile; \ + elif [ -f package-lock.json ]; then npm i; \ + elif [ -f pnpm-lock.yaml ]; then yarn global add pnpm && pnpm i; \ + # Allow install without lockfile, so example works even without Node.js installed locally + else echo "Warning: Lockfile not found. It is recommended to commit lockfiles to version control." && yarn install; \ + fi -# COPY src ./src -# COPY public ./public -# COPY next.config.js . -# COPY tsconfig.json . -# COPY postcss.config.js . -# COPY .env . -# RUN if [ -f .env.local ]; then cp .env.local .; fi -# # used in cloud -# COPY sentry.* . +COPY src ./src +COPY public ./public +COPY next.config.js . +COPY tsconfig.json . +COPY postcss.config.js . +COPY .env . +RUN if [ -f .env.local ]; then cp .env.local .; fi +# used in cloud +COPY sentry.* . # Next.js collects completely anonymous telemetry data about general usage. 
Learn more here: https://nextjs.org/telemetry # Uncomment the following line to disable telemetry at run time # ENV NEXT_TELEMETRY_DISABLED 1 @@ -28,10 +28,10 @@ WORKDIR /app # Note: Don't expose ports here, Compose will handle that for us # Start Next.js in development mode based on the preferred package manager -# CMD \ -# if [ -f yarn.lock ]; then yarn dev; \ -# elif [ -f package-lock.json ]; then npm run dev; \ -# elif [ -f pnpm-lock.yaml ]; then pnpm dev; \ -# else yarn dev; \ -# fi +CMD \ + if [ -f yarn.lock ]; then yarn dev; \ + elif [ -f package-lock.json ]; then npm run dev; \ + elif [ -f pnpm-lock.yaml ]; then pnpm dev; \ + else yarn dev; \ + fi From b92766fae2643b51aa5b4ef3093e6cc74f1379de Mon Sep 17 00:00:00 2001 From: Kaosiso Ezealigo Date: Mon, 8 Jan 2024 19:00:54 +0100 Subject: [PATCH 304/414] removed previous e2e tests --- .../cypress/e2e/ai-critic-evaluation.cy.ts | 102 ----------------- agenta-web/cypress/e2e/code-evaluation.cy.ts | 84 -------------- .../cypress/e2e/exact-match-evaluation.cy.ts | 67 ----------- agenta-web/cypress/e2e/regex-evaluation.cy.ts | 105 ------------------ 4 files changed, 358 deletions(-) delete mode 100644 agenta-web/cypress/e2e/ai-critic-evaluation.cy.ts delete mode 100644 agenta-web/cypress/e2e/code-evaluation.cy.ts delete mode 100644 agenta-web/cypress/e2e/exact-match-evaluation.cy.ts delete mode 100644 agenta-web/cypress/e2e/regex-evaluation.cy.ts diff --git a/agenta-web/cypress/e2e/ai-critic-evaluation.cy.ts b/agenta-web/cypress/e2e/ai-critic-evaluation.cy.ts deleted file mode 100644 index f53b455fc4..0000000000 --- a/agenta-web/cypress/e2e/ai-critic-evaluation.cy.ts +++ /dev/null @@ -1,102 +0,0 @@ -describe("AI Critics Evaluation workflow", () => { - let app_id - let testset_name - before(() => { - cy.createVariantsAndTestsets() - cy.get("@app_id").then((appId) => { - app_id = appId - }) - cy.get("@testsetName").then((testsetName) => { - testset_name = testsetName - }) - }) - - context("When you select evaluation without an API key", () => { - beforeEach(() => { - cy.visit(`/apps/${app_id}/evaluations`) - cy.clearLocalStorage("llmAvailableProvidersToken") - - cy.get('[data-cy="evaluation-error-modal"]').should("not.exist") - cy.get('[data-cy="ai-critic-button"]').click() - - cy.get('[data-cy="variants-dropdown-0"]').trigger("mouseover") - cy.get('[data-cy="variant-0"]').click() - cy.get('[data-cy="variants-dropdown-0"]').trigger("mouseout") - - cy.get('[data-cy="selected-testset"]').trigger("mouseover") - cy.get('[data-cy^="testset"]').contains(testset_name).click() - cy.get('[data-cy="selected-testset"]').trigger("mouseout") - - cy.clickLinkAndWait('[data-cy="start-new-evaluation-button"]') - cy.get('[data-cy="evaluation-error-modal"]').should("exist") - }) - - it("Should display when starting evaluation", () => { - cy.get('[data-cy="evaluation-error-modal-ok-button"]').click() - }) - - it("Should navigate to Settings when clicking on the modal", () => { - cy.get('[data-cy="evaluation-error-modal-nav-button"]').click() - cy.url().should("include", "/settings") - }) - }) - - context("When you select evaluation with an API key", () => { - beforeEach(() => { - cy.addingOpenaiKey() - - cy.visit(`/apps/${app_id}/evaluations`) - cy.get('[data-cy="ai-critic-button"]').click() - - cy.get('[data-cy="variants-dropdown-0"]').trigger("mouseover") - cy.get('[data-cy="variant-0"]').click() - cy.get('[data-cy="variants-dropdown-0"]').trigger("mouseout") - - cy.get('[data-cy="selected-testset"]').trigger("mouseover") - 
cy.get('[data-cy^="testset"]').contains(testset_name).click() - cy.get('[data-cy="selected-testset"]').trigger("mouseout") - - cy.clickLinkAndWait('[data-cy="start-new-evaluation-button"]') - }) - - it("Should successfully navigate to AI Critic", () => { - cy.get('[data-cy="evaluation-error-modal"]').should("not.exist") - cy.url().should("include", "/auto_ai_critique") - }) - - it("Should complete the evaluation workflow without errors", () => { - cy.get('[data-cy="ai-critic-evaluation-result"]').should( - "contain.text", - "Run evaluation to see results!", - ) - cy.get(".ant-message-notice-content").should("not.exist") - cy.wait(1500) - cy.clickLinkAndWait('[data-cy="ai-critic-run-evaluation"]') - cy.get(".ant-spin").should("exist") - - cy.get('[data-cy="ai-critic-evaluation-result"]').should("contain.text", "Results Data") - - cy.get(".ant-spin").should("not.exist") - cy.get(".ant-message-notice-content").should("contain.text", "Evaluation Results Saved") - }) - - it("Should execute evaluation workflow with error", () => { - cy.clearLocalStorage("llmAvailableProvidersToken") - cy.wait(1000) - cy.clickLinkAndWait('[data-cy="ai-critic-run-evaluation"]') - - cy.get(".ant-spin").should("exist") - cy.get('[data-cy="ai-critic-evaluation-result"]').should( - "contain.text", - "Failed to run evaluation", - ) - - cy.get(".ant-spin").should("not.exist") - cy.get(".ant-message-notice-content").should("exist") - }) - }) - - after(() => { - cy.cleanupVariantAndTestset() - }) -}) diff --git a/agenta-web/cypress/e2e/code-evaluation.cy.ts b/agenta-web/cypress/e2e/code-evaluation.cy.ts deleted file mode 100644 index 58b1499499..0000000000 --- a/agenta-web/cypress/e2e/code-evaluation.cy.ts +++ /dev/null @@ -1,84 +0,0 @@ -import {randString} from "../../src/lib/helpers/utils" - -// This is added to prevent Cypress from failing the test prematurely due to application errors. 
-Cypress.on("uncaught:exception", () => false) - -describe("Code Evaluation workflow", () => { - const eval_name = randString(5) - let app_id - let testset_name - before(() => { - cy.createVariantsAndTestsets() - cy.get("@app_id").then((appId) => { - app_id = appId - }) - cy.get("@testsetName").then((testsetName) => { - testset_name = testsetName - }) - }) - - context("When navigating to Evaluation Page", () => { - it("Should reach the Evaluation Page", () => { - cy.visit(`/apps/${app_id}/playground`) - cy.contains(/modify parameters/i) - cy.clickLinkAndWait('[data-cy="app-evaluations-link"]') - cy.url().should("include", "/evaluations") - }) - }) - - context("When creating a new evaluation", () => { - beforeEach(() => { - cy.visit(`/apps/${app_id}/evaluations`) - cy.clickLinkAndWait('[data-cy="code-evaluation-button"]') - cy.clickLinkAndWait('[data-cy="new-code-evaluation-button"]') - cy.url().should("include", "/create_custom_evaluation") - }) - - it("Should add a new Code Evaluation successfully", () => { - cy.get('[data-cy="code-evaluation-save-button"]').should("be.disabled") - cy.get('[data-cy="code-evaluation-input"]').type(eval_name) - cy.get(".monaco-editor").type(".") - cy.get('[data-cy="code-evaluation-save-button"]').should("not.be.disabled") - cy.clickLinkAndWait('[data-cy="code-evaluation-save-button"]') - cy.url().should("include", "/evaluations") - }) - }) - - context("When executing the evaluation", () => { - it("Should execute evaluation workflow successfully", () => { - cy.visit(`/apps/${app_id}/evaluations`) - cy.clickLinkAndWait('[data-cy="code-evaluation-button"]') - cy.get('[data-cy^="code-evaluation-option"]').contains(eval_name).click() - - cy.get('[data-cy="variants-dropdown-0"]').trigger("mouseover") - cy.get('[data-cy="variant-0"]').click() - cy.get('[data-cy="variants-dropdown-0"]').trigger("mouseout") - - cy.get('[data-cy="selected-testset"]').trigger("mouseover") - cy.get('[data-cy^="testset"]').contains(testset_name).click() - cy.get('[data-cy="selected-testset"]').trigger("mouseout") - - cy.clickLinkAndWait('[data-cy="start-new-evaluation-button"]') - - cy.clickLinkAndWait('[data-cy="start-new-evaluation-button"]') - cy.url().should("include", "/custom_code_run") - cy.wait(1500) - cy.clickLinkAndWait('[data-cy="code-evaluation-run"]') - cy.get(".ant-spin").should("exist") - }) - }) - - context("When displaying Code Evaluation result", () => { - it("Should display Code Evaluation result", () => { - cy.visit(`/apps/${app_id}/evaluations`) - cy.url().should("include", "/evaluations") - cy.get('[data-cy="automatic-evaluation-result"]').within(() => { - cy.get("tr").last().should("contain.text", "Custom Code Run") - }) - }) - }) - - after(() => { - cy.cleanupVariantAndTestset() - }) -}) diff --git a/agenta-web/cypress/e2e/exact-match-evaluation.cy.ts b/agenta-web/cypress/e2e/exact-match-evaluation.cy.ts deleted file mode 100644 index 24812a6b32..0000000000 --- a/agenta-web/cypress/e2e/exact-match-evaluation.cy.ts +++ /dev/null @@ -1,67 +0,0 @@ -describe("Exact Match Evaluation workflow", () => { - let app_id - let testset_name - before(() => { - cy.createVariantsAndTestsets() - cy.get("@app_id").then((appId) => { - app_id = appId - }) - cy.get("@testsetName").then((testsetName) => { - testset_name = testsetName - }) - }) - - context("When navigating to Evaluation Page", () => { - it("Should reach the Evaluation Page", () => { - cy.visit(`/apps/${app_id}/playground`) - cy.contains(/modify parameters/i) - cy.clickLinkAndWait('[data-cy="app-evaluations-link"]') - 
cy.url().should("include", "/evaluations") - }) - }) - - context("When executing the evaluation", () => { - beforeEach(() => { - cy.visit(`/apps/${app_id}/evaluations`) - cy.url().should("include", "/evaluations") - }) - - it("Should execute evaluation workflow successfully", () => { - cy.get('[data-cy="exact-match-button"]').click() - - cy.get('[data-cy="variants-dropdown-0"]').trigger("mouseover") - cy.get('[data-cy="variant-0"]').click() - cy.get('[data-cy="variants-dropdown-0"]').trigger("mouseout") - - cy.get('[data-cy="selected-testset"]').trigger("mouseover") - cy.get('[data-cy^="testset"]').contains(testset_name).click() - cy.get('[data-cy="selected-testset"]').trigger("mouseout") - - cy.clickLinkAndWait('[data-cy="start-new-evaluation-button"]') - - cy.url().should("include", "/auto_exact_match") - cy.wait(1500) - cy.get('[data-cy="exact-match-evaluation-button"]').click() - - cy.get('[data-cy="exact-match-evaluation-score"]') - .invoke("text") - .then((text) => { - // Check if the text contains either "correct" or "wrong" - expect(text.includes("correct") || text.includes("wrong")).to.be.true - }) - - cy.get(".ant-statistic-content-value").first().should("contain", "3 out of 3") - cy.get(".ant-message-notice-content").should("exist") - }) - - it("Should display Exact Match Evaluation result", () => { - cy.get('[data-cy="automatic-evaluation-result"]').within(() => { - cy.get("tr").last().should("contain.text", "Exact Match") - }) - }) - }) - - after(() => { - cy.cleanupVariantAndTestset() - }) -}) diff --git a/agenta-web/cypress/e2e/regex-evaluation.cy.ts b/agenta-web/cypress/e2e/regex-evaluation.cy.ts deleted file mode 100644 index 7c65917845..0000000000 --- a/agenta-web/cypress/e2e/regex-evaluation.cy.ts +++ /dev/null @@ -1,105 +0,0 @@ -describe("Regex Evaluation workflow", () => { - let app_id - let testset_name - before(() => { - cy.createVariantsAndTestsets() - cy.get("@app_id").then((appId) => { - app_id = appId - }) - cy.get("@testsetName").then((testsetName) => { - testset_name = testsetName - }) - }) - - context("When navigating to Evaluation Page", () => { - it("Should reach the Evaluation Page", () => { - cy.visit(`/apps/${app_id}/playground`) - cy.contains(/modify parameters/i) - cy.clickLinkAndWait('[data-cy="app-evaluations-link"]') - cy.url().should("include", "/evaluations") - }) - }) - - context("When no Variant and Testset are Selected", () => { - beforeEach(() => { - cy.visit(`/apps/${app_id}/evaluations`) - }) - - it("Should display a warning to select Variant", () => { - cy.clickLinkAndWait('[data-cy="regex-button"]') - cy.clickLinkAndWait('[data-cy="start-new-evaluation-button"]') - cy.get(".ant-message-notice-content") - .should("contain.text", "Please select a variant") - .should("exist") - }) - - it("Should display a warning to select Testset", () => { - cy.clickLinkAndWait('[data-cy="regex-button"]') - - cy.get('[data-cy="variants-dropdown-0"]').trigger("mouseover") - cy.get('[data-cy="variant-0"]').click() - cy.get('[data-cy="variants-dropdown-0"]').trigger("mouseout") - - cy.clickLinkAndWait('[data-cy="start-new-evaluation-button"]') - cy.get(".ant-message-notice-content") - .should("contain.text", "Please select a testset") - .should("exist") - }) - }) - - context("When Variant and Testset are Selected", () => { - beforeEach(() => { - cy.visit(`/apps/${app_id}/evaluations`) - cy.clickLinkAndWait('[data-cy="regex-button"]') - - cy.get('[data-cy="variants-dropdown-0"]').trigger("mouseover") - cy.get('[data-cy="variant-0"]').click() - 
cy.get('[data-cy="variants-dropdown-0"]').trigger("mouseout") - - cy.get('[data-cy="selected-testset"]').trigger("mouseover") - cy.get('[data-cy^="testset"]').contains(testset_name).click() - cy.get('[data-cy="selected-testset"]').trigger("mouseout") - - cy.clickLinkAndWait('[data-cy="start-new-evaluation-button"]') - - cy.location("pathname").should("include", "/auto_regex_test") - - cy.get(".ant-form-item-explain-error").should("not.exist") - }) - - it("Should display error for missing regex pattern", () => { - cy.clickLinkAndWait('[data-cy="regex-evaluation-run"]') - - cy.get(".ant-form-item-explain-error").should("exist") - }) - - it("Should execute evaluation workflow successfully", () => { - cy.get('[data-cy="regex-evaluation-input"]').type(`^[A-Z][a-z]*$`) - - cy.get('[data-cy="regex-evaluation-strategy"]').within(() => { - cy.get("label").eq(0).click() - }) - - cy.clickLinkAndWait('[data-cy="regex-evaluation-run"]') - - cy.get('[data-cy="regex-evaluation-regex-match"]', {timeout: 60000}) - .invoke("text") - .then((text) => { - // Check if the text contains either "Match" or "Mismatch" - expect(text.includes("Match") || text.includes("Mismatch")).to.be.true - }) - cy.get('[data-cy="regex-evaluation-score"]', {timeout: 60000}) - .invoke("text") - .then((text) => { - // Check if the text contains either "correct" or "wrong" - expect(text.includes("correct") || text.includes("wrong")).to.be.true - }) - - cy.get(".ant-message-notice-content").should("exist") - }) - }) - - after(() => { - cy.cleanupVariantAndTestset() - }) -}) From f0d9e1ec48db397ce24693e67862884c17a289fa Mon Sep 17 00:00:00 2001 From: Kaosiso Ezealigo Date: Mon, 8 Jan 2024 19:01:46 +0100 Subject: [PATCH 305/414] test: single model test --- .../e2e/single-model-test-evaluation.cy.ts | 43 +++++++++++++++++++ .../EvaluationVotePanel.tsx | 5 ++- .../evaluators/NewEvaluatorModal.tsx | 9 ++-- 3 files changed, 52 insertions(+), 5 deletions(-) create mode 100644 agenta-web/cypress/e2e/single-model-test-evaluation.cy.ts diff --git a/agenta-web/cypress/e2e/single-model-test-evaluation.cy.ts b/agenta-web/cypress/e2e/single-model-test-evaluation.cy.ts new file mode 100644 index 0000000000..371ecb12a7 --- /dev/null +++ b/agenta-web/cypress/e2e/single-model-test-evaluation.cy.ts @@ -0,0 +1,43 @@ +describe("Single Model Test workflow", () => { + let app_id + let testset_name + before(() => { + cy.createVariantsAndTestsets() + cy.get("@app_id").then((appId) => { + app_id = appId + }) + cy.get("@testsetName").then((testsetName) => { + testset_name = testsetName + }) + }) + + context("When executing the evaluation", () => { + it("Should successfully execute the evaluation process", () => { + cy.visit(`/apps/${app_id}/annotations`) + cy.url().should("include", "/annotations") + cy.clickLinkAndWait('[data-cy="singleModel-button"]') + + cy.get('[data-cy="variants-dropdown-0"]').trigger("mouseover") + cy.get('[data-cy="variant-0"]').click() + cy.get('[data-cy="variants-dropdown-0"]').trigger("mouseout") + + cy.get('[data-cy="selected-testset"]').trigger("mouseover") + cy.get('[data-cy^="testset"]').contains(testset_name).click() + cy.get('[data-cy="selected-testset"]').trigger("mouseout") + + cy.clickLinkAndWait('[data-cy="start-new-evaluation-button"]') + cy.url().should("include", "/single_model_test") + cy.get('[data-cy="evalInstructionsShown-ok-btn"]').click() + + cy.get('[data-cy="evaluation-vote-panel-numeric-vote-input"]').should("not.exist") + + cy.wait(1000) + cy.get('[data-cy="single-model-run-all-button"]').click() + 
cy.get('[data-cy="evaluation-vote-panel-numeric-vote-input"]').type("100") + }) + }) + + after(() => { + cy.cleanupVariantAndTestset() + }) +}) diff --git a/agenta-web/src/components/Evaluations/EvaluationCardView/EvaluationVotePanel.tsx b/agenta-web/src/components/Evaluations/EvaluationCardView/EvaluationVotePanel.tsx index 06875a67f1..d36d84c072 100644 --- a/agenta-web/src/components/Evaluations/EvaluationCardView/EvaluationVotePanel.tsx +++ b/agenta-web/src/components/Evaluations/EvaluationCardView/EvaluationVotePanel.tsx @@ -92,7 +92,7 @@ const ComparisonVote: React.FC = ({variants, onChange, valu onClick={getOnClick(variant.variantId)} type={value === variant.variantId ? "primary" : undefined} danger - data-cy="abTesting-app-variant-vote-button" + data-cy="evaluation-vote-panel-comparison-vote-button" > {String.fromCharCode(65 + ix)}: {variant.variantName} @@ -107,7 +107,7 @@ const ComparisonVote: React.FC = ({variants, onChange, valu type={value === badId ? "primary" : undefined} key={badId} onClick={getOnClick(badId)} - data-cy="abTesting-both-bad-vote-button" + data-cy="evaluation-vote-panel-comparison-both-bad-vote-button-button" > Both are bad @@ -230,6 +230,7 @@ const NumericScoreVote: React.FC = ({ } min={min} max={max} + data-cy="evaluation-vote-panel-numeric-vote-input" onChange={(score) => _onChange(variant.variantId, score)} /> / {max} diff --git a/agenta-web/src/components/pages/evaluations/evaluators/NewEvaluatorModal.tsx b/agenta-web/src/components/pages/evaluations/evaluators/NewEvaluatorModal.tsx index 754666bb8b..3bde3d1c68 100644 --- a/agenta-web/src/components/pages/evaluations/evaluators/NewEvaluatorModal.tsx +++ b/agenta-web/src/components/pages/evaluations/evaluators/NewEvaluatorModal.tsx @@ -184,7 +184,7 @@ const NewEvaluatorModal: React.FC = ({ label="Name" rules={[{required: true, message: "This field is required"}]} > - + = ({ } className={classes.radioGroup} > - {evaluators.map((evaluator) => ( + {evaluators.map((evaluator, index) => ( -
+
{evaluator.icon_url && ( Date: Mon, 8 Jan 2024 19:05:45 +0100 Subject: [PATCH 306/414] crud test for evaluators --- agenta-web/cypress.config.ts | 1 + agenta-web/cypress/e2e/eval.evaluators.cy.ts | 46 +++++++++++++++++++ agenta-web/dev.Dockerfile | 46 +++++++++---------- .../evaluations/evaluators/EvaluatorCard.tsx | 13 +++++- 4 files changed, 81 insertions(+), 25 deletions(-) create mode 100644 agenta-web/cypress/e2e/eval.evaluators.cy.ts diff --git a/agenta-web/cypress.config.ts b/agenta-web/cypress.config.ts index 432b8a894d..cd7aba93fb 100644 --- a/agenta-web/cypress.config.ts +++ b/agenta-web/cypress.config.ts @@ -19,6 +19,7 @@ export default defineConfig({ }, }) }, + experimentalStudio: true, }, env: { baseApiURL: "http://localhost/api", diff --git a/agenta-web/cypress/e2e/eval.evaluators.cy.ts b/agenta-web/cypress/e2e/eval.evaluators.cy.ts new file mode 100644 index 0000000000..b5cef367ad --- /dev/null +++ b/agenta-web/cypress/e2e/eval.evaluators.cy.ts @@ -0,0 +1,46 @@ +import {randString} from "../../src/lib/helpers/utils" + +describe("Evaluators CRUD Test", function () { + let newEvalName = randString(5) + let app_id + before(() => { + cy.createVariant() + cy.get("@app_id").then((appId) => { + app_id = appId + }) + }) + + context("CRUD operation with evaluators", () => { + beforeEach(() => { + cy.visit(`/apps/${app_id}/evaluations`) + cy.location("pathname").should("include", "/evaluations") + }) + + it("should create a new evaluator", () => { + cy.get("#rc-tabs-1-tab-evaluators > :nth-child(2)").click() + cy.get('[data-cy="evaluator-card"]').should("have.length", 1) + cy.get(".ant-space > :nth-child(2) > .ant-btn").click() + cy.get('[data-cy="new-evaluator-modal-input"]').type(newEvalName) + cy.get('[data-cy="new-evaluator-modal-button-0"]').click() + cy.get(".ant-modal-footer > .ant-btn-primary > :nth-child(2)").click() + cy.get('[data-cy="evaluator-card"]').should("have.length", 2) + }) + + it("should update an evaluator", () => { + cy.get("#rc-tabs-1-tab-evaluators > :nth-child(2)").click() + cy.get('[data-cy^="evaluator-card-edit-button"]').eq(0).click() + cy.get('[data-cy="new-evaluator-modal-input"]').type("edit") + cy.get(".ant-modal-footer > .ant-btn-primary > .ant-btn-icon > .anticon > svg").click() + }) + + it("should delete an evaluator", () => { + cy.get("#rc-tabs-1-tab-evaluators > :nth-child(2)").click() + cy.get('[data-cy^="evaluator-card-delete-button"]').eq(0).click() + cy.get(".ant-modal-confirm-btns > :nth-child(2) > span").click() + }) + }) + + after(() => { + cy.cleanupVariantAndTestset() + }) +}) diff --git a/agenta-web/dev.Dockerfile b/agenta-web/dev.Dockerfile index 9c4ce68303..6af573852c 100644 --- a/agenta-web/dev.Dockerfile +++ b/agenta-web/dev.Dockerfile @@ -3,24 +3,24 @@ FROM node:18-alpine WORKDIR /app # Install dependencies based on the preferred package manager -# COPY package.json yarn.lock* package-lock.json* pnpm-lock.yaml* ./ -# RUN \ -# if [ -f yarn.lock ]; then yarn --frozen-lockfile; \ -# elif [ -f package-lock.json ]; then npm i; \ -# elif [ -f pnpm-lock.yaml ]; then yarn global add pnpm && pnpm i; \ -# # Allow install without lockfile, so example works even without Node.js installed locally -# else echo "Warning: Lockfile not found. It is recommended to commit lockfiles to version control." 
&& yarn install; \ -# fi +COPY package.json yarn.lock* package-lock.json* pnpm-lock.yaml* ./ +RUN \ + if [ -f yarn.lock ]; then yarn --frozen-lockfile; \ + elif [ -f package-lock.json ]; then npm i; \ + elif [ -f pnpm-lock.yaml ]; then yarn global add pnpm && pnpm i; \ + # Allow install without lockfile, so example works even without Node.js installed locally + else echo "Warning: Lockfile not found. It is recommended to commit lockfiles to version control." && yarn install; \ + fi -# COPY src ./src -# COPY public ./public -# COPY next.config.js . -# COPY tsconfig.json . -# COPY postcss.config.js . -# COPY .env . -# RUN if [ -f .env.local ]; then cp .env.local .; fi -# # used in cloud -# COPY sentry.* . +COPY src ./src +COPY public ./public +COPY next.config.js . +COPY tsconfig.json . +COPY postcss.config.js . +COPY .env . +RUN if [ -f .env.local ]; then cp .env.local .; fi +# # used in cloud +COPY sentry.* . # Next.js collects completely anonymous telemetry data about general usage. Learn more here: https://nextjs.org/telemetry # Uncomment the following line to disable telemetry at run time # ENV NEXT_TELEMETRY_DISABLED 1 @@ -28,10 +28,10 @@ WORKDIR /app # Note: Don't expose ports here, Compose will handle that for us # Start Next.js in development mode based on the preferred package manager -# CMD \ -# if [ -f yarn.lock ]; then yarn dev; \ -# elif [ -f package-lock.json ]; then npm run dev; \ -# elif [ -f pnpm-lock.yaml ]; then pnpm dev; \ -# else yarn dev; \ -# fi +CMD \ + if [ -f yarn.lock ]; then yarn dev; \ + elif [ -f package-lock.json ]; then npm run dev; \ + elif [ -f pnpm-lock.yaml ]; then pnpm dev; \ + else yarn dev; \ + fi diff --git a/agenta-web/src/components/pages/evaluations/evaluators/EvaluatorCard.tsx b/agenta-web/src/components/pages/evaluations/evaluators/EvaluatorCard.tsx index cc7b3b1173..d3aba48c20 100644 --- a/agenta-web/src/components/pages/evaluations/evaluators/EvaluatorCard.tsx +++ b/agenta-web/src/components/pages/evaluations/evaluators/EvaluatorCard.tsx @@ -75,10 +75,19 @@ const EvaluatorCard: React.FC = ({evaluatorConfig, onEdit, onSuccessDelet evaluator.direct_use ? [] : [ - , - , + , + , ] } + data-cy="evaluator-card" >
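The two patches above pair `data-cy` attributes on components (for example `data-cy="evaluator-card"` on the evaluator card) with matching `cy.get('[data-cy="..."]')` selectors in the new Cypress specs. As a minimal sketch of that convention, a custom command can hide the attribute selector; the `getByDataCy` name and the `cypress/support/commands.ts` location are assumptions for illustration and are not part of these patches.

```ts
// cypress/support/commands.ts (hypothetical location, not in this patch series)
// Minimal sketch: wrap the [data-cy=...] selector pattern used throughout these specs.
declare global {
    namespace Cypress {
        interface Chainable {
            /** Select an element by its data-cy attribute. */
            getByDataCy(value: string): Chainable<JQuery<HTMLElement>>
        }
    }
}

Cypress.Commands.add("getByDataCy", (value: string) => {
    // Equivalent to cy.get('[data-cy="evaluator-card"]') when called with "evaluator-card"
    return cy.get(`[data-cy="${value}"]`)
})

export {}

// Usage, mirroring eval.evaluators.cy.ts:
// cy.getByDataCy("evaluator-card").should("have.length", 1)
```
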
From aac985844edfad018dd30ea8b850a804863e644c Mon Sep 17 00:00:00 2001 From: Kaosiso Ezealigo Date: Mon, 8 Jan 2024 21:00:10 +0100 Subject: [PATCH 307/414] improvements in previous tests --- agenta-web/cypress/e2e/ab-testing-evaluation.cy.ts | 14 +++++++++----- agenta-web/cypress/e2e/app-navigation.cy.ts | 5 +++++ 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/agenta-web/cypress/e2e/ab-testing-evaluation.cy.ts b/agenta-web/cypress/e2e/ab-testing-evaluation.cy.ts index e0e24fb0ec..e6479837fd 100644 --- a/agenta-web/cypress/e2e/ab-testing-evaluation.cy.ts +++ b/agenta-web/cypress/e2e/ab-testing-evaluation.cy.ts @@ -71,14 +71,18 @@ describe("A/B Testing Evaluation workflow", () => { cy.url().should("include", "/human_a_b_testing") cy.get('[data-cy="evalInstructionsShown-ok-btn"]').click() - cy.get('[data-cy="abTesting-app-variant-vote-button"]').should("not.exist") - cy.get('[data-cy="abTesting-both-bad-vote-button"]').should("not.exist") + cy.get('[data-cy="evaluation-vote-panel-comparison-vote-button"]').should("not.exist") + cy.get( + '[data-cy="evaluation-vote-panel-comparison-both-bad-vote-button-button"]', + ).should("not.exist") cy.wait(1000) cy.get('[data-cy="abTesting-run-all-button"]').click() - cy.get('[data-cy="abTesting-app-variant-vote-button"]').eq(0).click() - cy.get('[data-cy="abTesting-app-variant-vote-button"]').eq(1).click() - cy.get('[data-cy="abTesting-both-bad-vote-button"]').click() + cy.get('[data-cy="evaluation-vote-panel-comparison-vote-button"]').eq(0).click() + cy.get('[data-cy="evaluation-vote-panel-comparison-vote-button"]').eq(1).click() + cy.get( + '[data-cy="evaluation-vote-panel-comparison-both-bad-vote-button-button"]', + ).click() }) }) diff --git a/agenta-web/cypress/e2e/app-navigation.cy.ts b/agenta-web/cypress/e2e/app-navigation.cy.ts index c6a3cdc04c..65d376e2b6 100644 --- a/agenta-web/cypress/e2e/app-navigation.cy.ts +++ b/agenta-web/cypress/e2e/app-navigation.cy.ts @@ -34,6 +34,11 @@ describe("App Navigation without errors", () => { //TOOD add more assertions specific to the new evaluations page }) + it("should navigate successfully to Annotations", () => { + cy.clickLinkAndWait('[data-cy="app-annotations-link"]') + cy.location("pathname").should("include", "/annotations") + }) + if (isDemo()) { it("should navigate successfully to Endpoints", () => { cy.clickLinkAndWait('[data-cy="app-endpoints-link"]') From 013a2a9ce7bec0c7d7a603e5745fc3182a93bfd4 Mon Sep 17 00:00:00 2001 From: Kaosiso Ezealigo Date: Mon, 8 Jan 2024 21:59:15 +0100 Subject: [PATCH 308/414] test: CRUD operation for evaluation and comparison --- agenta-web/cypress/e2e/eval.comparison.cy.ts | 59 ++++++++++++++++++ agenta-web/cypress/e2e/eval.evaluations.cy.ts | 60 +++++++++++++++++++ .../evaluationCompare/EvaluationCompare.tsx | 1 + .../evaluationResults/EvaluationResults.tsx | 1 + .../evaluationResults/NewEvaluationModal.tsx | 13 ++-- 5 files changed, 129 insertions(+), 5 deletions(-) create mode 100644 agenta-web/cypress/e2e/eval.comparison.cy.ts create mode 100644 agenta-web/cypress/e2e/eval.evaluations.cy.ts diff --git a/agenta-web/cypress/e2e/eval.comparison.cy.ts b/agenta-web/cypress/e2e/eval.comparison.cy.ts new file mode 100644 index 0000000000..6406aaeb1a --- /dev/null +++ b/agenta-web/cypress/e2e/eval.comparison.cy.ts @@ -0,0 +1,59 @@ +describe("Evaluators CRUD Test", function () { + let app_id + before(() => { + cy.createVariant() + cy.get("@app_id").then((appId) => { + app_id = appId + }) + cy.get('[data-cy="playground-save-changes-button"]').eq(0).click() + 
}) + + context("CRUD operation with evaluators", () => { + beforeEach(() => { + cy.visit(`/apps/${app_id}/evaluations`) + cy.location("pathname").should("include", "/evaluations") + }) + + it("should create a new Evaluation", () => { + Array.from({length: 2}).map((_) => { + cy.get('[data-cy="new-evaluation-button"]').click() + cy.get(".ant-modal-content").should("exist") + + cy.get('[data-cy="select-testset-group"]').click() + cy.get('[data-cy="select-testset-option"]').click() + + cy.get('[data-cy="select-variant-group"]').click() + cy.get('[data-cy="select-variant-option"]').eq(0).click() + cy.get('[data-cy="select-variant-group"]').click() + + cy.get('[data-cy="select-evaluators-group"]').click() + cy.get('[data-cy="select-evaluators-option"]').eq(0).click() + cy.get('[data-cy="select-evaluators-group"]').click() + + cy.get( + ".ant-modal-footer > .ant-btn-primary > .ant-btn-icon > .anticon > svg", + ).click() + cy.wait(1000) + }) + }) + + it("should create a new Evaluation", () => { + cy.get('.ag-row[row-index="0"]').should("exist") + cy.get('.ag-row[row-index="1"]').should("exist") + cy.get('.ag-cell[col-id="status"]').should("contain.text", "Completed") + }) + + it("should create a new Evaluation", () => { + cy.get("#ag-33-input").check() + cy.get("#ag-35-input").check() + cy.get(":nth-child(2) > .ant-btn > .ant-btn-icon > .anticon > svg").click() + cy.location("pathname").should("include", "/evaluations/compare") + cy.contains(/Evaluations Comparison/i) + cy.get('[data-cy="evaluation-compare-table"]').should("exist") + }) + }) + + after(() => { + cy.cleanupVariantAndTestset() + }) +}) diff --git a/agenta-web/cypress/e2e/eval.evaluations.cy.ts b/agenta-web/cypress/e2e/eval.evaluations.cy.ts new file mode 100644 index 0000000000..03c1cb0f81 --- /dev/null +++ b/agenta-web/cypress/e2e/eval.evaluations.cy.ts @@ -0,0 +1,60 @@ +describe("Evaluators CRUD Test", function () { + let app_id + before(() => { + cy.createVariant() + cy.get("@app_id").then((appId) => { + app_id = appId + }) + cy.get('[data-cy="playground-save-changes-button"]').eq(0).click() + }) + + context("CRUD operation with evaluators", () => { + beforeEach(() => { + cy.visit(`/apps/${app_id}/evaluations`) + cy.location("pathname").should("include", "/evaluations") + }) + + it("should create a new Evaluation", () => { + cy.get('[data-cy="new-evaluation-button"]').click() + cy.get(".ant-modal-content").should("exist") + + cy.get('[data-cy="select-testset-group"]').click() + cy.get('[data-cy="select-testset-option"]').click() + + cy.get('[data-cy="select-variant-group"]').click() + cy.get('[data-cy="select-variant-option"]').eq(0).click() + cy.get('[data-cy="select-variant-group"]').click() + + cy.get('[data-cy="select-evaluators-group"]').click() + cy.get('[data-cy="select-evaluators-option"]').eq(0).click() + cy.get('[data-cy="select-evaluators-group"]').click() + + cy.get(".ant-modal-footer > .ant-btn-primary > .ant-btn-icon > .anticon > svg").click() + }) + + it("should create a new Evaluation", () => { + cy.get('.ag-row[row-index="0"]').should("exist") + }) + + it("should create a new Evaluation", () => { + cy.get('[data-cy="new-evaluation-button"]').click() + cy.get(".ant-modal-content").should("exist") + + cy.get(".ant-modal-footer > .ant-btn-primary > .ant-btn-icon > .anticon > svg").click() + cy.get(".ant-modal-content").should("contain.text", "This field is required") + }) + + it("should delete an Evaluation", () => { + /* ==== Generated with Cypress Studio ==== */ + cy.get(".ag-root-wrapper").should("exist") + 
cy.get("#ag-4-input").check() + cy.get(".ant-space > :nth-child(1) > .ant-btn").click() + cy.get(".ant-modal-confirm-btns > :nth-child(2) > span").click() + /* ==== End Cypress Studio ==== */ + }) + }) + + after(() => { + cy.cleanupVariantAndTestset() + }) +}) diff --git a/agenta-web/src/components/pages/evaluations/evaluationCompare/EvaluationCompare.tsx b/agenta-web/src/components/pages/evaluations/evaluationCompare/EvaluationCompare.tsx index 85c002ef60..2eb709169e 100644 --- a/agenta-web/src/components/pages/evaluations/evaluationCompare/EvaluationCompare.tsx +++ b/agenta-web/src/components/pages/evaluations/evaluationCompare/EvaluationCompare.tsx @@ -225,6 +225,7 @@ const EvaluationCompareMode: React.FC = () => { className={`${ appTheme === "dark" ? "ag-theme-alpine-dark" : "ag-theme-alpine" } ${classes.table}`} + data-cy="evaluation-compare-table" > ref={gridRef as any} diff --git a/agenta-web/src/components/pages/evaluations/evaluationResults/EvaluationResults.tsx b/agenta-web/src/components/pages/evaluations/evaluationResults/EvaluationResults.tsx index 0856d01432..cd95eb7192 100644 --- a/agenta-web/src/components/pages/evaluations/evaluationResults/EvaluationResults.tsx +++ b/agenta-web/src/components/pages/evaluations/evaluationResults/EvaluationResults.tsx @@ -310,6 +310,7 @@ const EvaluationResults: React.FC = () => { icon={} type="primary" onClick={() => setNewEvalModalOpen(true)} + data-cy="new-evaluation-button" > New Evaluation diff --git a/agenta-web/src/components/pages/evaluations/evaluationResults/NewEvaluationModal.tsx b/agenta-web/src/components/pages/evaluations/evaluationResults/NewEvaluationModal.tsx index 9fe27ce0c8..beadc19417 100644 --- a/agenta-web/src/components/pages/evaluations/evaluationResults/NewEvaluationModal.tsx +++ b/agenta-web/src/components/pages/evaluations/evaluationResults/NewEvaluationModal.tsx @@ -133,27 +133,29 @@ const NewEvaluationModal: React.FC = ({onSuccess, ...props}) => { label="Which testset do you want to use?" rules={[{required: true, message: "This field is required"}]} > - {testSets.map((testSet) => ( - + {testSet.name} ))} + - {variants.map((variant) => ( - + {variant.variantName} ))} + = ({onSuccess, ...props}) => { false ) }} + data-cy="select-evaluators-group" > {evaluatorConfigs.map((config) => { const evaluator = evaluators.find( (item) => item.key === config.evaluator_key, )! return ( - +
{evaluator.icon_url && ( From 6ea59ae8a15bf085dc29babbbee1b16edd2f7298 Mon Sep 17 00:00:00 2001 From: Kaosiso Ezealigo Date: Mon, 8 Jan 2024 22:34:31 +0100 Subject: [PATCH 309/414] test: eval scenarios --- agenta-web/cypress/e2e/eval.scenarios.cy.ts | 48 +++++++++++++++++++ .../modals/CreateAppStatusModal.tsx | 4 +- .../evaluationResults/NewEvaluationModal.tsx | 24 ++++++++-- .../EvaluationScenarios.tsx | 1 + 4 files changed, 71 insertions(+), 6 deletions(-) create mode 100644 agenta-web/cypress/e2e/eval.scenarios.cy.ts diff --git a/agenta-web/cypress/e2e/eval.scenarios.cy.ts b/agenta-web/cypress/e2e/eval.scenarios.cy.ts new file mode 100644 index 0000000000..4ea9b5df52 --- /dev/null +++ b/agenta-web/cypress/e2e/eval.scenarios.cy.ts @@ -0,0 +1,48 @@ +describe("Evaluators CRUD Test", function () { + let app_id + before(() => { + cy.createVariant() + cy.get("@app_id").then((appId) => { + app_id = appId + }) + cy.get('[data-cy="playground-save-changes-button"]').eq(0).click() + }) + + context("CRUD operation with evaluators", () => { + beforeEach(() => { + cy.visit(`/apps/${app_id}/evaluations`) + cy.location("pathname").should("include", "/evaluations") + }) + + it("should create a new evaluator", () => { + cy.get('[data-cy="new-evaluation-button"]').click() + cy.get(".ant-modal-content").should("exist") + + cy.get('[data-cy="select-testset-group"]').click() + cy.get('[data-cy="select-testset-option"]').click() + + cy.get('[data-cy="select-variant-group"]').click() + cy.get('[data-cy="select-variant-option"]').eq(0).click() + cy.get('[data-cy="select-variant-group"]').click() + + cy.get('[data-cy="select-evaluators-group"]').click() + cy.get('[data-cy="select-evaluators-option"]').eq(0).click() + cy.get('[data-cy="select-evaluators-group"]').click() + + cy.get(".ant-modal-footer > .ant-btn-primary > .ant-btn-icon > .anticon > svg").click() + }) + + it("should create a new Evaluation", () => { + cy.get('.ag-row[row-index="0"]').should("exist") + cy.get('.ag-cell[col-id="status"]').should("contain.text", "Completed") + cy.get('.ag-row-first > [col-id="aggregated_results"]').click() + cy.get(".ag-cell-focus").dblclick() + cy.contains(/Evaluation Results/i) + cy.get('[data-cy="evalaution-scenarios-table"]').should("exist") + }) + }) + + after(() => { + cy.cleanupVariantAndTestset() + }) +}) diff --git a/agenta-web/src/components/AppSelector/modals/CreateAppStatusModal.tsx b/agenta-web/src/components/AppSelector/modals/CreateAppStatusModal.tsx index cb165d4b04..3a7d655407 100644 --- a/agenta-web/src/components/AppSelector/modals/CreateAppStatusModal.tsx +++ b/agenta-web/src/components/AppSelector/modals/CreateAppStatusModal.tsx @@ -196,8 +196,8 @@ const CreateAppStatusModal: React.FC> type === "success" ? "success" : type === "error" - ? "danger" - : "secondary" + ? "danger" + : "secondary" } strong={Object.keys(messages)[ix] === "success"} > diff --git a/agenta-web/src/components/pages/evaluations/evaluationResults/NewEvaluationModal.tsx b/agenta-web/src/components/pages/evaluations/evaluationResults/NewEvaluationModal.tsx index beadc19417..a486c21082 100644 --- a/agenta-web/src/components/pages/evaluations/evaluationResults/NewEvaluationModal.tsx +++ b/agenta-web/src/components/pages/evaluations/evaluationResults/NewEvaluationModal.tsx @@ -135,7 +135,11 @@ const NewEvaluationModal: React.FC = ({onSuccess, ...props}) => { > +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
- Sameh Methnani: 💻 📖
- Suad Suljovic: 💻 🎨 🧑‍🏫 👀
- burtenshaw: 💻
- Abram: 💻 📖
- Israel Abebe: 🐛 🎨 💻
- Master X: 💻
- corinthian: 💻 🎨
- Pavle Janjusevic: 🚇
- Kaosi Ezealigo: 🐛 💻
- Alberto Nunes: 🐛
- Maaz Bin Khawar: 💻 👀 🧑‍🏫
- Nehemiah Onyekachukwu Emmanuel: 💻 💡 📖
- Philip Okiokio: 📖
- Abhinav Pandey: 💻
- Ramchandra Warang: 💻 🐛
- Biswarghya Biswas: 💻
- Uddeepta Raaj Kashyap: 💻
- Nayeem Abdullah: 💻
- Kang Suhyun: 💻
- Yoon: 💻
- Kirthi Bagrecha Jain: 💻
- Navdeep: 💻
- Rhythm Sharma: 💻
- Osinachi Chukwujama: 💻
- 莫尔索: 📖
- Agunbiade Adedeji: 💻
- Emmanuel Oloyede: 💻 📖
- Dhaneshwarguiyan: 💻
- Priyanshu Prajapati: 📖
- Raviteja: 💻
- Arijit: 💻
- Yachika9925: 📖
- Aldrin: ⚠️
- seungduk.kim.2304: 💻
- Andrei Dragomir: 💻
- diego: 💻
- brockWith: 💻
- Dennis Zelada: 💻
- Romain Brucker: 💻
+ + + + + + +This project follows the [all-contributors](https://github.com/all-contributors/all-contributors) specification. Contributions of any kind are welcome! + +**Attribution**: Testing icons created by [Freepik - Flaticon](https://www.flaticon.com/free-icons/testing) diff --git a/agenta-cli/agenta/__init__.py b/agenta-cli/agenta/__init__.py index 8a683f19e1..b46ab36d5e 100644 --- a/agenta-cli/agenta/__init__.py +++ b/agenta-cli/agenta/__init__.py @@ -14,5 +14,6 @@ ) from .sdk.utils.preinit import PreInitObject from .sdk.agenta_init import Config, init +from .sdk.utils.helper.openai_cost import calculate_token_usage config = PreInitObject("agenta.config", Config) diff --git a/agenta-cli/agenta/sdk/__init__.py b/agenta-cli/agenta/sdk/__init__.py index ebd87f40ba..672e09511c 100644 --- a/agenta-cli/agenta/sdk/__init__.py +++ b/agenta-cli/agenta/sdk/__init__.py @@ -15,5 +15,6 @@ BinaryParam, ) from .agenta_init import Config, init +from .utils.helper.openai_cost import calculate_token_usage config = PreInitObject("agenta.config", Config) diff --git a/agenta-cli/agenta/sdk/agenta_decorator.py b/agenta-cli/agenta/sdk/agenta_decorator.py index 677b9c0ae2..813a3a61d8 100644 --- a/agenta-cli/agenta/sdk/agenta_decorator.py +++ b/agenta-cli/agenta/sdk/agenta_decorator.py @@ -1,17 +1,18 @@ """The code for the Agenta SDK""" import os import sys +import time import inspect import argparse import traceback import functools from pathlib import Path from tempfile import NamedTemporaryFile -from typing import Any, Callable, Dict, Optional, Tuple, List +from typing import Any, Callable, Dict, Optional, Tuple, List, Union from fastapi import Body, FastAPI, UploadFile -from fastapi.middleware.cors import CORSMiddleware from fastapi.responses import JSONResponse +from fastapi.middleware.cors import CORSMiddleware import agenta from .context import save_context @@ -26,6 +27,7 @@ TextParam, MessagesInput, FileInputURL, + FuncResponse, BinaryParam, ) @@ -91,7 +93,7 @@ async def wrapper_deployed(*args, **kwargs) -> Any: update_function_signature(wrapper, func_signature, config_params, ingestible_files) route = f"/{endpoint_name}" - app.post(route)(wrapper) + app.post(route, response_model=FuncResponse)(wrapper) update_deployed_function_signature( wrapper_deployed, @@ -99,7 +101,7 @@ async def wrapper_deployed(*args, **kwargs) -> Any: ingestible_files, ) route_deployed = f"/{endpoint_name}_deployed" - app.post(route_deployed)(wrapper_deployed) + app.post(route_deployed, response_model=FuncResponse)(wrapper_deployed) override_schema( openapi_schema=app.openapi(), func_name=func.__name__, @@ -149,7 +151,9 @@ def ingest_files( func_params[name] = ingest_file(func_params[name]) -async def execute_function(func: Callable[..., Any], *args, **func_params) -> Any: +async def execute_function( + func: Callable[..., Any], *args, **func_params +) -> Union[Dict[str, Any], JSONResponse]: """Execute the function and handle any exceptions.""" try: @@ -159,14 +163,20 @@ async def execute_function(func: Callable[..., Any], *args, **func_params) -> An it awaits their execution. 
""" is_coroutine_function = inspect.iscoroutinefunction(func) + start_time = time.perf_counter() if is_coroutine_function: result = await func(*args, **func_params) else: result = func(*args, **func_params) + end_time = time.perf_counter() + latency = end_time - start_time if isinstance(result, Context): save_context(result) - return result + if isinstance(result, Dict): + return FuncResponse(**result, latency=round(latency, 4)).dict() + if isinstance(result, str): + return FuncResponse(message=result, latency=round(latency, 4)).dict() except Exception as e: return handle_exception(e) diff --git a/agenta-cli/agenta/sdk/agenta_init.py b/agenta-cli/agenta/sdk/agenta_init.py index 6803fecd80..df7ebb4709 100644 --- a/agenta-cli/agenta/sdk/agenta_init.py +++ b/agenta-cli/agenta/sdk/agenta_init.py @@ -1,3 +1,5 @@ +from agenta.client.exceptions import APIRequestError +from agenta.client.backend.client import AgentaApi import os import logging from typing import Any, Optional @@ -7,8 +9,6 @@ logger = logging.getLogger(__name__) logger.setLevel(logging.DEBUG) -from agenta.client.backend.client import AgentaApi -from agenta.client.exceptions import APIRequestError BACKEND_URL_SUFFIX = os.environ.get("BACKEND_URL_SUFFIX", "api") CLIENT_API_KEY = os.environ.get("AGENTA_API_KEY") @@ -104,11 +104,11 @@ def __init__(self, base_id, host): else: self.persist = True - def register_default(self, overwrite=True, **kwargs): + def register_default(self, overwrite=False, **kwargs): """alias for default""" return self.default(overwrite=overwrite, **kwargs) - def default(self, overwrite=True, **kwargs): + def default(self, overwrite=False, **kwargs): """Saves the default parameters to the app_name and base_name in case they are not already saved. Args: overwrite: Whether to overwrite the existing configuration or not diff --git a/agenta-cli/agenta/sdk/types.py b/agenta-cli/agenta/sdk/types.py index 408c6ade3e..a3f6b39eba 100644 --- a/agenta-cli/agenta/sdk/types.py +++ b/agenta-cli/agenta/sdk/types.py @@ -1,5 +1,5 @@ import json -from typing import Any, Dict, List +from typing import Any, Dict, List, Optional from pydantic import BaseModel, Extra, HttpUrl, Field @@ -10,6 +10,19 @@ def __init__(self, file_name: str, file_path: str): self.file_path = file_path +class LLMTokenUsage(BaseModel): + completion_tokens: int + prompt_tokens: int + total_tokens: int + + +class FuncResponse(BaseModel): + message: str + usage: Optional[LLMTokenUsage] + cost: Optional[float] + latency: float + + class DictInput(dict): def __new__(cls, default_keys=None): instance = super().__new__(cls, default_keys) diff --git a/agenta-cli/agenta/sdk/utils/helper/openai_cost.py b/agenta-cli/agenta/sdk/utils/helper/openai_cost.py new file mode 100644 index 0000000000..8473dee57f --- /dev/null +++ b/agenta-cli/agenta/sdk/utils/helper/openai_cost.py @@ -0,0 +1,166 @@ +# https://raw.githubusercontent.com/langchain-ai/langchain/23eb480c3866db8693a3a2d63b787c898c54bb35/libs/community/langchain_community/callbacks/openai_info.py +MODEL_COST_PER_1K_TOKENS = { + # GPT-4 input + "gpt-4": 0.03, + "gpt-4-0314": 0.03, + "gpt-4-0613": 0.03, + "gpt-4-32k": 0.06, + "gpt-4-32k-0314": 0.06, + "gpt-4-32k-0613": 0.06, + "gpt-4-vision-preview": 0.01, + "gpt-4-1106-preview": 0.01, + # GPT-4 output + "gpt-4-completion": 0.06, + "gpt-4-0314-completion": 0.06, + "gpt-4-0613-completion": 0.06, + "gpt-4-32k-completion": 0.12, + "gpt-4-32k-0314-completion": 0.12, + "gpt-4-32k-0613-completion": 0.12, + "gpt-4-vision-preview-completion": 0.03, + 
"gpt-4-1106-preview-completion": 0.03, + # GPT-3.5 input + "gpt-3.5-turbo": 0.0015, + "gpt-3.5-turbo-0301": 0.0015, + "gpt-3.5-turbo-0613": 0.0015, + "gpt-3.5-turbo-1106": 0.001, + "gpt-3.5-turbo-instruct": 0.0015, + "gpt-3.5-turbo-16k": 0.003, + "gpt-3.5-turbo-16k-0613": 0.003, + # GPT-3.5 output + "gpt-3.5-turbo-completion": 0.002, + "gpt-3.5-turbo-0301-completion": 0.002, + "gpt-3.5-turbo-0613-completion": 0.002, + "gpt-3.5-turbo-1106-completion": 0.002, + "gpt-3.5-turbo-instruct-completion": 0.002, + "gpt-3.5-turbo-16k-completion": 0.004, + "gpt-3.5-turbo-16k-0613-completion": 0.004, + # Azure GPT-35 input + "gpt-35-turbo": 0.0015, # Azure OpenAI version of ChatGPT + "gpt-35-turbo-0301": 0.0015, # Azure OpenAI version of ChatGPT + "gpt-35-turbo-0613": 0.0015, + "gpt-35-turbo-instruct": 0.0015, + "gpt-35-turbo-16k": 0.003, + "gpt-35-turbo-16k-0613": 0.003, + # Azure GPT-35 output + "gpt-35-turbo-completion": 0.002, # Azure OpenAI version of ChatGPT + "gpt-35-turbo-0301-completion": 0.002, # Azure OpenAI version of ChatGPT + "gpt-35-turbo-0613-completion": 0.002, + "gpt-35-turbo-instruct-completion": 0.002, + "gpt-35-turbo-16k-completion": 0.004, + "gpt-35-turbo-16k-0613-completion": 0.004, + # Others + "text-ada-001": 0.0004, + "ada": 0.0004, + "text-babbage-001": 0.0005, + "babbage": 0.0005, + "text-curie-001": 0.002, + "curie": 0.002, + "text-davinci-003": 0.02, + "text-davinci-002": 0.02, + "code-davinci-002": 0.02, + # Fine Tuned input + "babbage-002-finetuned": 0.0016, + "davinci-002-finetuned": 0.012, + "gpt-3.5-turbo-0613-finetuned": 0.012, + # Fine Tuned output + "babbage-002-finetuned-completion": 0.0016, + "davinci-002-finetuned-completion": 0.012, + "gpt-3.5-turbo-0613-finetuned-completion": 0.016, + # Azure Fine Tuned input + "babbage-002-azure-finetuned": 0.0004, + "davinci-002-azure-finetuned": 0.002, + "gpt-35-turbo-0613-azure-finetuned": 0.0015, + # Azure Fine Tuned output + "babbage-002-azure-finetuned-completion": 0.0004, + "davinci-002-azure-finetuned-completion": 0.002, + "gpt-35-turbo-0613-azure-finetuned-completion": 0.002, + # Legacy fine-tuned models + "ada-finetuned-legacy": 0.0016, + "babbage-finetuned-legacy": 0.0024, + "curie-finetuned-legacy": 0.012, + "davinci-finetuned-legacy": 0.12, +} + + +def standardize_model_name( + model_name: str, + is_completion: bool = False, +) -> str: + """ + Standardize the model name to a format that can be used in the OpenAI API. + + Args: + model_name: Model name to standardize. + is_completion: Whether the model is used for completion or not. + Defaults to False. + + Returns: + Standardized model name. + """ + + model_name = model_name.lower() + if ".ft-" in model_name: + model_name = model_name.split(".ft-")[0] + "-azure-finetuned" + if ":ft-" in model_name: + model_name = model_name.split(":")[0] + "-finetuned-legacy" + if "ft:" in model_name: + model_name = model_name.split(":")[1] + "-finetuned" + if is_completion and ( + model_name.startswith("gpt-4") + or model_name.startswith("gpt-3.5") + or model_name.startswith("gpt-35") + or ("finetuned" in model_name and "legacy" not in model_name) + ): + return model_name + "-completion" + else: + return model_name + + +def get_openai_token_cost_for_model( + model_name: str, num_tokens: int, is_completion: bool = False +) -> float: + """ + Get the cost in USD for a given model and number of tokens. + + Args: + model_name: Name of the model + num_tokens: Number of tokens. + is_completion: Whether the model is used for completion or not. + Defaults to False. 
+ + Returns: + Cost in USD. + """ + + model_name = standardize_model_name(model_name, is_completion=is_completion) + if model_name not in MODEL_COST_PER_1K_TOKENS: + raise ValueError( + f"Unknown model: {model_name}. Please provide a valid OpenAI model name." + "Known models are: " + ", ".join(MODEL_COST_PER_1K_TOKENS.keys()) + ) + return MODEL_COST_PER_1K_TOKENS[model_name] * (num_tokens / 1000) + + +def calculate_token_usage(model_name: str, token_usage: dict) -> float: + """Calculates the total cost of using a language model based on the model name and token + usage. + + Args: + model_name: The name of the model used to determine the cost per token. + token_usage: Contains information about the usage of tokens for a particular model. + + Returns: + Total cost of using a model. + """ + + completion_tokens = token_usage.get("completion_tokens", 0) + prompt_tokens = token_usage.get("prompt_tokens", 0) + model_name = standardize_model_name(model_name) + if model_name in MODEL_COST_PER_1K_TOKENS: + completion_cost = get_openai_token_cost_for_model( + model_name, completion_tokens, is_completion=True + ) + prompt_cost = get_openai_token_cost_for_model(model_name, prompt_tokens) + total_cost = prompt_cost + completion_cost + return total_cost + return 0 diff --git a/agenta-web/src/components/AppSelector/modals/WriteOwnAppModal.tsx b/agenta-web/src/components/AppSelector/modals/WriteOwnAppModal.tsx index 0e9fa28692..a0242f6e8c 100644 --- a/agenta-web/src/components/AppSelector/modals/WriteOwnAppModal.tsx +++ b/agenta-web/src/components/AppSelector/modals/WriteOwnAppModal.tsx @@ -192,7 +192,7 @@ const WriteOwnAppModal: React.FC = ({...props}) => {
Check out{" "} - + our tutorial for writing your first LLM app diff --git a/agenta-web/src/components/EvaluationTable/ABTestingEvaluationTable.tsx b/agenta-web/src/components/EvaluationTable/ABTestingEvaluationTable.tsx index 163e490af8..fb6331e2a8 100644 --- a/agenta-web/src/components/EvaluationTable/ABTestingEvaluationTable.tsx +++ b/agenta-web/src/components/EvaluationTable/ABTestingEvaluationTable.tsx @@ -21,6 +21,7 @@ import {testsetRowToChatMessages} from "@/lib/helpers/testset" import EvaluationVotePanel from "../Evaluations/EvaluationCardView/EvaluationVotePanel" import VariantAlphabet from "../Evaluations/EvaluationCardView/VariantAlphabet" import {ParamsFormWithRun} from "./SingleModelEvaluationTable" +import {PassThrough} from "stream" const {Title} = Typography @@ -238,6 +239,9 @@ const ABTestingEvaluationTable: React.FC = ({ ? testsetRowToChatMessages(evaluation.testset.csvdata[rowIndex], false) : [], ) + if (typeof result !== "string") { + result = result.message + } setRowValue(rowIndex, variant.variantId, result) ;(outputs as KeyValuePair)[variant.variantId] = result diff --git a/agenta-web/src/components/EvaluationTable/SingleModelEvaluationTable.tsx b/agenta-web/src/components/EvaluationTable/SingleModelEvaluationTable.tsx index 679e0b2fd8..a1ae8ecd76 100644 --- a/agenta-web/src/components/EvaluationTable/SingleModelEvaluationTable.tsx +++ b/agenta-web/src/components/EvaluationTable/SingleModelEvaluationTable.tsx @@ -298,6 +298,9 @@ const SingleModelEvaluationTable: React.FC = ({ ? testsetRowToChatMessages(evaluation.testset.csvdata[rowIndex], false) : [], ) + if (typeof result !== "string") { + result = result.message + } setRowValue(rowIndex, variant.variantId, result) ;(outputs as KeyValuePair)[variant.variantId] = result diff --git a/agenta-web/src/components/Playground/Views/TestView.tsx b/agenta-web/src/components/Playground/Views/TestView.tsx index fd61056d37..31e7fc4b77 100644 --- a/agenta-web/src/components/Playground/Views/TestView.tsx +++ b/agenta-web/src/components/Playground/Views/TestView.tsx @@ -98,6 +98,11 @@ interface BoxComponentProps { inputParams: Parameter[] | null testData: GenericObject result: string + additionalData: { + cost: number | null + latency: number | null + usage: {completion_tokens: number; prompt_tokens: number; total_tokens: number} | null + } onInputParamChange: (paramName: string, newValue: any) => void onRun: () => void onAddToTestset: (params: Record) => void @@ -110,6 +115,7 @@ const BoxComponent: React.FC = ({ inputParams, testData, result, + additionalData, onInputParamChange, onRun, onAddToTestset, @@ -162,6 +168,30 @@ const BoxComponent: React.FC = ({ imageSize="large" /> + {additionalData?.cost || additionalData?.latency ? ( + +

+ Tokens:{" "} + {additionalData.usage !== null + ? JSON.stringify(additionalData.usage.total_tokens) + : 0} +

+

+ Cost:{" "} + {additionalData.cost !== null + ? `$${additionalData.cost.toFixed(4)}` + : "$0.00"} +

+

+ Latency:{" "} + {additionalData.latency !== null + ? `${Math.round(additionalData.latency * 1000)}ms` + : "0ms"} +

+
+ ) : ( + "" + )}
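The block above surfaces the usage, cost and latency fields that the SDK's new FuncResponse carries. A minimal sketch of how a variant run could assemble that payload, reusing the models added in agenta/sdk/types.py (token counts and the cost figure are placeholder values):

    import time
    from typing import Optional
    from pydantic import BaseModel

    class LLMTokenUsage(BaseModel):
        completion_tokens: int
        prompt_tokens: int
        total_tokens: int

    class FuncResponse(BaseModel):
        message: str
        usage: Optional[LLMTokenUsage]
        cost: Optional[float]
        latency: float

    start = time.perf_counter()
    message = "Agenta is an open-source LLMOps platform."  # stands in for the actual LLM call
    latency = time.perf_counter() - start

    response = FuncResponse(
        message=message,
        usage=LLMTokenUsage(prompt_tokens=8, completion_tokens=12, total_tokens=20),
        cost=0.0004,  # placeholder value
        latency=round(latency, 4),
    ).dict()
    # response["usage"]["total_tokens"], response["cost"] and response["latency"]
    # are the values the playground panel above renders.

Because a route may still return a plain string, the evaluation tables guard with typeof result !== "string" before reading result.message, so both the old and the new response shapes keep working.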
@@ -363,6 +364,7 @@ const AddToTestSetDrawer: React.FC = ({params, isChatVariant, ...props}) }} disableAdd disableRemove + disableEditRole />
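The dollar amount shown in that panel comes from the pricing helper added under agenta/sdk/utils/helper/openai_cost.py. A small usage sketch, assuming the module is importable from that path and using made-up token counts for a gpt-3.5-turbo call:

    from agenta.sdk.utils.helper.openai_cost import calculate_token_usage

    token_usage = {"prompt_tokens": 250, "completion_tokens": 120, "total_tokens": 370}
    cost = calculate_token_usage("gpt-3.5-turbo", token_usage)
    # prompt: 250/1000 * 0.0015 = 0.000375, completion: 120/1000 * 0.002 = 0.00024
    print(round(cost, 6))  # 0.000615

calculate_token_usage returns 0 for a model that is missing from MODEL_COST_PER_1K_TOKENS, whereas get_openai_token_cost_for_model raises a ValueError for an unknown name.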
diff --git a/agenta-web/src/components/pages/evaluations/evaluationScenarios/EvaluationScenarios.tsx b/agenta-web/src/components/pages/evaluations/evaluationScenarios/EvaluationScenarios.tsx index d9f6d0a237..8fd84351aa 100644 --- a/agenta-web/src/components/pages/evaluations/evaluationScenarios/EvaluationScenarios.tsx +++ b/agenta-web/src/components/pages/evaluations/evaluationScenarios/EvaluationScenarios.tsx @@ -53,6 +53,7 @@ const EvaluationScenarios: React.FC = () => { scenarios[0]?.inputs.forEach((input, index) => { colDefs.push({ flex: 1, + minWidth: 240, headerName: `Input: ${input.name}`, ...getFilterParams(input.type === "number" ? "number" : "text"), field: `inputs.${index}`, @@ -64,6 +65,7 @@ const EvaluationScenarios: React.FC = () => { }) colDefs.push({ flex: 1, + minWidth: 300, headerName: "Expected Output", field: "correct_answer", ...getFilterParams("text"), @@ -71,10 +73,13 @@ const EvaluationScenarios: React.FC = () => { return params.data?.correct_answer?.toString() || "" }, cellRenderer: LongTextCellRenderer, + // wrapText: true, + // autoHeight: true, }) evalaution?.variants.forEach((_, index) => { colDefs.push({ flex: 1, + minWidth: 300, headerName: "Output", ...getFilterParams("text"), field: `outputs.0`, @@ -82,6 +87,8 @@ const EvaluationScenarios: React.FC = () => { return getTypedValue(params.data?.outputs[index]) }, cellRenderer: LongTextCellRenderer, + // wrapText: true, + // autoHeight: true, }) }) scenarios[0]?.evaluators_configs.forEach((config) => { diff --git a/agenta-web/src/lib/helpers/testset.ts b/agenta-web/src/lib/helpers/testset.ts index 820aa3ce2b..600945e69b 100644 --- a/agenta-web/src/lib/helpers/testset.ts +++ b/agenta-web/src/lib/helpers/testset.ts @@ -9,29 +9,19 @@ const isObjectChatMessage = (obj: GenericObject) => { // TODO: the logic to determine if a testset is chatbase should be improved /** - * @returns the key of the column which contains the chat messages, "" if testset is not chat bases + * @returns the key of the column which contains the chat messages, "" if testset is not chat based */ export function getTestsetChatColumn(csvData: KeyValuePair[]) { let columnKey = "" if (!csvData.length) return columnKey - const {correct_answer, ...restCols} = csvData[0] - let isCorrectAnswerChat = false + const cols = csvData[0] - // check if correct_answer is a chat message object - if (correct_answer) { - const parsedCorrectAnswer = safeParse(correct_answer) - if (parsedCorrectAnswer && isObjectChatMessage(parsedCorrectAnswer)) - isCorrectAnswerChat = true - } - - //check if any col other than correct_answer is an array of chat messages - for (const [key, col] of Object.entries(restCols)) { + //check if any col is an array of chat messages + for (const [key, col] of Object.entries(cols)) { const parsedCol = safeParse(col) if (Array.isArray(parsedCol) && parsedCol.every(isObjectChatMessage)) { - if (isCorrectAnswerChat) { - columnKey = key - } + columnKey = key break } } @@ -48,7 +38,7 @@ export function testsetRowToChatMessages(rowData: KeyValuePair, includeCorrectAn let chat = safeParse(rowData[chatColumn], []) if (includeCorrectAnswer) { - chat = chat.concat([safeParse(rowData.correct_answer, defaultNewMessage)]) + chat = chat.concat([{content: rowData.correct_answer || "", role: ChatRole.Assistant}]) } return chat.map((item: KeyValuePair) => ({...item, id: uuidv4()})) From 5bbadfe63ff4f472a3d03184a743a8f5e04dbe1e Mon Sep 17 00:00:00 2001 From: MohammedMaaz Date: Thu, 11 Jan 2024 00:07:37 +0500 Subject: [PATCH 352/414] added lm_providers_keys 
field in create evaluation payload --- .../evaluations/evaluationResults/NewEvaluationModal.tsx | 8 ++++++-- agenta-web/src/services/evaluations/index.ts | 2 ++ 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/agenta-web/src/components/pages/evaluations/evaluationResults/NewEvaluationModal.tsx b/agenta-web/src/components/pages/evaluations/evaluationResults/NewEvaluationModal.tsx index a486c21082..8169c888aa 100644 --- a/agenta-web/src/components/pages/evaluations/evaluationResults/NewEvaluationModal.tsx +++ b/agenta-web/src/components/pages/evaluations/evaluationResults/NewEvaluationModal.tsx @@ -1,6 +1,7 @@ import {useAppId} from "@/hooks/useAppId" import {JSSTheme, Variant, LLMRunRateLimit, testset} from "@/lib/Types" import {evaluatorConfigsAtom, evaluatorsAtom} from "@/lib/atoms/evaluation" +import {getApikeys} from "@/lib/helpers/utils" import {fetchTestsets, fetchVariants} from "@/lib/services/api" import {CreateEvaluationData, createEvalutaiton} from "@/services/evaluations" import {PlusOutlined, QuestionCircleOutlined} from "@ant-design/icons" @@ -104,8 +105,11 @@ const NewEvaluationModal: React.FC = ({onSuccess, ...props}) => { const onSubmit = (values: CreateEvaluationData) => { setSubmitLoading(true) - const EvaluationRateLimit: LLMRunRateLimit = rateLimitValues - createEvalutaiton(appId, {...values, rate_limit: EvaluationRateLimit}) + createEvalutaiton(appId, { + ...values, + rate_limit: rateLimitValues, + lm_providers_keys: {open_ai: getApikeys()}, + }) .then(onSuccess) .catch(console.error) .finally(() => setSubmitLoading(false)) diff --git a/agenta-web/src/services/evaluations/index.ts b/agenta-web/src/services/evaluations/index.ts index 9a19bb98a9..875fbb57f3 100644 --- a/agenta-web/src/services/evaluations/index.ts +++ b/agenta-web/src/services/evaluations/index.ts @@ -6,6 +6,7 @@ import { EvaluationStatus, Evaluator, EvaluatorConfig, + KeyValuePair, LLMRunRateLimit, TestSet, _Evaluation, @@ -121,6 +122,7 @@ export type CreateEvaluationData = { variant_ids: string[] evaluators_configs: string[] rate_limit: LLMRunRateLimit + lm_providers_keys: KeyValuePair } export const createEvalutaiton = async (appId: string, evaluation: CreateEvaluationData) => { return axios.post(`/api/evaluations/`, {...evaluation, app_id: appId}) From bcb0df21e7ff7c661076188d4c5214815587f20b Mon Sep 17 00:00:00 2001 From: Akrem Abayed Date: Thu, 11 Jan 2024 11:52:40 +0100 Subject: [PATCH 353/414] fixes --- .../models/api/evaluation_model.py | 2 +- .../routers/evaluation_router.py | 18 +---- .../agenta_backend/services/db_manager.py | 2 +- .../services/evaluators_service.py | 65 +++++++------------ 4 files changed, 27 insertions(+), 60 deletions(-) diff --git a/agenta-backend/agenta_backend/models/api/evaluation_model.py b/agenta-backend/agenta_backend/models/api/evaluation_model.py index 70cbf695f6..15a48e2fd4 100644 --- a/agenta-backend/agenta_backend/models/api/evaluation_model.py +++ b/agenta-backend/agenta_backend/models/api/evaluation_model.py @@ -224,7 +224,7 @@ class LLMRunRateLimit(BaseModel): class LMProvidersEnum(str, Enum): - open_ai = "open_ai" + openai = "openai" class NewEvaluation(BaseModel): diff --git a/agenta-backend/agenta_backend/routers/evaluation_router.py b/agenta-backend/agenta_backend/routers/evaluation_router.py index f3f8c6a95b..cc7250b743 100644 --- a/agenta-backend/agenta_backend/routers/evaluation_router.py +++ b/agenta-backend/agenta_backend/routers/evaluation_router.py @@ -21,7 +21,7 @@ from agenta_backend.utils.common import check_access_to_app from 
agenta_backend.services.db_manager import ( - check_if_ai_critique_exists_in_list_of_evlautors_configs, + check_if_ai_critique_exists_in_list_of_evaluators_configs, ) if os.environ["FEATURE_FLAG"] in ["cloud", "ee"]: @@ -64,25 +64,13 @@ async def create_evaluation( if app is None: raise HTTPException(status_code=404, detail="App not found") - if await check_if_ai_critique_exists_in_list_of_evlautors_configs( + if await check_if_ai_critique_exists_in_list_of_evaluators_configs( payload.evaluators_configs ): # Check if lm_providers_keys is provided and not empty if not payload.lm_providers_keys: return JSONResponse( - {"detail": "Missing Key"}, - status_code=400, - ) - - # Check if there is an OpenAI key and if it starts with 'sk-' - has_valid_open_ai_key = any( - key == LMProvidersEnum.open_ai and value.startswith("sk-") - for key, value in payload.lm_providers_keys.items() - ) - - if not has_valid_open_ai_key: - return JSONResponse( - {"detail": "Invalid or missing OpenAI key"}, + {"detail": "Missing LM provider Key"}, status_code=400, ) diff --git a/agenta-backend/agenta_backend/services/db_manager.py b/agenta-backend/agenta_backend/services/db_manager.py index ce74f8139c..adbac48879 100644 --- a/agenta-backend/agenta_backend/services/db_manager.py +++ b/agenta-backend/agenta_backend/services/db_manager.py @@ -1715,7 +1715,7 @@ async def fetch_evaluator_config(evaluator_config_id: str): raise e -async def check_if_ai_critique_exists_in_list_of_evlautors_configs( +async def check_if_ai_critique_exists_in_list_of_evaluators_configs( evaluators_configs_ids: List[str], ) -> bool: """Fetch evaluator configurations from the database. diff --git a/agenta-backend/agenta_backend/services/evaluators_service.py b/agenta-backend/agenta_backend/services/evaluators_service.py index d4ac842a5a..15c1d7832f 100644 --- a/agenta-backend/agenta_backend/services/evaluators_service.py +++ b/agenta-backend/agenta_backend/services/evaluators_service.py @@ -123,68 +123,47 @@ def auto_ai_critique( settings_values: Dict[str, Any], lm_providers_keys: Dict[str, Any], ) -> str: - """Evaluate a response using an AI critique based on provided - - An evaluation prompt, - - An LLM App prompt, - - An LLM App output, - - a correct answer. + """ + Evaluate a response using an AI critique based on provided inputs, output, correct answer, app parameters, and settings. Args: - llm_app_prompt_template (str): the prompt template of the llm app variant - llm_app_inputs (list): parameters - correct_answer (str): correct answer - variant_output (str): the output of an ll app variant with given parameters - evaluation_prompt_template (str): evaluation prompt set by an agenta user in the ai evaluation view + inputs (Dict[str, Any]): Input parameters for the LLM app variant. + output (str): The output of the LLM app variant. + correct_answer (str): Correct answer for evaluation. + app_params (Dict[str, Any]): Application parameters. + settings_values (Dict[str, Any]): Settings for the evaluation. + lm_providers_keys (Dict[str, Any]): Keys for language model providers. Returns: - str: returns an evaluation + str: Evaluation result. 
""" + llm = OpenAI( - openai_api_key=lm_providers_keys["open_ai"], + openai_api_key=lm_providers_keys["openai"], temperature=0.8, - model="gpt-3.5-turbo-1106", + model="gpt-3.5-turbo-instruct", ) - input_variables = [] - - # List of default variables - default_vars = [ - "variant_output", - "llm_app_prompt_template", - "correct_answer", - ] - - # Check default variables - for var in default_vars: - if "{%s}" % var in settings_values["evaluation_prompt_template"]: - input_variables.append(var) - - # Iterate over llm_app_inputs and check if the variable name exists in the evaluation_prompt_template - for input_item in settings_values["llm_app_inputs"]: - if ( - "{%s}" % input_item["input_name"] - in settings_values["evaluation_prompt_template"] - ): - input_variables.append(input_item["input_name"]) - chain_run_args = { - "llm_app_prompt_template": settings_values["llm_app_prompt_template"], - "correct_answer": correct_answer, + "llm_app_prompt_template": app_params.get("prompt_user", ""), "variant_output": output, + "correct_answer": correct_answer, } - for input_item in settings_values["llm_app_inputs"]: - chain_run_args[input_item["input_name"]] = input_item["input_value"] + for input_item in app_params.get("inputs", []): + input_name = input_item.get("name") + if input_name and input_name in inputs: + chain_run_args[input_name] = inputs[input_name] prompt = PromptTemplate( - input_variables=input_variables, - template=settings_values["evaluation_prompt_template"], + input_variables=list(chain_run_args.keys()), # Use the keys from chain_run_args + template=settings_values["prompt_template"], ) chain = LLMChain(llm=llm, prompt=prompt) - output = chain.run(**chain_run_args) + evaluation_output = chain.run(**chain_run_args) - return Result(type="text", value=output.strip()) + return Result(type="text", value=evaluation_output.strip()) def evaluate( From 87eacc97363f0c1d76361f1b359ffcba2cebc3e8 Mon Sep 17 00:00:00 2001 From: Akrem Abayed Date: Thu, 11 Jan 2024 12:08:02 +0100 Subject: [PATCH 354/414] small refactor --- .../routers/evaluation_router.py | 18 +++++------- .../services/evaluator_manager.py | 29 ++++++++++++++++++- 2 files changed, 35 insertions(+), 12 deletions(-) diff --git a/agenta-backend/agenta_backend/routers/evaluation_router.py b/agenta-backend/agenta_backend/routers/evaluation_router.py index cc7250b743..3cc5a572fc 100644 --- a/agenta-backend/agenta_backend/routers/evaluation_router.py +++ b/agenta-backend/agenta_backend/routers/evaluation_router.py @@ -20,8 +20,8 @@ from agenta_backend.services import evaluation_service from agenta_backend.utils.common import check_access_to_app -from agenta_backend.services.db_manager import ( - check_if_ai_critique_exists_in_list_of_evaluators_configs, +from agenta_backend.services.evaluator_manager import ( + check_ai_critique_inputs, ) if os.environ["FEATURE_FLAG"] in ["cloud", "ee"]: @@ -64,15 +64,11 @@ async def create_evaluation( if app is None: raise HTTPException(status_code=404, detail="App not found") - if await check_if_ai_critique_exists_in_list_of_evaluators_configs( - payload.evaluators_configs - ): - # Check if lm_providers_keys is provided and not empty - if not payload.lm_providers_keys: - return JSONResponse( - {"detail": "Missing LM provider Key"}, - status_code=400, - ) + success, response = await check_ai_critique_inputs( + payload.evaluators_configs, payload.lm_providers_keys + ) + if not success: + return response evaluations = [] diff --git a/agenta-backend/agenta_backend/services/evaluator_manager.py 
b/agenta-backend/agenta_backend/services/evaluator_manager.py index 3409facb44..c2fd5a6055 100644 --- a/agenta-backend/agenta_backend/services/evaluator_manager.py +++ b/agenta-backend/agenta_backend/services/evaluator_manager.py @@ -1,6 +1,8 @@ import json import os -from typing import Any, Dict, Optional, List +from typing import Any, Dict, Optional, List, Tuple + +from fastapi.responses import JSONResponse from agenta_backend.services import db_manager @@ -155,3 +157,28 @@ async def create_ready_to_use_evaluators(app: AppDB): evaluator_key=evaluator["key"], settings_values={}, ) + + +async def check_ai_critique_inputs( + evaluators_configs: List[str], lm_providers_keys: Optional[Dict[str, Any]] +) -> Tuple[bool, Optional[JSONResponse]]: + """ + Checks if AI critique exists in evaluators configs and validates lm_providers_keys. + + Args: + evaluators_configs (List[str]): List of evaluator configurations. + lm_providers_keys (Optional[Dict[str, Any]]): Language model provider keys. + + Returns: + Tuple[bool, Optional[JSONResponse]]: Returns a tuple containing a boolean indicating success, + and a JSONResponse in case of error. + """ + if await db_manager.check_if_ai_critique_exists_in_list_of_evaluators_configs( + evaluators_configs + ): + if not lm_providers_keys: + return False, JSONResponse( + {"detail": "Missing LM provider Key"}, + status_code=400, + ) + return True, None From 491110992af2255d9e00487b553e46e0e1bbc93f Mon Sep 17 00:00:00 2001 From: Akrem Abayed Date: Thu, 11 Jan 2024 12:12:55 +0100 Subject: [PATCH 355/414] rename open_ai to openai --- .../pages/evaluations/evaluationResults/NewEvaluationModal.tsx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/agenta-web/src/components/pages/evaluations/evaluationResults/NewEvaluationModal.tsx b/agenta-web/src/components/pages/evaluations/evaluationResults/NewEvaluationModal.tsx index 8169c888aa..eda657142f 100644 --- a/agenta-web/src/components/pages/evaluations/evaluationResults/NewEvaluationModal.tsx +++ b/agenta-web/src/components/pages/evaluations/evaluationResults/NewEvaluationModal.tsx @@ -108,7 +108,7 @@ const NewEvaluationModal: React.FC = ({onSuccess, ...props}) => { createEvalutaiton(appId, { ...values, rate_limit: rateLimitValues, - lm_providers_keys: {open_ai: getApikeys()}, + lm_providers_keys: {openai: getApikeys()}, }) .then(onSuccess) .catch(console.error) From 0bb374263438b2147e03834fc7e6c5df826a20db Mon Sep 17 00:00:00 2001 From: MohammedMaaz Date: Thu, 11 Jan 2024 16:37:44 +0500 Subject: [PATCH 356/414] only redirect for no llm keys in case of ai critiquie --- .../evaluationResults/EvaluationResults.tsx | 2 +- .../evaluationResults/NewEvaluationModal.tsx | 12 +++++++++++- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/agenta-web/src/components/pages/evaluations/evaluationResults/EvaluationResults.tsx b/agenta-web/src/components/pages/evaluations/evaluationResults/EvaluationResults.tsx index a9e8190890..5758d86363 100644 --- a/agenta-web/src/components/pages/evaluations/evaluationResults/EvaluationResults.tsx +++ b/agenta-web/src/components/pages/evaluations/evaluationResults/EvaluationResults.tsx @@ -336,7 +336,7 @@ const EvaluationResults: React.FC = () => { icon={} type="primary" onClick={() => { - if (!redirectIfNoLLMKeys()) setNewEvalModalOpen(true) + setNewEvalModalOpen(true) }} data-cy="new-evaluation-button" > diff --git a/agenta-web/src/components/pages/evaluations/evaluationResults/NewEvaluationModal.tsx 
b/agenta-web/src/components/pages/evaluations/evaluationResults/NewEvaluationModal.tsx index 8169c888aa..8e87bcaa3b 100644 --- a/agenta-web/src/components/pages/evaluations/evaluationResults/NewEvaluationModal.tsx +++ b/agenta-web/src/components/pages/evaluations/evaluationResults/NewEvaluationModal.tsx @@ -1,7 +1,7 @@ import {useAppId} from "@/hooks/useAppId" import {JSSTheme, Variant, LLMRunRateLimit, testset} from "@/lib/Types" import {evaluatorConfigsAtom, evaluatorsAtom} from "@/lib/atoms/evaluation" -import {getApikeys} from "@/lib/helpers/utils" +import {getApikeys, redirectIfNoLLMKeys} from "@/lib/helpers/utils" import {fetchTestsets, fetchVariants} from "@/lib/services/api" import {CreateEvaluationData, createEvalutaiton} from "@/services/evaluations" import {PlusOutlined, QuestionCircleOutlined} from "@ant-design/icons" @@ -104,6 +104,16 @@ const NewEvaluationModal: React.FC = ({onSuccess, ...props}) => { } const onSubmit = (values: CreateEvaluationData) => { + // redirect if no llm keys and an AI Critique config is selected + if ( + values.evaluators_configs.some( + (id) => + evaluatorConfigs.find((config) => config.id === id)?.evaluator_key === + "auto_ai_critique", + ) && + redirectIfNoLLMKeys() + ) + return setSubmitLoading(true) createEvalutaiton(appId, { ...values, From 72d68025bbd6a0388ca9bd5e801294715ffc3b8f Mon Sep 17 00:00:00 2001 From: MohammedMaaz Date: Thu, 11 Jan 2024 18:00:09 +0500 Subject: [PATCH 357/414] made evaluator columns header consistent across screens --- .../evaluationCompare/EvaluationCompare.tsx | 37 ++++++++++++------- .../EvaluationScenarios.tsx | 24 +++++++++--- agenta-web/src/lib/helpers/colors.ts | 2 + 3 files changed, 44 insertions(+), 19 deletions(-) diff --git a/agenta-web/src/components/pages/evaluations/evaluationCompare/EvaluationCompare.tsx b/agenta-web/src/components/pages/evaluations/evaluationCompare/EvaluationCompare.tsx index e6159f12b6..4c52a2302d 100644 --- a/agenta-web/src/components/pages/evaluations/evaluationCompare/EvaluationCompare.tsx +++ b/agenta-web/src/components/pages/evaluations/evaluationCompare/EvaluationCompare.tsx @@ -15,14 +15,15 @@ import {Space, Spin, Tag, Tooltip, Typography} from "antd" import React, {useEffect, useMemo, useRef, useState} from "react" import {createUseStyles} from "react-jss" import {getFilterParams, getTypedValue} from "../evaluationResults/EvaluationResults" -import {getTagColors} from "@/lib/helpers/colors" +import {getColorFromStr, getRandomColors} from "@/lib/helpers/colors" import {DownloadOutlined} from "@ant-design/icons" import {getAppValues} from "@/contexts/app.context" import {useQueryParam} from "@/hooks/useQuery" import {LongTextCellRenderer} from "../cellRenderers/cellRenderers" -import {stringToNumberInRange} from "@/lib/helpers/utils" import Link from "next/link" import AgCustomHeader from "@/components/AgCustomHeader/AgCustomHeader" +import {useAtom} from "jotai" +import {evaluatorsAtom} from "@/lib/atoms/evaluation" const useStyles = createUseStyles((theme: JSSTheme) => ({ table: { @@ -57,6 +58,7 @@ const EvaluationCompareMode: React.FC = () => { const [fetching, setFetching] = useState(false) const [rows, setRows] = useState([]) const [testset, setTestset] = useState() + const [evaluators] = useAtom(evaluatorsAtom) const gridRef = useRef>() const variants = useMemo(() => { @@ -64,10 +66,10 @@ const EvaluationCompareMode: React.FC = () => { }, [rows]) const colors = useMemo(() => { - const colors = getTagColors() const previous = new Set() + const colors = getRandomColors() 
return variants.map((v) => { - const color = colors[stringToNumberInRange(v.evaluationId, 0, colors.length - 1)] + const color = getColorFromStr(v.evaluationId) if (previous.has(color)) return colors.find((c) => !previous.has(c))! previous.add(color) return color @@ -143,18 +145,27 @@ const EvaluationCompareMode: React.FC = () => { }) }) - Object.entries(confgisMap).forEach(([configId, configs]) => { + Object.entries(confgisMap).forEach(([_, configs]) => { configs.forEach(({config, variant, color}) => { colDefs.push({ flex: 1, - headerComponent: (props: any) => ( - - - Evaluator: {config.name} - {variant.variantName} - - - ), + headerName: config.name, + headerComponent: (props: any) => { + const evaluator = evaluators.find( + (item) => item.key === config.evaluator_key, + ) + return ( + + + + {config.name} + {evaluator?.name} + + {variant.variantName} + + + ) + }, field: "variants.0.evaluatorConfigs.0.result" as any, ...getFilterParams("text"), valueGetter: (params) => { diff --git a/agenta-web/src/components/pages/evaluations/evaluationScenarios/EvaluationScenarios.tsx b/agenta-web/src/components/pages/evaluations/evaluationScenarios/EvaluationScenarios.tsx index 8fd84351aa..dc18742a60 100644 --- a/agenta-web/src/components/pages/evaluations/evaluationScenarios/EvaluationScenarios.tsx +++ b/agenta-web/src/components/pages/evaluations/evaluationScenarios/EvaluationScenarios.tsx @@ -5,7 +5,7 @@ import {deleteEvaluations, fetchAllEvaluationScenarios} from "@/services/evaluat import {DeleteOutlined, DownloadOutlined} from "@ant-design/icons" import {ColDef} from "ag-grid-community" import {AgGridReact} from "ag-grid-react" -import {Space, Spin, Tooltip, Typography} from "antd" +import {Space, Spin, Tag, Tooltip, Typography} from "antd" import {useRouter} from "next/router" import React, {useEffect, useMemo, useRef, useState} from "react" import {createUseStyles} from "react-jss" @@ -14,6 +14,9 @@ import {getAppValues} from "@/contexts/app.context" import AlertPopup from "@/components/AlertPopup/AlertPopup" import {formatDate} from "@/lib/helpers/dateTimeHelper" import {LongTextCellRenderer} from "../cellRenderers/cellRenderers" +import AgCustomHeader from "@/components/AgCustomHeader/AgCustomHeader" +import {useAtom} from "jotai" +import {evaluatorsAtom} from "@/lib/atoms/evaluation" const useStyles = createUseStyles((theme: JSSTheme) => ({ infoRow: { @@ -43,6 +46,7 @@ const EvaluationScenarios: React.FC = () => { const evaluationId = router.query.evaluation_id as string const [scenarios, setScenarios] = useState<_EvaluationScenario[]>([]) const [fetching, setFetching] = useState(false) + const [evaluators] = useAtom(evaluatorsAtom) const gridRef = useRef>() const evalaution = scenarios[0]?.evaluation @@ -73,8 +77,6 @@ const EvaluationScenarios: React.FC = () => { return params.data?.correct_answer?.toString() || "" }, cellRenderer: LongTextCellRenderer, - // wrapText: true, - // autoHeight: true, }) evalaution?.variants.forEach((_, index) => { colDefs.push({ @@ -87,13 +89,23 @@ const EvaluationScenarios: React.FC = () => { return getTypedValue(params.data?.outputs[index]) }, cellRenderer: LongTextCellRenderer, - // wrapText: true, - // autoHeight: true, }) }) scenarios[0]?.evaluators_configs.forEach((config) => { colDefs.push({ - headerName: `Evaluator: ${config.name}`, + headerName: config.name, + headerComponent: (props: any) => { + const evaluator = evaluators.find((item) => item.key === config.evaluator_key)! 
+ return ( + + + {config.name} + {evaluator.name} + + + ) + }, + autoHeaderHeight: true, field: `results`, ...getFilterParams("text"), valueGetter: (params) => { diff --git a/agenta-web/src/lib/helpers/colors.ts b/agenta-web/src/lib/helpers/colors.ts index fa1f114573..1ec58956a4 100644 --- a/agenta-web/src/lib/helpers/colors.ts +++ b/agenta-web/src/lib/helpers/colors.ts @@ -79,3 +79,5 @@ export const fadeColor = (hex: string, opacity: number) => { } export const getTagColors = () => [...tagColors] + +export const getRandomColors = () => [...colors] From fa02500d46f664d41f9122342c2a526b41ee4f14 Mon Sep 17 00:00:00 2001 From: Abram Date: Thu, 11 Jan 2024 15:54:08 +0100 Subject: [PATCH 358/414] Update - added logic to migrate old evaluation scenario to new auto/human evaluation scenario --- .../20240110165900_evaluations_revamp.py | 456 ++++++++++++++++++ 1 file changed, 456 insertions(+) create mode 100644 agenta-backend/agenta_backend/migrations/20240110165900_evaluations_revamp.py diff --git a/agenta-backend/agenta_backend/migrations/20240110165900_evaluations_revamp.py b/agenta-backend/agenta_backend/migrations/20240110165900_evaluations_revamp.py new file mode 100644 index 0000000000..f972784806 --- /dev/null +++ b/agenta-backend/agenta_backend/migrations/20240110165900_evaluations_revamp.py @@ -0,0 +1,456 @@ +from datetime import datetime +from typing import Any, Dict, List, Optional + + +from pydantic import BaseModel, Field +from beanie import free_fall_migration, Document, Link, PydanticObjectId + + +class OrganizationDB(Document): + name: str = Field(default="agenta") + description: str = Field(default="") + type: Optional[str] + owner: str # user id + members: Optional[List[PydanticObjectId]] + created_at: Optional[datetime] = Field(default=datetime.utcnow()) + updated_at: Optional[datetime] = Field(default=datetime.utcnow()) + + class Settings: + name = "organizations" + + +class UserDB(Document): + uid: str = Field(default="0", unique=True, index=True) + username: str = Field(default="agenta") + email: str = Field(default="demo@agenta.ai", unique=True) + organizations: Optional[List[PydanticObjectId]] = [] + created_at: Optional[datetime] = Field(default=datetime.utcnow()) + updated_at: Optional[datetime] = Field(default=datetime.utcnow()) + + class Settings: + name = "users" + + +class AppDB(Document): + app_name: str + organization: Link[OrganizationDB] + user: Link[UserDB] + created_at: Optional[datetime] = Field(default=datetime.utcnow()) + updated_at: Optional[datetime] = Field(default=datetime.utcnow()) + + class Settings: + name = "app_db" + + +class TestSetDB(Document): + name: str + app: Link[AppDB] + csvdata: List[Dict[str, str]] + user: Link[UserDB] + organization: Link[OrganizationDB] + created_at: Optional[datetime] = Field(default=datetime.utcnow()) + updated_at: Optional[datetime] = Field(default=datetime.utcnow()) + + class Settings: + name = "testsets" + + +class EvaluatorConfigDB(Document): + app: Link[AppDB] + organization: Link[OrganizationDB] + user: Link[UserDB] + name: str + evaluator_key: str + settings_values: Optional[Dict[str, Any]] = None + created_at: datetime = Field(default=datetime.utcnow()) + updated_at: datetime = Field(default=datetime.utcnow()) + + class Settings: + name = "evaluators_configs" + + +class Result(BaseModel): + type: str + value: Any + + +class EvaluationScenarioResult(BaseModel): + evaluator_config: PydanticObjectId + result: Result + + +class AggregatedResult(BaseModel): + evaluator_config: PydanticObjectId + result: Result + + 
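# A rough sketch of how these embedded models are intended to nest (illustrative
# values only; the real objects are produced by the evaluation task, not here):
#
#   scenario_result = EvaluationScenarioResult(
#       evaluator_config=PydanticObjectId(),        # the evaluator config that produced it
#       result=Result(type="number", value=0.83),   # e.g. a similarity score for one row
#   )
#   aggregated = AggregatedResult(
#       evaluator_config=scenario_result.evaluator_config,
#       result=Result(type="number", value=0.79),   # e.g. the value aggregated over all rows
#   )
#
# EvaluationScenarioDB.results holds the per-row EvaluationScenarioResult entries,
# while EvaluationDB.aggregated_results typically holds one AggregatedResult per
# evaluator config.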
+class EvaluationScenarioInputDB(BaseModel): + name: str + type: str + value: str + + +class EvaluationScenarioOutputDB(BaseModel): + type: str + value: Any + + +class HumanEvaluationScenarioInput(BaseModel): + input_name: str + input_value: str + + +class HumanEvaluationScenarioOutput(BaseModel): + variant_id: str + variant_output: str + + +class HumanEvaluationDB(Document): + app: Link[AppDB] + organization: Link[OrganizationDB] + user: Link[UserDB] + status: str + evaluation_type: str + variants: List[PydanticObjectId] + testset: Link[TestSetDB] + created_at: Optional[datetime] = Field(default=datetime.utcnow()) + updated_at: Optional[datetime] = Field(default=datetime.utcnow()) + + class Settings: + name = "human_evaluations" + + +class HumanEvaluationScenarioDB(Document): + user: Link[UserDB] + organization: Link[OrganizationDB] + evaluation: Link[HumanEvaluationDB] + inputs: List[HumanEvaluationScenarioInput] + outputs: List[HumanEvaluationScenarioOutput] + vote: Optional[str] + score: Optional[Any] + correct_answer: Optional[str] + created_at: Optional[datetime] = Field(default=datetime.utcnow()) + updated_at: Optional[datetime] = Field(default=datetime.utcnow()) + is_pinned: Optional[bool] + note: Optional[str] + + class Settings: + name = "human_evaluations_scenarios" + + +class EvaluationDB(Document): + app: Link[AppDB] + organization: Link[OrganizationDB] + user: Link[UserDB] + status: str = Field(default="EVALUATION_INITIALIZED") + testset: Link[TestSetDB] + variant: PydanticObjectId + evaluators_configs: List[PydanticObjectId] + aggregated_results: List[AggregatedResult] + created_at: datetime = Field(default=datetime.utcnow()) + updated_at: datetime = Field(default=datetime.utcnow()) + + class Settings: + name = "evaluations" + + +class EvaluationScenarioDB(Document): + user: Link[UserDB] + organization: Link[OrganizationDB] + evaluation: Link[EvaluationDB] + variant_id: PydanticObjectId + inputs: List[EvaluationScenarioInputDB] + outputs: List[EvaluationScenarioOutputDB] + correct_answer: Optional[str] + is_pinned: Optional[bool] + note: Optional[str] + evaluators_configs: List[PydanticObjectId] + results: List[EvaluationScenarioResult] + created_at: datetime = Field(default=datetime.utcnow()) + updated_at: datetime = Field(default=datetime.utcnow()) + + class Settings: + name = "evaluation_scenarios" + + +class OldEvaluationTypeSettings(BaseModel): + similarity_threshold: Optional[float] + regex_pattern: Optional[str] + regex_should_match: Optional[bool] + webhook_url: Optional[str] + llm_app_prompt_template: Optional[str] + custom_code_evaluation_id: Optional[str] + evaluation_prompt_template: Optional[str] + + +class OldEvaluationScenarioInput(BaseModel): + input_name: str + input_value: str + + +class OldEvaluationScenarioOutput(BaseModel): + variant_id: str + variant_output: str + + +class OldEvaluationDB(Document): + app: Link[AppDB] + organization: Link[OrganizationDB] + user: Link[UserDB] + status: str + evaluation_type: str + evaluation_type_settings: OldEvaluationTypeSettings + variants: List[PydanticObjectId] + testset: Link[TestSetDB] + created_at: Optional[datetime] = Field(default=datetime.utcnow()) + updated_at: Optional[datetime] = Field(default=datetime.utcnow()) + + class Settings: + name = "evaluations" + + +class OldEvaluationScenarioDB(Document): + user: Link[UserDB] + organization: Link[OrganizationDB] + evaluation: Link[OldEvaluationDB] + inputs: List[OldEvaluationScenarioInput] + outputs: List[OldEvaluationScenarioOutput] # EvaluationScenarioOutput + 
vote: Optional[str] + score: Optional[Any] + correct_answer: Optional[str] + created_at: Optional[datetime] = Field(default=datetime.utcnow()) + updated_at: Optional[datetime] = Field(default=datetime.utcnow()) + is_pinned: Optional[bool] + note: Optional[str] + + class Settings: + name = "evaluation_scenarios" + + +class OldCustomEvaluationDB(Document): + evaluation_name: str + python_code: str + app: Link[AppDB] + user: Link[UserDB] + organization: Link[OrganizationDB] + created_at: Optional[datetime] = Field(default=datetime.utcnow()) + updated_at: Optional[datetime] = Field(default=datetime.utcnow()) + + class Settings: + name = "custom_evaluations" + + +def modify_app_id_store( + app_id: str, + variant_ids: str, + evaluation_type: str, + app_keyvalue_store: Dict[str, Dict[str, List[str]]], +): + app_id_store = app_keyvalue_store.get(app_id, None) + if not app_id_store: + app_keyvalue_store[app_id] = {"variant_ids": [], "evaluation_types": []} + app_id_store = app_keyvalue_store[app_id] + + app_id_store_variant_ids = list(app_id_store["variant_ids"]) + if variant_ids not in list(app_id_store["variant_ids"]): + app_id_store_variant_ids.extend(variant_ids) + app_id_store["variant_ids"] = list(set(app_id_store_variant_ids)) + + app_id_store_evaluation_types = list(app_id_store["evaluation_types"]) + if evaluation_type not in app_id_store_evaluation_types: + app_id_store_evaluation_types.append(evaluation_type) + app_id_store["evaluation_types"] = list(set(app_id_store_evaluation_types)) + + +class Forward: + @free_fall_migration( + document_models=[ + AppDB, + UserDB, + OrganizationDB, + TestSetDB, + OldEvaluationDB, + EvaluatorConfigDB, + HumanEvaluationDB, + EvaluationDB, + OldCustomEvaluationDB, + ] + ) + async def migrate_old_evaluation_to_new_evaluation(self, session): + # STEP 1: + # Create a key-value store that saves all the variants & evaluation types for a particular app id + # Example: {"app_id": {"evaluation_types": ["string", "string"], "variant_ids": ["string", "string"]}} + app_keyvalue_store = {} + old_evaluations = await OldEvaluationDB.find(fetch_links=True).to_list() + for old_eval in old_evaluations: + app_id = old_eval.app.id + variant_ids = [str(variant_id) for variant_id in old_eval.variants] + evaluation_type = old_eval.evaluation_type + modify_app_id_store( + str(app_id), variant_ids, evaluation_type, app_keyvalue_store + ) + + # STEP 2: + # Loop through the app_id key-store to create evaluator configs + # based on the evaluation types available + for app_id, app_id_store in app_keyvalue_store.items(): + app_evaluator_configs: List[EvaluatorConfigDB] = [] + for evaluation_type in app_id_store[ + "evaluation_types" + ]: # the values in this case are the evaluation type + custom_code_evaluations = await OldCustomEvaluationDB.find( + OldCustomEvaluationDB.app == PydanticObjectId(app_id) + ).to_list() + if evaluation_type == "custom_code_run": + for custom_code_evaluation in custom_code_evaluations: + eval_config = EvaluatorConfigDB( + app=PydanticObjectId(app_id), + organization=old_eval.organization.id, + user=old_eval.user.id, + name=f"{old_eval.app.app_name}_{old_eval.evaluation_type}", + evaluator_key=f"auto_{evaluation_type}", + settings_values={} + if custom_code_evaluation is None + else {"code": custom_code_evaluation.python_code}, + ) + await eval_config.create(session=session) + app_evaluator_configs.append(eval_config) + + if evaluation_type != "custom_code_run": + eval_config = EvaluatorConfigDB( + app=PydanticObjectId(app_id), + 
organization=old_eval.organization.id, + user=old_eval.user.id, + name=f"{old_eval.app.app_name}_{old_eval.evaluation_type}", + evaluator_key=evaluation_type, + settings_values={}, + ) + await eval_config.create(session=session) + app_evaluator_configs.append(eval_config) + + # STEP 3 (a): + # Retrieve evaluator configs for app id + auto_evaluator_configs: List[PydanticObjectId] = [] + for evaluator_config in app_evaluator_configs: + # In the case where the evaluator key is not a human evaluator, + # Append the evaluator config id in the list of auto evaluator configs + if evaluator_config.evaluator_key not in [ + "human_a_b_testing", + "single_model_test", + ]: + auto_evaluator_configs.append(evaluator_config.id) + + # STEP 3 (b): + # In the case where the evaluator key is a human evaluator, + # Proceed to create the human evaluation with the evaluator config + for evaluator_config in app_evaluator_configs: + if evaluator_config.evaluator_key in [ + "human_a_b_testing", + "single_model_test", + ]: + new_eval = HumanEvaluationDB( + app=PydanticObjectId(app_id), + organization=old_eval.organization.id, + user=old_eval.user.id, + status=old_eval.status, + evaluation_type=evaluator_config.evaluator_key, + variants=app_id_store["variant_ids"], + testset=old_eval.testset.id, + ) + await new_eval.create(session=session) # replace(session=session) + + # STEP 3 (c): + # Proceed to create a single evaluation for every variant in the app_id_store + # with the auto_evaluator_configs + if auto_evaluator_configs is not None: + for variant in app_id_store["variant_ids"]: + new_eval = EvaluationDB( + app=PydanticObjectId(app_id), + organization=old_eval.organization.id, + user=old_eval.user.id, + status=old_eval.status, + testset=old_eval.testset.id, + variant=variant, + evaluators_configs=auto_evaluator_configs, + aggregated_results=[], + ) + await new_eval.create(session=session) + + @free_fall_migration( + document_models=[ + AppDB, + OrganizationDB, + UserDB, + TestSetDB, + OldEvaluationDB, + OldEvaluationScenarioDB, + EvaluationScenarioDB, + HumanEvaluationScenarioDB, + ] + ) + async def migrate_old_evaluation_scenario_to_new_evaluation_scenario(self, session): + old_scenarios = await OldEvaluationScenarioDB.find(fetch_links=True).to_list() + for old_scenario in old_scenarios: + if old_scenario.evaluation.evaluation_type in [ + "human_a_b_testing", + "single_model_test", + ]: + scenario_inputs = [ + HumanEvaluationScenarioInput( + input_name=input.input_name, + input_value=input.input_value, + ) + for input in old_scenario.inputs + ] + scenario_outputs = [ + HumanEvaluationScenarioOutput( + variant_id=output.variant_id, + variant_output=output.variant_output, + ) + for output in old_scenario.outputs + ] + new_scenario = HumanEvaluationScenarioDB( + user=old_scenario.user.id, + organization=old_scenario.organization.id, + evaluation=old_scenario.evaluation.id, + inputs=scenario_inputs, + outputs=scenario_outputs, + correct_answer=old_scenario.correct_answer, + is_pinned=old_scenario.is_pinned, + note=old_scenario.note, + vote=old_scenario.vote, + score=old_scenario.score, + ) + await new_scenario.insert(session=session) + else: + new_scenario = EvaluationScenarioDB( + user=old_scenario.user.id, + organization=old_scenario.organization.id, + evaluation=old_scenario.evaluation.id, + variant_id=old_scenario.evaluation.variants[0], + inputs=[ + EvaluationScenarioInputDB( + name=input.input_name, + type=type(input.input_value).__name__, + value=input.input_value, + ) + for input in old_scenario.inputs + 
], + outputs=[ + EvaluationScenarioOutputDB( + type=type(output.variant_output).__name__, + value=output.variant_output, + ) + for output in old_scenario.outputs + ], + correct_answer=old_scenario.correct_answer, + is_pinned=old_scenario.is_pinned, + note=old_scenario.note, + evaluators_configs=[], + results=[], + ) + await new_scenario.insert(session=session) + + +class Backward: + ... From aa45abdfa243b89297eac286f831af712f97f47f Mon Sep 17 00:00:00 2001 From: Abram Date: Thu, 11 Jan 2024 15:55:19 +0100 Subject: [PATCH 359/414] :art: Format - ran black --- .../20240110001454_initial_migration.py | 36 +++++++++++-------- 1 file changed, 22 insertions(+), 14 deletions(-) diff --git a/agenta-backend/agenta_backend/migrations/20240110001454_initial_migration.py b/agenta-backend/agenta_backend/migrations/20240110001454_initial_migration.py index ced0493ee3..ef06b5c11b 100644 --- a/agenta-backend/agenta_backend/migrations/20240110001454_initial_migration.py +++ b/agenta-backend/agenta_backend/migrations/20240110001454_initial_migration.py @@ -250,7 +250,7 @@ class Forward: VariantBaseDB, ConfigDB, AppVariantDB, - OldAppVariantDB + OldAppVariantDB, ] ) async def change_app_variant_fields( @@ -259,20 +259,23 @@ async def change_app_variant_fields( output_document.base = input_document.bases output_document.config = input_document.configs - @iterative_migration(document_models=[ + @iterative_migration( + document_models=[ OrganizationDB, UserDB, AppDB, TestSetDB, EvaluationDB, - OldEvaluationDB - ]) + OldEvaluationDB, + ] + ) async def rename_evaluation_fields( self, input_document: OldEvaluationDB, output_document: EvaluationDB ): output_document.testset = input_document.testsets - @iterative_migration(document_models=[ + @iterative_migration( + document_models=[ OrganizationDB, UserDB, AppDB, @@ -280,8 +283,9 @@ async def rename_evaluation_fields( EvaluationDB, OldEvaluationDB, EvaluationScenarioDB, - OldEvaluationScenarioDB - ]) + OldEvaluationScenarioDB, + ] + ) async def rename_evaluation_scenarios_fields( self, input_document: OldEvaluationScenarioDB, @@ -299,7 +303,7 @@ class Backward: VariantBaseDB, ConfigDB, AppVariantDB, - OldAppVariantDB + OldAppVariantDB, ] ) async def change_app_variant_fields( @@ -308,20 +312,23 @@ async def change_app_variant_fields( output_document.bases = input_document.base output_document.configs = input_document.config - @iterative_migration(document_models=[ + @iterative_migration( + document_models=[ OrganizationDB, UserDB, AppDB, TestSetDB, EvaluationDB, - OldEvaluationDB - ]) + OldEvaluationDB, + ] + ) async def rename_evaluation_fields( self, input_document: EvaluationDB, output_document: OldEvaluationDB ): output_document.testsets = input_document.testset - @iterative_migration(document_models=[ + @iterative_migration( + document_models=[ OrganizationDB, UserDB, AppDB, @@ -329,8 +336,9 @@ async def rename_evaluation_fields( EvaluationDB, OldEvaluationDB, EvaluationScenarioDB, - OldEvaluationScenarioDB - ]) + OldEvaluationScenarioDB, + ] + ) async def rename_evaluation_scenarios_fields( self, input_document: EvaluationScenarioDB, From 090002a8ecb79129707e85d61966f3cca9937432 Mon Sep 17 00:00:00 2001 From: Abram Date: Thu, 11 Jan 2024 16:32:43 +0100 Subject: [PATCH 360/414] Update - modified evaluations revamp migration logic --- .../migrations/20240110165900_evaluations_revamp.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/agenta-backend/agenta_backend/migrations/20240110165900_evaluations_revamp.py 
b/agenta-backend/agenta_backend/migrations/20240110165900_evaluations_revamp.py index f972784806..b71f41152a 100644 --- a/agenta-backend/agenta_backend/migrations/20240110165900_evaluations_revamp.py +++ b/agenta-backend/agenta_backend/migrations/20240110165900_evaluations_revamp.py @@ -382,9 +382,11 @@ async def migrate_old_evaluation_to_new_evaluation(self, session): OrganizationDB, UserDB, TestSetDB, + EvaluationDB, OldEvaluationDB, OldEvaluationScenarioDB, EvaluationScenarioDB, + HumanEvaluationDB, HumanEvaluationScenarioDB, ] ) From 2e7fc64aa6a4ba81774552e6aa3d45198f58f071 Mon Sep 17 00:00:00 2001 From: Akrem Abayed Date: Thu, 11 Jan 2024 19:38:29 +0100 Subject: [PATCH 361/414] fix for the new lm app response --- agenta-backend/agenta_backend/services/llm_apps_service.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/agenta-backend/agenta_backend/services/llm_apps_service.py b/agenta-backend/agenta_backend/services/llm_apps_service.py index 3caa069dbe..f040b85e21 100644 --- a/agenta-backend/agenta_backend/services/llm_apps_service.py +++ b/agenta-backend/agenta_backend/services/llm_apps_service.py @@ -77,7 +77,9 @@ async def invoke_app( url, json=payload, timeout=httpx.Timeout(timeout=5, read=None, write=5) ) response.raise_for_status() - return AppOutput(output=response.json(), status="success") + + lm_app_response = response.json() + return AppOutput(output=lm_app_response["message"], status="success") async def run_with_retry( From a4fd1c0f2e58297a0ce5b1310c7c151d7e3d594f Mon Sep 17 00:00:00 2001 From: Mahmoud Mabrouk Date: Thu, 11 Jan 2024 21:26:58 +0100 Subject: [PATCH 362/414] fixed issue with inputs in eval --- .../services/llm_apps_service.py | 25 ++++++++++++++----- .../agenta_backend/tasks/evaluations.py | 13 +++++++--- 2 files changed, 29 insertions(+), 9 deletions(-) diff --git a/agenta-backend/agenta_backend/services/llm_apps_service.py b/agenta-backend/agenta_backend/services/llm_apps_service.py index f040b85e21..67115ad5ef 100644 --- a/agenta-backend/agenta_backend/services/llm_apps_service.py +++ b/agenta-backend/agenta_backend/services/llm_apps_service.py @@ -32,10 +32,18 @@ async def make_payload( for param in openapi_parameters: if param["type"] == "input": payload[param["name"]] = datapoint.get(param["name"], "") - elif param["type"] == "dict": - for input_name in parameters[param["name"]]: - input_name_ = input_name["name"] - inputs_dict[input_name_] = datapoint.get(input_name_, "") + elif param["type"] == "dict": # in case of dynamic inputs (as in our templates) + # let's get the list of the dynamic inputs + if ( + param["name"] in parameters + ): # in case we have modified in the playground the default list of inputs (e.g. country_name) + input_names = [_["name"] for _ in parameters[param["name"]]] + else: # otherwise we use the default from the openapi + input_names = param["default"] + # now we put them in a dict which we would put under "inputs" in the payload + + for input_name in input_names: + inputs_dict[input_name] = datapoint.get(input_name, "") elif param["type"] == "messages": # TODO: Right now the FE is saving chats always under the column name chats. The whole logic for handling chats and dynamic inputs is convoluted and needs rework in time. 
payload[param["name"]] = json.loads(datapoint.get("chat", "")) @@ -219,8 +227,13 @@ async def get_parameters_from_openapi(uri: str) -> List[Dict]: parameters = [] for name, param in properties.items(): - parameters.append({"name": name, "type": param.get("x-parameter", "input")}) - + parameters.append( + { + "name": name, + "type": param.get("x-parameter", "input"), + "default": param.get("default", []), + } + ) return parameters diff --git a/agenta-backend/agenta_backend/tasks/evaluations.py b/agenta-backend/agenta_backend/tasks/evaluations.py index abef76ea22..5b0b861389 100644 --- a/agenta-backend/agenta_backend/tasks/evaluations.py +++ b/agenta-backend/agenta_backend/tasks/evaluations.py @@ -261,9 +261,16 @@ def get_app_inputs(app_variant_parameters, openapi_parameters) -> List[Dict[str, for param in openapi_parameters: if param["type"] == "input": list_inputs.append({"name": param["name"], "type": "input"}) - elif param["type"] == "dict": - for input_name in app_variant_parameters[param["name"]]: - list_inputs.append({"name": input_name["name"], "type": "dict_input"}) + elif param["type"] == "dict": # in case of dynamic inputs (as in our templates) + # let's get the list of the dynamic inputs + if ( + param["name"] in app_variant_parameters + ): # in case we have modified in the playground the default list of inputs (e.g. country_name) + input_names = [_["name"] for _ in app_variant_parameters[param["name"]]] + else: # otherwise we use the default from the openapi + input_names = param["default"] + for input_name in input_names: + list_inputs.append({"name": input_name, "type": "dict_input"}) elif param["type"] == "messages": list_inputs.append({"name": param["name"], "type": "messages"}) elif param["type"] == "file_url": From 623e97d85f304a8fd38ccd542b0b6765deeebb2f Mon Sep 17 00:00:00 2001 From: Abram Date: Fri, 12 Jan 2024 12:40:13 +0100 Subject: [PATCH 363/414] Feat - created migration file to change odmantic reference to link --- .../20240110165900_evaluations_revamp.py | 2 +- ...20721_change_odmantic_reference_to_link.py | 326 ++++++++++++++++++ 2 files changed, 327 insertions(+), 1 deletion(-) create mode 100644 agenta-backend/agenta_backend/migrations/20240112120721_change_odmantic_reference_to_link.py diff --git a/agenta-backend/agenta_backend/migrations/20240110165900_evaluations_revamp.py b/agenta-backend/agenta_backend/migrations/20240110165900_evaluations_revamp.py index b71f41152a..9daadf3af2 100644 --- a/agenta-backend/agenta_backend/migrations/20240110165900_evaluations_revamp.py +++ b/agenta-backend/agenta_backend/migrations/20240110165900_evaluations_revamp.py @@ -455,4 +455,4 @@ async def migrate_old_evaluation_scenario_to_new_evaluation_scenario(self, sessi class Backward: - ... 
+ pass diff --git a/agenta-backend/agenta_backend/migrations/20240112120721_change_odmantic_reference_to_link.py b/agenta-backend/agenta_backend/migrations/20240112120721_change_odmantic_reference_to_link.py new file mode 100644 index 0000000000..7aaca4b5e3 --- /dev/null +++ b/agenta-backend/agenta_backend/migrations/20240112120721_change_odmantic_reference_to_link.py @@ -0,0 +1,326 @@ +from beanie import iterative_migration, Document, Link + +from uuid import uuid4 +from datetime import datetime +from typing import Any, Dict, List, Optional + +from pydantic import BaseModel, Field +from beanie import Document, Link, PydanticObjectId + + +class APIKeyDB(Document): + prefix: str + hashed_key: str + user_id: str + rate_limit: int = Field(default=0) + hidden: Optional[bool] = Field(default=False) + expiration_date: Optional[datetime] + created_at: Optional[datetime] = datetime.utcnow() + updated_at: Optional[datetime] + + class Settings: + name = "api_keys" + + +class InvitationDB(BaseModel): + token: str = Field(unique=True) + email: str + expiration_date: datetime = Field(default="0") + used: bool = False + + +class OrganizationDB(Document): + name: str = Field(default="agenta") + description: str = Field(default="") + type: Optional[str] + owner: str # user id + members: Optional[List[PydanticObjectId]] + invitations: Optional[List[InvitationDB]] = [] + created_at: Optional[datetime] = Field(default=datetime.utcnow()) + updated_at: Optional[datetime] = Field(default=datetime.utcnow()) + + class Settings: + name = "organizations" + + +class UserDB(Document): + uid: str = Field(default="0", unique=True, index=True) + username: str = Field(default="agenta") + email: str = Field(default="demo@agenta.ai", unique=True) + organizations: Optional[List[PydanticObjectId]] = [] + created_at: Optional[datetime] = Field(default=datetime.utcnow()) + updated_at: Optional[datetime] = Field(default=datetime.utcnow()) + + class Settings: + name = "users" + + +class ImageDB(Document): + """Defines the info needed to get an image and connect it to the app variant""" + + type: Optional[str] = Field(default="image") + template_uri: Optional[str] + docker_id: Optional[str] = Field(index=True) + tags: Optional[str] + deletable: bool = Field(default=True) + user: Link[UserDB] + organization: Link[OrganizationDB] + created_at: Optional[datetime] = Field(default=datetime.utcnow()) + updated_at: Optional[datetime] = Field(default=datetime.utcnow()) + + class Settings: + name = "docker_images" + + +class AppDB(Document): + app_name: str + organization: Link[OrganizationDB] + user: Link[UserDB] + created_at: Optional[datetime] = Field(default=datetime.utcnow()) + updated_at: Optional[datetime] = Field(default=datetime.utcnow()) + + class Settings: + name = "app_db" + + +class DeploymentDB(Document): + app: Link[AppDB] + organization: Link[OrganizationDB] + user: Link[UserDB] + container_name: Optional[str] + container_id: Optional[str] + uri: Optional[str] + status: str + created_at: Optional[datetime] = Field(default=datetime.utcnow()) + updated_at: Optional[datetime] = Field(default=datetime.utcnow()) + + class Settings: + name = "deployments" + + +class VariantBaseDB(Document): + app: Link[AppDB] + organization: Link[OrganizationDB] + user: Link[UserDB] + base_name: str + image: Link[ImageDB] + deployment: Optional[PydanticObjectId] # Link to deployment + created_at: Optional[datetime] = Field(default=datetime.utcnow()) + updated_at: Optional[datetime] = Field(default=datetime.utcnow()) + + class Settings: + name = 
"bases" + + +class ConfigVersionDB(BaseModel): + version: int + parameters: Dict[str, Any] + created_at: Optional[datetime] = Field(default=datetime.utcnow()) + updated_at: Optional[datetime] = Field(default=datetime.utcnow()) + + +class ConfigDB(Document): + config_name: str + current_version: int = Field(default=1) + parameters: Dict[str, Any] = Field(default=dict) + version_history: List[ConfigVersionDB] = Field(default=[]) + created_at: Optional[datetime] = Field(default=datetime.utcnow()) + updated_at: Optional[datetime] = Field(default=datetime.utcnow()) + + class Settings: + name = "configs" + + +class AppVariantDB(Document): + app: Link[AppDB] + variant_name: str + image: Link[ImageDB] + user: Link[UserDB] + organization: Link[OrganizationDB] + parameters: Dict[str, Any] = Field(default=dict) # TODO: deprecated. remove + previous_variant_name: Optional[str] # TODO: deprecated. remove + base_name: Optional[str] + base: Link[VariantBaseDB] + config_name: Optional[str] + config: Link[ConfigDB] + created_at: Optional[datetime] = Field(default=datetime.utcnow()) + updated_at: Optional[datetime] = Field(default=datetime.utcnow()) + + is_deleted: bool = Field( # TODO: deprecated. remove + default=False + ) # soft deletion for using the template variants + + class Settings: + name = "app_variants" + + +class AppEnvironmentDB(Document): + app: Link[AppDB] + name: str + user: Link[UserDB] + organization: Link[OrganizationDB] + deployed_app_variant: Optional[PydanticObjectId] + deployment: Optional[PydanticObjectId] # reference to deployment + created_at: Optional[datetime] = Field(default=datetime.utcnow()) + + class Settings: + name = "app_environment_db" + + +class TemplateDB(Document): + type: Optional[str] = Field(default="image") + template_uri: Optional[str] + tag_id: Optional[int] + name: str = Field(unique=True) # tag name of image + repo_name: Optional[str] + title: str + description: str + size: Optional[int] + digest: Optional[str] # sha256 hash of image digest + last_pushed: Optional[datetime] + + class Settings: + name = "templates" + + +class TestSetDB(Document): + name: str + app: Link[AppDB] + csvdata: List[Dict[str, str]] + user: Link[UserDB] + organization: Link[OrganizationDB] + created_at: Optional[datetime] = Field(default=datetime.utcnow()) + updated_at: Optional[datetime] = Field(default=datetime.utcnow()) + + class Settings: + name = "testsets" + + +class EvaluatorConfigDB(Document): + app: Link[AppDB] + organization: Link[OrganizationDB] + user: Link[UserDB] + name: str + evaluator_key: str + settings_values: Optional[Dict[str, Any]] = None + created_at: datetime = Field(default=datetime.utcnow()) + updated_at: datetime = Field(default=datetime.utcnow()) + + class Settings: + name = "evaluators_configs" + + +class Result(BaseModel): + type: str + value: Any + + +class EvaluationScenarioResult(BaseModel): + evaluator_config: PydanticObjectId + result: Result + + +class AggregatedResult(BaseModel): + evaluator_config: PydanticObjectId + result: Result + + +class EvaluationScenarioInputDB(BaseModel): + name: str + type: str + value: str + + +class EvaluationScenarioOutputDB(BaseModel): + type: str + value: Any + + +class HumanEvaluationScenarioInput(BaseModel): + input_name: str + input_value: str + + +class HumanEvaluationScenarioOutput(BaseModel): + variant_id: str + variant_output: str + + +class HumanEvaluationDB(Document): + app: Link[AppDB] + organization: Link[OrganizationDB] + user: Link[UserDB] + status: str + evaluation_type: str + variants: 
List[PydanticObjectId] + testset: Link[TestSetDB] + created_at: Optional[datetime] = Field(default=datetime.utcnow()) + updated_at: Optional[datetime] = Field(default=datetime.utcnow()) + + class Settings: + name = "human_evaluations" + + +class HumanEvaluationScenarioDB(Document): + user: Link[UserDB] + organization: Link[OrganizationDB] + evaluation: Link[HumanEvaluationDB] + inputs: List[HumanEvaluationScenarioInput] + outputs: List[HumanEvaluationScenarioOutput] + vote: Optional[str] + score: Optional[Any] + correct_answer: Optional[str] + created_at: Optional[datetime] = Field(default=datetime.utcnow()) + updated_at: Optional[datetime] = Field(default=datetime.utcnow()) + is_pinned: Optional[bool] + note: Optional[str] + + class Settings: + name = "human_evaluations_scenarios" + + +class EvaluationDB(Document): + app: Link[AppDB] + organization: Link[OrganizationDB] + user: Link[UserDB] + status: str = Field(default="EVALUATION_INITIALIZED") + testset: Link[TestSetDB] + variant: PydanticObjectId + evaluators_configs: List[PydanticObjectId] + aggregated_results: List[AggregatedResult] + created_at: datetime = Field(default=datetime.utcnow()) + updated_at: datetime = Field(default=datetime.utcnow()) + + class Settings: + name = "evaluations" + + +class EvaluationScenarioDB(Document): + user: Link[UserDB] + organization: Link[OrganizationDB] + evaluation: Link[EvaluationDB] + variant_id: PydanticObjectId + inputs: List[EvaluationScenarioInputDB] + outputs: List[EvaluationScenarioOutputDB] + correct_answer: Optional[str] + is_pinned: Optional[bool] + note: Optional[str] + evaluators_configs: List[PydanticObjectId] + results: List[EvaluationScenarioResult] + created_at: datetime = Field(default=datetime.utcnow()) + updated_at: datetime = Field(default=datetime.utcnow()) + + class Settings: + name = "evaluation_scenarios" + + +class Forward: + @iterative_migration(document_models=[OrganizationDB, UserDB, ImageDB]) + async def rename_image_db_reference_to_link(self, input_document: ImageDB, output_document: ImageDB): + output_document.user = input_document.user + +class Backward: + @iterative_migration(document_models=[OrganizationDB, UserDB, ImageDB]) + async def rename_image_db_reference_to_link(self, input_document: ImageDB, output_document: ImageDB): + output_document.user = input_document.user + From b7833ed9796c4b715ba369e8300fc40d17f95f9b Mon Sep 17 00:00:00 2001 From: Abram Date: Fri, 12 Jan 2024 13:30:55 +0100 Subject: [PATCH 364/414] Update - conclude iterative migraiton logic to change odmantic reference to link --- .../20240110001454_initial_migration.py | 12 ++-- ...20721_change_odmantic_reference_to_link.py | 62 +++++++++++++++++-- 2 files changed, 64 insertions(+), 10 deletions(-) diff --git a/agenta-backend/agenta_backend/migrations/20240110001454_initial_migration.py b/agenta-backend/agenta_backend/migrations/20240110001454_initial_migration.py index ef06b5c11b..b44c47246d 100644 --- a/agenta-backend/agenta_backend/migrations/20240110001454_initial_migration.py +++ b/agenta-backend/agenta_backend/migrations/20240110001454_initial_migration.py @@ -167,9 +167,9 @@ class OldEvaluationDB(Document): organization: Link[OrganizationDB] user: Link[UserDB] status: str - evaluation_type: str - evaluation_type_settings: OldEvaluationTypeSettings - variants: List[PydanticObjectId] + evaluation_type: Optional[str] + evaluation_type_settings: Optional[OldEvaluationTypeSettings] + variants: Optional[List[PydanticObjectId]] testsets: Link[TestSetDB] created_at: Optional[datetime] = 
Field(default=datetime.utcnow()) updated_at: Optional[datetime] = Field(default=datetime.utcnow()) @@ -183,9 +183,9 @@ class EvaluationDB(Document): organization: Link[OrganizationDB] user: Link[UserDB] status: str - evaluation_type: str - evaluation_type_settings: OldEvaluationTypeSettings - variants: List[PydanticObjectId] + evaluation_type: Optional[str] + evaluation_type_settings: Optional[OldEvaluationTypeSettings] + variants: Optional[List[PydanticObjectId]] testset: Link[TestSetDB] created_at: Optional[datetime] = Field(default=datetime.utcnow()) updated_at: Optional[datetime] = Field(default=datetime.utcnow()) diff --git a/agenta-backend/agenta_backend/migrations/20240112120721_change_odmantic_reference_to_link.py b/agenta-backend/agenta_backend/migrations/20240112120721_change_odmantic_reference_to_link.py index 7aaca4b5e3..12518af42c 100644 --- a/agenta-backend/agenta_backend/migrations/20240112120721_change_odmantic_reference_to_link.py +++ b/agenta-backend/agenta_backend/migrations/20240112120721_change_odmantic_reference_to_link.py @@ -316,11 +316,65 @@ class Settings: class Forward: @iterative_migration(document_models=[OrganizationDB, UserDB, ImageDB]) - async def rename_image_db_reference_to_link(self, input_document: ImageDB, output_document: ImageDB): + async def rename_image_db_reference_to_link( + self, input_document: ImageDB, output_document: ImageDB + ): output_document.user = input_document.user -class Backward: - @iterative_migration(document_models=[OrganizationDB, UserDB, ImageDB]) - async def rename_image_db_reference_to_link(self, input_document: ImageDB, output_document: ImageDB): + @iterative_migration(document_models=[OrganizationDB, UserDB, AppDB]) + async def rename_app_db_reference_to_link( + self, input_document: AppDB, output_document: AppDB + ): + output_document.user = input_document.user + output_document.organization = input_document.organization + + @iterative_migration(document_models=[OrganizationDB, UserDB, AppDB, DeploymentDB]) + async def rename_deployment_db_reference_to_link( + self, input_document: DeploymentDB, output_document: DeploymentDB + ): + output_document.app = input_document.app + output_document.user = input_document.user + output_document.organization = input_document.organization + + @iterative_migration( + document_models=[OrganizationDB, UserDB, AppDB, ImageDB, VariantBaseDB] + ) + async def rename_variant_base_db_reference_to_link( + self, input_document: VariantBaseDB, output_document: VariantBaseDB + ): + output_document.app = input_document.app + output_document.user = input_document.user + output_document.organization = input_document.organization + output_document.image = input_document.image + + @iterative_migration( + document_models=[OrganizationDB, UserDB, AppDB, AppEnvironmentDB] + ) + async def rename_app_environment_db_reference_to_link( + self, input_document: AppEnvironmentDB, output_document: AppEnvironmentDB + ): + output_document.app = input_document.app output_document.user = input_document.user + output_document.organization = input_document.organization + @iterative_migration(document_models=[OrganizationDB, UserDB, AppDB, TestSetDB]) + async def rename_testset_db_reference_to_link( + self, input_document: TestSetDB, output_document: TestSetDB + ): + output_document.app = input_document.app + output_document.user = input_document.user + output_document.organization = input_document.organization + + @iterative_migration( + document_models=[OrganizationDB, UserDB, AppDB, EvaluatorConfigDB] + ) + async def 
rename_evaluator_config_db_reference_to_link( + self, input_document: EvaluatorConfigDB, output_document: EvaluatorConfigDB + ): + output_document.app = input_document.app + output_document.user = input_document.user + output_document.organization = input_document.organization + + +class Backward: + pass From 109fbc80e5d33088bb8617b4acea50938aef4791 Mon Sep 17 00:00:00 2001 From: MohammedMaaz Date: Fri, 12 Jan 2024 17:51:01 +0500 Subject: [PATCH 365/414] row click improved | empty state ui in results page --- agenta-web/dev.Dockerfile | 60 +++---- .../evaluationResults/EvaluationResults.tsx | 155 +++++++++++------- .../pages/apps/[app_id]/evaluations/index.tsx | 2 +- agenta-web/src/services/evaluations/index.ts | 12 +- 4 files changed, 137 insertions(+), 92 deletions(-) diff --git a/agenta-web/dev.Dockerfile b/agenta-web/dev.Dockerfile index 6af573852c..ad5bc9a53b 100644 --- a/agenta-web/dev.Dockerfile +++ b/agenta-web/dev.Dockerfile @@ -1,37 +1,37 @@ FROM node:18-alpine -WORKDIR /app +# WORKDIR /app -# Install dependencies based on the preferred package manager -COPY package.json yarn.lock* package-lock.json* pnpm-lock.yaml* ./ -RUN \ - if [ -f yarn.lock ]; then yarn --frozen-lockfile; \ - elif [ -f package-lock.json ]; then npm i; \ - elif [ -f pnpm-lock.yaml ]; then yarn global add pnpm && pnpm i; \ - # Allow install without lockfile, so example works even without Node.js installed locally - else echo "Warning: Lockfile not found. It is recommended to commit lockfiles to version control." && yarn install; \ - fi +# # Install dependencies based on the preferred package manager +# COPY package.json yarn.lock* package-lock.json* pnpm-lock.yaml* ./ +# RUN \ +# if [ -f yarn.lock ]; then yarn --frozen-lockfile; \ +# elif [ -f package-lock.json ]; then npm i; \ +# elif [ -f pnpm-lock.yaml ]; then yarn global add pnpm && pnpm i; \ +# # Allow install without lockfile, so example works even without Node.js installed locally +# else echo "Warning: Lockfile not found. It is recommended to commit lockfiles to version control." && yarn install; \ +# fi -COPY src ./src -COPY public ./public -COPY next.config.js . -COPY tsconfig.json . -COPY postcss.config.js . -COPY .env . -RUN if [ -f .env.local ]; then cp .env.local .; fi -# # used in cloud -COPY sentry.* . -# Next.js collects completely anonymous telemetry data about general usage. Learn more here: https://nextjs.org/telemetry -# Uncomment the following line to disable telemetry at run time -# ENV NEXT_TELEMETRY_DISABLED 1 +# COPY src ./src +# COPY public ./public +# COPY next.config.js . +# COPY tsconfig.json . +# COPY postcss.config.js . +# COPY .env . +# RUN if [ -f .env.local ]; then cp .env.local .; fi +# # # used in cloud +# COPY sentry.* . +# # Next.js collects completely anonymous telemetry data about general usage. 
Learn more here: https://nextjs.org/telemetry +# # Uncomment the following line to disable telemetry at run time +# # ENV NEXT_TELEMETRY_DISABLED 1 -# Note: Don't expose ports here, Compose will handle that for us +# # Note: Don't expose ports here, Compose will handle that for us -# Start Next.js in development mode based on the preferred package manager -CMD \ - if [ -f yarn.lock ]; then yarn dev; \ - elif [ -f package-lock.json ]; then npm run dev; \ - elif [ -f pnpm-lock.yaml ]; then pnpm dev; \ - else yarn dev; \ - fi +# # Start Next.js in development mode based on the preferred package manager +# CMD \ +# if [ -f yarn.lock ]; then yarn dev; \ +# elif [ -f package-lock.json ]; then npm run dev; \ +# elif [ -f pnpm-lock.yaml ]; then pnpm dev; \ +# else yarn dev; \ +# fi diff --git a/agenta-web/src/components/pages/evaluations/evaluationResults/EvaluationResults.tsx b/agenta-web/src/components/pages/evaluations/evaluationResults/EvaluationResults.tsx index 5758d86363..a1f9ea2590 100644 --- a/agenta-web/src/components/pages/evaluations/evaluationResults/EvaluationResults.tsx +++ b/agenta-web/src/components/pages/evaluations/evaluationResults/EvaluationResults.tsx @@ -3,7 +3,7 @@ import {AgGridReact} from "ag-grid-react" import {useAppTheme} from "@/components/Layout/ThemeContextProvider" import {ColDef} from "ag-grid-community" import {createUseStyles} from "react-jss" -import {Button, Space, Spin, Tag, Tooltip, theme} from "antd" +import {Button, Empty, Space, Spin, Tag, Tooltip, Typography, theme} from "antd" import {DeleteOutlined, PlusCircleOutlined, SlidersOutlined, SwapOutlined} from "@ant-design/icons" import {EvaluationStatus, GenericObject, JSSTheme, TypedValue, _Evaluation} from "@/lib/Types" import {capitalize, round, uniqBy} from "lodash" @@ -13,9 +13,8 @@ import duration from "dayjs/plugin/duration" import NewEvaluationModal from "./NewEvaluationModal" import {useAppId} from "@/hooks/useAppId" import {deleteEvaluations, fetchAllEvaluations, fetchEvaluationStatus} from "@/services/evaluations" -import {useRouter} from "next/router" import {useUpdateEffect} from "usehooks-ts" -import {redirectIfNoLLMKeys, shortPoll} from "@/lib/helpers/utils" +import {shortPoll} from "@/lib/helpers/utils" import AlertPopup from "@/components/AlertPopup/AlertPopup" import { LinkCellRenderer, @@ -26,10 +25,16 @@ import { import {useAtom} from "jotai" import {evaluatorsAtom} from "@/lib/atoms/evaluation" import AgCustomHeader from "@/components/AgCustomHeader/AgCustomHeader" +import {useRouter} from "next/router" dayjs.extend(relativeTime) dayjs.extend(duration) const useStyles = createUseStyles((theme: JSSTheme) => ({ + emptyRoot: { + height: "calc(100vh - 260px)", + display: "grid", + placeItems: "center", + }, root: { display: "flex", flexDirection: "column", @@ -100,13 +105,13 @@ const EvaluationResults: React.FC = () => { const {appTheme} = useAppTheme() const classes = useStyles() const appId = useAppId() - const router = useRouter() const [evaluations, setEvaluations] = useState<_Evaluation[]>([]) const [evaluators] = useAtom(evaluatorsAtom) const [newEvalModalOpen, setNewEvalModalOpen] = useState(false) const [fetching, setFetching] = useState(false) const [selected, setSelected] = useState<_Evaluation[]>([]) const stoppers = useRef() + const router = useRouter() const {token} = theme.useToken() const gridRef = useRef() @@ -314,58 +319,96 @@ const EvaluationResults: React.FC = () => { ) return ( -
- - - {compareDisabled ? ( - - {compareBtnNode} - - ) : ( - compareBtnNode - )} - - - -
- - ref={gridRef as any} - rowData={evaluations} - columnDefs={colDefs} - getRowId={(params) => params.data.id} - onRowDoubleClicked={(params) => - EvaluationStatus.FINISHED === params.data?.status && - router.push(`/apps/${appId}/evaluations/${params.data?.id}`) - } - rowSelection="multiple" - suppressRowClickSelection - onSelectionChanged={(event) => setSelected(event.api.getSelectedRows())} - tooltipShowDelay={0} - /> + <> + {!fetching && !evaluations.length ? ( +
+ + + + Or + + +
- - + ) : ( +
+ + + {compareDisabled ? ( + + {compareBtnNode} + + ) : ( + compareBtnNode + )} + + + +
+ + ref={gridRef as any} + rowData={evaluations} + columnDefs={colDefs} + getRowId={(params) => params.data.id} + onRowClicked={(params) => { + // ignore clicks on the checkbox col + if ( + params.eventPath?.find( + (item: any) => item.ariaColIndex === "1", + ) + ) + return + EvaluationStatus.FINISHED === params.data?.status && + router.push(`/apps/${appId}/evaluations/${params.data?.id}`) + }} + rowSelection="multiple" + suppressRowClickSelection + onSelectionChanged={(event) => + setSelected(event.api.getSelectedRows()) + } + tooltipShowDelay={0} + /> +
+
+
+ )} setNewEvalModalOpen(false)} @@ -374,7 +417,7 @@ const EvaluationResults: React.FC = () => { fetcher() }} /> -
+ ) } diff --git a/agenta-web/src/pages/apps/[app_id]/evaluations/index.tsx b/agenta-web/src/pages/apps/[app_id]/evaluations/index.tsx index 5abc7a7811..d4ff95e3fb 100644 --- a/agenta-web/src/pages/apps/[app_id]/evaluations/index.tsx +++ b/agenta-web/src/pages/apps/[app_id]/evaluations/index.tsx @@ -45,7 +45,7 @@ const Evaluations: React.FC = () => {
({ appId: item.app_id, created_at: item.created_at, updated_at: item.updated_at, - duration: dayjs( - [EvaluationStatus.STARTED, EvaluationStatus.INITIALIZED].includes(item.status) - ? Date.now() - : item.updated_at, - ).diff(dayjs(item.created_at), "milliseconds"), + duration: + 500000 || + dayjs( + [EvaluationStatus.STARTED, EvaluationStatus.INITIALIZED].includes(item.status) + ? Date.now() + : item.updated_at, + ).diff(dayjs(item.created_at), "milliseconds"), status: item.status, testset: { id: item.testset_id, From 60da49ff44bc3a6d1739879a8e32bdc2644a0de6 Mon Sep 17 00:00:00 2001 From: Abram Date: Fri, 12 Jan 2024 15:09:05 +0100 Subject: [PATCH 366/414] Update - set default version to old evaluation models --- .../migrations/20240110165900_evaluations_revamp.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/agenta-backend/agenta_backend/migrations/20240110165900_evaluations_revamp.py b/agenta-backend/agenta_backend/migrations/20240110165900_evaluations_revamp.py index 9daadf3af2..6dd8e44e94 100644 --- a/agenta-backend/agenta_backend/migrations/20240110165900_evaluations_revamp.py +++ b/agenta-backend/agenta_backend/migrations/20240110165900_evaluations_revamp.py @@ -201,6 +201,7 @@ class OldEvaluationDB(Document): evaluation_type: str evaluation_type_settings: OldEvaluationTypeSettings variants: List[PydanticObjectId] + version: str = Field("odmantic") testset: Link[TestSetDB] created_at: Optional[datetime] = Field(default=datetime.utcnow()) updated_at: Optional[datetime] = Field(default=datetime.utcnow()) @@ -216,6 +217,7 @@ class OldEvaluationScenarioDB(Document): inputs: List[OldEvaluationScenarioInput] outputs: List[OldEvaluationScenarioOutput] # EvaluationScenarioOutput vote: Optional[str] + version: str = Field("odmantic") score: Optional[Any] correct_answer: Optional[str] created_at: Optional[datetime] = Field(default=datetime.utcnow()) @@ -230,6 +232,7 @@ class Settings: class OldCustomEvaluationDB(Document): evaluation_name: str python_code: str + version: str = Field("odmantic") app: Link[AppDB] user: Link[UserDB] organization: Link[OrganizationDB] From 08a89ac0792ab9e463dc786f3a58639fab4710a8 Mon Sep 17 00:00:00 2001 From: Abram Date: Fri, 12 Jan 2024 16:06:30 +0100 Subject: [PATCH 367/414] Update - added cleanup codes --- .../20240110165900_evaluations_revamp.py | 33 ++++++++++++++++++- 1 file changed, 32 insertions(+), 1 deletion(-) diff --git a/agenta-backend/agenta_backend/migrations/20240110165900_evaluations_revamp.py b/agenta-backend/agenta_backend/migrations/20240110165900_evaluations_revamp.py index 6dd8e44e94..412f85419a 100644 --- a/agenta-backend/agenta_backend/migrations/20240110165900_evaluations_revamp.py +++ b/agenta-backend/agenta_backend/migrations/20240110165900_evaluations_revamp.py @@ -1,7 +1,8 @@ +import os from datetime import datetime from typing import Any, Dict, List, Optional - +from pymongo import MongoClient from pydantic import BaseModel, Field from beanie import free_fall_migration, Document, Link, PydanticObjectId @@ -265,6 +266,26 @@ def modify_app_id_store( app_id_store["evaluation_types"] = list(set(app_id_store_evaluation_types)) +def set_odmantic_version_in_old_evaluation_records(): + # Initialize mongo client + client = MongoClient("mongodb://username:password@192.168.16.1:27017") + db = client["agenta_v2"] + + evaluation_db = db.get_collection("evaluations") + evaluation_scenario_db = db.get_collection("evaluation_scenarios") + + def update_evaluation_version(): + for document in evaluation_db.find(): + document.update({"_id": 
document["_id"], "$set": {"version": "odmantic"}}) + + def update_evaluation_scenario_version(): + for document in evaluation_scenario_db.find(): + document.update({"_id": document["_id"], "$set": {"version": "odmantic"}}) + + update_evaluation_version() + update_evaluation_scenario_version() + + class Forward: @free_fall_migration( document_models=[ @@ -280,6 +301,11 @@ class Forward: ] ) async def migrate_old_evaluation_to_new_evaluation(self, session): + # PREPARATION: + # Update old evaluation, and scenario records version to + # odmantic to clean up records after use + set_odmantic_version_in_old_evaluation_records() + # STEP 1: # Create a key-value store that saves all the variants & evaluation types for a particular app id # Example: {"app_id": {"evaluation_types": ["string", "string"], "variant_ids": ["string", "string"]}} @@ -456,6 +482,11 @@ async def migrate_old_evaluation_scenario_to_new_evaluation_scenario(self, sessi ) await new_scenario.insert(session=session) + # # Cleanup: remove old evaluation records with odmantic as their version + await OldCustomEvaluationDB.find(lazy_parse=True).to_list() + await OldEvaluationDB.find({"version": "odmantic"}, lazy_parse=True).to_list() + await OldEvaluationScenarioDB.find({"version": "odmantic"}, lazy_parse=True).delete() + class Backward: pass From f3e07b537b2cd4b315c8bcdd1de1a3b50e41cab8 Mon Sep 17 00:00:00 2001 From: Abram Date: Fri, 12 Jan 2024 17:00:04 +0100 Subject: [PATCH 368/414] Update - renamed current evaluation, scenarios collection with 'new_' prefix --- .../20240110165900_evaluations_revamp.py | 33 ++----------------- .../agenta_backend/models/db_models.py | 4 +-- 2 files changed, 4 insertions(+), 33 deletions(-) diff --git a/agenta-backend/agenta_backend/migrations/20240110165900_evaluations_revamp.py b/agenta-backend/agenta_backend/migrations/20240110165900_evaluations_revamp.py index 412f85419a..d90add0e9a 100644 --- a/agenta-backend/agenta_backend/migrations/20240110165900_evaluations_revamp.py +++ b/agenta-backend/agenta_backend/migrations/20240110165900_evaluations_revamp.py @@ -152,7 +152,7 @@ class EvaluationDB(Document): updated_at: datetime = Field(default=datetime.utcnow()) class Settings: - name = "evaluations" + name = "new_evaluations" class EvaluationScenarioDB(Document): @@ -171,7 +171,7 @@ class EvaluationScenarioDB(Document): updated_at: datetime = Field(default=datetime.utcnow()) class Settings: - name = "evaluation_scenarios" + name = "new_evaluation_scenarios" class OldEvaluationTypeSettings(BaseModel): @@ -266,25 +266,6 @@ def modify_app_id_store( app_id_store["evaluation_types"] = list(set(app_id_store_evaluation_types)) -def set_odmantic_version_in_old_evaluation_records(): - # Initialize mongo client - client = MongoClient("mongodb://username:password@192.168.16.1:27017") - db = client["agenta_v2"] - - evaluation_db = db.get_collection("evaluations") - evaluation_scenario_db = db.get_collection("evaluation_scenarios") - - def update_evaluation_version(): - for document in evaluation_db.find(): - document.update({"_id": document["_id"], "$set": {"version": "odmantic"}}) - - def update_evaluation_scenario_version(): - for document in evaluation_scenario_db.find(): - document.update({"_id": document["_id"], "$set": {"version": "odmantic"}}) - - update_evaluation_version() - update_evaluation_scenario_version() - class Forward: @free_fall_migration( @@ -301,11 +282,6 @@ class Forward: ] ) async def migrate_old_evaluation_to_new_evaluation(self, session): - # PREPARATION: - # Update old evaluation, and 
scenario records version to - # odmantic to clean up records after use - set_odmantic_version_in_old_evaluation_records() - # STEP 1: # Create a key-value store that saves all the variants & evaluation types for a particular app id # Example: {"app_id": {"evaluation_types": ["string", "string"], "variant_ids": ["string", "string"]}} @@ -482,11 +458,6 @@ async def migrate_old_evaluation_scenario_to_new_evaluation_scenario(self, sessi ) await new_scenario.insert(session=session) - # # Cleanup: remove old evaluation records with odmantic as their version - await OldCustomEvaluationDB.find(lazy_parse=True).to_list() - await OldEvaluationDB.find({"version": "odmantic"}, lazy_parse=True).to_list() - await OldEvaluationScenarioDB.find({"version": "odmantic"}, lazy_parse=True).delete() - class Backward: pass diff --git a/agenta-backend/agenta_backend/models/db_models.py b/agenta-backend/agenta_backend/models/db_models.py index c336badf19..68d15450b8 100644 --- a/agenta-backend/agenta_backend/models/db_models.py +++ b/agenta-backend/agenta_backend/models/db_models.py @@ -290,7 +290,7 @@ class EvaluationDB(Document): updated_at: datetime = Field(default=datetime.utcnow()) class Settings: - name = "evaluations" + name = "new_evaluations" class EvaluationScenarioDB(Document): @@ -309,7 +309,7 @@ class EvaluationScenarioDB(Document): updated_at: datetime = Field(default=datetime.utcnow()) class Settings: - name = "evaluation_scenarios" + name = "new_evaluation_scenarios" class SpanDB(Document): From a352d1dde6ae6ca1f089670af4c73db01f34ca08 Mon Sep 17 00:00:00 2001 From: Abram Date: Fri, 12 Jan 2024 17:31:45 +0100 Subject: [PATCH 369/414] Update - modified migration for evaluations revamp --- .../20240110165900_evaluations_revamp.py | 44 +++++++++---------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/agenta-backend/agenta_backend/migrations/20240110165900_evaluations_revamp.py b/agenta-backend/agenta_backend/migrations/20240110165900_evaluations_revamp.py index d90add0e9a..0dbf78ac43 100644 --- a/agenta-backend/agenta_backend/migrations/20240110165900_evaluations_revamp.py +++ b/agenta-backend/agenta_backend/migrations/20240110165900_evaluations_revamp.py @@ -310,27 +310,27 @@ async def migrate_old_evaluation_to_new_evaluation(self, session): for custom_code_evaluation in custom_code_evaluations: eval_config = EvaluatorConfigDB( app=PydanticObjectId(app_id), - organization=old_eval.organization.id, - user=old_eval.user.id, + organization=old_eval.organization, + user=old_eval.user, name=f"{old_eval.app.app_name}_{old_eval.evaluation_type}", evaluator_key=f"auto_{evaluation_type}", settings_values={} if custom_code_evaluation is None else {"code": custom_code_evaluation.python_code}, ) - await eval_config.create(session=session) + await eval_config.insert(session=session) app_evaluator_configs.append(eval_config) if evaluation_type != "custom_code_run": eval_config = EvaluatorConfigDB( app=PydanticObjectId(app_id), - organization=old_eval.organization.id, - user=old_eval.user.id, + organization=old_eval.organization, + user=old_eval.user, name=f"{old_eval.app.app_name}_{old_eval.evaluation_type}", evaluator_key=evaluation_type, settings_values={}, ) - await eval_config.create(session=session) + await eval_config.insert(session=session) app_evaluator_configs.append(eval_config) # STEP 3 (a): @@ -343,7 +343,7 @@ async def migrate_old_evaluation_to_new_evaluation(self, session): "human_a_b_testing", "single_model_test", ]: - auto_evaluator_configs.append(evaluator_config.id) + 
auto_evaluator_configs.append(PydanticObjectId(evaluator_config.id)) # STEP 3 (b): # In the case where the evaluator key is a human evaluator, @@ -355,14 +355,14 @@ async def migrate_old_evaluation_to_new_evaluation(self, session): ]: new_eval = HumanEvaluationDB( app=PydanticObjectId(app_id), - organization=old_eval.organization.id, - user=old_eval.user.id, + organization=old_eval.organization, + user=old_eval.user, status=old_eval.status, evaluation_type=evaluator_config.evaluator_key, variants=app_id_store["variant_ids"], - testset=old_eval.testset.id, + testset=old_eval.testset, ) - await new_eval.create(session=session) # replace(session=session) + await new_eval.insert(session=session) # replace(session=session) # STEP 3 (c): # Proceed to create a single evaluation for every variant in the app_id_store @@ -371,15 +371,15 @@ async def migrate_old_evaluation_to_new_evaluation(self, session): for variant in app_id_store["variant_ids"]: new_eval = EvaluationDB( app=PydanticObjectId(app_id), - organization=old_eval.organization.id, - user=old_eval.user.id, + organization=old_eval.organization, + user=old_eval.user, status=old_eval.status, - testset=old_eval.testset.id, - variant=variant, + testset=old_eval.testset, + variant=PydanticObjectId(variant), evaluators_configs=auto_evaluator_configs, aggregated_results=[], ) - await new_eval.create(session=session) + await new_eval.insert(session=session) @free_fall_migration( document_models=[ @@ -417,9 +417,9 @@ async def migrate_old_evaluation_scenario_to_new_evaluation_scenario(self, sessi for output in old_scenario.outputs ] new_scenario = HumanEvaluationScenarioDB( - user=old_scenario.user.id, - organization=old_scenario.organization.id, - evaluation=old_scenario.evaluation.id, + user=old_scenario.user, + organization=old_scenario.organization, + evaluation=old_scenario.evaluation, inputs=scenario_inputs, outputs=scenario_outputs, correct_answer=old_scenario.correct_answer, @@ -431,9 +431,9 @@ async def migrate_old_evaluation_scenario_to_new_evaluation_scenario(self, sessi await new_scenario.insert(session=session) else: new_scenario = EvaluationScenarioDB( - user=old_scenario.user.id, - organization=old_scenario.organization.id, - evaluation=old_scenario.evaluation.id, + user=old_scenario.user, + organization=old_scenario.organization, + evaluation=old_scenario.evaluation, variant_id=old_scenario.evaluation.variants[0], inputs=[ EvaluationScenarioInputDB( From 3e8cbce89530bbc1a5eb470a6ba4ce32ed0eb2df Mon Sep 17 00:00:00 2001 From: Abram Date: Fri, 12 Jan 2024 18:59:30 +0100 Subject: [PATCH 370/414] Update - modified logic to migrate old evaluation scenario --- .../20240110165900_evaluations_revamp.py | 125 ++++++++++-------- 1 file changed, 69 insertions(+), 56 deletions(-) diff --git a/agenta-backend/agenta_backend/migrations/20240110165900_evaluations_revamp.py b/agenta-backend/agenta_backend/migrations/20240110165900_evaluations_revamp.py index 0dbf78ac43..eb7feed807 100644 --- a/agenta-backend/agenta_backend/migrations/20240110165900_evaluations_revamp.py +++ b/agenta-backend/agenta_backend/migrations/20240110165900_evaluations_revamp.py @@ -266,7 +266,6 @@ def modify_app_id_store( app_id_store["evaluation_types"] = list(set(app_id_store_evaluation_types)) - class Forward: @free_fall_migration( document_models=[ @@ -397,66 +396,80 @@ async def migrate_old_evaluation_to_new_evaluation(self, session): ) async def migrate_old_evaluation_scenario_to_new_evaluation_scenario(self, session): old_scenarios = await 
OldEvaluationScenarioDB.find(fetch_links=True).to_list() + new_evaluations = await EvaluationDB.find(fetch_links=True).to_list() + new_human_evaluations = await HumanEvaluationDB.find(fetch_links=True).to_list() + combined_evaluations = new_evaluations + new_human_evaluations for old_scenario in old_scenarios: - if old_scenario.evaluation.evaluation_type in [ - "human_a_b_testing", - "single_model_test", - ]: - scenario_inputs = [ - HumanEvaluationScenarioInput( - input_name=input.input_name, - input_value=input.input_value, - ) - for input in old_scenario.inputs - ] - scenario_outputs = [ - HumanEvaluationScenarioOutput( - variant_id=output.variant_id, - variant_output=output.variant_output, - ) - for output in old_scenario.outputs - ] - new_scenario = HumanEvaluationScenarioDB( - user=old_scenario.user, - organization=old_scenario.organization, - evaluation=old_scenario.evaluation, - inputs=scenario_inputs, - outputs=scenario_outputs, - correct_answer=old_scenario.correct_answer, - is_pinned=old_scenario.is_pinned, - note=old_scenario.note, - vote=old_scenario.vote, - score=old_scenario.score, - ) - await new_scenario.insert(session=session) - else: - new_scenario = EvaluationScenarioDB( - user=old_scenario.user, - organization=old_scenario.organization, - evaluation=old_scenario.evaluation, - variant_id=old_scenario.evaluation.variants[0], - inputs=[ - EvaluationScenarioInputDB( - name=input.input_name, - type=type(input.input_value).__name__, - value=input.input_value, + for new_evaluation in combined_evaluations: + if type( + new_evaluation + ) == HumanEvaluationDB and old_scenario.evaluation.evaluation_type in [ + "human_a_b_testing", + "single_model_test", + ]: + scenario_inputs = [ + HumanEvaluationScenarioInput( + input_name=input.input_name, + input_value=input.input_value, ) for input in old_scenario.inputs - ], - outputs=[ - EvaluationScenarioOutputDB( - type=type(output.variant_output).__name__, - value=output.variant_output, + ] + scenario_outputs = [ + HumanEvaluationScenarioOutput( + variant_id=output.variant_id, + variant_output=output.variant_output, ) for output in old_scenario.outputs - ], - correct_answer=old_scenario.correct_answer, - is_pinned=old_scenario.is_pinned, - note=old_scenario.note, - evaluators_configs=[], - results=[], - ) - await new_scenario.insert(session=session) + ] + if old_scenario.evaluation.app.id == new_evaluation.app.id: + new_scenario = HumanEvaluationScenarioDB( + user=new_evaluation.user, + organization=new_evaluation.organization, + evaluation=new_evaluation, + inputs=scenario_inputs, + outputs=scenario_outputs, + correct_answer=old_scenario.correct_answer, + is_pinned=old_scenario.is_pinned, + note=old_scenario.note, + vote=old_scenario.vote, + score=old_scenario.score, + ) + await new_scenario.insert(session=session) + + if type( + new_evaluation + ) == EvaluationDB and old_scenario.evaluation.evaluation_type not in [ + "human_a_b_testing", + "single_model_test", + ]: + if old_scenario.evaluation.app.id == new_evaluation.app.id: + new_scenario = EvaluationScenarioDB( + user=new_evaluation.user, + organization=new_evaluation.organization, + evaluation=new_evaluation, + variant_id=old_scenario.evaluation.variants[0], + inputs=[ + EvaluationScenarioInputDB( + name=input.input_name, + type=type(input.input_value).__name__, + value=input.input_value, + ) + for input in old_scenario.inputs + ], + outputs=[ + EvaluationScenarioOutputDB( + type=type(output.variant_output).__name__, + value=output.variant_output, + ) + for output in 
old_scenario.outputs + ], + correct_answer=old_scenario.correct_answer, + is_pinned=old_scenario.is_pinned, + note=old_scenario.note, + evaluators_configs=[], + results=[], + ) + await new_scenario.insert(session=session) class Backward: From 9f8f79836a91de7a59a8fd5ff2bb7ada69159620 Mon Sep 17 00:00:00 2001 From: Abram Date: Fri, 12 Jan 2024 19:23:45 +0100 Subject: [PATCH 371/414] Update - refactor migrate old evaluation scenario logic --- .../20240110165900_evaluations_revamp.py | 129 ++++++++---------- 1 file changed, 59 insertions(+), 70 deletions(-) diff --git a/agenta-backend/agenta_backend/migrations/20240110165900_evaluations_revamp.py b/agenta-backend/agenta_backend/migrations/20240110165900_evaluations_revamp.py index eb7feed807..5fc389d70a 100644 --- a/agenta-backend/agenta_backend/migrations/20240110165900_evaluations_revamp.py +++ b/agenta-backend/agenta_backend/migrations/20240110165900_evaluations_revamp.py @@ -398,78 +398,67 @@ async def migrate_old_evaluation_scenario_to_new_evaluation_scenario(self, sessi old_scenarios = await OldEvaluationScenarioDB.find(fetch_links=True).to_list() new_evaluations = await EvaluationDB.find(fetch_links=True).to_list() new_human_evaluations = await HumanEvaluationDB.find(fetch_links=True).to_list() - combined_evaluations = new_evaluations + new_human_evaluations + for old_scenario in old_scenarios: - for new_evaluation in combined_evaluations: - if type( - new_evaluation - ) == HumanEvaluationDB and old_scenario.evaluation.evaluation_type in [ - "human_a_b_testing", - "single_model_test", - ]: - scenario_inputs = [ - HumanEvaluationScenarioInput( - input_name=input.input_name, - input_value=input.input_value, - ) - for input in old_scenario.inputs - ] - scenario_outputs = [ - HumanEvaluationScenarioOutput( - variant_id=output.variant_id, - variant_output=output.variant_output, - ) - for output in old_scenario.outputs - ] - if old_scenario.evaluation.app.id == new_evaluation.app.id: - new_scenario = HumanEvaluationScenarioDB( - user=new_evaluation.user, - organization=new_evaluation.organization, - evaluation=new_evaluation, - inputs=scenario_inputs, - outputs=scenario_outputs, - correct_answer=old_scenario.correct_answer, - is_pinned=old_scenario.is_pinned, - note=old_scenario.note, - vote=old_scenario.vote, - score=old_scenario.score, - ) - await new_scenario.insert(session=session) + for evaluation in new_evaluations: + if old_scenario.evaluation.app.id == evaluation.app.id: + new_scenario = EvaluationScenarioDB( + user=evaluation.user, + organization=evaluation.organization, + evaluation=evaluation, + variant_id=old_scenario.evaluation.variants[0], + inputs=[ + EvaluationScenarioInputDB( + name=input.input_name, + type=type(input.input_value).__name__, + value=input.input_value, + ) + for input in old_scenario.inputs + ], + outputs=[ + EvaluationScenarioOutputDB( + type=type(output.variant_output).__name__, + value=output.variant_output, + ) + for output in old_scenario.outputs + ], + correct_answer=old_scenario.correct_answer, + is_pinned=old_scenario.is_pinned, + note=old_scenario.note, + evaluators_configs=[], + results=[], + ) + await new_scenario.insert(session=session) - if type( - new_evaluation - ) == EvaluationDB and old_scenario.evaluation.evaluation_type not in [ - "human_a_b_testing", - "single_model_test", - ]: - if old_scenario.evaluation.app.id == new_evaluation.app.id: - new_scenario = EvaluationScenarioDB( - user=new_evaluation.user, - organization=new_evaluation.organization, - evaluation=new_evaluation, - 
variant_id=old_scenario.evaluation.variants[0], - inputs=[ - EvaluationScenarioInputDB( - name=input.input_name, - type=type(input.input_value).__name__, - value=input.input_value, - ) - for input in old_scenario.inputs - ], - outputs=[ - EvaluationScenarioOutputDB( - type=type(output.variant_output).__name__, - value=output.variant_output, - ) - for output in old_scenario.outputs - ], - correct_answer=old_scenario.correct_answer, - is_pinned=old_scenario.is_pinned, - note=old_scenario.note, - evaluators_configs=[], - results=[], - ) - await new_scenario.insert(session=session) + for evaluation in new_human_evaluations: + scenario_inputs = [ + HumanEvaluationScenarioInput( + input_name=input.input_name, + input_value=input.input_value, + ) + for input in old_scenario.inputs + ] + scenario_outputs = [ + HumanEvaluationScenarioOutput( + variant_id=output.variant_id, + variant_output=output.variant_output, + ) + for output in old_scenario.outputs + ] + if old_scenario.evaluation.app.id == evaluation.app.id: + new_scenario = HumanEvaluationScenarioDB( + user=evaluation.user, + organization=evaluation.organization, + evaluation=evaluation, + inputs=scenario_inputs, + outputs=scenario_outputs, + correct_answer=old_scenario.correct_answer, + is_pinned=old_scenario.is_pinned, + note=old_scenario.note, + vote=old_scenario.vote, + score=old_scenario.score, + ) + await new_scenario.insert(session=session) class Backward: From 784adea3ab55f03af04c983718333225f76788fe Mon Sep 17 00:00:00 2001 From: Abram Date: Fri, 12 Jan 2024 21:09:25 +0100 Subject: [PATCH 372/414] Update - added print debug statements --- .../migrations/20240110165900_evaluations_revamp.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/agenta-backend/agenta_backend/migrations/20240110165900_evaluations_revamp.py b/agenta-backend/agenta_backend/migrations/20240110165900_evaluations_revamp.py index 5fc389d70a..69c552f622 100644 --- a/agenta-backend/agenta_backend/migrations/20240110165900_evaluations_revamp.py +++ b/agenta-backend/agenta_backend/migrations/20240110165900_evaluations_revamp.py @@ -308,10 +308,10 @@ async def migrate_old_evaluation_to_new_evaluation(self, session): if evaluation_type == "custom_code_run": for custom_code_evaluation in custom_code_evaluations: eval_config = EvaluatorConfigDB( - app=PydanticObjectId(app_id), + app=old_eval.app, organization=old_eval.organization, user=old_eval.user, - name=f"{old_eval.app.app_name}_{old_eval.evaluation_type}", + name=f"{old_eval.app.app_name}_{evaluation_type}", evaluator_key=f"auto_{evaluation_type}", settings_values={} if custom_code_evaluation is None @@ -322,10 +322,10 @@ async def migrate_old_evaluation_to_new_evaluation(self, session): if evaluation_type != "custom_code_run": eval_config = EvaluatorConfigDB( - app=PydanticObjectId(app_id), + app=old_eval.app, organization=old_eval.organization, user=old_eval.user, - name=f"{old_eval.app.app_name}_{old_eval.evaluation_type}", + name=f"{old_eval.app.app_name}_{evaluation_type}", evaluator_key=evaluation_type, settings_values={}, ) @@ -401,6 +401,7 @@ async def migrate_old_evaluation_scenario_to_new_evaluation_scenario(self, sessi for old_scenario in old_scenarios: for evaluation in new_evaluations: + print(f"Checking scenario for evaluation: {old_scenario.evaluation.app.id} == {evaluation.app.id}") if old_scenario.evaluation.app.id == evaluation.app.id: new_scenario = EvaluationScenarioDB( user=evaluation.user, @@ -431,6 +432,7 @@ async def 
migrate_old_evaluation_scenario_to_new_evaluation_scenario(self, sessi await new_scenario.insert(session=session) for evaluation in new_human_evaluations: + print(f"Checking human scenario for evaluation: {old_scenario.evaluation.app.id} == {evaluation.app.id}") scenario_inputs = [ HumanEvaluationScenarioInput( input_name=input.input_name, From 5afb7b9bf35b7fd6c05cadb19d678cd9a35b6dd7 Mon Sep 17 00:00:00 2001 From: Abram Date: Sat, 13 Jan 2024 00:28:04 +0100 Subject: [PATCH 373/414] Update - modified migrate old evaluation scenario logic --- .../20240110165900_evaluations_revamp.py | 98 ++++++++++--------- 1 file changed, 51 insertions(+), 47 deletions(-) diff --git a/agenta-backend/agenta_backend/migrations/20240110165900_evaluations_revamp.py b/agenta-backend/agenta_backend/migrations/20240110165900_evaluations_revamp.py index 69c552f622..d1177acfd5 100644 --- a/agenta-backend/agenta_backend/migrations/20240110165900_evaluations_revamp.py +++ b/agenta-backend/agenta_backend/migrations/20240110165900_evaluations_revamp.py @@ -400,39 +400,44 @@ async def migrate_old_evaluation_scenario_to_new_evaluation_scenario(self, sessi new_human_evaluations = await HumanEvaluationDB.find(fetch_links=True).to_list() for old_scenario in old_scenarios: - for evaluation in new_evaluations: - print(f"Checking scenario for evaluation: {old_scenario.evaluation.app.id} == {evaluation.app.id}") - if old_scenario.evaluation.app.id == evaluation.app.id: - new_scenario = EvaluationScenarioDB( - user=evaluation.user, - organization=evaluation.organization, - evaluation=evaluation, - variant_id=old_scenario.evaluation.variants[0], - inputs=[ - EvaluationScenarioInputDB( - name=input.input_name, - type=type(input.input_value).__name__, - value=input.input_value, - ) - for input in old_scenario.inputs - ], - outputs=[ - EvaluationScenarioOutputDB( - type=type(output.variant_output).__name__, - value=output.variant_output, - ) - for output in old_scenario.outputs - ], - correct_answer=old_scenario.correct_answer, - is_pinned=old_scenario.is_pinned, - note=old_scenario.note, - evaluators_configs=[], - results=[], - ) - await new_scenario.insert(session=session) - - for evaluation in new_human_evaluations: - print(f"Checking human scenario for evaluation: {old_scenario.evaluation.app.id} == {evaluation.app.id}") + matching_evaluations = [ + evaluation for evaluation in new_evaluations if old_scenario.evaluation.app.id == evaluation.app.id + ] + for evaluation in matching_evaluations: + new_scenario = EvaluationScenarioDB( + user=evaluation.user, + organization=evaluation.organization, + evaluation=evaluation, + variant_id=old_scenario.evaluation.variants[0], + inputs=[ + EvaluationScenarioInputDB( + name=input.input_name, + type=type(input.input_value).__name__, + value=input.input_value, + ) + for input in old_scenario.inputs + ], + outputs=[ + EvaluationScenarioOutputDB( + type=type(output.variant_output).__name__, + value=output.variant_output, + ) + for output in old_scenario.outputs + ], + correct_answer=old_scenario.correct_answer, + is_pinned=old_scenario.is_pinned, + note=old_scenario.note, + evaluators_configs=[], + results=[], + ) + await new_scenario.insert(session=session) + + matching_human_evaluations = [ + evaluation + for evaluation in new_human_evaluations + if old_scenario.evaluation.app.id == evaluation.app.id + ] + for human_evaluation in matching_human_evaluations: scenario_inputs = [ HumanEvaluationScenarioInput( input_name=input.input_name, @@ -447,20 +452,19 @@ async def 
migrate_old_evaluation_scenario_to_new_evaluation_scenario(self, sessi ) for output in old_scenario.outputs ] - if old_scenario.evaluation.app.id == evaluation.app.id: - new_scenario = HumanEvaluationScenarioDB( - user=evaluation.user, - organization=evaluation.organization, - evaluation=evaluation, - inputs=scenario_inputs, - outputs=scenario_outputs, - correct_answer=old_scenario.correct_answer, - is_pinned=old_scenario.is_pinned, - note=old_scenario.note, - vote=old_scenario.vote, - score=old_scenario.score, - ) - await new_scenario.insert(session=session) + new_scenario = HumanEvaluationScenarioDB( + user=human_evaluation.user, + organization=human_evaluation.organization, + evaluation=human_evaluation, + inputs=scenario_inputs, + outputs=scenario_outputs, + correct_answer=old_scenario.correct_answer, + is_pinned=old_scenario.is_pinned, + note=old_scenario.note, + vote=old_scenario.vote, + score=old_scenario.score, + ) + await new_scenario.insert(session=session) class Backward: From 7b3d6649df898dcb01de097611f2db5c8d68d691 Mon Sep 17 00:00:00 2001 From: Abram Date: Sat, 13 Jan 2024 10:39:39 +0100 Subject: [PATCH 374/414] Update - cleanup evaluation service and modified evaluations revamp migration logic --- .../20240110165900_evaluations_revamp.py | 34 +++++++++++++++---- .../services/evaluation_service.py | 4 --- 2 files changed, 27 insertions(+), 11 deletions(-) diff --git a/agenta-backend/agenta_backend/migrations/20240110165900_evaluations_revamp.py b/agenta-backend/agenta_backend/migrations/20240110165900_evaluations_revamp.py index d1177acfd5..95e60586f2 100644 --- a/agenta-backend/agenta_backend/migrations/20240110165900_evaluations_revamp.py +++ b/agenta-backend/agenta_backend/migrations/20240110165900_evaluations_revamp.py @@ -320,7 +320,19 @@ async def migrate_old_evaluation_to_new_evaluation(self, session): await eval_config.insert(session=session) app_evaluator_configs.append(eval_config) - if evaluation_type != "custom_code_run": + if evaluation_type == "auto_similarity_match": + eval_config = EvaluatorConfigDB( + app=old_eval.app, + organization=old_eval.organization, + user=old_eval.user, + name=f"{old_eval.app.app_name}_{evaluation_type}", + evaluator_key=evaluation_type, + settings_values={"similarity_threshold": 0.5}, + ) + await eval_config.insert(session=session) + app_evaluator_configs.append(eval_config) + + if evaluation_type not in ["custom_code_run", "auto_similarity_match"]: eval_config = EvaluatorConfigDB( app=old_eval.app, organization=old_eval.organization, @@ -361,7 +373,7 @@ async def migrate_old_evaluation_to_new_evaluation(self, session): variants=app_id_store["variant_ids"], testset=old_eval.testset, ) - await new_eval.insert(session=session) # replace(session=session) + await new_eval.insert(session=session) # STEP 3 (c): # Proceed to create a single evaluation for every variant in the app_id_store @@ -376,7 +388,6 @@ async def migrate_old_evaluation_to_new_evaluation(self, session): testset=old_eval.testset, variant=PydanticObjectId(variant), evaluators_configs=auto_evaluator_configs, - aggregated_results=[], ) await new_eval.insert(session=session) @@ -401,9 +412,18 @@ async def migrate_old_evaluation_scenario_to_new_evaluation_scenario(self, sessi for old_scenario in old_scenarios: matching_evaluations = [ - evaluation for evaluation in new_evaluations if old_scenario.evaluation.app.id == evaluation.app.id + evaluation + for evaluation in new_evaluations + if old_scenario.evaluation.app == evaluation.app.id ] for evaluation in 
matching_evaluations: + results = [ + EvaluationScenarioResult( + evaluator_config=PydanticObjectId(evaluator_config), + result=old_scenario.score, + ) + for evaluator_config in evaluation.evaluators_configs + ] new_scenario = EvaluationScenarioDB( user=evaluation.user, organization=evaluation.organization, @@ -427,15 +447,15 @@ async def migrate_old_evaluation_scenario_to_new_evaluation_scenario(self, sessi correct_answer=old_scenario.correct_answer, is_pinned=old_scenario.is_pinned, note=old_scenario.note, - evaluators_configs=[], - results=[], + evaluators_configs=evaluation.evaluators_configs, + results=results, ) await new_scenario.insert(session=session) matching_human_evaluations = [ evaluation for evaluation in new_human_evaluations - if old_scenario.evaluation.app.id == evaluation.app.id + if old_scenario.evaluation.app == evaluation.app.id ] for human_evaluation in matching_human_evaluations: scenario_inputs = [ diff --git a/agenta-backend/agenta_backend/services/evaluation_service.py b/agenta-backend/agenta_backend/services/evaluation_service.py index 3182b8d393..db79d2f55f 100644 --- a/agenta-backend/agenta_backend/services/evaluation_service.py +++ b/agenta-backend/agenta_backend/services/evaluation_service.py @@ -610,9 +610,6 @@ async def create_new_human_evaluation( """ user = await get_user(user_uid=user_org_data["uid"]) - # Initialize evaluation type settings - evaluation_type_settings = {} - current_time = datetime.utcnow() # Fetch app @@ -633,7 +630,6 @@ async def create_new_human_evaluation( user=user, status=payload.status, evaluation_type=payload.evaluation_type, - evaluation_type_settings=evaluation_type_settings, variants=variants, testset=testset, created_at=current_time, From 01a32e389b0616fab08de7cfd8039c18d2a5cad1 Mon Sep 17 00:00:00 2001 From: Abram Date: Sat, 13 Jan 2024 15:56:04 +0100 Subject: [PATCH 375/414] Feat - created evaluation scenarios revamp migration --- ...40113131802_evaluation_scenarios_revamp.py | 412 ++++++++++++++++++ 1 file changed, 412 insertions(+) create mode 100644 agenta-backend/agenta_backend/migrations/20240113131802_evaluation_scenarios_revamp.py diff --git a/agenta-backend/agenta_backend/migrations/20240113131802_evaluation_scenarios_revamp.py b/agenta-backend/agenta_backend/migrations/20240113131802_evaluation_scenarios_revamp.py new file mode 100644 index 0000000000..d8ba377e79 --- /dev/null +++ b/agenta-backend/agenta_backend/migrations/20240113131802_evaluation_scenarios_revamp.py @@ -0,0 +1,412 @@ +from datetime import datetime +from typing import Any, Dict, List, Optional + +from beanie.operators import In +from pydantic import BaseModel, Field +from beanie import free_fall_migration, Document, Link, PydanticObjectId + + +class OrganizationDB(Document): + name: str = Field(default="agenta") + description: str = Field(default="") + type: Optional[str] + owner: str # user id + members: Optional[List[PydanticObjectId]] + created_at: Optional[datetime] = Field(default=datetime.utcnow()) + updated_at: Optional[datetime] = Field(default=datetime.utcnow()) + + class Settings: + name = "organizations" + + +class UserDB(Document): + uid: str = Field(default="0", unique=True, index=True) + username: str = Field(default="agenta") + email: str = Field(default="demo@agenta.ai", unique=True) + organizations: Optional[List[PydanticObjectId]] = [] + created_at: Optional[datetime] = Field(default=datetime.utcnow()) + updated_at: Optional[datetime] = Field(default=datetime.utcnow()) + + class Settings: + name = "users" + + +class 
AppDB(Document): + app_name: str + organization: Link[OrganizationDB] + user: Link[UserDB] + created_at: Optional[datetime] = Field(default=datetime.utcnow()) + updated_at: Optional[datetime] = Field(default=datetime.utcnow()) + + class Settings: + name = "app_db" + + +class TestSetDB(Document): + name: str + app: Link[AppDB] + csvdata: List[Dict[str, str]] + user: Link[UserDB] + organization: Link[OrganizationDB] + created_at: Optional[datetime] = Field(default=datetime.utcnow()) + updated_at: Optional[datetime] = Field(default=datetime.utcnow()) + + class Settings: + name = "testsets" + + +class EvaluatorConfigDB(Document): + app: Link[AppDB] + organization: Link[OrganizationDB] + user: Link[UserDB] + name: str + evaluator_key: str + settings_values: Optional[Dict[str, Any]] = None + created_at: datetime = Field(default=datetime.utcnow()) + updated_at: datetime = Field(default=datetime.utcnow()) + + class Settings: + name = "evaluators_configs" + + +class Result(BaseModel): + type: str + value: Any + + +class EvaluationScenarioResult(BaseModel): + evaluator_config: PydanticObjectId + result: Result + + +class AggregatedResult(BaseModel): + evaluator_config: PydanticObjectId + result: Result + + +class EvaluationScenarioInputDB(BaseModel): + name: str + type: str + value: str + + +class EvaluationScenarioOutputDB(BaseModel): + type: str + value: Any + + +class HumanEvaluationScenarioInput(BaseModel): + input_name: str + input_value: str + + +class HumanEvaluationScenarioOutput(BaseModel): + variant_id: str + variant_output: str + + +class HumanEvaluationDB(Document): + app: Link[AppDB] + organization: Link[OrganizationDB] + user: Link[UserDB] + status: str + evaluation_type: str + variants: List[PydanticObjectId] + testset: Link[TestSetDB] + created_at: Optional[datetime] = Field(default=datetime.utcnow()) + updated_at: Optional[datetime] = Field(default=datetime.utcnow()) + + class Settings: + name = "human_evaluations" + + +class HumanEvaluationScenarioDB(Document): + user: Link[UserDB] + organization: Link[OrganizationDB] + evaluation: Link[HumanEvaluationDB] + inputs: List[HumanEvaluationScenarioInput] + outputs: List[HumanEvaluationScenarioOutput] + vote: Optional[str] + score: Optional[Any] + correct_answer: Optional[str] + created_at: Optional[datetime] = Field(default=datetime.utcnow()) + updated_at: Optional[datetime] = Field(default=datetime.utcnow()) + is_pinned: Optional[bool] + note: Optional[str] + + class Settings: + name = "human_evaluations_scenarios" + + +class EvaluationDB(Document): + app: Link[AppDB] + organization: Link[OrganizationDB] + user: Link[UserDB] + status: str = Field(default="EVALUATION_INITIALIZED") + testset: Link[TestSetDB] + variant: PydanticObjectId + evaluators_configs: List[PydanticObjectId] + aggregated_results: List[AggregatedResult] + created_at: datetime = Field(default=datetime.utcnow()) + updated_at: datetime = Field(default=datetime.utcnow()) + + class Settings: + name = "new_evaluations" + + +class EvaluationScenarioDB(Document): + user: Link[UserDB] + organization: Link[OrganizationDB] + evaluation: Link[EvaluationDB] + variant_id: PydanticObjectId + inputs: List[EvaluationScenarioInputDB] + outputs: List[EvaluationScenarioOutputDB] + correct_answer: Optional[str] + is_pinned: Optional[bool] + note: Optional[str] + evaluators_configs: List[PydanticObjectId] + results: List[EvaluationScenarioResult] + created_at: datetime = Field(default=datetime.utcnow()) + updated_at: datetime = Field(default=datetime.utcnow()) + + class Settings: + name = 
"new_evaluation_scenarios" + + +class OldEvaluationTypeSettings(BaseModel): + similarity_threshold: Optional[float] + regex_pattern: Optional[str] + regex_should_match: Optional[bool] + webhook_url: Optional[str] + llm_app_prompt_template: Optional[str] + custom_code_evaluation_id: Optional[str] + evaluation_prompt_template: Optional[str] + + +class OldEvaluationScenarioInput(BaseModel): + input_name: str + input_value: str + + +class OldEvaluationScenarioOutput(BaseModel): + variant_id: str + variant_output: str + + +class OldEvaluationDB(Document): + app: Link[AppDB] + organization: Link[OrganizationDB] + user: Link[UserDB] + status: str + evaluation_type: str + evaluation_type_settings: OldEvaluationTypeSettings + variants: List[PydanticObjectId] + version: str = Field("odmantic") + testset: Link[TestSetDB] + created_at: Optional[datetime] = Field(default=datetime.utcnow()) + updated_at: Optional[datetime] = Field(default=datetime.utcnow()) + + class Settings: + name = "evaluations" + + +class OldEvaluationScenarioDB(Document): + user: Link[UserDB] + organization: Link[OrganizationDB] + evaluation: Link[OldEvaluationDB] + inputs: List[OldEvaluationScenarioInput] + outputs: List[OldEvaluationScenarioOutput] # EvaluationScenarioOutput + vote: Optional[str] + version: str = Field("odmantic") + score: Optional[Any] + correct_answer: Optional[str] + created_at: Optional[datetime] = Field(default=datetime.utcnow()) + updated_at: Optional[datetime] = Field(default=datetime.utcnow()) + is_pinned: Optional[bool] + note: Optional[str] + + class Settings: + name = "evaluation_scenarios" + + +class OldCustomEvaluationDB(Document): + evaluation_name: str + python_code: str + version: str = Field("odmantic") + app: Link[AppDB] + user: Link[UserDB] + organization: Link[OrganizationDB] + created_at: Optional[datetime] = Field(default=datetime.utcnow()) + updated_at: Optional[datetime] = Field(default=datetime.utcnow()) + + class Settings: + name = "custom_evaluations" + + +def modify_app_id_store( + app_id: str, + variant_ids: str, + evaluation_type: str, + app_keyvalue_store: Dict[str, Dict[str, List[str]]], +): + app_id_store = app_keyvalue_store.get(app_id, None) + if not app_id_store: + app_keyvalue_store[app_id] = {"variant_ids": [], "evaluation_types": []} + app_id_store = app_keyvalue_store[app_id] + + app_id_store_variant_ids = list(app_id_store["variant_ids"]) + if variant_ids not in list(app_id_store["variant_ids"]): + app_id_store_variant_ids.extend(variant_ids) + app_id_store["variant_ids"] = list(set(app_id_store_variant_ids)) + + app_id_store_evaluation_types = list(app_id_store["evaluation_types"]) + if evaluation_type not in app_id_store_evaluation_types: + app_id_store_evaluation_types.append(evaluation_type) + app_id_store["evaluation_types"] = list(set(app_id_store_evaluation_types)) + + +class Forward: + @free_fall_migration( + document_models=[ + AppDB, + OrganizationDB, + UserDB, + TestSetDB, + EvaluationDB, + OldEvaluationDB, + OldEvaluationScenarioDB, + EvaluationScenarioDB, + HumanEvaluationDB, + HumanEvaluationScenarioDB, + ] + ) + async def migrate_old_auto_evaluation_scenario_to_new_auto_evaluation_scenario( + self, session + ): + old_auto_scenarios = await OldEvaluationScenarioDB.find( + In( + OldEvaluationScenarioDB.evaluation.evaluation_type, + [ + "auto_exact_match", + "auto_similarity_match", + "auto_regex_test", + "auto_ai_critique", + "auto_custom_code_run", + "auto_webhook_test", + ], + ), + fetch_links=True, + ).to_list() + for old_scenario in old_auto_scenarios: + 
matching_evaluation = await EvaluationDB.find_one( + EvaluationDB.app.id == old_scenario.evaluation.app.id, + fetch_links=True, + ) + if matching_evaluation: + results = [ + EvaluationScenarioResult( + evaluator_config=PydanticObjectId(evaluator_config), + result=Result( + type="number" + if isinstance(old_scenario.score, int) + else "number" + if isinstance(old_scenario.score, float) + else "string" + if isinstance(old_scenario.score, str) + else "boolean" + if isinstance(old_scenario.score, bool) + else "any", + value=old_scenario.score, + ), + ) + for evaluator_config in matching_evaluation.evaluators_configs + ] + new_scenario = EvaluationScenarioDB( + user=matching_evaluation.user, + organization=matching_evaluation.organization, + evaluation=matching_evaluation, + variant_id=old_scenario.evaluation.variants[0], + inputs=[ + EvaluationScenarioInputDB( + name=input.input_name, + type=type(input.input_value).__name__, + value=input.input_value, + ) + for input in old_scenario.inputs + ], + outputs=[ + EvaluationScenarioOutputDB( + type=type(output.variant_output).__name__, + value=output.variant_output, + ) + for output in old_scenario.outputs + ], + correct_answer=old_scenario.correct_answer, + is_pinned=old_scenario.is_pinned, + note=old_scenario.note, + evaluators_configs=matching_evaluation.evaluators_configs, + results=results, + ) + await new_scenario.insert(session=session) + + @free_fall_migration( + document_models=[ + AppDB, + OrganizationDB, + UserDB, + TestSetDB, + EvaluationDB, + OldEvaluationDB, + OldEvaluationScenarioDB, + EvaluationScenarioDB, + HumanEvaluationDB, + HumanEvaluationScenarioDB, + ] + ) + async def migrate_old_human_evaluation_scenario_to_new_human_evaluation_scenario( + self, session + ): + old_human_scenarios = await OldEvaluationScenarioDB.find( + In( + OldEvaluationScenarioDB.evaluation.evaluation_type, + ["human_a_b_testing", "single_model_test"], + ), + fetch_links=True, + ).to_list() + for old_scenario in old_human_scenarios: + matching_human_evaluation = await HumanEvaluationDB.find_one( + HumanEvaluationDB.app.id == old_scenario.evaluation.app.id, + fetch_links=True, + ) + if matching_human_evaluation: + scenario_inputs = [ + HumanEvaluationScenarioInput( + input_name=input.input_name, + input_value=input.input_value, + ) + for input in old_scenario.inputs + ] + scenario_outputs = [ + HumanEvaluationScenarioOutput( + variant_id=output.variant_id, + variant_output=output.variant_output, + ) + for output in old_scenario.outputs + ] + new_scenario = HumanEvaluationScenarioDB( + user=matching_human_evaluation.user, + organization=matching_human_evaluation.organization, + evaluation=matching_human_evaluation, + inputs=scenario_inputs, + outputs=scenario_outputs, + correct_answer=old_scenario.correct_answer, + is_pinned=old_scenario.is_pinned, + note=old_scenario.note, + vote=old_scenario.vote, + score=old_scenario.score, + ) + await new_scenario.insert(session=session) + + +class Backward: + pass From d7c481a41c36c55ade6441eea77620795ddc85c8 Mon Sep 17 00:00:00 2001 From: Abram Date: Sat, 13 Jan 2024 15:57:41 +0100 Subject: [PATCH 376/414] Update - remove evaluation scenario logic from evaluation revamp migration --- .../20240110165900_evaluations_revamp.py | 196 +----------------- .../models/api/evaluation_model.py | 2 +- 2 files changed, 5 insertions(+), 193 deletions(-) diff --git a/agenta-backend/agenta_backend/migrations/20240110165900_evaluations_revamp.py b/agenta-backend/agenta_backend/migrations/20240110165900_evaluations_revamp.py index 
95e60586f2..e1bb46ec7c 100644 --- a/agenta-backend/agenta_backend/migrations/20240110165900_evaluations_revamp.py +++ b/agenta-backend/agenta_backend/migrations/20240110165900_evaluations_revamp.py @@ -1,8 +1,6 @@ -import os from datetime import datetime from typing import Any, Dict, List, Optional -from pymongo import MongoClient from pydantic import BaseModel, Field from beanie import free_fall_migration, Document, Link, PydanticObjectId @@ -75,37 +73,11 @@ class Result(BaseModel): value: Any -class EvaluationScenarioResult(BaseModel): - evaluator_config: PydanticObjectId - result: Result - - class AggregatedResult(BaseModel): evaluator_config: PydanticObjectId result: Result -class EvaluationScenarioInputDB(BaseModel): - name: str - type: str - value: str - - -class EvaluationScenarioOutputDB(BaseModel): - type: str - value: Any - - -class HumanEvaluationScenarioInput(BaseModel): - input_name: str - input_value: str - - -class HumanEvaluationScenarioOutput(BaseModel): - variant_id: str - variant_output: str - - class HumanEvaluationDB(Document): app: Link[AppDB] organization: Link[OrganizationDB] @@ -121,24 +93,6 @@ class Settings: name = "human_evaluations" -class HumanEvaluationScenarioDB(Document): - user: Link[UserDB] - organization: Link[OrganizationDB] - evaluation: Link[HumanEvaluationDB] - inputs: List[HumanEvaluationScenarioInput] - outputs: List[HumanEvaluationScenarioOutput] - vote: Optional[str] - score: Optional[Any] - correct_answer: Optional[str] - created_at: Optional[datetime] = Field(default=datetime.utcnow()) - updated_at: Optional[datetime] = Field(default=datetime.utcnow()) - is_pinned: Optional[bool] - note: Optional[str] - - class Settings: - name = "human_evaluations_scenarios" - - class EvaluationDB(Document): app: Link[AppDB] organization: Link[OrganizationDB] @@ -155,25 +109,6 @@ class Settings: name = "new_evaluations" -class EvaluationScenarioDB(Document): - user: Link[UserDB] - organization: Link[OrganizationDB] - evaluation: Link[EvaluationDB] - variant_id: PydanticObjectId - inputs: List[EvaluationScenarioInputDB] - outputs: List[EvaluationScenarioOutputDB] - correct_answer: Optional[str] - is_pinned: Optional[bool] - note: Optional[str] - evaluators_configs: List[PydanticObjectId] - results: List[EvaluationScenarioResult] - created_at: datetime = Field(default=datetime.utcnow()) - updated_at: datetime = Field(default=datetime.utcnow()) - - class Settings: - name = "new_evaluation_scenarios" - - class OldEvaluationTypeSettings(BaseModel): similarity_threshold: Optional[float] regex_pattern: Optional[str] @@ -184,16 +119,6 @@ class OldEvaluationTypeSettings(BaseModel): evaluation_prompt_template: Optional[str] -class OldEvaluationScenarioInput(BaseModel): - input_name: str - input_value: str - - -class OldEvaluationScenarioOutput(BaseModel): - variant_id: str - variant_output: str - - class OldEvaluationDB(Document): app: Link[AppDB] organization: Link[OrganizationDB] @@ -211,25 +136,6 @@ class Settings: name = "evaluations" -class OldEvaluationScenarioDB(Document): - user: Link[UserDB] - organization: Link[OrganizationDB] - evaluation: Link[OldEvaluationDB] - inputs: List[OldEvaluationScenarioInput] - outputs: List[OldEvaluationScenarioOutput] # EvaluationScenarioOutput - vote: Optional[str] - version: str = Field("odmantic") - score: Optional[Any] - correct_answer: Optional[str] - created_at: Optional[datetime] = Field(default=datetime.utcnow()) - updated_at: Optional[datetime] = Field(default=datetime.utcnow()) - is_pinned: Optional[bool] - note: 
Optional[str] - - class Settings: - name = "evaluation_scenarios" - - class OldCustomEvaluationDB(Document): evaluation_name: str python_code: str @@ -354,7 +260,7 @@ async def migrate_old_evaluation_to_new_evaluation(self, session): "human_a_b_testing", "single_model_test", ]: - auto_evaluator_configs.append(PydanticObjectId(evaluator_config.id)) + auto_evaluator_configs.append(evaluator_config.id) # STEP 3 (b): # In the case where the evaluator key is a human evaluator, @@ -365,7 +271,7 @@ async def migrate_old_evaluation_to_new_evaluation(self, session): "single_model_test", ]: new_eval = HumanEvaluationDB( - app=PydanticObjectId(app_id), + app=old_eval.app, organization=old_eval.organization, user=old_eval.user, status=old_eval.status, @@ -381,111 +287,17 @@ async def migrate_old_evaluation_to_new_evaluation(self, session): if auto_evaluator_configs is not None: for variant in app_id_store["variant_ids"]: new_eval = EvaluationDB( - app=PydanticObjectId(app_id), + app=old_eval.app, organization=old_eval.organization, user=old_eval.user, status=old_eval.status, testset=old_eval.testset, variant=PydanticObjectId(variant), evaluators_configs=auto_evaluator_configs, + aggregated_results=[], ) await new_eval.insert(session=session) - @free_fall_migration( - document_models=[ - AppDB, - OrganizationDB, - UserDB, - TestSetDB, - EvaluationDB, - OldEvaluationDB, - OldEvaluationScenarioDB, - EvaluationScenarioDB, - HumanEvaluationDB, - HumanEvaluationScenarioDB, - ] - ) - async def migrate_old_evaluation_scenario_to_new_evaluation_scenario(self, session): - old_scenarios = await OldEvaluationScenarioDB.find(fetch_links=True).to_list() - new_evaluations = await EvaluationDB.find(fetch_links=True).to_list() - new_human_evaluations = await HumanEvaluationDB.find(fetch_links=True).to_list() - - for old_scenario in old_scenarios: - matching_evaluations = [ - evaluation - for evaluation in new_evaluations - if old_scenario.evaluation.app == evaluation.app.id - ] - for evaluation in matching_evaluations: - results = [ - EvaluationScenarioResult( - evaluator_config=PydanticObjectId(evaluator_config), - result=old_scenario.score, - ) - for evaluator_config in evaluation.evaluators_configs - ] - new_scenario = EvaluationScenarioDB( - user=evaluation.user, - organization=evaluation.organization, - evaluation=evaluation, - variant_id=old_scenario.evaluation.variants[0], - inputs=[ - EvaluationScenarioInputDB( - name=input.input_name, - type=type(input.input_value).__name__, - value=input.input_value, - ) - for input in old_scenario.inputs - ], - outputs=[ - EvaluationScenarioOutputDB( - type=type(output.variant_output).__name__, - value=output.variant_output, - ) - for output in old_scenario.outputs - ], - correct_answer=old_scenario.correct_answer, - is_pinned=old_scenario.is_pinned, - note=old_scenario.note, - evaluators_configs=evaluation.evaluators_configs, - results=results, - ) - await new_scenario.insert(session=session) - - matching_human_evaluations = [ - evaluation - for evaluation in new_human_evaluations - if old_scenario.evaluation.app == evaluation.app.id - ] - for human_evaluation in matching_human_evaluations: - scenario_inputs = [ - HumanEvaluationScenarioInput( - input_name=input.input_name, - input_value=input.input_value, - ) - for input in old_scenario.inputs - ] - scenario_outputs = [ - HumanEvaluationScenarioOutput( - variant_id=output.variant_id, - variant_output=output.variant_output, - ) - for output in old_scenario.outputs - ] - new_scenario = HumanEvaluationScenarioDB( - 
user=human_evaluation.user, - organization=human_evaluation.organization, - evaluation=human_evaluation, - inputs=scenario_inputs, - outputs=scenario_outputs, - correct_answer=old_scenario.correct_answer, - is_pinned=old_scenario.is_pinned, - note=old_scenario.note, - vote=old_scenario.vote, - score=old_scenario.score, - ) - await new_scenario.insert(session=session) - class Backward: pass diff --git a/agenta-backend/agenta_backend/models/api/evaluation_model.py b/agenta-backend/agenta_backend/models/api/evaluation_model.py index 15a48e2fd4..f64e2b4833 100644 --- a/agenta-backend/agenta_backend/models/api/evaluation_model.py +++ b/agenta-backend/agenta_backend/models/api/evaluation_model.py @@ -114,7 +114,7 @@ class HumanEvaluation(BaseModel): app_id: str user_id: str user_username: str - evaluation_type: EvaluationType + evaluation_type: str variant_ids: List[str] variant_names: List[str] testset_id: str From b38eca004093511f97b84d0990c084edbd7b1fa8 Mon Sep 17 00:00:00 2001 From: Abram Date: Sat, 13 Jan 2024 16:00:37 +0100 Subject: [PATCH 377/414] Update - switch timestamp between 3rd and 4th migration files --- ...os_revamp.py => 20240112120721_evaluation_scenarios_revamp.py} | 0 ...ink.py => 20240113131802_change_odmantic_reference_to_link.py} | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename agenta-backend/agenta_backend/migrations/{20240113131802_evaluation_scenarios_revamp.py => 20240112120721_evaluation_scenarios_revamp.py} (100%) rename agenta-backend/agenta_backend/migrations/{20240112120721_change_odmantic_reference_to_link.py => 20240113131802_change_odmantic_reference_to_link.py} (100%) diff --git a/agenta-backend/agenta_backend/migrations/20240113131802_evaluation_scenarios_revamp.py b/agenta-backend/agenta_backend/migrations/20240112120721_evaluation_scenarios_revamp.py similarity index 100% rename from agenta-backend/agenta_backend/migrations/20240113131802_evaluation_scenarios_revamp.py rename to agenta-backend/agenta_backend/migrations/20240112120721_evaluation_scenarios_revamp.py diff --git a/agenta-backend/agenta_backend/migrations/20240112120721_change_odmantic_reference_to_link.py b/agenta-backend/agenta_backend/migrations/20240113131802_change_odmantic_reference_to_link.py similarity index 100% rename from agenta-backend/agenta_backend/migrations/20240112120721_change_odmantic_reference_to_link.py rename to agenta-backend/agenta_backend/migrations/20240113131802_change_odmantic_reference_to_link.py From 657ce5beed0cbe2e195ec927a72cace78b9a7f28 Mon Sep 17 00:00:00 2001 From: Abram Date: Sat, 13 Jan 2024 17:23:33 +0100 Subject: [PATCH 378/414] Update - modified steps to allow creation of human evaluations --- .../20240110165900_evaluations_revamp.py | 41 +++++++++---------- 1 file changed, 20 insertions(+), 21 deletions(-) diff --git a/agenta-backend/agenta_backend/migrations/20240110165900_evaluations_revamp.py b/agenta-backend/agenta_backend/migrations/20240110165900_evaluations_revamp.py index e1bb46ec7c..9484542ab1 100644 --- a/agenta-backend/agenta_backend/migrations/20240110165900_evaluations_revamp.py +++ b/agenta-backend/agenta_backend/migrations/20240110165900_evaluations_revamp.py @@ -250,7 +250,7 @@ async def migrate_old_evaluation_to_new_evaluation(self, session): await eval_config.insert(session=session) app_evaluator_configs.append(eval_config) - # STEP 3 (a): + # STEP 3: # Retrieve evaluator configs for app id auto_evaluator_configs: List[PydanticObjectId] = [] for evaluator_config in app_evaluator_configs: @@ -262,26 +262,7 @@ async def 
migrate_old_evaluation_to_new_evaluation(self, session): ]: auto_evaluator_configs.append(evaluator_config.id) - # STEP 3 (b): - # In the case where the evaluator key is a human evaluator, - # Proceed to create the human evaluation with the evaluator config - for evaluator_config in app_evaluator_configs: - if evaluator_config.evaluator_key in [ - "human_a_b_testing", - "single_model_test", - ]: - new_eval = HumanEvaluationDB( - app=old_eval.app, - organization=old_eval.organization, - user=old_eval.user, - status=old_eval.status, - evaluation_type=evaluator_config.evaluator_key, - variants=app_id_store["variant_ids"], - testset=old_eval.testset, - ) - await new_eval.insert(session=session) - - # STEP 3 (c): + # STEP 4: # Proceed to create a single evaluation for every variant in the app_id_store # with the auto_evaluator_configs if auto_evaluator_configs is not None: @@ -298,6 +279,24 @@ async def migrate_old_evaluation_to_new_evaluation(self, session): ) await new_eval.insert(session=session) + # STEP 5: + # Create the human evaluation + for old_evaluation in old_evaluations: + if old_evaluation.evaluation_type in [ + "human_a_b_testing", + "single_model_test", + ]: + new_eval = HumanEvaluationDB( + app=old_evaluation.app, + organization=old_evaluation.organization, + user=old_evaluation.user, + status=old_evaluation.status, + evaluation_type=old_evaluation.evaluation_type, + variants=old_evaluation.variants, + testset=old_evaluation.testset, + ) + await new_eval.insert(session=session) + class Backward: pass From dd8fe7b5919fb983b0cdeacb96cd121c6e762011 Mon Sep 17 00:00:00 2001 From: Abram Date: Sat, 13 Jan 2024 17:55:15 +0100 Subject: [PATCH 379/414] Update - broke down human evaluation scenarios migration logic --- ...40112120721_evaluation_scenarios_revamp.py | 85 +++++++++++++++---- 1 file changed, 70 insertions(+), 15 deletions(-) diff --git a/agenta-backend/agenta_backend/migrations/20240112120721_evaluation_scenarios_revamp.py b/agenta-backend/agenta_backend/migrations/20240112120721_evaluation_scenarios_revamp.py index d8ba377e79..da5cda1033 100644 --- a/agenta-backend/agenta_backend/migrations/20240112120721_evaluation_scenarios_revamp.py +++ b/agenta-backend/agenta_backend/migrations/20240112120721_evaluation_scenarios_revamp.py @@ -363,19 +363,17 @@ async def migrate_old_auto_evaluation_scenario_to_new_auto_evaluation_scenario( HumanEvaluationScenarioDB, ] ) - async def migrate_old_human_evaluation_scenario_to_new_human_evaluation_scenario( + async def migrate_old_human_a_b_evaluation_scenario_to_new_human_evaluation_scenario( self, session ): - old_human_scenarios = await OldEvaluationScenarioDB.find( - In( - OldEvaluationScenarioDB.evaluation.evaluation_type, - ["human_a_b_testing", "single_model_test"], - ), + old_human_ab_testing_scenarios = await OldEvaluationScenarioDB.find( + OldEvaluationScenarioDB.evaluation.evaluation_type == "human_a_b_testing", fetch_links=True, ).to_list() - for old_scenario in old_human_scenarios: + for ab_testing_scenario in old_human_ab_testing_scenarios: matching_human_evaluation = await HumanEvaluationDB.find_one( - HumanEvaluationDB.app.id == old_scenario.evaluation.app.id, + HumanEvaluationDB.app.id == ab_testing_scenario.evaluation.app.id, + HumanEvaluationDB.evaluation_type == "human_a_b_testing", fetch_links=True, ) if matching_human_evaluation: @@ -384,14 +382,14 @@ async def migrate_old_human_evaluation_scenario_to_new_human_evaluation_scenario input_name=input.input_name, input_value=input.input_value, ) - for input in 
old_scenario.inputs + for input in ab_testing_scenario.inputs ] scenario_outputs = [ HumanEvaluationScenarioOutput( variant_id=output.variant_id, variant_output=output.variant_output, ) - for output in old_scenario.outputs + for output in ab_testing_scenario.outputs ] new_scenario = HumanEvaluationScenarioDB( user=matching_human_evaluation.user, @@ -399,11 +397,68 @@ async def migrate_old_human_evaluation_scenario_to_new_human_evaluation_scenario evaluation=matching_human_evaluation, inputs=scenario_inputs, outputs=scenario_outputs, - correct_answer=old_scenario.correct_answer, - is_pinned=old_scenario.is_pinned, - note=old_scenario.note, - vote=old_scenario.vote, - score=old_scenario.score, + correct_answer=ab_testing_scenario.correct_answer, + is_pinned=ab_testing_scenario.is_pinned, + note=ab_testing_scenario.note, + vote=ab_testing_scenario.vote, + score=ab_testing_scenario.score, + ) + await new_scenario.insert(session=session) + + + @free_fall_migration( + document_models=[ + AppDB, + OrganizationDB, + UserDB, + TestSetDB, + EvaluationDB, + OldEvaluationDB, + OldEvaluationScenarioDB, + EvaluationScenarioDB, + HumanEvaluationDB, + HumanEvaluationScenarioDB, + ] + ) + async def migrate_old_human_single_model_evaluation_scenario_to_new_human_evaluation_scenario( + self, session + ): + old_human_single_model_scenarios = await OldEvaluationScenarioDB.find( + OldEvaluationScenarioDB.evaluation.evaluation_type == "single_model_test", + fetch_links=True, + ).to_list() + for single_model_scenario in old_human_single_model_scenarios: + matching_human_evaluation = await HumanEvaluationDB.find_one( + HumanEvaluationDB.app.id == single_model_scenario.evaluation.app.id, + HumanEvaluationDB.evaluation_type == "single_model_test", + fetch_links=True, + ) + if matching_human_evaluation: + scenario_inputs = [ + HumanEvaluationScenarioInput( + input_name=input.input_name, + input_value=input.input_value, + ) + for input in single_model_scenario.inputs + ] + scenario_outputs = [ + HumanEvaluationScenarioOutput( + variant_id=output.variant_id, + variant_output=output.variant_output, + ) + for output in single_model_scenario.outputs + ] + new_scenario = HumanEvaluationScenarioDB( + user=matching_human_evaluation.user, + organization=matching_human_evaluation.organization, + evaluation=matching_human_evaluation, + inputs=scenario_inputs, + outputs=scenario_outputs, + correct_answer=single_model_scenario.correct_answer, + is_pinned=single_model_scenario.is_pinned, + note=single_model_scenario.note, + vote=single_model_scenario.vote, + score=single_model_scenario.score, ) await new_scenario.insert(session=session) From ea18aa920e67d79cd970fe0fab8ff99558cd9dac Mon Sep 17 00:00:00 2001 From: Abram Date: Sat, 13 Jan 2024 18:40:38 +0100 Subject: [PATCH 380/414] Update - modified step 2 logic in evaluations revamp migration --- .../20240110165900_evaluations_revamp.py | 45 ++++++++++++++++++- 1 file changed, 44 insertions(+), 1 deletion(-) diff --git a/agenta-backend/agenta_backend/migrations/20240110165900_evaluations_revamp.py b/agenta-backend/agenta_backend/migrations/20240110165900_evaluations_revamp.py index 9484542ab1..5c4a643f4e 100644 --- a/agenta-backend/agenta_backend/migrations/20240110165900_evaluations_revamp.py +++ b/agenta-backend/agenta_backend/migrations/20240110165900_evaluations_revamp.py @@ -238,7 +238,7 @@ async def migrate_old_evaluation_to_new_evaluation(self, session): await eval_config.insert(session=session) app_evaluator_configs.append(eval_config) - if evaluation_type not in 
["custom_code_run", "auto_similarity_match"]: + if evaluation_type == "auto_exact_match": eval_config = EvaluatorConfigDB( app=old_eval.app, organization=old_eval.organization, @@ -250,6 +250,49 @@ async def migrate_old_evaluation_to_new_evaluation(self, session): await eval_config.insert(session=session) app_evaluator_configs.append(eval_config) + if evaluation_type == "auto_regex_test": + eval_config = EvaluatorConfigDB( + app=old_eval.app, + organization=old_eval.organization, + user=old_eval.user, + name=f"{old_eval.app.app_name}_{evaluation_type}", + evaluator_key=evaluation_type, + settings_values={ + "regex_pattern": old_eval.evaluation_type_settings.regex_pattern, + "regex_should_match": old_eval.evaluation_type_settings.regex_should_match, + }, + ) + await eval_config.insert(session=session) + app_evaluator_configs.append(eval_config) + + if evaluation_type == "auto_webhook_test": + eval_config = EvaluatorConfigDB( + app=old_eval.app, + organization=old_eval.organization, + user=old_eval.user, + name=f"{old_eval.app.app_name}_{evaluation_type}", + evaluator_key=evaluation_type, + settings_values={ + "webhook_url": old_eval.evaluation_type_settings.webhook_url, + }, + ) + await eval_config.insert(session=session) + app_evaluator_configs.append(eval_config) + + if evaluation_type == "auto_ai_critique": + eval_config = EvaluatorConfigDB( + app=old_eval.app, + organization=old_eval.organization, + user=old_eval.user, + name=f"{old_eval.app.app_name}_{evaluation_type}", + evaluator_key=evaluation_type, + settings_values={ + "prompt_template": old_eval.evaluation_type_settings.evaluation_prompt_template + }, + ) + await eval_config.insert(session=session) + app_evaluator_configs.append(eval_config) + # STEP 3: # Retrieve evaluator configs for app id auto_evaluator_configs: List[PydanticObjectId] = [] From 508416c39cfde2a53f7fc32bd3173febe4945ed3 Mon Sep 17 00:00:00 2001 From: Abram Date: Sat, 13 Jan 2024 20:16:05 +0100 Subject: [PATCH 381/414] Update - cleanup and format evaluations revamp migration --- .../20240110165900_evaluations_revamp.py | 43 ++++++++++++------- ...40112120721_evaluation_scenarios_revamp.py | 1 - 2 files changed, 27 insertions(+), 17 deletions(-) diff --git a/agenta-backend/agenta_backend/migrations/20240110165900_evaluations_revamp.py b/agenta-backend/agenta_backend/migrations/20240110165900_evaluations_revamp.py index 5c4a643f4e..a4513d27d6 100644 --- a/agenta-backend/agenta_backend/migrations/20240110165900_evaluations_revamp.py +++ b/agenta-backend/agenta_backend/migrations/20240110165900_evaluations_revamp.py @@ -60,7 +60,7 @@ class EvaluatorConfigDB(Document): user: Link[UserDB] name: str evaluator_key: str - settings_values: Optional[Dict[str, Any]] = None + settings_values: Optional[Dict[str, Any]] created_at: datetime = Field(default=datetime.utcnow()) updated_at: datetime = Field(default=datetime.utcnow()) @@ -219,9 +219,9 @@ async def migrate_old_evaluation_to_new_evaluation(self, session): user=old_eval.user, name=f"{old_eval.app.app_name}_{evaluation_type}", evaluator_key=f"auto_{evaluation_type}", - settings_values={} - if custom_code_evaluation is None - else {"code": custom_code_evaluation.python_code}, + settings_values=dict( + {"code": custom_code_evaluation.python_code} + ), ) await eval_config.insert(session=session) app_evaluator_configs.append(eval_config) @@ -233,7 +233,11 @@ async def migrate_old_evaluation_to_new_evaluation(self, session): user=old_eval.user, name=f"{old_eval.app.app_name}_{evaluation_type}", evaluator_key=evaluation_type, 
- settings_values={"similarity_threshold": 0.5}, + settings_values=dict( + { + "similarity_threshold": float(old_eval.evaluation_type_settings.similarity_threshold) + } + ), ) await eval_config.insert(session=session) app_evaluator_configs.append(eval_config) @@ -245,7 +249,7 @@ async def migrate_old_evaluation_to_new_evaluation(self, session): user=old_eval.user, name=f"{old_eval.app.app_name}_{evaluation_type}", evaluator_key=evaluation_type, - settings_values={}, + settings_values={} ) await eval_config.insert(session=session) app_evaluator_configs.append(eval_config) @@ -257,10 +261,12 @@ async def migrate_old_evaluation_to_new_evaluation(self, session): user=old_eval.user, name=f"{old_eval.app.app_name}_{evaluation_type}", evaluator_key=evaluation_type, - settings_values={ - "regex_pattern": old_eval.evaluation_type_settings.regex_pattern, - "regex_should_match": old_eval.evaluation_type_settings.regex_should_match, - }, + settings_values=dict( + { + "regex_pattern": old_eval.evaluation_type_settings.regex_pattern, + "regex_should_match": old_eval.evaluation_type_settings.regex_should_match, + } + ), ) await eval_config.insert(session=session) app_evaluator_configs.append(eval_config) @@ -272,9 +278,12 @@ async def migrate_old_evaluation_to_new_evaluation(self, session): user=old_eval.user, name=f"{old_eval.app.app_name}_{evaluation_type}", evaluator_key=evaluation_type, - settings_values={ - "webhook_url": old_eval.evaluation_type_settings.webhook_url, - }, + settings_values=dict( + { + "webhook_url": old_eval.evaluation_type_settings.webhook_url, + "webhook_body": {}, + } + ), ) await eval_config.insert(session=session) app_evaluator_configs.append(eval_config) @@ -286,9 +295,11 @@ async def migrate_old_evaluation_to_new_evaluation(self, session): user=old_eval.user, name=f"{old_eval.app.app_name}_{evaluation_type}", evaluator_key=evaluation_type, - settings_values={ - "prompt_template": old_eval.evaluation_type_settings.evaluation_prompt_template - }, + settings_values=dict( + { + "prompt_template": old_eval.evaluation_type_settings.evaluation_prompt_template + } + ), ) await eval_config.insert(session=session) app_evaluator_configs.append(eval_config) diff --git a/agenta-backend/agenta_backend/migrations/20240112120721_evaluation_scenarios_revamp.py b/agenta-backend/agenta_backend/migrations/20240112120721_evaluation_scenarios_revamp.py index da5cda1033..8ad1f2a105 100644 --- a/agenta-backend/agenta_backend/migrations/20240112120721_evaluation_scenarios_revamp.py +++ b/agenta-backend/agenta_backend/migrations/20240112120721_evaluation_scenarios_revamp.py @@ -405,7 +405,6 @@ async def migrate_old_human_a_b_evaluation_scenario_to_new_human_evaluation_scen ) await new_scenario.insert(session=session) - @free_fall_migration( document_models=[ AppDB, From 2774ff923e074dd2e8b1d2fd6462c789dd8c9528 Mon Sep 17 00:00:00 2001 From: Abram Date: Sat, 13 Jan 2024 20:47:58 +0100 Subject: [PATCH 382/414] Update - modified logic to include evaluator result for evaluation results aggregation --- agenta-backend/agenta_backend/tasks/evaluations.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/agenta-backend/agenta_backend/tasks/evaluations.py b/agenta-backend/agenta_backend/tasks/evaluations.py index 5b0b861389..082b4bee96 100644 --- a/agenta-backend/agenta_backend/tasks/evaluations.py +++ b/agenta-backend/agenta_backend/tasks/evaluations.py @@ -86,7 +86,7 @@ def evaluate( # 2. 
Initialize vars evaluators_aggregated_data = { - evaluator_config_db.id: { + str(evaluator_config_db.id): { "evaluator_key": evaluator_config.evaluator_key, "results": [], } @@ -115,7 +115,7 @@ def evaluate( self.update_state(state=states.FAILURE) raise ValueError("Length of csv data and app_outputs are not the same") - return + for data_point, app_output in zip(testset_db.csvdata, app_outputs): # 2. We prepare the inputs logger.debug(f"Preparing inputs for data point: {data_point}") @@ -149,6 +149,10 @@ def evaluate( lm_providers_keys=lm_providers_keys, ) + # Update evaluators aggregated data + evaluator_results: List[Result] = evaluators_aggregated_data[str(evaluator_config_db.id)]["results"] + evaluator_results.append(result) + result_object = EvaluationScenarioResult( evaluator_config=evaluator_config_db.id, result=result, From 01ce41efbcd9274bcac3f1fb5e0cd95507c7546e Mon Sep 17 00:00:00 2001 From: Abram Date: Sat, 13 Jan 2024 20:48:37 +0100 Subject: [PATCH 383/414] Update - modified evaluator config db model --- agenta-backend/agenta_backend/models/db_models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/agenta-backend/agenta_backend/models/db_models.py b/agenta-backend/agenta_backend/models/db_models.py index 68d15450b8..11ad909077 100644 --- a/agenta-backend/agenta_backend/models/db_models.py +++ b/agenta-backend/agenta_backend/models/db_models.py @@ -200,7 +200,7 @@ class EvaluatorConfigDB(Document): user: Link[UserDB] name: str evaluator_key: str - settings_values: Optional[Dict[str, Any]] = None + settings_values: Dict[str, Any] = Field(default=dict) created_at: datetime = Field(default=datetime.utcnow()) updated_at: datetime = Field(default=datetime.utcnow()) From 85ae65f27e5e5297f26a70ba59d043ff2cd1e486 Mon Sep 17 00:00:00 2001 From: Abram Date: Sat, 13 Jan 2024 23:26:33 +0100 Subject: [PATCH 384/414] Update - tiny cleanup and format --- .../migrations/20240110165900_evaluations_revamp.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/agenta-backend/agenta_backend/migrations/20240110165900_evaluations_revamp.py b/agenta-backend/agenta_backend/migrations/20240110165900_evaluations_revamp.py index a4513d27d6..2a99a65a84 100644 --- a/agenta-backend/agenta_backend/migrations/20240110165900_evaluations_revamp.py +++ b/agenta-backend/agenta_backend/migrations/20240110165900_evaluations_revamp.py @@ -162,7 +162,7 @@ def modify_app_id_store( app_id_store = app_keyvalue_store[app_id] app_id_store_variant_ids = list(app_id_store["variant_ids"]) - if variant_ids not in list(app_id_store["variant_ids"]): + if variant_ids not in app_id_store_variant_ids: app_id_store_variant_ids.extend(variant_ids) app_id_store["variant_ids"] = list(set(app_id_store_variant_ids)) @@ -235,7 +235,9 @@ async def migrate_old_evaluation_to_new_evaluation(self, session): evaluator_key=evaluation_type, settings_values=dict( { - "similarity_threshold": float(old_eval.evaluation_type_settings.similarity_threshold) + "similarity_threshold": float( + old_eval.evaluation_type_settings.similarity_threshold + ) } ), ) @@ -249,7 +251,7 @@ async def migrate_old_evaluation_to_new_evaluation(self, session): user=old_eval.user, name=f"{old_eval.app.app_name}_{evaluation_type}", evaluator_key=evaluation_type, - settings_values={} + settings_values={}, ) await eval_config.insert(session=session) app_evaluator_configs.append(eval_config) From d989fbb8bf00c575348043eeec1d25d1c812ee25 Mon Sep 17 00:00:00 2001 From: Abram Date: Sat, 13 Jan 2024 23:27:17 +0100 Subject: [PATCH 385/414] 
Update - tiny cleanup and format

---
 agenta-backend/agenta_backend/tasks/evaluations.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/agenta-backend/agenta_backend/tasks/evaluations.py b/agenta-backend/agenta_backend/tasks/evaluations.py
index 082b4bee96..6bb45e0924 100644
--- a/agenta-backend/agenta_backend/tasks/evaluations.py
+++ b/agenta-backend/agenta_backend/tasks/evaluations.py
@@ -150,7 +150,9 @@ def evaluate(
             )
 
             # Update evaluators aggregated data
-            evaluator_results: List[Result] = evaluators_aggregated_data[str(evaluator_config_db.id)]["results"]
+            evaluator_results: List[Result] = evaluators_aggregated_data[
+                str(evaluator_config_db.id)
+            ]["results"]
             evaluator_results.append(result)
 
             result_object = EvaluationScenarioResult(

From 7943810ad675b3b2ab9f82cfdfdb9a73fe3e2b14 Mon Sep 17 00:00:00 2001
From: Abram
Date: Sat, 13 Jan 2024 23:27:39 +0100
Subject: [PATCH 386/414] Feat - created migration logic to aggregate evaluation scenario results

---
 ...4909_new_evaluation_results_aggregation.py | 257 ++++++++++++++++++
 1 file changed, 257 insertions(+)
 create mode 100644 agenta-backend/agenta_backend/migrations/20240113204909_new_evaluation_results_aggregation.py

diff --git a/agenta-backend/agenta_backend/migrations/20240113204909_new_evaluation_results_aggregation.py b/agenta-backend/agenta_backend/migrations/20240113204909_new_evaluation_results_aggregation.py
new file mode 100644
index 0000000000..884b419881
--- /dev/null
+++ b/agenta-backend/agenta_backend/migrations/20240113204909_new_evaluation_results_aggregation.py
@@ -0,0 +1,257 @@
+from datetime import datetime
+from typing import Any, Dict, List, Optional
+
+from pydantic import BaseModel, Field
+from beanie import free_fall_migration, Document, Link, PydanticObjectId
+
+
+class OrganizationDB(Document):
+    name: str = Field(default="agenta")
+    description: str = Field(default="")
+    type: Optional[str]
+    owner: str  # user id
+    members: Optional[List[PydanticObjectId]]
+    created_at: Optional[datetime] = Field(default=datetime.utcnow())
+    updated_at: Optional[datetime] = Field(default=datetime.utcnow())
+
+    class Settings:
+        name = "organizations"
+
+
+class UserDB(Document):
+    uid: str = Field(default="0", unique=True, index=True)
+    username: str = Field(default="agenta")
+    email: str = Field(default="demo@agenta.ai", unique=True)
+    organizations: Optional[List[PydanticObjectId]] = []
+    created_at: Optional[datetime] = Field(default=datetime.utcnow())
+    updated_at: Optional[datetime] = Field(default=datetime.utcnow())
+
+    class Settings:
+        name = "users"
+
+
+class AppDB(Document):
+    app_name: str
+    organization: Link[OrganizationDB]
+    user: Link[UserDB]
+    created_at: Optional[datetime] = Field(default=datetime.utcnow())
+    updated_at: Optional[datetime] = Field(default=datetime.utcnow())
+
+    class Settings:
+        name = "app_db"
+
+
+class TestSetDB(Document):
+    name: str
+    app: Link[AppDB]
+    csvdata: List[Dict[str, str]]
+    user: Link[UserDB]
+    organization: Link[OrganizationDB]
+    created_at: Optional[datetime] = Field(default=datetime.utcnow())
+    updated_at: Optional[datetime] = Field(default=datetime.utcnow())
+
+    class Settings:
+        name = "testsets"
+
+
+class Result(BaseModel):
+    type: str
+    value: Any
+
+
+class EvaluationScenarioResult(BaseModel):
+    evaluator_config: PydanticObjectId
+    result: Result
+
+
+class AggregatedResult(BaseModel):
+    evaluator_config: PydanticObjectId
+    result: Result
+
+
+class EvaluationScenarioInputDB(BaseModel):
+    name: str
+    type: str
+    value: str
+
+
+class EvaluationScenarioOutputDB(BaseModel):
+    type: str
+    value: Any
+
+
+class EvaluationDB(Document):
+    app: Link[AppDB]
+    organization: Link[OrganizationDB]
+    user: Link[UserDB]
+    status: str = Field(default="EVALUATION_INITIALIZED")
+    testset: Link[TestSetDB]
+    variant: PydanticObjectId
+    evaluators_configs: List[PydanticObjectId]
+    aggregated_results: List[AggregatedResult]
+    created_at: datetime = Field(default=datetime.utcnow())
+    updated_at: datetime = Field(default=datetime.utcnow())
+
+    class Settings:
+        name = "new_evaluations"
+
+
+class EvaluationScenarioDB(Document):
+    user: Link[UserDB]
+    organization: Link[OrganizationDB]
+    evaluation: Link[EvaluationDB]
+    variant_id: PydanticObjectId
+    inputs: List[EvaluationScenarioInputDB]
+    outputs: List[EvaluationScenarioOutputDB]
+    correct_answer: Optional[str]
+    is_pinned: Optional[bool]
+    note: Optional[str]
+    evaluators_configs: List[PydanticObjectId]
+    results: List[EvaluationScenarioResult]
+    created_at: datetime = Field(default=datetime.utcnow())
+    updated_at: datetime = Field(default=datetime.utcnow())
+
+    class Settings:
+        name = "new_evaluation_scenarios"
+
+
+def prepare_evaluation_keyvalue_store(
+    evaluation_id: str, evaluator_id: str, evaluation_keyvalue_store: Dict
+) -> Dict[str, Dict[str, Any]]:
+    """
+    Construct a key-value store to save results based on an evaluator config in an evaluation
+
+    Args:
+        evaluation_id (str): ID of evaluation
+        evaluator_id (str): ID of evaluator config
+        evaluation_keyvalue_store (Dict): evaluation keyvalue store
+
+    Returns:
+        Dict[str, Dict[str, Any]]: {"evaluation_id": {"evaluation_config_id": {"results": [Result("type": str, "value": Any)]}}}
+    """
+
+    if evaluation_id not in evaluation_keyvalue_store:
+        evaluation_keyvalue_store[evaluation_id] = {}
+
+    if evaluator_id not in evaluation_keyvalue_store[evaluation_id]:
+        evaluation_keyvalue_store[evaluation_id][evaluator_id] = {"results": []}
+
+    return evaluation_keyvalue_store
+
+
+def get_numeric_value(value: Any):
+    """
+    Converts the given value to a numeric representation, with specific
+    conversions for strings such as 'correct', 'wrong', 'true', and 'false'.
+    """
+
+    if isinstance(value, str):
+        if value.lower() == "correct":
+            return 1
+        elif value.lower() == "wrong":
+            return 0
+        elif value.lower() == "true":
+            return float(True)
+        elif value.lower() == "false":
+            return float(False)
+        else:
+            return float(value)
+    return 0
+
+
+def aggregate_evaluator_results(
+    evaluators_aggregated_data: dict,
+) -> List[AggregatedResult]:
+    aggregated_results = []
+    for config_id, evaluator_store in evaluators_aggregated_data.items():
+        results: List[EvaluationScenarioResult] = evaluator_store.get("results", [])
+        if len(results) >= 1:
+            values = [get_numeric_value(result.result.value) for result in results]
+            average_value = sum(values) / len(values)
+        else:
+            average_value = 0
+
+        aggregated_result = AggregatedResult(
+            evaluator_config=PydanticObjectId(config_id),
+            result=Result(type="number", value=round(average_value, 4)),
+        )
+        aggregated_results.append(aggregated_result)
+    return aggregated_results
+
+
+def modify_evaluation_scenario_store(
+    evaluator_id: str,
+    result: Result,
+    evaluation_keyvalue_store: Dict[str, Dict[str, List[Any]]],
+):
+    """
+    Updates an evaluation scenario store by adding a result to the list of results for a
+    specific evaluation and evaluator.
+
+    Args:
+        evaluator_id (str): ID of evaluator config
+        result: The evaluation result that needs to be added to the evaluation_results list
+        evaluation_keyvalue_store: The store that holds the evaluation data
+    """
+
+    evaluation_evaluator_config_store = evaluation_keyvalue_store[evaluator_id]
+    evaluation_results = list(evaluation_evaluator_config_store["results"])
+    if result not in evaluation_results:
+        evaluation_results.append(result)
+    evaluation_evaluator_config_store["results"] = list(evaluation_results)
+
+
+class Forward:
+    @free_fall_migration(
+        document_models=[
+            AppDB,
+            OrganizationDB,
+            UserDB,
+            TestSetDB,
+            EvaluationDB,
+            EvaluationScenarioDB,
+        ]
+    )
+    async def aggregate_new_evaluation_with_evaluation_scenario_results(self, session):
+        # STEP 1:
+        # Create a key-value store that saves all the evaluator configs & results for a particular evaluation id
+        # Example: {"evaluation_id": {"evaluation_config_id": {"results": [Result("type": str, "value": Any)]}}}
+        evaluation_keyvalue_store = {}
+        new_auto_evaluations = await EvaluationDB.find().to_list()
+        for auto_evaluation in new_auto_evaluations:
+            for evaluator_config in auto_evaluation.evaluators_configs:
+                evaluation_keyvalue_store = prepare_evaluation_keyvalue_store(
+                    str(auto_evaluation.id),
+                    str(evaluator_config),
+                    evaluation_keyvalue_store,
+                )
+
+        # STEP 2:
+        # Update the evaluation key-value store
+        new_auto_evaluation_scenarios = await EvaluationScenarioDB.find(
+            fetch_links=True
+        ).to_list()
+        for auto_evaluation in new_auto_evaluation_scenarios:
+            evaluation_id = str(auto_evaluation.evaluation.id)
+            evaluation_store = evaluation_keyvalue_store[evaluation_id]
+            configs_with_results = zip(
+                auto_evaluation.evaluators_configs, auto_evaluation.results
+            )
+            for evaluator, result in configs_with_results:
+                modify_evaluation_scenario_store(
+                    str(evaluator), result, evaluation_store
+                )
+
+        # STEP 3:
+        # Modify the evaluations with the aggregated results from the keyvalue store
+        for auto_evaluation in new_auto_evaluations:
+            aggregated_results = aggregate_evaluator_results(
+                evaluation_keyvalue_store[str(auto_evaluation.id)]
+            )
+            auto_evaluation.aggregated_results = aggregated_results
+            auto_evaluation.updated_at = datetime.utcnow().isoformat()
+            await auto_evaluation.save()
+
+
+class Backward:
+    pass

From 42eb6c9fda7fb8e16a0195226ad4b915802f9be3 Mon Sep 17 00:00:00 2001
From: Abram
Date: Sat, 13 Jan 2024 23:29:06 +0100
Subject: [PATCH 387/414] Update - switch timestamp between migration files 4 and 5

---
 ...on.py => 20240113131802_new_evaluation_results_aggregation.py} | 0
 ...ink.py => 20240113204909_change_odmantic_reference_to_link.py} | 0
 2 files changed, 0 insertions(+), 0 deletions(-)
 rename agenta-backend/agenta_backend/migrations/{20240113204909_new_evaluation_results_aggregation.py => 20240113131802_new_evaluation_results_aggregation.py} (100%)
 rename agenta-backend/agenta_backend/migrations/{20240113131802_change_odmantic_reference_to_link.py => 20240113204909_change_odmantic_reference_to_link.py} (100%)

diff --git a/agenta-backend/agenta_backend/migrations/20240113204909_new_evaluation_results_aggregation.py b/agenta-backend/agenta_backend/migrations/20240113131802_new_evaluation_results_aggregation.py
similarity index 100%
rename from agenta-backend/agenta_backend/migrations/20240113204909_new_evaluation_results_aggregation.py
rename to agenta-backend/agenta_backend/migrations/20240113131802_new_evaluation_results_aggregation.py
diff --git
a/agenta-backend/agenta_backend/migrations/20240113131802_change_odmantic_reference_to_link.py b/agenta-backend/agenta_backend/migrations/20240113204909_change_odmantic_reference_to_link.py similarity index 100% rename from agenta-backend/agenta_backend/migrations/20240113131802_change_odmantic_reference_to_link.py rename to agenta-backend/agenta_backend/migrations/20240113204909_change_odmantic_reference_to_link.py From f0570952132d7aeba24af0d9748d6c21ed751a5c Mon Sep 17 00:00:00 2001 From: Abram Date: Sun, 14 Jan 2024 00:00:09 +0100 Subject: [PATCH 388/414] Update - added status to evaluation --- .../20240113131802_new_evaluation_results_aggregation.py | 1 + 1 file changed, 1 insertion(+) diff --git a/agenta-backend/agenta_backend/migrations/20240113131802_new_evaluation_results_aggregation.py b/agenta-backend/agenta_backend/migrations/20240113131802_new_evaluation_results_aggregation.py index 884b419881..935c6c66be 100644 --- a/agenta-backend/agenta_backend/migrations/20240113131802_new_evaluation_results_aggregation.py +++ b/agenta-backend/agenta_backend/migrations/20240113131802_new_evaluation_results_aggregation.py @@ -248,6 +248,7 @@ async def aggregate_new_evaluation_with_evaluation_scenario_results(self, sessio aggregated_results = aggregate_evaluator_results( evaluation_keyvalue_store[str(auto_evaluation.id)] ) + auto_evaluation.status = "EVALUATION_FINISHED" auto_evaluation.aggregated_results = aggregated_results auto_evaluation.updated_at = datetime.utcnow().isoformat() await auto_evaluation.save() From 7e3083ba5c7d7a336dd4fcdd46f9e42e80fb5cdd Mon Sep 17 00:00:00 2001 From: Abram Date: Mon, 15 Jan 2024 11:39:24 +0100 Subject: [PATCH 389/414] Update - modified logic to get_numeric_value for aggregated results --- .../20240113131802_new_evaluation_results_aggregation.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/agenta-backend/agenta_backend/migrations/20240113131802_new_evaluation_results_aggregation.py b/agenta-backend/agenta_backend/migrations/20240113131802_new_evaluation_results_aggregation.py index 935c6c66be..09856eda80 100644 --- a/agenta-backend/agenta_backend/migrations/20240113131802_new_evaluation_results_aggregation.py +++ b/agenta-backend/agenta_backend/migrations/20240113131802_new_evaluation_results_aggregation.py @@ -155,7 +155,10 @@ def get_numeric_value(value: Any): elif value.lower() == "false": return float(False) else: - return float(value) + try: + return float(value) + except ValueError: + return 0 return 0 From a7590e6f5ea138e6dc0a2b845c031c77329dfc56 Mon Sep 17 00:00:00 2001 From: Abram Date: Mon, 15 Jan 2024 13:22:06 +0100 Subject: [PATCH 390/414] Update - modify logic to assign evaluations to their respective users --- .../20240110165900_evaluations_revamp.py | 55 ++++++++++--------- 1 file changed, 28 insertions(+), 27 deletions(-) diff --git a/agenta-backend/agenta_backend/migrations/20240110165900_evaluations_revamp.py b/agenta-backend/agenta_backend/migrations/20240110165900_evaluations_revamp.py index 2a99a65a84..689b277865 100644 --- a/agenta-backend/agenta_backend/migrations/20240110165900_evaluations_revamp.py +++ b/agenta-backend/agenta_backend/migrations/20240110165900_evaluations_revamp.py @@ -205,6 +205,7 @@ async def migrate_old_evaluation_to_new_evaluation(self, session): # based on the evaluation types available for app_id, app_id_store in app_keyvalue_store.items(): app_evaluator_configs: List[EvaluatorConfigDB] = [] + app_db = await AppDB.find_one(AppDB.id == PydanticObjectId(app_id)) for evaluation_type in 
app_id_store[ "evaluation_types" ]: # the values in this case are the evaluation type @@ -214,10 +215,10 @@ async def migrate_old_evaluation_to_new_evaluation(self, session): if evaluation_type == "custom_code_run": for custom_code_evaluation in custom_code_evaluations: eval_config = EvaluatorConfigDB( - app=old_eval.app, - organization=old_eval.organization, - user=old_eval.user, - name=f"{old_eval.app.app_name}_{evaluation_type}", + app=app_db, + organization=app_db.organization, + user=app_db.user, + name=f"{app_db.app_name}_{evaluation_type}", evaluator_key=f"auto_{evaluation_type}", settings_values=dict( {"code": custom_code_evaluation.python_code} @@ -228,10 +229,10 @@ async def migrate_old_evaluation_to_new_evaluation(self, session): if evaluation_type == "auto_similarity_match": eval_config = EvaluatorConfigDB( - app=old_eval.app, - organization=old_eval.organization, - user=old_eval.user, - name=f"{old_eval.app.app_name}_{evaluation_type}", + app=app_db, + organization=app_db.organization, + user=app_db.user, + name=f"{app_db.app_name}_{evaluation_type}", evaluator_key=evaluation_type, settings_values=dict( { @@ -246,10 +247,10 @@ async def migrate_old_evaluation_to_new_evaluation(self, session): if evaluation_type == "auto_exact_match": eval_config = EvaluatorConfigDB( - app=old_eval.app, - organization=old_eval.organization, - user=old_eval.user, - name=f"{old_eval.app.app_name}_{evaluation_type}", + app=app_db, + organization=app_db.organization, + user=app_db.user, + name=f"{app_db.app_name}_{evaluation_type}", evaluator_key=evaluation_type, settings_values={}, ) @@ -258,10 +259,10 @@ async def migrate_old_evaluation_to_new_evaluation(self, session): if evaluation_type == "auto_regex_test": eval_config = EvaluatorConfigDB( - app=old_eval.app, - organization=old_eval.organization, - user=old_eval.user, - name=f"{old_eval.app.app_name}_{evaluation_type}", + app=app_db, + organization=app_db.organization, + user=app_db.user, + name=f"{app_db.app_name}_{evaluation_type}", evaluator_key=evaluation_type, settings_values=dict( { @@ -275,10 +276,10 @@ async def migrate_old_evaluation_to_new_evaluation(self, session): if evaluation_type == "auto_webhook_test": eval_config = EvaluatorConfigDB( - app=old_eval.app, - organization=old_eval.organization, - user=old_eval.user, - name=f"{old_eval.app.app_name}_{evaluation_type}", + app=app_db, + organization=app_db.organization, + user=app_db.user, + name=f"{app_db.app_name}_{evaluation_type}", evaluator_key=evaluation_type, settings_values=dict( { @@ -292,10 +293,10 @@ async def migrate_old_evaluation_to_new_evaluation(self, session): if evaluation_type == "auto_ai_critique": eval_config = EvaluatorConfigDB( - app=old_eval.app, - organization=old_eval.organization, - user=old_eval.user, - name=f"{old_eval.app.app_name}_{evaluation_type}", + app=app_db, + organization=app_db.organization, + user=app_db.user, + name=f"{app_db.app_name}_{evaluation_type}", evaluator_key=evaluation_type, settings_values=dict( { @@ -324,9 +325,9 @@ async def migrate_old_evaluation_to_new_evaluation(self, session): if auto_evaluator_configs is not None: for variant in app_id_store["variant_ids"]: new_eval = EvaluationDB( - app=old_eval.app, - organization=old_eval.organization, - user=old_eval.user, + app=app_db, + organization=app_db.organization, + user=app_db.user, status=old_eval.status, testset=old_eval.testset, variant=PydanticObjectId(variant), From ec0f59d10c1d2727e41feba55fcde0923437a0d3 Mon Sep 17 00:00:00 2001 From: Abram Date: Tue, 16 Jan 2024 15:23:25 
+0100 Subject: [PATCH 391/414] Update - rename AppEnvironmentDB collection name and modified query to list environments --- agenta-backend/agenta_backend/models/db_models.py | 2 +- agenta-backend/agenta_backend/services/db_manager.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/agenta-backend/agenta_backend/models/db_models.py b/agenta-backend/agenta_backend/models/db_models.py index 11ad909077..14125caecc 100644 --- a/agenta-backend/agenta_backend/models/db_models.py +++ b/agenta-backend/agenta_backend/models/db_models.py @@ -162,7 +162,7 @@ class AppEnvironmentDB(Document): created_at: Optional[datetime] = Field(default=datetime.utcnow()) class Settings: - name = "app_environment_db" + name = "environments" class TemplateDB(Document): diff --git a/agenta-backend/agenta_backend/services/db_manager.py b/agenta-backend/agenta_backend/services/db_manager.py index adbac48879..c2486c8e4d 100644 --- a/agenta-backend/agenta_backend/services/db_manager.py +++ b/agenta-backend/agenta_backend/services/db_manager.py @@ -1009,7 +1009,7 @@ async def list_environments_by_variant( """ environments_db = await AppEnvironmentDB.find( - AppEnvironmentDB.app == app_variant.app.id, fetch_links=True + AppEnvironmentDB.app.id == app_variant.app.id, fetch_links=True ).to_list() return environments_db From 944d7743a625c8e792e5f2ed23041fa1b5923d70 Mon Sep 17 00:00:00 2001 From: Abram Date: Tue, 16 Jan 2024 15:24:59 +0100 Subject: [PATCH 392/414] Update - modified migration 2 and 5 files --- .../migrations/20240110165900_evaluations_revamp.py | 2 ++ .../20240113204909_change_odmantic_reference_to_link.py | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/agenta-backend/agenta_backend/migrations/20240110165900_evaluations_revamp.py b/agenta-backend/agenta_backend/migrations/20240110165900_evaluations_revamp.py index 689b277865..7e32e32714 100644 --- a/agenta-backend/agenta_backend/migrations/20240110165900_evaluations_revamp.py +++ b/agenta-backend/agenta_backend/migrations/20240110165900_evaluations_revamp.py @@ -333,6 +333,7 @@ async def migrate_old_evaluation_to_new_evaluation(self, session): variant=PydanticObjectId(variant), evaluators_configs=auto_evaluator_configs, aggregated_results=[], + created_at=old_evaluation.created_at, ) await new_eval.insert(session=session) @@ -351,6 +352,7 @@ async def migrate_old_evaluation_to_new_evaluation(self, session): evaluation_type=old_evaluation.evaluation_type, variants=old_evaluation.variants, testset=old_evaluation.testset, + created_at=old_evaluation.created_at, ) await new_eval.insert(session=session) diff --git a/agenta-backend/agenta_backend/migrations/20240113204909_change_odmantic_reference_to_link.py b/agenta-backend/agenta_backend/migrations/20240113204909_change_odmantic_reference_to_link.py index 12518af42c..4c5f164d29 100644 --- a/agenta-backend/agenta_backend/migrations/20240113204909_change_odmantic_reference_to_link.py +++ b/agenta-backend/agenta_backend/migrations/20240113204909_change_odmantic_reference_to_link.py @@ -164,7 +164,7 @@ class AppEnvironmentDB(Document): created_at: Optional[datetime] = Field(default=datetime.utcnow()) class Settings: - name = "app_environment_db" + name = "environments" class TemplateDB(Document): From 6146555080dbacc616c1541b3496cdec3f2215de Mon Sep 17 00:00:00 2001 From: Akrem Abayed Date: Tue, 16 Jan 2024 17:23:05 +0100 Subject: [PATCH 393/414] add backup script to save time --- .../agenta_backend/migrations/backup.py | 30 +++++++++++++++++++ 1 file changed, 30 insertions(+) 
create mode 100644 agenta-backend/agenta_backend/migrations/backup.py diff --git a/agenta-backend/agenta_backend/migrations/backup.py b/agenta-backend/agenta_backend/migrations/backup.py new file mode 100644 index 0000000000..ffbbb60212 --- /dev/null +++ b/agenta-backend/agenta_backend/migrations/backup.py @@ -0,0 +1,30 @@ +import asyncio +from pymongo import MongoClient + +async def drop_and_restore_collections(session=None): + print("dropping and restoring collections") + client = MongoClient("mongodb://username:password@mongo") + backup_db_name = "agenta_v2_backup" + main_db = "agenta_v2" + agenta_v2_db = client[main_db] + + # Drop all collections in the agenta_v2 database + for collection in agenta_v2_db.list_collection_names(): + agenta_v2_db[collection].drop() + + # Restore collections from agenta_v2_cloud_backup database + backup_db = client[backup_db_name] + for collection in backup_db.list_collection_names(): + data = list(backup_db[collection].find()) + if data: + agenta_v2_db[collection].insert_many(data) + + client.close() + +# Main entry point for the script +async def main(): + await drop_and_restore_collections() + +# Run the main function +if __name__ == "__main__": + asyncio.run(main()) \ No newline at end of file From 6100cc94e3efb13f211960e4247d0527c6663d92 Mon Sep 17 00:00:00 2001 From: Mahmoud Mabrouk Date: Tue, 16 Jan 2024 18:18:27 +0100 Subject: [PATCH 394/414] reverted dev.dockerfile --- agenta-web/dev.Dockerfile | 60 +++++++++++++++++++-------------------- 1 file changed, 30 insertions(+), 30 deletions(-) diff --git a/agenta-web/dev.Dockerfile b/agenta-web/dev.Dockerfile index ad5bc9a53b..6af573852c 100644 --- a/agenta-web/dev.Dockerfile +++ b/agenta-web/dev.Dockerfile @@ -1,37 +1,37 @@ FROM node:18-alpine -# WORKDIR /app +WORKDIR /app -# # Install dependencies based on the preferred package manager -# COPY package.json yarn.lock* package-lock.json* pnpm-lock.yaml* ./ -# RUN \ -# if [ -f yarn.lock ]; then yarn --frozen-lockfile; \ -# elif [ -f package-lock.json ]; then npm i; \ -# elif [ -f pnpm-lock.yaml ]; then yarn global add pnpm && pnpm i; \ -# # Allow install without lockfile, so example works even without Node.js installed locally -# else echo "Warning: Lockfile not found. It is recommended to commit lockfiles to version control." && yarn install; \ -# fi +# Install dependencies based on the preferred package manager +COPY package.json yarn.lock* package-lock.json* pnpm-lock.yaml* ./ +RUN \ + if [ -f yarn.lock ]; then yarn --frozen-lockfile; \ + elif [ -f package-lock.json ]; then npm i; \ + elif [ -f pnpm-lock.yaml ]; then yarn global add pnpm && pnpm i; \ + # Allow install without lockfile, so example works even without Node.js installed locally + else echo "Warning: Lockfile not found. It is recommended to commit lockfiles to version control." && yarn install; \ + fi -# COPY src ./src -# COPY public ./public -# COPY next.config.js . -# COPY tsconfig.json . -# COPY postcss.config.js . -# COPY .env . -# RUN if [ -f .env.local ]; then cp .env.local .; fi -# # # used in cloud -# COPY sentry.* . -# # Next.js collects completely anonymous telemetry data about general usage. Learn more here: https://nextjs.org/telemetry -# # Uncomment the following line to disable telemetry at run time -# # ENV NEXT_TELEMETRY_DISABLED 1 +COPY src ./src +COPY public ./public +COPY next.config.js . +COPY tsconfig.json . +COPY postcss.config.js . +COPY .env . +RUN if [ -f .env.local ]; then cp .env.local .; fi +# # used in cloud +COPY sentry.* . 
+# Next.js collects completely anonymous telemetry data about general usage. Learn more here: https://nextjs.org/telemetry +# Uncomment the following line to disable telemetry at run time +# ENV NEXT_TELEMETRY_DISABLED 1 -# # Note: Don't expose ports here, Compose will handle that for us +# Note: Don't expose ports here, Compose will handle that for us -# # Start Next.js in development mode based on the preferred package manager -# CMD \ -# if [ -f yarn.lock ]; then yarn dev; \ -# elif [ -f package-lock.json ]; then npm run dev; \ -# elif [ -f pnpm-lock.yaml ]; then pnpm dev; \ -# else yarn dev; \ -# fi +# Start Next.js in development mode based on the preferred package manager +CMD \ + if [ -f yarn.lock ]; then yarn dev; \ + elif [ -f package-lock.json ]; then npm run dev; \ + elif [ -f pnpm-lock.yaml ]; then pnpm dev; \ + else yarn dev; \ + fi From 556598169f15bf730b866752ae6690a31920bf92 Mon Sep 17 00:00:00 2001 From: Abram Date: Tue, 16 Jan 2024 18:28:09 +0100 Subject: [PATCH 395/414] Update - modified logic to allow a/b evaluation to be separate after migration --- .../20240110165900_evaluations_revamp.py | 55 +++++++++++++------ ...40112120721_evaluation_scenarios_revamp.py | 4 +- .../services/results_service.py | 2 +- 3 files changed, 42 insertions(+), 19 deletions(-) diff --git a/agenta-backend/agenta_backend/migrations/20240110165900_evaluations_revamp.py b/agenta-backend/agenta_backend/migrations/20240110165900_evaluations_revamp.py index 689b277865..c995f2e2e9 100644 --- a/agenta-backend/agenta_backend/migrations/20240110165900_evaluations_revamp.py +++ b/agenta-backend/agenta_backend/migrations/20240110165900_evaluations_revamp.py @@ -1,6 +1,7 @@ from datetime import datetime from typing import Any, Dict, List, Optional +from beanie.operators import In from pydantic import BaseModel, Field from beanie import free_fall_migration, Document, Link, PydanticObjectId @@ -191,7 +192,20 @@ async def migrate_old_evaluation_to_new_evaluation(self, session): # Create a key-value store that saves all the variants & evaluation types for a particular app id # Example: {"app_id": {"evaluation_types": ["string", "string"], "variant_ids": ["string", "string"]}} app_keyvalue_store = {} - old_evaluations = await OldEvaluationDB.find(fetch_links=True).to_list() + old_evaluations = await OldEvaluationDB.find( + In( + OldEvaluationDB.evaluation_type, + [ + "auto_exact_match", + "auto_similarity_match", + "auto_regex_test", + "auto_ai_critique", + "auto_custom_code_run", + "auto_webhook_test", + ], + ), + fetch_links=True, + ).to_list() for old_eval in old_evaluations: app_id = old_eval.app.id variant_ids = [str(variant_id) for variant_id in old_eval.variants] @@ -338,21 +352,30 @@ async def migrate_old_evaluation_to_new_evaluation(self, session): # STEP 5: # Create the human evaluation - for old_evaluation in old_evaluations: - if old_evaluation.evaluation_type in [ - "human_a_b_testing", - "single_model_test", - ]: - new_eval = HumanEvaluationDB( - app=old_evaluation.app, - organization=old_evaluation.organization, - user=old_evaluation.user, - status=old_evaluation.status, - evaluation_type=old_evaluation.evaluation_type, - variants=old_evaluation.variants, - testset=old_evaluation.testset, - ) - await new_eval.insert(session=session) + old_human_evaluations = await OldEvaluationDB.find( + In( + OldEvaluationDB.evaluation_type, + [ + "human_a_b_testing", + "single_model_test", + ], + ), + fetch_links=True, + ).to_list() + for old_evaluation in old_human_evaluations: + new_eval = HumanEvaluationDB( 
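+                # Reuse the old evaluation's ObjectId so existing scenario references still resolve to this document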
+ id=old_evaluation.id, + app=old_evaluation.app, + organization=old_evaluation.organization, + user=old_evaluation.user, + status=old_evaluation.status, + evaluation_type=old_evaluation.evaluation_type, + variants=old_evaluation.variants, + testset=old_evaluation.testset, + created_at=old_evaluation.created_at, + updated_at=old_evaluation.updated_at, + ) + await new_eval.insert(session=session) class Backward: diff --git a/agenta-backend/agenta_backend/migrations/20240112120721_evaluation_scenarios_revamp.py b/agenta-backend/agenta_backend/migrations/20240112120721_evaluation_scenarios_revamp.py index 8ad1f2a105..3af89301cb 100644 --- a/agenta-backend/agenta_backend/migrations/20240112120721_evaluation_scenarios_revamp.py +++ b/agenta-backend/agenta_backend/migrations/20240112120721_evaluation_scenarios_revamp.py @@ -372,7 +372,7 @@ async def migrate_old_human_a_b_evaluation_scenario_to_new_human_evaluation_scen ).to_list() for ab_testing_scenario in old_human_ab_testing_scenarios: matching_human_evaluation = await HumanEvaluationDB.find_one( - HumanEvaluationDB.app.id == ab_testing_scenario.evaluation.app.id, + HumanEvaluationDB.id == ab_testing_scenario.evaluation.id, HumanEvaluationDB.evaluation_type == "human_a_b_testing", fetch_links=True, ) @@ -428,7 +428,7 @@ async def migrate_old_human_single_model_evaluation_scenario_to_new_human_evalua ).to_list() for single_model_scenario in old_human_single_model_scenarios: matching_human_evaluation = await HumanEvaluationDB.find_one( - HumanEvaluationDB.app.id == single_model_scenario.evaluation.app.id, + HumanEvaluationDB.id == single_model_scenario.evaluation.id, HumanEvaluationDB.evaluation_type == "single_model_test", fetch_links=True, ) diff --git a/agenta-backend/agenta_backend/services/results_service.py b/agenta-backend/agenta_backend/services/results_service.py index d33a1c419f..d04ee2976e 100644 --- a/agenta-backend/agenta_backend/services/results_service.py +++ b/agenta-backend/agenta_backend/services/results_service.py @@ -11,7 +11,7 @@ async def fetch_results_for_evaluation(evaluation: HumanEvaluationDB): evaluation_scenarios = await HumanEvaluationScenarioDB.find( - HumanEvaluationScenarioDB.evaluation.id == ObjectId(evaluation.id), + HumanEvaluationScenarioDB.evaluation.id == evaluation.id, ).to_list() results = {} From a61d70f37ad3ac3cb3d862324251c8e3d4b8e7df Mon Sep 17 00:00:00 2001 From: Abram Date: Tue, 16 Jan 2024 18:28:09 +0100 Subject: [PATCH 396/414] Update - modified logic to allow a/b evaluation to be separate after migration --- .../20240110165900_evaluations_revamp.py | 55 +++++++++++++------ ...40112120721_evaluation_scenarios_revamp.py | 4 +- .../services/results_service.py | 2 +- 3 files changed, 42 insertions(+), 19 deletions(-) diff --git a/agenta-backend/agenta_backend/migrations/20240110165900_evaluations_revamp.py b/agenta-backend/agenta_backend/migrations/20240110165900_evaluations_revamp.py index 689b277865..c995f2e2e9 100644 --- a/agenta-backend/agenta_backend/migrations/20240110165900_evaluations_revamp.py +++ b/agenta-backend/agenta_backend/migrations/20240110165900_evaluations_revamp.py @@ -1,6 +1,7 @@ from datetime import datetime from typing import Any, Dict, List, Optional +from beanie.operators import In from pydantic import BaseModel, Field from beanie import free_fall_migration, Document, Link, PydanticObjectId @@ -191,7 +192,20 @@ async def migrate_old_evaluation_to_new_evaluation(self, session): # Create a key-value store that saves all the variants & evaluation types for a particular app id # 
Example: {"app_id": {"evaluation_types": ["string", "string"], "variant_ids": ["string", "string"]}} app_keyvalue_store = {} - old_evaluations = await OldEvaluationDB.find(fetch_links=True).to_list() + old_evaluations = await OldEvaluationDB.find( + In( + OldEvaluationDB.evaluation_type, + [ + "auto_exact_match", + "auto_similarity_match", + "auto_regex_test", + "auto_ai_critique", + "auto_custom_code_run", + "auto_webhook_test", + ], + ), + fetch_links=True, + ).to_list() for old_eval in old_evaluations: app_id = old_eval.app.id variant_ids = [str(variant_id) for variant_id in old_eval.variants] @@ -338,21 +352,30 @@ async def migrate_old_evaluation_to_new_evaluation(self, session): # STEP 5: # Create the human evaluation - for old_evaluation in old_evaluations: - if old_evaluation.evaluation_type in [ - "human_a_b_testing", - "single_model_test", - ]: - new_eval = HumanEvaluationDB( - app=old_evaluation.app, - organization=old_evaluation.organization, - user=old_evaluation.user, - status=old_evaluation.status, - evaluation_type=old_evaluation.evaluation_type, - variants=old_evaluation.variants, - testset=old_evaluation.testset, - ) - await new_eval.insert(session=session) + old_human_evaluations = await OldEvaluationDB.find( + In( + OldEvaluationDB.evaluation_type, + [ + "human_a_b_testing", + "single_model_test", + ], + ), + fetch_links=True, + ).to_list() + for old_evaluation in old_human_evaluations: + new_eval = HumanEvaluationDB( + id=old_evaluation.id, + app=old_evaluation.app, + organization=old_evaluation.organization, + user=old_evaluation.user, + status=old_evaluation.status, + evaluation_type=old_evaluation.evaluation_type, + variants=old_evaluation.variants, + testset=old_evaluation.testset, + created_at=old_evaluation.created_at, + updated_at=old_evaluation.updated_at, + ) + await new_eval.insert(session=session) class Backward: diff --git a/agenta-backend/agenta_backend/migrations/20240112120721_evaluation_scenarios_revamp.py b/agenta-backend/agenta_backend/migrations/20240112120721_evaluation_scenarios_revamp.py index 8ad1f2a105..3af89301cb 100644 --- a/agenta-backend/agenta_backend/migrations/20240112120721_evaluation_scenarios_revamp.py +++ b/agenta-backend/agenta_backend/migrations/20240112120721_evaluation_scenarios_revamp.py @@ -372,7 +372,7 @@ async def migrate_old_human_a_b_evaluation_scenario_to_new_human_evaluation_scen ).to_list() for ab_testing_scenario in old_human_ab_testing_scenarios: matching_human_evaluation = await HumanEvaluationDB.find_one( - HumanEvaluationDB.app.id == ab_testing_scenario.evaluation.app.id, + HumanEvaluationDB.id == ab_testing_scenario.evaluation.id, HumanEvaluationDB.evaluation_type == "human_a_b_testing", fetch_links=True, ) @@ -428,7 +428,7 @@ async def migrate_old_human_single_model_evaluation_scenario_to_new_human_evalua ).to_list() for single_model_scenario in old_human_single_model_scenarios: matching_human_evaluation = await HumanEvaluationDB.find_one( - HumanEvaluationDB.app.id == single_model_scenario.evaluation.app.id, + HumanEvaluationDB.id == single_model_scenario.evaluation.id, HumanEvaluationDB.evaluation_type == "single_model_test", fetch_links=True, ) diff --git a/agenta-backend/agenta_backend/services/results_service.py b/agenta-backend/agenta_backend/services/results_service.py index d33a1c419f..d04ee2976e 100644 --- a/agenta-backend/agenta_backend/services/results_service.py +++ b/agenta-backend/agenta_backend/services/results_service.py @@ -11,7 +11,7 @@ async def fetch_results_for_evaluation(evaluation: 
HumanEvaluationDB): evaluation_scenarios = await HumanEvaluationScenarioDB.find( - HumanEvaluationScenarioDB.evaluation.id == ObjectId(evaluation.id), + HumanEvaluationScenarioDB.evaluation.id == evaluation.id, ).to_list() results = {} From 4e5e85f88f09d68210ab0836505a5a79421463e1 Mon Sep 17 00:00:00 2001 From: Mahmoud Mabrouk Date: Tue, 16 Jan 2024 22:20:13 +0100 Subject: [PATCH 397/414] bug fix in deletion of environments --- agenta-backend/agenta_backend/services/db_manager.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/agenta-backend/agenta_backend/services/db_manager.py b/agenta-backend/agenta_backend/services/db_manager.py index adbac48879..66b51b44fd 100644 --- a/agenta-backend/agenta_backend/services/db_manager.py +++ b/agenta-backend/agenta_backend/services/db_manager.py @@ -881,7 +881,7 @@ async def remove_app_variant_from_db(app_variant_db: AppVariantDB, **kwargs: dic ) for environment in environments: environment.deployed_app_variant = None - await environment.create() + await environment.save() # removing the config config = app_variant_db.config await config.delete() From a227847d9ce318e1574578ff7c3376ec2e7a5a77 Mon Sep 17 00:00:00 2001 From: Mahmoud Mabrouk Date: Tue, 16 Jan 2024 22:20:34 +0100 Subject: [PATCH 398/414] format --- agenta-backend/agenta_backend/migrations/backup.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/agenta-backend/agenta_backend/migrations/backup.py b/agenta-backend/agenta_backend/migrations/backup.py index ffbbb60212..9e01bce284 100644 --- a/agenta-backend/agenta_backend/migrations/backup.py +++ b/agenta-backend/agenta_backend/migrations/backup.py @@ -1,6 +1,7 @@ import asyncio from pymongo import MongoClient + async def drop_and_restore_collections(session=None): print("dropping and restoring collections") client = MongoClient("mongodb://username:password@mongo") @@ -21,10 +22,12 @@ async def drop_and_restore_collections(session=None): client.close() + # Main entry point for the script async def main(): await drop_and_restore_collections() + # Run the main function if __name__ == "__main__": - asyncio.run(main()) \ No newline at end of file + asyncio.run(main()) From eceec6e96b598ab829149337516f16abeccf8b71 Mon Sep 17 00:00:00 2001 From: Abram Date: Wed, 17 Jan 2024 08:10:34 +0100 Subject: [PATCH 399/414] Refactor - simplified logic to evaluation migration --- .../20240110165900_evaluations_revamp.py | 289 ++++++++---------- 1 file changed, 124 insertions(+), 165 deletions(-) diff --git a/agenta-backend/agenta_backend/migrations/20240110165900_evaluations_revamp.py b/agenta-backend/agenta_backend/migrations/20240110165900_evaluations_revamp.py index 6b65cb79ae..f324499837 100644 --- a/agenta-backend/agenta_backend/migrations/20240110165900_evaluations_revamp.py +++ b/agenta-backend/agenta_backend/migrations/20240110165900_evaluations_revamp.py @@ -151,26 +151,7 @@ class Settings: name = "custom_evaluations" -def modify_app_id_store( - app_id: str, - variant_ids: str, - evaluation_type: str, - app_keyvalue_store: Dict[str, Dict[str, List[str]]], -): - app_id_store = app_keyvalue_store.get(app_id, None) - if not app_id_store: - app_keyvalue_store[app_id] = {"variant_ids": [], "evaluation_types": []} - app_id_store = app_keyvalue_store[app_id] - - app_id_store_variant_ids = list(app_id_store["variant_ids"]) - if variant_ids not in app_id_store_variant_ids: - app_id_store_variant_ids.extend(variant_ids) - app_id_store["variant_ids"] = list(set(app_id_store_variant_ids)) - - app_id_store_evaluation_types = 
list(app_id_store["evaluation_types"]) - if evaluation_type not in app_id_store_evaluation_types: - app_id_store_evaluation_types.append(evaluation_type) - app_id_store["evaluation_types"] = list(set(app_id_store_evaluation_types)) +PYTHON_CODE = "import random from typing import Dict def evaluate( app_params: Dict[str, str], inputs: Dict[str, str], output: str, correct_answer: str ) -> float: return random.uniform(0.1, 0.9)" class Forward: @@ -189,9 +170,31 @@ class Forward: ) async def migrate_old_evaluation_to_new_evaluation(self, session): # STEP 1: - # Create a key-value store that saves all the variants & evaluation types for a particular app id - # Example: {"app_id": {"evaluation_types": ["string", "string"], "variant_ids": ["string", "string"]}} - app_keyvalue_store = {} + # Retrieve all the apps. + # Generate an "exact_match" evaluator and a code evaluator for each app. + apps_db = await AppDB.find(fetch_links=True).to_list() + for app_db in apps_db: + eval_exact_match_config = EvaluatorConfigDB( + app=app_db, + organization=app_db.organization, + user=app_db.user, + name=f"{app_db.app_name}_exact_match_default", + evaluator_key="auto_exact_match", + settings_values={}, + ) + await eval_exact_match_config.insert(session=session) + eval_custom_code_config = EvaluatorConfigDB( + app=app_db, + organization=app_db.organization, + user=app_db.user, + name=f"{app_db.app_name}_custom_code_default", + evaluator_key="auto_custom_code_run", + settings_values=dict({"code": PYTHON_CODE}), + ) + await eval_custom_code_config.insert(session=session) + + # STEP 2: + # Review the evaluations and create a unique evaluation for each one. old_evaluations = await OldEvaluationDB.find( In( OldEvaluationDB.evaluation_type, @@ -207,151 +210,107 @@ async def migrate_old_evaluation_to_new_evaluation(self, session): fetch_links=True, ).to_list() for old_eval in old_evaluations: - app_id = old_eval.app.id - variant_ids = [str(variant_id) for variant_id in old_eval.variants] + list_of_eval_configs = [] evaluation_type = old_eval.evaluation_type - modify_app_id_store( - str(app_id), variant_ids, evaluation_type, app_keyvalue_store + # Use the created evaluator if the evaluation uses "exact_match" or a code evaluator. + # Otherwise, create a new evaluator. 
+ if evaluation_type == "custom_code_run": + eval_config = await EvaluatorConfigDB.find_one( + EvaluatorConfigDB.app.id == old_eval.app.id, + EvaluatorConfigDB.evaluator_key == "auto_custom_code_run", + ) + list_of_eval_configs.append(eval_config.id) + + if evaluation_type == "auto_exact_match": + eval_config = await EvaluatorConfigDB.find_one( + EvaluatorConfigDB.app.id == old_eval.app.id, + EvaluatorConfigDB.evaluator_key == "auto_exact_match", + ) + list_of_eval_configs.append(eval_config.id) + + if evaluation_type == "auto_similarity_match": + eval_config = EvaluatorConfigDB( + app=old_eval.app, + organization=old_eval.organization, + user=old_eval.user, + name=f"{old_eval.app.app_name}_{evaluation_type}", + evaluator_key=evaluation_type, + settings_values=dict( + { + "similarity_threshold": float( + old_eval.evaluation_type_settings.similarity_threshold + ) + } + ), + ) + await eval_config.insert(session=session) + list_of_eval_configs.append(eval_config.id) + + if evaluation_type == "auto_regex_test": + eval_config = EvaluatorConfigDB( + app=old_eval.app, + organization=old_eval.organization, + user=old_eval.user, + name=f"{old_eval.app.app_name}_{evaluation_type}", + evaluator_key=evaluation_type, + settings_values=dict( + { + "regex_pattern": old_eval.evaluation_type_settings.regex_pattern, + "regex_should_match": old_eval.evaluation_type_settings.regex_should_match, + } + ), + ) + await eval_config.insert(session=session) + list_of_eval_configs.append(eval_config.id) + + if evaluation_type == "auto_webhook_test": + eval_config = EvaluatorConfigDB( + app=old_eval.app, + organization=old_eval.organization, + user=old_eval.user, + name=f"{old_eval.app.app_name}_{evaluation_type}", + evaluator_key=evaluation_type, + settings_values=dict( + { + "webhook_url": old_eval.evaluation_type_settings.webhook_url, + "webhook_body": {}, + } + ), + ) + await eval_config.insert(session=session) + list_of_eval_configs.append(eval_config) + + if evaluation_type == "auto_ai_critique": + eval_config = EvaluatorConfigDB( + app=old_eval.app, + organization=old_eval.organization, + user=old_eval.user, + name=f"{old_eval.app.app_name}_{evaluation_type}", + evaluator_key=evaluation_type, + settings_values=dict( + { + "prompt_template": old_eval.evaluation_type_settings.evaluation_prompt_template + } + ), + ) + await eval_config.insert(session=session) + list_of_eval_configs.append(eval_config) + + new_eval = EvaluationDB( + id=old_eval.id, + app=old_eval.app, + organization=old_eval.organization, + user=old_eval.user, + status=old_eval.status, + testset=old_eval.testset, + variant=PydanticObjectId(old_eval.variants[0]), + evaluators_configs=list_of_eval_configs, + aggregated_results=[], + created_at=old_eval.created_at, ) + await new_eval.insert(session=session) - # STEP 2: - # Loop through the app_id key-store to create evaluator configs - # based on the evaluation types available - for app_id, app_id_store in app_keyvalue_store.items(): - app_evaluator_configs: List[EvaluatorConfigDB] = [] - app_db = await AppDB.find_one(AppDB.id == PydanticObjectId(app_id)) - for evaluation_type in app_id_store[ - "evaluation_types" - ]: # the values in this case are the evaluation type - custom_code_evaluations = await OldCustomEvaluationDB.find( - OldCustomEvaluationDB.app == PydanticObjectId(app_id) - ).to_list() - if evaluation_type == "custom_code_run": - for custom_code_evaluation in custom_code_evaluations: - eval_config = EvaluatorConfigDB( - app=app_db, - organization=app_db.organization, - user=app_db.user, - 
name=f"{app_db.app_name}_{evaluation_type}", - evaluator_key=f"auto_{evaluation_type}", - settings_values=dict( - {"code": custom_code_evaluation.python_code} - ), - ) - await eval_config.insert(session=session) - app_evaluator_configs.append(eval_config) - - if evaluation_type == "auto_similarity_match": - eval_config = EvaluatorConfigDB( - app=app_db, - organization=app_db.organization, - user=app_db.user, - name=f"{app_db.app_name}_{evaluation_type}", - evaluator_key=evaluation_type, - settings_values=dict( - { - "similarity_threshold": float( - old_eval.evaluation_type_settings.similarity_threshold - ) - } - ), - ) - await eval_config.insert(session=session) - app_evaluator_configs.append(eval_config) - - if evaluation_type == "auto_exact_match": - eval_config = EvaluatorConfigDB( - app=app_db, - organization=app_db.organization, - user=app_db.user, - name=f"{app_db.app_name}_{evaluation_type}", - evaluator_key=evaluation_type, - settings_values={}, - ) - await eval_config.insert(session=session) - app_evaluator_configs.append(eval_config) - - if evaluation_type == "auto_regex_test": - eval_config = EvaluatorConfigDB( - app=app_db, - organization=app_db.organization, - user=app_db.user, - name=f"{app_db.app_name}_{evaluation_type}", - evaluator_key=evaluation_type, - settings_values=dict( - { - "regex_pattern": old_eval.evaluation_type_settings.regex_pattern, - "regex_should_match": old_eval.evaluation_type_settings.regex_should_match, - } - ), - ) - await eval_config.insert(session=session) - app_evaluator_configs.append(eval_config) - - if evaluation_type == "auto_webhook_test": - eval_config = EvaluatorConfigDB( - app=app_db, - organization=app_db.organization, - user=app_db.user, - name=f"{app_db.app_name}_{evaluation_type}", - evaluator_key=evaluation_type, - settings_values=dict( - { - "webhook_url": old_eval.evaluation_type_settings.webhook_url, - "webhook_body": {}, - } - ), - ) - await eval_config.insert(session=session) - app_evaluator_configs.append(eval_config) - - if evaluation_type == "auto_ai_critique": - eval_config = EvaluatorConfigDB( - app=app_db, - organization=app_db.organization, - user=app_db.user, - name=f"{app_db.app_name}_{evaluation_type}", - evaluator_key=evaluation_type, - settings_values=dict( - { - "prompt_template": old_eval.evaluation_type_settings.evaluation_prompt_template - } - ), - ) - await eval_config.insert(session=session) - app_evaluator_configs.append(eval_config) - - # STEP 3: - # Retrieve evaluator configs for app id - auto_evaluator_configs: List[PydanticObjectId] = [] - for evaluator_config in app_evaluator_configs: - # In the case where the evaluator key is not a human evaluator, - # Append the evaluator config id in the list of auto evaluator configs - if evaluator_config.evaluator_key not in [ - "human_a_b_testing", - "single_model_test", - ]: - auto_evaluator_configs.append(evaluator_config.id) - - # STEP 4: - # Proceed to create a single evaluation for every variant in the app_id_store - # with the auto_evaluator_configs - if auto_evaluator_configs is not None: - for variant in app_id_store["variant_ids"]: - new_eval = EvaluationDB( - app=app_db, - organization=app_db.organization, - user=app_db.user, - status=old_eval.status, - testset=old_eval.testset, - variant=PydanticObjectId(variant), - evaluators_configs=auto_evaluator_configs, - aggregated_results=[], - created_at=old_evaluation.created_at, - ) - await new_eval.insert(session=session) - - # STEP 5: + # STEP 3: # Create the human evaluation old_human_evaluations = await 
OldEvaluationDB.find( In( From 0c850cce6cd24d0e26cbff3a155d69c01a47e890 Mon Sep 17 00:00:00 2001 From: Abram Date: Wed, 17 Jan 2024 08:34:39 +0100 Subject: [PATCH 400/414] Refactor - simplify logic for evaluation scenarios --- .../20240110165900_evaluations_revamp.py | 14 +- ...40112120721_evaluation_scenarios_revamp.py | 138 +++++++----------- .../agenta_backend/migrations/backup.py | 5 +- 3 files changed, 63 insertions(+), 94 deletions(-) diff --git a/agenta-backend/agenta_backend/migrations/20240110165900_evaluations_revamp.py b/agenta-backend/agenta_backend/migrations/20240110165900_evaluations_revamp.py index f324499837..51dca70eaa 100644 --- a/agenta-backend/agenta_backend/migrations/20240110165900_evaluations_revamp.py +++ b/agenta-backend/agenta_backend/migrations/20240110165900_evaluations_revamp.py @@ -151,7 +151,7 @@ class Settings: name = "custom_evaluations" -PYTHON_CODE = "import random from typing import Dict def evaluate( app_params: Dict[str, str], inputs: Dict[str, str], output: str, correct_answer: str ) -> float: return random.uniform(0.1, 0.9)" +PYTHON_CODE = "import random \nfrom typing import Dict \n\n\ndef evaluate(\n app_params: Dict[str, str], \n inputs: Dict[str, str], \n output: str, correct_answer: str \n) -> float: \n return random.uniform(0.1, 0.9)" class Forward: @@ -203,7 +203,7 @@ async def migrate_old_evaluation_to_new_evaluation(self, session): "auto_similarity_match", "auto_regex_test", "auto_ai_critique", - "auto_custom_code_run", + "custom_code_run", "auto_webhook_test", ], ), @@ -219,14 +219,16 @@ async def migrate_old_evaluation_to_new_evaluation(self, session): EvaluatorConfigDB.app.id == old_eval.app.id, EvaluatorConfigDB.evaluator_key == "auto_custom_code_run", ) - list_of_eval_configs.append(eval_config.id) + if eval_config is not None: + list_of_eval_configs.append(eval_config.id) if evaluation_type == "auto_exact_match": eval_config = await EvaluatorConfigDB.find_one( EvaluatorConfigDB.app.id == old_eval.app.id, EvaluatorConfigDB.evaluator_key == "auto_exact_match", ) - list_of_eval_configs.append(eval_config.id) + if eval_config is not None: + list_of_eval_configs.append(eval_config.id) if evaluation_type == "auto_similarity_match": eval_config = EvaluatorConfigDB( @@ -278,7 +280,7 @@ async def migrate_old_evaluation_to_new_evaluation(self, session): ), ) await eval_config.insert(session=session) - list_of_eval_configs.append(eval_config) + list_of_eval_configs.append(eval_config.id) if evaluation_type == "auto_ai_critique": eval_config = EvaluatorConfigDB( @@ -294,7 +296,7 @@ async def migrate_old_evaluation_to_new_evaluation(self, session): ), ) await eval_config.insert(session=session) - list_of_eval_configs.append(eval_config) + list_of_eval_configs.append(eval_config.id) new_eval = EvaluationDB( id=old_eval.id, diff --git a/agenta-backend/agenta_backend/migrations/20240112120721_evaluation_scenarios_revamp.py b/agenta-backend/agenta_backend/migrations/20240112120721_evaluation_scenarios_revamp.py index 3af89301cb..b34fd5c5c1 100644 --- a/agenta-backend/agenta_backend/migrations/20240112120721_evaluation_scenarios_revamp.py +++ b/agenta-backend/agenta_backend/migrations/20240112120721_evaluation_scenarios_revamp.py @@ -229,42 +229,6 @@ class Settings: name = "evaluation_scenarios" -class OldCustomEvaluationDB(Document): - evaluation_name: str - python_code: str - version: str = Field("odmantic") - app: Link[AppDB] - user: Link[UserDB] - organization: Link[OrganizationDB] - created_at: Optional[datetime] = Field(default=datetime.utcnow()) 
- updated_at: Optional[datetime] = Field(default=datetime.utcnow()) - - class Settings: - name = "custom_evaluations" - - -def modify_app_id_store( - app_id: str, - variant_ids: str, - evaluation_type: str, - app_keyvalue_store: Dict[str, Dict[str, List[str]]], -): - app_id_store = app_keyvalue_store.get(app_id, None) - if not app_id_store: - app_keyvalue_store[app_id] = {"variant_ids": [], "evaluation_types": []} - app_id_store = app_keyvalue_store[app_id] - - app_id_store_variant_ids = list(app_id_store["variant_ids"]) - if variant_ids not in list(app_id_store["variant_ids"]): - app_id_store_variant_ids.extend(variant_ids) - app_id_store["variant_ids"] = list(set(app_id_store_variant_ids)) - - app_id_store_evaluation_types = list(app_id_store["evaluation_types"]) - if evaluation_type not in app_id_store_evaluation_types: - app_id_store_evaluation_types.append(evaluation_type) - app_id_store["evaluation_types"] = list(set(app_id_store_evaluation_types)) - - class Forward: @free_fall_migration( document_models=[ @@ -283,6 +247,9 @@ class Forward: async def migrate_old_auto_evaluation_scenario_to_new_auto_evaluation_scenario( self, session ): + new_evaluations = await EvaluationDB.find( + fetch_links=True, + ).to_list() old_auto_scenarios = await OldEvaluationScenarioDB.find( In( OldEvaluationScenarioDB.evaluation.evaluation_type, @@ -291,63 +258,60 @@ async def migrate_old_auto_evaluation_scenario_to_new_auto_evaluation_scenario( "auto_similarity_match", "auto_regex_test", "auto_ai_critique", - "auto_custom_code_run", + "custom_code_run", "auto_webhook_test", ], ), fetch_links=True, ).to_list() - for old_scenario in old_auto_scenarios: - matching_evaluation = await EvaluationDB.find_one( - EvaluationDB.app.id == old_scenario.evaluation.app.id, - fetch_links=True, - ) - if matching_evaluation: - results = [ - EvaluationScenarioResult( - evaluator_config=PydanticObjectId(evaluator_config), - result=Result( - type="number" - if isinstance(old_scenario.score, int) - else "number" - if isinstance(old_scenario.score, float) - else "string" - if isinstance(old_scenario.score, str) - else "boolean" - if isinstance(old_scenario.score, bool) - else "any", - value=old_scenario.score, - ), - ) - for evaluator_config in matching_evaluation.evaluators_configs - ] - new_scenario = EvaluationScenarioDB( - user=matching_evaluation.user, - organization=matching_evaluation.organization, - evaluation=matching_evaluation, - variant_id=old_scenario.evaluation.variants[0], - inputs=[ - EvaluationScenarioInputDB( - name=input.input_name, - type=type(input.input_value).__name__, - value=input.input_value, - ) - for input in old_scenario.inputs - ], - outputs=[ - EvaluationScenarioOutputDB( - type=type(output.variant_output).__name__, - value=output.variant_output, + for new_evaluation in new_evaluations: + for old_scenario in old_auto_scenarios: + if new_evaluation.id == old_scenario.evaluation.id: + results = [ + EvaluationScenarioResult( + evaluator_config=PydanticObjectId(evaluator_config), + result=Result( + type="number" + if isinstance(old_scenario.score, int) + else "number" + if isinstance(old_scenario.score, float) + else "string" + if isinstance(old_scenario.score, str) + else "boolean" + if isinstance(old_scenario.score, bool) + else "any", + value=old_scenario.score, + ), ) - for output in old_scenario.outputs - ], - correct_answer=old_scenario.correct_answer, - is_pinned=old_scenario.is_pinned, - note=old_scenario.note, - evaluators_configs=matching_evaluation.evaluators_configs, - results=results, - 
) - await new_scenario.insert(session=session) + for evaluator_config in new_evaluation.evaluators_configs + ] + new_scenario = EvaluationScenarioDB( + user=new_evaluation.user, + organization=new_evaluation.organization, + evaluation=new_evaluation, + variant_id=old_scenario.evaluation.variants[0], + inputs=[ + EvaluationScenarioInputDB( + name=input.input_name, + type=type(input.input_value).__name__, + value=input.input_value, + ) + for input in old_scenario.inputs + ], + outputs=[ + EvaluationScenarioOutputDB( + type=type(output.variant_output).__name__, + value=output.variant_output, + ) + for output in old_scenario.outputs + ], + correct_answer=old_scenario.correct_answer, + is_pinned=old_scenario.is_pinned, + note=old_scenario.note, + evaluators_configs=new_evaluation.evaluators_configs, + results=results, + ) + await new_scenario.insert(session=session) @free_fall_migration( document_models=[ diff --git a/agenta-backend/agenta_backend/migrations/backup.py b/agenta-backend/agenta_backend/migrations/backup.py index ffbbb60212..9e01bce284 100644 --- a/agenta-backend/agenta_backend/migrations/backup.py +++ b/agenta-backend/agenta_backend/migrations/backup.py @@ -1,6 +1,7 @@ import asyncio from pymongo import MongoClient + async def drop_and_restore_collections(session=None): print("dropping and restoring collections") client = MongoClient("mongodb://username:password@mongo") @@ -21,10 +22,12 @@ async def drop_and_restore_collections(session=None): client.close() + # Main entry point for the script async def main(): await drop_and_restore_collections() + # Run the main function if __name__ == "__main__": - asyncio.run(main()) \ No newline at end of file + asyncio.run(main()) From 14fe29024c962ae9a750c1f2f1a4f1013fc29811 Mon Sep 17 00:00:00 2001 From: Abram Date: Wed, 17 Jan 2024 08:55:27 +0100 Subject: [PATCH 401/414] Update - put a 2 seconds sleept --- .../20240113131802_new_evaluation_results_aggregation.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/agenta-backend/agenta_backend/migrations/20240113131802_new_evaluation_results_aggregation.py b/agenta-backend/agenta_backend/migrations/20240113131802_new_evaluation_results_aggregation.py index 09856eda80..2f5a8c4c2a 100644 --- a/agenta-backend/agenta_backend/migrations/20240113131802_new_evaluation_results_aggregation.py +++ b/agenta-backend/agenta_backend/migrations/20240113131802_new_evaluation_results_aggregation.py @@ -1,3 +1,4 @@ +import asyncio from datetime import datetime from typing import Any, Dict, List, Optional @@ -229,6 +230,9 @@ async def aggregate_new_evaluation_with_evaluation_scenario_results(self, sessio evaluation_keyvalue_store, ) + print("EKVS: ", evaluation_keyvalue_store) + await asyncio.sleep(2) + # STEP 2: # Update the evaluation key-value store new_auto_evaluation_scenarios = await EvaluationScenarioDB.find( From cf41da4142dec692e93b42818f1d93f94a4a2585 Mon Sep 17 00:00:00 2001 From: Abram Date: Wed, 17 Jan 2024 09:30:28 +0100 Subject: [PATCH 402/414] Update - update results aggregation logic --- ...1802_new_evaluation_results_aggregation.py | 23 ++++++++++++------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/agenta-backend/agenta_backend/migrations/20240113131802_new_evaluation_results_aggregation.py b/agenta-backend/agenta_backend/migrations/20240113131802_new_evaluation_results_aggregation.py index 2f5a8c4c2a..0a9880a5c4 100644 --- a/agenta-backend/agenta_backend/migrations/20240113131802_new_evaluation_results_aggregation.py +++ 
b/agenta-backend/agenta_backend/migrations/20240113131802_new_evaluation_results_aggregation.py @@ -219,7 +219,7 @@ class Forward: async def aggregate_new_evaluation_with_evaluation_scenario_results(self, session): # STEP 1: # Create a key-value store that saves all the evaluator configs & results for a particular evaluation id - # Example: {"evaluation_id": {"evaluation_config_id": {"results": [Result("type": str, "value": Any)]}}} + # Example: {"evaluation_id": {"evaluation_config_id": {"results": [}}} evaluation_keyvalue_store = {} new_auto_evaluations = await EvaluationDB.find().to_list() for auto_evaluation in new_auto_evaluations: @@ -240,13 +240,20 @@ async def aggregate_new_evaluation_with_evaluation_scenario_results(self, sessio ).to_list() for auto_evaluation in new_auto_evaluation_scenarios: evaluation_id = str(auto_evaluation.evaluation.id) - evaluation_store = evaluation_keyvalue_store[evaluation_id] - configs_with_results = zip( - auto_evaluation.evaluators_configs, auto_evaluation.results - ) - for evaluator, result in configs_with_results: - modify_evaluation_scenario_store( - str(evaluator), result, evaluation_store + + # Check if the evaluation_id exists in the key-value store + if evaluation_id in evaluation_keyvalue_store: + evaluation_store = evaluation_keyvalue_store[evaluation_id] + configs_with_results = zip( + auto_evaluation.evaluators_configs, auto_evaluation.results + ) + for evaluator, result in configs_with_results: + modify_evaluation_scenario_store( + str(evaluator), result, evaluation_store + ) + else: + print( + f"Warning: Evaluation ID {evaluation_id} not found in the key-value store." ) # STEP 3: From 94058607f5ff878f8a9ea5d442bab20f3e18ef6d Mon Sep 17 00:00:00 2001 From: Akrem Abayed Date: Wed, 17 Jan 2024 10:35:28 +0100 Subject: [PATCH 403/414] fix aggregations --- ...1802_new_evaluation_results_aggregation.py | 30 +++++++++++++++++-- 1 file changed, 27 insertions(+), 3 deletions(-) diff --git a/agenta-backend/agenta_backend/migrations/20240113131802_new_evaluation_results_aggregation.py b/agenta-backend/agenta_backend/migrations/20240113131802_new_evaluation_results_aggregation.py index 0a9880a5c4..2eb249ab93 100644 --- a/agenta-backend/agenta_backend/migrations/20240113131802_new_evaluation_results_aggregation.py +++ b/agenta-backend/agenta_backend/migrations/20240113131802_new_evaluation_results_aggregation.py @@ -117,14 +117,13 @@ class Settings: def prepare_evaluation_keyvalue_store( - evaluation_id: str, evaluator_id: str, evaluation_keyvalue_store: Dict + evaluation_id: str, evaluation_keyvalue_store: Dict ) -> Dict[str, Dict[str, Any]]: """ Construct a key-value store to saves results based on a evaluator config in an evaluation Args: evaluation_id (str): ID of evaluation - evaluator_id (str): ID of evaluator config evaluation_keyvalue_store (Dict): evaluation keyvalue store Returns: @@ -134,6 +133,24 @@ def prepare_evaluation_keyvalue_store( if evaluation_id not in evaluation_keyvalue_store: evaluation_keyvalue_store[evaluation_id] = {} + return evaluation_keyvalue_store + + +def prepare_evaluator_keyvalue_store( + evaluation_id: str, evaluator_id: str, evaluation_keyvalue_store: Dict +) -> Dict[str, Dict[str, Any]]: + """ + Construct a key-value store to saves results based on a evaluator config in an evaluation + + Args: + evaluation_id (str): ID of evaluation + evaluator_id (str): ID of evaluator config + evaluation_keyvalue_store (Dict): evaluation keyvalue store + + Returns: + Dict[str, Dict[str, Any]]: {"evaluation_id": 
{"evaluation_config_id": {"results": [Result("type": str, "value": Any)]}}} + """ + if evaluator_id not in evaluation_keyvalue_store[evaluation_id]: evaluation_keyvalue_store[evaluation_id][evaluator_id] = {"results": []} @@ -222,14 +239,21 @@ async def aggregate_new_evaluation_with_evaluation_scenario_results(self, sessio # Example: {"evaluation_id": {"evaluation_config_id": {"results": [}}} evaluation_keyvalue_store = {} new_auto_evaluations = await EvaluationDB.find().to_list() + print("### len new_auto_evaluations", len(new_auto_evaluations)) + for auto_evaluation in new_auto_evaluations: + evaluation_keyvalue_store = prepare_evaluation_keyvalue_store( + str(auto_evaluation.id), + evaluation_keyvalue_store, + ) for evaluator_config in auto_evaluation.evaluators_configs: - evaluation_keyvalue_store = prepare_evaluation_keyvalue_store( + evaluation_keyvalue_store = prepare_evaluator_keyvalue_store( str(auto_evaluation.id), str(evaluator_config), evaluation_keyvalue_store, ) + print("### len evaluation_keyvalue_store", len(evaluation_keyvalue_store)) print("EKVS: ", evaluation_keyvalue_store) await asyncio.sleep(2) From e3318b44d73663fdf587a7537d1e004d6789b2da Mon Sep 17 00:00:00 2001 From: Abram Date: Wed, 17 Jan 2024 10:41:46 +0100 Subject: [PATCH 404/414] Update - modified evaluations revamp migration --- .../20240110165900_evaluations_revamp.py | 32 +++++++++---------- 1 file changed, 15 insertions(+), 17 deletions(-) diff --git a/agenta-backend/agenta_backend/migrations/20240110165900_evaluations_revamp.py b/agenta-backend/agenta_backend/migrations/20240110165900_evaluations_revamp.py index 51dca70eaa..f370cf6ad8 100644 --- a/agenta-backend/agenta_backend/migrations/20240110165900_evaluations_revamp.py +++ b/agenta-backend/agenta_backend/migrations/20240110165900_evaluations_revamp.py @@ -151,9 +151,6 @@ class Settings: name = "custom_evaluations" -PYTHON_CODE = "import random \nfrom typing import Dict \n\n\ndef evaluate(\n app_params: Dict[str, str], \n inputs: Dict[str, str], \n output: str, correct_answer: str \n) -> float: \n return random.uniform(0.1, 0.9)" - - class Forward: @free_fall_migration( document_models=[ @@ -183,15 +180,6 @@ async def migrate_old_evaluation_to_new_evaluation(self, session): settings_values={}, ) await eval_exact_match_config.insert(session=session) - eval_custom_code_config = EvaluatorConfigDB( - app=app_db, - organization=app_db.organization, - user=app_db.user, - name=f"{app_db.app_name}_custom_code_default", - evaluator_key="auto_custom_code_run", - settings_values=dict({"code": PYTHON_CODE}), - ) - await eval_custom_code_config.insert(session=session) # STEP 2: # Review the evaluations and create a unique evaluation for each one. @@ -215,12 +203,22 @@ async def migrate_old_evaluation_to_new_evaluation(self, session): # Use the created evaluator if the evaluation uses "exact_match" or a code evaluator. # Otherwise, create a new evaluator. 
if evaluation_type == "custom_code_run": - eval_config = await EvaluatorConfigDB.find_one( - EvaluatorConfigDB.app.id == old_eval.app.id, - EvaluatorConfigDB.evaluator_key == "auto_custom_code_run", + custom_code = await OldCustomEvaluationDB.find_one( + OldCustomEvaluationDB.id + == PydanticObjectId( + old_eval.evaluation_type_settings.custom_code_evaluation_id + ) ) - if eval_config is not None: - list_of_eval_configs.append(eval_config.id) + eval_config = EvaluatorConfigDB( + app=app_db, + organization=app_db.organization, + user=app_db.user, + name=f"{app_db.app_name}_custom_code_default", + evaluator_key="auto_custom_code_run", + settings_values=dict({"code": custom_code.python_code}), + ) + await eval_config.insert(session=session) + list_of_eval_configs.append(eval_config.id) if evaluation_type == "auto_exact_match": eval_config = await EvaluatorConfigDB.find_one( From ff281fb994578ae51eeb31897f360ed087328746 Mon Sep 17 00:00:00 2001 From: Abram Date: Wed, 17 Jan 2024 11:17:29 +0100 Subject: [PATCH 405/414] Update - added backward compatibility for old templates --- .../agenta_backend/services/llm_apps_service.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/agenta-backend/agenta_backend/services/llm_apps_service.py b/agenta-backend/agenta_backend/services/llm_apps_service.py index 67115ad5ef..59b826c528 100644 --- a/agenta-backend/agenta_backend/services/llm_apps_service.py +++ b/agenta-backend/agenta_backend/services/llm_apps_service.py @@ -86,8 +86,13 @@ async def invoke_app( ) response.raise_for_status() - lm_app_response = response.json() - return AppOutput(output=lm_app_response["message"], status="success") + llm_app_response = response.json() + app_output = ( + llm_app_response["message"] + if isinstance(llm_app_response, dict) + else llm_app_response + ) + return AppOutput(output=app_output, status="success") async def run_with_retry( From 1dc208260a893a5a59a002957d024c268afad8fd Mon Sep 17 00:00:00 2001 From: Abram Date: Wed, 17 Jan 2024 11:52:32 +0100 Subject: [PATCH 406/414] Update - modify code to create custom code evaluator config --- .../migrations/20240110165900_evaluations_revamp.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/agenta-backend/agenta_backend/migrations/20240110165900_evaluations_revamp.py b/agenta-backend/agenta_backend/migrations/20240110165900_evaluations_revamp.py index f370cf6ad8..9784a84594 100644 --- a/agenta-backend/agenta_backend/migrations/20240110165900_evaluations_revamp.py +++ b/agenta-backend/agenta_backend/migrations/20240110165900_evaluations_revamp.py @@ -210,10 +210,10 @@ async def migrate_old_evaluation_to_new_evaluation(self, session): ) ) eval_config = EvaluatorConfigDB( - app=app_db, - organization=app_db.organization, - user=app_db.user, - name=f"{app_db.app_name}_custom_code_default", + app=old_eval.app, + organization=old_eval.organization, + user=old_eval.user, + name=f"{old_eval.app.app_name}_custom_code_default", evaluator_key="auto_custom_code_run", settings_values=dict({"code": custom_code.python_code}), ) From 3a223b31fd733a17eee18f55f4960388e6a2eef5 Mon Sep 17 00:00:00 2001 From: Abram Date: Wed, 17 Jan 2024 13:57:12 +0100 Subject: [PATCH 407/414] Update - fix update app variant db manager --- agenta-backend/agenta_backend/services/db_manager.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/agenta-backend/agenta_backend/services/db_manager.py b/agenta-backend/agenta_backend/services/db_manager.py index 58f2ef984d..60bd665db8 100644 --- 
a/agenta-backend/agenta_backend/services/db_manager.py +++ b/agenta-backend/agenta_backend/services/db_manager.py @@ -1474,7 +1474,7 @@ async def update_app_variant( if hasattr(app_variant, key): setattr(app_variant, key, value) - await app_variant.update() + await app_variant.save() return app_variant From cd55f03b109fbd607457dc2748415e0d086e25c5 Mon Sep 17 00:00:00 2001 From: Akrem Abayed Date: Wed, 17 Jan 2024 14:07:45 +0100 Subject: [PATCH 408/414] add migration for exact match evaluator --- ...547_create_exact_match_evaluator_config.py | 75 +++++++++++++++++++ 1 file changed, 75 insertions(+) create mode 100644 agenta-backend/agenta_backend/migrations/20240110132547_create_exact_match_evaluator_config.py diff --git a/agenta-backend/agenta_backend/migrations/20240110132547_create_exact_match_evaluator_config.py b/agenta-backend/agenta_backend/migrations/20240110132547_create_exact_match_evaluator_config.py new file mode 100644 index 0000000000..82aed942cf --- /dev/null +++ b/agenta-backend/agenta_backend/migrations/20240110132547_create_exact_match_evaluator_config.py @@ -0,0 +1,75 @@ +from datetime import datetime +from typing import Any, Dict, List, Optional + + +from beanie.operators import In +from pydantic import BaseModel, Field +from beanie import free_fall_migration, Document, Link, PydanticObjectId + + +class UserDB(Document): + uid: str = Field(default="0", unique=True, index=True) + username: str = Field(default="agenta") + email: str = Field(default="demo@agenta.ai", unique=True) + organizations: Optional[List[PydanticObjectId]] = [] + created_at: Optional[datetime] = Field(default=datetime.utcnow()) + updated_at: Optional[datetime] = Field(default=datetime.utcnow()) + + class Settings: + name = "users" + + +class OrganizationDB(Document): + name: str = Field(default="agenta") + description: str = Field(default="") + type: Optional[str] + owner: str # user id + members: Optional[List[PydanticObjectId]] + created_at: Optional[datetime] = Field(default=datetime.utcnow()) + updated_at: Optional[datetime] = Field(default=datetime.utcnow()) + + class Settings: + name = "organizations" + + +class AppDB(Document): + app_name: str + organization: Link[OrganizationDB] + user: Link[UserDB] + created_at: Optional[datetime] = Field(default=datetime.utcnow()) + updated_at: Optional[datetime] = Field(default=datetime.utcnow()) + + class Settings: + name = "app_db" + + +class EvaluatorConfigDB(Document): + app: Link[AppDB] + organization: Link[OrganizationDB] + user: Link[UserDB] + name: str + evaluator_key: str + settings_values: Optional[Dict[str, Any]] + created_at: datetime = Field(default=datetime.utcnow()) + updated_at: datetime = Field(default=datetime.utcnow()) + + class Settings: + name = "evaluators_configs" + + +class Forward: + @free_fall_migration( + document_models=[AppDB, UserDB, OrganizationDB, EvaluatorConfigDB] + ) + async def create_default_exact_match_evaluator(self, session): + apps_db = await AppDB.find(fetch_links=True).to_list() + for app_db in apps_db: + eval_exact_match_config = EvaluatorConfigDB( + app=app_db, + organization=app_db.organization, + user=app_db.user, + name=f"Exact Match", + evaluator_key="auto_exact_match", + settings_values={}, + ) + await eval_exact_match_config.insert(session=session) From 6e6ab609c4f03d09f79ecc9a1cf4251850a7e8ec Mon Sep 17 00:00:00 2001 From: Akrem Abayed Date: Wed, 17 Jan 2024 14:08:09 +0100 Subject: [PATCH 409/414] improve evaluator name --- .../20240110165900_evaluations_revamp.py | 25 ++++--------------- 1 file changed, 5 
insertions(+), 20 deletions(-) diff --git a/agenta-backend/agenta_backend/migrations/20240110165900_evaluations_revamp.py b/agenta-backend/agenta_backend/migrations/20240110165900_evaluations_revamp.py index 9784a84594..7e198167d7 100644 --- a/agenta-backend/agenta_backend/migrations/20240110165900_evaluations_revamp.py +++ b/agenta-backend/agenta_backend/migrations/20240110165900_evaluations_revamp.py @@ -167,21 +167,6 @@ class Forward: ) async def migrate_old_evaluation_to_new_evaluation(self, session): # STEP 1: - # Retrieve all the apps. - # Generate an "exact_match" evaluator and a code evaluator for each app. - apps_db = await AppDB.find(fetch_links=True).to_list() - for app_db in apps_db: - eval_exact_match_config = EvaluatorConfigDB( - app=app_db, - organization=app_db.organization, - user=app_db.user, - name=f"{app_db.app_name}_exact_match_default", - evaluator_key="auto_exact_match", - settings_values={}, - ) - await eval_exact_match_config.insert(session=session) - - # STEP 2: # Review the evaluations and create a unique evaluation for each one. old_evaluations = await OldEvaluationDB.find( In( @@ -213,7 +198,7 @@ async def migrate_old_evaluation_to_new_evaluation(self, session): app=old_eval.app, organization=old_eval.organization, user=old_eval.user, - name=f"{old_eval.app.app_name}_custom_code_default", + name="Custom Code Run", evaluator_key="auto_custom_code_run", settings_values=dict({"code": custom_code.python_code}), ) @@ -233,7 +218,7 @@ async def migrate_old_evaluation_to_new_evaluation(self, session): app=old_eval.app, organization=old_eval.organization, user=old_eval.user, - name=f"{old_eval.app.app_name}_{evaluation_type}", + name="Similarity Match", evaluator_key=evaluation_type, settings_values=dict( { @@ -251,7 +236,7 @@ async def migrate_old_evaluation_to_new_evaluation(self, session): app=old_eval.app, organization=old_eval.organization, user=old_eval.user, - name=f"{old_eval.app.app_name}_{evaluation_type}", + name="Regex Test", evaluator_key=evaluation_type, settings_values=dict( { @@ -268,7 +253,7 @@ async def migrate_old_evaluation_to_new_evaluation(self, session): app=old_eval.app, organization=old_eval.organization, user=old_eval.user, - name=f"{old_eval.app.app_name}_{evaluation_type}", + name="Webhook Test", evaluator_key=evaluation_type, settings_values=dict( { @@ -285,7 +270,7 @@ async def migrate_old_evaluation_to_new_evaluation(self, session): app=old_eval.app, organization=old_eval.organization, user=old_eval.user, - name=f"{old_eval.app.app_name}_{evaluation_type}", + name="AI Critique", evaluator_key=evaluation_type, settings_values=dict( { From 52de18e06f95efde492e73fa893cbb4f0346f2c1 Mon Sep 17 00:00:00 2001 From: Akrem Abayed Date: Wed, 17 Jan 2024 14:08:27 +0100 Subject: [PATCH 410/414] remove verbose print --- .../20240113131802_new_evaluation_results_aggregation.py | 1 - 1 file changed, 1 deletion(-) diff --git a/agenta-backend/agenta_backend/migrations/20240113131802_new_evaluation_results_aggregation.py b/agenta-backend/agenta_backend/migrations/20240113131802_new_evaluation_results_aggregation.py index 2eb249ab93..722d185e40 100644 --- a/agenta-backend/agenta_backend/migrations/20240113131802_new_evaluation_results_aggregation.py +++ b/agenta-backend/agenta_backend/migrations/20240113131802_new_evaluation_results_aggregation.py @@ -254,7 +254,6 @@ async def aggregate_new_evaluation_with_evaluation_scenario_results(self, sessio ) print("### len evaluation_keyvalue_store", len(evaluation_keyvalue_store)) - print("EKVS: ", 
evaluation_keyvalue_store) await asyncio.sleep(2) # STEP 2: From d206a4c0758b5064127da6fd6879cc217b7d85a0 Mon Sep 17 00:00:00 2001 From: Akrem Abayed Date: Wed, 17 Jan 2024 20:43:20 +0100 Subject: [PATCH 411/414] ignore evals with deleted apps --- .../migrations/20240110165900_evaluations_revamp.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/agenta-backend/agenta_backend/migrations/20240110165900_evaluations_revamp.py b/agenta-backend/agenta_backend/migrations/20240110165900_evaluations_revamp.py index 7e198167d7..d1af3eb729 100644 --- a/agenta-backend/agenta_backend/migrations/20240110165900_evaluations_revamp.py +++ b/agenta-backend/agenta_backend/migrations/20240110165900_evaluations_revamp.py @@ -183,6 +183,8 @@ async def migrate_old_evaluation_to_new_evaluation(self, session): fetch_links=True, ).to_list() for old_eval in old_evaluations: + if getattr(old_eval, 'id', None) and not getattr(getattr(old_eval, 'app', None), 'id', None): + continue list_of_eval_configs = [] evaluation_type = old_eval.evaluation_type # Use the created evaluator if the evaluation uses "exact_match" or a code evaluator. From 887d5e99396c65443b400cebc967fc76bb841929 Mon Sep 17 00:00:00 2001 From: Akrem Abayed Date: Wed, 17 Jan 2024 20:44:00 +0100 Subject: [PATCH 412/414] move human evals to separated migrations --- ...40112120721_evaluation_scenarios_revamp.py | 116 +------ ...12120740_human_a_b_evaluation_scenarios.py | 292 ++++++++++++++++++ ...human_single_model_evaluation_scenarios.py | 292 ++++++++++++++++++ 3 files changed, 586 insertions(+), 114 deletions(-) create mode 100644 agenta-backend/agenta_backend/migrations/20240112120740_human_a_b_evaluation_scenarios.py create mode 100644 agenta-backend/agenta_backend/migrations/20240112120800_human_single_model_evaluation_scenarios.py diff --git a/agenta-backend/agenta_backend/migrations/20240112120721_evaluation_scenarios_revamp.py b/agenta-backend/agenta_backend/migrations/20240112120721_evaluation_scenarios_revamp.py index b34fd5c5c1..b771096c7d 100644 --- a/agenta-backend/agenta_backend/migrations/20240112120721_evaluation_scenarios_revamp.py +++ b/agenta-backend/agenta_backend/migrations/20240112120721_evaluation_scenarios_revamp.py @@ -265,7 +265,8 @@ async def migrate_old_auto_evaluation_scenario_to_new_auto_evaluation_scenario( fetch_links=True, ).to_list() for new_evaluation in new_evaluations: - for old_scenario in old_auto_scenarios: + for i, old_scenario in enumerate(old_auto_scenarios): + print(f"auto evaluation {i}") if new_evaluation.id == old_scenario.evaluation.id: results = [ EvaluationScenarioResult( @@ -313,118 +314,5 @@ async def migrate_old_auto_evaluation_scenario_to_new_auto_evaluation_scenario( ) await new_scenario.insert(session=session) - @free_fall_migration( - document_models=[ - AppDB, - OrganizationDB, - UserDB, - TestSetDB, - EvaluationDB, - OldEvaluationDB, - OldEvaluationScenarioDB, - EvaluationScenarioDB, - HumanEvaluationDB, - HumanEvaluationScenarioDB, - ] - ) - async def migrate_old_human_a_b_evaluation_scenario_to_new_human_evaluation_scenario( - self, session - ): - old_human_ab_testing_scenarios = await OldEvaluationScenarioDB.find( - OldEvaluationScenarioDB.evaluation.evaluation_type == "human_a_b_testing", - fetch_links=True, - ).to_list() - for ab_testing_scenario in old_human_ab_testing_scenarios: - matching_human_evaluation = await HumanEvaluationDB.find_one( - HumanEvaluationDB.id == ab_testing_scenario.evaluation.id, - HumanEvaluationDB.evaluation_type == "human_a_b_testing", - fetch_links=True, - ) - 
if matching_human_evaluation: - scenario_inputs = [ - HumanEvaluationScenarioInput( - input_name=input.input_name, - input_value=input.input_value, - ) - for input in ab_testing_scenario.inputs - ] - scenario_outputs = [ - HumanEvaluationScenarioOutput( - variant_id=output.variant_id, - variant_output=output.variant_output, - ) - for output in ab_testing_scenario.outputs - ] - new_scenario = HumanEvaluationScenarioDB( - user=matching_human_evaluation.user, - organization=matching_human_evaluation.organization, - evaluation=matching_human_evaluation, - inputs=scenario_inputs, - outputs=scenario_outputs, - correct_answer=ab_testing_scenario.correct_answer, - is_pinned=ab_testing_scenario.is_pinned, - note=ab_testing_scenario.note, - vote=ab_testing_scenario.vote, - score=ab_testing_scenario.score, - ) - await new_scenario.insert(session=session) - - @free_fall_migration( - document_models=[ - AppDB, - OrganizationDB, - UserDB, - TestSetDB, - EvaluationDB, - OldEvaluationDB, - OldEvaluationScenarioDB, - EvaluationScenarioDB, - HumanEvaluationDB, - HumanEvaluationScenarioDB, - ] - ) - async def migrate_old_human_single_model_evaluation_scenario_to_new_human_evaluation_scenario( - self, session - ): - old_human_single_model_scenarios = await OldEvaluationScenarioDB.find( - OldEvaluationScenarioDB.evaluation.evaluation_type == "single_model_test", - fetch_links=True, - ).to_list() - for single_model_scenario in old_human_single_model_scenarios: - matching_human_evaluation = await HumanEvaluationDB.find_one( - HumanEvaluationDB.id == single_model_scenario.evaluation.id, - HumanEvaluationDB.evaluation_type == "single_model_test", - fetch_links=True, - ) - if matching_human_evaluation: - scenario_inputs = [ - HumanEvaluationScenarioInput( - input_name=input.input_name, - input_value=input.input_value, - ) - for input in single_model_scenario.inputs - ] - scenario_outputs = [ - HumanEvaluationScenarioOutput( - variant_id=output.variant_id, - variant_output=output.variant_output, - ) - for output in single_model_scenario.outputs - ] - new_scenario = HumanEvaluationScenarioDB( - user=matching_human_evaluation.user, - organization=matching_human_evaluation.organization, - evaluation=matching_human_evaluation, - inputs=scenario_inputs, - outputs=scenario_outputs, - correct_answer=single_model_scenario.correct_answer, - is_pinned=single_model_scenario.is_pinned, - note=single_model_scenario.note, - vote=single_model_scenario.vote, - score=single_model_scenario.score, - ) - await new_scenario.insert(session=session) - - class Backward: pass diff --git a/agenta-backend/agenta_backend/migrations/20240112120740_human_a_b_evaluation_scenarios.py b/agenta-backend/agenta_backend/migrations/20240112120740_human_a_b_evaluation_scenarios.py new file mode 100644 index 0000000000..1802374cd2 --- /dev/null +++ b/agenta-backend/agenta_backend/migrations/20240112120740_human_a_b_evaluation_scenarios.py @@ -0,0 +1,292 @@ +from datetime import datetime +from typing import Any, Dict, List, Optional + +from beanie.operators import In +from pydantic import BaseModel, Field +from beanie import free_fall_migration, Document, Link, PydanticObjectId + + +class OrganizationDB(Document): + name: str = Field(default="agenta") + description: str = Field(default="") + type: Optional[str] + owner: str # user id + members: Optional[List[PydanticObjectId]] + created_at: Optional[datetime] = Field(default=datetime.utcnow()) + updated_at: Optional[datetime] = Field(default=datetime.utcnow()) + + class Settings: + name = "organizations" + 
+ +class UserDB(Document): + uid: str = Field(default="0", unique=True, index=True) + username: str = Field(default="agenta") + email: str = Field(default="demo@agenta.ai", unique=True) + organizations: Optional[List[PydanticObjectId]] = [] + created_at: Optional[datetime] = Field(default=datetime.utcnow()) + updated_at: Optional[datetime] = Field(default=datetime.utcnow()) + + class Settings: + name = "users" + + +class AppDB(Document): + app_name: str + organization: Link[OrganizationDB] + user: Link[UserDB] + created_at: Optional[datetime] = Field(default=datetime.utcnow()) + updated_at: Optional[datetime] = Field(default=datetime.utcnow()) + + class Settings: + name = "app_db" + + +class TestSetDB(Document): + name: str + app: Link[AppDB] + csvdata: List[Dict[str, str]] + user: Link[UserDB] + organization: Link[OrganizationDB] + created_at: Optional[datetime] = Field(default=datetime.utcnow()) + updated_at: Optional[datetime] = Field(default=datetime.utcnow()) + + class Settings: + name = "testsets" + + +class EvaluatorConfigDB(Document): + app: Link[AppDB] + organization: Link[OrganizationDB] + user: Link[UserDB] + name: str + evaluator_key: str + settings_values: Optional[Dict[str, Any]] = None + created_at: datetime = Field(default=datetime.utcnow()) + updated_at: datetime = Field(default=datetime.utcnow()) + + class Settings: + name = "evaluators_configs" + + +class Result(BaseModel): + type: str + value: Any + + +class EvaluationScenarioResult(BaseModel): + evaluator_config: PydanticObjectId + result: Result + + +class AggregatedResult(BaseModel): + evaluator_config: PydanticObjectId + result: Result + + +class EvaluationScenarioInputDB(BaseModel): + name: str + type: str + value: str + + +class EvaluationScenarioOutputDB(BaseModel): + type: str + value: Any + + +class HumanEvaluationScenarioInput(BaseModel): + input_name: str + input_value: str + + +class HumanEvaluationScenarioOutput(BaseModel): + variant_id: str + variant_output: str + + +class HumanEvaluationDB(Document): + app: Link[AppDB] + organization: Link[OrganizationDB] + user: Link[UserDB] + status: str + evaluation_type: str + variants: List[PydanticObjectId] + testset: Link[TestSetDB] + created_at: Optional[datetime] = Field(default=datetime.utcnow()) + updated_at: Optional[datetime] = Field(default=datetime.utcnow()) + + class Settings: + name = "human_evaluations" + + +class HumanEvaluationScenarioDB(Document): + user: Link[UserDB] + organization: Link[OrganizationDB] + evaluation: Link[HumanEvaluationDB] + inputs: List[HumanEvaluationScenarioInput] + outputs: List[HumanEvaluationScenarioOutput] + vote: Optional[str] + score: Optional[Any] + correct_answer: Optional[str] + created_at: Optional[datetime] = Field(default=datetime.utcnow()) + updated_at: Optional[datetime] = Field(default=datetime.utcnow()) + is_pinned: Optional[bool] + note: Optional[str] + + class Settings: + name = "human_evaluations_scenarios" + + +class EvaluationDB(Document): + app: Link[AppDB] + organization: Link[OrganizationDB] + user: Link[UserDB] + status: str = Field(default="EVALUATION_INITIALIZED") + testset: Link[TestSetDB] + variant: PydanticObjectId + evaluators_configs: List[PydanticObjectId] + aggregated_results: List[AggregatedResult] + created_at: datetime = Field(default=datetime.utcnow()) + updated_at: datetime = Field(default=datetime.utcnow()) + + class Settings: + name = "new_evaluations" + + +class EvaluationScenarioDB(Document): + user: Link[UserDB] + organization: Link[OrganizationDB] + evaluation: Link[EvaluationDB] + 
variant_id: PydanticObjectId + inputs: List[EvaluationScenarioInputDB] + outputs: List[EvaluationScenarioOutputDB] + correct_answer: Optional[str] + is_pinned: Optional[bool] + note: Optional[str] + evaluators_configs: List[PydanticObjectId] + results: List[EvaluationScenarioResult] + created_at: datetime = Field(default=datetime.utcnow()) + updated_at: datetime = Field(default=datetime.utcnow()) + + class Settings: + name = "new_evaluation_scenarios" + + +class OldEvaluationTypeSettings(BaseModel): + similarity_threshold: Optional[float] + regex_pattern: Optional[str] + regex_should_match: Optional[bool] + webhook_url: Optional[str] + llm_app_prompt_template: Optional[str] + custom_code_evaluation_id: Optional[str] + evaluation_prompt_template: Optional[str] + + +class OldEvaluationScenarioInput(BaseModel): + input_name: str + input_value: str + + +class OldEvaluationScenarioOutput(BaseModel): + variant_id: str + variant_output: str + + +class OldEvaluationDB(Document): + app: Link[AppDB] + organization: Link[OrganizationDB] + user: Link[UserDB] + status: str + evaluation_type: str + evaluation_type_settings: OldEvaluationTypeSettings + variants: List[PydanticObjectId] + version: str = Field("odmantic") + testset: Link[TestSetDB] + created_at: Optional[datetime] = Field(default=datetime.utcnow()) + updated_at: Optional[datetime] = Field(default=datetime.utcnow()) + + class Settings: + name = "evaluations" + + +class OldEvaluationScenarioDB(Document): + user: Link[UserDB] + organization: Link[OrganizationDB] + evaluation: Link[OldEvaluationDB] + inputs: List[OldEvaluationScenarioInput] + outputs: List[OldEvaluationScenarioOutput] # EvaluationScenarioOutput + vote: Optional[str] + version: str = Field("odmantic") + score: Optional[Any] + correct_answer: Optional[str] + created_at: Optional[datetime] = Field(default=datetime.utcnow()) + updated_at: Optional[datetime] = Field(default=datetime.utcnow()) + is_pinned: Optional[bool] + note: Optional[str] + + class Settings: + name = "evaluation_scenarios" + + +class Forward: + @free_fall_migration( + document_models=[ + AppDB, + OrganizationDB, + UserDB, + TestSetDB, + EvaluationDB, + OldEvaluationDB, + OldEvaluationScenarioDB, + EvaluationScenarioDB, + HumanEvaluationDB, + HumanEvaluationScenarioDB, + ] + ) + async def migrate_old_human_a_b_evaluation_scenario_to_new_human_evaluation_scenario( + self, session + ): + old_human_ab_testing_scenarios = await OldEvaluationScenarioDB.find( + OldEvaluationScenarioDB.evaluation.evaluation_type == "human_a_b_testing", + fetch_links=True, + ).to_list() + for counter, ab_testing_scenario in enumerate(old_human_ab_testing_scenarios): + print(f"ab evaluation scenario {counter}") + matching_human_evaluation = await HumanEvaluationDB.find_one( + HumanEvaluationDB.id == ab_testing_scenario.evaluation.id, + HumanEvaluationDB.evaluation_type == "human_a_b_testing", + fetch_links=True, + ) + if matching_human_evaluation: + scenario_inputs = [ + HumanEvaluationScenarioInput( + input_name=input.input_name, + input_value=input.input_value, + ) + for input in ab_testing_scenario.inputs + ] + scenario_outputs = [ + HumanEvaluationScenarioOutput( + variant_id=output.variant_id, + variant_output=output.variant_output, + ) + for output in ab_testing_scenario.outputs + ] + new_scenario = HumanEvaluationScenarioDB( + user=matching_human_evaluation.user, + organization=matching_human_evaluation.organization, + evaluation=matching_human_evaluation, + inputs=scenario_inputs, + outputs=scenario_outputs, + 
correct_answer=ab_testing_scenario.correct_answer, + is_pinned=ab_testing_scenario.is_pinned, + note=ab_testing_scenario.note, + vote=ab_testing_scenario.vote, + score=ab_testing_scenario.score, + ) + await new_scenario.insert(session=session) + + +class Backward: + pass diff --git a/agenta-backend/agenta_backend/migrations/20240112120800_human_single_model_evaluation_scenarios.py b/agenta-backend/agenta_backend/migrations/20240112120800_human_single_model_evaluation_scenarios.py new file mode 100644 index 0000000000..6fa1178183 --- /dev/null +++ b/agenta-backend/agenta_backend/migrations/20240112120800_human_single_model_evaluation_scenarios.py @@ -0,0 +1,292 @@ +from datetime import datetime +from typing import Any, Dict, List, Optional + +from beanie.operators import In +from pydantic import BaseModel, Field +from beanie import free_fall_migration, Document, Link, PydanticObjectId + + +class OrganizationDB(Document): + name: str = Field(default="agenta") + description: str = Field(default="") + type: Optional[str] + owner: str # user id + members: Optional[List[PydanticObjectId]] + created_at: Optional[datetime] = Field(default=datetime.utcnow()) + updated_at: Optional[datetime] = Field(default=datetime.utcnow()) + + class Settings: + name = "organizations" + + +class UserDB(Document): + uid: str = Field(default="0", unique=True, index=True) + username: str = Field(default="agenta") + email: str = Field(default="demo@agenta.ai", unique=True) + organizations: Optional[List[PydanticObjectId]] = [] + created_at: Optional[datetime] = Field(default=datetime.utcnow()) + updated_at: Optional[datetime] = Field(default=datetime.utcnow()) + + class Settings: + name = "users" + + +class AppDB(Document): + app_name: str + organization: Link[OrganizationDB] + user: Link[UserDB] + created_at: Optional[datetime] = Field(default=datetime.utcnow()) + updated_at: Optional[datetime] = Field(default=datetime.utcnow()) + + class Settings: + name = "app_db" + + +class TestSetDB(Document): + name: str + app: Link[AppDB] + csvdata: List[Dict[str, str]] + user: Link[UserDB] + organization: Link[OrganizationDB] + created_at: Optional[datetime] = Field(default=datetime.utcnow()) + updated_at: Optional[datetime] = Field(default=datetime.utcnow()) + + class Settings: + name = "testsets" + + +class EvaluatorConfigDB(Document): + app: Link[AppDB] + organization: Link[OrganizationDB] + user: Link[UserDB] + name: str + evaluator_key: str + settings_values: Optional[Dict[str, Any]] = None + created_at: datetime = Field(default=datetime.utcnow()) + updated_at: datetime = Field(default=datetime.utcnow()) + + class Settings: + name = "evaluators_configs" + + +class Result(BaseModel): + type: str + value: Any + + +class EvaluationScenarioResult(BaseModel): + evaluator_config: PydanticObjectId + result: Result + + +class AggregatedResult(BaseModel): + evaluator_config: PydanticObjectId + result: Result + + +class EvaluationScenarioInputDB(BaseModel): + name: str + type: str + value: str + + +class EvaluationScenarioOutputDB(BaseModel): + type: str + value: Any + + +class HumanEvaluationScenarioInput(BaseModel): + input_name: str + input_value: str + + +class HumanEvaluationScenarioOutput(BaseModel): + variant_id: str + variant_output: str + + +class HumanEvaluationDB(Document): + app: Link[AppDB] + organization: Link[OrganizationDB] + user: Link[UserDB] + status: str + evaluation_type: str + variants: List[PydanticObjectId] + testset: Link[TestSetDB] + created_at: Optional[datetime] = Field(default=datetime.utcnow()) + 
updated_at: Optional[datetime] = Field(default=datetime.utcnow()) + + class Settings: + name = "human_evaluations" + + +class HumanEvaluationScenarioDB(Document): + user: Link[UserDB] + organization: Link[OrganizationDB] + evaluation: Link[HumanEvaluationDB] + inputs: List[HumanEvaluationScenarioInput] + outputs: List[HumanEvaluationScenarioOutput] + vote: Optional[str] + score: Optional[Any] + correct_answer: Optional[str] + created_at: Optional[datetime] = Field(default=datetime.utcnow()) + updated_at: Optional[datetime] = Field(default=datetime.utcnow()) + is_pinned: Optional[bool] + note: Optional[str] + + class Settings: + name = "human_evaluations_scenarios" + + +class EvaluationDB(Document): + app: Link[AppDB] + organization: Link[OrganizationDB] + user: Link[UserDB] + status: str = Field(default="EVALUATION_INITIALIZED") + testset: Link[TestSetDB] + variant: PydanticObjectId + evaluators_configs: List[PydanticObjectId] + aggregated_results: List[AggregatedResult] + created_at: datetime = Field(default=datetime.utcnow()) + updated_at: datetime = Field(default=datetime.utcnow()) + + class Settings: + name = "new_evaluations" + + +class EvaluationScenarioDB(Document): + user: Link[UserDB] + organization: Link[OrganizationDB] + evaluation: Link[EvaluationDB] + variant_id: PydanticObjectId + inputs: List[EvaluationScenarioInputDB] + outputs: List[EvaluationScenarioOutputDB] + correct_answer: Optional[str] + is_pinned: Optional[bool] + note: Optional[str] + evaluators_configs: List[PydanticObjectId] + results: List[EvaluationScenarioResult] + created_at: datetime = Field(default=datetime.utcnow()) + updated_at: datetime = Field(default=datetime.utcnow()) + + class Settings: + name = "new_evaluation_scenarios" + + +class OldEvaluationTypeSettings(BaseModel): + similarity_threshold: Optional[float] + regex_pattern: Optional[str] + regex_should_match: Optional[bool] + webhook_url: Optional[str] + llm_app_prompt_template: Optional[str] + custom_code_evaluation_id: Optional[str] + evaluation_prompt_template: Optional[str] + + +class OldEvaluationScenarioInput(BaseModel): + input_name: str + input_value: str + + +class OldEvaluationScenarioOutput(BaseModel): + variant_id: str + variant_output: str + + +class OldEvaluationDB(Document): + app: Link[AppDB] + organization: Link[OrganizationDB] + user: Link[UserDB] + status: str + evaluation_type: str + evaluation_type_settings: OldEvaluationTypeSettings + variants: List[PydanticObjectId] + version: str = Field("odmantic") + testset: Link[TestSetDB] + created_at: Optional[datetime] = Field(default=datetime.utcnow()) + updated_at: Optional[datetime] = Field(default=datetime.utcnow()) + + class Settings: + name = "evaluations" + + +class OldEvaluationScenarioDB(Document): + user: Link[UserDB] + organization: Link[OrganizationDB] + evaluation: Link[OldEvaluationDB] + inputs: List[OldEvaluationScenarioInput] + outputs: List[OldEvaluationScenarioOutput] # EvaluationScenarioOutput + vote: Optional[str] + version: str = Field("odmantic") + score: Optional[Any] + correct_answer: Optional[str] + created_at: Optional[datetime] = Field(default=datetime.utcnow()) + updated_at: Optional[datetime] = Field(default=datetime.utcnow()) + is_pinned: Optional[bool] + note: Optional[str] + + class Settings: + name = "evaluation_scenarios" + + +class Forward: + @free_fall_migration( + document_models=[ + AppDB, + OrganizationDB, + UserDB, + TestSetDB, + EvaluationDB, + OldEvaluationDB, + OldEvaluationScenarioDB, + EvaluationScenarioDB, + HumanEvaluationDB, + 
HumanEvaluationScenarioDB, + ] + ) + async def migrate_old_human_single_model_evaluation_scenario_to_new_human_evaluation_scenario( + self, session + ): + old_human_single_model_scenarios = await OldEvaluationScenarioDB.find( + OldEvaluationScenarioDB.evaluation.evaluation_type == "single_model_test", + fetch_links=True, + ).to_list() + for counter, single_model_scenario in enumerate(old_human_single_model_scenarios): + print(f"single model evaluation {counter}") + matching_human_evaluation = await HumanEvaluationDB.find_one( + HumanEvaluationDB.id == single_model_scenario.evaluation.id, + HumanEvaluationDB.evaluation_type == "single_model_test", + fetch_links=True, + ) + if matching_human_evaluation: + scenario_inputs = [ + HumanEvaluationScenarioInput( + input_name=input.input_name, + input_value=input.input_value, + ) + for input in single_model_scenario.inputs + ] + scenario_outputs = [ + HumanEvaluationScenarioOutput( + variant_id=output.variant_id, + variant_output=output.variant_output, + ) + for output in single_model_scenario.outputs + ] + new_scenario = HumanEvaluationScenarioDB( + user=matching_human_evaluation.user, + organization=matching_human_evaluation.organization, + evaluation=matching_human_evaluation, + inputs=scenario_inputs, + outputs=scenario_outputs, + correct_answer=single_model_scenario.correct_answer, + is_pinned=single_model_scenario.is_pinned, + note=single_model_scenario.note, + vote=single_model_scenario.vote, + score=single_model_scenario.score, + ) + await new_scenario.insert(session=session) + + +class Backward: + pass From 12d06e897921c2ddf0ab1479c7c250a8292f5e46 Mon Sep 17 00:00:00 2001 From: Akrem Abayed Date: Wed, 17 Jan 2024 20:52:29 +0100 Subject: [PATCH 413/414] format --- agenta-web/src/lib/helpers/evaluate.ts | 2 +- .../src/pages/apps/[app_id]/testsets/new/upload/index.tsx | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/agenta-web/src/lib/helpers/evaluate.ts b/agenta-web/src/lib/helpers/evaluate.ts index d4962ff42c..abfac16f40 100644 --- a/agenta-web/src/lib/helpers/evaluate.ts +++ b/agenta-web/src/lib/helpers/evaluate.ts @@ -219,4 +219,4 @@ export const calculateResultsDataAvg = ( export const getVotesPercentage = (record: HumanEvaluationListTableDataType, index: number) => { const variant = record.votesData.variants[index] return record.votesData.variants_votes_data[variant]?.percentage -} \ No newline at end of file +} diff --git a/agenta-web/src/pages/apps/[app_id]/testsets/new/upload/index.tsx b/agenta-web/src/pages/apps/[app_id]/testsets/new/upload/index.tsx index 139c45ffdc..29f7c47337 100644 --- a/agenta-web/src/pages/apps/[app_id]/testsets/new/upload/index.tsx +++ b/agenta-web/src/pages/apps/[app_id]/testsets/new/upload/index.tsx @@ -70,8 +70,8 @@ export default function AddANewTestset() { router.push(`/apps/${appId}/testsets`) } catch (e: any) { if ( - e?.response?.data?.detail?.find((item: GenericObject) => - item?.loc?.includes("csvdata"), + e?.response?.data?.detail?.find( + (item: GenericObject) => item?.loc?.includes("csvdata"), ) ) message.error(malformedFileError) From 027d477aeeb2f9b125a5c78448f7db7c8f7c2060 Mon Sep 17 00:00:00 2001 From: Akrem Abayed Date: Wed, 17 Jan 2024 20:53:43 +0100 Subject: [PATCH 414/414] format backend --- .../migrations/20240110165900_evaluations_revamp.py | 4 +++- .../migrations/20240112120721_evaluation_scenarios_revamp.py | 1 + .../20240112120800_human_single_model_evaluation_scenarios.py | 4 +++- 3 files changed, 7 insertions(+), 2 deletions(-) diff --git 
a/agenta-backend/agenta_backend/migrations/20240110165900_evaluations_revamp.py b/agenta-backend/agenta_backend/migrations/20240110165900_evaluations_revamp.py index d1af3eb729..6514a18677 100644 --- a/agenta-backend/agenta_backend/migrations/20240110165900_evaluations_revamp.py +++ b/agenta-backend/agenta_backend/migrations/20240110165900_evaluations_revamp.py @@ -183,7 +183,9 @@ async def migrate_old_evaluation_to_new_evaluation(self, session): fetch_links=True, ).to_list() for old_eval in old_evaluations: - if getattr(old_eval, 'id', None) and not getattr(getattr(old_eval, 'app', None), 'id', None): + if getattr(old_eval, "id", None) and not getattr( + getattr(old_eval, "app", None), "id", None + ): continue list_of_eval_configs = [] evaluation_type = old_eval.evaluation_type diff --git a/agenta-backend/agenta_backend/migrations/20240112120721_evaluation_scenarios_revamp.py b/agenta-backend/agenta_backend/migrations/20240112120721_evaluation_scenarios_revamp.py index b771096c7d..81911214a9 100644 --- a/agenta-backend/agenta_backend/migrations/20240112120721_evaluation_scenarios_revamp.py +++ b/agenta-backend/agenta_backend/migrations/20240112120721_evaluation_scenarios_revamp.py @@ -314,5 +314,6 @@ async def migrate_old_auto_evaluation_scenario_to_new_auto_evaluation_scenario( ) await new_scenario.insert(session=session) + class Backward: pass diff --git a/agenta-backend/agenta_backend/migrations/20240112120800_human_single_model_evaluation_scenarios.py b/agenta-backend/agenta_backend/migrations/20240112120800_human_single_model_evaluation_scenarios.py index 6fa1178183..82c58fcb10 100644 --- a/agenta-backend/agenta_backend/migrations/20240112120800_human_single_model_evaluation_scenarios.py +++ b/agenta-backend/agenta_backend/migrations/20240112120800_human_single_model_evaluation_scenarios.py @@ -251,7 +251,9 @@ async def migrate_old_human_single_model_evaluation_scenario_to_new_human_evalua OldEvaluationScenarioDB.evaluation.evaluation_type == "single_model_test", fetch_links=True, ).to_list() - for counter, single_model_scenario in enumerate(old_human_single_model_scenarios): + for counter, single_model_scenario in enumerate( + old_human_single_model_scenarios + ): print(f"single model evaluation {counter}") matching_human_evaluation = await HumanEvaluationDB.find_one( HumanEvaluationDB.id == single_model_scenario.evaluation.id,