Merge branch 'main' into Hot-Fix--Failing-cypress-test
bekossy committed Feb 16, 2024
2 parents 5d1b478 + a16eca8 commit 8e209aa
Showing 28 changed files with 203 additions and 126 deletions.
30 changes: 0 additions & 30 deletions agenta-backend/agenta_backend/models/api/organization_models.py

This file was deleted.

82 changes: 0 additions & 82 deletions agenta-backend/agenta_backend/routers/organization_router.py

This file was deleted.

7 changes: 6 additions & 1 deletion agenta-backend/agenta_backend/tasks/evaluations.py
@@ -96,6 +96,9 @@ def evaluate(
loop.run_until_complete(DBEngine().init_db())
app = loop.run_until_complete(fetch_app_by_id(app_id))
app_variant_db = loop.run_until_complete(fetch_app_variant_by_id(variant_id))
assert (
app_variant_db is not None
), f"App variant with id {variant_id} not found!"
app_variant_parameters = app_variant_db.config.parameters
testset_db = loop.run_until_complete(fetch_testset_by_id(testset_id))
new_evaluation_db = loop.run_until_complete(
@@ -180,10 +183,12 @@ def evaluate(
if correct_answer_column in data_point
else ""
)

loop.run_until_complete(
create_new_evaluation_scenario(
user=app.user,
organization=app.organization,
organization=app.organization if isCloudEE() else None,
workspace=app.workspace if isCloudEE() else None,
evaluation=new_evaluation_db,
variant_id=variant_id,
evaluators_configs=new_evaluation_db.evaluators_configs,
19 changes: 15 additions & 4 deletions agenta-web/src/components/AppSelector/AppSelector.tsx
@@ -1,11 +1,9 @@
import {useState, useEffect, useMemo} from "react"
import {useRouter} from "next/router"
import {PlusOutlined} from "@ant-design/icons"
import {Input, Modal, ConfigProvider, theme, Spin, Card, Button, notification, Divider} from "antd"
import {Input, Modal, ConfigProvider, theme, Card, Button, notification, Divider} from "antd"
import AppCard from "./AppCard"
import {Template, GenericObject} from "@/lib/Types"
import {useAppTheme} from "../Layout/ThemeContextProvider"
import {CloseCircleFilled} from "@ant-design/icons"
import TipsAndFeatures from "./TipsAndFeatures"
import Welcome from "./Welcome"
import {
@@ -31,6 +29,7 @@ import {useProfileData} from "@/contexts/profile.context"
import CreateAppStatusModal from "./modals/CreateAppStatusModal"
import {usePostHogAg} from "@/hooks/usePostHogAg"
import ResultComponent from "../ResultComponent/ResultComponent"
import {dynamicContext} from "@/lib/helpers/dynamic"

type StyleProps = {
themeMode: "dark" | "light"
@@ -128,6 +127,14 @@ const AppSelector: React.FC = () => {
details: undefined,
appId: undefined,
})
const [useOrgData, setUseOrgData] = useState<Function>(() => () => "")
const {selectedOrg} = useOrgData()

useEffect(() => {
dynamicContext("org.context", {useOrgData}).then((context) => {
setUseOrgData(() => context.useOrgData)
})
}, [])

useEffect(() => {
getAllProviderLlmKeys()
@@ -284,7 +291,11 @@ const AppSelector: React.FC = () => {
<Card
className={classes.createCard}
onClick={() => {
if (isDemo() && apps.length > 2) {
if (
isDemo() &&
selectedOrg?.is_paying == false &&
apps.length > 2
) {
showMaxAppError()
} else {
showCreateAppModal()
1 change: 1 addition & 0 deletions docker-compose.gh.yml
@@ -145,6 +145,7 @@ services:
- ./agenta-backend/agenta_backend:/app/agenta_backend
- /var/run/docker.sock:/var/run/docker.sock
depends_on:
- mongo
- rabbitmq
- redis
extra_hosts:
5 changes: 3 additions & 2 deletions docker-compose.prod.yml
@@ -145,8 +145,9 @@ services:
- ./agenta-backend/agenta_backend:/app/agenta_backend
- /var/run/docker.sock:/var/run/docker.sock
depends_on:
- rabbitmq
- redis
- mongo
- rabbitmq
- redis
extra_hosts:
- "host.docker.internal:host-gateway"
networks:
4 changes: 3 additions & 1 deletion docker-compose.test.yml
@@ -43,7 +43,7 @@ services:
"--log-level",
"info",
"--root-path",
"/api",
"/api"
]
labels:
- "traefik.http.routers.backend.rule=PathPrefix(`/api/`)"
@@ -126,12 +126,14 @@ services:
- CELERY_BROKER_URL=amqp://guest@rabbitmq//
- CELERY_RESULT_BACKEND=redis://redis:6379/0
- FEATURE_FLAG=oss
- DATABASE_MODE=test
volumes:
- ./agenta-backend/agenta_backend:/app/agenta_backend
- /var/run/docker.sock:/var/run/docker.sock
depends_on:
- rabbitmq
- redis
- mongo
extra_hosts:
- host.docker.internal:host-gateway
networks:
1 change: 1 addition & 0 deletions docker-compose.yml
@@ -149,6 +149,7 @@ services:
- ./agenta-backend/agenta_backend:/app/agenta_backend
- /var/run/docker.sock:/var/run/docker.sock
depends_on:
- mongo
- rabbitmq
- redis
extra_hosts:
69 changes: 67 additions & 2 deletions docs/basic_guides/automatic_evaluation.mdx
@@ -1,5 +1,70 @@
---
title: 'Automatic Evaluation'
title: 'Evaluating LLM Apps'
description: Systematically evaluate your LLM applications and compare their performance.
---

<Warning> This page is under construction. Please reach out to us on [Slack](https://join.slack.com/t/agenta-hq/shared_invite/zt-1zsafop5i-Y7~ZySbhRZvKVPV5DO_7IA) **#support** channel, [Book a call](https://cal.com/mahmoud-mabrouk-ogzgey/demo), through [email](mailto:[email protected]) if you need help with using automatic evaluation.</Warning>
The key to building production-ready LLM applications is to have a tight feedback loop of prompt engineering and evaluation. In this document, we will explain how to use agenta to quickly evaluate and compare the performance of your LLM applications.

## Configuring Evaluators

Agenta comes with a set of built-in evaluators that can be configured.

By default, each project includes the following evaluators (which do not require configuration):
- Exact match: This evaluator checks if the generated answer is an exact match to the expected answer. The aggregated result is the percentage of correct answers.

Additionally, the following configurable evaluators are available but need to be explicitly configured and added before use.

To add an evaluator, go to the Evaluators tab and click on the "Add Evaluator" button. A modal will appear where you can select the evaluator you want to add and configure it.

<img height="600" className="dark:hidden" src="/images/basic_guides/15_accessing_evaluator_page_light.png" />
<img height="600" className="hidden dark:block" src="/images/basic_guides/15_accessing_evaluator_page_dark.png" />

<img height="600" className="dark:hidden" src="/images/basic_guides/16_new_evaluator_modal_light.png" />
<img height="600" className="hidden dark:block" src="/images/basic_guides/16_new_evaluator_modal_dark.png" />

**Configurable evaluators**
- Regex match: This evaluator checks if the generated answer matches a regular expression pattern. You need to provide the regex expression and specify whether an answer is correct if it matches or does not match the regex.
- Webhook evaluator: This evaluator sends the generated answer and the correct_answer to a webhook and expects a response indicating the correctness of the answer. You need to provide the URL of the webhook.
- Similarity Match evaluator: This evaluator checks if the generated answer is similar to the expected answer. You need to provide the similarity threshold. It uses the Jaccard similarity to compare the answers (a minimal sketch of this idea follows the list).
- AI Critic evaluator: This evaluator sends the generated answer and the correct_answer to an LLM model and uses it to evaluate the correctness of the answer. You need to provide the evaluation prompt (or use the default prompt).
- Custom code evaluator: This evaluator allows you to write your own evaluator in Python. You need to provide the Python code for the evaluator. More details can be found in the Writing Custom Evaluators guide.
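
For illustration, here is a minimal sketch of the Jaccard-based check that the Similarity Match evaluator describes; the function names and the 0.5 threshold are assumptions for the example, not Agenta's internal implementation.

```python
def jaccard_similarity(expected: str, generated: str) -> float:
    # Token-level Jaccard similarity: |intersection| / |union| of the word sets
    expected_tokens = set(expected.lower().split())
    generated_tokens = set(generated.lower().split())
    if not expected_tokens and not generated_tokens:
        return 1.0
    return len(expected_tokens & generated_tokens) / len(expected_tokens | generated_tokens)


def is_similar(expected: str, generated: str, threshold: float = 0.5) -> bool:
    # The answer counts as correct when the similarity score reaches the threshold
    return jaccard_similarity(expected, generated) >= threshold
```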


## Begin Evaluation
To start an evaluation, go to the Evaluations page and click on the "Begin Evaluation Now" button. A modal will appear where you can fine-tune the evaluation based on your specific requirements.

In the modal, you need to specify the following parameters:

- <b>Testset:</b> Choose the testset you want to use for the evaluation.
- <b>Variants:</b> Select one or more variants you wish to evaluate.
- <b>Evaluators:</b> Choose one or more evaluators for the assessment.

<img height="600" className="dark:hidden" src="/images/basic_guides/17_begin_evaluation_modal_light.png" />
<img height="600" className="hidden dark:block" src="/images/basic_guides/17_begin_evaluation_modal_dark.png" />

### Advanced Configuration
Additional configurations for batching and retrying LLM calls are available in the advanced configuration section. You can specify the following parameters (a rough sketch of how they interact follows the list):

- <b>Batch Size:</b> Set the number of test set rows to include in each batch <b>(default is 10)</b>.
- <b>Retry Delay:</b> Define the delay before retrying a failed language model call <b>(in seconds, default is 3)</b>.
- <b>Max Retries:</b> Specify the maximum number of retries for a failed language model call <b>(default is 3)</b>.
- <b>Delay Between Batches:</b> Set the delay between running batches <b>(in seconds, default is 5)</b>.
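
To make these parameters concrete, the sketch below shows how batching, retries, and delays typically interact. It is an assumed illustration only, not Agenta's actual evaluation loop.

```python
import asyncio


async def run_with_retries(make_call, max_retries: int = 3, retry_delay: float = 3):
    # Retry a failed LLM call up to max_retries times, waiting retry_delay seconds between attempts
    for attempt in range(max_retries + 1):
        try:
            return await make_call()
        except Exception:
            if attempt == max_retries:
                raise
            await asyncio.sleep(retry_delay)


async def run_in_batches(rows, call, batch_size: int = 10, delay_between_batches: float = 5):
    # Process the test set rows in batches, pausing between batches
    results = []
    for start in range(0, len(rows), batch_size):
        batch = rows[start:start + batch_size]
        results += await asyncio.gather(
            *(run_with_retries(lambda row=row: call(row)) for row in batch)
        )
        await asyncio.sleep(delay_between_batches)
    return results
```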

In addition to the batching and retrying configurations, you can also specify the following parameters:
- <b>Correct Answer Column:</b> Specify the column in the test set containing the correct/expected answer <b>(default is correct_answer)</b>.

<img height="600" className="dark:hidden" src="/images/basic_guides/18_begin_evaluation_modal_advanced_config_light.png" />
<img height="600" className="hidden dark:block" src="/images/basic_guides/18_begin_evaluation_modal_advanced_config_dark.png" />

## View Evaluation Result
To view the result of an evaluation, double-click on the evaluation row once the evaluation status is set to "completed" (after you have clicked the "Create" button). This will give you access to the detailed evaluation results.

<img height="600" className="dark:hidden" src="/images/basic_guides/19_view_evaluation_result_light.png" />
<img height="600" className="hidden dark:block" src="/images/basic_guides/19_view_evaluation_result_dark.png" />

## Compare Evaluations
When the evaluation status is set to "completed", you can select two or more evaluations <b>from the same testset</b> to compare. Click on the "Compare" button, and you will be taken to the Evaluation comparison view where you can compare the output of two or more evaluations.

<img height="600" className="dark:hidden" src="/images/basic_guides/20_evaluation_comparison_view_light.png" />
<img height="600" className="hidden dark:block" src="/images/basic_guides/20_evaluation_comparison_view_dark.png" />

57 changes: 57 additions & 0 deletions docs/basic_guides/custom_evaluator.mdx
@@ -0,0 +1,57 @@
---
title: 'Writing Custom Evaluators'
description: 'Write the code for a custom evaluator on Agenta'
---

Sometimes, the default evaluators on Agenta may not be sufficient for your specific use case. In such cases, you can create a custom evaluator to suit your specific needs. Custom evaluators are written in Python.

For the moment, there are limitations on the code that can be written in a custom evaluator. Our backend uses RestrictedPython to execute the code, which limits the libraries that can be used.


## Accessing the Evaluator Page
To create a custom evaluator on Agenta, simply click on the Evaluations button in the sidebar menu, and then select the "Evaluators" tab within the Evaluations page.
<img height="600" className="dark:hidden" src="/images/basic_guides/15_accessing_evaluator_page_light.png" />
<img height="600" className="hidden dark:block" src="/images/basic_guides/15_accessing_evaluator_page_dark.png" />

## Creating an Evaluator
On the Evaluators tab, click on the "New Evaluator" button at the top right corner of your screen, which opens a modal prompting you to provide the following information:
1. <b>Evaluator name: </b> Enter a unique and descriptive name for your custom evaluator.
2. <b>Evaluator Template: </b> Choose a template for your custom evaluator. This could be based on the specific criteria or type of evaluation you want.
<img height="600" className="dark:hidden" src="/images/basic_guides/16_new_evaluator_modal_light.png" />
<img height="600" className="hidden dark:block" src="/images/basic_guides/16_new_evaluator_modal_dark.png" />
Click on the "Create" button within the modal to confirm and complete the creation of your custom evaluator.

## Evaluation code

Your code should include one function called `evaluate` with the following signature:
```python
from typing import Dict

def evaluate(
app_params: Dict[str, str],
inputs: Dict[str, str],
output: str,
correct_answer: str
) -> float:
```

The function should return a float value, which is the score of the evaluation. The score should be between 0 and 1: 0 means the evaluation failed and 1 means it passed.

The parameters are as follows:
1. <b>app_params: </b> A dictionary containing the configuration of the app. This would include the prompt, model and all the other parameters specified in the playground with the same naming.
2. <b>inputs: </b> A dictionary containing the inputs of the app.
3. <b>output: </b> The generated output of the app.
4. <b>correct_answer: </b> The expected (correct) answer from the test set.

For instance, exact match would be implemented as follows:
```python
from typing import Dict

def evaluate(
app_params: Dict[str, str],
inputs: Dict[str, str],
output: str,
correct_answer: str
) -> float:
return 1 if output == correct_answer else 0
```
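
A slightly more forgiving variant, shown here as an assumed example rather than a built-in evaluator, normalizes case and whitespace before comparing. It uses only built-in string operations, which should keep it within RestrictedPython's restrictions, though that is an assumption worth verifying.

```python
from typing import Dict

def evaluate(
    app_params: Dict[str, str],
    inputs: Dict[str, str],
    output: str,
    correct_answer: str
) -> float:
    # Lowercase and collapse whitespace so formatting differences do not fail the check
    def normalize(text: str) -> str:
        return " ".join(text.lower().split())

    return 1.0 if normalize(output) == normalize(correct_answer) else 0.0
```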