diff --git a/agenta-backend/agenta_backend/resources/evaluators/evaluators.py b/agenta-backend/agenta_backend/resources/evaluators/evaluators.py
index cf00af533d..3faab67016 100644
--- a/agenta-backend/agenta_backend/resources/evaluators/evaluators.py
+++ b/agenta-backend/agenta_backend/resources/evaluators/evaluators.py
@@ -28,6 +28,9 @@
                 "type": "number",
                 "default": 0.5,
                 "description": "The threshold value for similarity comparison",
+                "min": 0,
+                "max": 1,
+                "required": True,
             }
         },
         "description": "Similarity Match evaluator checks if the generated answer is similar to the expected answer. You need to provide the similarity threshold. It uses the Jaccard similarity to compare the answers.",
@@ -43,6 +46,7 @@
                 "type": "regex",
                 "default": "",
                 "description": "Pattern for regex testing (ex: ^this_word\\d{3}$)",
+                "required": True,
             },
             "regex_should_match": {
                 "label": "Match/Mismatch",
@@ -62,6 +66,7 @@
                 "type": "string",
                 "default": "",
                 "description": "The name of the field in the JSON output that you wish to evaluate",
+                "required": True,
             }
         },
         "description": "JSON Field Match evaluator compares specific fields within JSON (JavaScript Object Notation) data. This matching can involve finding similarities or correspondences between fields in different JSON objects.",
@@ -76,6 +81,7 @@
                 "type": "text",
                 "default": "We have an LLM App that we want to evaluate its outputs. Based on the prompt and the parameters provided below evaluate the output based on the evaluation strategy below: Evaluation strategy: 0 to 10 0 is very bad and 10 is very good. Prompt: {llm_app_prompt_template} Inputs: country: {country} Correct Answer:{correct_answer} Evaluate this: {variant_output} Answer ONLY with one of the given grading or evaluation options.",
                 "description": "Template for AI critique prompts",
+                "required": True,
             }
         },
         "description": "AI Critique evaluator sends the generated answer and the correct_answer to an LLM model and uses it to evaluate the correctness of the answer. You need to provide the evaluation prompt (or use the default prompt).",
@@ -90,6 +96,7 @@
                 "type": "code",
                 "default": "from typing import Dict\n\ndef evaluate(\n app_params: Dict[str, str],\n inputs: Dict[str, str],\n output: str,\n correct_answer: str\n) -> float:\n # ...\n return 0.75 # Replace with your calculated score",
                 "description": "Code for evaluating submissions",
+                "required": True,
             }
         },
         "description": "Code Evaluation allows you to write your own evaluator in Python. You need to provide the Python code for the evaluator.",
@@ -103,6 +110,7 @@
                 "label": "Webhook URL",
                 "type": "string",
                 "description": "https://your-webhook-url.com",
+                "required": True,
             },
         },
         "description": "Webhook test evaluator sends the generated answer and the correct_answer to a webhook and expects a response indicating the correctness of the answer. You need to provide the URL of the webhook and the response of the webhook must be between 0 and 1.",
@@ -132,10 +140,7 @@
         "settings_template": {
             "label": "Single Model Testing Settings",
             "description": "Checks if the output starts with the specified prefix.",
-            "prefix": {
-                "label": "prefix",
-                "type": "string",
-            },
+            "prefix": {"label": "prefix", "type": "string", "required": True},
             "case_sensitive": {
                 "label": "Case Sensitive",
                 "type": "boolean",
@@ -161,6 +166,7 @@
                 "label": "suffix",
                 "type": "string",
                 "description": "The string to match at the end of the output.",
+                "required": True,
             },
         },
         "description": "Ends With evaluator checks if the output ends with a specified suffix, considering case sensitivity based on the settings.",
@@ -182,6 +188,7 @@
                 "label": "substring",
                 "type": "string",
                 "description": "The string to check if it is contained in the output.",
+                "required": True,
             },
         },
         "description": "Contains evaluator checks if the output contains a specified substring, considering case sensitivity based on the settings.",
@@ -203,6 +210,7 @@
                 "label": "substrings",
                 "type": "string",
                 "description": "Provide a comma-separated list of strings to check if any is contained in the output.",
+                "required": True,
             },
         },
         "description": "Contains Any evaluator checks if the output contains any of the specified substrings from a comma-separated list, considering case sensitivity based on the settings.",
@@ -224,10 +232,22 @@
                 "label": "substrings",
                 "type": "string",
                 "description": "Provide a comma-separated list of strings to check if all are contained in the output.",
+                "required": True,
             },
         },
         "description": "Contains All evaluator checks if the output contains all of the specified substrings from a comma-separated list, considering case sensitivity based on the settings.",
     },
+    {
+        "name": "Levenshtein Distance",
+        "key": "auto_levenshtein_distance",
+        "direct_use": False,
+        "settings_template": {
+            "label": "Levenshtein Distance Settings",
+            "description": "Evaluates the Levenshtein distance between the output and the correct answer. If a threshold is specified, it checks if the distance is below this threshold and returns a boolean value. If no threshold is specified, it returns the numerical Levenshtein distance.",
+            "threshold": {"label": "Threshold", "type": "number", "required": False},
+        },
+        "description": "This evaluator calculates the Levenshtein distance between the output and the correct answer. If a threshold is provided in the settings, it returns a boolean indicating whether the distance is within the threshold. If no threshold is provided, it returns the actual Levenshtein distance as a numerical value.",
+    },
 ]
diff --git a/agenta-backend/agenta_backend/services/evaluators_service.py b/agenta-backend/agenta_backend/services/evaluators_service.py
index bab421c9c5..6a8219e056 100644
--- a/agenta-backend/agenta_backend/services/evaluators_service.py
+++ b/agenta-backend/agenta_backend/services/evaluators_service.py
@@ -421,6 +421,55 @@ def auto_contains_json(
     )


+def levenshtein_distance(s1, s2):
+    if len(s1) < len(s2):
+        return levenshtein_distance(s2, s1)
+
+    if len(s2) == 0:
+        return len(s1)
+
+    previous_row = range(len(s2) + 1)
+    for i, c1 in enumerate(s1):
+        current_row = [i + 1]
+        for j, c2 in enumerate(s2):
+            insertions = previous_row[j + 1] + 1
+            deletions = current_row[j] + 1
+            substitutions = previous_row[j] + (c1 != c2)
+            current_row.append(min(insertions, deletions, substitutions))
+        previous_row = current_row
+
+    return previous_row[-1]
+
+
+def auto_levenshtein_distance(
+    inputs: Dict[str, Any],
+    output: str,
+    correct_answer: str,
+    app_params: Dict[str, Any],
+    settings_values: Dict[str, Any],
+    lm_providers_keys: Dict[str, Any],
+) -> Result:
+    try:
+        distance = levenshtein_distance(output, correct_answer)
+
+        if "threshold" in settings_values:
+            threshold = settings_values["threshold"]
+            is_within_threshold = distance <= threshold
+            return Result(type="bool", value=is_within_threshold)
+
+        return Result(type="number", value=distance)
+
+    except Exception as e:
+        return Result(
+            type="error",
+            value=None,
+            error=Error(
+                message="Error during Levenshtein threshold evaluation",
+                stacktrace=str(e),
+            ),
+        )
+
+
 def evaluate(
     evaluator_key: str,
     inputs: Dict[str, Any],
diff --git a/agenta-backend/agenta_backend/tasks/evaluations.py b/agenta-backend/agenta_backend/tasks/evaluations.py
index 1882f35564..50ce322437 100644
--- a/agenta-backend/agenta_backend/tasks/evaluations.py
+++ b/agenta-backend/agenta_backend/tasks/evaluations.py
@@ -355,6 +355,7 @@ async def aggregate_evaluator_results(
         "auto_contains_any",
         "auto_contains_all",
         "auto_contains_json",
+        "auto_levenshtein_distance",
     ]:
         result = aggregation_service.aggregate_float(results)
diff --git a/agenta-backend/agenta_backend/tests/unit/test_evaluators.py b/agenta-backend/agenta_backend/tests/unit/test_evaluators.py
index 894233d7d4..f76dd9e6fb 100644
--- a/agenta-backend/agenta_backend/tests/unit/test_evaluators.py
+++ b/agenta-backend/agenta_backend/tests/unit/test_evaluators.py
@@ -1,6 +1,7 @@
 import pytest

 from agenta_backend.services.evaluators_service import (
+    auto_levenshtein_distance,
     auto_starts_with,
     auto_ends_with,
     auto_contains,
@@ -129,3 +130,22 @@ def test_auto_contains_json(output, expected):
     result = auto_contains_json({}, output, "", {}, {}, {})
     assert result.value == expected
+
+
+@pytest.mark.parametrize(
+    "output, correct_answer, threshold, expected",
+    [
+        ("hello world", "hello world", 5, True),
+        ("hello world", "hola mundo", 5, False),
+        ("hello world", "hello world!", 2, True),
+        ("hello world", "hello wor", 10, True),
+        ("hello world", "hello worl", None, 1),
+        ("hello world", "helo world", None, 1),
+    ],
+)
+def test_auto_levenshtein_distance(output, correct_answer, threshold, expected):
+    settings_values = {"threshold": threshold} if threshold is not None else {}
+    result = auto_levenshtein_distance(
+        {}, output, correct_answer, {}, settings_values, {}
+    )
+    assert result.value == expected
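A minimal sketch of the new evaluator's two result modes, mirroring the unit tests above: a boolean Result when a "threshold" setting is present, the numeric distance otherwise. The module path and the Result fields (type, value) are taken from this patch; everything else is illustrative.

# --- illustrative usage sketch (not part of the patch) ---
from agenta_backend.services.evaluators_service import (
    auto_levenshtein_distance,
    levenshtein_distance,
)

# The plain helper returns the raw edit distance.
assert levenshtein_distance("hello world", "helo world") == 1

# With a threshold the evaluator returns a boolean Result.
result = auto_levenshtein_distance(
    {}, "hello world", "hola mundo", {}, {"threshold": 5}, {}
)
assert result.type == "bool" and result.value is False

# Without a threshold it returns the distance itself as a number.
result = auto_levenshtein_distance({}, "hello world", "helo world", {}, {}, {})
assert result.type == "number" and result.value == 1
# --- end sketch ---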
diff --git a/agenta-backend/pyproject.toml b/agenta-backend/pyproject.toml
index 9471926a51..fbd07a5474 100644
--- a/agenta-backend/pyproject.toml
+++ b/agenta-backend/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "agenta_backend"
-version = "0.12.3"
+version = "0.12.4"
 description = ""
 authors = ["Mahmoud Mabrouk "]
 readme = "README.md"
diff --git a/agenta-cli/pyproject.toml b/agenta-cli/pyproject.toml
index e9c50bafd8..5a7c126a1a 100644
--- a/agenta-cli/pyproject.toml
+++ b/agenta-cli/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "agenta"
-version = "0.12.3"
+version = "0.12.4"
 description = "The SDK for agenta is an open-source LLMOps platform."
 readme = "README.md"
 authors = ["Mahmoud Mabrouk "]
diff --git a/agenta-web/package-lock.json b/agenta-web/package-lock.json
index 86a835dbdc..aa2a4e4556 100644
--- a/agenta-web/package-lock.json
+++ b/agenta-web/package-lock.json
@@ -1,12 +1,12 @@
 {
   "name": "dashboard",
-  "version": "0.12.3",
+  "version": "0.12.4",
   "lockfileVersion": 3,
   "requires": true,
   "packages": {
     "": {
       "name": "dashboard",
-      "version": "0.12.3",
+      "version": "0.12.4",
       "dependencies": {
         "@ant-design/colors": "^7.0.0",
         "@ant-design/icons": "^5.0.1",
diff --git a/agenta-web/package.json b/agenta-web/package.json
index c11f4e7625..6f25692c17 100644
--- a/agenta-web/package.json
+++ b/agenta-web/package.json
@@ -1,6 +1,6 @@
 {
   "name": "agenta",
-  "version": "0.12.3",
+  "version": "0.12.4",
   "private": true,
   "engines": {
     "node": ">=18"
diff --git a/agenta-web/src/components/Evaluations/EvaluationCardView/index.tsx b/agenta-web/src/components/Evaluations/EvaluationCardView/index.tsx
index 45e114047c..f0097bc08e 100644
--- a/agenta-web/src/components/Evaluations/EvaluationCardView/index.tsx
+++ b/agenta-web/src/components/Evaluations/EvaluationCardView/index.tsx
@@ -361,33 +361,38 @@ const EvaluationCardView: React.FC = ({
- - isChat ? onChatChange(value) : onInputChange( {target: {value}} as any, scenarioId, scenario.inputs.findIndex((ip) => ip.input_name === name), ) - } - inputParams={ isChat ? [{name: "chat", value: chat} as any] : variantData[0].inputParams?.map((item) => ({ ...item, value: scenario.inputs.find( (ip) => ip.input_name === item.name, )?.input_value, })) || [] - } - key={scenarioId} - useChatDefaultValue - form={form} - onFinish={() => onRun(scenarioId)} - imageSize="large" - /> +
+ Inputs + + isChat + ? onChatChange(value) + : onInputChange( + {target: {value}} as any, + scenarioId, + scenario.inputs.findIndex( + (ip) => ip.input_name === name, + ), + ) + } + inputParams={ + isChat + ? [{name: "chat", value: chat} as any] + : variantData[0].inputParams?.map((item) => ({ + ...item, + value: scenario.inputs.find( + (ip) => ip.input_name === item.name, + )?.input_value, + })) || [] + } + key={scenarioId} + useChatDefaultValue + form={form} + onFinish={() => onRun(scenarioId)} + imageSize="large" + /> +
@@ -405,11 +410,17 @@ const EvaluationCardView: React.FC = ({
- {!isAbTesting && ( - - Model Response - - )} +
+ {!isAbTesting ? ( + + Model Response + + ) : ( + + Outputs + + )} +
({ }, })) -export function LongTextCellRenderer(params: ICellRendererParams) { +export function LongTextCellRenderer(params: ICellRendererParams, output?: any) { const {value, api, node} = params const [expanded, setExpanded] = useState( node.rowHeight !== api.getSizesForCurrentTheme().rowHeight, @@ -95,11 +95,11 @@ export function LongTextCellRenderer(params: ICellRendererParams) { cellsArr.forEach((cell) => { cell.setAttribute( "style", - "overflow: visible; white-space: pre-wrap; text-overflow: unset;", + "overflow: visible; white-space: pre-wrap; text-overflow: unset; line-height: 2.5em;", ) }) const height = Math.max(...cellsArr.map((cell) => cell.scrollHeight)) - node.setRowHeight(height <= defaultHeight ? defaultHeight * 2 : height) + node.setRowHeight(height <= defaultHeight ? defaultHeight * 2 : height + 10) } else { cellsArr.forEach((cell) => { cell.setAttribute( @@ -121,9 +121,9 @@ export function LongTextCellRenderer(params: ICellRendererParams) { return (
- {value} + {output ? output : value} {expanded ? ( diff --git a/agenta-web/src/components/pages/evaluations/evaluationCompare/EvaluationCompare.tsx b/agenta-web/src/components/pages/evaluations/evaluationCompare/EvaluationCompare.tsx index 52a9b498eb..547e7644b7 100644 --- a/agenta-web/src/components/pages/evaluations/evaluationCompare/EvaluationCompare.tsx +++ b/agenta-web/src/components/pages/evaluations/evaluationCompare/EvaluationCompare.tsx @@ -99,7 +99,7 @@ const EvaluationCompareMode: React.FC = () => { field: `inputs.${ix}.value` as any, ...getFilterParams("text"), pinned: "left", - cellRenderer: LongTextCellRenderer, + cellRenderer: (params: any) => LongTextCellRenderer(params), }) }) @@ -110,7 +110,7 @@ const EvaluationCompareMode: React.FC = () => { field: "correctAnswer", ...getFilterParams("text"), pinned: "left", - cellRenderer: LongTextCellRenderer, + cellRenderer: (params: any) => LongTextCellRenderer(params), }) variants.forEach((variant, vi) => { @@ -133,25 +133,28 @@ const EvaluationCompareMode: React.FC = () => { cellRenderer: (params: any) => { return ( <> - {showDiff === "show" ? ( - - - item.evaluationId === variant.evaluationId, - )?.output?.result, - )} - expectedOutput={params.data?.correctAnswer} - /> - - ) : ( - getTypedValue( - params.data?.variants.find( - (item: any) => item.evaluationId === variant.evaluationId, - )?.output?.result, - ) - )} + {showDiff === "show" + ? LongTextCellRenderer( + params, + + item.evaluationId === variant.evaluationId, + )?.output?.result, + )} + expectedOutput={params.data?.correctAnswer} + />, + ) + : LongTextCellRenderer( + params, + getTypedValue( + params.data?.variants.find( + (item: any) => + item.evaluationId === variant.evaluationId, + )?.output?.result, + ), + )} ) }, diff --git a/agenta-web/src/components/pages/evaluations/evaluationScenarios/EvaluationScenarios.tsx b/agenta-web/src/components/pages/evaluations/evaluationScenarios/EvaluationScenarios.tsx index 04993ce5ba..687ae40f10 100644 --- a/agenta-web/src/components/pages/evaluations/evaluationScenarios/EvaluationScenarios.tsx +++ b/agenta-web/src/components/pages/evaluations/evaluationScenarios/EvaluationScenarios.tsx @@ -71,7 +71,7 @@ const EvaluationScenarios: React.FC = () => { valueGetter: (params) => { return getTypedValue(params.data?.inputs[index]) }, - cellRenderer: LongTextCellRenderer, + cellRenderer: (params: any) => LongTextCellRenderer(params), }) }) colDefs.push({ @@ -83,7 +83,7 @@ const EvaluationScenarios: React.FC = () => { valueGetter: (params) => { return params.data?.correct_answer?.toString() || "" }, - cellRenderer: LongTextCellRenderer, + cellRenderer: (params: any) => LongTextCellRenderer(params), }) evalaution?.variants.forEach((_, index) => { colDefs.push({ @@ -97,14 +97,15 @@ const EvaluationScenarios: React.FC = () => { if (result && result.type == "error") { return `${result?.error?.message}\n${result?.error?.stacktrace}` } - return showDiff === "show" ? ( - - ) : ( - LongTextCellRenderer(params) - ) + return showDiff === "show" + ? 
LongTextCellRenderer( + params, + , + ) + : LongTextCellRenderer(params) }, valueGetter: (params) => { const result = params.data?.outputs[index].result diff --git a/agenta-web/src/components/pages/evaluations/evaluators/NewEvaluatorModal.tsx b/agenta-web/src/components/pages/evaluations/evaluators/NewEvaluatorModal.tsx index 860fd03ca2..bb67065b5b 100644 --- a/agenta-web/src/components/pages/evaluations/evaluators/NewEvaluatorModal.tsx +++ b/agenta-web/src/components/pages/evaluations/evaluators/NewEvaluatorModal.tsx @@ -117,12 +117,15 @@ const DynamicFormField: React.FC = ({ type, default: defaultVal, description, + min, + max, + required, }) => { const {appTheme} = useAppTheme() const classes = useStyles() const {token} = theme.useToken() - const rules: Rule[] = [{required: true, message: "This field is required"}] + const rules: Rule[] = [{required: required ?? true, message: "This field is required"}] if (type === "regex") rules.push({ validator: (_, value) => @@ -167,7 +170,7 @@ const DynamicFormField: React.FC = ({ {type === "string" || type === "regex" ? ( ) : type === "number" ? ( - + ) : type === "boolean" || type === "bool" ? ( ) : type === "text" ? ( @@ -295,25 +298,6 @@ const NewEvaluatorModal: React.FC = ({ ) }, }, - { - title: "Type", - dataIndex: "type", - key: "type", - render(_, record) { - const template = Object.keys(record?.settings_template || {}) - .filter((key) => !!record?.settings_template[key]?.type) - .map((key) => ({ - key, - ...record?.settings_template[key]!, - })) - - return ( - <> - {template[0].type} - - ) - }, - }, { title: "Description", dataIndex: "description", diff --git a/agenta-web/src/lib/Types.ts b/agenta-web/src/lib/Types.ts index d0df2873ea..b3d0632242 100644 --- a/agenta-web/src/lib/Types.ts +++ b/agenta-web/src/lib/Types.ts @@ -382,6 +382,9 @@ export interface EvaluationSettingsTemplate { label: string default?: ValueType description: string + min?: number + max?: number + required?: boolean } export interface Evaluator {
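Because auto_levenshtein_distance is added to the aggregate_float list in tasks/evaluations.py, thresholded runs aggregate boolean results while unthresholded runs aggregate numbers. Assuming aggregate_float averages the per-scenario result values (an assumption, its implementation is not shown in this diff), Python's bool-to-0/1 coercion makes the aggregate the fraction of scenarios within the threshold, or the mean distance when no threshold is set. A quick illustration:

# --- illustrative aggregation sketch (assumes aggregate_float is a plain mean) ---
bool_results = [True, False, True, True]      # per-scenario Results with a threshold
print(sum(map(float, bool_results)) / len(bool_results))    # 0.75 -> share within threshold

numeric_results = [1, 0, 4, 3]                # per-scenario Results without a threshold
print(sum(numeric_results) / len(numeric_results))           # 2.0 -> mean Levenshtein distance
# --- end sketch ---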
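The min, max, and required fields added to EvaluationSettingsTemplate are enforced in this patch on the client: DynamicFormField builds its antd required rule from required ?? true and, presumably, forwards min/max to the number input. A hypothetical server-side counterpart could read the same schema as follows (sketch only; validate_setting is not part of the codebase):

# --- hypothetical settings validation sketch ---
from typing import Any, Dict

def validate_setting(template: Dict[str, Any], value: Any) -> None:
    # Mirror the web form: a setting is required unless the template opts out.
    if value is None or value == "":
        if template.get("required", True):
            raise ValueError(f"{template['label']} is required")
        return
    # Numeric bounds apply only when the template declares them,
    # e.g. the 0-1 bounds added to the Similarity Match threshold.
    if template.get("type") == "number":
        if "min" in template and value < template["min"]:
            raise ValueError(f"{template['label']} must be >= {template['min']}")
        if "max" in template and value > template["max"]:
            raise ValueError(f"{template['label']} must be <= {template['max']}")

# Example: the Similarity Match threshold (min 0, max 1, required).
validate_setting(
    {"label": "Threshold", "type": "number", "min": 0, "max": 1, "required": True},
    0.5,
)
# --- end sketch ---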