diff --git a/.all-contributorsrc b/.all-contributorsrc
index d055b089f1..ecc7093dc3 100644
--- a/.all-contributorsrc
+++ b/.all-contributorsrc
@@ -417,6 +417,16 @@
       "contributions": [
         "doc"
       ]
+    },
+    {
+      "login": "LucasTrg",
+      "name": "LucasTrg",
+      "avatar_url": "https://avatars.githubusercontent.com/u/47852577?v=4",
+      "profile": "https://github.com/LucasTrg",
+      "contributions": [
+        "code",
+        "bug"
+      ]
     }
   ],
   "contributorsPerLine": 7,
diff --git a/README.md b/README.md
index 13e745b36a..62735b3dd7 100644
--- a/README.md
+++ b/README.md
@@ -169,7 +169,7 @@ Check out our [Contributing Guide](https://docs.agenta.ai/contributing/getting-s
 ## Contributors ✨
 
 <!-- ALL-CONTRIBUTORS-BADGE:START - Do not remove or modify this section -->
-[![All Contributors](https://img.shields.io/badge/all_contributors-44-orange.svg?style=flat-square)](#contributors-)
+[![All Contributors](https://img.shields.io/badge/all_contributors-45-orange.svg?style=flat-square)](#contributors-)
 <!-- ALL-CONTRIBUTORS-BADGE:END -->
 
 Thanks goes to these wonderful people ([emoji key](https://allcontributors.org/docs/en/emoji-key)):
@@ -236,6 +236,7 @@ Thanks goes to these wonderful people ([emoji key](https://allcontributors.org/d
     <tr>
       <td align="center" valign="top" width="14.28%"><a href="https://github.com/vishalvanpariya"><img src="https://avatars.githubusercontent.com/u/27823328?v=4?s=100" width="100px;" alt="Vishal Vanpariya"/><br /><sub><b>Vishal Vanpariya</b></sub></a><br /><a href="https://github.com/Agenta-AI/agenta/commits?author=vishalvanpariya" title="Code">💻</a></td>
       <td align="center" valign="top" width="14.28%"><a href="https://github.com/youcefs21"><img src="https://avatars.githubusercontent.com/u/34604972?v=4?s=100" width="100px;" alt="Youcef Boumar"/><br /><sub><b>Youcef Boumar</b></sub></a><br /><a href="https://github.com/Agenta-AI/agenta/commits?author=youcefs21" title="Documentation">📖</a></td>
+      <td align="center" valign="top" width="14.28%"><a href="https://github.com/LucasTrg"><img src="https://avatars.githubusercontent.com/u/47852577?v=4?s=100" width="100px;" alt="LucasTrg"/><br /><sub><b>LucasTrg</b></sub></a><br /><a href="https://github.com/Agenta-AI/agenta/commits?author=LucasTrg" title="Code">💻</a> <a href="https://github.com/Agenta-AI/agenta/issues?q=author%3ALucasTrg" title="Bug reports">🐛</a></td>
     </tr>
   </tbody>
 </table>
diff --git a/agenta-backend/agenta_backend/services/llm_apps_service.py b/agenta-backend/agenta_backend/services/llm_apps_service.py
index 4e7ab1a9c5..2accbfe509 100644
--- a/agenta-backend/agenta_backend/services/llm_apps_service.py
+++ b/agenta-backend/agenta_backend/services/llm_apps_service.py
@@ -172,6 +172,7 @@ async def run_with_retry(
             last_exception = e
             logger.info(f"Error processing datapoint: {input_data}. {str(e)}")
             logger.info("".join(traceback.format_exception_only(type(e), e)))
+            retries += 1
             common.capture_exception_in_sentry(e)
 
     # If max retries is reached or an exception that isn't in the second block,
@@ -186,7 +187,7 @@ async def run_with_retry(
         result=Result(
             type="error",
             value=None,
-            error=Error(message=exception_message, stacktrace=last_exception),
+            error=Error(message=exception_message, stacktrace=str(last_exception)),
         )
     )
 
diff --git a/agenta-backend/agenta_backend/services/security/sandbox.py b/agenta-backend/agenta_backend/services/security/sandbox.py
index 58a58f0d35..6a6988daa7 100644
--- a/agenta-backend/agenta_backend/services/security/sandbox.py
+++ b/agenta-backend/agenta_backend/services/security/sandbox.py
@@ -91,7 +91,7 @@ def execute_code_safely(
 
     # Call the evaluation function, extract the result if it exists
     # and is a float between 0 and 1
-    result = environment["evaluate"](app_params, inputs, correct_answer, output)
+    result = environment["evaluate"](app_params, inputs, output, correct_answer)
     if isinstance(result, float) and 0 <= result <= 1:
         return result
     return None
diff --git a/agenta-backend/agenta_backend/tests/unit/test_llm_apps_service.py b/agenta-backend/agenta_backend/tests/unit/test_llm_apps_service.py
new file mode 100644
index 0000000000..3462f7c94c
--- /dev/null
+++ b/agenta-backend/agenta_backend/tests/unit/test_llm_apps_service.py
@@ -0,0 +1,162 @@
+import pytest
+from unittest.mock import patch, AsyncMock
+import asyncio
+import aiohttp
+
+from agenta_backend.services.llm_apps_service import (
+    batch_invoke,
+    InvokationResult,
+    Result,
+    Error,
+)
+
+
+@pytest.mark.asyncio
+async def test_batch_invoke_success():
+    """
+    Check that batch_invoke returns one successful result per datapoint.
+
+    get_parameters_from_openapi, invoke_app and asyncio.sleep are patched
+    with AsyncMock, so the real implementations are not called. Every mocked
+    invocation returns a "text" result with the value "Success".
+    """
+    with patch(
+        "agenta_backend.services.llm_apps_service.get_parameters_from_openapi",
+        new_callable=AsyncMock,
+    ) as mock_get_parameters_from_openapi, patch(
+        "agenta_backend.services.llm_apps_service.invoke_app", new_callable=AsyncMock
+    ) as mock_invoke_app, patch(
+        "asyncio.sleep", new_callable=AsyncMock
+    ) as mock_sleep:
+        mock_get_parameters_from_openapi.return_value = [
+            {"name": "param1", "type": "input"},
+            {"name": "param2", "type": "input"},
+        ]
+
+        # Every call to the mocked invoke_app returns the same successful result
+        def invoke_app_side_effect(uri, datapoint, parameters, openapi_parameters):
+            return InvokationResult(
+                result=Result(type="text", value="Success", error=None),
+                latency=0.1,
+                cost=0.01,
+            )
+
+        mock_invoke_app.side_effect = invoke_app_side_effect
+
+        uri = "http://example.com"
+        testset_data = [
+            {"id": 1, "param1": "value1", "param2": "value2"},
+            {"id": 2, "param1": "value1", "param2": "value2"},
+        ]
+        parameters = {}
+        rate_limit_config = {
+            "batch_size": 10,
+            "max_retries": 3,
+            "retry_delay": 3,
+            "delay_between_batches": 5,
+        }
+
+        results = await batch_invoke(uri, testset_data, parameters, rate_limit_config)
+
+        assert len(results) == 2
+        assert results[0].result.type == "text"
+        assert results[0].result.value == "Success"
+        assert results[1].result.type == "text"
+        assert results[1].result.value == "Success"
+
+
+@pytest.mark.asyncio
+async def test_batch_invoke_retries_and_failure():
+    """
+    Check that batch_invoke reports an error when every call keeps failing.
+
+    The mocked invoke_app always raises aiohttp.ClientError. The test asserts
+    that each datapoint ends with an "error" result whose message is
+    "Max retries reached"; it does not assert how many attempts were made
+    (NOTE(review): consider also checking mock_invoke_app.call_count).
+    """
+    with patch(
+        "agenta_backend.services.llm_apps_service.get_parameters_from_openapi",
+        new_callable=AsyncMock,
+    ) as mock_get_parameters_from_openapi, patch(
+        "agenta_backend.services.llm_apps_service.invoke_app", new_callable=AsyncMock
+    ) as mock_invoke_app, patch(
+        "asyncio.sleep", new_callable=AsyncMock
+    ) as mock_sleep:
+        mock_get_parameters_from_openapi.return_value = [
+            {"name": "param1", "type": "input"},
+            {"name": "param2", "type": "input"},
+        ]
+
+        # Every call to the mocked invoke_app raises a client error
+        def invoke_app_side_effect(uri, datapoint, parameters, openapi_parameters):
+            raise aiohttp.ClientError("Test Error")
+
+        mock_invoke_app.side_effect = invoke_app_side_effect
+
+        uri = "http://example.com"
+        testset_data = [
+            {"id": 1, "param1": "value1", "param2": "value2"},
+            {"id": 2, "param1": "value1", "param2": "value2"},
+        ]
+        parameters = {}
+        rate_limit_config = {
+            "batch_size": 10,
+            "max_retries": 3,
+            "retry_delay": 3,
+            "delay_between_batches": 5,
+        }
+
+        results = await batch_invoke(uri, testset_data, parameters, rate_limit_config)
+
+        assert len(results) == 2
+        assert results[0].result.type == "error"
+        assert results[0].result.error.message == "Max retries reached"
+        assert results[1].result.type == "error"
+        assert results[1].result.error.message == "Max retries reached"
+
+
+@pytest.mark.asyncio
+async def test_batch_invoke_generic_exception():
+    """
+    Check that batch_invoke reports an error for a non-aiohttp exception.
+
+    The mocked invoke_app always raises a plain Exception. The test asserts
+    that the single datapoint ends with an "error" result whose message is
+    "Max retries reached", i.e. the same message as in the
+    aiohttp.ClientError test in this file.
+    """
+    with patch(
+        "agenta_backend.services.llm_apps_service.get_parameters_from_openapi",
+        new_callable=AsyncMock,
+    ) as mock_get_parameters_from_openapi, patch(
+        "agenta_backend.services.llm_apps_service.invoke_app", new_callable=AsyncMock
+    ) as mock_invoke_app, patch(
+        "asyncio.sleep", new_callable=AsyncMock
+    ) as mock_sleep:
+        mock_get_parameters_from_openapi.return_value = [
+            {"name": "param1", "type": "input"},
+            {"name": "param2", "type": "input"},
+        ]
+
+        # Every call to the mocked invoke_app raises a plain Exception
+        def invoke_app_side_effect(uri, datapoint, parameters, openapi_parameters):
+            raise Exception("Generic Error")
+
+        mock_invoke_app.side_effect = invoke_app_side_effect
+
+        uri = "http://example.com"
+        testset_data = [{"id": 1, "param1": "value1", "param2": "value2"}]
+        parameters = {}
+        rate_limit_config = {
+            "batch_size": 1,
+            "max_retries": 3,
+            "retry_delay": 1,
+            "delay_between_batches": 1,
+        }
+
+        results = await batch_invoke(uri, testset_data, parameters, rate_limit_config)
+
+        assert len(results) == 1
+        assert results[0].result.type == "error"
+        assert results[0].result.error.message == "Max retries reached"
diff --git a/agenta-cli/poetry.lock b/agenta-cli/poetry.lock
index 8319af3a35..ef3818ea17 100644
--- a/agenta-cli/poetry.lock
+++ b/agenta-cli/poetry.lock
@@ -1019,13 +1019,13 @@ zstd = ["zstandard"]
 
 [[package]]
 name = "pytest"
-version = "8.2.0"
+version = "8.2.1"
 description = "pytest: simple powerful testing with Python"
 optional = false
 python-versions = ">=3.8"
 files = [
-    {file = "pytest-8.2.0-py3-none-any.whl", hash = "sha256:1733f0620f6cda4095bbf0d9ff8022486e91892245bb9e7d5542c018f612f233"},
-    {file = "pytest-8.2.0.tar.gz", hash = "sha256:d507d4482197eac0ba2bae2e9babf0672eb333017bcedaa5fb1a3d42c1174b3f"},
+    {file = "pytest-8.2.1-py3-none-any.whl", hash = "sha256:faccc5d332b8c3719f40283d0d44aa5cf101cec36f88cde9ed8f2bc0538612b1"},
+    {file = "pytest-8.2.1.tar.gz", hash = "sha256:5046e5b46d8e4cac199c373041f26be56fdb81eb4e67dc11d4e10811fc3408fd"},
 ]
 
 [package.dependencies]
diff --git a/agenta-web/dev.Dockerfile b/agenta-web/dev.Dockerfile
index c155bc07db..1e5e0c16f5 100644
--- a/agenta-web/dev.Dockerfile
+++ b/agenta-web/dev.Dockerfile
@@ -1,21 +1,17 @@
-FROM node:18-alpine
+FROM node:22-alpine3.18 AS base
 
 WORKDIR /app
 
 # Install dependencies based on the preferred package manager
 COPY package.json yarn.lock* package-lock.json* pnpm-lock.yaml* ./
 RUN \
-    # echo "Standalone: $NEXT_PUBLIC_STANDALONE"; \
-    # if [[ ! $NEXT_PUBLIC_STANDALONE == "true" ]]; then  \
-    if [ -f yarn.lock ]; then yarn --frozen-lockfile; \
-    elif [ -f package-lock.json ]; then npm i; \
-    elif [ -f pnpm-lock.yaml ]; then yarn global add pnpm && pnpm i; \
-    # Allow install without lockfile, so example works even without Node.js installed locally
-    else echo "Warning: Lockfile not found. It is recommended to commit lockfiles to version control." && yarn install; \
+    if [ -f yarn.lock ]; then yarn install --frozen-lockfile; \
+    elif [ -f package-lock.json ]; then npm install; \
+    elif [ -f pnpm-lock.yaml ]; then npm install -g pnpm && pnpm install; \
+    else yarn install; \
     fi
-# else echo "NEXT_PUBLIC_STANDALONE is set, skipping install"; \
-# fi
 
+# Copy only the necessary files for development
 COPY src ./src
 COPY public ./public
 COPY next.config.js .
@@ -23,25 +19,31 @@ COPY tsconfig.json .
 COPY postcss.config.js .
 COPY tailwind.config.ts .
 COPY .env .
-RUN if [ -f .env.local ]; then cp .env.local .; fi
-# RUN if [ -f tailwind.config.ts ]; then cp tailwind.config.ts .; fi
-# # used in cloud
 COPY sentry.* .
-# Next.js collects completely anonymous telemetry data about general usage. Learn more here: https://nextjs.org/telemetry
-# Uncomment the following line to disable telemetry at run time
-# ENV NEXT_TELEMETRY_DISABLED 1
 
-# Note: Don't expose ports here, Compose will handle that for us
+# Stage 2: Development Stage
+FROM node:22-alpine3.18 AS dev
+
+WORKDIR /app
+
+# Copy dependencies and application files from the base stage
+COPY --from=base /app /app
+
+# Install dependencies; pnpm is not part of the node image, so install it in this stage before use
+RUN \
+    if [ -f yarn.lock ]; then yarn install; \
+    elif [ -f package-lock.json ]; then npm install; \
+    elif [ -f pnpm-lock.yaml ]; then npm install -g pnpm && pnpm install; \
+    else yarn install; \
+    fi
+
+# Expose the necessary ports
+EXPOSE 3000
 
 # Start Next.js in development mode based on the preferred package manager
 CMD \
-    # echo "Standalone: $NEXT_PUBLIC_STANDALONE"; \
-    # if [[ ! $NEXT_PUBLIC_STANDALONE == "true" ]]; then  \
     if [ -f yarn.lock ]; then yarn dev; \
     elif [ -f package-lock.json ]; then npm run dev; \
     elif [ -f pnpm-lock.yaml ]; then pnpm dev; \
     else yarn dev; \
     fi
-# else echo "NEXT_PUBLIC_STANDALONE is set, skipping run"; \
-# fi
-
diff --git a/agenta-web/prod.Dockerfile b/agenta-web/prod.Dockerfile
index dbe9b9da3c..a2b8e55c60 100644
--- a/agenta-web/prod.Dockerfile
+++ b/agenta-web/prod.Dockerfile
@@ -1,11 +1,11 @@
-FROM node:18-alpine
+# Stage 1: Build Stage
+FROM node:22-alpine3.18 AS builder
 
 WORKDIR /app
 
 # Install only production dependencies
 COPY package.json package-lock.json* ./
-RUN npm ci --omit=dev
-
+RUN npm ci
 # Copy only necessary files
 COPY src ./src
 COPY public ./public
@@ -14,10 +14,27 @@ COPY tsconfig.json .
 COPY postcss.config.js .
 COPY tailwind.config.ts .
 COPY .env.production .
-# used in cloud
 COPY sentry.* .
+
 # Build the Next.js app for production
 RUN npm run build
 
+# Stage 2: Production Stage
+FROM node:22-alpine3.18 AS prod
+
+WORKDIR /app
+
+# Copy only the necessary files from the build stage (a multi-source COPY needs a destination ending in "/")
+COPY --from=builder /app/package.json /app/package-lock.json* /app/
+COPY --from=builder /app/.next /app/.next
+COPY --from=builder /app/public /app/public
+COPY --from=builder /app/next.config.js /app/tsconfig.json /app/postcss.config.js /app/tailwind.config.ts /app/.env.production /app/sentry.* /app/
+
+# Install only production dependencies
+RUN npm ci --omit=dev
+
+# Expose the necessary port
+EXPOSE 3000
+
 # Start the production server
 CMD ["npm", "start"]
diff --git a/docker-compose.yml b/docker-compose.yml
index 476dede0cd..2832585b27 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -167,3 +167,4 @@ networks:
 volumes:
     mongodb_data:
     redis_data:
+    nextjs_cache:
\ No newline at end of file