feat(benchmark): Add -N, --attempts option for multiple attempts per challenge

LLMs are probabilistic systems, so reproducibility of completions is not guaranteed. It makes sense to account for this by running each challenge multiple times and recording a success ratio, rather than a single boolean success/failure result.
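In practice, this means a session can be started with e.g. `-N 3` (or `--attempts 3`) on the `run` command (assuming the package's usual `agbenchmark` entry point), and the report for each challenge then records a success percentage over those attempts instead of a single pass/fail.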

Changes:
- Add `-N`, `--attempts` option to CLI and `attempts_per_challenge` parameter to `main.py:run_benchmark`.
- Add dynamic `i_attempt` fixture through `pytest_generate_tests` hook in conftest.py to achieve multiple runs per challenge.
- Modify `pytest_runtest_makereport` hook in conftest.py to handle multiple reporting calls per challenge.
- Refactor report_types.py, reports.py, process_report.py to allow multiple results per challenge.
   - Calculate `success_percentage` from the results of the current run only, rather than from the full recorded success history (see the first sketch after this list).
   - Add docstrings to a number of models in report_types.py.
   - Allow `None` as a success value, e.g. for runs that did not render any results before being cut off.
- Make `SingletonReportManager` thread-safe (see the second sketch after this list).
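
First sketch: a minimal illustration of how a session-local success percentage could be derived from the new per-challenge `results` list. The helper name is hypothetical; the actual logic lives in reports.py, whose diff is not shown on this page, and it may treat undecided runs differently.

from agbenchmark.reports.processing.report_types import Test


def session_success_percentage(test: Test) -> float | None:
    # Hypothetical helper: success rate (0-100) over this session's attempts only.
    # `success` may be None for runs that were cut off before producing a result;
    # this sketch simply excludes such undecided attempts from the ratio.
    outcomes = [r.success for r in test.results if r.success is not None]
    if not outcomes:
        return None
    return 100 * sum(outcomes) / len(outcomes)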
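
Second sketch: the diff for the `SingletonReportManager` change is not included on this page. As a general illustration of the technique named in the last bullet, one common way to make singleton access thread-safe is to guard instance creation with a lock; this is a sketch of that pattern, not necessarily the approach taken in this commit.

import threading


class ThreadSafeSingletonSketch:
    # Illustrative only; not the actual SingletonReportManager implementation.

    _instance: "ThreadSafeSingletonSketch | None" = None
    _lock = threading.Lock()

    @classmethod
    def instance(cls) -> "ThreadSafeSingletonSketch":
        if cls._instance is None:
            with cls._lock:
                if cls._instance is None:  # re-check while holding the lock
                    cls._instance = cls()
        return cls._instance

    @classmethod
    def clear_instance(cls) -> None:
        with cls._lock:
            cls._instance = None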
Pwuts committed Jan 22, 2024
1 parent 488f40a commit a0cae78
Showing 12 changed files with 177 additions and 137 deletions.
6 changes: 6 additions & 0 deletions benchmark/agbenchmark/__main__.py
@@ -59,6 +59,9 @@ def start():


@cli.command(default=True)
@click.option(
"-N", "--attempts", default=1, help="Number of times to run each challenge."
)
@click.option(
"-c",
"--category",
@@ -107,6 +110,7 @@ def run(
test: tuple[str],
category: tuple[str],
skip_category: tuple[str],
attempts: int,
cutoff: Optional[int] = None,
backend: Optional[bool] = False,
# agent_path: Optional[Path] = None,
@@ -153,6 +157,7 @@ def run(
tests=test,
categories=category,
skip_categories=skip_category,
attempts_per_challenge=attempts,
cutoff=cutoff,
)

@@ -171,6 +176,7 @@ def run(
tests=test,
categories=category,
skip_categories=skip_category,
attempts_per_challenge=attempts,
cutoff=cutoff,
)

5 changes: 4 additions & 1 deletion benchmark/agbenchmark/challenges/base.py
@@ -47,7 +47,10 @@ def from_source_uri(cls, source_uri: str) -> type["BaseChallenge"]:

@abstractmethod
def test_method(
self, config: AgentBenchmarkConfig, request: pytest.FixtureRequest
self,
config: AgentBenchmarkConfig,
request: pytest.FixtureRequest,
i_attempt: int,
) -> None:
"""
Test method for use by Pytest-based benchmark sessions. Should return normally
5 changes: 4 additions & 1 deletion benchmark/agbenchmark/challenges/builtin.py
@@ -155,7 +155,10 @@ def from_source_uri(cls, source_uri: str) -> type["BuiltinChallenge"]:

@pytest.mark.asyncio
async def test_method(
self, config: AgentBenchmarkConfig, request: pytest.FixtureRequest
self,
config: AgentBenchmarkConfig,
request: pytest.FixtureRequest,
i_attempt: int,
) -> None:
if os.environ.get("HELICONE_API_KEY"):
from helicone.lock import HeliconeLockManager
5 changes: 4 additions & 1 deletion benchmark/agbenchmark/challenges/webarena.py
@@ -353,7 +353,10 @@ async def evaluate_task_state(

@pytest.mark.asyncio
async def test_method(
self, config: AgentBenchmarkConfig, request: pytest.FixtureRequest
self,
config: AgentBenchmarkConfig,
request: pytest.FixtureRequest,
i_attempt: int,
) -> None:
if os.environ.get("HELICONE_API_KEY"):
from helicone.lock import HeliconeLockManager
23 changes: 19 additions & 4 deletions benchmark/agbenchmark/conftest.py
@@ -12,10 +12,11 @@

from agbenchmark.challenges import OPTIONAL_CATEGORIES, BaseChallenge
from agbenchmark.config import AgentBenchmarkConfig
from agbenchmark.reports.processing.report_types import Test
from agbenchmark.reports.ReportManager import RegressionTestsTracker
from agbenchmark.reports.reports import (
finalize_test_report,
initialize_test_report,
add_test_result_to_report,
make_empty_test_report,
session_finish,
)
from agbenchmark.utils.data_types import Category
@@ -80,6 +81,7 @@ def pytest_addoption(parser: pytest.Parser) -> None:
Args:
parser: The Pytest CLI parser to which the command-line options are added.
"""
parser.addoption("-N", "--attempts", action="store")
parser.addoption("--no-dep", action="store_true")
parser.addoption("--mock", action="store_true")
parser.addoption("--host", default=None)
@@ -149,6 +151,9 @@ def mock(request: pytest.FixtureRequest) -> bool:
return request.config.getoption("--mock")


test_reports: dict[str, Test] = {}


def pytest_runtest_makereport(item: pytest.Item, call: pytest.CallInfo) -> None:
"""
Pytest hook that is called when a test report is being generated.
@@ -159,14 +164,19 @@ def pytest_runtest_makereport(item: pytest.Item, call: pytest.CallInfo) -> None:
call: The call object from which the test result is retrieved.
"""
challenge: type[BaseChallenge] = item.cls # type: ignore
challenge_id = challenge.info.eval_id

if challenge_id not in test_reports:
test_reports[challenge_id] = make_empty_test_report(challenge.info)

if call.when == "setup":
test_name = item.nodeid.split("::")[1]
item.user_properties.append(("test_name", test_name))
initialize_test_report(item, challenge.info)

if call.when == "call":
finalize_test_report(item, call, agbenchmark_config)
add_test_result_to_report(
test_reports[challenge_id], item, call, agbenchmark_config
)


def timeout_monitor(start_time: int) -> None:
@@ -205,6 +215,11 @@ def pytest_sessionfinish(session: pytest.Session) -> None:
session_finish(agbenchmark_config)


def pytest_generate_tests(metafunc: pytest.Metafunc):
if type(n := metafunc.config.getoption("-N")) is str:
metafunc.parametrize("i_attempt", range(int(n)))


def pytest_collection_modifyitems(
items: list[pytest.Item], config: pytest.Config
) -> None:
5 changes: 5 additions & 0 deletions benchmark/agbenchmark/main.py
@@ -21,6 +21,7 @@ def run_benchmark(
tests: tuple[str] = tuple(),
categories: tuple[str] = tuple(),
skip_categories: tuple[str] = tuple(),
attempts_per_challenge: int = 1,
mock: bool = False,
no_dep: bool = False,
no_cutoff: bool = False,
@@ -96,6 +97,9 @@
if active:
pytest_args.append(flag)

if attempts_per_challenge > 1:
pytest_args.append(f"--attempts={attempts_per_challenge}")

if cutoff:
pytest_args.append(f"--cutoff={cutoff}")
logger.debug(f"Setting cuttoff override to {cutoff} seconds.")
@@ -104,6 +108,7 @@
pytest_args.append(str(current_dir / "generate_test.py"))

pytest_args.append("--cache-clear")
logger.debug(f"Running Pytest with args: {pytest_args}")
exit_code = pytest.main(pytest_args)

SingletonReportManager.clear_instance()
24 changes: 16 additions & 8 deletions benchmark/agbenchmark/reports/ReportManager.py
@@ -10,7 +10,9 @@

from agbenchmark.config import AgentBenchmarkConfig
from agbenchmark.reports.processing.graphs import save_single_radar_chart
from agbenchmark.reports.processing.process_report import get_agent_category
from agbenchmark.reports.processing.process_report import (
get_highest_achieved_difficulty_per_category,
)
from agbenchmark.reports.processing.report_types import MetricsOverall, Report, Test
from agbenchmark.utils.utils import get_highest_success_difficulty

@@ -79,7 +81,6 @@ def load(self) -> None:
except json.decoder.JSONDecodeError as e:
logger.warning(f"Could not parse {self.report_file}: {e}")
self.tests = {}
self.save()

def save(self) -> None:
with self.report_file.open("w") as f:
@@ -113,6 +114,13 @@ def save(self) -> None:
else:
json.dump({k: v.dict() for k, v in self.tests.items()}, f, indent=4)

def load(self) -> None:
super().load()
if "tests" in self.tests: # type: ignore
self.tests = Report.parse_obj(self.tests)
else:
self.tests = {n: Test.parse_obj(d) for n, d in self.tests.items()}

def add_test_report(self, test_name: str, test_report: Test) -> None:
if isinstance(self.tests, Report):
raise RuntimeError("Session report already finalized")
@@ -148,7 +156,7 @@ def finalize_session_report(self, config: AgentBenchmarkConfig) -> None:
config=config.dict(exclude_none=True),
)

agent_categories = get_agent_category(self.tests)
agent_categories = get_highest_achieved_difficulty_per_category(self.tests)
if len(agent_categories) > 1:
save_single_radar_chart(
agent_categories,
@@ -166,7 +174,7 @@ def get_total_costs(self):
total_cost = 0
all_costs_none = True
for test_data in tests.values():
cost = test_data.metrics.cost or 0.0
cost = sum(r.cost or 0 for r in test_data.results)

if cost is not None: # check if cost is not None
all_costs_none = False
@@ -184,8 +192,8 @@ class RegressionTestsTracker(BaseReportManager):
def add_test(self, test_name: str, test_details: dict) -> None:
if test_name.startswith("Test"):
test_name = test_name[4:]
self.tests[test_name] = test_details

self.tests[test_name] = test_details
self.save()

def has_regression_test(self, test_name: str) -> bool:
@@ -195,11 +203,11 @@ def has_regression_test(self, test_name: str) -> bool:
class SuccessRatesTracker(BaseReportManager):
"""Abstracts interaction with the regression tests file"""

tests: dict[str, list[bool]]
tests: dict[str, list[bool | None]]

def update(self, test_name: str, success_history: list[bool]) -> None:
def update(self, test_name: str, success_history: list[bool | None]) -> None:
if test_name.startswith("Test"):
test_name = test_name[4:]
self.tests[test_name] = success_history

self.tests[test_name] = success_history
self.save()
25 changes: 11 additions & 14 deletions benchmark/agbenchmark/reports/processing/process_report.py
@@ -34,34 +34,31 @@ def get_reports_data(report_path: str) -> dict[str, Any]:
return reports_data


def get_agent_category(report: Report) -> dict[str, Any]:
def get_highest_achieved_difficulty_per_category(report: Report) -> dict[str, Any]:
categories: dict[str, Any] = {}

def get_highest_category_difficulty(data: Test) -> None:
for category in data.category:
if (
category == "interface"
or category == "iterate"
or category == "product_advisor"
):
for _, test_data in report.tests.items():
for category in test_data.category:
if category in ("interface", "iterate", "product_advisor"):
continue
categories.setdefault(category, 0)
if data.metrics.success and data.metrics.difficulty:
num_dif = STRING_DIFFICULTY_MAP[data.metrics.difficulty]
if (
test_data.results
and all(r.success for r in test_data.results)
and test_data.difficulty
):
num_dif = STRING_DIFFICULTY_MAP[test_data.difficulty]
if num_dif > categories[category]:
categories[category] = num_dif

for _, test_data in report.tests.items():
get_highest_category_difficulty(test_data)

return categories


def all_agent_categories(reports_data: dict[str, Any]) -> dict[str, Any]:
all_categories: dict[str, Any] = {}

for name, report in reports_data.items():
categories = get_agent_category(report)
categories = get_highest_achieved_difficulty_per_category(report)
if categories: # only add to all_categories if categories is not empty
logger.debug(f"Adding {name}: {categories}")
all_categories[name] = categories
68 changes: 45 additions & 23 deletions benchmark/agbenchmark/reports/processing/report_types.py
@@ -1,46 +1,74 @@
"""
Model definitions used internally and for reports generated during command-line runs.
"""

from typing import Any, Dict, List

from pydantic import BaseModel, Field, constr, validator

datetime_format = r"^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\+00:00$"


class Metrics(BaseModel):
difficulty: str | None
class TestResult(BaseModel):
"""Result details for a single run of a test/challenge."""

success: bool | None = None
"""Whether the run was successful"""
run_time: str | None = None
"""The (formatted) duration of the run"""
fail_reason: str | None = None
success_percentage: float | None = Field(default=None, alias="success_%")
attempted: bool
"""If applicable, the reason why the run was not successful"""
reached_cutoff: bool | None = None # None if in progress
"""Whether the run had to be stopped due to reaching the timeout"""
cost: float | None = None
"""The (known) cost incurred by the run, e.g. from using paid LLM APIs"""

@validator("attempted")
def require_metrics_if_attempted(cls, v: bool, values: dict[str, Any]):
required_fields_if_attempted = ["success", "run_time"]
@validator("fail_reason")
def success_xor_fail_reason(cls, v: str | None, values: dict[str, Any]):
if v:
for f in required_fields_if_attempted:
assert (
values.get(f) is not None
), f"'{f}' must be defined if attempted is True"
success = values["success"]
assert not success, "fail_reason must only be specified if success=False"
else:
assert values["success"], "fail_reason is required if success=False"
return v


class TestMetrics(BaseModel):
"""
Result metrics for a set of runs for a test/challenge. Should be an aggregate of all
results for the same test/challenge within a benchmarking session.
"""

attempted: bool
"""Whether the challenge was attempted during this session"""
is_regression: bool
"""Whether the challenge was considered a regression test at the time of running"""
success_percentage: float | None = Field(default=None, alias="success_%")
"""Success rate (0-100) for this challenge within the session"""


class MetricsOverall(BaseModel):
"""Global metrics concerning a benchmarking session"""

run_time: str
"""Duration from beginning to end of the session"""
highest_difficulty: str
percentage: float | None = None
"""
Difficulty of the most difficult challenge that succeeded at least once this session
"""
total_cost: float | None = None
"""Total known cost of the session"""


class Test(BaseModel):
category: List[str]
difficulty: str | None
data_path: str
is_regression: bool
answer: str
description: str
metrics: Metrics
category: List[str]
task: str
reached_cutoff: bool | None = None # None if in progress
answer: str
metrics: TestMetrics
results: list[TestResult]
metadata: dict[str, Any] | None = Field(default_factory=dict)


@@ -57,9 +85,3 @@ class ReportBase(BaseModel):

class Report(ReportBase):
tests: Dict[str, Test]


class ReportV2(Test, ReportBase):
test_name: str
run_id: str | None
team_name: str | None