feat(benchmark): Add -N, --attempts option for multiple attempts per challenge

LLMs are probabilistic systems, so reproducibility of completions is not guaranteed. It makes sense to account for this by running each challenge multiple times and recording a success ratio, rather than a single boolean success/failure result.
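In practice, this means a session can be started with e.g. `-N 3` (or `--attempts 3`) on the `run` command (assuming the package's usual `agbenchmark` entry point), and the report for each challenge then records a success percentage over those attempts instead of a single pass/fail.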

Changes:
- Add `-N`, `--attempts` option to CLI and `attempts_per_challenge` parameter to `main.py:run_benchmark`.
- Add dynamic `i_attempt` fixture through `pytest_generate_tests` hook in conftest.py to achieve multiple runs per challenge.
- Modify `pytest_runtest_makereport` hook in conftest.py to handle multiple reporting calls per challenge.
- Refactor report_types.py, reports.py, process_report.py to allow multiple results per challenge.
   - Calculate `success_percentage` from the results of the current run only, rather than from the full recorded success history (see the first sketch after this list).
   - Add docstrings to a number of models in report_types.py.
   - Allow `None` as a success value, e.g. for runs that did not render any results before being cut off.
- Make `SingletonReportManager` thread-safe (see the second sketch after this list).
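
First sketch: a minimal illustration of how a session-local success percentage could be derived from the new per-challenge `results` list. The helper name is hypothetical; the actual logic lives in reports.py, whose diff is not shown on this page, and it may treat undecided runs differently.

from agbenchmark.reports.processing.report_types import Test


def session_success_percentage(test: Test) -> float | None:
    # Hypothetical helper: success rate (0-100) over this session's attempts only.
    # `success` may be None for runs that were cut off before producing a result;
    # this sketch simply excludes such undecided attempts from the ratio.
    outcomes = [r.success for r in test.results if r.success is not None]
    if not outcomes:
        return None
    return 100 * sum(outcomes) / len(outcomes)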
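
Second sketch: the diff for the `SingletonReportManager` change is not included on this page. As a general illustration of the technique named in the last bullet, one common way to make singleton access thread-safe is to guard instance creation with a lock; this is a sketch of that pattern, not necessarily the approach taken in this commit.

import threading


class ThreadSafeSingletonSketch:
    # Illustrative only; not the actual SingletonReportManager implementation.

    _instance: "ThreadSafeSingletonSketch | None" = None
    _lock = threading.Lock()

    @classmethod
    def instance(cls) -> "ThreadSafeSingletonSketch":
        if cls._instance is None:
            with cls._lock:
                if cls._instance is None:  # re-check while holding the lock
                    cls._instance = cls()
        return cls._instance

    @classmethod
    def clear_instance(cls) -> None:
        with cls._lock:
            cls._instance = None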
Pwuts committed Jan 22, 2024
1 parent 488f40a commit a0cae78
Showing 12 changed files with 177 additions and 137 deletions.
6 changes: 6 additions & 0 deletions benchmark/agbenchmark/__main__.py
@@ -59,6 +59,9 @@ def start():


@cli.command(default=True)
@click.option(
"-N", "--attempts", default=1, help="Number of times to run each challenge."
)
@click.option(
"-c",
"--category",
@@ -107,6 +110,7 @@ def run(
test: tuple[str],
category: tuple[str],
skip_category: tuple[str],
attempts: int,
cutoff: Optional[int] = None,
backend: Optional[bool] = False,
# agent_path: Optional[Path] = None,
@@ -153,6 +157,7 @@ def run(
tests=test,
categories=category,
skip_categories=skip_category,
attempts_per_challenge=attempts,
cutoff=cutoff,
)

@@ -171,6 +176,7 @@ def run(
tests=test,
categories=category,
skip_categories=skip_category,
attempts_per_challenge=attempts,
cutoff=cutoff,
)

5 changes: 4 additions & 1 deletion benchmark/agbenchmark/challenges/base.py
@@ -47,7 +47,10 @@ def from_source_uri(cls, source_uri: str) -> type["BaseChallenge"]:

@abstractmethod
def test_method(
self, config: AgentBenchmarkConfig, request: pytest.FixtureRequest
self,
config: AgentBenchmarkConfig,
request: pytest.FixtureRequest,
i_attempt: int,
) -> None:
"""
Test method for use by Pytest-based benchmark sessions. Should return normally
5 changes: 4 additions & 1 deletion benchmark/agbenchmark/challenges/builtin.py
@@ -155,7 +155,10 @@ def from_source_uri(cls, source_uri: str) -> type["BuiltinChallenge"]:

@pytest.mark.asyncio
async def test_method(
self, config: AgentBenchmarkConfig, request: pytest.FixtureRequest
self,
config: AgentBenchmarkConfig,
request: pytest.FixtureRequest,
i_attempt: int,
) -> None:
if os.environ.get("HELICONE_API_KEY"):
from helicone.lock import HeliconeLockManager
5 changes: 4 additions & 1 deletion benchmark/agbenchmark/challenges/webarena.py
@@ -353,7 +353,10 @@ async def evaluate_task_state(

@pytest.mark.asyncio
async def test_method(
self, config: AgentBenchmarkConfig, request: pytest.FixtureRequest
self,
config: AgentBenchmarkConfig,
request: pytest.FixtureRequest,
i_attempt: int,
) -> None:
if os.environ.get("HELICONE_API_KEY"):
from helicone.lock import HeliconeLockManager
23 changes: 19 additions & 4 deletions benchmark/agbenchmark/conftest.py
@@ -12,10 +12,11 @@

from agbenchmark.challenges import OPTIONAL_CATEGORIES, BaseChallenge
from agbenchmark.config import AgentBenchmarkConfig
from agbenchmark.reports.processing.report_types import Test
from agbenchmark.reports.ReportManager import RegressionTestsTracker
from agbenchmark.reports.reports import (
finalize_test_report,
initialize_test_report,
add_test_result_to_report,
make_empty_test_report,
session_finish,
)
from agbenchmark.utils.data_types import Category
@@ -80,6 +81,7 @@ def pytest_addoption(parser: pytest.Parser) -> None:
Args:
parser: The Pytest CLI parser to which the command-line options are added.
"""
parser.addoption("-N", "--attempts", action="store")
parser.addoption("--no-dep", action="store_true")
parser.addoption("--mock", action="store_true")
parser.addoption("--host", default=None)
@@ -149,6 +151,9 @@ def mock(request: pytest.FixtureRequest) -> bool:
return request.config.getoption("--mock")


test_reports: dict[str, Test] = {}


def pytest_runtest_makereport(item: pytest.Item, call: pytest.CallInfo) -> None:
"""
Pytest hook that is called when a test report is being generated.
@@ -159,14 +164,19 @@ def pytest_runtest_makereport(item: pytest.Item, call: pytest.CallInfo) -> None:
call: The call object from which the test result is retrieved.
"""
challenge: type[BaseChallenge] = item.cls # type: ignore
challenge_id = challenge.info.eval_id

if challenge_id not in test_reports:
test_reports[challenge_id] = make_empty_test_report(challenge.info)

if call.when == "setup":
test_name = item.nodeid.split("::")[1]
item.user_properties.append(("test_name", test_name))
initialize_test_report(item, challenge.info)

if call.when == "call":
finalize_test_report(item, call, agbenchmark_config)
add_test_result_to_report(
test_reports[challenge_id], item, call, agbenchmark_config
)


def timeout_monitor(start_time: int) -> None:
@@ -205,6 +215,11 @@ def pytest_sessionfinish(session: pytest.Session) -> None:
session_finish(agbenchmark_config)


def pytest_generate_tests(metafunc: pytest.Metafunc):
if type(n := metafunc.config.getoption("-N")) is str:
metafunc.parametrize("i_attempt", range(int(n)))


def pytest_collection_modifyitems(
items: list[pytest.Item], config: pytest.Config
) -> None:
5 changes: 5 additions & 0 deletions benchmark/agbenchmark/main.py
@@ -21,6 +21,7 @@ def run_benchmark(
tests: tuple[str] = tuple(),
categories: tuple[str] = tuple(),
skip_categories: tuple[str] = tuple(),
attempts_per_challenge: int = 1,
mock: bool = False,
no_dep: bool = False,
no_cutoff: bool = False,
@@ -96,6 +97,9 @@
if active:
pytest_args.append(flag)

if attempts_per_challenge > 1:
pytest_args.append(f"--attempts={attempts_per_challenge}")

if cutoff:
pytest_args.append(f"--cutoff={cutoff}")
logger.debug(f"Setting cuttoff override to {cutoff} seconds.")
@@ -104,6 +108,7 @@
pytest_args.append(str(current_dir / "generate_test.py"))

pytest_args.append("--cache-clear")
logger.debug(f"Running Pytest with args: {pytest_args}")
exit_code = pytest.main(pytest_args)

SingletonReportManager.clear_instance()
24 changes: 16 additions & 8 deletions benchmark/agbenchmark/reports/ReportManager.py
@@ -10,7 +10,9 @@

from agbenchmark.config import AgentBenchmarkConfig
from agbenchmark.reports.processing.graphs import save_single_radar_chart
from agbenchmark.reports.processing.process_report import get_agent_category
from agbenchmark.reports.processing.process_report import (
get_highest_achieved_difficulty_per_category,
)
from agbenchmark.reports.processing.report_types import MetricsOverall, Report, Test
from agbenchmark.utils.utils import get_highest_success_difficulty

@@ -79,7 +81,6 @@ def load(self) -> None:
except json.decoder.JSONDecodeError as e:
logger.warning(f"Could not parse {self.report_file}: {e}")
self.tests = {}
self.save()

def save(self) -> None:
with self.report_file.open("w") as f:
@@ -113,6 +114,13 @@ def save(self) -> None:
else:
json.dump({k: v.dict() for k, v in self.tests.items()}, f, indent=4)

def load(self) -> None:
super().load()
if "tests" in self.tests: # type: ignore
self.tests = Report.parse_obj(self.tests)
else:
self.tests = {n: Test.parse_obj(d) for n, d in self.tests.items()}

def add_test_report(self, test_name: str, test_report: Test) -> None:
if isinstance(self.tests, Report):
raise RuntimeError("Session report already finalized")
@@ -148,7 +156,7 @@ def finalize_session_report(self, config: AgentBenchmarkConfig) -> None:
config=config.dict(exclude_none=True),
)

agent_categories = get_agent_category(self.tests)
agent_categories = get_highest_achieved_difficulty_per_category(self.tests)
if len(agent_categories) > 1:
save_single_radar_chart(
agent_categories,
@@ -166,7 +174,7 @@ def get_total_costs(self):
total_cost = 0
all_costs_none = True
for test_data in tests.values():
cost = test_data.metrics.cost or 0.0
cost = sum(r.cost or 0 for r in test_data.results)

if cost is not None: # check if cost is not None
all_costs_none = False
@@ -184,8 +192,8 @@ class RegressionTestsTracker(BaseReportManager):
def add_test(self, test_name: str, test_details: dict) -> None:
if test_name.startswith("Test"):
test_name = test_name[4:]
self.tests[test_name] = test_details

self.tests[test_name] = test_details
self.save()

def has_regression_test(self, test_name: str) -> bool:
@@ -195,11 +203,11 @@ def has_regression_test(self, test_name: str) -> bool:
class SuccessRatesTracker(BaseReportManager):
"""Abstracts interaction with the regression tests file"""

tests: dict[str, list[bool]]
tests: dict[str, list[bool | None]]

def update(self, test_name: str, success_history: list[bool]) -> None:
def update(self, test_name: str, success_history: list[bool | None]) -> None:
if test_name.startswith("Test"):
test_name = test_name[4:]
self.tests[test_name] = success_history

self.tests[test_name] = success_history
self.save()
25 changes: 11 additions & 14 deletions benchmark/agbenchmark/reports/processing/process_report.py
@@ -34,34 +34,31 @@ def get_reports_data(report_path: str) -> dict[str, Any]:
return reports_data


def get_agent_category(report: Report) -> dict[str, Any]:
def get_highest_achieved_difficulty_per_category(report: Report) -> dict[str, Any]:
categories: dict[str, Any] = {}

def get_highest_category_difficulty(data: Test) -> None:
for category in data.category:
if (
category == "interface"
or category == "iterate"
or category == "product_advisor"
):
for _, test_data in report.tests.items():
for category in test_data.category:
if category in ("interface", "iterate", "product_advisor"):
continue
categories.setdefault(category, 0)
if data.metrics.success and data.metrics.difficulty:
num_dif = STRING_DIFFICULTY_MAP[data.metrics.difficulty]
if (
test_data.results
and all(r.success for r in test_data.results)
and test_data.difficulty
):
num_dif = STRING_DIFFICULTY_MAP[test_data.difficulty]
if num_dif > categories[category]:
categories[category] = num_dif

for _, test_data in report.tests.items():
get_highest_category_difficulty(test_data)

return categories


def all_agent_categories(reports_data: dict[str, Any]) -> dict[str, Any]:
all_categories: dict[str, Any] = {}

for name, report in reports_data.items():
categories = get_agent_category(report)
categories = get_highest_achieved_difficulty_per_category(report)
if categories: # only add to all_categories if categories is not empty
logger.debug(f"Adding {name}: {categories}")
all_categories[name] = categories
68 changes: 45 additions & 23 deletions benchmark/agbenchmark/reports/processing/report_types.py
@@ -1,46 +1,74 @@
"""
Model definitions used internally and for reports generated during command-line runs.
"""

from typing import Any, Dict, List

from pydantic import BaseModel, Field, constr, validator

datetime_format = r"^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\+00:00$"


class Metrics(BaseModel):
difficulty: str | None
class TestResult(BaseModel):
"""Result details for a single run of a test/challenge."""

success: bool | None = None
"""Whether the run was successful"""
run_time: str | None = None
"""The (formatted) duration of the run"""
fail_reason: str | None = None
success_percentage: float | None = Field(default=None, alias="success_%")
attempted: bool
"""If applicable, the reason why the run was not successful"""
reached_cutoff: bool | None = None # None if in progress
"""Whether the run had to be stopped due to reaching the timeout"""
cost: float | None = None
"""The (known) cost incurred by the run, e.g. from using paid LLM APIs"""

@validator("attempted")
def require_metrics_if_attempted(cls, v: bool, values: dict[str, Any]):
required_fields_if_attempted = ["success", "run_time"]
@validator("fail_reason")
def success_xor_fail_reason(cls, v: str | None, values: dict[str, Any]):
if v:
for f in required_fields_if_attempted:
assert (
values.get(f) is not None
), f"'{f}' must be defined if attempted is True"
success = values["success"]
assert not success, "fail_reason must only be specified if success=False"
else:
assert values["success"], "fail_reason is required if success=False"
return v


class TestMetrics(BaseModel):
"""
Result metrics for a set of runs for a test/challenge. Should be an aggregate of all
results for the same test/challenge within a benchmarking session.
"""

attempted: bool
"""Whether the challenge was attempted during this session"""
is_regression: bool
"""Whether the challenge was considered a regression test at the time of running"""
success_percentage: float | None = Field(default=None, alias="success_%")
"""Success rate (0-100) for this challenge within the session"""


class MetricsOverall(BaseModel):
"""Global metrics concerning a benchmarking session"""

run_time: str
"""Duration from beginning to end of the session"""
highest_difficulty: str
percentage: float | None = None
"""
Difficulty of the most difficult challenge that succeeded at least once this session
"""
total_cost: float | None = None
"""Total known cost of the session"""


class Test(BaseModel):
category: List[str]
difficulty: str | None
data_path: str
is_regression: bool
answer: str
description: str
metrics: Metrics
category: List[str]
task: str
reached_cutoff: bool | None = None # None if in progress
answer: str
metrics: TestMetrics
results: list[TestResult]
metadata: dict[str, Any] | None = Field(default_factory=dict)


@@ -57,9 +85,3 @@ class ReportBase(BaseModel):

class Report(ReportBase):
tests: Dict[str, Test]


class ReportV2(Test, ReportBase):
test_name: str
run_id: str | None
team_name: str | None