Refactor state (#31)
* Refactor and add more tests

* Adjust report_v2 to new trajectory format
aorwall authored Aug 6, 2024
1 parent 6b16fd0 commit a50e3ef
Showing 38 changed files with 2,552 additions and 1,788 deletions.
4 changes: 3 additions & 1 deletion .gitignore
@@ -157,4 +157,6 @@ cython_debug/
notebooks/.ipynb_checkpoints/
notebooks/local_experiments.ipynb

playground
playground
logs
Pipfile
6 changes: 3 additions & 3 deletions moatless/benchmark/claude_evaluation.py
@@ -4,7 +4,7 @@

import instructor

from moatless import Transitions
from moatless.transition_rules import TransitionRules
from moatless.benchmark.evaluation import create_evaluation_name, Evaluation
from moatless.edit.edit import EditCode
from moatless.edit.plan import PlanToCode
@@ -170,7 +170,7 @@ def run_evaluation():


def evaluate_search():
transitions = Transitions(
transitions = TransitionRules(
global_params=global_params,
state_params={
SearchCode: {"max_search_results": 50, "provide_initial_context": True},
@@ -280,7 +280,7 @@ def evaluate_coding():


def evaluate_plan(previous_trajectory_dir: Optional[str] = None):
transitions = Transitions(
transitions = TransitionRules(
global_params=global_params,
state_params={
SearchCode: {
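The only functional change in this file is the rename of the transition container: the old Transitions import and constructor calls are replaced by TransitionRules from the new moatless.transition_rules module, with the keyword arguments left as they were. A minimal sketch of the renamed call, truncated to the arguments visible in the hunks above; the SearchCode import path and the global_params values are assumptions for illustration, not taken from the commit:

from moatless.transition_rules import TransitionRules
from moatless.find.search import SearchCode  # import path assumed for this sketch

# Placeholder values; the real global_params dict is defined earlier in claude_evaluation.py.
global_params = {"model": "claude-3-opus-20240229", "temperature": 0.0}

# Same keyword arguments as the old Transitions call; only the class and its module changed.
transitions = TransitionRules(
    global_params=global_params,
    state_params={
        SearchCode: {"max_search_results": 50, "provide_initial_context": True},
    },
)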
60 changes: 30 additions & 30 deletions moatless/benchmark/evaluation.py
@@ -7,14 +7,15 @@
import traceback
from collections import defaultdict
from datetime import datetime, timezone
from typing import Optional
from typing import Optional, Tuple

import instructor
import litellm
import pandas as pd
from tqdm.auto import tqdm

from moatless.benchmark.report_v2 import to_result, generate_md_report
from moatless.trajectory import Trajectory
from moatless.transition_rules import TransitionRules
from moatless.benchmark.swebench import (
found_in_alternative_spans,
@@ -82,6 +83,7 @@ def __init__(
max_transitions: int = 25,
max_expansions: int = 2,
max_file_context_tokens: int = 16000,
markdown_report: bool = False,
litellm_callback: Optional[str] = None,
previous_trajectory_dir: Optional[str] = None,
retry_state: Optional[str] = None,
@@ -93,6 +95,7 @@
self.evaluations_dir = evaluations_dir
self.num_workers = num_workers
self.detailed_report = detailed_report
self.markdown_report = markdown_report

self.evaluation_name = evaluation_name
self.max_file_context_tokens = max_file_context_tokens
@@ -193,22 +196,21 @@ def run_single_instance(
instance_id: str,
dataset: str = "princeton-nlp/SWE-bench_Lite",
split="test",
):
) -> dict:
instance = load_instance(instance_id, dataset, split)
return self._evaluate_instance(instance)
trajectory = self._evaluate_instance(instance)
return to_result(instance, trajectory, self.report)

def _evaluate_instance(self, instance: dict, retry: bool = False) -> dict:
def _evaluate_instance(self, instance: dict, retry: bool = False) -> Trajectory:
instance_id = instance["instance_id"]
trajectory_path = os.path.join(self.trajectory_dir, f"{instance_id}.json")
prompt_log_dir = os.path.join(self.logs_dir, f"{instance_id}")
if not os.path.exists(prompt_log_dir):
os.makedirs(prompt_log_dir)

if os.path.exists(trajectory_path) and not retry:
with open(trajectory_path) as file:
trajectory = json.load(file)
if trajectory["info"].get("status") or trajectory["info"].get("error"):
return trajectory
# TODO: Retry when failed or not finished?
return Trajectory.load(trajectory_path)

repo_dir = setup_swebench_repo(instance)
persist_dir = os.path.join(self.index_store_dir, get_repo_dir_name(instance_id))
@@ -284,31 +286,30 @@ def _evaluate_instance(self, instance: dict, retry: bool = False) -> dict:
info["submission"] = diff

loop.trajectory.save_info(info)
return loop.trajectory.to_dict()
return loop.trajectory

def _process_instance(self, instance):
def _process_instance(self, instance) -> Tuple[dict, str]:
trajectory = self._evaluate_instance(instance)
if not trajectory:
return None, None, None

result, transition_result = to_result(instance, trajectory, self.report)
submission = trajectory["info"].get("submission", "")
result = to_result(instance, trajectory, self.report)
submission = trajectory.info.get("submission", "")

try:
md_report = generate_md_report(trajectory, instance)
if not os.path.exists(f"{self.evaluation_dir}/reports"):
os.makedirs(f"{self.evaluation_dir}/reports")
with open(
f"{self.evaluation_dir}/reports/{instance['instance_id']}.md",
"w",
) as file:
file.write(md_report)
except Exception:
logging.exception(
f"Error in generating report for {instance['instance_id']} "
)
if self.markdown_report:
try:
md_report = generate_md_report(trajectory, instance)
if not os.path.exists(f"{self.evaluation_dir}/reports"):
os.makedirs(f"{self.evaluation_dir}/reports")
with open(
f"{self.evaluation_dir}/reports/{instance['instance_id']}.md",
"w",
) as file:
file.write(md_report)
except Exception:
logging.exception(
f"Error in generating report for {instance['instance_id']} "
)

return result, transition_result, submission
return result, submission

def _process_repo_group(self, repo, instances):
results = []
@@ -322,9 +323,8 @@ def _process_repo_group(self, repo, instances):
if not trajectory:
return None, None

result, transition_result = to_result(instance, trajectory, report=self.report)
result = to_result(instance, trajectory, report=self.report)
results.append(result)
transition_results.extend(transition_result)

try:
md_report = generate_md_report(trajectory, instance)
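The core of the refactor shows up in the per-instance flow: _evaluate_instance now returns a Trajectory object instead of a raw dict, to_result produces a single result dict rather than a (result, transition_result) tuple, trajectory info is read through the .info attribute, and markdown report generation is gated behind the new markdown_report flag. A minimal sketch of that flow, written as a standalone helper for illustration; the evaluation argument stands in for an Evaluation instance with its report and markdown_report attributes, and this is not the committed implementation:

from typing import Tuple

from moatless.benchmark.report_v2 import to_result, generate_md_report
from moatless.trajectory import Trajectory


def process_instance(evaluation, instance: dict) -> Tuple[dict, str]:
    # _evaluate_instance now returns a Trajectory instead of a dict.
    trajectory: Trajectory = evaluation._evaluate_instance(instance)

    # to_result returns a single result dict; the old transition_result value is gone.
    result = to_result(instance, trajectory, evaluation.report)

    # Trajectory info is accessed as an attribute, not via trajectory["info"].
    submission = trajectory.info.get("submission", "")

    # Markdown reports are now opt-in via the markdown_report flag.
    if evaluation.markdown_report:
        generate_md_report(trajectory, instance)

    return result, submission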
