From c452a297b508e9286ee7eaf5775ccd9f5a7c6cbc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Albert=20=C3=96rwall?= Date: Mon, 5 Aug 2024 07:18:53 +0200 Subject: [PATCH] Split out report functions --- moatless/benchmark/evaluation.py | 415 +----------------------------- moatless/benchmark/report_v1.py | 418 +++++++++++++++++++++++++++++++ moatless/benchmark/report_v2.py | 415 ++++++++++++++++++++++++++++++ 3 files changed, 837 insertions(+), 411 deletions(-) create mode 100644 moatless/benchmark/report_v1.py create mode 100644 moatless/benchmark/report_v2.py diff --git a/moatless/benchmark/evaluation.py b/moatless/benchmark/evaluation.py index a4f12bc9..78a73a18 100644 --- a/moatless/benchmark/evaluation.py +++ b/moatless/benchmark/evaluation.py @@ -14,6 +14,7 @@ import pandas as pd from tqdm.auto import tqdm +from moatless.benchmark.report_v2 import to_result, generate_md_report from moatless.transition_rules import TransitionRules from moatless.benchmark.swebench import ( found_in_alternative_spans, @@ -290,7 +291,7 @@ def _process_instance(self, instance): if not trajectory: return None, None, None - result, transition_result = self.to_result(instance, trajectory) + result, transition_result = to_result(instance, trajectory, self.report) submission = trajectory["info"].get("submission", "") try: @@ -321,7 +322,7 @@ def _process_repo_group(self, repo, instances): if not trajectory: return None, None - result, transition_result = self.to_result(instance, trajectory) + result, transition_result = to_result(instance, trajectory, report=self.report) results.append(result) transition_results.extend(transition_result) @@ -445,7 +446,7 @@ def _run_evaluation_simple(self, instances: list[dict]): if not trajectory: continue - result, transition_result = self.to_result(instance, trajectory) + result, transition_result = to_result(instance, trajectory, report=self.report) sum_duration += result["duration"] sum_total_cost += result["total_cost"] @@ -490,317 +491,6 @@ def _run_evaluation_simple(self, instances: list[dict]): json_string = json.dumps(prediction) file.write(json_string + "\n") - def to_result(self, instance: dict, trajectory: dict) -> tuple[dict, list]: - info = trajectory["info"] - - if "resolved_ids" in self.report and instance["instance_id"] in self.report["resolved_ids"]: - result_status = "resolved" - else: - result_status = info.get("status") - - resolved = result_status == "resolved" - - try: - transitions = [] - result = { - "instance_id": instance["instance_id"], - "duration": info.get("duration", 0), - "total_cost": info.get("total_cost", 0), - "resolved_by": (len(instance.get("resolved_by", []))), - "status": None, - "result_status": result_status, - "transitions": len(trajectory["transitions"]), - "edited": False, - "planned": False, - "identified": None, - "expected_identified": None, - "alt_identified": None, - "found_in_search": None, - "file_identified": None, - "file_in_search": None, - "edit_retries": 0, - "has_diff": False, - "lint_codes": None, - "review": False, - "p_query": 0, - "p_file": 0, - "p_code": 0, - "p_class": 0, - "p_function": 0, - "lints": "", - } - - lint_codes = set() - search_results_spans = {} - identified_spans = {} - planned_spans = {} - edited_spans = {} - - id_iterations = 0 - search_iterations = 0 - - selected_transition_ids = [] - if "current_transition_id" in trajectory: - transitions_map = {t["id"]: t for t in trajectory["transitions"]} - - transition = transitions_map.get(trajectory["current_transition_id"]) - while transition: - selected_transition_ids.append(transition["id"]) - if "parent_id" in transition: - transition = transitions_map.get(transition["parent_id"]) - else: - break - - logger.info(f"Selected transitions: {selected_transition_ids}") - - if instance.get("expected_spans"): - for transition in trajectory["transitions"]: - if selected_transition_ids and transition["id"] not in selected_transition_ids: - continue - - state_name = transition["state"]["name"] - - if state_name not in result: - result[state_name] = 0 - result[f"{state_name}_cost"] = 0 - - result[state_name] += 1 - - expected_span_str = "" - for file_path, span_ids in instance["expected_spans"].items(): - expected_span_str += f"{file_path}: {span_ids} " - - transition_result = { - "instance_id": instance["instance_id"], - "resolved": resolved, - "name": state_name, - "cost": 0, - "expected_spans": expected_span_str, - "actual_spans": "", - } - - if not transition["actions"]: - continue - - for traj_action in transition["actions"]: - result[f"{state_name}_cost"] += traj_action.get( - "completion_cost", 0 - ) - transition_result["cost"] += traj_action.get( - "completion_cost", 0 - ) - - if state_name == "SearchCode": - search_iterations += 1 - - action = transition["actions"][-1] - - if "search_requests" in action["action"]: - for search_request in action["action"]["search_requests"]: - if search_request.get("query"): - result["p_query"] += 1 - - if search_request.get("file_pattern"): - result["p_file"] += 1 - - if search_request.get("code_snippet"): - result["p_code"] += 1 - - if search_request.get( - "class_name" - ) or search_request.get("class_names"): - result["p_class"] += 1 - - if search_request.get( - "function_name" - ) or search_request.get("function_names"): - result["p_function"] += 1 - - if state_name == "IdentifyCode": - id_iterations += 1 - - state = transition["state"] - if state.get("ranked_spans"): - for ranked_span in state["ranked_spans"]: - if ( - ranked_span["file_path"] - not in search_results_spans - ): - search_results_spans[ - ranked_span["file_path"] - ] = [] - search_results_spans[ - ranked_span["file_path"] - ].append(ranked_span["span_id"]) - - if not result["found_in_search"] and ( - found_in_expected_spans( - instance, search_results_spans - ) - or found_in_alternative_spans( - instance, search_results_spans - ) - ): - result["found_in_search"] = search_iterations - - if not result["file_in_search"]: - missing_files = get_missing_files( - instance["expected_spans"], - search_results_spans, - ) - if not missing_files: - result["file_in_search"] = search_iterations - - action = transition["actions"][-1] - if action.get("action"): - identified_str = "" - if action["action"].get("identified_spans"): - for span in action["action"]["identified_spans"]: - identified_str += ( - f"{span['file_path']}: {span['span_ids']} " - ) - if span["file_path"] not in identified_spans: - identified_spans[span["file_path"]] = [] - - transition_result["actual_spans"] += ( - f"{span['file_path']}: {','.join(span['span_ids'])} " - ) - for span_id in span["span_ids"]: - identified_spans[span["file_path"]].append( - span_id - ) - result["identified_spans"] = identified_str - - if not result["file_identified"]: - missing_files = get_missing_files( - instance["expected_spans"], - identified_spans, - ) - if not missing_files: - result["file_identified"] = id_iterations - - if result[ - "expected_identified" - ] is None and found_in_expected_spans( - instance, identified_spans - ): - result["expected_identified"] = id_iterations - - if result[ - "alt_identified" - ] is None and found_in_alternative_spans( - instance, identified_spans - ): - result["alt_identified"] = id_iterations - - if result.get("alt_identified") or result.get( - "expected_identified" - ): - result["identified"] = min( - result.get("alt_identified") or 1000, - result.get("expected_identified") or 1000, - ) - - if state_name == "PlanToCode": - action = transition["actions"][-1]["action"] - if action.get("action") == "review": - result["review"] = True - - if "file_path" in action: - if "span_id" not in action: - logger.warning( - f"Span id missing in planning action in {instance['instance_id']}" - ) - else: - file_path = action["file_path"] - if file_path not in planned_spans: - planned_spans[file_path] = [] - planned_spans[file_path].append(action["span_id"]) - transition_result["actual_spans"] = ( - f"{file_path}: {action['span_id']} " - ) - - if not result.get("planned") and ( - found_in_expected_spans( - instance, - planned_spans, - ) - or found_in_alternative_spans(instance, planned_spans) - ): - result["planned"] = True - - if state_name == "EditCode": - result["edit_retries"] = len(transition["actions"]) - 1 - - action = transition["actions"][-1] - edited = action.get("trigger") == "finish" - - if edited and "file_path" in transition["state"]: - file_path = transition["state"]["file_path"] - if file_path not in edited_spans: - edited_spans[file_path] = [] - edited_spans[file_path].append( - transition["state"]["span_id"] - ) - transition_result["actual_spans"] = ( - f"{file_path}: {transition['state']['span_id']} " - ) - - if not result.get("edited") and ( - found_in_expected_spans( - instance, - edited_spans, - ) - or found_in_alternative_spans(instance, edited_spans) - ): - result["edited"] = True - - - output = action.get("output", {}) - if output: - if edited: - result["has_diff"] = True - - for lint in output.get("verification_errors", []): - lint_codes.add(lint["code"]) - - transitions.append(transition_result) - - if result.get("alt_identified") or result.get("expected_identified"): - result["identified"] = min( - result.get("alt_identified") or 1000, - result.get("expected_identified") or 1000, - ) - - result["expected_files"] = list(instance["expected_spans"].keys()) - result["edited_files"] = list(edited_spans.keys()) - result["identified_spans"] = sum( - [len(v) for v in identified_spans.values()] - ) - - result["lints"] = ",".join(lint_codes) - - - if result["edited"]: - result["status"] = "edited" - elif result["identified"]: - result["status"] = "identified" - elif result["found_in_search"]: - result["status"] = "found_in_search" - elif result["file_identified"]: - result["status"] = "file_identified" - else: - result["status"] = "" - - if "error" in info: - result["error"] = info["error"].split("\n")[0] - else: - result["error"] = "" - - except Exception as e: - raise e - - return result, transitions def read_trajectory(self, path) -> Optional[dict]: if os.path.exists(path): @@ -825,100 +515,3 @@ def create_evaluation_name( model_name = model.split("/")[-1] return f"{date_str}_{name}_{model_name}" - -def generate_md_report(trajectory: dict, instance: dict): - info = trajectory["info"] - markdown = f"# {instance['instance_id']}\n" - - markdown += "\n## Problem statement\n" - markdown += f"```\n{instance['problem_statement']}\n```\n" - - if "error" in trajectory["info"]: - markdown += "\n## Error\n" - markdown += f"```\n{trajectory['info']['error']}\n```\n" - else: - markdown += "\n## Prediction\n" - markdown += f"```diff\n{info['submission']}\n```\n" - - markdown += "\n## Golden patch\n" - markdown += f"```diff\n{instance['golden_patch']}\n```\n" - - markdown += "\n## Trajectory\n" - - repo_dir = setup_swebench_repo(instance) - file_repo = FileRepository(repo_dir) - - for j, step in enumerate(trajectory["transitions"]): - for i, traj_action in enumerate(step["actions"]): - state_name = step['state'] - markdown += f"### {j+1} {state_name} ({i+1})\n\n" - - if not traj_action.get("action"): - continue - action = traj_action["action"] - - if state_name == "PlanToCode": - if action.get("scratch_pad"): - markdown += "*" + action["scratch_pad"] + "*" - - if action.get("instructions"): - markdown += f"\n\n * {action['instructions']}" - - if action.get("file_path"): - markdown += f"\n * {action['file_path']}" - - if action.get("span_id"): - markdown += f"\n * {action['span_id']}" - - if action.get("file_path") and action.get("span_id"): - markdown += "\n\n#### File context \n\n" - try: - file_context = FileContext(file_repo) - file_context.add_span_to_context( - action.get("file_path"), - action.get("span_id"), - ) - markdown += file_context.create_prompt( - show_outcommented_code=True - ) - except Exception as e: - logger.error(e) - - if state_name == "EditCode": - markdown += "#### LLM Response\n\n" - markdown += f"```\n{action.get('content', '')}\n```\n" - - output = traj_action.get("output") - if output: - if output.get("diff"): - markdown += "#### Diff\n\n" - markdown += f"```diff\n{output['diff']}\n```\n" - - if output.get("errors"): - markdown += "#### Errors\n\n" - markdown += f"{output['errors']}\n\n" - - if output.get("message"): - markdown += "#### Message\n\n" - markdown += f"{output['message']}\n\n" - - if state_name == "ClarifyCodeChange": - if action.get("thoughts"): - markdown += "*" + action["thoughts"] + "*" - - if action.get("output") and action.get("output").get("start_line"): - markdown += f"\n* Start Line: {action['output']['start_line']}\n" - markdown += f"\n* End Line: {action['output']['end_line']}\n" - - if state_name == "Finished": - markdown += f"*{action['properties']['message']}*\n" - - if state_name == "Rejected": - markdown += f"*{action['properties']['message']}*\n" - - markdown += "## Alternative patches\n" - for alternative in instance["resolved_by"]: - markdown += f"### {alternative['name']}\n" - markdown += f"```diff\n{alternative['patch']}\n```\n" - - return markdown diff --git a/moatless/benchmark/report_v1.py b/moatless/benchmark/report_v1.py new file mode 100644 index 00000000..65a5a403 --- /dev/null +++ b/moatless/benchmark/report_v1.py @@ -0,0 +1,418 @@ +import json +import logging +import os + +from moatless import FileRepository +from moatless.benchmark.swebench import found_in_expected_spans, found_in_alternative_spans, setup_swebench_repo +from moatless.benchmark.utils import get_missing_files +from moatless.file_context import FileContext + +logger = logging.getLogger(__name__) + + +def to_result(instance: dict, trajectory: dict, report: dict | None) -> tuple[dict, list]: + """ + Generate reports from saved trajectories with version 1 format. + """ + + info = trajectory["info"] + + resolved = report and info.get("instance_id", "") in report["resolved"] + + try: + transitions = [] + result = { + "instance_id": instance["instance_id"], + "duration": info.get("duration", 0), + "total_cost": info.get("total_cost", 0), + "resolved_by": (len(instance.get("resolved_by", []))), + "status": None, + "transitions": len(trajectory["transitions"]), + "edited": False, + "planned": False, + "identified": None, + "expected_identified": None, + "alt_identified": None, + "found_in_search": None, + "file_identified": None, + "file_in_search": None, + "edit_retries": 0, + "has_diff": False, + "lint_codes": None, + "review": False, + "p_query": 0, + "p_file": 0, + "p_code": 0, + "p_class": 0, + "p_function": 0, + "lints": "", + } + + lint_codes = set() + search_results_spans = {} + identified_spans = {} + planned_spans = {} + edited_spans = {} + + id_iterations = 0 + search_iterations = 0 + + if instance.get("expected_spans"): + for transition in trajectory["transitions"]: + if transition["name"] not in result: + result[transition["name"]] = 0 + result[f"{transition['name']}_cost"] = 0 + + result[transition["name"]] += 1 + + expected_span_str = "" + for file_path, span_ids in instance["expected_spans"].items(): + expected_span_str += f"{file_path}: {span_ids} " + + transition_result = { + "instance_id": instance["instance_id"], + "resolved": resolved, + "name": transition["name"], + "cost": 0, + "expected_spans": expected_span_str, + "actual_spans": "", + } + + if not transition["actions"]: + continue + + for traj_action in transition["actions"]: + result[f"{transition['name']}_cost"] += traj_action.get( + "completion_cost", 0 + ) + transition_result["cost"] += traj_action.get( + "completion_cost", 0 + ) + + if transition["name"] == "SearchCode": + search_iterations += 1 + + action = transition["actions"][-1] + + if "search_requests" in action["action"]: + for search_request in action["action"]["search_requests"]: + if search_request.get("query"): + result["p_query"] += 1 + + if search_request.get("file_pattern"): + result["p_file"] += 1 + + if search_request.get("code_snippet"): + result["p_code"] += 1 + + if search_request.get( + "class_name" + ) or search_request.get("class_names"): + result["p_class"] += 1 + + if search_request.get( + "function_name" + ) or search_request.get("function_names"): + result["p_function"] += 1 + + if "output" in action and action.get("output"): + output = action["output"] + + if "query" in output: + result["p_query"] += 1 + + if "file_pattern" in output: + result["p_file"] += 1 + + if "code_snippet" in output: + result["p_code"] += 1 + + if "class_name" in output or "class_names" in output: + result["p_class"] += 1 + + if "function_name" in output or "function_names" in output: + result["p_function"] += 1 + + if output.get("ranked_spans"): + for ranked_span in output["ranked_spans"]: + if ( + ranked_span["file_path"] + not in search_results_spans + ): + search_results_spans[ + ranked_span["file_path"] + ] = [] + search_results_spans[ + ranked_span["file_path"] + ].append(ranked_span["span_id"]) + + if not result["found_in_search"] and ( + found_in_expected_spans( + instance, search_results_spans + ) + or found_in_alternative_spans( + instance, search_results_spans + ) + ): + result["found_in_search"] = search_iterations + + if not result["file_in_search"]: + missing_files = get_missing_files( + instance["expected_spans"], + search_results_spans, + ) + if not missing_files: + result["file_in_search"] = search_iterations + + if transition["name"] == "IdentifyCode": + id_iterations += 1 + + action = transition["actions"][-1] + if action.get("action"): + identified_str = "" + if action["action"].get("identified_spans"): + for span in action["action"]["identified_spans"]: + identified_str += ( + f"{span['file_path']}: {span['span_ids']} " + ) + if span["file_path"] not in identified_spans: + identified_spans[span["file_path"]] = [] + + transition_result["actual_spans"] += ( + f"{span['file_path']}: {','.join(span['span_ids'])} " + ) + for span_id in span["span_ids"]: + identified_spans[span["file_path"]].append( + span_id + ) + result["identified_spans"] = identified_str + + if not result["file_identified"]: + missing_files = get_missing_files( + instance["expected_spans"], + identified_spans, + ) + if not missing_files: + result["file_identified"] = id_iterations + + if result[ + "expected_identified" + ] is None and found_in_expected_spans( + instance, identified_spans + ): + result["expected_identified"] = id_iterations + + if result[ + "alt_identified" + ] is None and found_in_alternative_spans( + instance, identified_spans + ): + result["alt_identified"] = id_iterations + + if result.get("alt_identified") or result.get( + "expected_identified" + ): + result["identified"] = min( + result.get("alt_identified") or 1000, + result.get("expected_identified") or 1000, + ) + + if transition["name"] == "PlanToCode": + action = transition["actions"][-1]["action"] + if action.get("action") == "review": + result["review"] = True + + if "file_path" in action: + if "span_id" not in action: + logger.warning( + f"Span id missing in planning action in {instance['instance_id']}" + ) + else: + file_path = action["file_path"] + if file_path not in planned_spans: + planned_spans[file_path] = [] + planned_spans[file_path].append(action["span_id"]) + transition_result["actual_spans"] = ( + f"{file_path}: {action['span_id']} " + ) + + if not result.get("planned") and ( + found_in_expected_spans( + instance, + planned_spans, + ) + or found_in_alternative_spans(instance, planned_spans) + ): + result["planned"] = True + + if transition["name"] == "EditCode": + result["edit_retries"] = len(transition["actions"]) - 1 + + action = transition["actions"][-1] + output = action.get("output", {}) + + if output: + edited = output.get("diff") + + if edited: + result["has_diff"] = True + + for lint in output.get("verification_errors", []): + lint_codes.add(lint["code"]) + + if edited and "file_path" in transition["state"]: + file_path = transition["state"]["file_path"] + if file_path not in edited_spans: + edited_spans[file_path] = [] + edited_spans[file_path].append( + transition["state"]["span_id"] + ) + transition_result["actual_spans"] = ( + f"{file_path}: {transition['state']['span_id']} " + ) + + if not result.get("edited") and ( + found_in_expected_spans( + instance, + edited_spans, + ) + or found_in_alternative_spans(instance, edited_spans) + ): + result["edited"] = True + + transitions.append(transition_result) + + if result.get("alt_identified") or result.get("expected_identified"): + result["identified"] = min( + result.get("alt_identified") or 1000, + result.get("expected_identified") or 1000, + ) + + result["expected_files"] = list(instance["expected_spans"].keys()) + result["edited_files"] = list(edited_spans.keys()) + result["identified_spans"] = sum( + [len(v) for v in identified_spans.values()] + ) + + result["lints"] = ",".join(lint_codes) + + if report and info.get("instance_id", "") in report["resolved"]: + result["status"] = "resolved" + elif result["edited"]: + result["status"] = "edited" + elif result["identified"]: + result["status"] = "identified" + elif result["found_in_search"]: + result["status"] = "found_in_search" + elif result["file_identified"]: + result["status"] = "file_identified" + else: + result["status"] = "" + + if "error" in info: + result["error"] = info["error"].split("\n")[0] + else: + result["error"] = "" + + except Exception as e: + raise e + + return result, transitions + + +def generate_md_report(trajectory: dict, instance: dict): + info = trajectory["info"] + markdown = f"# {instance['instance_id']}\n" + + markdown += "\n## Problem statement\n" + markdown += f"```\n{instance['problem_statement']}\n```\n" + + if "error" in trajectory["info"]: + markdown += "\n## Error\n" + markdown += f"```\n{trajectory['info']['error']}\n```\n" + else: + markdown += "\n## Prediction\n" + markdown += f"```diff\n{info['submission']}\n```\n" + + markdown += "\n## Golden patch\n" + markdown += f"```diff\n{instance['golden_patch']}\n```\n" + + markdown += "\n## Trajectory\n" + + repo_dir = setup_swebench_repo(instance) + file_repo = FileRepository(repo_dir) + + for j, step in enumerate(trajectory["transitions"]): + for i, traj_action in enumerate(step["actions"]): + state_name = step['state'] + markdown += f"### {j+1} {state_name} ({i+1})\n\n" + + if not traj_action.get("action"): + continue + action = traj_action["action"] + + if state_name == "PlanToCode": + if action.get("scratch_pad"): + markdown += "*" + action["scratch_pad"] + "*" + + if action.get("instructions"): + markdown += f"\n\n * {action['instructions']}" + + if action.get("file_path"): + markdown += f"\n * {action['file_path']}" + + if action.get("span_id"): + markdown += f"\n * {action['span_id']}" + + if action.get("file_path") and action.get("span_id"): + markdown += "\n\n#### File context \n\n" + try: + file_context = FileContext(file_repo) + file_context.add_span_to_context( + action.get("file_path"), + action.get("span_id"), + ) + markdown += file_context.create_prompt( + show_outcommented_code=True + ) + except Exception as e: + logger.error(e) + + if state_name == "EditCode": + markdown += "#### LLM Response\n\n" + markdown += f"```\n{action.get('content', '')}\n```\n" + + output = traj_action.get("output") + if output: + if output.get("diff"): + markdown += "#### Diff\n\n" + markdown += f"```diff\n{output['diff']}\n```\n" + + if output.get("errors"): + markdown += "#### Errors\n\n" + markdown += f"{output['errors']}\n\n" + + if output.get("message"): + markdown += "#### Message\n\n" + markdown += f"{output['message']}\n\n" + + if state_name == "ClarifyCodeChange": + if action.get("thoughts"): + markdown += "*" + action["thoughts"] + "*" + + if action.get("output") and action.get("output").get("start_line"): + markdown += f"\n* Start Line: {action['output']['start_line']}\n" + markdown += f"\n* End Line: {action['output']['end_line']}\n" + + if state_name == "Finished": + markdown += f"*{action['properties']['message']}*\n" + + if state_name == "Rejected": + markdown += f"*{action['properties']['message']}*\n" + + markdown += "## Alternative patches\n" + for alternative in instance["resolved_by"]: + markdown += f"### {alternative['name']}\n" + markdown += f"```diff\n{alternative['patch']}\n```\n" + + return markdown diff --git a/moatless/benchmark/report_v2.py b/moatless/benchmark/report_v2.py new file mode 100644 index 00000000..a98ce537 --- /dev/null +++ b/moatless/benchmark/report_v2.py @@ -0,0 +1,415 @@ +import logging + +from moatless import FileRepository +from moatless.benchmark.swebench import found_in_expected_spans, found_in_alternative_spans, setup_swebench_repo +from moatless.benchmark.utils import get_missing_files +from moatless.file_context import FileContext + +logger = logging.getLogger(__name__) + + +def to_result(instance: dict, trajectory: dict, report: dict | None) -> tuple[dict, list]: + info = trajectory["info"] + + if report and "resolved_ids" in report and instance["instance_id"] in report["resolved_ids"]: + result_status = "resolved" + else: + result_status = info.get("status") + + resolved = result_status == "resolved" + + try: + transitions = [] + result = { + "instance_id": instance["instance_id"], + "duration": info.get("duration", 0), + "total_cost": info.get("total_cost", 0), + "resolved_by": (len(instance.get("resolved_by", []))), + "status": None, + "result_status": result_status, + "transitions": len(trajectory["transitions"]), + "edited": False, + "planned": False, + "identified": None, + "expected_identified": None, + "alt_identified": None, + "found_in_search": None, + "file_identified": None, + "file_in_search": None, + "edit_retries": 0, + "has_diff": False, + "lint_codes": None, + "review": False, + "p_query": 0, + "p_file": 0, + "p_code": 0, + "p_class": 0, + "p_function": 0, + "lints": "", + } + + lint_codes = set() + search_results_spans = {} + identified_spans = {} + planned_spans = {} + edited_spans = {} + + id_iterations = 0 + search_iterations = 0 + + selected_transition_ids = [] + if "current_transition_id" in trajectory: + transitions_map = {t["id"]: t for t in trajectory["transitions"]} + + transition = transitions_map.get(trajectory["current_transition_id"]) + while transition: + selected_transition_ids.append(transition["id"]) + if "parent_id" in transition: + transition = transitions_map.get(transition["parent_id"]) + else: + break + + logger.info(f"Selected transitions: {selected_transition_ids}") + + if instance.get("expected_spans"): + for transition in trajectory["transitions"]: + if selected_transition_ids and transition["id"] not in selected_transition_ids: + continue + + state_name = transition["state"]["name"] + + if state_name not in result: + result[state_name] = 0 + result[f"{state_name}_cost"] = 0 + + result[state_name] += 1 + + expected_span_str = "" + for file_path, span_ids in instance["expected_spans"].items(): + expected_span_str += f"{file_path}: {span_ids} " + + transition_result = { + "instance_id": instance["instance_id"], + "resolved": resolved, + "name": state_name, + "cost": 0, + "expected_spans": expected_span_str, + "actual_spans": "", + } + + if not transition["actions"]: + continue + + for traj_action in transition["actions"]: + result[f"{state_name}_cost"] += traj_action.get( + "completion_cost", 0 + ) + transition_result["cost"] += traj_action.get( + "completion_cost", 0 + ) + + if state_name == "SearchCode": + search_iterations += 1 + + action = transition["actions"][-1] + + if "search_requests" in action["action"]: + for search_request in action["action"]["search_requests"]: + if search_request.get("query"): + result["p_query"] += 1 + + if search_request.get("file_pattern"): + result["p_file"] += 1 + + if search_request.get("code_snippet"): + result["p_code"] += 1 + + if search_request.get( + "class_name" + ) or search_request.get("class_names"): + result["p_class"] += 1 + + if search_request.get( + "function_name" + ) or search_request.get("function_names"): + result["p_function"] += 1 + + if state_name == "IdentifyCode": + id_iterations += 1 + + state = transition["state"] + if state.get("ranked_spans"): + for ranked_span in state["ranked_spans"]: + if ( + ranked_span["file_path"] + not in search_results_spans + ): + search_results_spans[ + ranked_span["file_path"] + ] = [] + search_results_spans[ + ranked_span["file_path"] + ].append(ranked_span["span_id"]) + + if not result["found_in_search"] and ( + found_in_expected_spans( + instance, search_results_spans + ) + or found_in_alternative_spans( + instance, search_results_spans + ) + ): + result["found_in_search"] = search_iterations + + if not result["file_in_search"]: + missing_files = get_missing_files( + instance["expected_spans"], + search_results_spans, + ) + if not missing_files: + result["file_in_search"] = search_iterations + + action = transition["actions"][-1] + if action.get("action"): + identified_str = "" + if action["action"].get("identified_spans"): + for span in action["action"]["identified_spans"]: + identified_str += ( + f"{span['file_path']}: {span['span_ids']} " + ) + if span["file_path"] not in identified_spans: + identified_spans[span["file_path"]] = [] + + transition_result["actual_spans"] += ( + f"{span['file_path']}: {','.join(span['span_ids'])} " + ) + for span_id in span["span_ids"]: + identified_spans[span["file_path"]].append( + span_id + ) + result["identified_spans"] = identified_str + + if not result["file_identified"]: + missing_files = get_missing_files( + instance["expected_spans"], + identified_spans, + ) + if not missing_files: + result["file_identified"] = id_iterations + + if result[ + "expected_identified" + ] is None and found_in_expected_spans( + instance, identified_spans + ): + result["expected_identified"] = id_iterations + + if result[ + "alt_identified" + ] is None and found_in_alternative_spans( + instance, identified_spans + ): + result["alt_identified"] = id_iterations + + if result.get("alt_identified") or result.get( + "expected_identified" + ): + result["identified"] = min( + result.get("alt_identified") or 1000, + result.get("expected_identified") or 1000, + ) + + if state_name == "PlanToCode": + action = transition["actions"][-1]["action"] + if action.get("action") == "review": + result["review"] = True + + if "file_path" in action: + if "span_id" not in action: + logger.warning( + f"Span id missing in planning action in {instance['instance_id']}" + ) + else: + file_path = action["file_path"] + if file_path not in planned_spans: + planned_spans[file_path] = [] + planned_spans[file_path].append(action["span_id"]) + transition_result["actual_spans"] = ( + f"{file_path}: {action['span_id']} " + ) + + if not result.get("planned") and ( + found_in_expected_spans( + instance, + planned_spans, + ) + or found_in_alternative_spans(instance, planned_spans) + ): + result["planned"] = True + + if state_name == "EditCode": + result["edit_retries"] = len(transition["actions"]) - 1 + + action = transition["actions"][-1] + edited = action.get("trigger") == "finish" + + if edited and "file_path" in transition["state"]: + file_path = transition["state"]["file_path"] + if file_path not in edited_spans: + edited_spans[file_path] = [] + edited_spans[file_path].append( + transition["state"]["span_id"] + ) + transition_result["actual_spans"] = ( + f"{file_path}: {transition['state']['span_id']} " + ) + + if not result.get("edited") and ( + found_in_expected_spans( + instance, + edited_spans, + ) + or found_in_alternative_spans(instance, edited_spans) + ): + result["edited"] = True + + output = action.get("output", {}) + if output: + if edited: + result["has_diff"] = True + + for lint in output.get("verification_errors", []): + lint_codes.add(lint["code"]) + + transitions.append(transition_result) + + if result.get("alt_identified") or result.get("expected_identified"): + result["identified"] = min( + result.get("alt_identified") or 1000, + result.get("expected_identified") or 1000, + ) + + result["expected_files"] = list(instance["expected_spans"].keys()) + result["edited_files"] = list(edited_spans.keys()) + result["identified_spans"] = sum( + [len(v) for v in identified_spans.values()] + ) + + result["lints"] = ",".join(lint_codes) + + if result["edited"]: + result["status"] = "edited" + elif result["identified"]: + result["status"] = "identified" + elif result["found_in_search"]: + result["status"] = "found_in_search" + elif result["file_identified"]: + result["status"] = "file_identified" + else: + result["status"] = "" + + if "error" in info: + result["error"] = info["error"].split("\n")[0] + else: + result["error"] = "" + + except Exception as e: + raise e + + return result, transitions +def generate_md_report(trajectory: dict, instance: dict): + info = trajectory["info"] + markdown = f"# {instance['instance_id']}\n" + + markdown += "\n## Problem statement\n" + markdown += f"```\n{instance['problem_statement']}\n```\n" + + if "error" in trajectory["info"]: + markdown += "\n## Error\n" + markdown += f"```\n{trajectory['info']['error']}\n```\n" + else: + markdown += "\n## Prediction\n" + markdown += f"```diff\n{info['submission']}\n```\n" + + markdown += "\n## Golden patch\n" + markdown += f"```diff\n{instance['golden_patch']}\n```\n" + + markdown += "\n## Trajectory\n" + + repo_dir = setup_swebench_repo(instance) + file_repo = FileRepository(repo_dir) + + for j, step in enumerate(trajectory["transitions"]): + for i, traj_action in enumerate(step["actions"]): + state_name = step['state'] + markdown += f"### {j+1} {state_name} ({i+1})\n\n" + + if not traj_action.get("action"): + continue + action = traj_action["action"] + + if state_name == "PlanToCode": + if action.get("scratch_pad"): + markdown += "*" + action["scratch_pad"] + "*" + + if action.get("instructions"): + markdown += f"\n\n * {action['instructions']}" + + if action.get("file_path"): + markdown += f"\n * {action['file_path']}" + + if action.get("span_id"): + markdown += f"\n * {action['span_id']}" + + if action.get("file_path") and action.get("span_id"): + markdown += "\n\n#### File context \n\n" + try: + file_context = FileContext(file_repo) + file_context.add_span_to_context( + action.get("file_path"), + action.get("span_id"), + ) + markdown += file_context.create_prompt( + show_outcommented_code=True + ) + except Exception as e: + logger.error(e) + + if state_name == "EditCode": + markdown += "#### LLM Response\n\n" + markdown += f"```\n{action.get('content', '')}\n```\n" + + output = traj_action.get("output") + if output: + if output.get("diff"): + markdown += "#### Diff\n\n" + markdown += f"```diff\n{output['diff']}\n```\n" + + if output.get("errors"): + markdown += "#### Errors\n\n" + markdown += f"{output['errors']}\n\n" + + if output.get("message"): + markdown += "#### Message\n\n" + markdown += f"{output['message']}\n\n" + + if state_name == "ClarifyCodeChange": + if action.get("thoughts"): + markdown += "*" + action["thoughts"] + "*" + + if action.get("output") and action.get("output").get("start_line"): + markdown += f"\n* Start Line: {action['output']['start_line']}\n" + markdown += f"\n* End Line: {action['output']['end_line']}\n" + + if state_name == "Finished": + markdown += f"*{action['properties']['message']}*\n" + + if state_name == "Rejected": + markdown += f"*{action['properties']['message']}*\n" + + markdown += "## Alternative patches\n" + for alternative in instance["resolved_by"]: + markdown += f"### {alternative['name']}\n" + markdown += f"```diff\n{alternative['patch']}\n```\n" + + return markdown