-
Notifications
You must be signed in to change notification settings - Fork 28
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Notebook for ingesting all swe bench repos
- Loading branch information
Showing
1 changed file
with
270 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,270 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"metadata": {}, | ||
"cell_type": "markdown", | ||
"source": [ | ||
"# Run ingestion\n", | ||
"Ingest all SWE-Bench instances and evaluate the results." | ||
], | ||
"id": "d2e538dc56ed690c" | ||
}, | ||
{ | ||
"metadata": {}, | ||
"cell_type": "markdown", | ||
"source": "", | ||
"id": "4d7e85ae729b9c1a" | ||
}, | ||
{ | ||
"metadata": { | ||
"ExecuteTime": { | ||
"end_time": "2024-05-21T08:52:16.128716Z", | ||
"start_time": "2024-05-21T08:51:59.638654Z" | ||
} | ||
}, | ||
"cell_type": "code", | ||
"source": [ | ||
"from moatless.benchmark.swebench import load_instances\n", | ||
"import os\n", | ||
"import json\n", | ||
"\n", | ||
"instance_by_id = load_instances(\"princeton-nlp/SWE-bench_Lite\", split=\"test\")\n", | ||
"\n", | ||
"evaluation_report = \"report.jsonl\"\n", | ||
"\n", | ||
"previous_instances = {\n", | ||
"}\n", | ||
"\n", | ||
"if os.path.exists(evaluation_report):\n", | ||
" with open(evaluation_report, \"r\") as f:\n", | ||
" for line in f:\n", | ||
" report = json.loads(line)\n", | ||
" previous_instance = instance_by_id[report[\"instance_id\"]]\n", | ||
" previous_instances[previous_instance[\"repo\"]] = previous_instance\n", | ||
" del instance_by_id[report[\"instance_id\"]]\n", | ||
"\n", | ||
"instances = list(instance_by_id.values())\n", | ||
"instances = sorted(instances, key=lambda x: x[\"created_at\"])\n", | ||
"\n", | ||
"print(f\"Number of instances: {len(instances)}\")" | ||
], | ||
"id": "2a935c4beaaa4635", | ||
"execution_count": 1, | ||
"outputs": [] | ||
}, | ||
{ | ||
"metadata": { | ||
"ExecuteTime": { | ||
"end_time": "2024-05-21T08:52:16.133527Z", | ||
"start_time": "2024-05-21T08:52:16.130077Z" | ||
} | ||
}, | ||
"cell_type": "code", | ||
"source": [ | ||
"from moatless.benchmark.swebench import setup_swebench_repo\n", | ||
"\n", | ||
"def next_instance(instances):\n", | ||
" if not instances:\n", | ||
" return None\n", | ||
" instance = instances.pop(0)\n", | ||
" print(f\"Instance: {instance['instance_id']}, {len(instances)} instances left\")\n", | ||
" return instance\n", | ||
"\n", | ||
"instance = next_instance(instances)" | ||
], | ||
"id": "c2b3e2c270d2e5c3", | ||
"execution_count": 2, | ||
"outputs": [] | ||
}, | ||
{ | ||
"metadata": { | ||
"ExecuteTime": { | ||
"end_time": "2024-05-21T08:52:16.924594Z", | ||
"start_time": "2024-05-21T08:52:16.134279Z" | ||
} | ||
}, | ||
"cell_type": "code", | ||
"source": [ | ||
"from moatless.index.settings import IndexSettings\n", | ||
"from moatless.index.code_index import CodeIndex\n", | ||
"from dotenv import load_dotenv\n", | ||
"from moatless.benchmark.swebench import get_repo_dir_name\n", | ||
"import os\n", | ||
"\n", | ||
"index_settings = IndexSettings(\n", | ||
" embed_model=\"voyage-code-2\"\n", | ||
")\n", | ||
"\n", | ||
"load_dotenv('../.env')\n", | ||
"\n", | ||
"def get_persist_dir(instance):\n", | ||
" return os.path.join(\"/tmp/index_store\", get_repo_dir_name(instance[\"instance_id\"]))\n", | ||
"\n", | ||
"def create_index(instance):\n", | ||
" previous_instance = previous_instances.get(instance[\"repo\"])\n", | ||
" if previous_instance:\n", | ||
" return CodeIndex.from_persist_dir(get_persist_dir(previous_instance))\n", | ||
" else:\n", | ||
" return CodeIndex(settings=index_settings)\n", | ||
"\n", | ||
"code_index = create_index(instance)" | ||
], | ||
"id": "af25431875f4a923", | ||
"execution_count": 3, | ||
"outputs": [] | ||
}, | ||
{ | ||
"metadata": { | ||
"ExecuteTime": { | ||
"end_time": "2024-05-21T09:00:40.704370Z", | ||
"start_time": "2024-05-21T08:52:16.926814Z" | ||
} | ||
}, | ||
"cell_type": "code", | ||
"source": [ | ||
"def ingest(code_index, instance):\n", | ||
" repo_path = setup_swebench_repo(instance)\n", | ||
" print(f\"Repo path: {repo_path}\")\n", | ||
"\n", | ||
" vectors, indexed_tokens = code_index.run_ingestion(repo_path=repo_path, num_workers=4)\n", | ||
" print(f\"Indexed {vectors} vectors and {indexed_tokens} tokens.\")\n", | ||
" \n", | ||
" persist_dir = get_persist_dir(instance)\n", | ||
" code_index.persist(persist_dir=persist_dir)\n", | ||
" print(f\"Index persisted to {persist_dir}\")\n", | ||
" \n", | ||
" previous_instances[instance[\"repo\"]] = instance\n", | ||
" return vectors, indexed_tokens\n", | ||
"\n", | ||
"vectors, indexed_tokens = ingest(code_index, instance)" | ||
], | ||
"id": "3f749928390182fb", | ||
"execution_count": 4, | ||
"outputs": [] | ||
}, | ||
{ | ||
"metadata": { | ||
"ExecuteTime": { | ||
"end_time": "2024-05-21T09:00:41.269230Z", | ||
"start_time": "2024-05-21T09:00:40.705860Z" | ||
} | ||
}, | ||
"cell_type": "code", | ||
"source": [ | ||
"from moatless.benchmark.utils import calculate_estimated_context_window\n", | ||
"\n", | ||
"def evaluate(code_index, instance):\n", | ||
" results = code_index.find_code(instance[\"problem_statement\"], top_k=1000)\n", | ||
" \n", | ||
" expected_changes, sum_tokens = calculate_estimated_context_window(instance, results)\n", | ||
" all_matching_context_window = None\n", | ||
" any_matching_context_window = None\n", | ||
" \n", | ||
" expected_matches = [context for context in expected_changes if context[\"context_window\"] is not None]\n", | ||
" if expected_matches:\n", | ||
" all_matching_context_window = max(context[\"context_window\"] for context in expected_matches)\n", | ||
" any_matching_context_window = min(context[\"context_window\"] for context in expected_matches)\n", | ||
" \n", | ||
" if len(expected_matches) == len(expected_changes):\n", | ||
" print(f\"Found all expected changes within a context window of {all_matching_context_window} tokens, first match at context window {any_matching_context_window}\")\n", | ||
" else:\n", | ||
" any_matching_context_window = min(context[\"context_window\"] for context in expected_changes if context[\"context_window\"] is not None)\n", | ||
" print(f\"Found {len(expected_matches)} expected changes within a context window {all_matching_context_window} tokens, first match at context window {any_matching_context_window} max context window {sum_tokens} tokens\")\n", | ||
" \n", | ||
" \n", | ||
" else:\n", | ||
" print(f\"No expected changes found in context window of {sum_tokens} tokens\")\n", | ||
" \n", | ||
" for change in expected_changes:\n", | ||
" if change[\"context_window\"] is None:\n", | ||
" print(f\"Expected change: {change['file_path']} ({change['start_line']}-{change['end_line']}) not fund, closest match: {change.get('closest_match_lines')}\")\n", | ||
" else:\n", | ||
" print(f\"Expected change: {change['file_path']} ({change['start_line']}-{change['end_line']}) found at context window {change['context_window']} tokens. Distance: {change['distance']}. Position: {change['position']}\")\n", | ||
" \n", | ||
" \n", | ||
" return expected_changes, all_matching_context_window, any_matching_context_window\n", | ||
"\n", | ||
"expected_changes, all_matching_context_window, any_matching_context_window = evaluate(code_index, instance)" | ||
], | ||
"id": "ac7a612e3e7fb834", | ||
"execution_count": 5, | ||
"outputs": [] | ||
}, | ||
{ | ||
"metadata": { | ||
"ExecuteTime": { | ||
"end_time": "2024-05-21T09:00:41.273680Z", | ||
"start_time": "2024-05-21T09:00:41.270252Z" | ||
} | ||
}, | ||
"cell_type": "code", | ||
"source": [ | ||
"import json\n", | ||
"\n", | ||
"def write_report(instance, expected_changes, vectors, indexed_tokens, all_matching_context_window, any_matching_context_window):\n", | ||
" with open(\"report.jsonl\", \"a\") as f:\n", | ||
" f.write(json.dumps({\n", | ||
" \"instance_id\": instance[\"instance_id\"],\n", | ||
" \"vectors\": vectors,\n", | ||
" \"indexed_tokens\": indexed_tokens,\n", | ||
" \"all_matching_context_window\": all_matching_context_window,\n", | ||
" \"any_matching_context_window\": any_matching_context_window,\n", | ||
" \"expected_changes\": expected_changes,\n", | ||
" }) + \"\\n\")\n", | ||
" \n", | ||
"write_report(instance, expected_changes, vectors, indexed_tokens, all_matching_context_window, any_matching_context_window)" | ||
], | ||
"id": "7f9a0ee8beabe3d6", | ||
"execution_count": 6, | ||
"outputs": [] | ||
}, | ||
{ | ||
"metadata": { | ||
"jupyter": { | ||
"is_executing": true | ||
}, | ||
"ExecuteTime": { | ||
"start_time": "2024-05-21T09:00:41.274579Z" | ||
} | ||
}, | ||
"cell_type": "code", | ||
"source": [ | ||
"\n", | ||
"def index_next_instance():\n", | ||
" instance = next_instance(instances)\n", | ||
" while instance:\n", | ||
" code_index = create_index(instance)\n", | ||
" vectors, indexed_tokens = ingest(code_index, instance)\n", | ||
" expected_changes, all_matching_context_window, any_matching_context_window = evaluate(code_index, instance)\n", | ||
" write_report(instance, expected_changes, vectors, indexed_tokens, all_matching_context_window, any_matching_context_window)\n", | ||
" instance = next_instance(instances)\n", | ||
"\n", | ||
"index_next_instance()" | ||
], | ||
"id": "69a610864e1d85f5", | ||
"execution_count": null, | ||
"outputs": [] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "Python 3 (ipykernel)", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.12.3" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 5 | ||
} |