From bda099dfc2dbab34a36bf587f27f4ae75d620d3e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Albert=20=C3=96rwall?=
Date: Thu, 13 Jun 2024 20:57:16 +0200
Subject: [PATCH] Notebook for ingesting all swe bench repos

---
 notebooks/ingest.ipynb | 270 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 270 insertions(+)
 create mode 100644 notebooks/ingest.ipynb

diff --git a/notebooks/ingest.ipynb b/notebooks/ingest.ipynb
new file mode 100644
index 00000000..25610593
--- /dev/null
+++ b/notebooks/ingest.ipynb
@@ -0,0 +1,270 @@
+{
+ "cells": [
+  {
+   "metadata": {},
+   "cell_type": "markdown",
+   "source": [
+    "# Run ingestion\n",
+    "Ingest all SWE-Bench instances and evaluate the results."
+   ],
+   "id": "d2e538dc56ed690c"
+  },
+  {
+   "metadata": {},
+   "cell_type": "markdown",
+   "source": "",
+   "id": "4d7e85ae729b9c1a"
+  },
+  {
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2024-05-21T08:52:16.128716Z",
+     "start_time": "2024-05-21T08:51:59.638654Z"
+    }
+   },
+   "cell_type": "code",
+   "source": [
+    "from moatless.benchmark.swebench import load_instances\n",
+    "import os\n",
+    "import json\n",
+    "\n",
+    "instance_by_id = load_instances(\"princeton-nlp/SWE-bench_Lite\", split=\"test\")\n",
+    "\n",
+    "evaluation_report = \"report.jsonl\"\n",
+    "\n",
+    "previous_instances = {\n",
+    "}\n",
+    "\n",
+    "if os.path.exists(evaluation_report):\n",
+    "    with open(evaluation_report, \"r\") as f:\n",
+    "        for line in f:\n",
+    "            report = json.loads(line)\n",
+    "            previous_instance = instance_by_id[report[\"instance_id\"]]\n",
+    "            previous_instances[previous_instance[\"repo\"]] = previous_instance\n",
+    "            del instance_by_id[report[\"instance_id\"]]\n",
+    "\n",
+    "instances = list(instance_by_id.values())\n",
+    "instances = sorted(instances, key=lambda x: x[\"created_at\"])\n",
+    "\n",
+    "print(f\"Number of instances: {len(instances)}\")"
+   ],
+   "id": "2a935c4beaaa4635",
+   "execution_count": 1,
+   "outputs": []
+  },
+  {
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2024-05-21T08:52:16.133527Z",
+     "start_time": "2024-05-21T08:52:16.130077Z"
+    }
+   },
+   "cell_type": "code",
+   "source": [
+    "from moatless.benchmark.swebench import setup_swebench_repo\n",
+    "\n",
+    "def next_instance(instances):\n",
+    "    if not instances:\n",
+    "        return None\n",
+    "    instance = instances.pop(0)\n",
+    "    print(f\"Instance: {instance['instance_id']}, {len(instances)} instances left\")\n",
+    "    return instance\n",
+    "\n",
+    "instance = next_instance(instances)"
+   ],
+   "id": "c2b3e2c270d2e5c3",
+   "execution_count": 2,
+   "outputs": []
+  },
+  {
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2024-05-21T08:52:16.924594Z",
+     "start_time": "2024-05-21T08:52:16.134279Z"
+    }
+   },
+   "cell_type": "code",
+   "source": [
+    "from moatless.index.settings import IndexSettings\n",
+    "from moatless.index.code_index import CodeIndex\n",
+    "from dotenv import load_dotenv\n",
+    "from moatless.benchmark.swebench import get_repo_dir_name\n",
+    "import os\n",
+    "\n",
+    "index_settings = IndexSettings(\n",
+    "    embed_model=\"voyage-code-2\"\n",
+    ")\n",
+    "\n",
+    "load_dotenv('../.env')\n",
+    "\n",
+    "def get_persist_dir(instance):\n",
+    "    return os.path.join(\"/tmp/index_store\", get_repo_dir_name(instance[\"instance_id\"]))\n",
+    "\n",
+    "def create_index(instance):\n",
+    "    previous_instance = previous_instances.get(instance[\"repo\"])\n",
+    "    if previous_instance:\n",
+    "        return CodeIndex.from_persist_dir(get_persist_dir(previous_instance))\n",
+    "    else:\n",
+    "        return CodeIndex(settings=index_settings)\n",
+    "\n",
+    "code_index = create_index(instance)"
+   ],
+   "id": "af25431875f4a923",
+   "execution_count": 3,
+   "outputs": []
+  },
+  {
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2024-05-21T09:00:40.704370Z",
+     "start_time": "2024-05-21T08:52:16.926814Z"
+    }
+   },
+   "cell_type": "code",
+   "source": [
+    "def ingest(code_index, instance):\n",
+    "    repo_path = setup_swebench_repo(instance)\n",
+    "    print(f\"Repo path: {repo_path}\")\n",
+    "\n",
+    "    vectors, indexed_tokens = code_index.run_ingestion(repo_path=repo_path, num_workers=4)\n",
+    "    print(f\"Indexed {vectors} vectors and {indexed_tokens} tokens.\")\n",
+    "\n",
+    "    persist_dir = get_persist_dir(instance)\n",
+    "    code_index.persist(persist_dir=persist_dir)\n",
+    "    print(f\"Index persisted to {persist_dir}\")\n",
+    "\n",
+    "    previous_instances[instance[\"repo\"]] = instance\n",
+    "    return vectors, indexed_tokens\n",
+    "\n",
+    "vectors, indexed_tokens = ingest(code_index, instance)"
+   ],
+   "id": "3f749928390182fb",
+   "execution_count": 4,
+   "outputs": []
+  },
+  {
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2024-05-21T09:00:41.269230Z",
+     "start_time": "2024-05-21T09:00:40.705860Z"
+    }
+   },
+   "cell_type": "code",
+   "source": [
+    "from moatless.benchmark.utils import calculate_estimated_context_window\n",
+    "\n",
+    "def evaluate(code_index, instance):\n",
+    "    results = code_index.find_code(instance[\"problem_statement\"], top_k=1000)\n",
+    "\n",
+    "    expected_changes, sum_tokens = calculate_estimated_context_window(instance, results)\n",
+    "    all_matching_context_window = None\n",
+    "    any_matching_context_window = None\n",
+    "\n",
+    "    expected_matches = [context for context in expected_changes if context[\"context_window\"] is not None]\n",
+    "    if expected_matches:\n",
+    "        all_matching_context_window = max(context[\"context_window\"] for context in expected_matches)\n",
+    "        any_matching_context_window = min(context[\"context_window\"] for context in expected_matches)\n",
+    "\n",
+    "        if len(expected_matches) == len(expected_changes):\n",
+    "            print(f\"Found all expected changes within a context window of {all_matching_context_window} tokens, first match at context window {any_matching_context_window}\")\n",
+    "        else:\n",
+    "            any_matching_context_window = min(context[\"context_window\"] for context in expected_changes if context[\"context_window\"] is not None)\n",
+    "            print(f\"Found {len(expected_matches)} expected changes within a context window of {all_matching_context_window} tokens, first match at context window {any_matching_context_window}, max context window {sum_tokens} tokens\")\n",
+    "\n",
+    "\n",
+    "    else:\n",
+    "        print(f\"No expected changes found in context window of {sum_tokens} tokens\")\n",
+    "\n",
+    "    for change in expected_changes:\n",
+    "        if change[\"context_window\"] is None:\n",
+    "            print(f\"Expected change: {change['file_path']} ({change['start_line']}-{change['end_line']}) not found, closest match: {change.get('closest_match_lines')}\")\n",
+    "        else:\n",
+    "            print(f\"Expected change: {change['file_path']} ({change['start_line']}-{change['end_line']}) found at context window {change['context_window']} tokens. Distance: {change['distance']}. Position: {change['position']}\")\n",
+    "\n",
+    "\n",
+    "    return expected_changes, all_matching_context_window, any_matching_context_window\n",
+    "\n",
+    "expected_changes, all_matching_context_window, any_matching_context_window = evaluate(code_index, instance)"
+   ],
+   "id": "ac7a612e3e7fb834",
+   "execution_count": 5,
+   "outputs": []
+  },
+  {
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2024-05-21T09:00:41.273680Z",
+     "start_time": "2024-05-21T09:00:41.270252Z"
+    }
+   },
+   "cell_type": "code",
+   "source": [
+    "import json\n",
+    "\n",
+    "def write_report(instance, expected_changes, vectors, indexed_tokens, all_matching_context_window, any_matching_context_window):\n",
+    "    with open(\"report.jsonl\", \"a\") as f:\n",
+    "        f.write(json.dumps({\n",
+    "            \"instance_id\": instance[\"instance_id\"],\n",
+    "            \"vectors\": vectors,\n",
+    "            \"indexed_tokens\": indexed_tokens,\n",
+    "            \"all_matching_context_window\": all_matching_context_window,\n",
+    "            \"any_matching_context_window\": any_matching_context_window,\n",
+    "            \"expected_changes\": expected_changes,\n",
+    "        }) + \"\\n\")\n",
+    "\n",
+    "write_report(instance, expected_changes, vectors, indexed_tokens, all_matching_context_window, any_matching_context_window)"
+   ],
+   "id": "7f9a0ee8beabe3d6",
+   "execution_count": 6,
+   "outputs": []
+  },
+  {
+   "metadata": {
+    "jupyter": {
+     "is_executing": true
+    },
+    "ExecuteTime": {
+     "start_time": "2024-05-21T09:00:41.274579Z"
+    }
+   },
+   "cell_type": "code",
+   "source": [
+    "\n",
+    "def index_next_instance():\n",
+    "    instance = next_instance(instances)\n",
+    "    while instance:\n",
+    "        code_index = create_index(instance)\n",
+    "        vectors, indexed_tokens = ingest(code_index, instance)\n",
+    "        expected_changes, all_matching_context_window, any_matching_context_window = evaluate(code_index, instance)\n",
+    "        write_report(instance, expected_changes, vectors, indexed_tokens, all_matching_context_window, any_matching_context_window)\n",
+    "        instance = next_instance(instances)\n",
+    "\n",
+    "index_next_instance()"
+   ],
+   "id": "69a610864e1d85f5",
+   "execution_count": null,
+   "outputs": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}