Notebook for ingesting all swe bench repos

aorwall · Jun 13, 2024 · bda099d · bda099d
1 parent d282c0f
commit bda099d
Showing 1 changed file with 270 additions and 0 deletions.
diff --git a/notebooks/ingest.ipynb b/notebooks/ingest.ipynb
@@ -0,0 +1,270 @@
+{
+ "cells": [
+  {
+   "metadata": {},
+   "cell_type": "markdown",
+   "source": [
+    "# Run ingestion\n",
+    "Ingest all SWE-Bench instances and evaluate the results."
+   ],
+   "id": "d2e538dc56ed690c"
+  },
+  {
+   "metadata": {},
+   "cell_type": "markdown",
+   "source": "",
+   "id": "4d7e85ae729b9c1a"
+  },
+  {
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2024-05-21T08:52:16.128716Z",
+     "start_time": "2024-05-21T08:51:59.638654Z"
+    }
+   },
+   "cell_type": "code",
+   "source": [
+    "from moatless.benchmark.swebench import load_instances\n",
+    "import os\n",
+    "import json\n",
+    "\n",
+    "instance_by_id = load_instances(\"princeton-nlp/SWE-bench_Lite\", split=\"test\")\n",
+    "\n",
+    "evaluation_report = \"report.jsonl\"\n",
+    "\n",
+    "previous_instances = {\n",
+    "}\n",
+    "\n",
+    "if os.path.exists(evaluation_report):\n",
+    "    with open(evaluation_report, \"r\") as f:\n",
+    "        for line in f:\n",
+    "            report = json.loads(line)\n",
+    "            previous_instance = instance_by_id[report[\"instance_id\"]]\n",
+    "            previous_instances[previous_instance[\"repo\"]] = previous_instance\n",
+    "            del instance_by_id[report[\"instance_id\"]]\n",
+    "\n",
+    "instances = list(instance_by_id.values())\n",
+    "instances = sorted(instances, key=lambda x: x[\"created_at\"])\n",
+    "\n",
+    "print(f\"Number of instances: {len(instances)}\")"
+   ],
+   "id": "2a935c4beaaa4635",
+   "execution_count": 1,
+   "outputs": []
+  },
+  {
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2024-05-21T08:52:16.133527Z",
+     "start_time": "2024-05-21T08:52:16.130077Z"
+    }
+   },
+   "cell_type": "code",
+   "source": [
+    "from moatless.benchmark.swebench import setup_swebench_repo\n",
+    "\n",
+    "def next_instance(instances):\n",
+    "    if not instances:\n",
+    "        return None\n",
+    "    instance = instances.pop(0)\n",
+    "    print(f\"Instance: {instance['instance_id']}, {len(instances)} instances left\")\n",
+    "    return instance\n",
+    "\n",
+    "instance = next_instance(instances)"
+   ],
+   "id": "c2b3e2c270d2e5c3",
+   "execution_count": 2,
+   "outputs": []
+  },
+  {
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2024-05-21T08:52:16.924594Z",
+     "start_time": "2024-05-21T08:52:16.134279Z"
+    }
+   },
+   "cell_type": "code",
+   "source": [
+    "from moatless.index.settings import IndexSettings\n",
+    "from moatless.index.code_index import CodeIndex\n",
+    "from dotenv import load_dotenv\n",
+    "from moatless.benchmark.swebench import get_repo_dir_name\n",
+    "import os\n",
+    "\n",
+    "index_settings = IndexSettings(\n",
+    "    embed_model=\"voyage-code-2\"\n",
+    ")\n",
+    "\n",
+    "load_dotenv('../.env')\n",
+    "\n",
+    "def get_persist_dir(instance):\n",
+    "    return os.path.join(\"/tmp/index_store\", get_repo_dir_name(instance[\"instance_id\"]))\n",
+    "\n",
+    "def create_index(instance):\n",
+    "    previous_instance = previous_instances.get(instance[\"repo\"])\n",
+    "    if previous_instance:\n",
+    "        return CodeIndex.from_persist_dir(get_persist_dir(previous_instance))\n",
+    "    else:\n",
+    "        return CodeIndex(settings=index_settings)\n",
+    "\n",
+    "code_index = create_index(instance)"
+   ],
+   "id": "af25431875f4a923",
+   "execution_count": 3,
+   "outputs": []
+  },
+  {
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2024-05-21T09:00:40.704370Z",
+     "start_time": "2024-05-21T08:52:16.926814Z"
+    }
+   },
+   "cell_type": "code",
+   "source": [
+    "def ingest(code_index, instance):\n",
+    "    repo_path = setup_swebench_repo(instance)\n",
+    "    print(f\"Repo path: {repo_path}\")\n",
+    "\n",
+    "    vectors, indexed_tokens = code_index.run_ingestion(repo_path=repo_path, num_workers=4)\n",
+    "    print(f\"Indexed {vectors} vectors and {indexed_tokens} tokens.\")\n",
+    "    \n",
+    "    persist_dir = get_persist_dir(instance)\n",
+    "    code_index.persist(persist_dir=persist_dir)\n",
+    "    print(f\"Index persisted to {persist_dir}\")\n",
+    "    \n",
+    "    previous_instances[instance[\"repo\"]] = instance\n",
+    "    return vectors, indexed_tokens\n",
+    "\n",
+    "vectors, indexed_tokens = ingest(code_index, instance)"
+   ],
+   "id": "3f749928390182fb",
+   "execution_count": 4,
+   "outputs": []
+  },
+  {
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2024-05-21T09:00:41.269230Z",
+     "start_time": "2024-05-21T09:00:40.705860Z"
+    }
+   },
+   "cell_type": "code",
+   "source": [
+    "from moatless.benchmark.utils import calculate_estimated_context_window\n",
+    "\n",
+    "def evaluate(code_index, instance):\n",
+    "    results = code_index.find_code(instance[\"problem_statement\"], top_k=1000)\n",
+    "        \n",
+    "    expected_changes, sum_tokens = calculate_estimated_context_window(instance, results)\n",
+    "    all_matching_context_window = None\n",
+    "    any_matching_context_window = None\n",
+    "    \n",
+    "    expected_matches = [context for context in expected_changes if context[\"context_window\"] is not None]\n",
+    "    if expected_matches:\n",
+    "        all_matching_context_window = max(context[\"context_window\"] for context in expected_matches)\n",
+    "        any_matching_context_window = min(context[\"context_window\"] for context in expected_matches)\n",
+    "        \n",
+    "        if len(expected_matches) == len(expected_changes):\n",
+    "            print(f\"Found all expected changes within a context window of {all_matching_context_window} tokens, first match at context window {any_matching_context_window}\")\n",
+    "        else:\n",
+    "            any_matching_context_window = min(context[\"context_window\"] for context in expected_changes if context[\"context_window\"] is not None)\n",
+    "            print(f\"Found {len(expected_matches)} expected changes within a context window {all_matching_context_window} tokens, first match at context window {any_matching_context_window} max context window {sum_tokens} tokens\")\n",
+    "        \n",
+    "        \n",
+    "    else:\n",
+    "        print(f\"No expected changes found in context window of {sum_tokens} tokens\")\n",
+    "        \n",
+    "    for change in expected_changes:\n",
+    "        if change[\"context_window\"] is None:\n",
+    "            print(f\"Expected change: {change['file_path']} ({change['start_line']}-{change['end_line']}) not fund, closest match: {change.get('closest_match_lines')}\")\n",
+    "        else:\n",
+    "            print(f\"Expected change: {change['file_path']} ({change['start_line']}-{change['end_line']}) found at context window {change['context_window']} tokens. Distance: {change['distance']}. Position: {change['position']}\")\n",
+    "        \n",
+    "        \n",
+    "    return expected_changes, all_matching_context_window, any_matching_context_window\n",
+    "\n",
+    "expected_changes, all_matching_context_window, any_matching_context_window = evaluate(code_index, instance)"
+   ],
+   "id": "ac7a612e3e7fb834",
+   "execution_count": 5,
+   "outputs": []
+  },
+  {
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2024-05-21T09:00:41.273680Z",
+     "start_time": "2024-05-21T09:00:41.270252Z"
+    }
+   },
+   "cell_type": "code",
+   "source": [
+    "import json\n",
+    "\n",
+    "def write_report(instance, expected_changes, vectors, indexed_tokens, all_matching_context_window, any_matching_context_window):\n",
+    "    with open(\"report.jsonl\", \"a\") as f:\n",
+    "        f.write(json.dumps({\n",
+    "            \"instance_id\": instance[\"instance_id\"],\n",
+    "            \"vectors\": vectors,\n",
+    "            \"indexed_tokens\": indexed_tokens,\n",
+    "            \"all_matching_context_window\": all_matching_context_window,\n",
+    "            \"any_matching_context_window\": any_matching_context_window,\n",
+    "            \"expected_changes\": expected_changes,\n",
+    "    }) + \"\\n\")\n",
+    "        \n",
+    "write_report(instance, expected_changes, vectors, indexed_tokens, all_matching_context_window, any_matching_context_window)"
+   ],
+   "id": "7f9a0ee8beabe3d6",
+   "execution_count": 6,
+   "outputs": []
+  },
+  {
+   "metadata": {
+    "jupyter": {
+     "is_executing": true
+    },
+    "ExecuteTime": {
+     "start_time": "2024-05-21T09:00:41.274579Z"
+    }
+   },
+   "cell_type": "code",
+   "source": [
+    "\n",
+    "def index_next_instance():\n",
+    "    instance = next_instance(instances)\n",
+    "    while instance:\n",
+    "        code_index = create_index(instance)\n",
+    "        vectors, indexed_tokens = ingest(code_index, instance)\n",
+    "        expected_changes, all_matching_context_window, any_matching_context_window = evaluate(code_index, instance)\n",
+    "        write_report(instance, expected_changes, vectors, indexed_tokens, all_matching_context_window, any_matching_context_window)\n",
+    "        instance = next_instance(instances)\n",
+    "\n",
+    "index_next_instance()"
+   ],
+   "id": "69a610864e1d85f5",
+   "execution_count": null,
+   "outputs": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}