diff --git a/models/llama-index-hack/llama2_wikis.ipynb b/models/llama-index-hack/llama2_wikis.ipynb index 781dffa3..e38e3056 100644 --- a/models/llama-index-hack/llama2_wikis.ipynb +++ b/models/llama-index-hack/llama2_wikis.ipynb @@ -16,10 +16,7 @@ "from llama_index import VectorStoreIndex\n", "from llama_index import LLMPredictor, PromptHelper, ServiceContext\n", "from llama_index import StorageContext, load_index_from_storage\n", - "from llama_index.readers import SimpleDirectoryReader\n", - "\n", - "import pandas as pd\n", - "import re" + "from llama_index.readers import SimpleDirectoryReader" ] }, { @@ -399,27 +396,6 @@ ")" ] }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'research-engineering-group.wiki.git'" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "url = \"https://github.com/alan-turing-institute/research-engineering-group.wiki.git\"\n", - "url.split(\"/\")[-1]" - ] - }, { "cell_type": "code", "execution_count": 4, @@ -453,15 +429,6 @@ " return documents" ] }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "documents = load_documents_from_url(url)" - ] - }, { "cell_type": "code", "execution_count": 6, diff --git a/models/llama-index-hack/llama2_wikis_chat.ipynb b/models/llama-index-hack/llama2_wikis_chat.ipynb index 00982c59..478b68b5 100644 --- a/models/llama-index-hack/llama2_wikis_chat.ipynb +++ b/models/llama-index-hack/llama2_wikis_chat.ipynb @@ -6,15 +6,17 @@ "metadata": {}, "outputs": [], "source": [ + "import os \n", + "\n", + "from git import Repo\n", + "\n", "from llama_index.llms import LlamaCPP\n", "from llama_index.llms.llama_utils import messages_to_prompt, completion_to_prompt\n", "from llama_index import Document\n", "from llama_index import VectorStoreIndex\n", "from llama_index import LLMPredictor, PromptHelper, ServiceContext\n", "from llama_index import StorageContext, load_index_from_storage\n", - "from llama_index.readers import SimpleDirectoryReader\n", - "\n", - "import pandas as pd" + "from llama_index.readers import SimpleDirectoryReader" ] }, { @@ -400,19 +402,30 @@ "metadata": {}, "outputs": [], "source": [ - "def load_documents():\n", - " documents = SimpleDirectoryReader(\n", - " input_dir=\"../../data/wikis/\",\n", + "def load_documents_from_url(url):\n", + " reader = SimpleDirectoryReader(\n", + " input_dir=url.split(\"/\")[-1],\n", " required_exts=[\".md\"],\n", " recursive=True, \n", " filename_as_id=True,\n", - " exclude=[\"*_course/*\"] # my laptop struggles to create a VectorStoreIndex when I have too many documents\n", - " ).load_data()\n", + " )\n", " \n", - " turingacuk=pd.read_csv(\"../../data/public/turingacuk-no-boilerplate.csv\")\n", - " turingacuk.dropna(subset=\"body\", inplace=True)\n", - " turingacuk_text=[str(i) for i in turingacuk[\"body\"].values]\n", - " documents.extend([Document(text=i) for i in turingacuk_text])\n", + " # get base url and names of files\n", + " url_stripped = url.removesuffix(\".wiki.git\")\n", + " fnames = [str(file) for file in reader.input_files]\n", + " # create fname: file_url_dict\n", + " file_urls = [os.path.join(url_stripped, \"wiki\", fname.split(\"/\")[-1].removesuffix(\".md\")) for fname in fnames]\n", + " file_urls_dict = {fname: file_url for fname, file_url in zip(fnames, file_urls)}\n", + " # define function to get url and add to reader\n", + " get_urls = lambda fname: {\"url\": file_urls_dict.get(fname)}\n", + " reader.file_metadata = get_urls\n", + "\n", + " documents = reader.load_data()\n", + "\n", + " # turingacuk=pd.read_csv(\"../../data/public/turingacuk-no-boilerplate.csv\")\n", + " # turingacuk.dropna(subset=\"body\", inplace=True)\n", + " # turingacuk_text=[str(i) for i in turingacuk[\"body\"].values]\n", + " # documents.extend([Document(text=i) for i in turingacuk_text])\n", " return documents" ] }, @@ -465,6 +478,7 @@ " mode,\n", " service_context,\n", " persist_dir=None,\n", + " wiki_urls=None,\n", "):\n", " if mode == \"reload\":\n", " if persist_dir is None:\n", @@ -474,7 +488,15 @@ " index = load_index_from_storage(storage_context, service_context=service_context)\n", " \n", " elif mode == \"create\":\n", - " documents = load_documents()\n", + " documents = []\n", + " for url in wiki_urls:\n", + " if os.path.exists(url.split(\"/\")[-1]):\n", + " repo = Repo(url.split(\"/\")[-1])\n", + " repo.remotes.origin.pull()\n", + " else:\n", + " _ = Repo.clone_from(url, url.split(\"/\")[-1])\n", + " wiki_docs = load_documents_from_url(url)\n", + " documents.extend(wiki_docs)\n", " index = VectorStoreIndex.from_documents(documents, service_context=service_context, show_progress=True)\n", " \n", " else:\n", @@ -488,12 +510,44 @@ "cell_type": "code", "execution_count": 7, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "83e0f380d02c4de093969f4f2a59435a", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Parsing documents into nodes: 0%| | 0/392 [00:00