From 3563936bbae5238768b9f549196cad7be4ea7fca Mon Sep 17 00:00:00 2001 From: Sanjiv Das Date: Tue, 23 Apr 2024 14:08:08 -0700 Subject: [PATCH] learn arxiv tex files * Created a new option remote or -r. Example: /learn -r arxiv * Approach: downloads the tar file for the entire paper into downloads_temp. Then, unzips and collects all .tex files in the tar file and concatenates them. Different authors use various approaches. Some have the entire paper in one tex file, whereas others may have separate tex files for each section, so we need to collect all the tex file into a single file and then hand off to the splitter, embedder. After completion, remove the temp directory. Return a properly formatted error if package arxiv needs to be installed. * Handle two types of errors: (i) package arxiv not installed. (ii) User enters a wrong paper id. --- .../jupyter_ai/chat_handlers/learn.py | 16 ++++++++- .../jupyter_ai/document_loaders/directory.py | 35 ++++++++++++++++++- packages/jupyter-ai/pyproject.toml | 2 +- 3 files changed, 50 insertions(+), 3 deletions(-) diff --git a/packages/jupyter-ai/jupyter_ai/chat_handlers/learn.py b/packages/jupyter-ai/jupyter_ai/chat_handlers/learn.py index 38390a44c..21c5ae6bb 100644 --- a/packages/jupyter-ai/jupyter_ai/chat_handlers/learn.py +++ b/packages/jupyter-ai/jupyter_ai/chat_handlers/learn.py @@ -4,7 +4,7 @@ from typing import Any, Coroutine, List, Optional, Tuple from dask.distributed import Client as DaskClient -from jupyter_ai.document_loaders.directory import get_embeddings, split +from jupyter_ai.document_loaders.directory import get_embeddings, split, arxiv_to_text from jupyter_ai.document_loaders.splitter import ExtensionSplitter, NotebookSplitter from jupyter_ai.models import ( DEFAULT_CHUNK_OVERLAP, @@ -44,6 +44,9 @@ def __init__(self, *args, **kwargs): self.parser.add_argument("-v", "--verbose", action="store_true") self.parser.add_argument("-d", "--delete", action="store_true") self.parser.add_argument("-l", "--list", 
action="store_true") + self.parser.add_argument( + "-r", "--remote", action="store" , default=None, type=str + ) self.parser.add_argument( "-c", "--chunk-size", action="store", default=DEFAULT_CHUNK_SIZE, type=int ) @@ -107,6 +110,17 @@ async def process_message(self, message: HumanChatMessage): self.reply(self._build_list_response()) return + if args.remote: + remote_type = args.remote.lower() + if remote_type=="arxiv": + try: + id = args.path[0] + args.path = [arxiv_to_text(id)] # call the function in `directory.py`` + self.reply(f"Processing arxiv file id {id}, saved in {args.path[0]}.", message) + except Exception as e: + self.reply(f"""The arXiv file could not be processed. Check the paper ID ({id}). Or, verify that the `arxiv` package is installed.""") + return + # Make sure the path exists. if not len(args.path) == 1: self.reply(f"{self.parser.format_usage()}", message) diff --git a/packages/jupyter-ai/jupyter_ai/document_loaders/directory.py b/packages/jupyter-ai/jupyter_ai/document_loaders/directory.py index 561f00a1c..bf70b89e2 100644 --- a/packages/jupyter-ai/jupyter_ai/document_loaders/directory.py +++ b/packages/jupyter-ai/jupyter_ai/document_loaders/directory.py @@ -5,10 +5,42 @@ from typing import List import dask -from langchain.document_loaders import PyPDFLoader +from langchain.document_loaders import PyPDFLoader, ArxivLoader from langchain.schema import Document from langchain.text_splitter import TextSplitter +import tarfile +import shutil + +# Download a single tar file from arXiv and store in a temp folder for RAG, then run learn on it. 
# Download the LaTeX source of an arXiv paper and concatenate all of its
# .tex files into one file so it can be handed to the splitter/embedder.
try:
    import arxiv
except ImportError:
    # Defer the failure: raise a clear error only when the feature is used.
    arxiv = None


def arxiv_to_text(id):  # id is the part after "arXiv:" e.g. "2301.00001"
    """Download the source tarball for arXiv paper ``id`` and concatenate
    every top-level ``.tex`` file it contains into ``<id>.tex`` in the
    current working directory.

    Different authors structure papers differently (one monolithic .tex
    vs. one per section), so all .tex files are concatenated in sorted
    order for a deterministic result.

    Returns the absolute path of the concatenated ``.tex`` file.
    Raises ModuleNotFoundError if the ``arxiv`` package is not installed;
    propagates whatever ``arxiv``/``tarfile`` raise for a bad id or a
    corrupt archive (the caller reports these to the user).
    """
    if arxiv is None:
        raise ModuleNotFoundError(
            "The `arxiv` package is required. Install it with `pip install arxiv`."
        )
    outfile = id + ".tex"
    temp_dir = "downloads_temp"
    os.makedirs(temp_dir, exist_ok=True)  # race-free, idempotent
    try:
        paper = next(arxiv.Client().results(arxiv.Search(id_list=[id])))
        paper.download_source(dirpath=temp_dir, filename="downloaded-paper.tar.gz")
        # Context manager closes the tarball even if extraction fails.
        # NOTE(review): consider extractall(..., filter="data") once the
        # minimum Python is 3.12, to guard against path-traversal members.
        with tarfile.open(os.path.join(temp_dir, "downloaded-paper.tar.gz")) as tar:
            tar.extractall(temp_dir)
        # Sorted for a deterministic concatenation order (os.listdir order
        # is arbitrary).
        tex_files = sorted(
            name for name in os.listdir(temp_dir) if name.lower().endswith(".tex")
        )
        with open(outfile, "wb") as wfd:
            for name in tex_files:
                with open(os.path.join(temp_dir, name), "rb") as fd:
                    shutil.copyfileobj(fd, wfd)
        return os.path.realpath(outfile)
    finally:
        # Always delete the temp folder, even on error; the concatenated
        # output file (outside temp_dir) is kept.
        shutil.rmtree(temp_dir, ignore_errors=True)