Skip to content

Commit

Permalink
learn arxiv tex files
Browse files Browse the repository at this point in the history
* Created a new option remote or -r. Example: /learn -r arxiv <arxiv-id>
* Approach: download the tar file for the entire paper into `downloads_temp`, then extract it, collect all the .tex files it contains, and concatenate them. Authors structure their sources differently: some keep the entire paper in one .tex file, whereas others use a separate .tex file for each section, so we need to collect all the .tex files into a single file and then hand it off to the splitter and embedder. After completion, remove the temp directory. Return a properly formatted error if the `arxiv` package needs to be installed.
* Handle two types of errors: (i) package arxiv not installed. (ii) User enters a wrong paper id.
  • Loading branch information
srdas committed Apr 23, 2024
1 parent 9c8046c commit 3563936
Show file tree
Hide file tree
Showing 3 changed files with 50 additions and 3 deletions.
16 changes: 15 additions & 1 deletion packages/jupyter-ai/jupyter_ai/chat_handlers/learn.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from typing import Any, Coroutine, List, Optional, Tuple

from dask.distributed import Client as DaskClient
from jupyter_ai.document_loaders.directory import get_embeddings, split
from jupyter_ai.document_loaders.directory import get_embeddings, split, arxiv_to_text
from jupyter_ai.document_loaders.splitter import ExtensionSplitter, NotebookSplitter
from jupyter_ai.models import (
DEFAULT_CHUNK_OVERLAP,
Expand Down Expand Up @@ -44,6 +44,9 @@ def __init__(self, *args, **kwargs):
self.parser.add_argument("-v", "--verbose", action="store_true")
self.parser.add_argument("-d", "--delete", action="store_true")
self.parser.add_argument("-l", "--list", action="store_true")
self.parser.add_argument(
"-r", "--remote", action="store" , default=None, type=str
)
self.parser.add_argument(
"-c", "--chunk-size", action="store", default=DEFAULT_CHUNK_SIZE, type=int
)
Expand Down Expand Up @@ -107,6 +110,17 @@ async def process_message(self, message: HumanChatMessage):
self.reply(self._build_list_response())
return

if args.remote:
remote_type = args.remote.lower()
if remote_type=="arxiv":
try:
id = args.path[0]
args.path = [arxiv_to_text(id)] # call the function in `directory.py``
self.reply(f"Processing arxiv file id {id}, saved in {args.path[0]}.", message)
except Exception as e:
self.reply(f"""The arXiv file could not be processed. Check the paper ID ({id}). Or, verify that the `arxiv` package is installed.""")
return

# Make sure the path exists.
if not len(args.path) == 1:
self.reply(f"{self.parser.format_usage()}", message)
Expand Down
35 changes: 34 additions & 1 deletion packages/jupyter-ai/jupyter_ai/document_loaders/directory.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,42 @@
from typing import List

import dask
from langchain.document_loaders import PyPDFLoader
from langchain.document_loaders import PyPDFLoader, ArxivLoader
from langchain.schema import Document
from langchain.text_splitter import TextSplitter

import tarfile
import shutil

# Download a single tar file from arXiv and store in a temp folder for RAG, then run learn on it.
try:
import arxiv
except Exception as e:
print("Missing package: arxiv")

def arxiv_to_text(id):  # id is numbers after "arXiv" in arXiv:xxxx.xxxxx
    """Download an arXiv paper's LaTeX source and merge it into one .tex file.

    The source archive is downloaded into a scratch directory
    (``downloads_temp``, relative to the current working directory) and
    extracted there. Every ``.tex`` file found in the extracted tree is then
    concatenated, in sorted path order, into ``<id>.tex`` in the current
    working directory. The scratch directory is removed afterwards, whether
    or not the download and extraction succeeded.

    Args:
        id: arXiv identifier, i.e. the part after ``arXiv:`` in
            ``arXiv:xxxx.xxxxx``. The name shadows the builtin but is kept
            for compatibility with existing callers.

    Returns:
        The resolved (``os.path.realpath``) path of the merged ``.tex`` file.

    Raises:
        ImportError: if the ``arxiv`` package is not installed.
        ValueError: if the arXiv search yields no paper for ``id``.
    """
    # Import locally so a missing package surfaces as a clear ImportError
    # here rather than as a NameError on the first use of `arxiv`.
    try:
        import arxiv
    except ImportError as err:
        raise ImportError(
            "Missing package: arxiv. Install it with `pip install arxiv`."
        ) from err

    outfile = id + ".tex"
    temp_dir = "downloads_temp"
    archive_name = "downloaded-paper.tar.gz"

    os.makedirs(temp_dir, exist_ok=True)
    try:
        # Get the paper from arxiv
        paper = next(arxiv.Client().results(arxiv.Search(id_list=[id])), None)
        if paper is None:
            raise ValueError(f"No arXiv paper found for id {id!r}")
        paper.download_source(dirpath=temp_dir, filename=archive_name)

        # Extract downloaded tar file; the context manager closes the handle
        # even if extraction fails.
        with tarfile.open(os.path.join(temp_dir, archive_name)) as tar:
            # The archive comes from the network: use the "data" extraction
            # filter where this Python provides it, so members cannot escape
            # temp_dir.
            if hasattr(tarfile, "data_filter"):
                tar.extractall(temp_dir, filter="data")
            else:
                # NOTE(review): no extraction filter on this interpreter;
                # archive members are trusted as-is.
                tar.extractall(temp_dir)

        # Authors may keep section files in subfolders, so walk the whole
        # extracted tree. Sorting makes the concatenation order deterministic.
        tex_paths = sorted(
            os.path.join(root, name)
            for root, _dirs, names in os.walk(temp_dir)
            for name in names
            if name.lower().endswith(".tex")
        )

        # Binary copy avoids guessing the encoding of each source file.
        with open(outfile, "wb") as wfd:
            for tex_path in tex_paths:
                with open(tex_path, "rb") as fd:
                    shutil.copyfileobj(fd, wfd)
    finally:
        # Delete the temp folder but not the merged latex file, which lives
        # outside of it.
        shutil.rmtree(temp_dir, ignore_errors=True)

    return os.path.realpath(outfile)

# Uses pypdf which is used by PyPDFLoader from langchain
def pdf_to_text(path):
Expand Down Expand Up @@ -50,6 +82,7 @@ def path_to_doc(path):
".txt",
".html",
".pdf",
".tex",
}


Expand Down
2 changes: 1 addition & 1 deletion packages/jupyter-ai/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ test = [

dev = ["jupyter_ai_magics[dev]"]

all = ["jupyter_ai_magics[all]", "pypdf"]
all = ["jupyter_ai_magics[all]", "pypdf", "arxiv"]

[tool.hatch.version]
source = "nodejs"
Expand Down

0 comments on commit 3563936

Please sign in to comment.