Skip to content

Commit

Permalink
Saves arxiv to root, better exception handling.
Browse files Browse the repository at this point in the history
  • Loading branch information
3coins committed May 3, 2024
1 parent f4f09b9 commit 671ef03
Show file tree
Hide file tree
Showing 2 changed files with 44 additions and 20 deletions.
16 changes: 11 additions & 5 deletions packages/jupyter-ai/jupyter_ai/chat_handlers/learn.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,16 +118,22 @@ async def process_message(self, message: HumanChatMessage):
if remote_type == "arxiv":
try:
id = args.path[0]
args.path = [
arxiv_to_text(id)
] # call the function in `directory.py``
args.path = [arxiv_to_text(id, self.root_dir)]
self.reply(
f"Processing arxiv file id {id}, saved in {args.path[0]}.",
f"Learning arxiv file with id **{id}**, saved in **{args.path[0]}**.",
message,
)
except ModuleNotFoundError as e:
self.log.error(e)
self.reply(
"No `arxiv` package found. " "Install with `pip install arxiv`."
)
return
except Exception as e:
self.log.error(e)
self.reply(
f"""The arXiv file could not be processed. Check the paper ID ({id}). Or, verify that the `arxiv` package is installed."""
"An error occurred while processing the arXiv file. "
f"Please verify that the arxiv id {id} is correct."
)
return

Expand Down
48 changes: 33 additions & 15 deletions packages/jupyter-ai/jupyter_ai/document_loaders/directory.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
import datetime
import hashlib
import itertools
import os
import tarfile
from datetime import datetime
from pathlib import Path
from typing import List

Expand All @@ -12,31 +12,49 @@
from langchain_community.document_loaders import PyPDFLoader


# Download a single tar file from arXiv and store in a temp folder for RAG, then run learn on it.
def arxiv_to_text(id: str, output_dir: str) -> str:
    """Download an arXiv paper's source and merge its TeX files into one file.

    The source tarball is fetched into a private temporary directory, and
    every ``.tex`` member is streamed straight from the archive into a
    single output file inside ``output_dir``. Nothing is ever extracted to
    disk by member name, so a malicious archive cannot write outside
    ``output_dir``, and no stray files are left in the working directory
    even when an error occurs.

    Parameters
    ----------
    id : str
        id for the paper, numbers after "arXiv" in arXiv:xxxx.xxxxx
    output_dir : str
        directory to save the output file

    Returns
    -------
    output: str
        output path to the combined TeX file

    Raises
    ------
    ModuleNotFoundError
        If the optional ``arxiv`` package is not installed.
    ValueError
        If arXiv returns no paper for ``id``.
    """
    import io
    import tempfile

    import arxiv

    # Old-style ids ("hep-th/9901001") contain a slash, which would otherwise
    # be interpreted as a directory separator in the output file name.
    safe_id = id.replace("/", "_")
    outfile = f"{safe_id}-{datetime.now():%Y-%m-%d-%H-%M}.tex"
    output_path = os.path.join(output_dir, outfile)
    download_filename = "downloaded-paper.tar.gz"

    paper = next(arxiv.Client().results(arxiv.Search(id_list=[id])), None)
    if paper is None:
        raise ValueError(f"No arXiv paper found for id {id!r}.")

    # The temporary directory (and the tarball in it) is removed on exit,
    # whether or not processing succeeds.
    with tempfile.TemporaryDirectory() as tmp_dir:
        paper.download_source(dirpath=tmp_dir, filename=download_filename)
        archive_path = os.path.join(tmp_dir, download_filename)

        with tarfile.open(archive_path) as tar, open(
            output_path, "w", encoding="utf-8"
        ) as out:
            for member in tar:
                if not (member.isfile() and member.name.lower().endswith(".tex")):
                    continue
                raw = tar.extractfile(member)
                if raw is None:
                    continue
                # TeX sources are not guaranteed to be UTF-8; replace
                # undecodable bytes instead of aborting the whole paper.
                with io.TextIOWrapper(
                    raw, encoding="utf-8", errors="replace"
                ) as tex:
                    out.write(tex.read())

    return output_path


# Uses pypdf which is used by PyPDFLoader from langchain
Expand Down

0 comments on commit 671ef03

Please sign in to comment.