From 21549bd92212e16bb4629f5c928097b62e699942 Mon Sep 17 00:00:00 2001 From: Sanjiv Das Date: Fri, 3 May 2024 06:59:46 -0700 Subject: [PATCH] learn arxiv tex files (#742) * learn arxiv tex files * Created a new option remote or -r. Example: /learn -r arxiv * Approach: downloads the tar file for the entire paper into downloads_temp. Then, unzips and collects all .tex files in the tar file and concatenates them. Different authors use various approaches. Some have the entire paper in one tex file, whereas others may have separate tex files for each section, so we need to collect all the tex file into a single file and then hand off to the splitter, embedder. After completion, remove the temp directory. Return a properly formatted error if package arxiv needs to be installed. * Handle two types of errors: (i) package arxiv not installed. (ii) User enters a wrong paper id. * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * learn_arxiv * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * learn arxiv tex files * Created a new option remote or -r. Example: /learn -r arxiv * Approach: downloads the tar file for the entire paper into downloads_temp. Then, unzips and collects all .tex files in the tar file and concatenates them. Different authors use various approaches. Some have the entire paper in one tex file, whereas others may have separate tex files for each section, so we need to collect all the tex file into a single file and then hand off to the splitter, embedder. After completion, remove the temp directory. Return a properly formatted error if package arxiv needs to be installed. * Handle two types of errors: (i) package arxiv not installed. (ii) User enters a wrong paper id. 
* learn_arxiv * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Streamlined code for learning arxiv files (1) removed temp dir handling (2) extracted only tex files (3) Moved imports into the `arxiv_to_text` function (4) improved tar file processing * update learn for arxiv * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * removed extra imports * Fix /learn in 2.14.0 (#747) * accumulate filepaths * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> * learn arxiv tex files * Created a new option remote or -r. Example: /learn -r arxiv * Approach: downloads the tar file for the entire paper into downloads_temp. Then, unzips and collects all .tex files in the tar file and concatenates them. Different authors use various approaches. Some have the entire paper in one tex file, whereas others may have separate tex files for each section, so we need to collect all the tex files into a single file and then hand off to the splitter, embedder. After completion, remove the temp directory. Return a properly formatted error if package arxiv needs to be installed. * Handle two types of errors: (i) package arxiv not installed. (ii) User enters a wrong paper id. * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * update learn for arxiv files Redoing code after the PR 747 made changes to the same file. 
* [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Improved code for arxiv files Improvements to PR 742: (i) removed extra `arxiv.Client` call (ii) removed unnecessary `try/catch` (iii) moved `datetime` import outside `arxiv_to_text` function * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Saves arxiv to root, better exception handling. * Added arxiv feature to docs. --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: michaelchia Co-authored-by: Piyush Jain --- docs/source/users/index.md | 7 +++ .../jupyter_ai/chat_handlers/learn.py | 29 ++++++++++- .../jupyter_ai/document_loaders/directory.py | 48 +++++++++++++++++++ packages/jupyter-ai/pyproject.toml | 2 +- 4 files changed, 84 insertions(+), 2 deletions(-) diff --git a/docs/source/users/index.md b/docs/source/users/index.md index 1556cc91c..2906fc074 100644 --- a/docs/source/users/index.md +++ b/docs/source/users/index.md @@ -464,6 +464,13 @@ use the `-a` or `--all-files` option. /learn -a ``` +### Learning arXiv files +`/learn` command also provides downloading and processing papers from the [arXiv](https://arxiv.org/) repository. You will need to install the `arxiv` python package for this feature to work. Run `pip install arxiv` to install the `arxiv` package. + +``` +/learn -r arxiv 2404.18558 +``` + ### Additional chat commands To clear the chat panel, use the `/clear` command. This does not reset the AI model; the model may still remember previous messages that you sent it, and it may use them to inform its responses. 
diff --git a/packages/jupyter-ai/jupyter_ai/chat_handlers/learn.py b/packages/jupyter-ai/jupyter_ai/chat_handlers/learn.py index f9348a21f..0f10b0147 100644 --- a/packages/jupyter-ai/jupyter_ai/chat_handlers/learn.py +++ b/packages/jupyter-ai/jupyter_ai/chat_handlers/learn.py @@ -4,7 +4,7 @@ from typing import Any, Coroutine, List, Optional, Tuple from dask.distributed import Client as DaskClient -from jupyter_ai.document_loaders.directory import get_embeddings, split +from jupyter_ai.document_loaders.directory import arxiv_to_text, get_embeddings, split from jupyter_ai.document_loaders.splitter import ExtensionSplitter, NotebookSplitter from jupyter_ai.models import ( DEFAULT_CHUNK_OVERLAP, @@ -44,6 +44,9 @@ def __init__(self, *args, **kwargs): self.parser.add_argument("-v", "--verbose", action="store_true") self.parser.add_argument("-d", "--delete", action="store_true") self.parser.add_argument("-l", "--list", action="store_true") + self.parser.add_argument( + "-r", "--remote", action="store", default=None, type=str + ) self.parser.add_argument( "-c", "--chunk-size", action="store", default=DEFAULT_CHUNK_SIZE, type=int ) @@ -110,6 +113,30 @@ async def process_message(self, message: HumanChatMessage): self.reply(self._build_list_response()) return + if args.remote: + remote_type = args.remote.lower() + if remote_type == "arxiv": + try: + id = args.path[0] + args.path = [arxiv_to_text(id, self.root_dir)] + self.reply( + f"Learning arxiv file with id **{id}**, saved in **{args.path[0]}**.", + message, + ) + except ModuleNotFoundError as e: + self.log.error(e) + self.reply( + "No `arxiv` package found. " "Install with `pip install arxiv`." + ) + return + except Exception as e: + self.log.error(e) + self.reply( + "An error occurred while processing the arXiv file. " + f"Please verify that the arxiv id {id} is correct." + ) + return + # Make sure the path exists. 
if not len(args.path) == 1: self.reply(f"{self.parser.format_usage()}", message) diff --git a/packages/jupyter-ai/jupyter_ai/document_loaders/directory.py b/packages/jupyter-ai/jupyter_ai/document_loaders/directory.py index b8f9de3bb..e493fb385 100644 --- a/packages/jupyter-ai/jupyter_ai/document_loaders/directory.py +++ b/packages/jupyter-ai/jupyter_ai/document_loaders/directory.py @@ -1,6 +1,8 @@ import hashlib import itertools import os +import tarfile +from datetime import datetime from pathlib import Path from typing import List @@ -10,6 +12,51 @@ from langchain_community.document_loaders import PyPDFLoader +def arxiv_to_text(id: str, output_dir: str) -> str: + """Downloads and extracts single tar file from arXiv. + Combines the TeX components into a single file. + + Parameters + ---------- + id : str + id for the paper, numbers after "arXiv" in arXiv:xxxx.xxxxx + + output_dir : str + directory to save the output file + + Returns + ------- + output: str + output path to the downloaded TeX file + """ + + import arxiv + + outfile = f"{id}-{datetime.now():%Y-%m-%d-%H-%M}.tex" + download_filename = "downloaded-paper.tar.gz" + output_path = os.path.join(output_dir, outfile) + + paper = next(arxiv.Client().results(arxiv.Search(id_list=[id]))) + paper.download_source(filename=download_filename) + + with tarfile.open(download_filename) as tar: + tex_list = [] + for member in tar: + if member.isfile() and member.name.lower().endswith(".tex"): + tex_list.append(member.name) + tar.extract(member, path="") + + with open(output_path, "w") as w: + for f in tex_list: + with open(f) as tex: + w.write(tex.read()) + os.remove(f) + + os.remove(download_filename) + + return output_path + + # Uses pypdf which is used by PyPDFLoader from langchain def pdf_to_text(path): pages = PyPDFLoader(path) @@ -50,6 +97,7 @@ def path_to_doc(path): ".txt", ".html", ".pdf", + ".tex", # added for raw latex files from arxiv } diff --git a/packages/jupyter-ai/pyproject.toml 
b/packages/jupyter-ai/pyproject.toml index ec7959975..4328c9aca 100644 --- a/packages/jupyter-ai/pyproject.toml +++ b/packages/jupyter-ai/pyproject.toml @@ -53,7 +53,7 @@ test = [ dev = ["jupyter_ai_magics[dev]"] -all = ["jupyter_ai_magics[all]", "pypdf"] +all = ["jupyter_ai_magics[all]", "pypdf", "arxiv"] [tool.hatch.version] source = "nodejs"