From 9079d729d1ea985efee605d19c18881be94e804b Mon Sep 17 00:00:00 2001 From: Sanjiv Das Date: Tue, 16 Apr 2024 02:07:15 -0700 Subject: [PATCH 1/5] Handle single files, pdfs, errors (1) Enables handling single files, not just directories. (2) Learns PDFs with langchain's PyPDFLoader. (3) Gives a clean error w/o traceback when the file type that is being handled needs additional packages. --- .../jupyter_ai/chat_handlers/learn.py | 38 +++++++++++-------- .../jupyter_ai/document_loaders/directory.py | 5 +-- packages/jupyter-ai/pyproject.toml | 2 +- 3 files changed, 26 insertions(+), 19 deletions(-) diff --git a/packages/jupyter-ai/jupyter_ai/chat_handlers/learn.py b/packages/jupyter-ai/jupyter_ai/chat_handlers/learn.py index e1a22c9cc..c29669a50 100644 --- a/packages/jupyter-ai/jupyter_ai/chat_handlers/learn.py +++ b/packages/jupyter-ai/jupyter_ai/chat_handlers/learn.py @@ -59,6 +59,7 @@ def __init__(self, *args, **kwargs): self.index = None self.metadata = IndexMetadata(dirs=[]) self.prev_em_id = None + self.missing_dependency_error = None if not os.path.exists(INDEX_SAVE_DIR): os.makedirs(INDEX_SAVE_DIR) @@ -123,8 +124,12 @@ async def process_message(self, message: HumanChatMessage): ) self.save() - response = f"""🎉 I have learned documents at **{load_path}** and I am ready to answer questions about them. - You can ask questions about these docs by prefixing your message with **/ask**.""" + if self.missing_dependency_error != None: + response = f"""Learn documents in **{load_path}** failed. Additional + packages needed: {self.missing_dependency_error}.""" + else: + response = f"""🎉 I have learned documents at **{load_path}** and I am ready to answer questions about them. 
+ You can ask questions about these docs by prefixing your message with **/ask**.""" self.reply(response, message) def _build_list_response(self): @@ -153,19 +158,22 @@ async def learn_dir( default_splitter=RecursiveCharacterTextSplitter(**splitter_kwargs), ) - delayed = split(path, all_files, splitter=splitter) - doc_chunks = await dask_client.compute(delayed) - - em_provider_cls, em_provider_args = self.get_embedding_provider() - delayed = get_embeddings(doc_chunks, em_provider_cls, em_provider_args) - embedding_records = await dask_client.compute(delayed) - if self.index: - self.index.add_embeddings(*embedding_records) - else: - self.create(*embedding_records) - - self._add_dir_to_metadata(path, chunk_size, chunk_overlap) - self.prev_em_id = em_provider_cls.id + ":" + em_provider_args["model_id"] + try: + delayed = split(path, all_files, splitter=splitter) + doc_chunks = await dask_client.compute(delayed) + em_provider_cls, em_provider_args = self.get_embedding_provider() + delayed = get_embeddings(doc_chunks, em_provider_cls, em_provider_args) + embedding_records = await dask_client.compute(delayed) + if self.index: + self.index.add_embeddings(*embedding_records) + else: + self.create(*embedding_records) + + self._add_dir_to_metadata(path, chunk_size, chunk_overlap) + self.prev_em_id = em_provider_cls.id + ":" + em_provider_args["model_id"] + except Exception as e: + self.missing_dependency_error = str(e) + return def _add_dir_to_metadata(self, path: str, chunk_size: int, chunk_overlap: int): dirs = self.metadata.dirs diff --git a/packages/jupyter-ai/jupyter_ai/document_loaders/directory.py b/packages/jupyter-ai/jupyter_ai/document_loaders/directory.py index 6607d97d6..561f00a1c 100644 --- a/packages/jupyter-ai/jupyter_ai/document_loaders/directory.py +++ b/packages/jupyter-ai/jupyter_ai/document_loaders/directory.py @@ -8,13 +8,12 @@ from langchain.document_loaders import PyPDFLoader from langchain.schema import Document from langchain.text_splitter import 
TextSplitter -from pypdf import PdfReader # Uses pypdf which is used by PyPDFLoader from langchain def pdf_to_text(path): - reader = PdfReader(path) - text = "\n \n".join([page.extract_text() for page in reader.pages]) + pages = PyPDFLoader(path) + text = "\n \n".join([page.page_content for page in pages.load_and_split()]) return text diff --git a/packages/jupyter-ai/pyproject.toml b/packages/jupyter-ai/pyproject.toml index 739175a26..9f8879767 100644 --- a/packages/jupyter-ai/pyproject.toml +++ b/packages/jupyter-ai/pyproject.toml @@ -54,7 +54,7 @@ test = [ dev = ["jupyter_ai_magics[dev]"] -all = ["jupyter_ai_magics[all]"] +all = ["jupyter_ai_magics[all]", pypdf] [tool.hatch.version] source = "nodejs" From 1d0f32bec1c39d0f0f96909ee38a1ea3f3d32956 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 16 Apr 2024 09:26:33 +0000 Subject: [PATCH 2/5] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- packages/jupyter-ai/jupyter_ai/chat_handlers/learn.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/jupyter-ai/jupyter_ai/chat_handlers/learn.py b/packages/jupyter-ai/jupyter_ai/chat_handlers/learn.py index c29669a50..476a33660 100644 --- a/packages/jupyter-ai/jupyter_ai/chat_handlers/learn.py +++ b/packages/jupyter-ai/jupyter_ai/chat_handlers/learn.py @@ -125,7 +125,7 @@ async def process_message(self, message: HumanChatMessage): self.save() if self.missing_dependency_error != None: - response = f"""Learn documents in **{load_path}** failed. Additional + response = f"""Learn documents in **{load_path}** failed. Additional packages needed: {self.missing_dependency_error}.""" else: response = f"""🎉 I have learned documents at **{load_path}** and I am ready to answer questions about them. 
From 7b2c04d65b02b3a472066b2c422f81aa996cbf97 Mon Sep 17 00:00:00 2001 From: Sanjiv Das Date: Tue, 16 Apr 2024 12:59:35 -0700 Subject: [PATCH 3/5] error handling for missing packages in learn.py Removed the extra attribute and additional response comments based on feedback from Piyush Jain and Andrii Ieroshenko --- .../jupyter_ai/chat_handlers/learn.py | 44 ++++++++----------- 1 file changed, 19 insertions(+), 25 deletions(-) diff --git a/packages/jupyter-ai/jupyter_ai/chat_handlers/learn.py b/packages/jupyter-ai/jupyter_ai/chat_handlers/learn.py index c29669a50..1610f4f73 100644 --- a/packages/jupyter-ai/jupyter_ai/chat_handlers/learn.py +++ b/packages/jupyter-ai/jupyter_ai/chat_handlers/learn.py @@ -59,7 +59,6 @@ def __init__(self, *args, **kwargs): self.index = None self.metadata = IndexMetadata(dirs=[]) self.prev_em_id = None - self.missing_dependency_error = None if not os.path.exists(INDEX_SAVE_DIR): os.makedirs(INDEX_SAVE_DIR) @@ -119,15 +118,14 @@ async def process_message(self, message: HumanChatMessage): if args.verbose: self.reply(f"Loading and splitting files for {load_path}", message) - await self.learn_dir( - load_path, args.chunk_size, args.chunk_overlap, args.all_files - ) - self.save() - - if self.missing_dependency_error != None: - response = f"""Learn documents in **{load_path}** failed. Additional - packages needed: {self.missing_dependency_error}.""" + try: + await self.learn_dir( + load_path, args.chunk_size, args.chunk_overlap, args.all_files + ) + except Exception as e: + response = f"""Learn documents in **{load_path}** failed. Additional packages needed: {str(e)}.""" else: + self.save() response = f"""🎉 I have learned documents at **{load_path}** and I am ready to answer questions about them. 
You can ask questions about these docs by prefixing your message with **/ask**.""" self.reply(response, message) @@ -158,22 +156,18 @@ async def learn_dir( default_splitter=RecursiveCharacterTextSplitter(**splitter_kwargs), ) - try: - delayed = split(path, all_files, splitter=splitter) - doc_chunks = await dask_client.compute(delayed) - em_provider_cls, em_provider_args = self.get_embedding_provider() - delayed = get_embeddings(doc_chunks, em_provider_cls, em_provider_args) - embedding_records = await dask_client.compute(delayed) - if self.index: - self.index.add_embeddings(*embedding_records) - else: - self.create(*embedding_records) - - self._add_dir_to_metadata(path, chunk_size, chunk_overlap) - self.prev_em_id = em_provider_cls.id + ":" + em_provider_args["model_id"] - except Exception as e: - self.missing_dependency_error = str(e) - return + delayed = split(path, all_files, splitter=splitter) + doc_chunks = await dask_client.compute(delayed) + em_provider_cls, em_provider_args = self.get_embedding_provider() + delayed = get_embeddings(doc_chunks, em_provider_cls, em_provider_args) + embedding_records = await dask_client.compute(delayed) + if self.index: + self.index.add_embeddings(*embedding_records) + else: + self.create(*embedding_records) + + self._add_dir_to_metadata(path, chunk_size, chunk_overlap) + self.prev_em_id = em_provider_cls.id + ":" + em_provider_args["model_id"] def _add_dir_to_metadata(self, path: str, chunk_size: int, chunk_overlap: int): dirs = self.metadata.dirs From 95145592f61ba71764ff04843e231e420af4c2fd Mon Sep 17 00:00:00 2001 From: Sanjiv Das Date: Tue, 16 Apr 2024 13:58:37 -0700 Subject: [PATCH 4/5] Amend error message for failure in learn.py Made the error message more generic as there are many different failure types. 
--- packages/jupyter-ai/jupyter_ai/chat_handlers/learn.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/jupyter-ai/jupyter_ai/chat_handlers/learn.py b/packages/jupyter-ai/jupyter_ai/chat_handlers/learn.py index 1610f4f73..7ac7506df 100644 --- a/packages/jupyter-ai/jupyter_ai/chat_handlers/learn.py +++ b/packages/jupyter-ai/jupyter_ai/chat_handlers/learn.py @@ -123,7 +123,7 @@ async def process_message(self, message: HumanChatMessage): load_path, args.chunk_size, args.chunk_overlap, args.all_files ) except Exception as e: - response = f"""Learn documents in **{load_path}** failed. Additional packages needed: {str(e)}.""" + response = f"""Learn documents in **{load_path}** failed. {str(e)}.""" else: self.save() response = f"""🎉 I have learned documents at **{load_path}** and I am ready to answer questions about them. From f2e7c26deb8da61cc0ce68fa0ed56f33f368944e Mon Sep 17 00:00:00 2001 From: Piyush Jain Date: Tue, 16 Apr 2024 14:45:38 -0700 Subject: [PATCH 5/5] Fixed build error. --- packages/jupyter-ai/pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/jupyter-ai/pyproject.toml b/packages/jupyter-ai/pyproject.toml index 9f8879767..f5eb5e98f 100644 --- a/packages/jupyter-ai/pyproject.toml +++ b/packages/jupyter-ai/pyproject.toml @@ -54,7 +54,7 @@ test = [ dev = ["jupyter_ai_magics[dev]"] -all = ["jupyter_ai_magics[all]", pypdf] +all = ["jupyter_ai_magics[all]", "pypdf"] [tool.hatch.version] source = "nodejs"