Backport PR #733: Handle single files, pdfs, errors from missing loader dependencies in `/learn` (#744)

Co-authored-by: Sanjiv Das <[email protected]>
jupyterlab · Apr 25, 2024 · d4dffa1 · d4dffa1
1 parent 4d77b00
commit d4dffa1
Show file tree

Hide file tree

Showing 3 changed files with 13 additions and 13 deletions.
diff --git a/packages/jupyter-ai/jupyter_ai/chat_handlers/learn.py b/packages/jupyter-ai/jupyter_ai/chat_handlers/learn.py
@@ -125,14 +125,16 @@ async def process_message(self, message: HumanChatMessage):
         if args.verbose:
             self.reply(f"Loading and splitting files for {load_path}", message)
 
-        await self.learn_dir(
-            load_path, args.chunk_size, args.chunk_overlap, args.all_files
-        )
-        self.save()
-
-        response = f"""🎉 I have learned from documents at **{load_path}** and
-        I am ready to answer questions about them. You can ask questions about these
-        documents by starting your message with **/ask**."""
+        try:
+            await self.learn_dir(
+                load_path, args.chunk_size, args.chunk_overlap, args.all_files
+            )
+        except Exception as e:
+            response = f"""Learn documents in **{load_path}** failed. {str(e)}."""
+        else:
+            self.save()
+            response = f"""🎉 I have learned documents at **{load_path}** and I am ready to answer questions about them.
+                You can ask questions about these docs by prefixing your message with **/ask**."""
         self.reply(response, message)
 
     def _build_list_response(self):
@@ -163,7 +165,6 @@ async def learn_dir(
 
         delayed = split(path, all_files, splitter=splitter)
         doc_chunks = await dask_client.compute(delayed)
-
         em_provider_cls, em_provider_args = self.get_embedding_provider()
         delayed = get_embeddings(doc_chunks, em_provider_cls, em_provider_args)
         embedding_records = await dask_client.compute(delayed)

diff --git a/packages/jupyter-ai/jupyter_ai/document_loaders/directory.py b/packages/jupyter-ai/jupyter_ai/document_loaders/directory.py
@@ -8,13 +8,12 @@
 from langchain.document_loaders import PyPDFLoader
 from langchain.schema import Document
 from langchain.text_splitter import TextSplitter
-from pypdf import PdfReader
 
 
 # Uses pypdf which is used by PyPDFLoader from langchain
 def pdf_to_text(path):
-    reader = PdfReader(path)
-    text = "\n \n".join([page.extract_text() for page in reader.pages])
+    pages = PyPDFLoader(path)
+    text = "\n \n".join([page.page_content for page in pages.load_and_split()])
     return text
 
 

diff --git a/packages/jupyter-ai/pyproject.toml b/packages/jupyter-ai/pyproject.toml
@@ -54,7 +54,7 @@ test = [
 
 dev = ["jupyter_ai_magics[dev]"]
 
-all = ["jupyter_ai_magics[all]"]
+all = ["jupyter_ai_magics[all]", "pypdf"]
 
 [tool.hatch.version]
 source = "nodejs"