diff --git a/packages/jupyter-ai/jupyter_ai/chat_handlers/learn.py b/packages/jupyter-ai/jupyter_ai/chat_handlers/learn.py index 4ee28d3a4..d05917374 100644 --- a/packages/jupyter-ai/jupyter_ai/chat_handlers/learn.py +++ b/packages/jupyter-ai/jupyter_ai/chat_handlers/learn.py @@ -125,14 +125,16 @@ async def process_message(self, message: HumanChatMessage): if args.verbose: self.reply(f"Loading and splitting files for {load_path}", message) - await self.learn_dir( - load_path, args.chunk_size, args.chunk_overlap, args.all_files - ) - self.save() - - response = f"""🎉 I have learned from documents at **{load_path}** and - I am ready to answer questions about them. You can ask questions about these - documents by starting your message with **/ask**.""" + try: + await self.learn_dir( + load_path, args.chunk_size, args.chunk_overlap, args.all_files + ) + except Exception as e: + response = f"""Learn documents in **{load_path}** failed. {str(e)}.""" + else: + self.save() + response = f"""🎉 I have learned documents at **{load_path}** and I am ready to answer questions about them. + You can ask questions about these docs by prefixing your message with **/ask**.""" self.reply(response, message) def _build_list_response(self): @@ -163,7 +165,6 @@ async def learn_dir( delayed = split(path, all_files, splitter=splitter) doc_chunks = await dask_client.compute(delayed) - em_provider_cls, em_provider_args = self.get_embedding_provider() delayed = get_embeddings(doc_chunks, em_provider_cls, em_provider_args) embedding_records = await dask_client.compute(delayed) diff --git a/packages/jupyter-ai/jupyter_ai/document_loaders/directory.py b/packages/jupyter-ai/jupyter_ai/document_loaders/directory.py index 6607d97d6..561f00a1c 100644 --- a/packages/jupyter-ai/jupyter_ai/document_loaders/directory.py +++ b/packages/jupyter-ai/jupyter_ai/document_loaders/directory.py @@ -8,13 +8,12 @@ from langchain.document_loaders import PyPDFLoader from langchain.schema import Document from langchain.text_splitter import TextSplitter -from pypdf import PdfReader # Uses pypdf which is used by PyPDFLoader from langchain def pdf_to_text(path): - reader = PdfReader(path) - text = "\n \n".join([page.extract_text() for page in reader.pages]) + pages = PyPDFLoader(path) + text = "\n \n".join([page.page_content for page in pages.load_and_split()]) return text diff --git a/packages/jupyter-ai/pyproject.toml b/packages/jupyter-ai/pyproject.toml index 04d4109b0..9a75c2cae 100644 --- a/packages/jupyter-ai/pyproject.toml +++ b/packages/jupyter-ai/pyproject.toml @@ -54,7 +54,7 @@ test = [ dev = ["jupyter_ai_magics[dev]"] -all = ["jupyter_ai_magics[all]"] +all = ["jupyter_ai_magics[all]", "pypdf"] [tool.hatch.version] source = "nodejs"