Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Handle single files, pdfs, errors from missing loader dependencies in /learn #733

Merged
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 10 additions & 8 deletions packages/jupyter-ai/jupyter_ai/chat_handlers/learn.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,13 +118,16 @@ async def process_message(self, message: HumanChatMessage):
if args.verbose:
self.reply(f"Loading and splitting files for {load_path}", message)

await self.learn_dir(
load_path, args.chunk_size, args.chunk_overlap, args.all_files
)
self.save()

response = f"""🎉 I have learned documents at **{load_path}** and I am ready to answer questions about them.
You can ask questions about these docs by prefixing your message with **/ask**."""
try:
await self.learn_dir(
load_path, args.chunk_size, args.chunk_overlap, args.all_files
)
except Exception as e:
response = f"""Learn documents in **{load_path}** failed. Additional packages needed: {str(e)}."""
3coins marked this conversation as resolved.
Show resolved Hide resolved
else:
self.save()
response = f"""🎉 I have learned documents at **{load_path}** and I am ready to answer questions about them.
You can ask questions about these docs by prefixing your message with **/ask**."""
self.reply(response, message)

def _build_list_response(self):
Expand Down Expand Up @@ -155,7 +158,6 @@ async def learn_dir(

delayed = split(path, all_files, splitter=splitter)
doc_chunks = await dask_client.compute(delayed)

em_provider_cls, em_provider_args = self.get_embedding_provider()
delayed = get_embeddings(doc_chunks, em_provider_cls, em_provider_args)
embedding_records = await dask_client.compute(delayed)
Expand Down
5 changes: 2 additions & 3 deletions packages/jupyter-ai/jupyter_ai/document_loaders/directory.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,13 +8,12 @@
from langchain.document_loaders import PyPDFLoader
from langchain.schema import Document
from langchain.text_splitter import TextSplitter
from pypdf import PdfReader


# Uses pypdf which is used by PyPDFLoader from langchain
def pdf_to_text(path):
    """Extract the text content of a PDF file as a single string.

    Parameters
    ----------
    path : str
        Filesystem path to the PDF file to read.

    Returns
    -------
    str
        The text of every page, joined with a blank-line separator
        ("\\n \\n") between pages.

    Notes
    -----
    Delegates parsing to langchain's ``PyPDFLoader`` (which wraps
    ``pypdf``), so a missing ``pypdf`` dependency surfaces as an
    ImportError from the loader.
    """
    # NOTE(review): the scraped diff showed a superseded PdfReader-based
    # body above this one; only the PyPDFLoader version is kept here.
    loader = PyPDFLoader(path)
    return "\n \n".join(page.page_content for page in loader.load_and_split())


Expand Down
2 changes: 1 addition & 1 deletion packages/jupyter-ai/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ test = [

dev = ["jupyter_ai_magics[dev]"]

all = ["jupyter_ai_magics[all]"]
all = ["jupyter_ai_magics[all]", "pypdf"]

[tool.hatch.version]
source = "nodejs"
Expand Down
Loading