Skip to content

Commit

Permalink
Backport PR #733: Handle single files, pdfs, errors from missing loader dependencies in `/learn`
Browse files Browse the repository at this point in the history
  • Loading branch information
srdas authored and dlqqq committed Apr 25, 2024
1 parent 4d77b00 commit b934208
Show file tree
Hide file tree
Showing 3 changed files with 13 additions and 13 deletions.
19 changes: 10 additions & 9 deletions packages/jupyter-ai/jupyter_ai/chat_handlers/learn.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,14 +125,16 @@ async def process_message(self, message: HumanChatMessage):
if args.verbose:
self.reply(f"Loading and splitting files for {load_path}", message)

await self.learn_dir(
load_path, args.chunk_size, args.chunk_overlap, args.all_files
)
self.save()

response = f"""🎉 I have learned from documents at **{load_path}** and
I am ready to answer questions about them. You can ask questions about these
documents by starting your message with **/ask**."""
try:
await self.learn_dir(
load_path, args.chunk_size, args.chunk_overlap, args.all_files
)
except Exception as e:
response = f"""Learn documents in **{load_path}** failed. {str(e)}."""
else:
self.save()
response = f"""🎉 I have learned documents at **{load_path}** and I am ready to answer questions about them.
You can ask questions about these docs by prefixing your message with **/ask**."""
self.reply(response, message)

def _build_list_response(self):
Expand Down Expand Up @@ -163,7 +165,6 @@ async def learn_dir(

delayed = split(path, all_files, splitter=splitter)
doc_chunks = await dask_client.compute(delayed)

em_provider_cls, em_provider_args = self.get_embedding_provider()
delayed = get_embeddings(doc_chunks, em_provider_cls, em_provider_args)
embedding_records = await dask_client.compute(delayed)
Expand Down
5 changes: 2 additions & 3 deletions packages/jupyter-ai/jupyter_ai/document_loaders/directory.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,13 +8,12 @@
from langchain.document_loaders import PyPDFLoader
from langchain.schema import Document
from langchain.text_splitter import TextSplitter
from pypdf import PdfReader


# Uses langchain's PyPDFLoader (which wraps pypdf) so PDF handling stays
# consistent with the rest of the document-loading pipeline.
def pdf_to_text(path):
    """Extract the text content of the PDF at *path*.

    Each extracted chunk's text is joined with a blank-line separator so
    downstream splitters see clear boundaries.

    NOTE(review): ``load_and_split()`` splits with langchain's default text
    splitter, so chunks are not guaranteed to map 1:1 to PDF pages — confirm
    this granularity is acceptable for callers.
    """
    loader = PyPDFLoader(path)
    chunks = [doc.page_content for doc in loader.load_and_split()]
    return "\n \n".join(chunks)


Expand Down
2 changes: 1 addition & 1 deletion packages/jupyter-ai/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ test = [

dev = ["jupyter_ai_magics[dev]"]

all = ["jupyter_ai_magics[all]"]
all = ["jupyter_ai_magics[all]", "pypdf"]

[tool.hatch.version]
source = "nodejs"
Expand Down

0 comments on commit b934208

Please sign in to comment.