Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Handle Single Files and also enable html, pdf file formats for /learn #712

Merged
merged 12 commits into from
Apr 9, 2024
54 changes: 38 additions & 16 deletions packages/jupyter-ai/jupyter_ai/document_loaders/directory.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,25 @@
from typing import List

import dask
from langchain.document_loaders import PyPDFLoader
from langchain.schema import Document
from langchain.text_splitter import TextSplitter
from pypdf import PdfReader


# Uses pypdf which is used by PyPDFLoader from langchain
def pdf_to_text(path):
reader = PdfReader(path)
text = "\n \n".join([page.extract_text() for page in reader.pages])
return text


def path_to_doc(path):
with open(str(path)) as f:
text = f.read()
if os.path.splitext(path)[1].lower() == ".pdf":
text = pdf_to_text(path)
else:
text = f.read()
m = hashlib.sha256()
m.update(text.encode("utf-8"))
metadata = {"path": str(path), "sha256": m.digest(), "extension": path.suffix}
Expand All @@ -37,6 +49,8 @@ def path_to_doc(path):
".jsx",
".tsx",
".txt",
".html",
".pdf",
srdas marked this conversation as resolved.
Show resolved Hide resolved
}


Expand All @@ -51,21 +65,29 @@ def flatten(*chunk_lists):
def split(path, all_files: bool, splitter):
chunks = []

for dir, subdirs, filenames in os.walk(path):
# Filter out hidden filenames, hidden directories, and excluded directories,
# unless "all files" are requested
if not all_files:
subdirs[:] = [d for d in subdirs if not (d[0] == "." or d in EXCLUDE_DIRS)]
filenames = [f for f in filenames if not f[0] == "."]

for filename in filenames:
filepath = Path(os.path.join(dir, filename))
if filepath.suffix not in SUPPORTED_EXTS:
continue

document = dask.delayed(path_to_doc)(filepath)
chunk = dask.delayed(split_document)(document, splitter)
chunks.append(chunk)
# Check if the path points to a single file
if os.path.isfile(path):
dir = os.path.dirname(path)
filenames = [os.path.basename(path)]
else:
for dir, subdirs, filenames in os.walk(path):
# Filter out hidden filenames, hidden directories, and excluded directories,
# unless "all files" are requested
if not all_files:
subdirs[:] = [
d for d in subdirs if not (d[0] == "." or d in EXCLUDE_DIRS)
]
filenames = [f for f in filenames if not f[0] == "."]

for filename in filenames:
filepath = Path(os.path.join(dir, filename))
# Lower case everything to make sure file extension comparisons are not case sensitive
if filepath.suffix.lower() not in {j.lower() for j in SUPPORTED_EXTS}:
continue

document = dask.delayed(path_to_doc)(filepath)
chunk = dask.delayed(split_document)(document, splitter)
chunks.append(chunk)

flattened_chunks = dask.delayed(flatten)(*chunks)
return flattened_chunks
Expand Down
1 change: 1 addition & 0 deletions packages/jupyter-ai/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ dependencies = [
"typing_extensions>=4.5.0",
"traitlets>=5.0",
"deepmerge>=1.0",
"pypdf==4.1.0",
]

dynamic = ["version", "description", "authors", "urls", "keywords"]
Expand Down
Loading