Skip to content

Commit

Permalink
Update directory.py to add new file formats
Browse files Browse the repository at this point in the history
1.  Added single file functionality
2.  Added HTML files
3.  Added PDF files
  • Loading branch information
srdas committed Apr 2, 2024
1 parent 144cc9b commit 3dda95b
Showing 1 changed file with 37 additions and 17 deletions.
54 changes: 37 additions & 17 deletions packages/jupyter-ai/jupyter_ai/document_loaders/directory.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,24 @@
import dask
from langchain.schema import Document
from langchain.text_splitter import TextSplitter

from langchain.document_loaders import PyPDFLoader
from pypdf import PdfReader

# Uses pypdf directly; it is the same library that langchain's PyPDFLoader relies on.
def pdf_to_text(path):
    """Return the text of every page of the PDF at ``path`` as one string.

    The text extracted from each page is preceded by a separator made of a
    newline, a space, and a newline, so a non-empty PDF yields a string that
    starts with that separator and a PDF with no pages yields an empty string.

    Args:
        path: Location of the PDF file; passed unchanged to ``PdfReader``.

    Returns:
        The concatenated page texts.
    """
    reader = PdfReader(path)
    # Join once instead of growing the string with ``+`` on every page, which
    # re-copies the accumulated text each iteration.
    return "".join("\n \n" + page.extract_text() for page in reader.pages)

def path_to_doc(path):
with open(str(path)) as f:
text = f.read()
if os.path.splitext(path)[1] == ".pdf":
text = pdf_to_text(path)
else:
text = f.read()
m = hashlib.sha256()
m.update(text.encode("utf-8"))
metadata = {"path": str(path), "sha256": m.digest(), "extension": path.suffix}
Expand All @@ -37,6 +50,8 @@ def path_to_doc(path):
".jsx",
".tsx",
".txt",
".html",
".pdf",
}


Expand All @@ -51,21 +66,26 @@ def flatten(*chunk_lists):
def _collect_filepaths(path, all_files: bool):
    """Return the ``Path`` of every supported file found at ``path``.

    ``path`` may name a single file or a directory. Directories are walked
    recursively; unless ``all_files`` is true, hidden files, hidden
    directories, and directories listed in ``EXCLUDE_DIRS`` are skipped.
    In both cases only files whose suffix is in ``SUPPORTED_EXTS`` are kept.
    """
    if os.path.isfile(path):
        # A single file is taken as-is; only the extension filter below applies.
        candidates = [Path(path)]
    else:
        candidates = []
        for dirpath, subdirs, filenames in os.walk(path):
            if not all_files:
                # Assign in place so os.walk does not descend into the
                # pruned directories.
                subdirs[:] = [
                    d for d in subdirs if not (d[0] == "." or d in EXCLUDE_DIRS)
                ]
                filenames = [f for f in filenames if not f[0] == "."]
            # Collect inside the walk so files from every directory are kept,
            # not just those of the last directory visited.
            candidates.extend(Path(os.path.join(dirpath, f)) for f in filenames)

    return [fp for fp in candidates if fp.suffix in SUPPORTED_EXTS]


def split(path, all_files: bool, splitter):
    """Build a delayed computation that loads and splits the files at ``path``.

    Args:
        path: A single file or a directory to walk recursively.
        all_files: When false, hidden files, hidden directories, and
            directories in ``EXCLUDE_DIRS`` are ignored while walking.
        splitter: Passed to ``split_document`` together with each document.

    Returns:
        A ``dask.delayed`` object wrapping ``flatten`` applied to the delayed
        per-file results of ``split_document``. Nothing is read from disk
        until the returned object is computed.
    """
    chunks = []
    for filepath in _collect_filepaths(path, all_files):
        document = dask.delayed(path_to_doc)(filepath)
        chunk = dask.delayed(split_document)(document, splitter)
        chunks.append(chunk)

    flattened_chunks = dask.delayed(flatten)(*chunks)
    return flattened_chunks
Expand Down

0 comments on commit 3dda95b

Please sign in to comment.