Skip to content

Commit

Permalink
Merge branch 'learn_more_file_formats' of https://github.com/srdas/jupyter-ai into learn_more_file_formats
Browse files: browse the repository at this point in the history
  • Loading branch information
srdas committed Apr 2, 2024
2 parents 5950c93 + ed4a052 commit f045924
Showing 1 changed file with 8 additions and 4 deletions.
12 changes: 8 additions & 4 deletions packages/jupyter-ai/jupyter_ai/document_loaders/directory.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,12 @@
from typing import List

import dask
from langchain.document_loaders import PyPDFLoader
from langchain.schema import Document
from langchain.text_splitter import TextSplitter
from langchain.document_loaders import PyPDFLoader
from pypdf import PdfReader


# Uses pypdf directly; it is the same library that langchain's PyPDFLoader uses.
def pdf_to_text(path):
reader = PdfReader(path)
Expand All @@ -19,12 +20,13 @@ def pdf_to_text(path):
text = text + "\n \n" + page.extract_text()
return text


def path_to_doc(path):
with open(str(path)) as f:
if os.path.splitext(path)[1] == ".pdf":
text = pdf_to_text(path)
else:
text = f.read()
else:
text = f.read()
m = hashlib.sha256()
m.update(text.encode("utf-8"))
metadata = {"path": str(path), "sha256": m.digest(), "extension": path.suffix}
Expand Down Expand Up @@ -75,7 +77,9 @@ def split(path, all_files: bool, splitter):
# Filter out hidden filenames, hidden directories, and excluded directories,
# unless "all files" are requested
if not all_files:
subdirs[:] = [d for d in subdirs if not (d[0] == "." or d in EXCLUDE_DIRS)]
subdirs[:] = [
d for d in subdirs if not (d[0] == "." or d in EXCLUDE_DIRS)
]
filenames = [f for f in filenames if not f[0] == "."]

for filename in filenames:
Expand Down

0 comments on commit f045924

Please sign in to comment.