# Post-patch view of
# packages/jupyter-ai/jupyter_ai/document_loaders/directory.py,
# rebuilt from the hunks of a unified diff whose line breaks had been lost.
# Regions the diff does not show are marked "outside the visible hunks".

# NOTE(review): hashlib, os and Path are used by the code below, but their
# import lines sit above the first hunk -- confirm they match the real file.
import hashlib
import os
from pathlib import Path

import dask
from langchain.document_loaders import PyPDFLoader
from langchain.schema import Document
from langchain.text_splitter import TextSplitter
from pypdf import PdfReader


# Uses pypdf which is used by PyPDFLoader from langchain
def pdf_to_text(path):
    """Return the extracted text of every page of the PDF at *path*.

    The text of each page is preceded by a separator made of a newline, a
    space and a newline, so the result starts with that separator whenever
    the PDF has at least one page. A PDF without pages yields "".
    """
    reader = PdfReader(path)
    # Join once instead of growing a string page by page; the output is the
    # same as prefixing every page's text with the separator.
    return "".join("\n \n" + page.extract_text() for page in reader.pages)


def path_to_doc(path):
    """Read the file at *path* and wrap its text in a ``Document``.

    PDFs are read through ``pdf_to_text``; every other file is read as text.
    The metadata records the path, the SHA-256 digest of the UTF-8 encoded
    text, and the file extension.

    *path* must expose ``.suffix`` (it is used for the metadata), e.g. a
    ``pathlib.Path``.
    """
    # Decide how to read before opening anything: a PDF is parsed by pypdf,
    # so no text-mode handle is needed for it.
    if path.suffix == ".pdf":
        text = pdf_to_text(path)
    else:
        with open(str(path)) as f:
            text = f.read()

    m = hashlib.sha256()
    m.update(text.encode("utf-8"))
    metadata = {"path": str(path), "sha256": m.digest(), "extension": path.suffix}
    # NOTE(review): the diff hunk ends at the metadata line; the return below
    # is reconstructed from the function name and the Document import --
    # confirm against the full file.
    return Document(page_content=text, metadata=metadata)


# NOTE(review): only the tail of this set literal is visible in the diff. The
# name is taken from its use in split(); the entries that precede ".jsx" lie
# outside the visible hunks and must be restored from the full file.
SUPPORTED_EXTS = {
    # ... earlier entries outside the visible hunks ...
    ".jsx",
    ".tsx",
    ".txt",
    ".html",
    ".pdf",
}


# EXCLUDE_DIRS, split_document() and flatten(*chunk_lists) are referenced
# below but defined outside the visible hunks.


def _iter_candidate_files(path, all_files):
    """Yield a ``Path`` for every file that *path* designates.

    A path to a single file yields just that file. A directory is walked
    recursively; unless *all_files* is true, hidden files, hidden
    directories and directories listed in ``EXCLUDE_DIRS`` are skipped.
    """
    if os.path.isfile(path):
        yield Path(path)
        return

    for dirpath, subdirs, filenames in os.walk(path):
        if not all_files:
            # Assigning to the slice prunes the walk in place, so os.walk
            # never descends into the filtered-out directories.
            subdirs[:] = [
                d for d in subdirs if not (d.startswith(".") or d in EXCLUDE_DIRS)
            ]
            filenames = [f for f in filenames if not f.startswith(".")]

        for filename in filenames:
            yield Path(os.path.join(dirpath, filename))


def split(path, all_files: bool, splitter):
    """Build a delayed task that loads and splits the files under *path*.

    *path* may point to a single file or to a directory. Files whose suffix
    is not in ``SUPPORTED_EXTS`` are ignored. Each remaining file becomes a
    delayed ``path_to_doc`` call followed by a delayed ``split_document``
    call with *splitter*; the per-file results are combined by a delayed
    ``flatten`` call, which is returned without being computed.
    """
    chunks = []

    # Both the single-file case and the directory walk go through the same
    # loop, so every visited directory contributes its files.
    for filepath in _iter_candidate_files(path, all_files):
        if filepath.suffix not in SUPPORTED_EXTS:
            continue

        document = dask.delayed(path_to_doc)(filepath)
        chunk = dask.delayed(split_document)(document, splitter)
        chunks.append(chunk)

    flattened_chunks = dask.delayed(flatten)(*chunks)
    return flattened_chunks