Backport PR jupyterlab#712: Handle Single Files and also enable html, pdf file formats for /learn
meeseeksmachine · Apr 10, 2024 · 26efa8f · 26efa8f
1 parent f18be1d
commit 26efa8f
Show file tree

Hide file tree

Showing 2 changed files with 39 additions and 16 deletions.
diff --git a/packages/jupyter-ai/jupyter_ai/document_loaders/directory.py b/packages/jupyter-ai/jupyter_ai/document_loaders/directory.py
@@ -5,13 +5,25 @@
 from typing import List
 
 import dask
+from langchain.document_loaders import PyPDFLoader
 from langchain.schema import Document
 from langchain.text_splitter import TextSplitter
+from pypdf import PdfReader
+
+
+# Uses pypdf which is used by PyPDFLoader from langchain
+def pdf_to_text(path):
+    reader = PdfReader(path)
+    text = "\n \n".join([page.extract_text() for page in reader.pages])
+    return text
 
 
 def path_to_doc(path):
     with open(str(path)) as f:
-        text = f.read()
+        if os.path.splitext(path)[1].lower() == ".pdf":
+            text = pdf_to_text(path)
+        else:
+            text = f.read()
         m = hashlib.sha256()
         m.update(text.encode("utf-8"))
         metadata = {"path": str(path), "sha256": m.digest(), "extension": path.suffix}
@@ -37,6 +49,8 @@ def path_to_doc(path):
     ".jsx",
     ".tsx",
     ".txt",
+    ".html",
+    ".pdf",
 }
 
 
@@ -51,21 +65,29 @@ def flatten(*chunk_lists):
 def split(path, all_files: bool, splitter):
     chunks = []
 
-    for dir, subdirs, filenames in os.walk(path):
-        # Filter out hidden filenames, hidden directories, and excluded directories,
-        # unless "all files" are requested
-        if not all_files:
-            subdirs[:] = [d for d in subdirs if not (d[0] == "." or d in EXCLUDE_DIRS)]
-            filenames = [f for f in filenames if not f[0] == "."]
-
-        for filename in filenames:
-            filepath = Path(os.path.join(dir, filename))
-            if filepath.suffix not in SUPPORTED_EXTS:
-                continue
-
-            document = dask.delayed(path_to_doc)(filepath)
-            chunk = dask.delayed(split_document)(document, splitter)
-            chunks.append(chunk)
+    # Check if the path points to a single file
+    if os.path.isfile(path):
+        dir = os.path.dirname(path)
+        filenames = [os.path.basename(path)]
+    else:
+        for dir, subdirs, filenames in os.walk(path):
+            # Filter out hidden filenames, hidden directories, and excluded directories,
+            # unless "all files" are requested
+            if not all_files:
+                subdirs[:] = [
+                    d for d in subdirs if not (d[0] == "." or d in EXCLUDE_DIRS)
+                ]
+                filenames = [f for f in filenames if not f[0] == "."]
+
+    for filename in filenames:
+        filepath = Path(os.path.join(dir, filename))
+        # Lower case everything to make sure file extension comparisons are not case sensitive
+        if filepath.suffix.lower() not in {j.lower() for j in SUPPORTED_EXTS}:
+            continue
+
+        document = dask.delayed(path_to_doc)(filepath)
+        chunk = dask.delayed(split_document)(document, splitter)
+        chunks.append(chunk)
 
     flattened_chunks = dask.delayed(flatten)(*chunks)
     return flattened_chunks

diff --git a/packages/jupyter-ai/pyproject.toml b/packages/jupyter-ai/pyproject.toml
@@ -32,6 +32,7 @@ dependencies = [
     "typing_extensions>=4.5.0",
     "traitlets>=5.0",
     "deepmerge>=1.0",
+    "pypdf==4.1.0",
 ]
 
 dynamic = ["version", "description", "authors", "urls", "keywords"]