Skip to content

Commit

Permalink
Modified split to support input file and directory
Browse files Browse the repository at this point in the history
  • Loading branch information
apurvakhatri committed Mar 1, 2024
1 parent 642ac53 commit fee0ff9
Showing 1 changed file with 21 additions and 15 deletions.
36 changes: 21 additions & 15 deletions packages/jupyter-ai/jupyter_ai/document_loaders/directory.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,21 +51,27 @@ def flatten(*chunk_lists):
def split(path, all_files: bool, splitter):
chunks = []

for dir, subdirs, filenames in os.walk(path):
# Filter out hidden filenames, hidden directories, and excluded directories,
# unless "all files" are requested
if not all_files:
subdirs[:] = [d for d in subdirs if not (d[0] == "." or d in EXCLUDE_DIRS)]
filenames = [f for f in filenames if not f[0] == "."]

for filename in filenames:
filepath = Path(os.path.join(dir, filename))
if filepath.suffix not in SUPPORTED_EXTS:
continue

document = dask.delayed(path_to_doc)(filepath)
chunk = dask.delayed(split_document)(document, splitter)
chunks.append(chunk)
if os.path.isfile(path):
filenames = []
filenames.append(os.path.basename(path))
dir = os.path.dirname(path)

if os.path.isdir(path):
for dir, subdirs, filenames in os.walk(path):
# Filter out hidden filenames, hidden directories, and excluded directories,
# unless "all files" are requested
if not all_files:
subdirs[:] = [d for d in subdirs if not (d[0] == "." or d in EXCLUDE_DIRS)]
filenames = [f for f in filenames if not f[0] == "."]

for filename in filenames:
filepath = Path(os.path.join(dir, filename))
if filepath.suffix not in SUPPORTED_EXTS:
continue

document = dask.delayed(path_to_doc)(filepath)
chunk = dask.delayed(split_document)(document, splitter)
chunks.append(chunk)

flattened_chunks = dask.delayed(flatten)(*chunks)
return flattened_chunks
Expand Down

0 comments on commit fee0ff9

Please sign in to comment.