Skip to content

Commit

Permalink
cleanup
Browse files — browse the repository at this point in the history
  • Loading branch information
andrewfulton9 committed Sep 12, 2024
1 parent 40f3ea8 commit 891c8ae
Show file tree
Hide file tree
Showing 2 changed files with 22 additions and 13 deletions.
28 changes: 15 additions & 13 deletions packages/jupyter-ai/jupyter_ai/document_loaders/directory.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,16 +121,20 @@ def collect_filepaths(path, all_files: bool):
if os.path.isfile(path):
filepaths = [Path(path)]
else:
filepaths = []
for dir, subdirs, filenames in os.walk(path):
# Filter out hidden filenames, hidden directories, and excluded directories,
# unless "all files" are requested
if not all_files:
subdirs[:] = [
d for d in subdirs if not (d[0] == "." or d in EXCLUDE_DIRS)
]
filenames = [f for f in filenames if not f[0] == "."]
filepaths.extend([Path(dir) / filename for filename in filenames])
filepaths = set()
for glob_path in iglob(str(path), include_hidden=all_files, recursive=True):
if os.path.isfile(glob_path):
filepaths.add(Path(glob_path))
continue
for dir, subdirs, filenames in os.walk(glob_path):
# Filter out hidden filenames, hidden directories, and excluded directories,
# unless "all files" are requested
if not all_files:
subdirs[:] = [
d for d in subdirs if not (d[0] == "." or d in EXCLUDE_DIRS)
]
filenames = [f for f in filenames if not f[0] == "."]
filepaths.update([Path(dir) / filename for filename in filenames])
valid_exts = {j.lower() for j in SUPPORTED_EXTS}
filepaths = [fp for fp in filepaths if fp.suffix.lower() in valid_exts]
return filepaths
Expand All @@ -139,9 +143,7 @@ def collect_filepaths(path, all_files: bool):
def split(path, all_files: bool, splitter):
"""Splits files into chunks for vector db in RAG"""
chunks = []
filepaths = set()
for glob_path in iglob(path, include_hidden=all_files, recursive=True):
filepaths.update(collect_filepaths(glob_path, all_files))
filepaths = collect_filepaths(path, all_files)
for filepath in filepaths:
document = dask.delayed(path_to_doc)(filepath)
chunk = dask.delayed(split_document)(document, splitter)
Expand Down
7 changes: 7 additions & 0 deletions packages/jupyter-ai/jupyter_ai/tests/test_directory.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,3 +54,10 @@ def test_collect_filepaths(staging_dir):
filenames = [fp.name for fp in result]
assert "file0.html" in filenames # Check that valid file is included
assert "file3.xyz" not in filenames # Check that invalid file is excluded

# test unix wildcard pattern
pattern_path = os.path.join(staging_dir_filepath, "**/*.py")
print(pattern_path)
results = collect_filepaths(pattern_path, all_files)
assert len(results) == 1
assert results[0].suffix == ".py"

0 comments on commit 891c8ae

Please sign in to comment.