Skip to content

Commit

Permalink
Backport PR jupyterlab#427: /learn skips hidden files/dirs by default, unless "-a" is specified
Browse files (browse the repository at this point in the history)
  • Loading branch information
JasonWeill authored and meeseeksmachine committed Nov 8, 2023
1 parent 33831e9 commit 05daad8
Show file tree
Hide file tree
Showing 2 changed files with 16 additions and 10 deletions.
11 changes: 8 additions & 3 deletions packages/jupyter-ai/jupyter_ai/chat_handlers/learn.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ def __init__(
self.root_dir = root_dir
self.dask_client_future = dask_client_future
self.parser.prog = "/learn"
self.parser.add_argument("-a", "--all-files", action="store_true")
self.parser.add_argument("-v", "--verbose", action="store_true")
self.parser.add_argument("-d", "--delete", action="store_true")
self.parser.add_argument("-l", "--list", action="store_true")
Expand Down Expand Up @@ -115,7 +116,9 @@ async def process_message(self, message: HumanChatMessage):
if args.verbose:
self.reply(f"Loading and splitting files for {load_path}", message)

await self.learn_dir(load_path, args.chunk_size, args.chunk_overlap)
await self.learn_dir(
load_path, args.chunk_size, args.chunk_overlap, args.all_files
)
self.save()

response = f"""🎉 I have learned documents at **{load_path}** and I am ready to answer questions about them.
Expand All @@ -132,7 +135,9 @@ def _build_list_response(self):
{dir_list}"""
return message

async def learn_dir(self, path: str, chunk_size: int, chunk_overlap: int):
async def learn_dir(
self, path: str, chunk_size: int, chunk_overlap: int, all_files: bool
):
dask_client = await self.dask_client_future
splitter_kwargs = {"chunk_size": chunk_size, "chunk_overlap": chunk_overlap}
splitters = {
Expand All @@ -146,7 +151,7 @@ async def learn_dir(self, path: str, chunk_size: int, chunk_overlap: int):
default_splitter=RecursiveCharacterTextSplitter(**splitter_kwargs),
)

delayed = split(path, splitter=splitter)
delayed = split(path, all_files, splitter=splitter)
doc_chunks = await dask_client.compute(delayed)

em_provider_cls, em_provider_args = self.get_embedding_provider()
Expand Down
15 changes: 8 additions & 7 deletions packages/jupyter-ai/jupyter_ai/document_loaders/directory.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,13 +18,11 @@ def path_to_doc(path):
return Document(page_content=text, metadata=metadata)


# Unless /learn is invoked with "-a" / "--all-files", files and directories whose names begin
# with '.' are skipped, as are directories whose names appear in EXCLUDE_DIRS.
EXCLUDE_DIRS = {
".ipynb_checkpoints",
"node_modules",
"lib",
"build",
".git",
".DS_Store",
}
SUPPORTED_EXTS = {
".py",
Expand All @@ -50,12 +48,15 @@ def flatten(*chunk_lists):
return list(itertools.chain(*chunk_lists))


def split(path, splitter):
def split(path, all_files: bool, splitter):
chunks = []

for dir, _, filenames in os.walk(path):
if dir in EXCLUDE_DIRS:
continue
for dir, subdirs, filenames in os.walk(path):
# Filter out hidden filenames, hidden directories, and excluded directories,
# unless "all files" are requested
if not all_files:
subdirs[:] = [d for d in subdirs if not (d[0] == "." or d in EXCLUDE_DIRS)]
filenames = [f for f in filenames if not f[0] == "."]

for filename in filenames:
filepath = Path(os.path.join(dir, filename))
Expand Down

0 comments on commit 05daad8

Please sign in to comment.