From 05daad8b65951b99d2049dc816b677449ea092e8 Mon Sep 17 00:00:00 2001
From: Jason Weill <93281816+JasonWeill@users.noreply.github.com>
Date: Fri, 3 Nov 2023 17:08:33 -0700
Subject: [PATCH] Backport PR #427: /learn skips hidden files/dirs by default, unless "-a" is specified

---
NOTE(review): this patch was recovered from a copy whose line breaks had been
collapsed. Line structure was rebuilt from the hunk headers and the diffstat;
leading indentation and blank context lines are inferred, not original.
Confirm with `git apply --check` (add `--ignore-whitespace` if context fails).

 .../jupyter-ai/jupyter_ai/chat_handlers/learn.py  | 11 ++++++++---
 .../jupyter_ai/document_loaders/directory.py      | 15 ++++++++-------
 2 files changed, 16 insertions(+), 10 deletions(-)

diff --git a/packages/jupyter-ai/jupyter_ai/chat_handlers/learn.py b/packages/jupyter-ai/jupyter_ai/chat_handlers/learn.py
index a5635f235..b93685c0e 100644
--- a/packages/jupyter-ai/jupyter_ai/chat_handlers/learn.py
+++ b/packages/jupyter-ai/jupyter_ai/chat_handlers/learn.py
@@ -38,6 +38,7 @@ def __init__(
         self.root_dir = root_dir
         self.dask_client_future = dask_client_future
         self.parser.prog = "/learn"
+        self.parser.add_argument("-a", "--all-files", action="store_true")
         self.parser.add_argument("-v", "--verbose", action="store_true")
         self.parser.add_argument("-d", "--delete", action="store_true")
         self.parser.add_argument("-l", "--list", action="store_true")
@@ -115,7 +116,9 @@ async def process_message(self, message: HumanChatMessage):
         if args.verbose:
             self.reply(f"Loading and splitting files for {load_path}", message)
 
-        await self.learn_dir(load_path, args.chunk_size, args.chunk_overlap)
+        await self.learn_dir(
+            load_path, args.chunk_size, args.chunk_overlap, args.all_files
+        )
         self.save()
 
         response = f"""🎉 I have learned documents at **{load_path}** and I am ready to answer questions about them.
@@ -132,7 +135,9 @@ def _build_list_response(self):
         {dir_list}"""
         return message
 
-    async def learn_dir(self, path: str, chunk_size: int, chunk_overlap: int):
+    async def learn_dir(
+        self, path: str, chunk_size: int, chunk_overlap: int, all_files: bool
+    ):
         dask_client = await self.dask_client_future
         splitter_kwargs = {"chunk_size": chunk_size, "chunk_overlap": chunk_overlap}
         splitters = {
@@ -146,7 +151,7 @@ async def learn_dir(self, path: str, chunk_size: int, chunk_overlap: int):
             default_splitter=RecursiveCharacterTextSplitter(**splitter_kwargs),
         )
 
-        delayed = split(path, splitter=splitter)
+        delayed = split(path, all_files, splitter=splitter)
         doc_chunks = await dask_client.compute(delayed)
 
         em_provider_cls, em_provider_args = self.get_embedding_provider()
diff --git a/packages/jupyter-ai/jupyter_ai/document_loaders/directory.py b/packages/jupyter-ai/jupyter_ai/document_loaders/directory.py
index 0ce4bb739..efcc99e95 100644
--- a/packages/jupyter-ai/jupyter_ai/document_loaders/directory.py
+++ b/packages/jupyter-ai/jupyter_ai/document_loaders/directory.py
@@ -18,13 +18,11 @@ def path_to_doc(path):
         return Document(page_content=text, metadata=metadata)
 
 
+# Unless /learn has the "all files" option passed in, files and directories beginning with '.' are excluded
 EXCLUDE_DIRS = {
-    ".ipynb_checkpoints",
     "node_modules",
     "lib",
     "build",
-    ".git",
-    ".DS_Store",
 }
 SUPPORTED_EXTS = {
     ".py",
@@ -50,12 +48,15 @@ def flatten(*chunk_lists):
     return list(itertools.chain(*chunk_lists))
 
 
-def split(path, splitter):
+def split(path, all_files: bool, splitter):
     chunks = []
 
-    for dir, _, filenames in os.walk(path):
-        if dir in EXCLUDE_DIRS:
-            continue
+    for dir, subdirs, filenames in os.walk(path):
+        # Filter out hidden filenames, hidden directories, and excluded directories,
+        # unless "all files" are requested
+        if not all_files:
+            subdirs[:] = [d for d in subdirs if not (d[0] == "." or d in EXCLUDE_DIRS)]
+            filenames = [f for f in filenames if not f[0] == "."]
 
         for filename in filenames:
             filepath = Path(os.path.join(dir, filename))