From cf5a75b368f442ce24f60fd687c9cfaef827355b Mon Sep 17 00:00:00 2001 From: Jason Weill Date: Wed, 1 Nov 2023 15:55:38 -0700 Subject: [PATCH 1/9] Omits hidden files/dirs by default, unless "-a" is specified --- docs/source/users/index.md | 13 +++++++++++++ .../jupyter_ai/chat_handlers/learn.py | 7 ++++--- .../jupyter_ai/document_loaders/directory.py | 17 +++++++++++------ 3 files changed, 28 insertions(+), 9 deletions(-) diff --git a/docs/source/users/index.md b/docs/source/users/index.md index cdbf0ce5c..1fbf072db 100644 --- a/docs/source/users/index.md +++ b/docs/source/users/index.md @@ -492,6 +492,19 @@ use the `-c` or `--chunk-size` option and the `-o` or `--chunk-overlap` option. /learn --chunk-size 1000 --chunk-overlap 200 ``` +By default, `/learn` will not read directories named `node_modules`, `lib`, or `build`, +and will not read hidden files or hidden directories, where the file or directory name +starts with a `.`. To force `/learn` to read all supported file types in all directories, +use the `-a` or `--all` option. + +``` +# do not learn from hidden files, hidden directories, or node_modules, lib, or build directories +/learn + +# learn from all supported files +/learn -a +``` + ### Additional chat commands To clear the chat panel, use the `/clear` command. This does not reset the AI model; the model may still remember previous messages that you sent it, and it may use them to inform its responses. diff --git a/packages/jupyter-ai/jupyter_ai/chat_handlers/learn.py b/packages/jupyter-ai/jupyter_ai/chat_handlers/learn.py index 2d011e522..8fdffba66 100644 --- a/packages/jupyter-ai/jupyter_ai/chat_handlers/learn.py +++ b/packages/jupyter-ai/jupyter_ai/chat_handlers/learn.py @@ -38,6 +38,7 @@ def __init__( self.root_dir = root_dir self.dask_client_future = dask_client_future self.parser.prog = "/learn" + self.parser.add_argument("-a", "--all", action="store_true") self.parser.add_argument("-v", "--verbose", action="store_true") self.parser.add_argument("-d", "--delete", action="store_true") self.parser.add_argument("-l", "--list", action="store_true") @@ -115,7 +116,7 @@ async def _process_message(self, message: HumanChatMessage): if args.verbose: self.reply(f"Loading and splitting files for {load_path}", message) - await self.learn_dir(load_path, args.chunk_size, args.chunk_overlap) + await self.learn_dir(load_path, args.chunk_size, args.chunk_overlap, args.all) self.save() response = f"""🎉 I have learned documents at **{load_path}** and I am ready to answer questions about them. @@ -132,7 +133,7 @@ def _build_list_response(self): {dir_list}""" return message - async def learn_dir(self, path: str, chunk_size: int, chunk_overlap: int): + async def learn_dir(self, path: str, chunk_size: int, chunk_overlap: int, all: bool): dask_client = await self.dask_client_future splitter_kwargs = {"chunk_size": chunk_size, "chunk_overlap": chunk_overlap} splitters = { @@ -146,7 +147,7 @@ async def learn_dir(self, path: str, chunk_size: int, chunk_overlap: int): default_splitter=RecursiveCharacterTextSplitter(**splitter_kwargs), ) - delayed = split(path, splitter=splitter) + delayed = split(path, all, splitter=splitter) doc_chunks = await dask_client.compute(delayed) em_provider_cls, em_provider_args = self.get_embedding_provider() diff --git a/packages/jupyter-ai/jupyter_ai/document_loaders/directory.py b/packages/jupyter-ai/jupyter_ai/document_loaders/directory.py index 0ce4bb739..ea4db8b42 100644 --- a/packages/jupyter-ai/jupyter_ai/document_loaders/directory.py +++ b/packages/jupyter-ai/jupyter_ai/document_loaders/directory.py @@ -17,14 +17,11 @@ def path_to_doc(path): metadata = {"path": str(path), "sha256": m.digest(), "extension": path.suffix} return Document(page_content=text, metadata=metadata) - +# Unless /learn has the "all files" option passed in, files and directories beginning with '.' are excluded EXCLUDE_DIRS = { - ".ipynb_checkpoints", "node_modules", "lib", "build", - ".git", - ".DS_Store", } SUPPORTED_EXTS = { ".py", @@ -50,11 +47,15 @@ def flatten(*chunk_lists): return list(itertools.chain(*chunk_lists)) -def split(path, splitter): +def split(path, all: bool, splitter): chunks = [] for dir, _, filenames in os.walk(path): - if dir in EXCLUDE_DIRS: + if all is False and dir in EXCLUDE_DIRS: + continue + + # Exclude hidden directories + if all is False and dir[0] == '.': continue for filename in filenames: @@ -62,6 +63,10 @@ def split(path, splitter): if filepath.suffix not in SUPPORTED_EXTS: continue + # Unless we're learning "all" files, exclude hidden files + if all is False and filepath.name[0] == '.': + continue + document = dask.delayed(path_to_doc)(filepath) chunk = dask.delayed(split_document)(document, splitter) chunks.append(chunk) From 6818985276a022fff718fb59da41ea69f9b133a8 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 1 Nov 2023 22:58:23 +0000 Subject: [PATCH 2/9] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- packages/jupyter-ai/jupyter_ai/chat_handlers/learn.py | 4 +++- packages/jupyter-ai/jupyter_ai/document_loaders/directory.py | 5 +++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/packages/jupyter-ai/jupyter_ai/chat_handlers/learn.py b/packages/jupyter-ai/jupyter_ai/chat_handlers/learn.py index 8fdffba66..ca4858deb 100644 --- a/packages/jupyter-ai/jupyter_ai/chat_handlers/learn.py +++ b/packages/jupyter-ai/jupyter_ai/chat_handlers/learn.py @@ -133,7 +133,9 @@ def _build_list_response(self): {dir_list}""" return message - async def learn_dir(self, path: str, chunk_size: int, chunk_overlap: int, all: bool): + async def learn_dir( + self, path: str, chunk_size: int, chunk_overlap: int, all: bool + ): dask_client = await self.dask_client_future splitter_kwargs = {"chunk_size": chunk_size, "chunk_overlap": chunk_overlap} splitters = { diff --git a/packages/jupyter-ai/jupyter_ai/document_loaders/directory.py b/packages/jupyter-ai/jupyter_ai/document_loaders/directory.py index ea4db8b42..d3e55fb1a 100644 --- a/packages/jupyter-ai/jupyter_ai/document_loaders/directory.py +++ b/packages/jupyter-ai/jupyter_ai/document_loaders/directory.py @@ -17,6 +17,7 @@ def path_to_doc(path): metadata = {"path": str(path), "sha256": m.digest(), "extension": path.suffix} return Document(page_content=text, metadata=metadata) + # Unless /learn has the "all files" option passed in, files and directories beginning with '.' are excluded EXCLUDE_DIRS = { "node_modules", @@ -55,7 +56,7 @@ def split(path, all: bool, splitter): continue # Exclude hidden directories - if all is False and dir[0] == '.': + if all is False and dir[0] == ".": continue for filename in filenames: @@ -64,7 +65,7 @@ def split(path, all: bool, splitter): continue # Unless we're learning "all" files, exclude hidden files - if all is False and filepath.name[0] == '.': + if all is False and filepath.name[0] == ".": continue document = dask.delayed(path_to_doc)(filepath) From 03156903391f5bd8eff656c547fde13ceb9ee311 Mon Sep 17 00:00:00 2001 From: Jason Weill <93281816+JasonWeill@users.noreply.github.com> Date: Thu, 2 Nov 2023 09:50:38 -0700 Subject: [PATCH 3/9] Update packages/jupyter-ai/jupyter_ai/document_loaders/directory.py Co-authored-by: Piyush Jain --- .../jupyter-ai/jupyter_ai/document_loaders/directory.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/packages/jupyter-ai/jupyter_ai/document_loaders/directory.py b/packages/jupyter-ai/jupyter_ai/document_loaders/directory.py index d3e55fb1a..25705f464 100644 --- a/packages/jupyter-ai/jupyter_ai/document_loaders/directory.py +++ b/packages/jupyter-ai/jupyter_ai/document_loaders/directory.py @@ -52,11 +52,7 @@ def split(path, all: bool, splitter): chunks = [] for dir, _, filenames in os.walk(path): - if all is False and dir in EXCLUDE_DIRS: - continue - - # Exclude hidden directories - if all is False and dir[0] == ".": + if not all and (dir.startswith(".") or dir in EXCLUDE_DIRS): continue for filename in filenames: From 8a1f8c96e51ce3728df78aece57c7410dd776821 Mon Sep 17 00:00:00 2001 From: Jason Weill <93281816+JasonWeill@users.noreply.github.com> Date: Thu, 2 Nov 2023 09:51:37 -0700 Subject: [PATCH 4/9] Update packages/jupyter-ai/jupyter_ai/document_loaders/directory.py Co-authored-by: Piyush Jain --- .../jupyter-ai/jupyter_ai/document_loaders/directory.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/packages/jupyter-ai/jupyter_ai/document_loaders/directory.py b/packages/jupyter-ai/jupyter_ai/document_loaders/directory.py index 25705f464..e62e9a18b 100644 --- a/packages/jupyter-ai/jupyter_ai/document_loaders/directory.py +++ b/packages/jupyter-ai/jupyter_ai/document_loaders/directory.py @@ -56,12 +56,11 @@ def split(path, all: bool, splitter): continue for filename in filenames: - filepath = Path(os.path.join(dir, filename)) - if filepath.suffix not in SUPPORTED_EXTS: + if not all and filename.startswith("."): continue - # Unless we're learning "all" files, exclude hidden files - if all is False and filepath.name[0] == ".": + filepath = Path(os.path.join(dir, filename)) + if filepath.suffix not in SUPPORTED_EXTS: continue document = dask.delayed(path_to_doc)(filepath) From 95c5a5329e1f23682a12d2e374b980aff9d822f4 Mon Sep 17 00:00:00 2001 From: Jason Weill Date: Fri, 3 Nov 2023 10:42:04 -0700 Subject: [PATCH 5/9] Renames long arg from "all" to "all-files" --- docs/source/users/index.md | 2 +- packages/jupyter-ai/jupyter_ai/chat_handlers/learn.py | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/docs/source/users/index.md b/docs/source/users/index.md index 1fbf072db..3563cec83 100644 --- a/docs/source/users/index.md +++ b/docs/source/users/index.md @@ -495,7 +495,7 @@ use the `-c` or `--chunk-size` option and the `-o` or `--chunk-overlap` option. By default, `/learn` will not read directories named `node_modules`, `lib`, or `build`, and will not read hidden files or hidden directories, where the file or directory name starts with a `.`. To force `/learn` to read all supported file types in all directories, -use the `-a` or `--all` option. +use the `-a` or `--all-files` option. ``` # do not learn from hidden files, hidden directories, or node_modules, lib, or build directories diff --git a/packages/jupyter-ai/jupyter_ai/chat_handlers/learn.py b/packages/jupyter-ai/jupyter_ai/chat_handlers/learn.py index ca4858deb..c3f6a01a9 100644 --- a/packages/jupyter-ai/jupyter_ai/chat_handlers/learn.py +++ b/packages/jupyter-ai/jupyter_ai/chat_handlers/learn.py @@ -38,7 +38,7 @@ def __init__( self.root_dir = root_dir self.dask_client_future = dask_client_future self.parser.prog = "/learn" - self.parser.add_argument("-a", "--all", action="store_true") + self.parser.add_argument("-a", "--all-files", action="store_true") self.parser.add_argument("-v", "--verbose", action="store_true") self.parser.add_argument("-d", "--delete", action="store_true") self.parser.add_argument("-l", "--list", action="store_true") @@ -116,7 +116,7 @@ async def _process_message(self, message: HumanChatMessage): if args.verbose: self.reply(f"Loading and splitting files for {load_path}", message) - await self.learn_dir(load_path, args.chunk_size, args.chunk_overlap, args.all) + await self.learn_dir(load_path, args.chunk_size, args.chunk_overlap, args.all_files) self.save() response = f"""🎉 I have learned documents at **{load_path}** and I am ready to answer questions about them. @@ -134,7 +134,7 @@ def _build_list_response(self): return message async def learn_dir( - self, path: str, chunk_size: int, chunk_overlap: int, all: bool + self, path: str, chunk_size: int, chunk_overlap: int, all_files: bool ): dask_client = await self.dask_client_future splitter_kwargs = {"chunk_size": chunk_size, "chunk_overlap": chunk_overlap} @@ -149,7 +149,7 @@ async def learn_dir( default_splitter=RecursiveCharacterTextSplitter(**splitter_kwargs), ) - delayed = split(path, all, splitter=splitter) + delayed = split(path, all_files, splitter=splitter) doc_chunks = await dask_client.compute(delayed) em_provider_cls, em_provider_args = self.get_embedding_provider() From b0c93773fd27f03d3c2ad2055ba12779e7878855 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 3 Nov 2023 17:44:42 +0000 Subject: [PATCH 6/9] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- packages/jupyter-ai/jupyter_ai/chat_handlers/learn.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/packages/jupyter-ai/jupyter_ai/chat_handlers/learn.py b/packages/jupyter-ai/jupyter_ai/chat_handlers/learn.py index c3f6a01a9..dac1d82d1 100644 --- a/packages/jupyter-ai/jupyter_ai/chat_handlers/learn.py +++ b/packages/jupyter-ai/jupyter_ai/chat_handlers/learn.py @@ -116,7 +116,9 @@ async def _process_message(self, message: HumanChatMessage): if args.verbose: self.reply(f"Loading and splitting files for {load_path}", message) - await self.learn_dir(load_path, args.chunk_size, args.chunk_overlap, args.all_files) + await self.learn_dir( + load_path, args.chunk_size, args.chunk_overlap, args.all_files + ) self.save() response = f"""🎉 I have learned documents at **{load_path}** and I am ready to answer questions about them. From bf8ff5dc66c5c52513cf78134836d76a0a5e276b Mon Sep 17 00:00:00 2001 From: Jason Weill Date: Fri, 3 Nov 2023 10:47:08 -0700 Subject: [PATCH 7/9] Renames `all` arg in `split` --- .../jupyter-ai/jupyter_ai/document_loaders/directory.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/packages/jupyter-ai/jupyter_ai/document_loaders/directory.py b/packages/jupyter-ai/jupyter_ai/document_loaders/directory.py index e62e9a18b..766819b22 100644 --- a/packages/jupyter-ai/jupyter_ai/document_loaders/directory.py +++ b/packages/jupyter-ai/jupyter_ai/document_loaders/directory.py @@ -48,15 +48,15 @@ def flatten(*chunk_lists): return list(itertools.chain(*chunk_lists)) -def split(path, all: bool, splitter): +def split(path, all_files: bool, splitter): chunks = [] for dir, _, filenames in os.walk(path): - if not all and (dir.startswith(".") or dir in EXCLUDE_DIRS): + if not all_files and (dir.startswith(".") or dir in EXCLUDE_DIRS): continue for filename in filenames: - if not all and filename.startswith("."): + if not all_files and filename.startswith("."): continue filepath = Path(os.path.join(dir, filename)) From 4e7fa2eca2dd45f46daa431beb31b840e393d825 Mon Sep 17 00:00:00 2001 From: Jason Weill Date: Fri, 3 Nov 2023 16:51:38 -0700 Subject: [PATCH 8/9] Updates subdirectory and filename exclusion logic --- .../jupyter_ai/document_loaders/directory.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/packages/jupyter-ai/jupyter_ai/document_loaders/directory.py b/packages/jupyter-ai/jupyter_ai/document_loaders/directory.py index 766819b22..dd1e52b84 100644 --- a/packages/jupyter-ai/jupyter_ai/document_loaders/directory.py +++ b/packages/jupyter-ai/jupyter_ai/document_loaders/directory.py @@ -51,14 +51,14 @@ def flatten(*chunk_lists): def split(path, all_files: bool, splitter): chunks = [] - for dir, _, filenames in os.walk(path): - if not all_files and (dir.startswith(".") or dir in EXCLUDE_DIRS): - continue + for dir, subdirs, filenames in os.walk(path): + # Filter out hidden filenames, hidden directories, and excluded directories, + # unless "all files" are requested + if not all_files: + subdirs[:] = [d for d in subdirs if not (d[0] == '.' or d in EXCLUDE_DIRS)] + filenames = [f for f in filenames if not f[0] == '.'] for filename in filenames: - if not all_files and filename.startswith("."): - continue - filepath = Path(os.path.join(dir, filename)) if filepath.suffix not in SUPPORTED_EXTS: continue From a1c59ae0b3cacf4dcd05d81ededdc3b45679adeb Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 3 Nov 2023 23:52:26 +0000 Subject: [PATCH 9/9] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- packages/jupyter-ai/jupyter_ai/document_loaders/directory.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/packages/jupyter-ai/jupyter_ai/document_loaders/directory.py b/packages/jupyter-ai/jupyter_ai/document_loaders/directory.py index dd1e52b84..efcc99e95 100644 --- a/packages/jupyter-ai/jupyter_ai/document_loaders/directory.py +++ b/packages/jupyter-ai/jupyter_ai/document_loaders/directory.py @@ -55,8 +55,8 @@ def split(path, all_files: bool, splitter): # Filter out hidden filenames, hidden directories, and excluded directories, # unless "all files" are requested if not all_files: - subdirs[:] = [d for d in subdirs if not (d[0] == '.' or d in EXCLUDE_DIRS)] - filenames = [f for f in filenames if not f[0] == '.'] + subdirs[:] = [d for d in subdirs if not (d[0] == "." or d in EXCLUDE_DIRS)] + filenames = [f for f in filenames if not f[0] == "."] for filename in filenames: filepath = Path(os.path.join(dir, filename))