Skip to content

Commit

Permalink
/learn skips hidden files/dirs by default, unless "-a" is specified (j…
Browse files Browse the repository at this point in the history
…upyterlab#427)

* Omits hidden files/dirs by default, unless "-a" is specified

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* Update packages/jupyter-ai/jupyter_ai/document_loaders/directory.py

Co-authored-by: Piyush Jain <[email protected]>

* Update packages/jupyter-ai/jupyter_ai/document_loaders/directory.py

Co-authored-by: Piyush Jain <[email protected]>

* Renames long arg from "all" to "all-files"

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* Renames `all` arg in `split`

* Updates subdirectory and filename exclusion logic

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Piyush Jain <[email protected]>
  • Loading branch information
3 people authored and Marchlak committed Oct 28, 2024
1 parent 42e5d2e commit 4e54659
Show file tree
Hide file tree
Showing 3 changed files with 29 additions and 10 deletions.
13 changes: 13 additions & 0 deletions docs/source/users/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -492,6 +492,19 @@ use the `-c` or `--chunk-size` option and the `-o` or `--chunk-overlap` option.
/learn --chunk-size 1000 --chunk-overlap 200 <directory>
```

By default, `/learn` skips directories named `node_modules`, `lib`, or `build`,
and also skips hidden files and hidden directories (those whose names start
with a `.`). To force `/learn` to read all supported file types in all directories,
use the `-a` or `--all-files` option.

```
# skip hidden files and hidden directories, as well as node_modules, lib, and build directories
/learn <directory>
# learn from all supported files
/learn -a <directory>
```

### Additional chat commands

To clear the chat panel, use the `/clear` command. This does not reset the AI model; the model may still remember previous messages that you sent it, and it may use them to inform its responses.
Expand Down
11 changes: 8 additions & 3 deletions packages/jupyter-ai/jupyter_ai/chat_handlers/learn.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ def __init__(
self.root_dir = root_dir
self.dask_client_future = dask_client_future
self.parser.prog = "/learn"
self.parser.add_argument("-a", "--all-files", action="store_true")
self.parser.add_argument("-v", "--verbose", action="store_true")
self.parser.add_argument("-d", "--delete", action="store_true")
self.parser.add_argument("-l", "--list", action="store_true")
Expand Down Expand Up @@ -115,7 +116,9 @@ async def _process_message(self, message: HumanChatMessage):
if args.verbose:
self.reply(f"Loading and splitting files for {load_path}", message)

await self.learn_dir(load_path, args.chunk_size, args.chunk_overlap)
await self.learn_dir(
load_path, args.chunk_size, args.chunk_overlap, args.all_files
)
self.save()

response = f"""🎉 I have learned documents at **{load_path}** and I am ready to answer questions about them.
Expand All @@ -132,7 +135,9 @@ def _build_list_response(self):
{dir_list}"""
return message

async def learn_dir(self, path: str, chunk_size: int, chunk_overlap: int):
async def learn_dir(
self, path: str, chunk_size: int, chunk_overlap: int, all_files: bool
):
dask_client = await self.dask_client_future
splitter_kwargs = {"chunk_size": chunk_size, "chunk_overlap": chunk_overlap}
splitters = {
Expand All @@ -146,7 +151,7 @@ async def learn_dir(self, path: str, chunk_size: int, chunk_overlap: int):
default_splitter=RecursiveCharacterTextSplitter(**splitter_kwargs),
)

delayed = split(path, splitter=splitter)
delayed = split(path, all_files, splitter=splitter)
doc_chunks = await dask_client.compute(delayed)

em_provider_cls, em_provider_args = self.get_embedding_provider()
Expand Down
15 changes: 8 additions & 7 deletions packages/jupyter-ai/jupyter_ai/document_loaders/directory.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,13 +18,11 @@ def path_to_doc(path):
return Document(page_content=text, metadata=metadata)


# Unless /learn has the "all files" option passed in, files and directories beginning with '.' are excluded
EXCLUDE_DIRS = {
".ipynb_checkpoints",
"node_modules",
"lib",
"build",
".git",
".DS_Store",
}
SUPPORTED_EXTS = {
".py",
Expand All @@ -50,12 +48,15 @@ def flatten(*chunk_lists):
return list(itertools.chain(*chunk_lists))


def split(path, splitter):
def split(path, all_files: bool, splitter):
chunks = []

for dir, _, filenames in os.walk(path):
if dir in EXCLUDE_DIRS:
continue
for dir, subdirs, filenames in os.walk(path):
# Filter out hidden filenames, hidden directories, and excluded directories,
# unless "all files" are requested
if not all_files:
subdirs[:] = [d for d in subdirs if not (d[0] == "." or d in EXCLUDE_DIRS)]
filenames = [f for f in filenames if not f[0] == "."]

for filename in filenames:
filepath = Path(os.path.join(dir, filename))
Expand Down

0 comments on commit 4e54659

Please sign in to comment.