Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

/learn skips hidden files/dirs by default, unless "-a" is specified #427

Merged
merged 9 commits into from
Nov 4, 2023
13 changes: 13 additions & 0 deletions docs/source/users/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -492,6 +492,19 @@ use the `-c` or `--chunk-size` option and the `-o` or `--chunk-overlap` option.
/learn --chunk-size 1000 --chunk-overlap 200 <directory>
```

By default, `/learn` skips directories named `node_modules`, `lib`, or `build`,
as well as hidden files and hidden directories (those whose names start with a `.`).
To force `/learn` to read all supported file types in all directories,
use the `-a` or `--all-files` option.

```
# do not learn from hidden files, hidden directories, or directories named node_modules, lib, or build
/learn <directory>

# learn from all supported files
/learn -a <directory>
```

### Additional chat commands

To clear the chat panel, use the `/clear` command. This does not reset the AI model; the model may still remember previous messages that you sent it, and it may use them to inform its responses.
Expand Down
11 changes: 8 additions & 3 deletions packages/jupyter-ai/jupyter_ai/chat_handlers/learn.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ def __init__(
self.root_dir = root_dir
self.dask_client_future = dask_client_future
self.parser.prog = "/learn"
self.parser.add_argument("-a", "--all-files", action="store_true")
self.parser.add_argument("-v", "--verbose", action="store_true")
self.parser.add_argument("-d", "--delete", action="store_true")
self.parser.add_argument("-l", "--list", action="store_true")
Expand Down Expand Up @@ -115,7 +116,9 @@ async def _process_message(self, message: HumanChatMessage):
if args.verbose:
self.reply(f"Loading and splitting files for {load_path}", message)

await self.learn_dir(load_path, args.chunk_size, args.chunk_overlap)
await self.learn_dir(
load_path, args.chunk_size, args.chunk_overlap, args.all_files
)
self.save()

response = f"""🎉 I have learned documents at **{load_path}** and I am ready to answer questions about them.
Expand All @@ -132,7 +135,9 @@ def _build_list_response(self):
{dir_list}"""
return message

async def learn_dir(self, path: str, chunk_size: int, chunk_overlap: int):
async def learn_dir(
self, path: str, chunk_size: int, chunk_overlap: int, all_files: bool
):
dask_client = await self.dask_client_future
splitter_kwargs = {"chunk_size": chunk_size, "chunk_overlap": chunk_overlap}
splitters = {
Expand All @@ -146,7 +151,7 @@ async def learn_dir(self, path: str, chunk_size: int, chunk_overlap: int):
default_splitter=RecursiveCharacterTextSplitter(**splitter_kwargs),
)

delayed = split(path, splitter=splitter)
delayed = split(path, all_files, splitter=splitter)
doc_chunks = await dask_client.compute(delayed)

em_provider_cls, em_provider_args = self.get_embedding_provider()
Expand Down
15 changes: 8 additions & 7 deletions packages/jupyter-ai/jupyter_ai/document_loaders/directory.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,13 +18,11 @@ def path_to_doc(path):
return Document(page_content=text, metadata=metadata)


# Unless /learn is run with the "all files" option, hidden files and directories (names beginning with '.') and the directories named below are skipped
EXCLUDE_DIRS = {
".ipynb_checkpoints",
"node_modules",
"lib",
"build",
".git",
".DS_Store",
}
SUPPORTED_EXTS = {
".py",
Expand All @@ -50,12 +48,15 @@ def flatten(*chunk_lists):
return list(itertools.chain(*chunk_lists))


def split(path, splitter):
def split(path, all_files: bool, splitter):
chunks = []

for dir, _, filenames in os.walk(path):
if dir in EXCLUDE_DIRS:
continue
for dir, subdirs, filenames in os.walk(path):
# Filter out hidden filenames, hidden directories, and excluded directories,
# unless "all files" are requested
if not all_files:
subdirs[:] = [d for d in subdirs if not (d[0] == "." or d in EXCLUDE_DIRS)]
filenames = [f for f in filenames if not f[0] == "."]

for filename in filenames:
filepath = Path(os.path.join(dir, filename))
Expand Down
Loading