From ea43c8379133e1fea3966cbbd0e386e1a98573d5 Mon Sep 17 00:00:00 2001 From: Andrew Fulton Date: Mon, 16 Sep 2024 14:41:43 -0600 Subject: [PATCH] Adds unix shell-style wildcard matching to `/learn` (#989) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * adds wildcard matching to /learn * Add documentation * improve docs * cleanup * adds wildcard matching to /learn * Add documentation * improve docs * Update docs/source/users/index.md Co-authored-by: Michał Krassowski <5832902+krassowski@users.noreply.github.com> * update for test * improve test * improve directory handling * remove dir only logic --------- Co-authored-by: Michał Krassowski <5832902+krassowski@users.noreply.github.com> --- docs/source/users/index.md | 7 +++ .../jupyter_ai/chat_handlers/learn.py | 22 +++++--- .../jupyter_ai/document_loaders/directory.py | 27 ++++++---- .../jupyter_ai/tests/static/file9.ipynb | 51 +++++++++++++++++++ .../jupyter_ai/tests/test_directory.py | 20 +++++++- 5 files changed, 111 insertions(+), 16 deletions(-) create mode 100644 packages/jupyter-ai/jupyter_ai/tests/static/file9.ipynb diff --git a/docs/source/users/index.md b/docs/source/users/index.md index db42265c9..d6eb4ac4d 100644 --- a/docs/source/users/index.md +++ b/docs/source/users/index.md @@ -499,6 +499,13 @@ To teach Jupyter AI about a folder full of documentation, for example, run `/lea alt='Screen shot of "/learn docs/" command and a response.' class="screenshot" /> +The `/learn` command also supports Unix shell-style wildcard matching, which allows fine-grained selection of the files to learn from. For example, to learn only from notebooks, in any directory, use `/learn **/*.ipynb`: every notebook within your base directory (or your preferred directory, if set) will be indexed, and files with any other extension will be ignored. + +:::{warning} +:name: unix shell-style wildcard matching +Certain patterns may cause `/learn` to run more slowly. 
For instance, `/learn **` may cause directories to be walked multiple times in search of files. +::: + You can then use `/ask` to ask a question specifically about the data that you taught Jupyter AI with `/learn`. Path: file6_path = static_test_files_dir / "file3.csv" file7_path = static_test_files_dir / "file3.xyz" file8_path = static_test_files_dir / "file4.pdf" + file9_path = static_test_files_dir / "file9.ipynb" job_staging_dir = jp_ai_staging_dir / "TestDir" job_staging_dir.mkdir() @@ -33,6 +34,7 @@ def staging_dir(static_test_files_dir, jp_ai_staging_dir) -> Path: shutil.copy2(file6_path, job_staging_hiddendir) shutil.copy2(file7_path, job_staging_subdir) shutil.copy2(file8_path, job_staging_hiddendir) + shutil.copy2(file9_path, job_staging_subdir) return job_staging_dir @@ -49,8 +51,24 @@ def test_collect_filepaths(staging_dir): # Call the function we want to test result = collect_filepaths(staging_dir_filepath, all_files) - assert len(result) == 3 # Test number of valid files + assert len(result) == 4 # Test number of valid files filenames = [fp.name for fp in result] assert "file0.html" in filenames # Check that valid file is included assert "file3.xyz" not in filenames # Check that invalid file is excluded + + # test unix wildcard pattern + pattern_path = os.path.join(staging_dir_filepath, "**/*.*py*") + results = collect_filepaths(pattern_path, all_files) + assert len(results) == 2 + condition = lambda p: p.suffix in [".py", ".ipynb"] + assert all(map(condition, results)) + + # test unix wildcard pattern returning only directories + pattern_path = f"{str(staging_dir_filepath)}*/" + results = collect_filepaths(pattern_path, all_files) + assert len(result) == 4 + filenames = [fp.name for fp in result] + + assert "file0.html" in filenames # Check that valid file is included + assert "file3.xyz" not in filenames # Check that invalid file is excluded