From f15784dde8a720da6f54238d25faca8354e47d2a Mon Sep 17 00:00:00 2001 From: Sanjiv Das Date: Fri, 31 May 2024 12:11:23 -0700 Subject: [PATCH 1/4] Refactor split function with test The split function was (1) selecting files in included directories in the top half of the function, and (2) selecting files with valid extensions and sharding them in the second half. This PR divides the split function in a new `collect_files` function that selects files with valid extensions from non-excluded directories, and then passes the valid filepaths into the `split` function, which calls `collect_files`. --- .../jupyter_ai/document_loaders/directory.py | 21 ++++++---- .../jupyter_ai/tests/doc_loaders/__init__.py | 1 + .../tests/doc_loaders/test_eligible_files.py | 42 +++++++++++++++++++ 3 files changed, 55 insertions(+), 9 deletions(-) create mode 100644 packages/jupyter-ai/jupyter_ai/tests/doc_loaders/__init__.py create mode 100644 packages/jupyter-ai/jupyter_ai/tests/doc_loaders/test_eligible_files.py diff --git a/packages/jupyter-ai/jupyter_ai/document_loaders/directory.py b/packages/jupyter-ai/jupyter_ai/document_loaders/directory.py index e493fb385..bbe00777e 100644 --- a/packages/jupyter-ai/jupyter_ai/document_loaders/directory.py +++ b/packages/jupyter-ai/jupyter_ai/document_loaders/directory.py @@ -108,10 +108,11 @@ def split_document(document, splitter: TextSplitter) -> List[Document]: def flatten(*chunk_lists): return list(itertools.chain(*chunk_lists)) - -def split(path, all_files: bool, splitter): - chunks = [] - +# Selects eligible files, i.e., +# 1. Files not in excluded directories, and +# 2. Files that are in the valid file extensions list +# Called from the `split` function. +def collect_files(path, all_files: bool): # Check if the path points to a single file if os.path.isfile(path): filepaths = [Path(path)] @@ -126,16 +127,18 @@ def split(path, all_files: bool, splitter): ] filenames = [f for f in filenames if not f[0] == "."] filepaths += [Path(os.path.join(dir, filename)) for filename in filenames] + filepaths = [fp for fp in filepaths if fp.suffix.lower() in {j.lower() for j in SUPPORTED_EXTS}] + return filepaths - for filepath in filepaths: - # Lower case everything to make sure file extension comparisons are not case sensitive - if filepath.suffix.lower() not in {j.lower() for j in SUPPORTED_EXTS}: - continue +# Split files into chunks for vector db in RAG +def split(path, all_files: bool, splitter): + chunks = [] + filepaths = collect_files(path, all_files) + for filepath in filepaths: document = dask.delayed(path_to_doc)(filepath) chunk = dask.delayed(split_document)(document, splitter) chunks.append(chunk) - flattened_chunks = dask.delayed(flatten)(*chunks) return flattened_chunks diff --git a/packages/jupyter-ai/jupyter_ai/tests/doc_loaders/__init__.py b/packages/jupyter-ai/jupyter_ai/tests/doc_loaders/__init__.py new file mode 100644 index 000000000..7df157d78 --- /dev/null +++ b/packages/jupyter-ai/jupyter_ai/tests/doc_loaders/__init__.py @@ -0,0 +1 @@ +"""Tests for the collect_files and split functions in directory.py.""" \ No newline at end of file diff --git a/packages/jupyter-ai/jupyter_ai/tests/doc_loaders/test_eligible_files.py b/packages/jupyter-ai/jupyter_ai/tests/doc_loaders/test_eligible_files.py new file mode 100644 index 000000000..075528c10 --- /dev/null +++ b/packages/jupyter-ai/jupyter_ai/tests/doc_loaders/test_eligible_files.py @@ -0,0 +1,42 @@ +""" +Test that the collect_files function only selects files that are +1. Not in the the excluded directories and +2. Are in the valid file extensions list. +""" + +import os +import unittest +from jupyter_ai.document_loaders.directory import collect_files + +class TestCollectFiles(unittest.TestCase): + # Prepare temp directory for test + os.mkdir("TestDir") + path = os.path.join(os.getcwd(), "TestDir") + test_dir_contents = {'0' : ['file0.html', '.hidden_file.pdf'], # top level folder, 1 valid file + 'subdir' : ['file1.txt','.hidden_file.txt','file2.py','file3.xyz'], # subfolder, 2 valid files + '.hidden_dir' : ['file3.csv', 'file4.pdf']} # hidden subfolder, no valid files + for folder in test_dir_contents: + os.chdir(path) + if folder != '0': + os.mkdir(folder) + d = os.path.join(path,folder) + else: + d = path + for file in test_dir_contents[folder]: + filepath = os.path.join(d, file) + open(filepath, 'a') + + # Test that the number of valid files for `/learn` is correct + def test_collect_files(self): + all_files = False + # Call the function we want to test + result = collect_files(self.path, all_files) + self.assertEqual(len(result), 3) # Number of valid files + + # Clean up temp directory + from shutil import rmtree + rmtree(self.path) + os.chdir(os.path.split(self.path)[0]) + +if __name__ == '__main__': + unittest.main() From 59493dc8b3cf81fc3bfbed7c3918cdfbb18c00ae Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 31 May 2024 19:15:06 +0000 Subject: [PATCH 2/4] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../jupyter_ai/document_loaders/directory.py | 11 +++++-- .../jupyter_ai/tests/doc_loaders/__init__.py | 2 +- .../tests/doc_loaders/test_eligible_files.py | 31 +++++++++++++------ 3 files changed, 30 insertions(+), 14 deletions(-) diff --git a/packages/jupyter-ai/jupyter_ai/document_loaders/directory.py b/packages/jupyter-ai/jupyter_ai/document_loaders/directory.py index bbe00777e..6137609b6 100644 --- a/packages/jupyter-ai/jupyter_ai/document_loaders/directory.py +++ b/packages/jupyter-ai/jupyter_ai/document_loaders/directory.py @@ -108,8 +108,9 @@ def split_document(document, splitter: TextSplitter) -> List[Document]: def flatten(*chunk_lists): return list(itertools.chain(*chunk_lists)) -# Selects eligible files, i.e., -# 1. Files not in excluded directories, and + +# Selects eligible files, i.e., +# 1. Files not in excluded directories, and # 2. Files that are in the valid file extensions list # Called from the `split` function. def collect_files(path, all_files: bool): @@ -127,7 +128,11 @@ def collect_files(path, all_files: bool): ] filenames = [f for f in filenames if not f[0] == "."] filepaths += [Path(os.path.join(dir, filename)) for filename in filenames] - filepaths = [fp for fp in filepaths if fp.suffix.lower() in {j.lower() for j in SUPPORTED_EXTS}] + filepaths = [ + fp + for fp in filepaths + if fp.suffix.lower() in {j.lower() for j in SUPPORTED_EXTS} + ] return filepaths diff --git a/packages/jupyter-ai/jupyter_ai/tests/doc_loaders/__init__.py b/packages/jupyter-ai/jupyter_ai/tests/doc_loaders/__init__.py index 7df157d78..5fbde7b8d 100644 --- a/packages/jupyter-ai/jupyter_ai/tests/doc_loaders/__init__.py +++ b/packages/jupyter-ai/jupyter_ai/tests/doc_loaders/__init__.py @@ -1 +1 @@ -"""Tests for the collect_files and split functions in directory.py.""" \ No newline at end of file +"""Tests for the collect_files and split functions in directory.py.""" diff --git a/packages/jupyter-ai/jupyter_ai/tests/doc_loaders/test_eligible_files.py b/packages/jupyter-ai/jupyter_ai/tests/doc_loaders/test_eligible_files.py index 075528c10..16627e48a 100644 --- a/packages/jupyter-ai/jupyter_ai/tests/doc_loaders/test_eligible_files.py +++ b/packages/jupyter-ai/jupyter_ai/tests/doc_loaders/test_eligible_files.py @@ -1,30 +1,39 @@ """ -Test that the collect_files function only selects files that are -1. Not in the the excluded directories and +Test that the collect_files function only selects files that are +1. Not in the the excluded directories and 2. Are in the valid file extensions list. """ import os import unittest + from jupyter_ai.document_loaders.directory import collect_files + class TestCollectFiles(unittest.TestCase): # Prepare temp directory for test os.mkdir("TestDir") path = os.path.join(os.getcwd(), "TestDir") - test_dir_contents = {'0' : ['file0.html', '.hidden_file.pdf'], # top level folder, 1 valid file - 'subdir' : ['file1.txt','.hidden_file.txt','file2.py','file3.xyz'], # subfolder, 2 valid files - '.hidden_dir' : ['file3.csv', 'file4.pdf']} # hidden subfolder, no valid files + test_dir_contents = { + "0": ["file0.html", ".hidden_file.pdf"], # top level folder, 1 valid file + "subdir": [ + "file1.txt", + ".hidden_file.txt", + "file2.py", + "file3.xyz", + ], # subfolder, 2 valid files + ".hidden_dir": ["file3.csv", "file4.pdf"], + } # hidden subfolder, no valid files for folder in test_dir_contents: os.chdir(path) - if folder != '0': + if folder != "0": os.mkdir(folder) - d = os.path.join(path,folder) - else: + d = os.path.join(path, folder) + else: d = path for file in test_dir_contents[folder]: filepath = os.path.join(d, file) - open(filepath, 'a') + open(filepath, "a") # Test that the number of valid files for `/learn` is correct def test_collect_files(self): @@ -35,8 +44,10 @@ def test_collect_files(self): # Clean up temp directory from shutil import rmtree + rmtree(self.path) os.chdir(os.path.split(self.path)[0]) -if __name__ == '__main__': + +if __name__ == "__main__": unittest.main() From 9e5e649705448186781f82902c135469eb3db9ef Mon Sep 17 00:00:00 2001 From: Sanjiv Das Date: Fri, 31 May 2024 13:12:31 -0700 Subject: [PATCH 3/4] Test changed to use pytest --- .../tests/doc_loaders/test_eligible_files.py | 79 ++++++++----------- 1 file changed, 35 insertions(+), 44 deletions(-) diff --git a/packages/jupyter-ai/jupyter_ai/tests/doc_loaders/test_eligible_files.py b/packages/jupyter-ai/jupyter_ai/tests/doc_loaders/test_eligible_files.py index 16627e48a..b07278982 100644 --- a/packages/jupyter-ai/jupyter_ai/tests/doc_loaders/test_eligible_files.py +++ b/packages/jupyter-ai/jupyter_ai/tests/doc_loaders/test_eligible_files.py @@ -5,49 +5,40 @@ """ import os -import unittest - from jupyter_ai.document_loaders.directory import collect_files - -class TestCollectFiles(unittest.TestCase): - # Prepare temp directory for test - os.mkdir("TestDir") - path = os.path.join(os.getcwd(), "TestDir") - test_dir_contents = { - "0": ["file0.html", ".hidden_file.pdf"], # top level folder, 1 valid file - "subdir": [ - "file1.txt", - ".hidden_file.txt", - "file2.py", - "file3.xyz", - ], # subfolder, 2 valid files - ".hidden_dir": ["file3.csv", "file4.pdf"], - } # hidden subfolder, no valid files - for folder in test_dir_contents: - os.chdir(path) - if folder != "0": - os.mkdir(folder) - d = os.path.join(path, folder) - else: - d = path - for file in test_dir_contents[folder]: - filepath = os.path.join(d, file) - open(filepath, "a") - - # Test that the number of valid files for `/learn` is correct - def test_collect_files(self): - all_files = False - # Call the function we want to test - result = collect_files(self.path, all_files) - self.assertEqual(len(result), 3) # Number of valid files - - # Clean up temp directory - from shutil import rmtree - - rmtree(self.path) - os.chdir(os.path.split(self.path)[0]) - - -if __name__ == "__main__": - unittest.main() +# Prepare temp directory for test with pytest.py +os.mkdir("TestDir") +path = os.path.join(os.getcwd(), "TestDir") +test_dir_contents = { + "0": ["file0.html", ".hidden_file.pdf"], # top level folder, 1 valid file + "subdir": [ + "file1.txt", + ".hidden_file.txt", + "file2.py", + "file3.xyz", + ], # subfolder, 2 valid files + ".hidden_dir": ["file3.csv", "file4.pdf"], +} # hidden subfolder, no valid files +for folder in test_dir_contents: + os.chdir(path) + if folder != "0": + os.mkdir(folder) + d = os.path.join(path, folder) + else: + d = path + for file in test_dir_contents[folder]: + filepath = os.path.join(d, file) + open(filepath, "a") + +# Test that the number of valid files for `/learn` is correct +def test_collect_files(): + all_files = False + # Call the function we want to test + result = collect_files(path, all_files) + assert len(result) == 3 # Number of valid files + + # Clean up temp directory + from shutil import rmtree + rmtree(path) + os.chdir(os.path.split(path)[0]) \ No newline at end of file From f788fc5e7ea1e4b352a0e0de2d923ab568f46024 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 31 May 2024 20:15:04 +0000 Subject: [PATCH 4/4] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../jupyter_ai/tests/doc_loaders/test_eligible_files.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/packages/jupyter-ai/jupyter_ai/tests/doc_loaders/test_eligible_files.py b/packages/jupyter-ai/jupyter_ai/tests/doc_loaders/test_eligible_files.py index b07278982..445aa579e 100644 --- a/packages/jupyter-ai/jupyter_ai/tests/doc_loaders/test_eligible_files.py +++ b/packages/jupyter-ai/jupyter_ai/tests/doc_loaders/test_eligible_files.py @@ -5,6 +5,7 @@ """ import os + from jupyter_ai.document_loaders.directory import collect_files # Prepare temp directory for test with pytest.py @@ -31,6 +32,7 @@ filepath = os.path.join(d, file) open(filepath, "a") + # Test that the number of valid files for `/learn` is correct def test_collect_files(): all_files = False @@ -40,5 +42,6 @@ def test_collect_files(): # Clean up temp directory from shutil import rmtree + rmtree(path) - os.chdir(os.path.split(path)[0]) \ No newline at end of file + os.chdir(os.path.split(path)[0])