From 063d27c522f69d1acce3c4a070df6989f2820ecd Mon Sep 17 00:00:00 2001 From: Stefano Fiorucci <44616784+anakin87@users.noreply.github.com> Date: Fri, 3 Nov 2023 11:33:20 +0100 Subject: [PATCH] refactor!: rename `TextDocumentSplitter` to `DocumentSplitter` (#6223) * rename TextDocumentSplitter to DocumentSplitter * reno * fix init --- .../pipelines/test_preprocessing_pipeline.py | 4 +-- .../components/preprocessors/__init__.py | 6 ++-- ...ocument_cleaner.py => document_cleaner.py} | 0 ...ument_splitter.py => document_splitter.py} | 8 ++--- pyproject.toml | 2 +- ...xt-document-splitter-1e9fcd292c4591dd.yaml | 6 ++++ ...nt_cleaner.py => test_document_cleaner.py} | 6 ++-- ..._splitter.py => test_document_splitter.py} | 34 +++++++++---------- 8 files changed, 36 insertions(+), 30 deletions(-) rename haystack/preview/components/preprocessors/{text_document_cleaner.py => document_cleaner.py} (100%) rename haystack/preview/components/preprocessors/{text_document_splitter.py => document_splitter.py} (91%) create mode 100644 releasenotes/notes/rename-text-document-splitter-1e9fcd292c4591dd.yaml rename test/preview/components/preprocessors/{test_text_document_cleaner.py => test_document_cleaner.py} (96%) rename test/preview/components/preprocessors/{test_text_document_splitter.py => test_document_splitter.py} (81%) diff --git a/e2e/preview/pipelines/test_preprocessing_pipeline.py b/e2e/preview/pipelines/test_preprocessing_pipeline.py index 5b32c642a4..0984a2d10d 100644 --- a/e2e/preview/pipelines/test_preprocessing_pipeline.py +++ b/e2e/preview/pipelines/test_preprocessing_pipeline.py @@ -3,7 +3,7 @@ from haystack.preview import Pipeline from haystack.preview.components.embedders import SentenceTransformersDocumentEmbedder from haystack.preview.components.file_converters import TextFileToDocument -from haystack.preview.components.preprocessors import TextDocumentSplitter, DocumentCleaner, DocumentLanguageClassifier +from haystack.preview.components.preprocessors import DocumentSplitter, DocumentCleaner, DocumentLanguageClassifier from haystack.preview.components.routers import FileTypeRouter from haystack.preview.components.writers import DocumentWriter from haystack.preview.document_stores import InMemoryDocumentStore @@ -18,7 +18,7 @@ def test_preprocessing_pipeline(tmp_path): preprocessing_pipeline.add_component(instance=DocumentLanguageClassifier(), name="language_classifier") preprocessing_pipeline.add_component(instance=DocumentCleaner(), name="cleaner") preprocessing_pipeline.add_component( - instance=TextDocumentSplitter(split_by="sentence", split_length=1), name="splitter" + instance=DocumentSplitter(split_by="sentence", split_length=1), name="splitter" ) preprocessing_pipeline.add_component( instance=SentenceTransformersDocumentEmbedder(model_name_or_path="sentence-transformers/all-MiniLM-L6-v2"), diff --git a/haystack/preview/components/preprocessors/__init__.py b/haystack/preview/components/preprocessors/__init__.py index 35ab6a5f07..3844c495b1 100644 --- a/haystack/preview/components/preprocessors/__init__.py +++ b/haystack/preview/components/preprocessors/__init__.py @@ -1,6 +1,6 @@ -from haystack.preview.components.preprocessors.text_document_cleaner import DocumentCleaner -from haystack.preview.components.preprocessors.text_document_splitter import TextDocumentSplitter +from haystack.preview.components.preprocessors.document_cleaner import DocumentCleaner +from haystack.preview.components.preprocessors.document_splitter import DocumentSplitter from haystack.preview.components.preprocessors.document_language_classifier import DocumentLanguageClassifier from haystack.preview.components.preprocessors.text_language_classifier import TextLanguageClassifier -__all__ = ["TextDocumentSplitter", "DocumentCleaner", "TextLanguageClassifier", "DocumentLanguageClassifier"] +__all__ = ["DocumentSplitter", "DocumentCleaner", "TextLanguageClassifier", "DocumentLanguageClassifier"] diff --git a/haystack/preview/components/preprocessors/text_document_cleaner.py b/haystack/preview/components/preprocessors/document_cleaner.py similarity index 100% rename from haystack/preview/components/preprocessors/text_document_cleaner.py rename to haystack/preview/components/preprocessors/document_cleaner.py diff --git a/haystack/preview/components/preprocessors/text_document_splitter.py b/haystack/preview/components/preprocessors/document_splitter.py similarity index 91% rename from haystack/preview/components/preprocessors/text_document_splitter.py rename to haystack/preview/components/preprocessors/document_splitter.py index d24fddf330..ecb8a3f11f 100644 --- a/haystack/preview/components/preprocessors/text_document_splitter.py +++ b/haystack/preview/components/preprocessors/document_splitter.py @@ -7,7 +7,7 @@ @component -class TextDocumentSplitter: +class DocumentSplitter: """ Splits a list of text documents into a list of text documents with shorter texts. This is useful for splitting documents with long texts that otherwise would not fit into the maximum text length of language models. @@ -45,13 +45,13 @@ def run(self, documents: List[Document]): """ if not isinstance(documents, list) or (documents and not isinstance(documents[0], Document)): - raise TypeError("TextDocumentSplitter expects a List of Documents as input.") + raise TypeError("DocumentSplitter expects a List of Documents as input.") split_docs = [] for doc in documents: if doc.content is None: raise ValueError( - f"TextDocumentSplitter only works with text documents but document.content for document ID {doc.id} is None." + f"DocumentSplitter only works with text documents but document.content for document ID {doc.id} is None." ) units = self._split_into_units(doc.content, self.split_by) text_splits = self._concatenate_units(units, self.split_length, self.split_overlap) @@ -69,7 +69,7 @@ def _split_into_units(self, text: str, split_by: Literal["word", "sentence", "pa split_at = " " else: raise NotImplementedError( - "TextDocumentSplitter only supports 'passage', 'sentence' or 'word' split_by options." + "DocumentSplitter only supports 'passage', 'sentence' or 'word' split_by options." ) units = text.split(split_at) # Add the delimiter back to all units except the last one diff --git a/pyproject.toml b/pyproject.toml index 6ab456c790..5b1b3c46d5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -98,7 +98,7 @@ preview = [ "Jinja2", "openai", "pyyaml", - "more-itertools", # TextDocumentSplitter + "more-itertools", # DocumentSplitter ] inference = [ "transformers[torch,sentencepiece]==4.34.1", diff --git a/releasenotes/notes/rename-text-document-splitter-1e9fcd292c4591dd.yaml b/releasenotes/notes/rename-text-document-splitter-1e9fcd292c4591dd.yaml new file mode 100644 index 0000000000..363da072d4 --- /dev/null +++ b/releasenotes/notes/rename-text-document-splitter-1e9fcd292c4591dd.yaml @@ -0,0 +1,6 @@ +--- +preview: + - | + rename `TextDocumentSplitter` to `DocumentSplitter`, to allow a better + distinction between Components that operate on text and those that operate + on Documents. diff --git a/test/preview/components/preprocessors/test_text_document_cleaner.py b/test/preview/components/preprocessors/test_document_cleaner.py similarity index 96% rename from test/preview/components/preprocessors/test_text_document_cleaner.py rename to test/preview/components/preprocessors/test_document_cleaner.py index 765771b44f..71f412f3f1 100644 --- a/test/preview/components/preprocessors/test_text_document_cleaner.py +++ b/test/preview/components/preprocessors/test_document_cleaner.py @@ -10,9 +10,9 @@ class TestDocumentCleaner: @pytest.mark.unit def test_init(self): cleaner = DocumentCleaner() - assert cleaner.remove_empty_lines == True - assert cleaner.remove_extra_whitespaces == True - assert cleaner.remove_repeated_substrings == False + assert cleaner.remove_empty_lines is True + assert cleaner.remove_extra_whitespaces is True + assert cleaner.remove_repeated_substrings is False assert cleaner.remove_substrings is None assert cleaner.remove_regex is None diff --git a/test/preview/components/preprocessors/test_text_document_splitter.py b/test/preview/components/preprocessors/test_document_splitter.py similarity index 81% rename from test/preview/components/preprocessors/test_text_document_splitter.py rename to test/preview/components/preprocessors/test_document_splitter.py index b53737b40c..4e28d1b135 100644 --- a/test/preview/components/preprocessors/test_text_document_splitter.py +++ b/test/preview/components/preprocessors/test_document_splitter.py @@ -1,48 +1,48 @@ import pytest from haystack.preview import Document -from haystack.preview.components.preprocessors import TextDocumentSplitter +from haystack.preview.components.preprocessors import DocumentSplitter -class TestTextDocumentSplitter: +class TestDocumentSplitter: @pytest.mark.unit def test_non_text_document(self): with pytest.raises( - ValueError, match="TextDocumentSplitter only works with text documents but document.content for document ID" + ValueError, match="DocumentSplitter only works with text documents but document.content for document ID" ): - splitter = TextDocumentSplitter() + splitter = DocumentSplitter() splitter.run(documents=[Document()]) @pytest.mark.unit def test_single_doc(self): - with pytest.raises(TypeError, match="TextDocumentSplitter expects a List of Documents as input."): - splitter = TextDocumentSplitter() + with pytest.raises(TypeError, match="DocumentSplitter expects a List of Documents as input."): + splitter = DocumentSplitter() splitter.run(documents=Document()) @pytest.mark.unit def test_empty_list(self): - splitter = TextDocumentSplitter() + splitter = DocumentSplitter() res = splitter.run(documents=[]) assert res == {"documents": []} @pytest.mark.unit def test_unsupported_split_by(self): with pytest.raises(ValueError, match="split_by must be one of 'word', 'sentence' or 'passage'."): - TextDocumentSplitter(split_by="unsupported") + DocumentSplitter(split_by="unsupported") @pytest.mark.unit def test_unsupported_split_length(self): with pytest.raises(ValueError, match="split_length must be greater than 0."): - TextDocumentSplitter(split_length=0) + DocumentSplitter(split_length=0) @pytest.mark.unit def test_unsupported_split_overlap(self): with pytest.raises(ValueError, match="split_overlap must be greater than or equal to 0."): - TextDocumentSplitter(split_overlap=-1) + DocumentSplitter(split_overlap=-1) @pytest.mark.unit def test_split_by_word(self): - splitter = TextDocumentSplitter(split_by="word", split_length=10) + splitter = DocumentSplitter(split_by="word", split_length=10) result = splitter.run( documents=[ Document( @@ -56,7 +56,7 @@ def test_split_by_word(self): @pytest.mark.unit def test_split_by_word_multiple_input_docs(self): - splitter = TextDocumentSplitter(split_by="word", split_length=10) + splitter = DocumentSplitter(split_by="word", split_length=10) result = splitter.run( documents=[ Document( @@ -76,7 +76,7 @@ def test_split_by_word_multiple_input_docs(self): @pytest.mark.unit def test_split_by_sentence(self): - splitter = TextDocumentSplitter(split_by="sentence", split_length=1) + splitter = DocumentSplitter(split_by="sentence", split_length=1) result = splitter.run( documents=[ Document( @@ -91,7 +91,7 @@ def test_split_by_sentence(self): @pytest.mark.unit def test_split_by_passage(self): - splitter = TextDocumentSplitter(split_by="passage", split_length=1) + splitter = DocumentSplitter(split_by="passage", split_length=1) result = splitter.run( documents=[ Document( @@ -106,7 +106,7 @@ def test_split_by_passage(self): @pytest.mark.unit def test_split_by_word_with_overlap(self): - splitter = TextDocumentSplitter(split_by="word", split_length=10, split_overlap=2) + splitter = DocumentSplitter(split_by="word", split_length=10, split_overlap=2) result = splitter.run( documents=[ Document( @@ -120,7 +120,7 @@ def test_split_by_word_with_overlap(self): @pytest.mark.unit def test_source_id_stored_in_metadata(self): - splitter = TextDocumentSplitter(split_by="word", split_length=10) + splitter = DocumentSplitter(split_by="word", split_length=10) doc1 = Document(content="This is a text with some words.") doc2 = Document(content="This is a different text with some words.") result = splitter.run(documents=[doc1, doc2]) @@ -129,7 +129,7 @@ def test_source_id_stored_in_metadata(self): @pytest.mark.unit def test_copy_metadata(self): - splitter = TextDocumentSplitter(split_by="word", split_length=10) + splitter = DocumentSplitter(split_by="word", split_length=10) documents = [ Document(content="Text.", meta={"name": "doc 0"}), Document(content="Text.", meta={"name": "doc 1"}),