Skip to content

Commit

Permalink
refactor!: rename TextDocumentSplitter to DocumentSplitter (deepset-ai#6223)
Browse files Browse the repository at this point in the history

* rename TextDocumentSplitter to DocumentSplitter

* reno

* fix init
  • Loading branch information
anakin87 authored Nov 3, 2023
1 parent 6e2dbdc commit 063d27c
Show file tree
Hide file tree
Showing 8 changed files with 36 additions and 30 deletions.
4 changes: 2 additions & 2 deletions e2e/preview/pipelines/test_preprocessing_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from haystack.preview import Pipeline
from haystack.preview.components.embedders import SentenceTransformersDocumentEmbedder
from haystack.preview.components.file_converters import TextFileToDocument
from haystack.preview.components.preprocessors import TextDocumentSplitter, DocumentCleaner, DocumentLanguageClassifier
from haystack.preview.components.preprocessors import DocumentSplitter, DocumentCleaner, DocumentLanguageClassifier
from haystack.preview.components.routers import FileTypeRouter
from haystack.preview.components.writers import DocumentWriter
from haystack.preview.document_stores import InMemoryDocumentStore
Expand All @@ -18,7 +18,7 @@ def test_preprocessing_pipeline(tmp_path):
preprocessing_pipeline.add_component(instance=DocumentLanguageClassifier(), name="language_classifier")
preprocessing_pipeline.add_component(instance=DocumentCleaner(), name="cleaner")
preprocessing_pipeline.add_component(
instance=TextDocumentSplitter(split_by="sentence", split_length=1), name="splitter"
instance=DocumentSplitter(split_by="sentence", split_length=1), name="splitter"
)
preprocessing_pipeline.add_component(
instance=SentenceTransformersDocumentEmbedder(model_name_or_path="sentence-transformers/all-MiniLM-L6-v2"),
Expand Down
6 changes: 3 additions & 3 deletions haystack/preview/components/preprocessors/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from haystack.preview.components.preprocessors.text_document_cleaner import DocumentCleaner
from haystack.preview.components.preprocessors.text_document_splitter import TextDocumentSplitter
from haystack.preview.components.preprocessors.document_cleaner import DocumentCleaner
from haystack.preview.components.preprocessors.document_splitter import DocumentSplitter
from haystack.preview.components.preprocessors.document_language_classifier import DocumentLanguageClassifier
from haystack.preview.components.preprocessors.text_language_classifier import TextLanguageClassifier

__all__ = ["TextDocumentSplitter", "DocumentCleaner", "TextLanguageClassifier", "DocumentLanguageClassifier"]
__all__ = ["DocumentSplitter", "DocumentCleaner", "TextLanguageClassifier", "DocumentLanguageClassifier"]
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@


@component
class TextDocumentSplitter:
class DocumentSplitter:
"""
Splits a list of text documents into a list of text documents with shorter texts.
This is useful for splitting documents with long texts that otherwise would not fit into the maximum text length of language models.
Expand Down Expand Up @@ -45,13 +45,13 @@ def run(self, documents: List[Document]):
"""

if not isinstance(documents, list) or (documents and not isinstance(documents[0], Document)):
raise TypeError("TextDocumentSplitter expects a List of Documents as input.")
raise TypeError("DocumentSplitter expects a List of Documents as input.")

split_docs = []
for doc in documents:
if doc.content is None:
raise ValueError(
f"TextDocumentSplitter only works with text documents but document.content for document ID {doc.id} is None."
f"DocumentSplitter only works with text documents but document.content for document ID {doc.id} is None."
)
units = self._split_into_units(doc.content, self.split_by)
text_splits = self._concatenate_units(units, self.split_length, self.split_overlap)
Expand All @@ -69,7 +69,7 @@ def _split_into_units(self, text: str, split_by: Literal["word", "sentence", "pa
split_at = " "
else:
raise NotImplementedError(
"TextDocumentSplitter only supports 'passage', 'sentence' or 'word' split_by options."
"DocumentSplitter only supports 'passage', 'sentence' or 'word' split_by options."
)
units = text.split(split_at)
# Add the delimiter back to all units except the last one
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ preview = [
"Jinja2",
"openai",
"pyyaml",
"more-itertools", # TextDocumentSplitter
"more-itertools", # DocumentSplitter
]
inference = [
"transformers[torch,sentencepiece]==4.34.1",
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
---
preview:
- |
    Renamed `TextDocumentSplitter` to `DocumentSplitter` to allow a better
    distinction between Components that operate on text and those that operate
    on Documents.
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,9 @@ class TestDocumentCleaner:
@pytest.mark.unit
def test_init(self):
cleaner = DocumentCleaner()
assert cleaner.remove_empty_lines == True
assert cleaner.remove_extra_whitespaces == True
assert cleaner.remove_repeated_substrings == False
assert cleaner.remove_empty_lines is True
assert cleaner.remove_extra_whitespaces is True
assert cleaner.remove_repeated_substrings is False
assert cleaner.remove_substrings is None
assert cleaner.remove_regex is None

Expand Down
Original file line number Diff line number Diff line change
@@ -1,48 +1,48 @@
import pytest

from haystack.preview import Document
from haystack.preview.components.preprocessors import TextDocumentSplitter
from haystack.preview.components.preprocessors import DocumentSplitter


class TestTextDocumentSplitter:
class TestDocumentSplitter:
@pytest.mark.unit
def test_non_text_document(self):
with pytest.raises(
ValueError, match="TextDocumentSplitter only works with text documents but document.content for document ID"
ValueError, match="DocumentSplitter only works with text documents but document.content for document ID"
):
splitter = TextDocumentSplitter()
splitter = DocumentSplitter()
splitter.run(documents=[Document()])

@pytest.mark.unit
def test_single_doc(self):
with pytest.raises(TypeError, match="TextDocumentSplitter expects a List of Documents as input."):
splitter = TextDocumentSplitter()
with pytest.raises(TypeError, match="DocumentSplitter expects a List of Documents as input."):
splitter = DocumentSplitter()
splitter.run(documents=Document())

@pytest.mark.unit
def test_empty_list(self):
splitter = TextDocumentSplitter()
splitter = DocumentSplitter()
res = splitter.run(documents=[])
assert res == {"documents": []}

@pytest.mark.unit
def test_unsupported_split_by(self):
with pytest.raises(ValueError, match="split_by must be one of 'word', 'sentence' or 'passage'."):
TextDocumentSplitter(split_by="unsupported")
DocumentSplitter(split_by="unsupported")

@pytest.mark.unit
def test_unsupported_split_length(self):
with pytest.raises(ValueError, match="split_length must be greater than 0."):
TextDocumentSplitter(split_length=0)
DocumentSplitter(split_length=0)

@pytest.mark.unit
def test_unsupported_split_overlap(self):
with pytest.raises(ValueError, match="split_overlap must be greater than or equal to 0."):
TextDocumentSplitter(split_overlap=-1)
DocumentSplitter(split_overlap=-1)

@pytest.mark.unit
def test_split_by_word(self):
splitter = TextDocumentSplitter(split_by="word", split_length=10)
splitter = DocumentSplitter(split_by="word", split_length=10)
result = splitter.run(
documents=[
Document(
Expand All @@ -56,7 +56,7 @@ def test_split_by_word(self):

@pytest.mark.unit
def test_split_by_word_multiple_input_docs(self):
splitter = TextDocumentSplitter(split_by="word", split_length=10)
splitter = DocumentSplitter(split_by="word", split_length=10)
result = splitter.run(
documents=[
Document(
Expand All @@ -76,7 +76,7 @@ def test_split_by_word_multiple_input_docs(self):

@pytest.mark.unit
def test_split_by_sentence(self):
splitter = TextDocumentSplitter(split_by="sentence", split_length=1)
splitter = DocumentSplitter(split_by="sentence", split_length=1)
result = splitter.run(
documents=[
Document(
Expand All @@ -91,7 +91,7 @@ def test_split_by_sentence(self):

@pytest.mark.unit
def test_split_by_passage(self):
splitter = TextDocumentSplitter(split_by="passage", split_length=1)
splitter = DocumentSplitter(split_by="passage", split_length=1)
result = splitter.run(
documents=[
Document(
Expand All @@ -106,7 +106,7 @@ def test_split_by_passage(self):

@pytest.mark.unit
def test_split_by_word_with_overlap(self):
splitter = TextDocumentSplitter(split_by="word", split_length=10, split_overlap=2)
splitter = DocumentSplitter(split_by="word", split_length=10, split_overlap=2)
result = splitter.run(
documents=[
Document(
Expand All @@ -120,7 +120,7 @@ def test_split_by_word_with_overlap(self):

@pytest.mark.unit
def test_source_id_stored_in_metadata(self):
splitter = TextDocumentSplitter(split_by="word", split_length=10)
splitter = DocumentSplitter(split_by="word", split_length=10)
doc1 = Document(content="This is a text with some words.")
doc2 = Document(content="This is a different text with some words.")
result = splitter.run(documents=[doc1, doc2])
Expand All @@ -129,7 +129,7 @@ def test_source_id_stored_in_metadata(self):

@pytest.mark.unit
def test_copy_metadata(self):
splitter = TextDocumentSplitter(split_by="word", split_length=10)
splitter = DocumentSplitter(split_by="word", split_length=10)
documents = [
Document(content="Text.", meta={"name": "doc 0"}),
Document(content="Text.", meta={"name": "doc 1"}),
Expand Down

0 comments on commit 063d27c

Please sign in to comment.