From 3f77d3ab6c665312c20907860091ad4aa33c6edb Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Thu, 12 Dec 2024 15:22:27 +0100 Subject: [PATCH] !feat: unify NLTKDocumentSplitter and DocumentSplitter (#8617) * wip: initial import * wip: refactoring * wip: refactoring tests * wip: refactoring tests * making all NLTKSplitter related tests work * refactoring * docstrings * refactoring and removing NLTKDocumentSplitter * fixing tests for custom sentence tokenizer * fixing tests for custom sentence tokenizer * cleaning up * adding release notes * reverting some changes * cleaning up tests * fixing serialisation and adding tests * cleaning up * wip * renaming and cleaning * adding NLTK files * updating docstring * adding import to init * Update haystack/components/preprocessors/document_splitter.py Co-authored-by: Stefano Fiorucci * updating tests * wip * adding sentence/period change warning * fixing LICENSE header * Update haystack/components/preprocessors/document_splitter.py Co-authored-by: Stefano Fiorucci --------- Co-authored-by: Stefano Fiorucci --- .../preprocessors/document_splitter.py | 274 +++++++++++++-- .../preprocessors/sentence_tokenizer.py | 19 +- ...-and-nltkdocsplitter-f01a983c7e7f3ed3.yaml | 4 + .../preprocessors/test_document_splitter.py | 322 +++++++++++++++++- .../preprocessors/test_sentence_tokenizer.py | 67 ++++ .../test_sentence_window_retriever.py | 2 +- 6 files changed, 635 insertions(+), 53 deletions(-) create mode 100644 releasenotes/notes/unifying-docsplitter-and-nltkdocsplitter-f01a983c7e7f3ed3.yaml create mode 100644 test/components/preprocessors/test_sentence_tokenizer.py diff --git a/haystack/components/preprocessors/document_splitter.py b/haystack/components/preprocessors/document_splitter.py index 86d95f412a..b3e99924a7 100644 --- a/haystack/components/preprocessors/document_splitter.py +++ b/haystack/components/preprocessors/document_splitter.py @@ -2,20 +2,21 @@ # # SPDX-License-Identifier: Apache-2.0 +import warnings from copy import deepcopy from typing import Any, Callable, Dict, List, Literal, Optional, Tuple from more_itertools import windowed from haystack import Document, component, logging +from haystack.components.preprocessors.sentence_tokenizer import Language, SentenceSplitter, nltk_imports from haystack.core.serialization import default_from_dict, default_to_dict from haystack.utils import deserialize_callable, serialize_callable logger = logging.getLogger(__name__) -# Maps the 'split_by' argument to the actual char used to split the Documents. -# 'function' is not in the mapping cause it doesn't split on chars. -_SPLIT_BY_MAPPING = {"page": "\f", "passage": "\n\n", "sentence": ".", "word": " ", "line": "\n"} +# mapping of split by character, 'function' and 'sentence' don't split by character +_CHARACTER_SPLIT_BY_MAPPING = {"page": "\f", "passage": "\n\n", "period": ".", "word": " ", "line": "\n"} @component @@ -23,8 +24,7 @@ class DocumentSplitter: """ Splits long documents into smaller chunks. - This is a common preprocessing step during indexing. - It helps Embedders create meaningful semantic representations + This is a common preprocessing step during indexing. It helps Embedders create meaningful semantic representations and prevents exceeding language model context limits. The DocumentSplitter is compatible with the following DocumentStores: @@ -54,18 +54,27 @@ class DocumentSplitter: def __init__( # pylint: disable=too-many-positional-arguments self, - split_by: Literal["function", "page", "passage", "sentence", "word", "line"] = "word", + split_by: Literal["function", "page", "passage", "period", "word", "line", "sentence"] = "word", split_length: int = 200, split_overlap: int = 0, split_threshold: int = 0, splitting_function: Optional[Callable[[str], List[str]]] = None, + respect_sentence_boundary: bool = False, + language: Language = "en", + use_split_rules: bool = True, + extend_abbreviations: bool = True, ): """ Initialize DocumentSplitter. - :param split_by: The unit for splitting your documents. Choose from `word` for splitting by spaces (" "), - `sentence` for splitting by periods ("."), `page` for splitting by form feed ("\\f"), - `passage` for splitting by double line breaks ("\\n\\n") or `line` for splitting each line ("\\n"). + :param split_by: The unit for splitting your documents. Choose from: + - `word` for splitting by spaces (" ") + - `period` for splitting by periods (".") + - `page` for splitting by form feed ("\\f") + - `passage` for splitting by double line breaks ("\\n\\n") + - `line` for splitting each line ("\\n") + - `sentence` for splitting by NLTK sentence tokenizer + :param split_length: The maximum number of units in each split. :param split_overlap: The number of overlapping units for each split. :param split_threshold: The minimum number of units per split. If a split has fewer units @@ -73,21 +82,87 @@ def __init__( # pylint: disable=too-many-positional-arguments :param splitting_function: Necessary when `split_by` is set to "function". This is a function which must accept a single `str` as input and return a `list` of `str` as output, representing the chunks after splitting. + :param respect_sentence_boundary: Choose whether to respect sentence boundaries when splitting by "word". + If True, uses NLTK to detect sentence boundaries, ensuring splits occur only between sentences. + :param language: Choose the language for the NLTK tokenizer. The default is English ("en"). + :param use_split_rules: Choose whether to use additional split rules when splitting by `sentence`. + :param extend_abbreviations: Choose whether to extend NLTK's PunktTokenizer abbreviations with a list + of curated abbreviations, if available. This is currently supported for English ("en") and German ("de"). """ self.split_by = split_by - if split_by not in ["function", "page", "passage", "sentence", "word", "line"]: - raise ValueError("split_by must be one of 'function', 'word', 'sentence', 'page', 'passage' or 'line'.") + self.split_length = split_length + self.split_overlap = split_overlap + self.split_threshold = split_threshold + self.splitting_function = splitting_function + self.respect_sentence_boundary = respect_sentence_boundary + self.language = language + self.use_split_rules = use_split_rules + self.extend_abbreviations = extend_abbreviations + + self._init_checks( + split_by=split_by, + split_length=split_length, + split_overlap=split_overlap, + splitting_function=splitting_function, + respect_sentence_boundary=respect_sentence_boundary, + ) + + if split_by == "sentence" or (respect_sentence_boundary and split_by == "word"): + nltk_imports.check() + self.sentence_splitter = SentenceSplitter( + language=language, + use_split_rules=use_split_rules, + extend_abbreviations=extend_abbreviations, + keep_white_spaces=True, + ) + + if split_by == "sentence": + # ToDo: remove this warning in the next major release + msg = ( + "The `split_by='sentence'` no longer splits by '.' and now relies on custom sentence tokenizer " + "based on NLTK. To achieve the previous behaviour use `split_by='period'." + ) + warnings.warn(msg) + + def _init_checks( + self, + *, + split_by: str, + split_length: int, + split_overlap: int, + splitting_function: Optional[Callable], + respect_sentence_boundary: bool, + ) -> None: + """ + Validates initialization parameters for DocumentSplitter. + + :param split_by: The unit for splitting documents + :param split_length: The maximum number of units in each split + :param split_overlap: The number of overlapping units for each split + :param splitting_function: Custom function for splitting when split_by="function" + :param respect_sentence_boundary: Whether to respect sentence boundaries when splitting + :raises ValueError: If any parameter is invalid + """ + valid_split_by = ["function", "page", "passage", "period", "word", "line", "sentence"] + if split_by not in valid_split_by: + raise ValueError(f"split_by must be one of {', '.join(valid_split_by)}.") + if split_by == "function" and splitting_function is None: raise ValueError("When 'split_by' is set to 'function', a valid 'splitting_function' must be provided.") + if split_length <= 0: raise ValueError("split_length must be greater than 0.") - self.split_length = split_length + if split_overlap < 0: raise ValueError("split_overlap must be greater than or equal to 0.") - self.split_overlap = split_overlap - self.split_threshold = split_threshold - self.splitting_function = splitting_function + + if respect_sentence_boundary and split_by != "word": + logger.warning( + "The 'respect_sentence_boundary' option is only supported for `split_by='word'`. " + "The option `respect_sentence_boundary` will be set to `False`." + ) + self.respect_sentence_boundary = False @component.output_types(documents=List[Document]) def run(self, documents: List[Document]): @@ -98,7 +173,6 @@ def run(self, documents: List[Document]): and an overlap of `split_overlap`. :param documents: The documents to split. - :returns: A dictionary with the following key: - `documents`: List of documents with the split texts. Each document includes: - A metadata field `source_id` to track the original document. @@ -121,39 +195,69 @@ def run(self, documents: List[Document]): if doc.content == "": logger.warning("Document ID {doc_id} has an empty content. Skipping this document.", doc_id=doc.id) continue - split_docs += self._split(doc) + + split_docs += self._split_document(doc) return {"documents": split_docs} - def _split(self, to_split: Document) -> List[Document]: - # We already check this before calling _split but - # we need to make linters happy - if to_split.content is None: - return [] + def _split_document(self, doc: Document) -> List[Document]: + if self.split_by == "sentence" or self.respect_sentence_boundary: + return self._split_by_nltk_sentence(doc) if self.split_by == "function" and self.splitting_function is not None: - splits = self.splitting_function(to_split.content) - docs: List[Document] = [] - for s in splits: - meta = deepcopy(to_split.meta) - meta["source_id"] = to_split.id - docs.append(Document(content=s, meta=meta)) - return docs - - split_at = _SPLIT_BY_MAPPING[self.split_by] - units = to_split.content.split(split_at) + return self._split_by_function(doc) + + return self._split_by_character(doc) + + def _split_by_nltk_sentence(self, doc: Document) -> List[Document]: + split_docs = [] + + result = self.sentence_splitter.split_sentences(doc.content) # type: ignore # None check is done in run() + units = [sentence["sentence"] for sentence in result] + + if self.respect_sentence_boundary: + text_splits, splits_pages, splits_start_idxs = self._concatenate_sentences_based_on_word_amount( + sentences=units, split_length=self.split_length, split_overlap=self.split_overlap + ) + else: + text_splits, splits_pages, splits_start_idxs = self._concatenate_units( + elements=units, + split_length=self.split_length, + split_overlap=self.split_overlap, + split_threshold=self.split_threshold, + ) + metadata = deepcopy(doc.meta) + metadata["source_id"] = doc.id + split_docs += self._create_docs_from_splits( + text_splits=text_splits, splits_pages=splits_pages, splits_start_idxs=splits_start_idxs, meta=metadata + ) + + return split_docs + + def _split_by_character(self, doc) -> List[Document]: + split_at = _CHARACTER_SPLIT_BY_MAPPING[self.split_by] + units = doc.content.split(split_at) # Add the delimiter back to all units except the last one for i in range(len(units) - 1): units[i] += split_at - text_splits, splits_pages, splits_start_idxs = self._concatenate_units( units, self.split_length, self.split_overlap, self.split_threshold ) - metadata = deepcopy(to_split.meta) - metadata["source_id"] = to_split.id + metadata = deepcopy(doc.meta) + metadata["source_id"] = doc.id return self._create_docs_from_splits( text_splits=text_splits, splits_pages=splits_pages, splits_start_idxs=splits_start_idxs, meta=metadata ) + def _split_by_function(self, doc) -> List[Document]: + # the check for None is done already in the run method + splits = self.splitting_function(doc.content) # type: ignore + docs: List[Document] = [] + for s in splits: + meta = deepcopy(doc.meta) + meta["source_id"] = doc.id + docs.append(Document(content=s, meta=meta)) + return docs + def _concatenate_units( self, elements: List[str], split_length: int, split_overlap: int, split_threshold: int ) -> Tuple[List[str], List[int], List[int]]: @@ -265,6 +369,10 @@ def to_dict(self) -> Dict[str, Any]: split_length=self.split_length, split_overlap=self.split_overlap, split_threshold=self.split_threshold, + respect_sentence_boundary=self.respect_sentence_boundary, + language=self.language, + use_split_rules=self.use_split_rules, + extend_abbreviations=self.extend_abbreviations, ) if self.splitting_function: serialized["init_parameters"]["splitting_function"] = serialize_callable(self.splitting_function) @@ -282,3 +390,99 @@ def from_dict(cls, data: Dict[str, Any]) -> "DocumentSplitter": init_params["splitting_function"] = deserialize_callable(splitting_function) return default_from_dict(cls, data) + + @staticmethod + def _concatenate_sentences_based_on_word_amount( + sentences: List[str], split_length: int, split_overlap: int + ) -> Tuple[List[str], List[int], List[int]]: + """ + Groups the sentences into chunks of `split_length` words while respecting sentence boundaries. + + This function is only used when splitting by `word` and `respect_sentence_boundary` is set to `True`, i.e.: + with NLTK sentence tokenizer. + + :param sentences: The list of sentences to split. + :param split_length: The maximum number of words in each split. + :param split_overlap: The number of overlapping words in each split. + :returns: A tuple containing the concatenated sentences, the start page numbers, and the start indices. + """ + # chunk information + chunk_word_count = 0 + chunk_starting_page_number = 1 + chunk_start_idx = 0 + current_chunk: List[str] = [] + # output lists + split_start_page_numbers = [] + list_of_splits: List[List[str]] = [] + split_start_indices = [] + + for sentence_idx, sentence in enumerate(sentences): + current_chunk.append(sentence) + chunk_word_count += len(sentence.split()) + next_sentence_word_count = ( + len(sentences[sentence_idx + 1].split()) if sentence_idx < len(sentences) - 1 else 0 + ) + + # Number of words in the current chunk plus the next sentence is larger than the split_length, + # or we reached the last sentence + if (chunk_word_count + next_sentence_word_count) > split_length or sentence_idx == len(sentences) - 1: + # Save current chunk and start a new one + list_of_splits.append(current_chunk) + split_start_page_numbers.append(chunk_starting_page_number) + split_start_indices.append(chunk_start_idx) + + # Get the number of sentences that overlap with the next chunk + num_sentences_to_keep = DocumentSplitter._number_of_sentences_to_keep( + sentences=current_chunk, split_length=split_length, split_overlap=split_overlap + ) + # Set up information for the new chunk + if num_sentences_to_keep > 0: + # Processed sentences are the ones that are not overlapping with the next chunk + processed_sentences = current_chunk[:-num_sentences_to_keep] + chunk_starting_page_number += sum(sent.count("\f") for sent in processed_sentences) + chunk_start_idx += len("".join(processed_sentences)) + # Next chunk starts with the sentences that were overlapping with the previous chunk + current_chunk = current_chunk[-num_sentences_to_keep:] + chunk_word_count = sum(len(s.split()) for s in current_chunk) + else: + # Here processed_sentences is the same as current_chunk since there is no overlap + chunk_starting_page_number += sum(sent.count("\f") for sent in current_chunk) + chunk_start_idx += len("".join(current_chunk)) + current_chunk = [] + chunk_word_count = 0 + + # Concatenate the sentences together within each split + text_splits = [] + for split in list_of_splits: + text = "".join(split) + if len(text) > 0: + text_splits.append(text) + + return text_splits, split_start_page_numbers, split_start_indices + + @staticmethod + def _number_of_sentences_to_keep(sentences: List[str], split_length: int, split_overlap: int) -> int: + """ + Returns the number of sentences to keep in the next chunk based on the `split_overlap` and `split_length`. + + :param sentences: The list of sentences to split. + :param split_length: The maximum number of words in each split. + :param split_overlap: The number of overlapping words in each split. + :returns: The number of sentences to keep in the next chunk. + """ + # If the split_overlap is 0, we don't need to keep any sentences + if split_overlap == 0: + return 0 + + num_sentences_to_keep = 0 + num_words = 0 + # Next overlapping Document should not start exactly the same as the previous one, so we skip the first sentence + for sent in reversed(sentences[1:]): + num_words += len(sent.split()) + # If the number of words is larger than the split_length then don't add any more sentences + if num_words > split_length: + break + num_sentences_to_keep += 1 + if num_words > split_overlap: + break + return num_sentences_to_keep diff --git a/haystack/components/preprocessors/sentence_tokenizer.py b/haystack/components/preprocessors/sentence_tokenizer.py index 4932513452..505126e901 100644 --- a/haystack/components/preprocessors/sentence_tokenizer.py +++ b/haystack/components/preprocessors/sentence_tokenizer.py @@ -186,11 +186,16 @@ def _needs_join( """ Checks if the spans need to be joined as parts of one sentence. + This method determines whether two adjacent sentence spans should be joined back together as a single sentence. + It's used to prevent incorrect sentence splitting in specific cases like quotations, numbered lists, + and parenthetical expressions. + :param text: The text containing the spans. - :param span: The current sentence span within text. - :param next_span: The next sentence span within text. + :param span: Tuple of (start, end) positions for the current sentence span. + :param next_span: Tuple of (start, end) positions for the next sentence span. :param quote_spans: All quoted spans within text. - :returns: True if the spans needs to be joined. + :returns: + True if the spans needs to be joined. """ start, end = span next_start, next_end = next_span @@ -216,16 +221,16 @@ def _needs_join( return re.search(r"^\s*[\(\[]", text[next_start:next_end]) is not None @staticmethod - def _read_abbreviations(language: Language) -> List[str]: + def _read_abbreviations(lang: Language) -> List[str]: """ Reads the abbreviations for a given language from the abbreviations file. - :param language: The language to read the abbreviations for. + :param lang: The language to read the abbreviations for. :returns: List of abbreviations. """ - abbreviations_file = Path(__file__).parent.parent / f"data/abbreviations/{language}.txt" + abbreviations_file = Path(__file__).parent.parent / f"data/abbreviations/{lang}.txt" if not abbreviations_file.exists(): - logger.warning("No abbreviations file found for {language}.Using default abbreviations.", language=language) + logger.warning("No abbreviations file found for {language}. Using default abbreviations.", language=lang) return [] abbreviations = abbreviations_file.read_text().split("\n") diff --git a/releasenotes/notes/unifying-docsplitter-and-nltkdocsplitter-f01a983c7e7f3ed3.yaml b/releasenotes/notes/unifying-docsplitter-and-nltkdocsplitter-f01a983c7e7f3ed3.yaml new file mode 100644 index 0000000000..2b2a9d5f1b --- /dev/null +++ b/releasenotes/notes/unifying-docsplitter-and-nltkdocsplitter-f01a983c7e7f3ed3.yaml @@ -0,0 +1,4 @@ +--- +enhancements: + - | + The `NLTKDocumentSplitter` was merged into the `DocumentSplitter` which now provides the same functionality as the `NLTKDocumentSplitter`. The `split_by="sentence"` now uses a custom sentence boundary detection based on the `nltk` library. The previous `sentence` behaviour can still be achieved by `split_by="period"` diff --git a/test/components/preprocessors/test_document_splitter.py b/test/components/preprocessors/test_document_splitter.py index 25872626c1..78767dbccd 100644 --- a/test/components/preprocessors/test_document_splitter.py +++ b/test/components/preprocessors/test_document_splitter.py @@ -1,6 +1,8 @@ # SPDX-FileCopyrightText: 2022-present deepset GmbH # # SPDX-License-Identifier: Apache-2.0 +from typing import List + import re import pytest @@ -36,7 +38,7 @@ def merge_documents(documents): return merged_text -class TestDocumentSplitter: +class TestSplittingByFunctionOrCharacterRegex: def test_non_text_document(self): with pytest.raises( ValueError, match="DocumentSplitter only works with text documents but content for document ID" @@ -56,11 +58,13 @@ def test_empty_list(self): assert res == {"documents": []} def test_unsupported_split_by(self): - with pytest.raises( - ValueError, match="split_by must be one of 'function', 'word', 'sentence', 'page', 'passage' or 'line'." - ): + with pytest.raises(ValueError, match="split_by must be one of "): DocumentSplitter(split_by="unsupported") + def test_undefined_function(self): + with pytest.raises(ValueError, match="When 'split_by' is set to 'function', a valid 'splitting_function'"): + DocumentSplitter(split_by="function", splitting_function=None) + def test_unsupported_split_length(self): with pytest.raises(ValueError, match="split_length must be greater than 0."): DocumentSplitter(split_length=0) @@ -125,8 +129,8 @@ def test_split_by_word_multiple_input_docs(self): assert docs[4].meta["split_id"] == 2 assert docs[4].meta["split_idx_start"] == text2.index(docs[4].content) - def test_split_by_sentence(self): - splitter = DocumentSplitter(split_by="sentence", split_length=1) + def test_split_by_period(self): + splitter = DocumentSplitter(split_by="period", split_length=1) text = "This is a text with some words. There is a second sentence. And there is a third sentence." result = splitter.run(documents=[Document(content=text)]) docs = result["documents"] @@ -275,8 +279,8 @@ def test_add_page_number_to_metadata_with_no_overlap_word_split(self): for doc, p in zip(result["documents"], expected_pages): assert doc.meta["page_number"] == p - def test_add_page_number_to_metadata_with_no_overlap_sentence_split(self): - splitter = DocumentSplitter(split_by="sentence", split_length=1) + def test_add_page_number_to_metadata_with_no_overlap_period_split(self): + splitter = DocumentSplitter(split_by="period", split_length=1) doc1 = Document(content="This is some text.\f This text is on another page.") doc2 = Document(content="This content has two.\f\f page brakes.") result = splitter.run(documents=[doc1, doc2]) @@ -326,8 +330,8 @@ def test_add_page_number_to_metadata_with_overlap_word_split(self): for doc, p in zip(result["documents"], expected_pages): assert doc.meta["page_number"] == p - def test_add_page_number_to_metadata_with_overlap_sentence_split(self): - splitter = DocumentSplitter(split_by="sentence", split_length=2, split_overlap=1) + def test_add_page_number_to_metadata_with_overlap_period_split(self): + splitter = DocumentSplitter(split_by="period", split_length=2, split_overlap=1) doc1 = Document(content="This is some text. And this is more text.\f This text is on another page. End.") doc2 = Document(content="This content has two.\f\f page brakes. More text.") result = splitter.run(documents=[doc1, doc2]) @@ -494,3 +498,301 @@ def test_run_document_only_whitespaces(self): doc = Document(content=" ") results = splitter.run([doc]) assert results["documents"][0].content == " " + + +class TestSplittingNLTKSentenceSplitter: + @pytest.mark.parametrize( + "sentences, expected_num_sentences", + [ + (["The sun set.", "Moonlight shimmered softly, wolves howled nearby, night enveloped everything."], 0), + (["The sun set.", "It was a dark night ..."], 0), + (["The sun set.", " The moon was full."], 1), + (["The sun.", " The moon."], 1), # Ignores the first sentence + (["Sun", "Moon"], 1), # Ignores the first sentence even if its inclusion would be < split_overlap + ], + ) + def test_number_of_sentences_to_keep(self, sentences: List[str], expected_num_sentences: int) -> None: + num_sentences = DocumentSplitter._number_of_sentences_to_keep( + sentences=sentences, split_length=5, split_overlap=2 + ) + assert num_sentences == expected_num_sentences + + def test_number_of_sentences_to_keep_split_overlap_zero(self) -> None: + sentences = [ + "Moonlight shimmered softly, wolves howled nearby, night enveloped everything.", + " It was a dark night ...", + " The moon was full.", + ] + num_sentences = DocumentSplitter._number_of_sentences_to_keep( + sentences=sentences, split_length=5, split_overlap=0 + ) + assert num_sentences == 0 + + def test_run_split_by_sentence_1(self) -> None: + document_splitter = DocumentSplitter( + split_by="sentence", + split_length=2, + split_overlap=0, + split_threshold=0, + language="en", + use_split_rules=True, + extend_abbreviations=True, + ) + + text = ( + "Moonlight shimmered softly, wolves howled nearby, night enveloped everything. It was a dark night ... " + "The moon was full." + ) + documents = document_splitter.run(documents=[Document(content=text)])["documents"] + + assert len(documents) == 2 + assert ( + documents[0].content == "Moonlight shimmered softly, wolves howled nearby, night enveloped " + "everything. It was a dark night ... " + ) + assert documents[1].content == "The moon was full." + + def test_run_split_by_sentence_2(self) -> None: + document_splitter = DocumentSplitter( + split_by="sentence", + split_length=1, + split_overlap=0, + split_threshold=0, + language="en", + use_split_rules=False, + extend_abbreviations=True, + ) + + text = ( + "This is a test sentence with many many words that exceeds the split length and should not be repeated. " + "This is another test sentence. (This is a third test sentence.) " + "This is the last test sentence." + ) + documents = document_splitter.run(documents=[Document(content=text)])["documents"] + + assert len(documents) == 4 + assert ( + documents[0].content + == "This is a test sentence with many many words that exceeds the split length and should not be repeated. " + ) + assert documents[0].meta["page_number"] == 1 + assert documents[0].meta["split_id"] == 0 + assert documents[0].meta["split_idx_start"] == text.index(documents[0].content) + assert documents[1].content == "This is another test sentence. " + assert documents[1].meta["page_number"] == 1 + assert documents[1].meta["split_id"] == 1 + assert documents[1].meta["split_idx_start"] == text.index(documents[1].content) + assert documents[2].content == "(This is a third test sentence.) " + assert documents[2].meta["page_number"] == 1 + assert documents[2].meta["split_id"] == 2 + assert documents[2].meta["split_idx_start"] == text.index(documents[2].content) + assert documents[3].content == "This is the last test sentence." + assert documents[3].meta["page_number"] == 1 + assert documents[3].meta["split_id"] == 3 + assert documents[3].meta["split_idx_start"] == text.index(documents[3].content) + + def test_run_split_by_sentence_3(self) -> None: + document_splitter = DocumentSplitter( + split_by="sentence", + split_length=1, + split_overlap=0, + split_threshold=0, + language="en", + use_split_rules=True, + extend_abbreviations=True, + ) + + text = "Sentence on page 1.\fSentence on page 2. \fSentence on page 3. \f\f Sentence on page 5." + documents = document_splitter.run(documents=[Document(content=text)])["documents"] + + assert len(documents) == 4 + assert documents[0].content == "Sentence on page 1.\f" + assert documents[0].meta["page_number"] == 1 + assert documents[0].meta["split_id"] == 0 + assert documents[0].meta["split_idx_start"] == text.index(documents[0].content) + assert documents[1].content == "Sentence on page 2. \f" + assert documents[1].meta["page_number"] == 2 + assert documents[1].meta["split_id"] == 1 + assert documents[1].meta["split_idx_start"] == text.index(documents[1].content) + assert documents[2].content == "Sentence on page 3. \f\f " + assert documents[2].meta["page_number"] == 3 + assert documents[2].meta["split_id"] == 2 + assert documents[2].meta["split_idx_start"] == text.index(documents[2].content) + assert documents[3].content == "Sentence on page 5." + assert documents[3].meta["page_number"] == 5 + assert documents[3].meta["split_id"] == 3 + assert documents[3].meta["split_idx_start"] == text.index(documents[3].content) + + def test_run_split_by_sentence_4(self) -> None: + document_splitter = DocumentSplitter( + split_by="sentence", + split_length=2, + split_overlap=1, + split_threshold=0, + language="en", + use_split_rules=True, + extend_abbreviations=True, + ) + + text = "Sentence on page 1.\fSentence on page 2. \fSentence on page 3. \f\f Sentence on page 5." + documents = document_splitter.run(documents=[Document(content=text)])["documents"] + + assert len(documents) == 3 + assert documents[0].content == "Sentence on page 1.\fSentence on page 2. \f" + assert documents[0].meta["page_number"] == 1 + assert documents[0].meta["split_id"] == 0 + assert documents[0].meta["split_idx_start"] == text.index(documents[0].content) + assert documents[1].content == "Sentence on page 2. \fSentence on page 3. \f\f " + assert documents[1].meta["page_number"] == 2 + assert documents[1].meta["split_id"] == 1 + assert documents[1].meta["split_idx_start"] == text.index(documents[1].content) + assert documents[2].content == "Sentence on page 3. \f\f Sentence on page 5." + assert documents[2].meta["page_number"] == 3 + assert documents[2].meta["split_id"] == 2 + assert documents[2].meta["split_idx_start"] == text.index(documents[2].content) + + def test_run_split_by_word_respect_sentence_boundary(self) -> None: + document_splitter = DocumentSplitter( + split_by="word", + split_length=3, + split_overlap=0, + split_threshold=0, + language="en", + respect_sentence_boundary=True, + ) + + text = ( + "Moonlight shimmered softly, wolves howled nearby, night enveloped everything. It was a dark night.\f" + "The moon was full." + ) + documents = document_splitter.run(documents=[Document(content=text)])["documents"] + + assert len(documents) == 3 + assert documents[0].content == "Moonlight shimmered softly, wolves howled nearby, night enveloped everything. " + assert documents[0].meta["page_number"] == 1 + assert documents[0].meta["split_id"] == 0 + assert documents[0].meta["split_idx_start"] == text.index(documents[0].content) + assert documents[1].content == "It was a dark night.\f" + assert documents[1].meta["page_number"] == 1 + assert documents[1].meta["split_id"] == 1 + assert documents[1].meta["split_idx_start"] == text.index(documents[1].content) + assert documents[2].content == "The moon was full." + assert documents[2].meta["page_number"] == 2 + assert documents[2].meta["split_id"] == 2 + assert documents[2].meta["split_idx_start"] == text.index(documents[2].content) + + def test_run_split_by_word_respect_sentence_boundary_no_repeats(self) -> None: + document_splitter = DocumentSplitter( + split_by="word", + split_length=13, + split_overlap=3, + split_threshold=0, + language="en", + respect_sentence_boundary=True, + use_split_rules=False, + extend_abbreviations=False, + ) + text = ( + "This is a test sentence with many many words that exceeds the split length and should not be repeated. " + "This is another test sentence. (This is a third test sentence.) " + "This is the last test sentence." + ) + documents = document_splitter.run([Document(content=text)])["documents"] + assert len(documents) == 3 + assert ( + documents[0].content + == "This is a test sentence with many many words that exceeds the split length and should not be repeated. " + ) + assert "This is a test sentence with many many words" not in documents[1].content + assert "This is a test sentence with many many words" not in documents[2].content + + def test_run_split_by_word_respect_sentence_boundary_with_split_overlap_and_page_breaks(self) -> None: + document_splitter = DocumentSplitter( + split_by="word", + split_length=8, + split_overlap=1, + split_threshold=0, + language="en", + use_split_rules=True, + extend_abbreviations=True, + respect_sentence_boundary=True, + ) + + text = ( + "Sentence on page 1. Another on page 1.\fSentence on page 2. Another on page 2.\f" + "Sentence on page 3. Another on page 3.\f\f Sentence on page 5." + ) + documents = document_splitter.run(documents=[Document(content=text)])["documents"] + + assert len(documents) == 6 + assert documents[0].content == "Sentence on page 1. Another on page 1.\f" + assert documents[0].meta["page_number"] == 1 + assert documents[0].meta["split_id"] == 0 + assert documents[0].meta["split_idx_start"] == text.index(documents[0].content) + assert documents[1].content == "Another on page 1.\fSentence on page 2. " + assert documents[1].meta["page_number"] == 1 + assert documents[1].meta["split_id"] == 1 + assert documents[1].meta["split_idx_start"] == text.index(documents[1].content) + assert documents[2].content == "Sentence on page 2. Another on page 2.\f" + assert documents[2].meta["page_number"] == 2 + assert documents[2].meta["split_id"] == 2 + assert documents[2].meta["split_idx_start"] == text.index(documents[2].content) + assert documents[3].content == "Another on page 2.\fSentence on page 3. " + assert documents[3].meta["page_number"] == 2 + assert documents[3].meta["split_id"] == 3 + assert documents[3].meta["split_idx_start"] == text.index(documents[3].content) + assert documents[4].content == "Sentence on page 3. Another on page 3.\f\f " + assert documents[4].meta["page_number"] == 3 + assert documents[4].meta["split_id"] == 4 + assert documents[4].meta["split_idx_start"] == text.index(documents[4].content) + assert documents[5].content == "Another on page 3.\f\f Sentence on page 5." + assert documents[5].meta["page_number"] == 3 + assert documents[5].meta["split_id"] == 5 + assert documents[5].meta["split_idx_start"] == text.index(documents[5].content) + + def test_respect_sentence_boundary_checks(self): + # this combination triggers the warning + splitter = DocumentSplitter(split_by="sentence", split_length=10, respect_sentence_boundary=True) + assert splitter.respect_sentence_boundary == False + + def test_sentence_serialization(self): + """Test serialization with NLTK sentence splitting configuration and using non-default values""" + splitter = DocumentSplitter( + split_by="sentence", + language="de", + use_split_rules=False, + extend_abbreviations=False, + respect_sentence_boundary=False, + ) + serialized = splitter.to_dict() + deserialized = DocumentSplitter.from_dict(serialized) + + assert deserialized.split_by == "sentence" + assert hasattr(deserialized, "sentence_splitter") + assert deserialized.language == "de" + assert deserialized.use_split_rules == False + assert deserialized.extend_abbreviations == False + assert deserialized.respect_sentence_boundary == False + + def test_nltk_serialization_roundtrip(self): + """Test complete serialization roundtrip with actual document splitting""" + splitter = DocumentSplitter( + split_by="sentence", + language="de", + use_split_rules=False, + extend_abbreviations=False, + respect_sentence_boundary=False, + ) + serialized = splitter.to_dict() + deserialized_splitter = DocumentSplitter.from_dict(serialized) + assert splitter.split_by == deserialized_splitter.split_by + + def test_respect_sentence_boundary_serialization(self): + """Test serialization with respect_sentence_boundary option""" + splitter = DocumentSplitter(split_by="word", respect_sentence_boundary=True, language="de") + serialized = splitter.to_dict() + deserialized = DocumentSplitter.from_dict(serialized) + + assert deserialized.respect_sentence_boundary == True + assert hasattr(deserialized, "sentence_splitter") + assert deserialized.language == "de" diff --git a/test/components/preprocessors/test_sentence_tokenizer.py b/test/components/preprocessors/test_sentence_tokenizer.py new file mode 100644 index 0000000000..bf9aab9a9e --- /dev/null +++ b/test/components/preprocessors/test_sentence_tokenizer.py @@ -0,0 +1,67 @@ +import pytest +from unittest.mock import patch +from pathlib import Path + +from haystack.components.preprocessors.sentence_tokenizer import SentenceSplitter + +from pytest import LogCaptureFixture + + +def test_apply_split_rules_no_join() -> None: + text = "This is a test. This is another test. And a third test." + spans = [(0, 15), (16, 36), (37, 54)] + result = SentenceSplitter._apply_split_rules(text, spans) + assert len(result) == 3 + assert result == [(0, 15), (16, 36), (37, 54)] + + +def test_apply_split_rules_join_case_1(): + text = 'He said "This is sentence one. This is sentence two." Then he left.' + result = SentenceSplitter._apply_split_rules(text, [(0, 30), (31, 53), (54, 67)]) + assert len(result) == 2 + assert result == [(0, 53), (54, 67)] + + +def test_apply_split_rules_join_case_3(): + splitter = SentenceSplitter(language="en", use_split_rules=True) + text = """ + 1. First item + 2. Second item + 3. Third item.""" + spans = [(0, 7), (8, 25), (26, 44), (45, 56)] + result = splitter._apply_split_rules(text, spans) + assert len(result) == 1 + assert result == [(0, 56)] + + +def test_apply_split_rules_join_case_4() -> None: + text = "This is a test. (With a parenthetical statement.) And another sentence." + spans = [(0, 15), (16, 50), (51, 74)] + result = SentenceSplitter._apply_split_rules(text, spans) + assert len(result) == 2 + assert result == [(0, 50), (51, 74)] + + +@pytest.fixture +def mock_file_content(): + return "Mr.\nDr.\nProf." + + +def test_read_abbreviations_existing_file(tmp_path, mock_file_content): + abbrev_dir = tmp_path / "data" / "abbreviations" + abbrev_dir.mkdir(parents=True) + abbrev_file = abbrev_dir / f"en.txt" + abbrev_file.write_text(mock_file_content) + + with patch("haystack.components.preprocessors.sentence_tokenizer.Path") as mock_path: + mock_path.return_value.parent.parent = tmp_path + result = SentenceSplitter._read_abbreviations("en") + assert result == ["Mr.", "Dr.", "Prof."] + + +def test_read_abbreviations_missing_file(caplog: LogCaptureFixture): + with patch("haystack.components.preprocessors.sentence_tokenizer.Path") as mock_path: + mock_path.return_value.parent.parent = Path("/nonexistent") + result = SentenceSplitter._read_abbreviations("pt") + assert result == [] + assert "No abbreviations file found for pt. Using default abbreviations." in caplog.text diff --git a/test/components/retrievers/test_sentence_window_retriever.py b/test/components/retrievers/test_sentence_window_retriever.py index 4979fa334d..04e03befbe 100644 --- a/test/components/retrievers/test_sentence_window_retriever.py +++ b/test/components/retrievers/test_sentence_window_retriever.py @@ -176,7 +176,7 @@ def test_context_documents_returned_are_ordered_by_split_idx_start(self): @pytest.mark.integration def test_run_with_pipeline(self): - splitter = DocumentSplitter(split_length=1, split_overlap=0, split_by="sentence") + splitter = DocumentSplitter(split_length=1, split_overlap=0, split_by="period") text = ( "This is a text with some words. There is a second sentence. And there is also a third sentence. " "It also contains a fourth sentence. And a fifth sentence. And a sixth sentence. And a seventh sentence"