From 2df40c338cd46146821af36437f5ffec67dec04c Mon Sep 17 00:00:00 2001
From: "David S. Batista"
Date: Mon, 16 Dec 2024 16:51:31 +0100
Subject: [PATCH] adding detecting for full overlap with previous chunks

---
Note: the warning added below fires when the slice taken from the previous
chunk equals the whole previous chunk, i.e. `split_overlap` is at least as
long as that chunk. The suggested remedy is therefore a smaller
`split_overlap` or a larger `split_length`; the test asserts the same text.

 .../components/preprocessors/recursive_splitter.py   | 10 +++++++++-
 .../preprocessors/test_recursive_splitter.py         | 12 +++++++-----
 2 files changed, 16 insertions(+), 6 deletions(-)

diff --git a/haystack/components/preprocessors/recursive_splitter.py b/haystack/components/preprocessors/recursive_splitter.py
index 3cb31a6893..fc421d552f 100644
--- a/haystack/components/preprocessors/recursive_splitter.py
+++ b/haystack/components/preprocessors/recursive_splitter.py
@@ -105,13 +105,21 @@ def _apply_overlap(self, chunks: List[str]) -> List[str]:
             The list of chunks with overlap applied.
         """
         overlapped_chunks = []
+
         for idx, chunk in enumerate(chunks):
             if idx == 0:
                 overlapped_chunks.append(chunk)
                 continue
             overlap_start = max(0, len(chunks[idx - 1]) - self.split_overlap)
-            current_chunk = chunks[idx - 1][overlap_start:] + chunk
+            overlap = chunks[idx - 1][overlap_start:]
+            if overlap == chunks[idx - 1]:
+                logger.warning(
+                    "Overlap is the same as the previous chunk. "
+                    "Consider increasing the `split_length` parameter or decreasing the `split_overlap` parameter."
+                )
+            current_chunk = overlap + chunk
             overlapped_chunks.append(current_chunk)
+
         return overlapped_chunks
 
     def _chunk_text(self, text: str) -> List[str]:
diff --git a/test/components/preprocessors/test_recursive_splitter.py b/test/components/preprocessors/test_recursive_splitter.py
index 71e189e30e..f7f0bf1467 100644
--- a/test/components/preprocessors/test_recursive_splitter.py
+++ b/test/components/preprocessors/test_recursive_splitter.py
@@ -39,7 +39,7 @@ def test_apply_overlap_no_overlap():
     assert result == ["chunk1", "chunk2", "chunk3"]
 
 
-def test_apply_overlap_with_overlap_case_1():
+def test_apply_overlap_with_overlap():
     # Test the case where there is overlap between chunks
     splitter = RecursiveDocumentSplitter(split_length=20, split_overlap=4, separators=["."])
     chunks = ["chunk1", "chunk2", "chunk3"]
@@ -47,12 +47,14 @@ def test_apply_overlap_with_overlap_case_1():
     assert result == ["chunk1", "unk1chunk2", "unk2chunk3"]
 
 
-# ToDo: update this test, result above is not the expected one
-def ignore_test_apply_overlap_with_overlap_case_2():
+def test_apply_overlap_with_overlap_capturing_completely_previous_chunk(caplog):
     splitter = RecursiveDocumentSplitter(split_length=20, split_overlap=6, separators=["."])
     chunks = ["chunk1", "chunk2", "chunk3", "chunk4"]
-    result = splitter._apply_overlap(chunks)
-    assert result == ["chunk1", "chunk1chunk2", "chunk2chunk3", "chunk3chunk4"]
+    _ = splitter._apply_overlap(chunks)
+    assert (
+        "Overlap is the same as the previous chunk. Consider increasing the `split_length` parameter or decreasing the `split_overlap` parameter."
+        in caplog.text
+    )
 
 
 def test_apply_overlap_single_chunk():