Add detection of full overlap with previous chunks

deepset-ai · Dec 16, 2024 · 2df40c3 · 2df40c3
1 parent c3f09d0
commit 2df40c3
Show file tree

Hide file tree

Showing 2 changed files with 16 additions and 6 deletions.
diff --git a/haystack/components/preprocessors/recursive_splitter.py b/haystack/components/preprocessors/recursive_splitter.py
@@ -105,13 +105,21 @@ def _apply_overlap(self, chunks: List[str]) -> List[str]:
             The list of chunks with overlap applied.
         """
         overlapped_chunks = []
+
         for idx, chunk in enumerate(chunks):
             if idx == 0:
                 overlapped_chunks.append(chunk)
                 continue
             overlap_start = max(0, len(chunks[idx - 1]) - self.split_overlap)
-            current_chunk = chunks[idx - 1][overlap_start:] + chunk
+            overlap = chunks[idx - 1][overlap_start:]
+            if overlap == chunks[idx - 1]:
+                logger.warning(
+                    "Overlap is the same as the previous chunk. "
+                    "Consider increasing the `split_overlap` parameter or decreasing the `split_length` parameter."
+                )
+            current_chunk = overlap + chunk
             overlapped_chunks.append(current_chunk)
+
         return overlapped_chunks
 
     def _chunk_text(self, text: str) -> List[str]:

diff --git a/test/components/preprocessors/test_recursive_splitter.py b/test/components/preprocessors/test_recursive_splitter.py
@@ -39,20 +39,22 @@ def test_apply_overlap_no_overlap():
     assert result == ["chunk1", "chunk2", "chunk3"]
 
 
-def test_apply_overlap_with_overlap_case_1():
+def test_apply_overlap_with_overlap():
     # Test the case where there is overlap between chunks
     splitter = RecursiveDocumentSplitter(split_length=20, split_overlap=4, separators=["."])
     chunks = ["chunk1", "chunk2", "chunk3"]
     result = splitter._apply_overlap(chunks)
     assert result == ["chunk1", "unk1chunk2", "unk2chunk3"]
 
 
-# ToDo: update this test, result above is not the expected one
-def ignore_test_apply_overlap_with_overlap_case_2():
+def test_apply_overlap_with_overlap_capturing_completely_previous_chunk(caplog):
     splitter = RecursiveDocumentSplitter(split_length=20, split_overlap=6, separators=["."])
     chunks = ["chunk1", "chunk2", "chunk3", "chunk4"]
-    result = splitter._apply_overlap(chunks)
-    assert result == ["chunk1", "chunk1chunk2", "chunk2chunk3", "chunk3chunk4"]
+    _ = splitter._apply_overlap(chunks)
+    assert (
+        "Overlap is the same as the previous chunk. Consider increasing the `split_overlap` parameter or decreasing the `split_length` parameter."
+        in caplog.text
+    )
 
 
 def test_apply_overlap_single_chunk():