Skip to content

Commit

Permalink
adding detection for full overlap with previous chunks
Browse files Browse the repository at this point in the history
  • Loading branch information
davidsbatista committed Dec 16, 2024
1 parent c3f09d0 commit 2df40c3
Show file tree
Hide file tree
Showing 2 changed files with 16 additions and 6 deletions.
10 changes: 9 additions & 1 deletion haystack/components/preprocessors/recursive_splitter.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,13 +105,21 @@ def _apply_overlap(self, chunks: List[str]) -> List[str]:
The list of chunks with overlap applied.
"""
overlapped_chunks = []

for idx, chunk in enumerate(chunks):
if idx == 0:
overlapped_chunks.append(chunk)
continue
overlap_start = max(0, len(chunks[idx - 1]) - self.split_overlap)
current_chunk = chunks[idx - 1][overlap_start:] + chunk
overlap = chunks[idx - 1][overlap_start:]
if overlap == chunks[idx - 1]:
logger.warning(
"Overlap is the same as the previous chunk. "
"Consider increasing the `split_overlap` parameter or decreasing the `split_length` parameter."
)
current_chunk = overlap + chunk
overlapped_chunks.append(current_chunk)

return overlapped_chunks

def _chunk_text(self, text: str) -> List[str]:
Expand Down
12 changes: 7 additions & 5 deletions test/components/preprocessors/test_recursive_splitter.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,20 +39,22 @@ def test_apply_overlap_no_overlap():
assert result == ["chunk1", "chunk2", "chunk3"]


def test_apply_overlap_with_overlap_case_1():
def test_apply_overlap_with_overlap():
# Test the case where there is overlap between chunks
splitter = RecursiveDocumentSplitter(split_length=20, split_overlap=4, separators=["."])
chunks = ["chunk1", "chunk2", "chunk3"]
result = splitter._apply_overlap(chunks)
assert result == ["chunk1", "unk1chunk2", "unk2chunk3"]


# ToDo: update this test, result above is not the expected one
def ignore_test_apply_overlap_with_overlap_case_2():
def test_apply_overlap_with_overlap_capturing_completely_previous_chunk(caplog):
splitter = RecursiveDocumentSplitter(split_length=20, split_overlap=6, separators=["."])
chunks = ["chunk1", "chunk2", "chunk3", "chunk4"]
result = splitter._apply_overlap(chunks)
assert result == ["chunk1", "chunk1chunk2", "chunk2chunk3", "chunk3chunk4"]
_ = splitter._apply_overlap(chunks)
assert (
"Overlap is the same as the previous chunk. Consider increasing the `split_overlap` parameter or decreasing the `split_length` parameter."
in caplog.text
)


def test_apply_overlap_single_chunk():
Expand Down

0 comments on commit 2df40c3

Please sign in to comment.