diff --git a/haystack/nodes/preprocessor/preprocessor.py b/haystack/nodes/preprocessor/preprocessor.py index 4ed52446e7..9dd21978ce 100644 --- a/haystack/nodes/preprocessor/preprocessor.py +++ b/haystack/nodes/preprocessor/preprocessor.py @@ -641,6 +641,9 @@ def _concatenate_units( else: num_page_breaks = len(processed_units) cur_page += num_page_breaks + else: + if self.add_page_number and split_at == "\f": + cur_page += 1 return text_splits, splits_pages, splits_start_idxs diff --git a/test/nodes/test_preprocessor.py b/test/nodes/test_preprocessor.py index 8801ce5125..e6f51fbfe4 100644 --- a/test/nodes/test_preprocessor.py +++ b/test/nodes/test_preprocessor.py @@ -265,6 +265,20 @@ def test_preprocess_page_split_and_split_overlap(): assert output[1].meta["page"] == 2 +@pytest.mark.unit +def test_preprocess_page_split_with_empty_pages(): + doc = Document( + content="This is a document on page 1.\f\fThis is a document on page 3.\f\fThis is a document on page 5." + ) + output = PreProcessor( + split_by="page", split_length=1, split_respect_sentence_boundary=False, split_overlap=0, add_page_number=True + ).run([doc])[0]["documents"] + assert len(output) == 3 + assert output[0] == Document(content="This is a document on page 1.", meta={"_split_id": 0, "page": 1}) + assert output[1] == Document(content="This is a document on page 3.", meta={"_split_id": 1, "page": 3}) + assert output[2] == Document(content="This is a document on page 5.", meta={"_split_id": 2, "page": 5}) + + @pytest.mark.unit def test_preprocess_tiktoken_token_split(mock_tiktoken_tokenizer): raw_docs = [