From b881f6941a4cf1abf749f6e27520145b69c6ed52 Mon Sep 17 00:00:00 2001 From: Vladimir Blagojevic Date: Thu, 29 Feb 2024 10:32:03 +0100 Subject: [PATCH 1/6] update version --- VERSION.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/VERSION.txt b/VERSION.txt index 2ec58cb5ed..625e998afc 100644 --- a/VERSION.txt +++ b/VERSION.txt @@ -1 +1 @@ -1.25.0-rc0 +1.25.0-rc1 From 76ff8e253f99adffeb1d5c89d8fb2e9e8fbb64a0 Mon Sep 17 00:00:00 2001 From: Tobias Wochinger Date: Thu, 29 Feb 2024 11:40:44 +0100 Subject: [PATCH 2/6] ci: fix rest-api tests (#7256) --- .github/workflows/rest_api_tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/rest_api_tests.yml b/.github/workflows/rest_api_tests.yml index e3fc5a5971..2da6bf564b 100644 --- a/.github/workflows/rest_api_tests.yml +++ b/.github/workflows/rest_api_tests.yml @@ -110,7 +110,7 @@ jobs: - name: Install REST API run: | pip install -U "./rest_api[dev]" - pip install ".[inference,dev]" + pip install ".[inference,dev,preprocessing]" pip install . - name: Run tests From ae20bae6a4461993ba15e4d9e30ff10e2ef2133b Mon Sep 17 00:00:00 2001 From: tstadel Date: Thu, 29 Feb 2024 17:40:32 +0100 Subject: [PATCH 3/6] fix: page_number for pdfs with pages not containing any text --- haystack/nodes/preprocessor/preprocessor.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/haystack/nodes/preprocessor/preprocessor.py b/haystack/nodes/preprocessor/preprocessor.py index 4ed52446e7..9dd21978ce 100644 --- a/haystack/nodes/preprocessor/preprocessor.py +++ b/haystack/nodes/preprocessor/preprocessor.py @@ -641,6 +641,9 @@ def _concatenate_units( else: num_page_breaks = len(processed_units) cur_page += num_page_breaks + else: + if self.add_page_number and split_at == "\f": + cur_page += 1 return text_splits, splits_pages, splits_start_idxs From 18fd55b595851b792f01aee27b69ff4acecece4b Mon Sep 17 00:00:00 2001 From: tstadel Date: Thu, 29 Feb 2024 17:41:06 +0100 Subject: [PATCH 4/6] Revert "fix: page_number for pdfs with pages not containing any text" This reverts commit ae20bae6a4461993ba15e4d9e30ff10e2ef2133b. --- haystack/nodes/preprocessor/preprocessor.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/haystack/nodes/preprocessor/preprocessor.py b/haystack/nodes/preprocessor/preprocessor.py index 9dd21978ce..4ed52446e7 100644 --- a/haystack/nodes/preprocessor/preprocessor.py +++ b/haystack/nodes/preprocessor/preprocessor.py @@ -641,9 +641,6 @@ def _concatenate_units( else: num_page_breaks = len(processed_units) cur_page += num_page_breaks - else: - if self.add_page_number and split_at == "\f": - cur_page += 1 return text_splits, splits_pages, splits_start_idxs From 39ce7ff919f61e81a1e70980502edafe566f32c3 Mon Sep 17 00:00:00 2001 From: tstadel Date: Thu, 29 Feb 2024 17:42:34 +0100 Subject: [PATCH 5/6] Reapply "fix: page_number for pdfs with pages not containing any text" This reverts commit 18fd55b595851b792f01aee27b69ff4acecece4b. --- haystack/nodes/preprocessor/preprocessor.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/haystack/nodes/preprocessor/preprocessor.py b/haystack/nodes/preprocessor/preprocessor.py index 4ed52446e7..9dd21978ce 100644 --- a/haystack/nodes/preprocessor/preprocessor.py +++ b/haystack/nodes/preprocessor/preprocessor.py @@ -641,6 +641,9 @@ def _concatenate_units( else: num_page_breaks = len(processed_units) cur_page += num_page_breaks + else: + if self.add_page_number and split_at == "\f": + cur_page += 1 return text_splits, splits_pages, splits_start_idxs From 41e74c23d76c1a61f0186e346eceb7cf8f604354 Mon Sep 17 00:00:00 2001 From: tstadel Date: Fri, 1 Mar 2024 11:43:47 +0100 Subject: [PATCH 6/6] add test --- test/nodes/test_preprocessor.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/test/nodes/test_preprocessor.py b/test/nodes/test_preprocessor.py index 8801ce5125..e6f51fbfe4 100644 --- a/test/nodes/test_preprocessor.py +++ b/test/nodes/test_preprocessor.py @@ -265,6 +265,20 @@ def test_preprocess_page_split_and_split_overlap(): assert output[1].meta["page"] == 2 +@pytest.mark.unit +def test_preprocess_page_split_with_empty_pages(): + doc = Document( + content="This is a document on page 1.\f\fThis is a document on page 3.\f\fThis is a document on page 5." + ) + output = PreProcessor( + split_by="page", split_length=1, split_respect_sentence_boundary=False, split_overlap=0, add_page_number=True + ).run([doc])[0]["documents"] + assert len(output) == 3 + assert output[0] == Document(content="This is a document on page 1.", meta={"_split_id": 0, "page": 1}) + assert output[1] == Document(content="This is a document on page 3.", meta={"_split_id": 1, "page": 3}) + assert output[2] == Document(content="This is a document on page 5.", meta={"_split_id": 2, "page": 5}) + + @pytest.mark.unit def test_preprocess_tiktoken_token_split(mock_tiktoken_tokenizer): raw_docs = [