From 436d0fcca9c55e07a3e58c884e59186340ba3423 Mon Sep 17 00:00:00 2001 From: James Villarrubia Date: Wed, 12 Jun 2024 09:31:25 -0400 Subject: [PATCH] Additional header required to get style tags for visual processing. --- nlm_ingestor/file_parser/tika_parser.py | 1 + 1 file changed, 1 insertion(+) diff --git a/nlm_ingestor/file_parser/tika_parser.py b/nlm_ingestor/file_parser/tika_parser.py index fb10955..97ad788 100644 --- a/nlm_ingestor/file_parser/tika_parser.py +++ b/nlm_ingestor/file_parser/tika_parser.py @@ -23,6 +23,7 @@ def parse_to_html(self, filepath, do_ocr=False): "X-Tika-OCRskipOcr": "false", "X-Tika-OCRoutputType": "hocr", "X-Tika-Timeout-Millis": str(100 * timeout), + "X-Tika-PDFOcrStrategy": "ocr_only", "X-Tika-OCRtimeoutSeconds": str(timeout), }