diff --git a/nlm_ingestor/file_parser/tika_parser.py b/nlm_ingestor/file_parser/tika_parser.py index fb10955..97ad788 100644 --- a/nlm_ingestor/file_parser/tika_parser.py +++ b/nlm_ingestor/file_parser/tika_parser.py @@ -23,6 +23,7 @@ def parse_to_html(self, filepath, do_ocr=False): "X-Tika-OCRskipOcr": "false", "X-Tika-OCRoutputType": "hocr", "X-Tika-Timeout-Millis": str(100 * timeout), + "X-Tika-PDFOcrStrategy": "ocr_only", "X-Tika-OCRtimeoutSeconds": str(timeout), }