From 710d283e98d57dd5c1c63a4029cab6c77038ca72 Mon Sep 17 00:00:00 2001 From: Christian O'Reilly Date: Fri, 10 Aug 2018 15:52:20 +0200 Subject: [PATCH] Fixing the OCR on server-side. For some reason, the behavior of ocrmypdf seems to have changed. Whereas before we expected to get the .txt file directly from it, it now generates a PDF with the OCR-ed text overlaid on it. This commit fixes this issue by overwriting the original scanned PDF with a PDF that has the text overlaid, and then running the usual pdftotext on this new PDF. --- nat/restServer.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/nat/restServer.py b/nat/restServer.py index cc8b791..d59bc3a 100644 --- a/nat/restServer.py +++ b/nat/restServer.py @@ -53,7 +53,8 @@ def runOCR(fileName): app.OCRLock.release() # Run OCR - run_ocrmypdf(fileName + ".pdf", fileName + ".txt") + run_ocrmypdf(fileName + ".pdf", fileName + ".pdf") + check_call(['pdftotext', '-enc', 'UTF-8', fileName + ".pdf", fileName + ".txt"]) acquireLockWithTimeout() del app.OCRFiles[app.OCRFiles.index(fileName)]