From 710d283e98d57dd5c1c63a4029cab6c77038ca72 Mon Sep 17 00:00:00 2001
From: Christian O'Reilly <christian.oreilly@epfl.ch>
Date: Fri, 10 Aug 2018 15:52:20 +0200
Subject: [PATCH] Fixing the OCR on server-side.

For some reason, the behavior of ocrmypdf seems to have changed. Whereas before we expected it to produce the .txt file directly, it now generates a PDF with the OCR-ed text overlaid on it. This commit fixes the issue by overwriting the original scanned PDF with the text-overlaid PDF and then running the usual pdftotext on this new PDF.
---
 nat/restServer.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/nat/restServer.py b/nat/restServer.py
index cc8b791..d59bc3a 100644
--- a/nat/restServer.py
+++ b/nat/restServer.py
@@ -53,7 +53,8 @@ def runOCR(fileName):
         app.OCRLock.release()    
                 
         # Run OCR
-        run_ocrmypdf(fileName + ".pdf", fileName + ".txt")
+        run_ocrmypdf(fileName + ".pdf", fileName + ".pdf")
+        check_call(['pdftotext', '-enc', 'UTF-8', fileName + ".pdf", fileName + ".txt"])
  
         acquireLockWithTimeout()
         del app.OCRFiles[app.OCRFiles.index(fileName)]