From aa687239e15e842708af79351ba666360edb4ae5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9mence=20Lesn=C3=A9?= <clemence@lesne.pro>
Date: Sun, 16 Jun 2024 16:58:06 +0200
Subject: [PATCH] feat: Compatibility with xps, epub, mobi, fb2, cbz, svg, txt

---
 README.md       | 31 +++++++++++++++++++++++++------
 function_app.py |  6 ++++--
 2 files changed, 29 insertions(+), 8 deletions(-)

diff --git a/README.md b/README.md
index 39ec82d..287954e 100644
--- a/README.md
+++ b/README.md
@@ -61,12 +61,31 @@ graph LR
 
 ### Format support
 
-Document extraction is based on Azure Document Intelligence, specifically on the `prebuilt-layout` model. It [supports the following](https://learn.microsoft.com/en-us/azure/ai-services/document-intelligence/concept-layout?view=doc-intel-4.0.0&tabs=sample-code#input-requirements) formats:
-
-- HTML
-- Images: JPEG/JPG, PNG, BMP, TIFF, HEIF
-- Microsoft Office: Word (DOCX), Excel (XLSX), PowerPoint (PPTX)
-- PDF
+Document extraction is based on Azure Document Intelligence, specifically on the `prebuilt-layout` model. It [supports popular formats](https://learn.microsoft.com/en-us/azure/ai-services/document-intelligence/concept-layout?view=doc-intel-4.0.0&tabs=sample-code#input-requirements).
+
+Some formats are first converted to PDF [with MuPDF](https://github.com/ArtifexSoftware/mupdf) to ensure compatibility with Document Intelligence.
+
+> [!IMPORTANT]
+> Formats not listed in the table below are treated as binary and decoded with `UTF-8` encoding.
+
+| **Format** | **OCR** | **Details** |
+|-|-|-|
+| `.bmp` | ✅ | |
+| `.cbz` | ✅ | First converted to PDF with MuPDF. |
+| `.docx` | ✅ | |
+| `.epub` | ✅ | First converted to PDF with MuPDF. |
+| `.fb2` | ✅ | First converted to PDF with MuPDF. |
+| `.heif` | ✅ | |
+| `.html` | ✅ | |
+| `.jpg`, `.jpeg` | ✅ | |
+| `.mobi` | ✅ | First converted to PDF with MuPDF. |
+| `.pdf` | ✅ | Sanitized & compressed with MuPDF. |
+| `.png` | ✅ | |
+| `.pptx` | ✅ | |
+| `.svg` | ✅ | First converted to PDF with MuPDF. |
+| `.tiff` | ✅ | |
+| `.xlsx` | ✅ | |
+| `.xps` | ✅ | First converted to PDF with MuPDF. |
 
 ### Demo
 
diff --git a/function_app.py b/function_app.py
index d10b591..efe74d8 100644
--- a/function_app.py
+++ b/function_app.py
@@ -89,11 +89,13 @@ async def _upload(local_file: IO, remote_path: str) -> None:
             await downloader.readinto(in_local_path)
             in_local_path.seek(0)  # Reset file pointer
 
-            if detect_extension(in_remote_path) == ".pdf":  # Sanitize PDF
-                logger.info(f"Sanitizing PDF ({in_remote_path})")
+            if detect_extension(in_remote_path) in {".pdf", ".xps", ".epub", ".mobi", ".fb2", ".cbz", ".svg", ".txt"}:  # Sanitize with PyMuPDF
+                logger.info(f"Sanitizing ({in_remote_path})")
                 doc_client = CONFIG.document_intelligence.instance()
                 # Open
                 in_pdf = pymupdf.open(in_local_path)
+                if not in_pdf.is_pdf:  # Convert to PDF
+                    in_pdf = pymupdf.open("pdf", in_pdf.convert_to_pdf())
                 # Sanitize
                 in_pdf.scrub(
                     hidden_text=False,  # Keep hidden text (it may contain OCR)