From aa687239e15e842708af79351ba666360edb4ae5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9mence=20Lesn=C3=A9?= Date: Sun, 16 Jun 2024 16:58:06 +0200 Subject: [PATCH] feat: Compatibility with xps, epub, mobi, fb2, cbz, svg, txt --- README.md | 31 +++++++++++++++++++++++++------ function_app.py | 6 ++++-- 2 files changed, 29 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index 39ec82d..287954e 100644 --- a/README.md +++ b/README.md @@ -61,12 +61,31 @@ graph LR ### Format support -Document extraction is based on Azure Document Intelligence, specifically on the `prebuilt-layout` model. It [supports the following](https://learn.microsoft.com/en-us/azure/ai-services/document-intelligence/concept-layout?view=doc-intel-4.0.0&tabs=sample-code#input-requirements) formats: - -- HTML -- Images: JPEG/JPG, PNG, BMP, TIFF, HEIF -- Microsoft Office: Word (DOCX), Excel (XLSX), PowerPoint (PPTX) -- PDF +Document extraction is based on Azure Document Intelligence, specifically on the `prebuilt-layout` model. It [supports popular formats](https://learn.microsoft.com/en-us/azure/ai-services/document-intelligence/concept-layout?view=doc-intel-4.0.0&tabs=sample-code#input-requirements). + +Some formats are first converted to PDF [with MuPDF](https://github.com/ArtifexSoftware/mupdf) to ensure compatibility with Document Intelligence. + +> [!IMPORTANT] +> Formats not listed in the table below are treated as binary and decoded with `UTF-8` encoding. + +| `Format` | **OCR** | **Details** | +|-|-|-| +| `.bmp` | ✅ | | +| `.cbz` | ✅ | First converted to PDF with MuPDF. | +| `.docx` | ✅ | | +| `.epub` | ✅ | First converted to PDF with MuPDF. | +| `.fb2` | ✅ | First converted to PDF with MuPDF. | +| `.heif` | ✅ | | +| `.html` | ✅ | | +| `.jpg`, `.jpeg` | ✅ | | +| `.mobi` | ✅ | First converted to PDF with MuPDF. | +| `.pdf` | ✅ | Sanitized & compressed with MuPDF. | +| `.png` | ✅ | | +| `.pptx` | ✅ | | +| `.svg` | ✅ | First converted to PDF with MuPDF. 
| +| `.tiff` | ✅ | | +| `.xlsx` | ✅ | | +| `.xps` | ✅ | First converted to PDF with MuPDF. | ### Demo diff --git a/function_app.py b/function_app.py index d10b591..efe74d8 100644 --- a/function_app.py +++ b/function_app.py @@ -89,11 +89,13 @@ async def _upload(local_file: IO, remote_path: str) -> None: await downloader.readinto(in_local_path) in_local_path.seek(0) # Reset file pointer - if detect_extension(in_remote_path) == ".pdf": # Sanitize PDF - logger.info(f"Sanitizing PDF ({in_remote_path})") + if detect_extension(in_remote_path) in {".pdf", ".xps", ".epub", ".mobi", ".fb2", ".cbz", ".svg", ".txt"}: # Sanitize with PyMuPDF + logger.info(f"Sanitizing ({in_remote_path})") doc_client = CONFIG.document_intelligence.instance() # Open in_pdf = pymupdf.open(in_local_path) + if not in_pdf.is_pdf: # Convert to PDF + in_pdf = pymupdf.open("pdf", in_pdf.convert_to_pdf()) # Sanitize in_pdf.scrub( hidden_text=False, # Keep hidden text (it may contain OCR)