From 23e3cc23a9b4235a306dfa70f37535e7670899ca Mon Sep 17 00:00:00 2001 From: plpycoin <103234125+plpycoin@users.noreply.github.com> Date: Sun, 8 Dec 2024 07:57:04 +0800 Subject: [PATCH] Local GPU conversion of PDFs is far too slow (#229) Conversion time has been reduced from around 200s to around 15s ----- * turn off OCR * switch to Pdfium for the backend * turn on cell matching --- server/app/routes/convert.py | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/server/app/routes/convert.py b/server/app/routes/convert.py index a896bb37..2c680831 100644 --- a/server/app/routes/convert.py +++ b/server/app/routes/convert.py @@ -64,8 +64,23 @@ async def convert_documents(files: List[UploadFile] = File(...)): print(f"Modal endpoint failed: {str(e)}. Falling back to local processing...") # If Modal fails, fall back to local processing - from docling.document_converter import DocumentConverter - doc_converter = DocumentConverter() + from docling.datamodel.base_models import InputFormat + from docling.document_converter import DocumentConverter, PdfFormatOption + from docling.datamodel.pipeline_options import PdfPipelineOptions + from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend + + pipeline_options = PdfPipelineOptions() + pipeline_options.do_ocr = False + pipeline_options.do_table_structure = True + pipeline_options.table_structure_options.do_cell_matching = True + + doc_converter = DocumentConverter( + format_options={ + InputFormat.PDF: PdfFormatOption( + pipeline_options=pipeline_options, backend=PyPdfiumDocumentBackend + ) + } + ) # Create a temporary directory to store uploaded files with tempfile.TemporaryDirectory() as temp_dir: