Skip to content

Commit

Permalink
Local GPU conversion of PDFs is far too slow. Just can't bear it…
Browse files Browse the repository at this point in the history
… anymore (#229)

Conversion time has been reduced from around 200s to around 15s.

-----
* Turn off OCR
* Switch to Pdfium as the PDF backend
* Turn on cell matching
  • Loading branch information
plpycoin authored Dec 7, 2024
1 parent 1c2fcf6 commit 23e3cc2
Showing 1 changed file with 17 additions and 2 deletions.
19 changes: 17 additions & 2 deletions server/app/routes/convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,8 +64,23 @@ async def convert_documents(files: List[UploadFile] = File(...)):
print(f"Modal endpoint failed: {str(e)}. Falling back to local processing...")

# If Modal fails, fall back to local processing
from docling.document_converter import DocumentConverter
doc_converter = DocumentConverter()
from docling.datamodel.base_models import InputFormat
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend

pipeline_options = PdfPipelineOptions()
pipeline_options.do_ocr = False
pipeline_options.do_table_structure = True
pipeline_options.table_structure_options.do_cell_matching = True

doc_converter = DocumentConverter(
format_options={
InputFormat.PDF: PdfFormatOption(
pipeline_options=pipeline_options, backend=PyPdfiumDocumentBackend
)
}
)

# Create a temporary directory to store uploaded files
with tempfile.TemporaryDirectory() as temp_dir:
Expand Down

0 comments on commit 23e3cc2

Please sign in to comment.