Skip to content

Commit

Permalink
feat: add pdf upload for the UI (#190)
Browse files Browse the repository at this point in the history
* feat: add pdf upload for the UI

* chore: update lockfile

* batch inputs and support txt files

* fix: fix bugs in UI

* fix: resolve viz
  • Loading branch information
shreyashankar authored Nov 17, 2024
1 parent 4f2e0e6 commit 02dd680
Show file tree
Hide file tree
Showing 25 changed files with 3,678 additions and 1,094 deletions.
8 changes: 6 additions & 2 deletions docetl/builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -320,8 +320,12 @@ def _insert_empty_resolve_operations(self):
has_map = True
map_op = op
elif op_type == "reduce" and op_config.get("synthesize_resolve", True):
has_reduce = True
reduce_op = op
reduce_key = op_config.get("reduce_key", "_all")
if isinstance(reduce_key, str):
reduce_key = [reduce_key]
if "_all" not in reduce_key:
has_reduce = True
reduce_op = op
elif op_type == "resolve":
has_resolve = True

Expand Down
7 changes: 7 additions & 0 deletions docetl/operations/reduce.py
Original file line number Diff line number Diff line change
Expand Up @@ -387,6 +387,13 @@ def process_group(
# Only execute merge-based plans if associative = True
if "merge_prompt" in self.config and self.config.get("associative", True):
result, cost = self._parallel_fold_and_merge(key, group_list)
elif (
self.config.get("fold_batch_size", None)
and self.config.get("fold_batch_size") >= len(group_list)
):
# If the fold batch size is greater than or equal to the number of items in the group,
# we can just run a single fold operation
result, cost = self._batch_reduce(key, group_list)
elif "fold_prompt" in self.config:
result, cost = self._incremental_reduce(key, group_list)
else:
Expand Down
6 changes: 6 additions & 0 deletions package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions package.json
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{}
2,110 changes: 1,264 additions & 846 deletions poetry.lock

Large diffs are not rendered by default.

7 changes: 3 additions & 4 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -23,25 +23,24 @@ typer = "^0.12.5"
pydantic = "^2.9.2"
asteval = "^1.0.4"
scikit-learn = "^1.5.2"
numpy = "^1.26.4"
pyrate-limiter = "^3.7.0"
openpyxl = { version = "^3.1.5", optional = true }
python-docx = { version = "^1.1.2", optional = true }
pydub = { version = "^0.25.1", optional = true }
python-pptx = { version = "^1.0.2", optional = true }
azure-ai-documentintelligence = { version = "^1.0.0b4", optional = true }
paddlepaddle = { version = "^2.6.2", optional = true }
paddleocr = { version = "^2.8.1", optional = true }
pymupdf = { version = "^1.24.10", optional = true }
jsonschema = "^4.23.0"
rapidfuzz = "^3.10.0"
fastapi = { version = "^0.115.0", optional = true }
uvicorn = { version = "^0.31.0", optional = true }
websockets = "^13.1"
docling = { version = "^2.5.2", optional = true }

[tool.poetry.extras]
parsing = ["python-docx", "openpyxl", "pydub", "python-pptx", "azure-ai-documentintelligence", "paddlepaddle", "paddleocr", "pymupdf"]
server = ["fastapi", "uvicorn"]
parsing = ["python-docx", "openpyxl", "pydub", "python-pptx", "azure-ai-documentintelligence", "paddlepaddle", "pymupdf"]
server = ["fastapi", "uvicorn", "docling"]

[tool.poetry.group.dev.dependencies]
pytest = "^8.3.2"
Expand Down
3 changes: 2 additions & 1 deletion server/app/main.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import os
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from server.app.routes import pipeline
from server.app.routes import pipeline, convert
from dotenv import load_dotenv

load_dotenv()
Expand All @@ -28,6 +28,7 @@
)

app.include_router(pipeline.router)
app.include_router(convert.router)


@app.get("/")
Expand Down
80 changes: 80 additions & 0 deletions server/app/routes/convert.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
from fastapi import APIRouter, UploadFile, File
from typing import List
import tempfile
import os
import aiohttp
from pathlib import Path

router = APIRouter()

# Remote docling conversion service (a Modal deployment). The upload route
# tries this endpoint first and only falls back to local conversion on failure.
MODAL_ENDPOINT = "https://ucbepic--docling-converter-convert-documents.modal.run"

@router.post("/api/convert-documents")
async def convert_documents(files: List[UploadFile] = File(...)):
    """Convert uploaded documents to markdown.

    Tries the remote Modal conversion endpoint first; on any exception or
    non-200 response it falls back to converting locally with docling.
    Plain-text (``.txt``) uploads are passed through verbatim, without
    running them through docling.

    Args:
        files: One or more uploaded documents (pdf, docx, txt, ...).

    Returns:
        ``{"documents": [{"filename": ..., "markdown": ...}, ...]}`` with one
        entry per uploaded file, in upload order.

    Raises:
        HTTPException: 400 if an uploaded filename attempts path traversal
            (only reachable on the local-fallback path).
    """
    # First try the Modal endpoint — offloads the heavy conversion work.
    try:
        async with aiohttp.ClientSession() as session:
            # Re-pack the uploads as a multipart form for the remote service.
            data = aiohttp.FormData()
            for file in files:
                data.add_field(
                    "files",
                    await file.read(),
                    filename=file.filename,
                    content_type=file.content_type,
                )

            async with session.post(MODAL_ENDPOINT, data=data, timeout=120) as response:
                if response.status == 200:
                    return await response.json()
                # Non-200: fall through to local processing below.

    except Exception as e:
        print(f"Modal endpoint failed: {str(e)}. Falling back to local processing...")

    # Local fallback. Imported lazily so the server can start without the
    # optional docling dependency when the Modal endpoint is reachable.
    from docling.document_converter import DocumentConverter
    doc_converter = DocumentConverter()

    # Spool the uploads to disk; docling consumes file paths, not streams.
    with tempfile.TemporaryDirectory() as temp_dir:
        temp_root = Path(temp_dir).resolve()
        file_paths = []          # on-disk paths, same order as `files`
        original_filenames = []  # original upload names, same order
        txt_files = []           # True where the upload is a .txt file
        for file in files:
            # Reset position: the Modal attempt above may have consumed the stream.
            await file.seek(0)
            # SECURITY: the filename is client-controlled. Anchor the resolved
            # path inside temp_dir and reject traversal attempts such as
            # "../../etc/passwd" instead of writing outside the sandbox.
            file_path = (temp_root / file.filename).resolve()
            if temp_root not in file_path.parents:
                raise HTTPException(
                    status_code=400,
                    detail=f"Invalid filename: {file.filename}",
                )
            # Filenames may legitimately contain subdirectories; create them.
            file_path.parent.mkdir(parents=True, exist_ok=True)
            with open(file_path, "wb") as buffer:
                buffer.write(await file.read())
            file_paths.append(str(file_path))
            original_filenames.append(file.filename)
            txt_files.append(file.filename.lower().endswith(".txt"))

        results = []
        non_txt_paths = [fp for fp, is_txt in zip(file_paths, txt_files) if not is_txt]

        # convert_all is assumed to yield results in input order, so pairing
        # this iterator with the non-txt files below keeps filenames and
        # conversion results aligned.  TODO confirm against docling docs.
        docling_iter = iter(doc_converter.convert_all(non_txt_paths)) if non_txt_paths else iter([])

        for filename, file_path, is_txt in zip(original_filenames, file_paths, txt_files):
            if is_txt:
                # Text uploads need no conversion — return the raw content.
                with open(file_path, "r", encoding="utf-8") as f:
                    content = f.read()
                results.append({
                    "filename": filename,
                    "markdown": content
                })
            else:
                # Consume the next docling result for this non-txt file.
                conv_result = next(docling_iter)
                results.append({
                    "filename": filename,
                    "markdown": conv_result.document.export_to_markdown()
                })

        return {"documents": results}
Loading

0 comments on commit 02dd680

Please sign in to comment.