Skip to content

Commit

Permalink
feat: add pdf upload for the UI
Browse files Browse the repository at this point in the history
  • Loading branch information
shreyashankar committed Nov 17, 2024
1 parent b565846 commit df2dea7
Show file tree
Hide file tree
Showing 20 changed files with 3,281 additions and 1,025 deletions.
8 changes: 6 additions & 2 deletions docetl/builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -320,8 +320,12 @@ def _insert_empty_resolve_operations(self):
has_map = True
map_op = op
elif op_type == "reduce" and op_config.get("synthesize_resolve", True):
has_reduce = True
reduce_op = op
reduce_key = op_config.get("reduce_key", "_all")
if isinstance(reduce_key, str):
reduce_key = [reduce_key]
if "_all" not in reduce_key:
has_reduce = True
reduce_op = op
elif op_type == "resolve":
has_resolve = True

Expand Down
6 changes: 6 additions & 0 deletions package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions package.json
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{}
2,138 changes: 1,278 additions & 860 deletions poetry.lock

Large diffs are not rendered by default.

7 changes: 3 additions & 4 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -23,25 +23,24 @@ typer = "^0.12.5"
pydantic = "^2.9.2"
asteval = "^1.0.4"
scikit-learn = "^1.5.2"
numpy = "^1.26.4"
pyrate-limiter = "^3.7.0"
openpyxl = { version = "^3.1.5", optional = true }
python-docx = { version = "^1.1.2", optional = true }
pydub = { version = "^0.25.1", optional = true }
python-pptx = { version = "^1.0.2", optional = true }
azure-ai-documentintelligence = { version = "^1.0.0b4", optional = true }
paddlepaddle = { version = "^2.6.2", optional = true }
paddleocr = { version = "^2.8.1", optional = true }
pymupdf = { version = "^1.24.10", optional = true }
jsonschema = "^4.23.0"
rapidfuzz = "^3.10.0"
fastapi = { version = "^0.115.0", optional = true }
uvicorn = { version = "^0.31.0", optional = true }
websockets = "^13.1"
docling = { version = "^2.5.2", optional = true }

[tool.poetry.extras]
parsing = ["python-docx", "openpyxl", "pydub", "python-pptx", "azure-ai-documentintelligence", "paddlepaddle", "paddleocr", "pymupdf"]
server = ["fastapi", "uvicorn"]
parsing = ["python-docx", "openpyxl", "pydub", "python-pptx", "azure-ai-documentintelligence", "paddlepaddle", "pymupdf"]
server = ["fastapi", "uvicorn", "docling"]

[tool.poetry.group.dev.dependencies]
pytest = "^8.3.2"
Expand Down
3 changes: 2 additions & 1 deletion server/app/main.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import os
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from server.app.routes import pipeline
from server.app.routes import pipeline, convert
from dotenv import load_dotenv

load_dotenv()
Expand All @@ -28,6 +28,7 @@
)

app.include_router(pipeline.router)
app.include_router(convert.router)


@app.get("/")
Expand Down
59 changes: 59 additions & 0 deletions server/app/routes/convert.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
from fastapi import APIRouter, UploadFile, File
from typing import List
import tempfile
import os
import aiohttp
from pathlib import Path

# Router exposing the document-conversion endpoint defined in this module.
router = APIRouter()

# Hosted conversion service that convert_documents tries first; when the
# request fails, the handler falls back to running docling locally.
MODAL_ENDPOINT = "https://ucbepic--docling-converter-doclingconverter-convert-documents.modal.run"

@router.post("/api/convert-documents")
async def convert_documents(files: List[UploadFile] = File(...)):
    """Convert uploaded documents to markdown.

    The hosted Modal endpoint is tried first. If that request raises or
    answers with a non-200 status, the documents are converted locally with
    docling's ``DocumentConverter`` instead.

    Args:
        files: Documents uploaded in the multipart form field ``files``.

    Returns:
        The JSON body returned by the Modal endpoint when it answers 200.
        Otherwise ``{"documents": [...]}`` with one
        ``{"filename": ..., "markdown": ...}`` entry per uploaded file, in
        upload order.
    """
    # First try Modal endpoint
    try:
        async with aiohttp.ClientSession() as session:
            # Prepare files for multipart upload
            data = aiohttp.FormData()
            for file in files:
                data.add_field(
                    "files",
                    await file.read(),
                    filename=file.filename,
                    content_type=file.content_type,
                )

            # aiohttp expects a ClientTimeout object; a bare number is the
            # deprecated form of this argument.
            request_timeout = aiohttp.ClientTimeout(total=120)
            async with session.post(
                MODAL_ENDPOINT, data=data, timeout=request_timeout
            ) as response:
                if response.status == 200:
                    return await response.json()
                print(
                    f"Modal endpoint returned status {response.status}. "
                    "Falling back to local processing..."
                )

    except Exception as e:
        # Deliberate best-effort: any failure of the remote service must not
        # fail the request, because local conversion is still available.
        print(f"Modal endpoint failed: {str(e)}. Falling back to local processing...")

    # If Modal fails, fall back to local processing. Imported lazily so the
    # optional docling dependency is only needed when the fallback runs.
    from docling.document_converter import DocumentConverter

    doc_converter = DocumentConverter()

    # Create a temporary directory to store uploaded files
    with tempfile.TemporaryDirectory() as temp_dir:
        file_paths = []
        original_filenames = []  # Keep track of original filenames
        for index, file in enumerate(files):
            # Reset file position since we might have read it in the Modal attempt
            await file.seek(0)

            # The filename is client-controlled: keep only its final path
            # component so a name such as "../../x" or "/etc/x" can never
            # escape the temporary directory.
            raw_name = (file.filename or "").replace("\\", "/")
            safe_name = Path(raw_name).name
            if safe_name in ("", ".", ".."):
                safe_name = f"document_{index}"

            # One sub-directory per upload so two files that share a name do
            # not overwrite each other before conversion.
            upload_dir = os.path.join(temp_dir, str(index))
            os.mkdir(upload_dir)
            file_path = os.path.join(upload_dir, safe_name)

            with open(file_path, "wb") as buffer:
                buffer.write(await file.read())
            file_paths.append(file_path)
            original_filenames.append(file.filename)

        # Convert all documents; results are paired with the names the
        # client sent, relying on convert_all yielding in input order.
        results = [
            {
                "filename": filename,
                "markdown": conv_result.document.export_to_markdown(),
            }
            for filename, conv_result in zip(
                original_filenames, doc_converter.convert_all(file_paths)
            )
        ]

    return {"documents": results}
Loading

0 comments on commit df2dea7

Please sign in to comment.