Skip to content

Commit

Permalink
feat: add pdf upload for the UI (#190)
Browse files Browse the repository at this point in the history
* feat: add pdf upload for the UI

* chore: update lockfile

* batch inputs and support txt files

* fix: fix bugs in UI

* fix: resolve viz
  • Loading branch information
shreyashankar authored Nov 17, 2024
1 parent 4f2e0e6 commit 02dd680
Show file tree
Hide file tree
Showing 25 changed files with 3,678 additions and 1,094 deletions.
8 changes: 6 additions & 2 deletions docetl/builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -320,8 +320,12 @@ def _insert_empty_resolve_operations(self):
has_map = True
map_op = op
elif op_type == "reduce" and op_config.get("synthesize_resolve", True):
has_reduce = True
reduce_op = op
reduce_key = op_config.get("reduce_key", "_all")
if isinstance(reduce_key, str):
reduce_key = [reduce_key]
if "_all" not in reduce_key:
has_reduce = True
reduce_op = op
elif op_type == "resolve":
has_resolve = True

Expand Down
7 changes: 7 additions & 0 deletions docetl/operations/reduce.py
Original file line number Diff line number Diff line change
Expand Up @@ -387,6 +387,13 @@ def process_group(
# Only execute merge-based plans if associative = True
if "merge_prompt" in self.config and self.config.get("associative", True):
result, cost = self._parallel_fold_and_merge(key, group_list)
elif (
self.config.get("fold_batch_size", None)
and self.config.get("fold_batch_size") >= len(group_list)
):
# If the fold batch size is greater than or equal to the number of items in the group,
# we can just run a single fold operation
result, cost = self._batch_reduce(key, group_list)
elif "fold_prompt" in self.config:
result, cost = self._incremental_reduce(key, group_list)
else:
Expand Down
6 changes: 6 additions & 0 deletions package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions package.json
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{}
2,110 changes: 1,264 additions & 846 deletions poetry.lock

Large diffs are not rendered by default.

7 changes: 3 additions & 4 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -23,25 +23,24 @@ typer = "^0.12.5"
pydantic = "^2.9.2"
asteval = "^1.0.4"
scikit-learn = "^1.5.2"
numpy = "^1.26.4"
pyrate-limiter = "^3.7.0"
openpyxl = { version = "^3.1.5", optional = true }
python-docx = { version = "^1.1.2", optional = true }
pydub = { version = "^0.25.1", optional = true }
python-pptx = { version = "^1.0.2", optional = true }
azure-ai-documentintelligence = { version = "^1.0.0b4", optional = true }
paddlepaddle = { version = "^2.6.2", optional = true }
paddleocr = { version = "^2.8.1", optional = true }
pymupdf = { version = "^1.24.10", optional = true }
jsonschema = "^4.23.0"
rapidfuzz = "^3.10.0"
fastapi = { version = "^0.115.0", optional = true }
uvicorn = { version = "^0.31.0", optional = true }
websockets = "^13.1"
docling = { version = "^2.5.2", optional = true }

[tool.poetry.extras]
parsing = ["python-docx", "openpyxl", "pydub", "python-pptx", "azure-ai-documentintelligence", "paddlepaddle", "paddleocr", "pymupdf"]
server = ["fastapi", "uvicorn"]
parsing = ["python-docx", "openpyxl", "pydub", "python-pptx", "azure-ai-documentintelligence", "paddlepaddle", "pymupdf"]
server = ["fastapi", "uvicorn", "docling"]

[tool.poetry.group.dev.dependencies]
pytest = "^8.3.2"
Expand Down
3 changes: 2 additions & 1 deletion server/app/main.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import os
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from server.app.routes import pipeline
from server.app.routes import pipeline, convert
from dotenv import load_dotenv

load_dotenv()
Expand All @@ -28,6 +28,7 @@
)

app.include_router(pipeline.router)
app.include_router(convert.router)


@app.get("/")
Expand Down
80 changes: 80 additions & 0 deletions server/app/routes/convert.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
from fastapi import APIRouter, UploadFile, File
from typing import List
import tempfile
import os
import aiohttp
from pathlib import Path

router = APIRouter()

# Remote docling conversion service (a Modal deployment). The upload route
# tries this endpoint first and only falls back to local conversion on failure.
MODAL_ENDPOINT = "https://ucbepic--docling-converter-convert-documents.modal.run"

@router.post("/api/convert-documents")
async def convert_documents(files: List[UploadFile] = File(...)):
    """Convert uploaded documents to markdown.

    Tries the remote Modal conversion endpoint first; on any exception or
    non-200 response it falls back to converting locally with docling.
    Plain-text (``.txt``) uploads are passed through verbatim, without
    running them through docling.

    Args:
        files: One or more uploaded documents (pdf, docx, txt, ...).

    Returns:
        ``{"documents": [{"filename": ..., "markdown": ...}, ...]}`` with one
        entry per uploaded file, in upload order.

    Raises:
        HTTPException: 400 if an uploaded filename attempts path traversal
            (only reachable on the local-fallback path).
    """
    # First try the Modal endpoint — offloads the heavy conversion work.
    try:
        async with aiohttp.ClientSession() as session:
            # Re-pack the uploads as a multipart form for the remote service.
            data = aiohttp.FormData()
            for file in files:
                data.add_field(
                    "files",
                    await file.read(),
                    filename=file.filename,
                    content_type=file.content_type,
                )

            async with session.post(MODAL_ENDPOINT, data=data, timeout=120) as response:
                if response.status == 200:
                    return await response.json()
                # Non-200: fall through to local processing below.

    except Exception as e:
        print(f"Modal endpoint failed: {str(e)}. Falling back to local processing...")

    # Local fallback. Imported lazily so the server can start without the
    # optional docling dependency when the Modal endpoint is reachable.
    from docling.document_converter import DocumentConverter
    doc_converter = DocumentConverter()

    # Spool the uploads to disk; docling consumes file paths, not streams.
    with tempfile.TemporaryDirectory() as temp_dir:
        temp_root = Path(temp_dir).resolve()
        file_paths = []          # on-disk paths, same order as `files`
        original_filenames = []  # original upload names, same order
        txt_files = []           # True where the upload is a .txt file
        for file in files:
            # Reset position: the Modal attempt above may have consumed the stream.
            await file.seek(0)
            # SECURITY: the filename is client-controlled. Anchor the resolved
            # path inside temp_dir and reject traversal attempts such as
            # "../../etc/passwd" instead of writing outside the sandbox.
            file_path = (temp_root / file.filename).resolve()
            if temp_root not in file_path.parents:
                raise HTTPException(
                    status_code=400,
                    detail=f"Invalid filename: {file.filename}",
                )
            # Filenames may legitimately contain subdirectories; create them.
            file_path.parent.mkdir(parents=True, exist_ok=True)
            with open(file_path, "wb") as buffer:
                buffer.write(await file.read())
            file_paths.append(str(file_path))
            original_filenames.append(file.filename)
            txt_files.append(file.filename.lower().endswith(".txt"))

        results = []
        non_txt_paths = [fp for fp, is_txt in zip(file_paths, txt_files) if not is_txt]

        # convert_all is assumed to yield results in input order, so pairing
        # this iterator with the non-txt files below keeps filenames and
        # conversion results aligned.  TODO confirm against docling docs.
        docling_iter = iter(doc_converter.convert_all(non_txt_paths)) if non_txt_paths else iter([])

        for filename, file_path, is_txt in zip(original_filenames, file_paths, txt_files):
            if is_txt:
                # Text uploads need no conversion — return the raw content.
                with open(file_path, "r", encoding="utf-8") as f:
                    content = f.read()
                results.append({
                    "filename": filename,
                    "markdown": content
                })
            else:
                # Consume the next docling result for this non-txt file.
                conv_result = next(docling_iter)
                results.append({
                    "filename": filename,
                    "markdown": conv_result.document.export_to_markdown()
                })

        return {"documents": results}
Loading

0 comments on commit 02dd680

Please sign in to comment.