Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Add Azure Document Intelligence Read Tool #36

Merged
merged 4 commits into from
Oct 1, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 10 additions & 3 deletions docetl/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,14 +50,22 @@
"""

import os
from typing import Any, Dict, List, Optional
from typing import Any, Dict, Optional

import yaml
from rich import print

from docetl.builder import Optimizer
from docetl.runner import DSLRunner
from docetl.schemas import Dataset, EquijoinOp, FilterOp, GatherOp, MapOp, ParallelMapOp
from docetl.schemas import (
Dataset,
EquijoinOp,
FilterOp,
GatherOp,
MapOp,
ParallelMapOp,
ParsingTool,
)
from docetl.schemas import Pipeline as PipelineModel
from docetl.schemas import (
PipelineOutput,
Expand All @@ -66,7 +74,6 @@
ResolveOp,
SplitOp,
UnnestOp,
ParsingTool,
)


Expand Down
2 changes: 1 addition & 1 deletion docetl/builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,11 @@
from typing import Any, Dict, List, Optional, Tuple, Union

import yaml
from docetl.dataset import Dataset, create_parsing_tool_map
from rich.console import Console
from rich.status import Status
from rich.traceback import install

from docetl.dataset import Dataset, create_parsing_tool_map
from docetl.operations import get_operation
from docetl.operations.base import BaseOperation
from docetl.operations.utils import flush_cache
Expand Down
4 changes: 2 additions & 2 deletions docetl/dataset.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
from typing import List, Dict, Union, Optional, Any, Callable
import os
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import Any, Callable, Dict, List, Optional, Union

from docetl.parsing_tools import PARSING_TOOLS
from docetl.schemas import ParsingTool
from concurrent.futures import ThreadPoolExecutor, as_completed


def process_item(
Expand Down
1 change: 0 additions & 1 deletion docetl/operations/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@
from frozendict import frozendict
from jinja2 import Template
from litellm import completion, embedding, model_cost
from pydantic import create_model
from rich import print as rprint
from rich.console import Console
from rich.prompt import Prompt
Expand Down
156 changes: 149 additions & 7 deletions docetl/parsing_tools.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import os
from typing import Optional, List
import io
import os
from typing import List, Optional

from litellm import transcription


Expand Down Expand Up @@ -135,17 +136,17 @@ def docx_to_string(filename: str) -> List[str]:
return ["\n".join([paragraph.text for paragraph in doc.paragraphs])]


def pptx_to_string(filename: str, slide_per_document: bool = False) -> List[str]:
def pptx_to_string(filename: str, doc_per_slide: bool = False) -> List[str]:
"""
Extract text from a PowerPoint presentation.

Args:
filename (str): Path to the pptx file.
slide_per_document (bool): If True, return each slide as a separate
doc_per_slide (bool): If True, return each slide as a separate
document. If False, return the entire presentation as one document.

Returns:
List[str]: Extracted text from the presentation. If slide_per_document
List[str]: Extracted text from the presentation. If doc_per_slide
is True, each string in the list represents a single slide.
Otherwise, the list contains a single string with all slides'
content.
Expand All @@ -161,22 +162,163 @@ def pptx_to_string(filename: str, slide_per_document: bool = False) -> List[str]
if hasattr(shape, "text"):
slide_content.append(shape.text)

if slide_per_document:
if doc_per_slide:
result.append("\n".join(slide_content))
else:
result.extend(slide_content)

if not slide_per_document:
if not doc_per_slide:
result = ["\n".join(result)]

return result


def azure_di_read(
    filename: str,
    use_url: bool = False,
    include_line_numbers: bool = False,
    include_handwritten: bool = False,
    include_font_styles: bool = False,
    include_selection_marks: bool = False,
    doc_per_page: bool = False,
) -> List[str]:
    """
    Extract text from a document using Azure Document Intelligence ("prebuilt-read" model).

    Note to developers: We used this documentation: https://learn.microsoft.com/en-us/azure/ai-services/document-intelligence/how-to-guides/use-sdk-rest-api?view=doc-intel-4.0.0&tabs=windows&pivots=programming-language-python

    To use this function, you need to set up an Azure Document Intelligence resource:

    1. Create an Azure account if you don't have one: https://azure.microsoft.com/
    2. Set up a Document Intelligence resource in the Azure portal:
       https://portal.azure.com/#create/Microsoft.CognitiveServicesFormRecognizer
    3. Once created, find the resource's endpoint and key in the Azure portal
    4. Set these as environment variables:
       - DOCUMENTINTELLIGENCE_API_KEY: Your Azure Document Intelligence API key
       - DOCUMENTINTELLIGENCE_ENDPOINT: Your Azure Document Intelligence endpoint URL

    The function authenticates with these credentials, sends the document (either
    as a local file or a URL) to Azure for analysis, and extracts the text content
    from the returned result, applying the requested formatting options (line
    numbers, handwritten content, font styles, selection marks).

    Args:
        filename (str): Path to the file to be analyzed, or URL of the document if use_url is True.
        use_url (bool, optional): If True, treat filename as a URL. Defaults to False.
        include_line_numbers (bool, optional): If True, include line numbers in the output. Defaults to False.
        include_handwritten (bool, optional): If True, include handwritten text in the output. Defaults to False.
        include_font_styles (bool, optional): If True, include font style information in the output. Defaults to False.
        include_selection_marks (bool, optional): If True, include selection marks in the output. Defaults to False.
        doc_per_page (bool, optional): If True, return each page as a separate document. Defaults to False.

    Returns:
        List[str]: Extracted text from the document. If doc_per_page is True, each string in the list represents
        a single page (preceded by any style documents). Otherwise, the list contains a single string with all
        pages' content.

    Raises:
        ValueError: If DOCUMENTINTELLIGENCE_API_KEY or DOCUMENTINTELLIGENCE_ENDPOINT environment variables are not set.
    """
    # Imported lazily so the Azure SDK is only required when this tool is
    # actually used (it ships in the optional `parsing` extra).
    from azure.ai.documentintelligence import DocumentIntelligenceClient
    from azure.ai.documentintelligence.models import AnalyzeDocumentRequest
    from azure.core.credentials import AzureKeyCredential

    key = os.getenv("DOCUMENTINTELLIGENCE_API_KEY")
    endpoint = os.getenv("DOCUMENTINTELLIGENCE_ENDPOINT")

    if key is None:
        raise ValueError("DOCUMENTINTELLIGENCE_API_KEY environment variable is not set")
    if endpoint is None:
        raise ValueError(
            "DOCUMENTINTELLIGENCE_ENDPOINT environment variable is not set"
        )

    document_analysis_client = DocumentIntelligenceClient(
        endpoint=endpoint, credential=AzureKeyCredential(key)
    )

    if use_url:
        poller = document_analysis_client.begin_analyze_document(
            "prebuilt-read", AnalyzeDocumentRequest(url_source=filename)
        )
    else:
        with open(filename, "rb") as f:
            poller = document_analysis_client.begin_analyze_document("prebuilt-read", f)

    result = poller.result()

    def _styled_text(style) -> str:
        # Reassemble the raw text covered by a style's character spans.
        return ",".join(
            result.content[span.offset : span.offset + span.length]
            for span in style.spans
        )

    style_content = []
    content = []

    if result.styles:
        for style in result.styles:
            if style.is_handwritten and include_handwritten:
                style_content.append(
                    f"Handwritten content: {_styled_text(style)}"
                )

            if style.font_style and include_font_styles:
                style_content.append(
                    f"'{style.font_style}' font style: {_styled_text(style)}"
                )

    for page in result.pages:
        page_content = []

        if page.lines:
            for line_idx, line in enumerate(page.lines):
                if include_line_numbers:
                    page_content.append(f" Line #{line_idx}: {line.content}")
                else:
                    page_content.append(f"{line.content}")

        if page.selection_marks and include_selection_marks:
            for selection_mark_idx, selection_mark in enumerate(page.selection_marks):
                page_content.append(
                    f"Selection mark #{selection_mark_idx}: State is '{selection_mark.state}' within bounding polygon "
                    f"'{selection_mark.polygon}' and has a confidence of {selection_mark.confidence}"
                )

        content.append("\n".join(page_content))

    if doc_per_page:
        # Style notes come first as their own documents, followed by one
        # document per page.
        return style_content + content
    else:
        # Single document: style notes, then each page prefixed with its
        # 1-based page number.
        return [
            "\n\n".join(
                [
                    "\n".join(style_content),
                    "\n\n".join(
                        f"Page {i+1}:\n{page_content}"
                        for i, page_content in enumerate(content)
                    ),
                ]
            )
        ]


# Registry of the built-in parsing tools, keyed by the name used in
# DocETL configuration files. Each function's public name doubles as its
# registry key, so the mapping is derived rather than spelled out twice.
PARSING_TOOLS = {
    tool.__name__: tool
    for tool in (
        whisper_speech_to_text,
        xlsx_to_string,
        txt_to_string,
        docx_to_string,
        pptx_to_string,
        azure_di_read,
    )
}
2 changes: 1 addition & 1 deletion docetl/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,10 @@
from dotenv import load_dotenv
from rich.console import Console

from docetl.dataset import Dataset, create_parsing_tool_map
from docetl.operations import get_operation
from docetl.operations.utils import flush_cache
from docetl.utils import load_config
from docetl.dataset import Dataset, create_parsing_tool_map

load_dotenv()

Expand Down
4 changes: 4 additions & 0 deletions docs/examples/custom-parsing.md
Original file line number Diff line number Diff line change
Expand Up @@ -238,6 +238,10 @@ DocETL provides several built-in parsing tools to handle common file formats and
options:
heading_level: 3

::: docetl.parsing_tools.azure_di_read
options:
heading_level: 3

### Using Function Arguments with Parsing Tools

When using parsing tools in your DocETL configuration, you can pass additional arguments to the parsing functions using the `function_kwargs` field. This allows you to customize the behavior of the parsing tools without modifying their implementation.
Expand Down
6 changes: 6 additions & 0 deletions docs/installation.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,12 @@ Before installing DocETL, ensure you have Python 3.10 or later installed on your
pip install docetl
```

If you want to use the parsing tools, you need to install the `parsing` extra:

```bash
pip install docetl[parsing]
```

This command will install DocETL along with its dependencies as specified in the pyproject.toml file. To verify that DocETL has been installed correctly, you can run the following command in your terminal:

```bash
Expand Down
53 changes: 51 additions & 2 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -28,9 +28,10 @@ openpyxl = { version = "^3.1.5", optional = true }
python-docx = { version = "^1.1.2", optional = true }
pydub = { version = "^0.25.1", optional = true }
python-pptx = { version = "^1.0.2", optional = true }
azure-ai-documentintelligence = { version = "^1.0.0b4", optional = true }

[tool.poetry.extras]
parsing = ["python-docx", "openpyxl", "pydub", "python-pptx"]
parsing = ["python-docx", "openpyxl", "pydub", "python-pptx", "azure-ai-documentintelligence"]

[tool.poetry.group.dev.dependencies]
pytest = "^8.3.2"
Expand Down
Loading
Loading