Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Add Azure Document Intelligence Read Tool #36

Merged
merged 4 commits into from
Oct 1, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 10 additions & 3 deletions docetl/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,14 +50,22 @@
"""

import os
from typing import Any, Dict, List, Optional
from typing import Any, Dict, Optional

import yaml
from rich import print

from docetl.builder import Optimizer
from docetl.runner import DSLRunner
from docetl.schemas import Dataset, EquijoinOp, FilterOp, GatherOp, MapOp, ParallelMapOp
from docetl.schemas import (
Dataset,
EquijoinOp,
FilterOp,
GatherOp,
MapOp,
ParallelMapOp,
ParsingTool,
)
from docetl.schemas import Pipeline as PipelineModel
from docetl.schemas import (
PipelineOutput,
Expand All @@ -66,7 +74,6 @@
ResolveOp,
SplitOp,
UnnestOp,
ParsingTool,
)


Expand Down
2 changes: 1 addition & 1 deletion docetl/builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,11 @@
from typing import Any, Dict, List, Optional, Tuple, Union

import yaml
from docetl.dataset import Dataset, create_parsing_tool_map
from rich.console import Console
from rich.status import Status
from rich.traceback import install

from docetl.dataset import Dataset, create_parsing_tool_map
from docetl.operations import get_operation
from docetl.operations.base import BaseOperation
from docetl.operations.utils import flush_cache
Expand Down
4 changes: 2 additions & 2 deletions docetl/dataset.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
from typing import List, Dict, Union, Optional, Any, Callable
import os
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import Any, Callable, Dict, List, Optional, Union

from docetl.parsing_tools import PARSING_TOOLS
from docetl.schemas import ParsingTool
from concurrent.futures import ThreadPoolExecutor, as_completed


def process_item(
Expand Down
1 change: 0 additions & 1 deletion docetl/operations/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@
from frozendict import frozendict
from jinja2 import Template
from litellm import completion, embedding, model_cost
from pydantic import create_model
from rich import print as rprint
from rich.console import Console
from rich.prompt import Prompt
Expand Down
156 changes: 149 additions & 7 deletions docetl/parsing_tools.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import os
from typing import Optional, List
import io
import os
from typing import List, Optional

from litellm import transcription


Expand Down Expand Up @@ -135,17 +136,17 @@ def docx_to_string(filename: str) -> List[str]:
return ["\n".join([paragraph.text for paragraph in doc.paragraphs])]


def pptx_to_string(filename: str, slide_per_document: bool = False) -> List[str]:
def pptx_to_string(filename: str, doc_per_slide: bool = False) -> List[str]:
"""
Extract text from a PowerPoint presentation.

Args:
filename (str): Path to the pptx file.
slide_per_document (bool): If True, return each slide as a separate
doc_per_slide (bool): If True, return each slide as a separate
document. If False, return the entire presentation as one document.

Returns:
List[str]: Extracted text from the presentation. If slide_per_document
List[str]: Extracted text from the presentation. If doc_per_slide
is True, each string in the list represents a single slide.
Otherwise, the list contains a single string with all slides'
content.
Expand All @@ -161,22 +162,163 @@ def pptx_to_string(filename: str, slide_per_document: bool = False) -> List[str]
if hasattr(shape, "text"):
slide_content.append(shape.text)

if slide_per_document:
if doc_per_slide:
result.append("\n".join(slide_content))
else:
result.extend(slide_content)

if not slide_per_document:
if not doc_per_slide:
result = ["\n".join(result)]

return result


def azure_di_read(
    filename: str,
    use_url: bool = False,
    include_line_numbers: bool = False,
    include_handwritten: bool = False,
    include_font_styles: bool = False,
    include_selection_marks: bool = False,
    doc_per_page: bool = False,
) -> List[str]:
    """
    Extract text from a document using Azure Document Intelligence ("prebuilt-read" model).

    Note to developers: We used this documentation: https://learn.microsoft.com/en-us/azure/ai-services/document-intelligence/how-to-guides/use-sdk-rest-api?view=doc-intel-4.0.0&tabs=windows&pivots=programming-language-python

    To use this function, you need to set up an Azure Document Intelligence resource:

    1. Create an Azure account if you don't have one: https://azure.microsoft.com/
    2. Set up a Document Intelligence resource in the Azure portal:
       https://portal.azure.com/#create/Microsoft.CognitiveServicesFormRecognizer
    3. Once created, find the resource's endpoint and key in the Azure portal
    4. Set these as environment variables:
       - DOCUMENTINTELLIGENCE_API_KEY: Your Azure Document Intelligence API key
       - DOCUMENTINTELLIGENCE_ENDPOINT: Your Azure Document Intelligence endpoint URL

    The function authenticates with these credentials, sends the document (either
    as a local file or a URL) to Azure for analysis, and extracts the text content
    from the returned result, applying the requested formatting options (line
    numbers, handwritten content, font styles, selection marks).

    Args:
        filename (str): Path to the file to be analyzed, or URL of the document if use_url is True.
        use_url (bool, optional): If True, treat filename as a URL. Defaults to False.
        include_line_numbers (bool, optional): If True, include line numbers in the output. Defaults to False.
        include_handwritten (bool, optional): If True, include handwritten text in the output. Defaults to False.
        include_font_styles (bool, optional): If True, include font style information in the output. Defaults to False.
        include_selection_marks (bool, optional): If True, include selection marks in the output. Defaults to False.
        doc_per_page (bool, optional): If True, return each page as a separate document. Defaults to False.

    Returns:
        List[str]: Extracted text from the document. If doc_per_page is True, each string in the list represents
        a single page (preceded by any style documents). Otherwise, the list contains a single string with all
        pages' content.

    Raises:
        ValueError: If DOCUMENTINTELLIGENCE_API_KEY or DOCUMENTINTELLIGENCE_ENDPOINT environment variables are not set.
    """
    # Imported lazily so the Azure SDK is only required when this tool is
    # actually used (it ships in the optional `parsing` extra).
    from azure.ai.documentintelligence import DocumentIntelligenceClient
    from azure.ai.documentintelligence.models import AnalyzeDocumentRequest
    from azure.core.credentials import AzureKeyCredential

    key = os.getenv("DOCUMENTINTELLIGENCE_API_KEY")
    endpoint = os.getenv("DOCUMENTINTELLIGENCE_ENDPOINT")

    if key is None:
        raise ValueError("DOCUMENTINTELLIGENCE_API_KEY environment variable is not set")
    if endpoint is None:
        raise ValueError(
            "DOCUMENTINTELLIGENCE_ENDPOINT environment variable is not set"
        )

    document_analysis_client = DocumentIntelligenceClient(
        endpoint=endpoint, credential=AzureKeyCredential(key)
    )

    if use_url:
        poller = document_analysis_client.begin_analyze_document(
            "prebuilt-read", AnalyzeDocumentRequest(url_source=filename)
        )
    else:
        with open(filename, "rb") as f:
            poller = document_analysis_client.begin_analyze_document("prebuilt-read", f)

    result = poller.result()

    def _styled_text(style) -> str:
        # Reassemble the raw text covered by a style's character spans.
        return ",".join(
            result.content[span.offset : span.offset + span.length]
            for span in style.spans
        )

    style_content = []
    content = []

    if result.styles:
        for style in result.styles:
            if style.is_handwritten and include_handwritten:
                style_content.append(
                    f"Handwritten content: {_styled_text(style)}"
                )

            if style.font_style and include_font_styles:
                style_content.append(
                    f"'{style.font_style}' font style: {_styled_text(style)}"
                )

    for page in result.pages:
        page_content = []

        if page.lines:
            for line_idx, line in enumerate(page.lines):
                if include_line_numbers:
                    page_content.append(f" Line #{line_idx}: {line.content}")
                else:
                    page_content.append(f"{line.content}")

        if page.selection_marks and include_selection_marks:
            for selection_mark_idx, selection_mark in enumerate(page.selection_marks):
                page_content.append(
                    f"Selection mark #{selection_mark_idx}: State is '{selection_mark.state}' within bounding polygon "
                    f"'{selection_mark.polygon}' and has a confidence of {selection_mark.confidence}"
                )

        content.append("\n".join(page_content))

    if doc_per_page:
        # Style notes come first as their own documents, followed by one
        # document per page.
        return style_content + content
    else:
        # Single document: style notes, then each page prefixed with its
        # 1-based page number.
        return [
            "\n\n".join(
                [
                    "\n".join(style_content),
                    "\n\n".join(
                        f"Page {i+1}:\n{page_content}"
                        for i, page_content in enumerate(content)
                    ),
                ]
            )
        ]


# Registry of the built-in parsing tools, keyed by the name used in
# DocETL configuration files. Each function's public name doubles as its
# registry key, so the mapping is derived rather than spelled out twice.
PARSING_TOOLS = {
    tool.__name__: tool
    for tool in (
        whisper_speech_to_text,
        xlsx_to_string,
        txt_to_string,
        docx_to_string,
        pptx_to_string,
        azure_di_read,
    )
}
2 changes: 1 addition & 1 deletion docetl/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,10 @@
from dotenv import load_dotenv
from rich.console import Console

from docetl.dataset import Dataset, create_parsing_tool_map
from docetl.operations import get_operation
from docetl.operations.utils import flush_cache
from docetl.utils import load_config
from docetl.dataset import Dataset, create_parsing_tool_map

load_dotenv()

Expand Down
4 changes: 4 additions & 0 deletions docs/examples/custom-parsing.md
Original file line number Diff line number Diff line change
Expand Up @@ -238,6 +238,10 @@ DocETL provides several built-in parsing tools to handle common file formats and
options:
heading_level: 3

::: docetl.parsing_tools.azure_di_read
options:
heading_level: 3

### Using Function Arguments with Parsing Tools

When using parsing tools in your DocETL configuration, you can pass additional arguments to the parsing functions using the `function_kwargs` field. This allows you to customize the behavior of the parsing tools without modifying their implementation.
Expand Down
6 changes: 6 additions & 0 deletions docs/installation.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,12 @@ Before installing DocETL, ensure you have Python 3.10 or later installed on your
pip install docetl
```

If you want to use the parsing tools, you need to install the `parsing` extra:

```bash
pip install docetl[parsing]
```

This command will install DocETL along with its dependencies as specified in the pyproject.toml file. To verify that DocETL has been installed correctly, you can run the following command in your terminal:

```bash
Expand Down
53 changes: 51 additions & 2 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -28,9 +28,10 @@ openpyxl = { version = "^3.1.5", optional = true }
python-docx = { version = "^1.1.2", optional = true }
pydub = { version = "^0.25.1", optional = true }
python-pptx = { version = "^1.0.2", optional = true }
azure-ai-documentintelligence = { version = "^1.0.0b4", optional = true }

[tool.poetry.extras]
parsing = ["python-docx", "openpyxl", "pydub", "python-pptx"]
parsing = ["python-docx", "openpyxl", "pydub", "python-pptx", "azure-ai-documentintelligence"]

[tool.poetry.group.dev.dependencies]
pytest = "^8.3.2"
Expand Down
Loading
Loading