feat: DOCXToDocument: add table extraction (#8457)

* DOCXToDocument: add table extraction * Add reno note * mypy fixes * add unit tests * Add csv table support * Update release note * Add TableFormat enum * Add table_format as str init param * Update docx.py Co-authored-by: Madeesh Kannan <[email protected]> * PR feedback * PR feedback --------- Co-authored-by: medsriha <[email protected]> Co-authored-by: Mo Sriha <[email protected]> Co-authored-by: Madeesh Kannan <[email protected]>
deepset-ai · Oct 29, 2024 · 28161f7 · 28161f7
1 parent 8205724
commit 28161f7
Show file tree

Hide file tree

Showing 4 changed files with 371 additions and 50 deletions.
diff --git a/haystack/components/converters/docx.py b/haystack/components/converters/docx.py
@@ -2,12 +2,15 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
+import csv
 import io
 from dataclasses import dataclass
+from enum import Enum
+from io import StringIO
 from pathlib import Path
 from typing import Any, Dict, List, Optional, Union
 
-from haystack import Document, component, logging
+from haystack import Document, component, default_from_dict, default_to_dict, logging
 from haystack.components.converters.utils import get_bytestream_from_source, normalize_metadata
 from haystack.dataclasses import ByteStream
 from haystack.lazy_imports import LazyImport
@@ -17,6 +20,7 @@
 with LazyImport("Run 'pip install python-docx'") as docx_import:
     import docx
     from docx.document import Document as DocxDocument
+    from docx.table import Table
     from docx.text.paragraph import Paragraph
 
 
@@ -59,6 +63,30 @@ class DOCXMetadata:
     version: str
 
 
+class DOCXTableFormat(Enum):
+    """
+    Supported formats for storing DOCX tabular data in a Document.
+    """
+
+    MARKDOWN = "markdown"
+    CSV = "csv"
+
+    def __str__(self):
+        return self.value
+
+    @staticmethod
+    def from_str(string: str) -> "DOCXTableFormat":
+        """
+        Convert a string to a DOCXTableFormat enum.
+        """
+        enum_map = {e.value: e for e in DOCXTableFormat}
+        table_format = enum_map.get(string.lower())
+        if table_format is None:
+            msg = f"Unknown table format '{string}'. Supported formats are: {list(enum_map.keys())}"
+            raise ValueError(msg)
+        return table_format
+
+
 @component
 class DOCXToDocument:
     """
@@ -69,21 +97,48 @@ class DOCXToDocument:
 
     Usage example:
     ```python
-    from haystack.components.converters.docx import DOCXToDocument
+    from haystack.components.converters.docx import DOCXToDocument, DOCXTableFormat
 
-    converter = DOCXToDocument()
+    converter = DOCXToDocument(table_format=DOCXTableFormat.CSV)
     results = converter.run(sources=["sample.docx"], meta={"date_added": datetime.now().isoformat()})
     documents = results["documents"]
     print(documents[0].content)
     # 'This is a text from the DOCX file.'
     ```
     """
 
-    def __init__(self):
+    def __init__(self, table_format: Union[str, DOCXTableFormat] = DOCXTableFormat.CSV):
         """
         Create a DOCXToDocument component.
+
+        :param table_format: The format for table output. Can be either DOCXTableFormat.MARKDOWN,
+            DOCXTableFormat.CSV, "markdown", or "csv". Defaults to DOCXTableFormat.CSV.
         """
         docx_import.check()
+        self.table_format = DOCXTableFormat.from_str(table_format) if isinstance(table_format, str) else table_format
+
+    def to_dict(self) -> Dict[str, Any]:
+        """
+        Serializes the component to a dictionary.
+
+        :returns:
+            Dictionary with serialized data.
+        """
+        return default_to_dict(self, table_format=str(self.table_format))
+
+    @classmethod
+    def from_dict(cls, data: Dict[str, Any]) -> "DOCXToDocument":
+        """
+        Deserializes the component from a dictionary.
+
+        :param data:
+            The dictionary to deserialize from.
+        :returns:
+            The deserialized component.
+        """
+        if "table_format" in data["init_parameters"]:
+            data["init_parameters"]["table_format"] = DOCXTableFormat.from_str(data["init_parameters"]["table_format"])
+        return default_from_dict(cls, data)
 
     @component.output_types(documents=List[Document])
     def run(
@@ -118,9 +173,9 @@ def run(
                 logger.warning("Could not read {source}. Skipping it. Error: {error}", source=source, error=e)
                 continue
             try:
-                file = docx.Document(io.BytesIO(bytestream.data))
-                paragraphs = self._extract_paragraphs_with_page_breaks(file.paragraphs)
-                text = "\n".join(paragraphs)
+                docx_document = docx.Document(io.BytesIO(bytestream.data))
+                elements = self._extract_elements(docx_document)
+                text = "\n".join(elements)
             except Exception as e:
                 logger.warning(
                     "Could not read {source} and convert it to a DOCX Document, skipping. Error: {error}",
@@ -129,52 +184,116 @@ def run(
                 )
                 continue
 
-            docx_metadata = self._get_docx_metadata(document=file)
+            docx_metadata = self._get_docx_metadata(document=docx_document)
             merged_metadata = {**bytestream.meta, **metadata, "docx": docx_metadata}
             document = Document(content=text, meta=merged_metadata)
             documents.append(document)
 
         return {"documents": documents}
 
-    def _extract_paragraphs_with_page_breaks(self, paragraphs: List["Paragraph"]) -> List[str]:
+    def _extract_elements(self, document: "DocxDocument") -> List[str]:
         """
-        Extracts paragraphs from a DOCX file, including page breaks.
+        Extracts elements from a DOCX file.
 
-        Page breaks (both soft and hard page breaks) are not automatically extracted by python-docx as '\f' chars.
-        This means we need to add them in ourselves, as done here. This allows the correct page number
-        to be associated with each document if the file contents are split, e.g. by DocumentSplitter.
+        :param document: The DOCX Document object.
+        :returns: List of strings (paragraph texts and table representations) with page breaks added as '\f' characters.
+        """
+        elements = []
+        for element in document.element.body:
+            if element.tag.endswith("p"):
+                paragraph = Paragraph(element, document)
+                if paragraph.contains_page_break:
+                    para_text = self._process_paragraph_with_page_breaks(paragraph)
+                else:
+                    para_text = paragraph.text
+                elements.append(para_text)
+            elif element.tag.endswith("tbl"):
+                table = docx.table.Table(element, document)
+                table_str = (
+                    self._table_to_markdown(table)
+                    if self.table_format == DOCXTableFormat.MARKDOWN
+                    else self._table_to_csv(table)
+                )
+                elements.append(table_str)
 
-        :param paragraphs:
-            List of paragraphs from a DOCX file.
+        return elements
 
-        :returns:
-            List of strings (paragraph text fields) with all page breaks added in as '\f' characters.
-        """
-        paragraph_texts = []
-        for para in paragraphs:
-            if para.contains_page_break:
-                para_text_w_page_breaks = ""
-                # Usually, just 1 page break exists, but could be more if paragraph is really long, so we loop over them
-                for pb_index, page_break in enumerate(para.rendered_page_breaks):
-                    # Can only extract text from first paragraph page break, unfortunately
-                    if pb_index == 0:
-                        if page_break.preceding_paragraph_fragment:
-                            para_text_w_page_breaks += page_break.preceding_paragraph_fragment.text
-                        para_text_w_page_breaks += "\f"
-                        if page_break.following_paragraph_fragment:
-                            # following_paragraph_fragment contains all text for remainder of paragraph.
-                            # However, if the remainder of the paragraph spans multiple page breaks, it won't include
-                            # those later page breaks so we have to add them at end of text in the `else` block below.
-                            # This is not ideal, but this case should be very rare and this is likely good enough.
-                            para_text_w_page_breaks += page_break.following_paragraph_fragment.text
-                    else:
-                        para_text_w_page_breaks += "\f"
-
-                paragraph_texts.append(para_text_w_page_breaks)
+    def _process_paragraph_with_page_breaks(self, paragraph: "Paragraph") -> str:
+        """
+        Processes a paragraph with page breaks.
+
+        :param paragraph: The DOCX paragraph to process.
+        :returns: A string with page breaks added as '\f' characters.
+        """
+        para_text = ""
+        # Usually, just 1 page break exists, but could be more if paragraph is really long, so we loop over them
+        for pb_index, page_break in enumerate(paragraph.rendered_page_breaks):
+            # Can only extract text from first paragraph page break, unfortunately
+            if pb_index == 0:
+                if page_break.preceding_paragraph_fragment:
+                    para_text += page_break.preceding_paragraph_fragment.text
+                para_text += "\f"
+                if page_break.following_paragraph_fragment:
+                    # following_paragraph_fragment contains all text for remainder of paragraph.
+                    # However, if the remainder of the paragraph spans multiple page breaks, it won't include
+                    # those later page breaks so we have to add them at end of text in the `else` block below.
+                    # This is not ideal, but this case should be very rare and this is likely good enough.
+                    para_text += page_break.following_paragraph_fragment.text
             else:
-                paragraph_texts.append(para.text)
+                para_text += "\f"
+        return para_text
+
+    def _table_to_markdown(self, table: "Table") -> str:
+        """
+        Converts a DOCX table to a Markdown string.
+
+        :param table: The DOCX table to convert.
+        :returns: A Markdown string representation of the table.
+        """
+        markdown: List[str] = []
+        max_col_widths: List[int] = []
+
+        # Calculate max width for each column
+        for row in table.rows:
+            for i, cell in enumerate(row.cells):
+                cell_text = cell.text.strip()
+                if i >= len(max_col_widths):
+                    max_col_widths.append(len(cell_text))
+                else:
+                    max_col_widths[i] = max(max_col_widths[i], len(cell_text))
+
+        # Process rows
+        for i, row in enumerate(table.rows):
+            md_row = [cell.text.strip().ljust(max_col_widths[j]) for j, cell in enumerate(row.cells)]
+            markdown.append("| " + " | ".join(md_row) + " |")
+
+            # Add separator after header row
+            if i == 0:
+                separator = ["-" * max_col_widths[j] for j in range(len(row.cells))]
+                markdown.append("| " + " | ".join(separator) + " |")
+
+        return "\n".join(markdown)
+
+    def _table_to_csv(self, table: "Table") -> str:
+        """
+        Converts a DOCX table to a CSV string.
+
+        :param table: The DOCX table to convert.
+        :returns: A CSV string representation of the table.
+        """
+        csv_output = StringIO()
+        csv_writer = csv.writer(csv_output, quoting=csv.QUOTE_MINIMAL)
+
+        # Process rows
+        for row in table.rows:
+            csv_row = [cell.text.strip() for cell in row.cells]
+            csv_writer.writerow(csv_row)
+
+        # Get the CSV as a string and strip any trailing newlines
+        csv_string = csv_output.getvalue().strip()
+        csv_output.close()
 
-        return paragraph_texts
+        return csv_string
 
     def _get_docx_metadata(self, document: "DocxDocument") -> DOCXMetadata:
         """
@@ -191,15 +310,15 @@ def _get_docx_metadata(self, document: "DocxDocument") -> DOCXMetadata:
             category=document.core_properties.category,
             comments=document.core_properties.comments,
             content_status=document.core_properties.content_status,
-            created=document.core_properties.created.isoformat() if document.core_properties.created else None,
+            created=(document.core_properties.created.isoformat() if document.core_properties.created else None),
             identifier=document.core_properties.identifier,
             keywords=document.core_properties.keywords,
             language=document.core_properties.language,
             last_modified_by=document.core_properties.last_modified_by,
-            last_printed=document.core_properties.last_printed.isoformat()
-            if document.core_properties.last_printed
-            else None,
-            modified=document.core_properties.modified.isoformat() if document.core_properties.modified else None,
+            last_printed=(
+                document.core_properties.last_printed.isoformat() if document.core_properties.last_printed else None
+            ),
+            modified=(document.core_properties.modified.isoformat() if document.core_properties.modified else None),
             revision=document.core_properties.revision,
             subject=document.core_properties.subject,
             title=document.core_properties.title,

diff --git a/releasenotes/notes/enhance-docx-table-extraction-3232d3059d220550.yaml b/releasenotes/notes/enhance-docx-table-extraction-3232d3059d220550.yaml
@@ -0,0 +1,4 @@
+---
+features:
+ - |
+   Enhanced DOCX converter to support table extraction in addition to paragraph content. The converter supports both CSV and Markdown table formats, providing flexible options for representing tabular data extracted from DOCX documents.