feat: add microsoft pptx file converter (#6399)

* Create pptx.py * feat: pptx converter import __init__.py * feat: add pptx import __init__.py * feat: add python-pptx dependency * feat: add sample pptx for testing * feat: add pptx file-converter test * feat: release note pptx-file-converter-3e494d2747637eb2.yaml * feat: Update releasenotes/notes/pptx-file-converter-3e494d2747637eb2.yaml Co-authored-by: Stefano Fiorucci <[email protected]> * feat: refactor haystack/nodes/file_converter/pptx.py Co-authored-by: Stefano Fiorucci <[email protected]> * fix imports --------- Co-authored-by: Stefano Fiorucci <[email protected]>
deepset-ai · Nov 23, 2023 · c44e2cf · c44e2cf
1 parent 604b177
commit c44e2cf
Show file tree

Hide file tree

Showing 7 changed files with 100 additions and 0 deletions.
diff --git a/haystack/nodes/__init__.py b/haystack/nodes/__init__.py
@@ -7,6 +7,7 @@
 from haystack.nodes.file_converter import (
     BaseConverter,
     DocxToTextConverter,
+    PptxConverter,
     ImageToTextConverter,
     MarkdownConverter,
     PDFToTextConverter,

diff --git a/haystack/nodes/file_converter/__init__.py b/haystack/nodes/file_converter/__init__.py
@@ -3,6 +3,7 @@
 
 from haystack.nodes.file_converter.csv import CsvTextConverter
 from haystack.nodes.file_converter.docx import DocxToTextConverter
+from haystack.nodes.file_converter.pptx import PptxConverter
 from haystack.nodes.file_converter.json import JsonConverter
 from haystack.nodes.file_converter.tika import TikaConverter, TikaXHTMLParser
 from haystack.nodes.file_converter.txt import TextConverter

diff --git a/haystack/nodes/file_converter/pptx.py b/haystack/nodes/file_converter/pptx.py
@@ -0,0 +1,85 @@
+from typing import List, Optional, Dict
+from pathlib import Path
+import logging
+
+from haystack.schema import Document
+from haystack.lazy_imports import LazyImport
+from haystack.nodes.file_converter.base import BaseConverter
+
+
+logger = logging.getLogger(__name__)
+
+
+with LazyImport("Run 'pip install python-pptx'") as pptx_import:
+    from pptx import Presentation
+
+
+class PptxConverter(BaseConverter):
+    """
+    Convert Microsoft PowerPoint (.pptx) files to Haystack Documents.
+
+    The text of every shape that exposes a `text` attribute is collected slide by slide and
+    joined with newlines, producing a single Document per file.
+    """
+
+    def __init__(
+        self,
+        remove_numeric_tables: bool = False,
+        valid_languages: Optional[List[str]] = None,
+        id_hash_keys: Optional[List[str]] = None,
+        progress_bar: bool = True,
+    ):
+        """
+        :param remove_numeric_tables: Not supported by this converter; `convert()` raises if it is `True`.
+        :param valid_languages: Not supported by this converter; `convert()` logs a warning and ignores it.
+        :param id_hash_keys: Default attributes used to generate the document id; can be overridden
+                             per call in `convert()`.
+        :param progress_bar: Forwarded unchanged to `BaseConverter`.
+        """
+        # Fail early with the install hint if the optional python-pptx dependency is missing.
+        pptx_import.check()
+        super().__init__(
+            remove_numeric_tables=remove_numeric_tables,
+            valid_languages=valid_languages,
+            id_hash_keys=id_hash_keys,
+            progress_bar=progress_bar,
+        )
+
+    def convert(
+        self,
+        file_path: Path,
+        meta: Optional[Dict[str, str]] = None,
+        remove_numeric_tables: Optional[bool] = None,
+        valid_languages: Optional[List[str]] = None,
+        encoding: Optional[str] = None,
+        id_hash_keys: Optional[List[str]] = None,
+    ) -> List[Document]:
+        """
+        Extract text from a .pptx file.
+
+        The text of all shapes that have a `text` attribute is gathered from every slide, in slide
+        order, and joined with newlines into the content of one Document.
+
+        :param file_path: Path to the .pptx file you want to convert.
+        :param meta: Dictionary of metadata key-value pairs to attach to the returned document.
+        :param remove_numeric_tables: Not supported by this converter. If it resolves to `True`
+                                      (here or through the value set in `__init__`), an exception is raised.
+        :param valid_languages: Not supported by this converter. No language validation is performed;
+                                if a non-empty value is given (here or in `__init__`), a warning is
+                                logged and the value is ignored.
+        :param encoding: Not applicable. Accepted only for compliance with other converters.
+        :param id_hash_keys: Generate the document id from a custom list of strings that refer to the document's
+            attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are
+            not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]).
+            In this case the id will be generated by using the content and the defined metadata.
+        :return: A list containing a single Document with the extracted text.
+        :raises Exception: If `remove_numeric_tables` is `True`.
+        """
+        # Per-call arguments take precedence; fall back to the values configured on the instance.
+        if remove_numeric_tables is None:
+            remove_numeric_tables = self.remove_numeric_tables
+        if valid_languages is None:
+            valid_languages = self.valid_languages
+        if id_hash_keys is None:
+            id_hash_keys = self.id_hash_keys
+
+        if remove_numeric_tables is True:
+            raise Exception("'remove_numeric_tables' is not supported by PptxConverter.")
+        if valid_languages:
+            # `valid_languages` is a list of language codes, so an identity check against `True`
+            # can never match. Warn instead of raising so callers that already pass this
+            # argument keep working.
+            logger.warning(
+                "Language validation using 'valid_languages' is not supported by PptxConverter. "
+                "The value is ignored."
+            )
+
+        presentation = Presentation(file_path)
+        text_parts = []
+        for slide in presentation.slides:
+            for shape in slide.shapes:
+                # Shapes without a `text` attribute are skipped.
+                if hasattr(shape, "text"):
+                    text_parts.append(shape.text)
+        text = "\n".join(text_parts)
+
+        document = Document(content=text, meta=meta, id_hash_keys=id_hash_keys)
+        return [document]
diff --git a/pyproject.toml b/pyproject.toml
@@ -165,6 +165,7 @@ preprocessing = [
 file-conversion = [
   "azure-ai-formrecognizer>=3.2.0b2",  # Microsoft Azure's Form Recognizer service (text and table exctrator)
   "python-docx",
+  "python-pptx",
   "tika",  # Apache Tika (text & metadata extractor)
   "beautifulsoup4",
   "markdown",

diff --git a/releasenotes/notes/pptx-file-converter-3e494d2747637eb2.yaml b/releasenotes/notes/pptx-file-converter-3e494d2747637eb2.yaml
@@ -0,0 +1,4 @@
+---
+  features:
+    - |
+      Add PptxConverter: a node to convert pptx files to Haystack Documents.
diff --git a/test/nodes/test_file_converter.py b/test/nodes/test_file_converter.py
@@ -16,6 +16,7 @@
     AzureConverter,
     CsvTextConverter,
     DocxToTextConverter,
+    PptxConverter,
     JsonConverter,
     MarkdownConverter,
     ParsrConverter,
@@ -207,6 +208,13 @@ def test_docx_converter(samples_path):
     assert document.content.startswith("Sample Docx File")
 
 
+@pytest.mark.unit
+def test_pptx_converter(samples_path):
+    """The sample presentation is converted and its content starts with the expected title text."""
+    pptx_file = samples_path / "pptx" / "sample_pptx.pptx"
+    documents = PptxConverter().convert(file_path=pptx_file)
+    first_document = documents[0]
+    assert first_document.content.startswith("Sample Pptx File")
+
+
 @pytest.mark.unit
 def test_markdown_converter(samples_path):
     converter = MarkdownConverter()

diff --git a/test/samples/pptx/sample_pptx.pptx b/test/samples/pptx/sample_pptx.pptx