diff --git a/VERSION b/VERSION
index 7e541aec..6b4d1577 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-2.2.2
\ No newline at end of file
+2.2.3
\ No newline at end of file
diff --git a/dedoc/api/api_args.py b/dedoc/api/api_args.py
index f139733f..20d01db1 100644
--- a/dedoc/api/api_args.py
+++ b/dedoc/api/api_args.py
@@ -17,7 +17,6 @@ class QueryParameters:
need_content_analysis: str = Form("false", enum=["true", "false"], description="Enable parsing contents of the attached files")
recursion_deep_attachments: str = Form("10", description="Depth on which nested attachments will be parsed if need_content_analysis=true")
return_base64: str = Form("false", enum=["true", "false"], description="Save attached images to the document metadata in base64 format")
- attachments_dir: Optional[str] = Form(None, description="Path to the directory where to save files' attachments")
# tables handling
need_pdf_table_analysis: str = Form("true", enum=["true", "false"], description="Enable table recognition for pdf")
diff --git a/dedoc/api/api_utils.py b/dedoc/api/api_utils.py
index 46a6ce6d..c942fefa 100644
--- a/dedoc/api/api_utils.py
+++ b/dedoc/api/api_utils.py
@@ -14,6 +14,7 @@
from dedoc.data_structures.parsed_document import ParsedDocument
from dedoc.data_structures.table import Table
from dedoc.data_structures.tree_node import TreeNode
+from dedoc.extensions import converted_mimes, recognized_mimes
def __prettify_text(text: str) -> Iterator[str]:
@@ -148,11 +149,22 @@ def json2html(text: str,
text += table2html(table, table2id)
text += "
"
+ image_mimes = recognized_mimes.image_like_format.union(converted_mimes.image_like_format)
+
if attachments is not None and len(attachments) > 0:
text += " Attachments:
"
for attachment_id, attachment in enumerate(attachments):
attachment_text = json2html(text="", paragraph=attachment.content.structure, tables=attachment.content.tables, attachments=attachment.attachments)
- text += f'attachment {attachment_id} ({attachment.metadata.file_name}):
{attachment_text}'
+ attachment_base64 = f'data:{attachment.metadata.file_type};base64,{attachment.metadata.base64}"'
+ attachment_link = f'{attachment.metadata.file_name}'
+ is_image = attachment.metadata.file_type in image_mimes
+ attachment_image = f'' if is_image else ""
+
+ text += f"""
+
attachment {attachment_id} ({attachment_link}):
+ {attachment_image}
+ {attachment_text}
+ """
return text
@@ -193,12 +205,9 @@ def __annotations2html(paragraph: TreeNode, table2id: Dict[str, int], attach2id:
name = annotation.name
value = annotation.value
- bool_annotations = [BoldAnnotation.name,
- ItalicAnnotation.name,
- StrikeAnnotation.name,
- SubscriptAnnotation.name,
- SuperscriptAnnotation.name,
- UnderlinedAnnotation.name]
+ bool_annotations = [
+ BoldAnnotation.name, ItalicAnnotation.name, StrikeAnnotation.name, SubscriptAnnotation.name, SuperscriptAnnotation.name, UnderlinedAnnotation.name
+ ]
check_annotations = bool_annotations + [TableAnnotation.name, ReferenceAnnotation.name, AttachAnnotation.name]
if name not in check_annotations and not value.startswith("heading "):
continue
diff --git a/dedoc/api/dedoc_api.py b/dedoc/api/dedoc_api.py
index 4572ab05..9e7063f7 100644
--- a/dedoc/api/dedoc_api.py
+++ b/dedoc/api/dedoc_api.py
@@ -1,3 +1,4 @@
+import base64
import dataclasses
import importlib
import json
@@ -62,41 +63,57 @@ def _get_static_file_path(request: Request) -> str:
return os.path.abspath(os.path.join(directory, file))
+def __add_base64_info_to_attachments(document_tree: ParsedDocument, attachments_dir: str) -> None:
+ for attachment in document_tree.attachments:
+ with open(os.path.join(attachments_dir, attachment.metadata.temporary_file_name), "rb") as attachment_file:
+ attachment.metadata.add_attribute("base64", base64.b64encode(attachment_file.read()).decode("utf-8"))
+
+
@app.post("/upload", response_model=ParsedDocument)
async def upload(file: UploadFile = File(...), query_params: QueryParameters = Depends()) -> Response: # noqa
parameters = dataclasses.asdict(query_params)
if not file or file.filename == "":
raise MissingFileError("Error: Missing content in request_post file parameter", version=dedoc.__version__)
+ return_format = str(parameters.get("return_format", "json")).lower()
+
with tempfile.TemporaryDirectory() as tmpdir:
file_path = save_upload_file(file, tmpdir)
- document_tree = manager.parse(file_path, parameters=dict(parameters))
+ document_tree = manager.parse(file_path, parameters={**dict(parameters), "attachments_dir": tmpdir})
+
+ if return_format == "html":
+ __add_base64_info_to_attachments(document_tree, tmpdir)
- return_format = str(parameters.get("return_format", "json")).lower()
if return_format == "html":
html_content = json2html(
text="",
paragraph=document_tree.content.structure,
tables=document_tree.content.tables,
- attachments=document_tree.attachments, tabs=0
+ attachments=document_tree.attachments,
+ tabs=0
)
return HTMLResponse(content=html_content)
- elif return_format == "plain_text":
+
+ if return_format == "plain_text":
txt_content = json2txt(paragraph=document_tree.content.structure)
return PlainTextResponse(content=txt_content)
- elif return_format == "tree":
+
+ if return_format == "tree":
html_content = json2tree(paragraph=document_tree.content.structure)
return HTMLResponse(content=html_content)
- elif return_format == "ujson":
+
+ if return_format == "ujson":
return UJSONResponse(content=document_tree.to_api_schema().model_dump())
- elif return_format == "collapsed_tree":
+
+ if return_format == "collapsed_tree":
html_content = json2collapsed_tree(paragraph=document_tree.content.structure)
return HTMLResponse(content=html_content)
- elif return_format == "pretty_json":
+
+ if return_format == "pretty_json":
return PlainTextResponse(content=json.dumps(document_tree.to_api_schema().model_dump(), ensure_ascii=False, indent=2))
- else:
- logger.info(f"Send result. File {file.filename} with parameters {parameters}")
- return ORJSONResponse(content=document_tree.to_api_schema().model_dump())
+
+ logger.info(f"Send result. File {file.filename} with parameters {parameters}")
+ return ORJSONResponse(content=document_tree.to_api_schema().model_dump())
@app.get("/upload_example")
diff --git a/dedoc/api/web/index.html b/dedoc/api/web/index.html
index 055ef58b..e045d483 100644
--- a/dedoc/api/web/index.html
+++ b/dedoc/api/web/index.html
@@ -70,7 +70,7 @@ Type of document structure parsing
diff --git a/dedoc/data_structures/document_metadata.py b/dedoc/data_structures/document_metadata.py
index beec9c56..e93b2c16 100644
--- a/dedoc/data_structures/document_metadata.py
+++ b/dedoc/data_structures/document_metadata.py
@@ -1,5 +1,5 @@
import uuid
-from typing import Dict, Union
+from typing import Any, Dict, Union
from dedoc.api.schema.document_metadata import DocumentMetadata as ApiDocumentMetadata
from dedoc.data_structures.serializable import Serializable
@@ -38,8 +38,11 @@ def __init__(self,
self.access_time = access_time
self.file_type = file_type
for key, value in kwargs.items():
- setattr(self, key, value)
+ self.add_attribute(key, value)
self.uid = f"doc_uid_auto_{uuid.uuid1()}" if uid is None else uid
+ def add_attribute(self, key: str, value: Any) -> None: # noqa
+ setattr(self, key, value)
+
def to_api_schema(self) -> ApiDocumentMetadata:
return ApiDocumentMetadata(**vars(self))
diff --git a/dedoc/readers/docx_reader/data_structures/docx_document.py b/dedoc/readers/docx_reader/data_structures/docx_document.py
index 49d402e8..95901ec4 100644
--- a/dedoc/readers/docx_reader/data_structures/docx_document.py
+++ b/dedoc/readers/docx_reader/data_structures/docx_document.py
@@ -1,14 +1,11 @@
import hashlib
import logging
-import os
import re
-import zipfile
from collections import defaultdict
-from typing import List, Optional
+from typing import List
from bs4 import BeautifulSoup, Tag
-from dedoc.common.exceptions.bad_file_error import BadFileFormatError
from dedoc.data_structures.attached_file import AttachedFile
from dedoc.data_structures.concrete_annotations.attach_annotation import AttachAnnotation
from dedoc.data_structures.concrete_annotations.table_annotation import TableAnnotation
@@ -19,6 +16,7 @@
from dedoc.readers.docx_reader.line_with_meta_converter import LineWithMetaConverter
from dedoc.readers.docx_reader.numbering_extractor import NumberingExtractor
from dedoc.readers.docx_reader.styles_extractor import StylesExtractor
+from dedoc.utils.office_utils import get_bs_from_zip
from dedoc.utils.utils import calculate_file_hash
@@ -28,8 +26,8 @@ def __init__(self, path: str, attachments: List[AttachedFile], logger: logging.L
self.path = path
self.attachment_name2uid = {attachment.original_name: attachment.uid for attachment in attachments}
- self.document_bs_tree = self.__get_bs_tree("word/document.xml")
- self.document_bs_tree = self.__get_bs_tree("word/document2.xml") if self.document_bs_tree is None else self.document_bs_tree
+ self.document_bs_tree = get_bs_from_zip(self.path, "word/document.xml")
+ self.document_bs_tree = get_bs_from_zip(self.path, "word/document2.xml") if self.document_bs_tree is None else self.document_bs_tree
self.body = self.document_bs_tree.body if self.document_bs_tree else None
self.paragraph_maker = self.__get_paragraph_maker()
@@ -39,8 +37,8 @@ def __init__(self, path: str, attachments: List[AttachedFile], logger: logging.L
self.lines = self.__get_lines()
def __get_paragraph_maker(self) -> ParagraphMaker:
- styles_extractor = StylesExtractor(self.__get_bs_tree("word/styles.xml"), self.logger)
- num_tree = self.__get_bs_tree("word/numbering.xml")
+ styles_extractor = StylesExtractor(get_bs_from_zip(self.path, "word/styles.xml"), self.logger)
+ num_tree = get_bs_from_zip(self.path, "word/numbering.xml")
numbering_extractor = NumberingExtractor(num_tree, styles_extractor) if num_tree else None
styles_extractor.numbering_extractor = numbering_extractor
@@ -49,8 +47,8 @@ def __get_paragraph_maker(self) -> ParagraphMaker:
path_hash=calculate_file_hash(path=self.path),
styles_extractor=styles_extractor,
numbering_extractor=numbering_extractor,
- footnote_extractor=FootnoteExtractor(self.__get_bs_tree("word/footnotes.xml")),
- endnote_extractor=FootnoteExtractor(self.__get_bs_tree("word/endnotes.xml"), key="endnote")
+ footnote_extractor=FootnoteExtractor(get_bs_from_zip(self.path, "word/footnotes.xml")),
+ endnote_extractor=FootnoteExtractor(get_bs_from_zip(self.path, "word/endnotes.xml"), key="endnote")
)
def __get_lines(self) -> List[LineWithMeta]:
@@ -120,23 +118,6 @@ def __paragraphs2lines(self, image_refs: dict, table_refs: dict, diagram_refs: d
return lines_with_meta
- def __get_bs_tree(self, filename: str) -> Optional[BeautifulSoup]:
- """
- Gets xml bs tree from the given file inside the self.path.
- :param filename: name of file to extract the tree
- :return: BeautifulSoup tree or None if file wasn't found
- """
- try:
- with zipfile.ZipFile(self.path) as document:
- content = document.read(filename)
- content = re.sub(br"\n[\t ]*", b"", content)
- soup = BeautifulSoup(content, "xml")
- return soup
- except KeyError:
- return None
- except zipfile.BadZipFile:
- raise BadFileFormatError(f"Bad docx file:\n file_name = {os.path.basename(self.path)}. Seems docx is broken")
-
def __handle_table_xml(self, xml: Tag, table_refs: dict) -> None:
table = DocxTable(xml, self.paragraph_maker)
self.tables.append(table.to_table())
@@ -150,9 +131,9 @@ def __handle_table_xml(self, xml: Tag, table_refs: dict) -> None:
table_refs[len(self.paragraph_list) - 1].append(table_uid)
def __handle_images_xml(self, xmls: List[Tag], image_refs: dict) -> None:
- rels = self.__get_bs_tree("word/_rels/document.xml.rels")
+ rels = get_bs_from_zip(self.path, "word/_rels/document.xml.rels")
if rels is None:
- rels = self.__get_bs_tree("word/_rels/document2.xml.rels")
+ rels = get_bs_from_zip(self.path, "word/_rels/document2.xml.rels")
images_rels = dict()
for rel in rels.find_all("Relationship"):
diff --git a/dedoc/readers/pptx_reader/numbering_extractor.py b/dedoc/readers/pptx_reader/numbering_extractor.py
new file mode 100644
index 00000000..42da3557
--- /dev/null
+++ b/dedoc/readers/pptx_reader/numbering_extractor.py
@@ -0,0 +1,51 @@
+class NumberingExtractor:
+    """
+    This class is used to compute numbering text for list items.
+    For example: "1.", (i), "○"
+    """
+    def __init__(self) -> None:
+        # Mapping according to the ST_TextAutonumberScheme
+        # NOTE we ignore chinese, japanese, hindi, thai
+        self.numbering_types = dict(
+            arabic="1",  # 1, 2, 3, ..., 10, 11, 12, ...
+            alphaLc="a",  # a, b, c, ..., y, z, aa, bb, cc, ..., yy, zz, aaa, bbb, ccc, ...
+            alphaUc="A",  # A, B, C, ..., Y, Z, AA, BB, CC, ..., YY, ZZ, AAA, BBB, CCC, ...
+            romanLc="i",  # i, ii, iii, iv, ..., xviii, xix, xx, xxi, ...
+            romanUc="I"  # I, II, III, IV, ..., XVIII, XIX, XX, XXI, ...
+        )
+
+        self.numbering_formatting = dict(
+            ParenBoth="({}) ",
+            ParenR="{}) ",
+            Period="{}. ",
+            Plain="{} "
+        )
+
+        self.combined_types = {
+            num_type + num_formatting: (num_type, num_formatting) for num_type in self.numbering_types for num_formatting in self.numbering_formatting
+        }
+        self.roman_mapping = [(1000, "m"), (900, "cm"), (500, "d"), (400, "cd"), (100, "c"), (90, "xc"), (50, "l"),  # descending, with subtractive forms
+                              (40, "xl"), (10, "x"), (9, "ix"), (5, "v"), (4, "iv"), (1, "i")]
+
+    def get_text(self, numbering: str, shift: int) -> str:
+        """
+        Computes the next item of the list sequence.
+        :param numbering: type of the numbering, e.g. "arabicPeriod", unknown types are handled as "arabicPeriod"
+        :param shift: zero-based shift from the beginning of list numbering (0 corresponds to the first list item)
+        :return: string representation of the next numbering item with the formatting applied, e.g. "iv. "
+        """
+        num_type, num_formatting = self.combined_types.get(numbering, ("arabic", "Period"))
+
+        if num_type in ("alphaLc", "alphaUc"):
+            shift1, shift2 = shift % 26, shift // 26 + 1
+            num_char = chr(ord(self.numbering_types[num_type]) + shift1) * shift2
+        elif num_type in ("romanLc", "romanUc"):
+            num_char, value = "", shift + 1  # the shift is zero-based, roman numerals start from one
+            for number, letter in self.roman_mapping:
+                cnt, value = divmod(value, number)
+                num_char += letter * cnt
+            num_char = num_char.upper() if num_type == "romanUc" else num_char
+        else:
+            num_char = str(int(self.numbering_types["arabic"]) + shift)
+
+        return self.numbering_formatting[num_formatting].format(num_char)
diff --git a/dedoc/readers/pptx_reader/paragraph.py b/dedoc/readers/pptx_reader/paragraph.py
new file mode 100644
index 00000000..2dfcb952
--- /dev/null
+++ b/dedoc/readers/pptx_reader/paragraph.py
@@ -0,0 +1,55 @@
+from bs4 import Tag
+
+from dedoc.data_structures import AlignmentAnnotation, BoldAnnotation, HierarchyLevel, ItalicAnnotation, LineMetadata, LineWithMeta, SizeAnnotation, \
+ StrikeAnnotation, SubscriptAnnotation, SuperscriptAnnotation, UnderlinedAnnotation
+from dedoc.readers.pptx_reader.numbering_extractor import NumberingExtractor
+from dedoc.readers.pptx_reader.properties_extractor import PropertiesExtractor
+from dedoc.utils.annotation_merger import AnnotationMerger
+
+
+class PptxParagraph:
+    """
+    This class corresponds to one textual paragraph of some entity, e.g. shape or table cell (tag <a:p>).
+    """
+    def __init__(self, xml: Tag, numbering_extractor: NumberingExtractor, properties_extractor: PropertiesExtractor) -> None:
+        self.xml = xml
+        self.numbered_list_type = self.xml.buAutoNum.get("type", "arabicPeriod") if self.xml.buAutoNum else None  # None: no buAutoNum tag
+        self.level = int(self.xml.pPr.get("lvl", 0)) + 1 if self.xml.pPr else 1  # "lvl" attribute + 1, or 1 without pPr tag
+        self.numbering_extractor = numbering_extractor
+        self.properties_extractor = properties_extractor
+        self.annotation_merger = AnnotationMerger()
+        annotations = [BoldAnnotation, ItalicAnnotation, UnderlinedAnnotation, StrikeAnnotation, SuperscriptAnnotation, SubscriptAnnotation]
+        self.dict2annotation = {annotation.name: annotation for annotation in annotations}  # names are also read as Properties attributes
+
+    def get_line_with_meta(self, page_id: int, line_id: int, is_title: bool, shift: int = 0) -> LineWithMeta:
+        """Make a line with text, annotations and hierarchy level from the paragraph, shift is used to number a list item."""
+        text = ""
+        paragraph_properties = self.properties_extractor.get_properties(self.xml.pPr, level=self.level)
+        hierarchy_level = HierarchyLevel.create_raw_text()
+
+        if is_title or paragraph_properties.title:
+            hierarchy_level = HierarchyLevel(line_type=HierarchyLevel.header, level_1=1, level_2=self.level, can_be_multiline=False)
+        elif self.numbered_list_type:  # numbered list
+            text += self.numbering_extractor.get_text(self.numbered_list_type, shift)
+            hierarchy_level = HierarchyLevel(line_type=HierarchyLevel.list_item, level_1=2, level_2=self.level, can_be_multiline=False)
+        elif self.xml.buChar:  # bullet list
+            text += self.xml.buChar["char"] + " "
+            hierarchy_level = HierarchyLevel(line_type=HierarchyLevel.list_item, level_1=3, level_2=self.level, can_be_multiline=False)
+        annotations = []
+        if self.xml.r:
+            for run in self.xml.find_all("a:r"):
+                prev_text = text
+                for run_text in run:
+                    if run_text.name == "t" and run.text:
+                        text += run.text
+                # annotations of the run cover the text added by this run: from len(prev_text) to len(text)
+                run_properties = self.properties_extractor.get_properties(run.rPr, level=self.level, properties=paragraph_properties)
+                annotations.append(SizeAnnotation(start=len(prev_text), end=len(text), value=str(run_properties.size)))
+                for property_name in self.dict2annotation:
+                    if getattr(run_properties, property_name):
+                        annotations.append(self.dict2annotation[property_name](start=len(prev_text), end=len(text), value="True"))
+
+        text = f"{text}\n"
+        annotations = self.annotation_merger.merge_annotations(annotations, text)
+        annotations.append(AlignmentAnnotation(start=0, end=len(text), value=paragraph_properties.alignment))
+        return LineWithMeta(text, metadata=LineMetadata(page_id=page_id, line_id=line_id, tag_hierarchy_level=hierarchy_level), annotations=annotations)
diff --git a/dedoc/readers/pptx_reader/pptx_reader.py b/dedoc/readers/pptx_reader/pptx_reader.py
index e109fae0..2d68b850 100644
--- a/dedoc/readers/pptx_reader/pptx_reader.py
+++ b/dedoc/readers/pptx_reader/pptx_reader.py
@@ -1,20 +1,20 @@
+import zipfile
from typing import Dict, List, Optional
-from bs4 import BeautifulSoup
-from pptx import Presentation
-from pptx.shapes.graphfrm import GraphicFrame
-from pptx.shapes.picture import Picture
-from pptx.slide import Slide
+from bs4 import BeautifulSoup, Tag
from dedoc.attachments_extractors.concrete_attachments_extractors.pptx_attachments_extractor import PptxAttachmentsExtractor
from dedoc.data_structures import AttachAnnotation, Table, TableAnnotation
-from dedoc.data_structures.cell_with_meta import CellWithMeta
from dedoc.data_structures.line_metadata import LineMetadata
from dedoc.data_structures.line_with_meta import LineWithMeta
-from dedoc.data_structures.table_metadata import TableMetadata
from dedoc.data_structures.unstructured_document import UnstructuredDocument
from dedoc.extensions import recognized_extensions, recognized_mimes
from dedoc.readers.base_reader import BaseReader
+from dedoc.readers.pptx_reader.numbering_extractor import NumberingExtractor
+from dedoc.readers.pptx_reader.properties_extractor import PropertiesExtractor
+from dedoc.readers.pptx_reader.shape import PptxShape
+from dedoc.readers.pptx_reader.table import PptxTable
+from dedoc.utils.office_utils import get_bs_from_zip
from dedoc.utils.parameter_utils import get_param_with_attachments
@@ -27,6 +27,7 @@ class PptxReader(BaseReader):
def __init__(self, *, config: Optional[dict] = None) -> None:
super().__init__(config=config, recognized_extensions=recognized_extensions.pptx_like_format, recognized_mimes=recognized_mimes.pptx_like_format)
self.attachments_extractor = PptxAttachmentsExtractor(config=self.config)
+ self.numbering_extractor = NumberingExtractor()
def read(self, file_path: str, parameters: Optional[dict] = None) -> UnstructuredDocument:
"""
@@ -36,55 +37,73 @@ def read(self, file_path: str, parameters: Optional[dict] = None) -> Unstructure
with_attachments = get_param_with_attachments(parameters)
attachments = self.attachments_extractor.extract(file_path=file_path, parameters=parameters) if with_attachments else []
attachment_name2uid = {attachment.original_name: attachment.uid for attachment in attachments}
-
- prs = Presentation(file_path)
- lines, tables = [], []
-
- for page_id, slide in enumerate(prs.slides, start=1):
- images_rels = self.__get_slide_images_rels(slide)
-
- for paragraph_id, shape in enumerate(slide.shapes, start=1):
-
- if shape.has_text_frame:
- lines.append(LineWithMeta(line=f"{shape.text}\n", metadata=LineMetadata(page_id=page_id, line_id=paragraph_id)))
-
- if shape.has_table:
- self.__add_table(lines, tables, page_id, paragraph_id, shape)
-
- if with_attachments and hasattr(shape, "image"):
+ images_rels = self.__get_slide_images_rels(file_path)
+ properties_extractor = PropertiesExtractor(file_path)
+
+ slide_xml_list = self.__get_slides_bs(file_path, xml_prefix="ppt/slides/slide", xml_postfix=".xml")
+ lines = []
+ tables = []
+
+ for slide_id, slide_xml in enumerate(slide_xml_list):
+ shape_tree_xml = slide_xml.spTree
+
+ is_first_shape = True
+ for tag in shape_tree_xml:
+ if tag.name == "sp":
+ if not tag.txBody:
+ continue
+
+ shape = PptxShape(tag, page_id=slide_id, init_line_id=len(lines), numbering_extractor=self.numbering_extractor,
+ properties_extractor=properties_extractor, is_title=is_first_shape)
+ shape_lines = shape.get_lines()
+ lines.extend(shape_lines)
+ if is_first_shape and len(shape_lines) > 0:
+ is_first_shape = False
+
+ elif tag.tbl:
+ self.__add_table(lines=lines, tables=tables, page_id=slide_id, table_xml=tag.tbl, properties_extractor=properties_extractor)
+ elif tag.name == "pic" and tag.blip:
if len(lines) == 0:
- lines.append(LineWithMeta(line="", metadata=LineMetadata(page_id=page_id, line_id=paragraph_id)))
- self.__add_attach_annotation(lines[-1], shape, attachment_name2uid, images_rels)
+ lines.append(LineWithMeta(line="", metadata=LineMetadata(page_id=slide_id, line_id=0)))
+ image_rel_id = str(slide_id) + tag.blip.get("r:embed", "")
+ self.__add_attach_annotation(lines[-1], image_rel_id, attachment_name2uid, images_rels)
return UnstructuredDocument(lines=lines, tables=tables, attachments=attachments, warnings=[])
- def __add_table(self, lines: List[LineWithMeta], tables: List[Table], page_id: int, paragraph_id: int, shape: GraphicFrame) -> None:
- cells = [
- [CellWithMeta(lines=[LineWithMeta(line=cell.text, metadata=LineMetadata(page_id=page_id, line_id=0))]) for cell in row.cells]
- for row in shape.table.rows
- ]
- table = Table(cells=cells, metadata=TableMetadata(page_id=page_id))
+ def __get_slides_bs(self, path: str, xml_prefix: str, xml_postfix: str) -> List[BeautifulSoup]:
+ with zipfile.ZipFile(path) as document:
+ xml_names = document.namelist()
+ filtered_names = [file_name for file_name in xml_names if file_name.startswith(xml_prefix) and file_name.endswith(xml_postfix)]
+ sorted_names = sorted(filtered_names, key=lambda x: int(x[len(xml_prefix):-len(xml_postfix)]))
+ slides_bs_list = [get_bs_from_zip(path, file_name, remove_spaces=True) for file_name in sorted_names]
+ return slides_bs_list
- if len(lines) == 0:
- lines.append(LineWithMeta(line="", metadata=LineMetadata(page_id=page_id, line_id=paragraph_id)))
- lines[-1].annotations.append(TableAnnotation(start=0, end=len(lines[-1]), name=table.metadata.uid))
- tables.append(table)
-
- def __get_slide_images_rels(self, slide: Slide) -> Dict[str, str]:
- rels = BeautifulSoup(slide.part.rels.xml, "xml")
+ def __get_slide_images_rels(self, path: str) -> Dict[str, str]:
+ """
+ return mapping: {image Id -> image name}
+ """
+ rels_xml_list = self.__get_slides_bs(path, xml_prefix="ppt/slides/_rels/slide", xml_postfix=".xml.rels")
images_dir = "../media/"
images_rels = dict()
- for rel in rels.find_all("Relationship"):
- if rel["Target"].startswith(images_dir):
- images_rels[rel["Id"]] = rel["Target"][len(images_dir):]
+ for slide_id, rels_xml in enumerate(rels_xml_list):
+ for rel in rels_xml.find_all("Relationship"):
+ if rel["Target"].startswith(images_dir):
+ images_rels[str(slide_id) + rel["Id"]] = rel["Target"][len(images_dir):]
return images_rels
- def __add_attach_annotation(self, line: LineWithMeta, shape: Picture, attachment_name2uid: dict, images_rels: dict) -> None:
+ def __add_table(self, lines: List[LineWithMeta], tables: List[Table], page_id: int, table_xml: Tag, properties_extractor: PropertiesExtractor) -> None:
+ table = PptxTable(table_xml, page_id, self.numbering_extractor, properties_extractor).to_table()
+
+ if len(lines) == 0:
+ lines.append(LineWithMeta(line="", metadata=LineMetadata(page_id=page_id, line_id=0)))
+ lines[-1].annotations.append(TableAnnotation(start=0, end=len(lines[-1]), name=table.metadata.uid))
+ tables.append(table)
+
+ def __add_attach_annotation(self, line: LineWithMeta, image_rel_id: str, attachment_name2uid: dict, images_rels: dict) -> None:
try:
- image_rels_id = shape.element.blip_rId
- image_name = images_rels[image_rels_id]
+ image_name = images_rels[image_rel_id]
image_uid = attachment_name2uid[image_name]
line.annotations.append(AttachAnnotation(start=0, end=len(line), attach_uid=image_uid))
except KeyError as e:
diff --git a/dedoc/readers/pptx_reader/properties_extractor.py b/dedoc/readers/pptx_reader/properties_extractor.py
new file mode 100644
index 00000000..67c0c919
--- /dev/null
+++ b/dedoc/readers/pptx_reader/properties_extractor.py
@@ -0,0 +1,124 @@
+from copy import deepcopy
+from dataclasses import dataclass
+from typing import Dict, Optional
+
+from bs4 import Tag
+
+from dedoc.utils.office_utils import get_bs_from_zip
+
+
+@dataclass
+class Properties:
+    bold: bool = False
+    italic: bool = False
+    underlined: bool = False
+    superscript: bool = False
+    subscript: bool = False
+    strike: bool = False
+    size: float = 0  # PropertiesExtractor stores float(sz) / 100 here
+    alignment: str = "left"  # PropertiesExtractor stores values of its alignment_mapping here
+    title: bool = False
+
+
+class PropertiesExtractor:
+    """
+    This class allows to extract some text formatting properties (see class Properties)
+
+    Properties hierarchy:
+
+    - Run and paragraph properties (slide.xml)
+    - Slide layout properties (slideLayout.xml) TODO
+    - Master slide properties (slideMaster.xml) TODO
+    - Presentation default properties (presentation.xml -> defaultTextStyle)
+    """
+    def __init__(self, file_path: str) -> None:
+        self.alignment_mapping = dict(l="left", r="right", ctr="center", just="both", dist="both", justLow="both", thaiDist="both")
+        self.lvl2default_properties = self.__get_default_properties_mapping(file_path)
+
+    def get_properties(self, xml: Tag, level: int, properties: Optional[Properties] = None) -> Properties:
+        """
+        Copy the base properties and update the copy with the attributes of the given tag.
+        :param xml: tag with properties, e.g. <a:rPr b="1" sz="1800"/> or <a:pPr algn="ctr"/>, may be None
+        :param level: level of the paragraph, it selects the default properties if `properties` aren't given
+        :param properties: base properties, e.g. paragraph properties when run properties are extracted
+        :return: new Properties object, the base properties are left unchanged
+        """
+        properties = properties or self.lvl2default_properties.get(level, Properties())
+        new_properties = deepcopy(properties)
+        if not xml:
+            return new_properties
+
+        self.__update_properties(xml, new_properties)
+        return new_properties
+
+    def __update_properties(self, xml: Tag, properties: Properties) -> None:
+        """Update properties in place using attributes of the tag, boolean properties are only switched on here."""
+        if int(xml.get("b", "0")):
+            properties.bold = True
+        if int(xml.get("i", "0")):
+            properties.italic = True
+
+        underlined = xml.get("u", "none").lower()
+        if underlined != "none":
+            properties.underlined = True
+
+        strike = xml.get("strike", "nostrike").lower()
+        if strike != "nostrike":
+            properties.strike = True
+
+        size = xml.get("sz")
+        if size:
+            properties.size = float(size) / 100
+
+        baseline = float(xml.get("baseline") or 0)
+        if baseline < 0:
+            properties.subscript = True
+        elif baseline > 0:  # zero baseline is a text without a vertical shift: neither subscript nor superscript
+            properties.superscript = True
+
+        self.__update_alignment(xml, properties)
+
+    def __update_alignment(self, xml: Tag, properties: Properties) -> None:
+        """Update alignment in place if the "algn" attribute of the tag is a key of the alignment_mapping."""
+        alignment = xml.get("algn")
+        if alignment and alignment in self.alignment_mapping:
+            properties.alignment = self.alignment_mapping[alignment]
+
+    def __get_default_properties_mapping(self, file_path: str) -> Dict[int, Properties]:
+        """Get mapping {level -> default Properties} from defaultTextStyle of ppt/presentation.xml, empty if it is absent."""
+        lvl2properties = {}
+
+        presentation_xml = get_bs_from_zip(file_path, "ppt/presentation.xml", remove_spaces=True)
+        default_style = presentation_xml.defaultTextStyle if presentation_xml is not None else None  # get_bs_from_zip may return None
+        if not default_style:
+            return lvl2properties
+
+        # lvl1pPr - lvl9pPr
+        for i in range(1, 10):
+            level_xml = getattr(default_style, f"lvl{i}pPr")
+            if level_xml:
+                self.__update_level_properties(level_xml, lvl2properties)
+        return lvl2properties
+
+    def __update_level_properties(self, xml: Tag, lvl2properties: Dict[int, Properties]) -> None:
+        """
+        Update the default properties of one level with the attributes of the given tag and its <a:defRPr> child.
+
+        Example:
+            <a:lvl1pPr algn="l">
+                <a:defRPr sz="1800" b="0"/>
+            </a:lvl1pPr>
+
+        NOTE(review): the level is read from the "lvl" attribute of the tag (0 if absent), not from the number
+        in the tag name; if lvl1pPr - lvl9pPr tags have no "lvl" attribute, all of them update level 1 - confirm.
+
+        :param xml: tag with default paragraph properties of some level
+        :param lvl2properties: mapping {level -> Properties}, updated in place
+        """
+        level = int(xml.get("lvl", "0")) + 1
+        level_properties = lvl2properties.get(level, Properties())
+        self.__update_alignment(xml, level_properties)
+        if xml.defRPr:
+            self.__update_properties(xml.defRPr, level_properties)
+
+        lvl2properties[level] = level_properties
diff --git a/dedoc/readers/pptx_reader/shape.py b/dedoc/readers/pptx_reader/shape.py
new file mode 100644
index 00000000..b1c548d3
--- /dev/null
+++ b/dedoc/readers/pptx_reader/shape.py
@@ -0,0 +1,51 @@
+from collections import defaultdict
+from typing import List
+
+from bs4 import Tag
+
+from dedoc.data_structures import LineWithMeta
+from dedoc.readers.pptx_reader.numbering_extractor import NumberingExtractor
+from dedoc.readers.pptx_reader.paragraph import PptxParagraph
+from dedoc.readers.pptx_reader.properties_extractor import PropertiesExtractor
+
+
+class PptxShape:
+    """
+    This class corresponds to one textual block of the presentation (tag <p:sp>), it is also used for table cells.
+    """
+    def __init__(self, xml: Tag, page_id: int, init_line_id: int, numbering_extractor: NumberingExtractor, properties_extractor: PropertiesExtractor,
+                 is_title: bool = False) -> None:
+        self.xml = xml
+        self.page_id = page_id
+        self.init_line_id = init_line_id
+        self.numbering_extractor = numbering_extractor
+        self.properties_extractor = properties_extractor
+        self.is_title = is_title
+
+    def get_lines(self) -> List[LineWithMeta]:
+        """Get one line per paragraph (tag <a:p>) of the block, an empty list if the block contains no text."""
+        if not self.xml.get_text().strip():
+            return []
+        # a placeholder whose type contains "title" marks the whole block as a title
+        if self.xml.ph and "title" in self.xml.ph.get("type", "").lower():
+            self.is_title = True
+
+        lines = []
+        numbering2shift = defaultdict(int)  # (numbering type, level) -> number of list items that were already met
+        prev_list_level = None
+
+        for line_id, paragraph_xml in enumerate(self.xml.find_all("a:p")):
+            paragraph = PptxParagraph(paragraph_xml, self.numbering_extractor, self.properties_extractor)
+
+            if paragraph.numbered_list_type:
+                if prev_list_level and paragraph.level > prev_list_level:
+                    numbering2shift[(paragraph.numbered_list_type, paragraph.level)] = 0  # the list became deeper: nested numbering restarts
+                shift = numbering2shift[(paragraph.numbered_list_type, paragraph.level)]
+                numbering2shift[(paragraph.numbered_list_type, paragraph.level)] += 1
+                prev_list_level = paragraph.level
+            else:
+                shift = 0
+
+            lines.append(paragraph.get_line_with_meta(line_id=self.init_line_id + line_id, page_id=self.page_id, is_title=self.is_title, shift=shift))
+
+        return lines
diff --git a/dedoc/readers/pptx_reader/table.py b/dedoc/readers/pptx_reader/table.py
new file mode 100644
index 00000000..cbe7febb
--- /dev/null
+++ b/dedoc/readers/pptx_reader/table.py
@@ -0,0 +1,64 @@
+import hashlib
+
+from bs4 import Tag
+
+from dedoc.data_structures import CellWithMeta, Table, TableMetadata
+from dedoc.readers.docx_reader.numbering_extractor import NumberingExtractor
+from dedoc.readers.pptx_reader.properties_extractor import PropertiesExtractor
+from dedoc.readers.pptx_reader.shape import PptxShape
+
+
+class PptxTable:
+    """
+    This class corresponds to the table (tag <a:tbl>) in the slides xml files.
+    """
+    def __init__(self, xml: Tag, page_id: int, numbering_extractor: NumberingExtractor, properties_extractor: PropertiesExtractor) -> None:
+        """
+        Contains information about table properties.
+        :param xml: BeautifulSoup tree with table properties
+        :param page_id: value for page_id of the table metadata, it is also passed to PptxShape for each cell
+        :param numbering_extractor: passed to PptxShape for each cell
+        :param properties_extractor: passed to PptxShape for each cell
+        """
+        self.xml = xml
+        self.page_id = page_id
+        self.numbering_extractor = numbering_extractor  # NOTE(review): annotated with NumberingExtractor of docx_reader, PptxShape expects the pptx one
+        self.properties_extractor = properties_extractor
+        self.__uid = hashlib.md5(xml.encode()).hexdigest()  # uid is the md5 hash of the table xml
+
+    @property
+    def uid(self) -> str:
+        return self.__uid
+
+    def to_table(self) -> Table:
+        """
+        Converts xml file with table to Table class
+        """
+        # tbl -- table; tr -- table row, tc -- table cell
+        # delete tables inside tables
+        for tbl in self.xml.find_all("a:tbl"):
+            tbl.extract()
+
+        rows = self.xml.find_all("a:tr")
+        cell_list = []
+        for row in rows:
+            cells = row.find_all("a:tc")
+            col_index = 0
+            cell_row_list = []
+            for cell in cells:
+                if int(cell.get("vMerge", 0)):  # vertical merge: invisible cell with the lines of the cell above
+                    cell_with_meta = CellWithMeta(lines=cell_list[-1][col_index].lines, colspan=1, rowspan=1, invisible=True)
+                elif int(cell.get("hMerge", 0)):  # horizontal merge: invisible cell with the lines of the previous cell in the row
+                    cell_with_meta = CellWithMeta(lines=cell_row_list[-1].lines, colspan=1, rowspan=1, invisible=True)
+                else:
+                    colspan = int(cell.get("gridSpan", 1))  # gridSpan attribute describes number of horizontally merged cells
+                    rowspan = int(cell.get("rowSpan", 1))  # rowSpan attribute for vertically merged set of cells (or horizontally split cells)
+                    lines = PptxShape(xml=cell, page_id=self.page_id, numbering_extractor=self.numbering_extractor, init_line_id=0,
+                                      properties_extractor=self.properties_extractor).get_lines()
+                    cell_with_meta = CellWithMeta(lines=lines, colspan=colspan, rowspan=rowspan, invisible=False)
+
+                cell_row_list.append(cell_with_meta)
+                col_index += 1
+            cell_list.append(cell_row_list)
+
+        return Table(cells=cell_list, metadata=TableMetadata(page_id=self.page_id, uid=self.uid))
diff --git a/dedoc/structure_extractors/concrete_structure_extractors/article_structure_extractor.py b/dedoc/structure_extractors/concrete_structure_extractors/article_structure_extractor.py
index 4ef6d4e8..8e6e4a50 100644
--- a/dedoc/structure_extractors/concrete_structure_extractors/article_structure_extractor.py
+++ b/dedoc/structure_extractors/concrete_structure_extractors/article_structure_extractor.py
@@ -23,7 +23,7 @@ def extract(self, document: UnstructuredDocument, parameters: Optional[dict] = N
:class:`~dedoc.structure_extractors.AbstractStructureExtractor`.
"""
for line in document.lines:
- if line.metadata.tag_hierarchy_level is None:
+ if line.metadata.tag_hierarchy_level is None or line.metadata.tag_hierarchy_level.is_unknown():
line.metadata.tag_hierarchy_level = HierarchyLevel.create_raw_text()
else:
line.metadata.hierarchy_level = line.metadata.tag_hierarchy_level
diff --git a/dedoc/utils/office_utils.py b/dedoc/utils/office_utils.py
new file mode 100644
index 00000000..98693d94
--- /dev/null
+++ b/dedoc/utils/office_utils.py
@@ -0,0 +1,35 @@
+import os
+import re
+import zipfile
+from typing import Optional
+
+from bs4 import BeautifulSoup
+
+from dedoc.common.exceptions.bad_file_error import BadFileFormatError
+
+
+def get_bs_from_zip(zip_path: str, xml_path: str, remove_spaces: bool = False) -> Optional[BeautifulSoup]:
+    """
+    Utility for extracting xml from files of office formats (docx, pptx, xlsx).
+    Gets xml BeautifulSoup tree from the given file inside the zip_path.
+
+    :param zip_path: path to the file of the office format (docx, pptx, xlsx)
+    :param xml_path: name of file to extract the tree
+    :param remove_spaces: remove spaces between tags except <a:t> (for pptx)
+    :return: BeautifulSoup tree or None if file wasn't found
+    :raises BadFileFormatError: if zip_path is not a valid zip archive
+    """
+    try:
+        with zipfile.ZipFile(zip_path) as document:
+            content = document.read(xml_path)  # raises KeyError if xml_path is absent from the archive
+            content = re.sub(br"\n[\t ]*", b"", content)  # drop line breaks together with the indentation after them
+
+            if remove_spaces:
+                # remove spaces between tags, don't remove spaces inside pptx text fields: <a:t> </a:t>
+                content = re.sub(br"(?<!<a:t)>\s+<", b"><", content)
+
+            return BeautifulSoup(content, "xml")
+    except KeyError:
+        return None
+    except zipfile.BadZipFile as error:
+        raise BadFileFormatError(f"Bad office file:\n file_name = {os.path.basename(zip_path)}. Seems file is broken") from error
diff --git a/docs/source/changelog.rst b/docs/source/changelog.rst
index 25cf3af6..2834f209 100644
--- a/docs/source/changelog.rst
+++ b/docs/source/changelog.rst
@@ -1,6 +1,13 @@
Changelog
=========
+v2.2.3 (2024-06-05)
+-------------------
+Release note: `v2.2.3 <https://github.com/ispras/dedoc/releases/tag/v2.2.3>`_
+
+* Added display of attached images and the ability to download attached files in the HTML output representation (API usage, return_format="html").
+* Added hierarchy level information and annotations to `PptxReader`.
+
v2.2.2 (2024-05-21)
-------------------
Release note: `v2.2.2 <https://github.com/ispras/dedoc/releases/tag/v2.2.2>`_
diff --git a/docs/source/dedoc_api_usage/api.rst b/docs/source/dedoc_api_usage/api.rst
index ee68c29f..df6b4963 100644
--- a/docs/source/dedoc_api_usage/api.rst
+++ b/docs/source/dedoc_api_usage/api.rst
@@ -144,11 +144,6 @@ Api parameters description
The encoded contents will be saved in the attachment's metadata in the ``base64_encode`` field.
Use ``true`` value to enable this behaviour.
- * - attachments_dir
- - optional string with a valid path
- - None
- - The path to the directory where document's attached files can be saved instead of a temporary directory.
-
* - :cspan:`3` **Tables handling**
* - need_pdf_table_analysis
diff --git a/docs/source/readers_output/annotations.rst b/docs/source/readers_output/annotations.rst
index ee1785c2..2a13989d 100644
--- a/docs/source/readers_output/annotations.rst
+++ b/docs/source/readers_output/annotations.rst
@@ -11,11 +11,12 @@ Below the readers are enlisted that can return non-empty list of annotations for
.. _table_annotations:
.. list-table:: Annotations returned by each reader
- :widths: 20 10 10 10 10 10 10 10
+ :widths: 20 10 10 10 10 10 10 10 10
:class: tight-table
* - **Annotation**
- :class:`~dedoc.readers.DocxReader`
+ - :class:`~dedoc.readers.PptxReader`
- :class:`~dedoc.readers.HtmlReader`, :class:`~dedoc.readers.MhtmlReader`, :class:`~dedoc.readers.EmailReader`
- :class:`~dedoc.readers.RawTextReader`
- :class:`~dedoc.readers.PdfImageReader`
@@ -24,6 +25,7 @@ Below the readers are enlisted that can return non-empty list of annotations for
- :class:`~dedoc.readers.ArticleReader`
* - :class:`~dedoc.data_structures.AttachAnnotation`
+ - `+`
- `+`
- `-`
- `-`
@@ -33,6 +35,7 @@ Below the readers are enlisted that can return non-empty list of annotations for
- `+`
* - :class:`~dedoc.data_structures.TableAnnotation`
+ - `+`
- `+`
- `-`
- `-`
@@ -43,6 +46,7 @@ Below the readers are enlisted that can return non-empty list of annotations for
* - :class:`~dedoc.data_structures.LinkedTextAnnotation`
- `+`
+ - `-`
- `+`
- `-`
- `-`
@@ -54,12 +58,14 @@ Below the readers are enlisted that can return non-empty list of annotations for
- `-`
- `-`
- `-`
+ - `-`
- `+`
- `+`
- `+`
- `-`
* - :class:`~dedoc.data_structures.AlignmentAnnotation`
+ - `+`
- `+`
- `+`
- `-`
@@ -71,6 +77,7 @@ Below the readers are enlisted that can return non-empty list of annotations for
* - :class:`~dedoc.data_structures.IndentationAnnotation`
- `+`
- `-`
+ - `-`
- `+`
- `+`
- `+`
@@ -80,6 +87,7 @@ Below the readers are enlisted that can return non-empty list of annotations for
* - :class:`~dedoc.data_structures.SpacingAnnotation`
- `+`
- `-`
+ - `-`
- `+`
- `+`
- `+`
@@ -87,6 +95,7 @@ Below the readers are enlisted that can return non-empty list of annotations for
- `-`
* - :class:`~dedoc.data_structures.BoldAnnotation`
+ - `+`
- `+`
- `+`
- `-`
@@ -96,6 +105,7 @@ Below the readers are enlisted that can return non-empty list of annotations for
- `-`
* - :class:`~dedoc.data_structures.ItalicAnnotation`
+ - `+`
- `+`
- `+`
- `-`
@@ -105,6 +115,7 @@ Below the readers are enlisted that can return non-empty list of annotations for
- `-`
* - :class:`~dedoc.data_structures.UnderlinedAnnotation`
+ - `+`
- `+`
- `+`
- `-`
@@ -114,6 +125,7 @@ Below the readers are enlisted that can return non-empty list of annotations for
- `-`
* - :class:`~dedoc.data_structures.StrikeAnnotation`
+ - `+`
- `+`
- `+`
- `-`
@@ -123,6 +135,7 @@ Below the readers are enlisted that can return non-empty list of annotations for
- `-`
* - :class:`~dedoc.data_structures.SubscriptAnnotation`
+ - `+`
- `+`
- `+`
- `-`
@@ -132,6 +145,7 @@ Below the readers are enlisted that can return non-empty list of annotations for
- `-`
* - :class:`~dedoc.data_structures.SuperscriptAnnotation`
+ - `+`
- `+`
- `+`
- `-`
@@ -144,12 +158,14 @@ Below the readers are enlisted that can return non-empty list of annotations for
- `-`
- `-`
- `-`
+ - `-`
- `+`
- `-`
- `+`
- `-`
* - :class:`~dedoc.data_structures.SizeAnnotation`
+ - `+`
- `+`
- `+`
- `-`
@@ -160,6 +176,7 @@ Below the readers are enlisted that can return non-empty list of annotations for
* - :class:`~dedoc.data_structures.StyleAnnotation`
- `+`
+ - `-`
- `+`
- `-`
- `-`
@@ -171,6 +188,7 @@ Below the readers are enlisted that can return non-empty list of annotations for
- `-`
- `-`
- `-`
+ - `-`
- `+`
- `-`
- `-`
@@ -183,4 +201,5 @@ Below the readers are enlisted that can return non-empty list of annotations for
- `-`
- `-`
- `-`
+ - `-`
- `+`
diff --git a/docs/source/readers_output/line_types.rst b/docs/source/readers_output/line_types.rst
index d7c42425..666a8d35 100644
--- a/docs/source/readers_output/line_types.rst
+++ b/docs/source/readers_output/line_types.rst
@@ -28,6 +28,12 @@ Below the readers are enlisted that can return non-empty ``hierarchy_level_tag``
- `+`
- `-`
+ * - :class:`~dedoc.readers.PptxReader`
+ - `+`
+ - `+`
+ - `+`
+ - `-`
+
* - :class:`~dedoc.readers.HtmlReader`, :class:`~dedoc.readers.MhtmlReader`, :class:`~dedoc.readers.EmailReader`
- `+`
- `+`
diff --git a/requirements.txt b/requirements.txt
index e6a7ac86..7d449f59 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,7 +1,6 @@
beautifulsoup4>=4.10.0,<=4.12.2
charset-normalizer>=2.0.12,<=3.2.0
Cython>=0.29.28,<=3.0.2
-docx==0.2.4
dedoc-utils==0.3.6
fastapi>=0.77.0,<=0.103.0
huggingface-hub>=0.14.1,<=0.16.4
@@ -26,7 +25,6 @@ python-Levenshtein==0.12.2
python-logstash-async>=2.5.0,<=2.7.0
python-magic<1.0
python-multipart==0.0.6
-python-pptx==0.6.21
rarfile==4.0
requests>=2.22.0
roman>=3.3,<4.0
diff --git a/tests/api_tests/test_api_format_pptx.py b/tests/api_tests/test_api_format_pptx.py
index b2df8351..214265be 100644
--- a/tests/api_tests/test_api_format_pptx.py
+++ b/tests/api_tests/test_api_format_pptx.py
@@ -23,6 +23,138 @@ def test_odp(self) -> None:
result = self._send_request(file_name, data=dict(structure_type="linear"))
self.__check_content(result["content"])
+ def test_structure_and_annotations(self) -> None:
+ """
+ Check the result of parsing test-presentation.pptx requested with attachments:
+ header and list nodes of the document tree, text annotations, the single table with merged cells
+ and the attachment annotations that should reference the returned attachments.
+ """
+ file_name = "test-presentation.pptx"
+ result = self._send_request(file_name, data=dict(with_attachments="True"))
+ structure = result["content"]["structure"]
+
+ # Test headers: nodes 0.0 and 0.2 are expected to be headers with the text "Title"
+ node = self._get_by_tree_path(structure, "0.0")
+ self.assertEqual("Title\n", node["text"])
+ self.assertEqual("header", node["metadata"]["paragraph_type"])
+ annotations = [annotation for annotation in node["annotations"] if annotation["name"] == "size"]
+ self.assertEqual(1, len(annotations))
+ self.assertEqual(50.0, float(annotations[0]["value"]))
+ annotations = [annotation for annotation in node["annotations"] if annotation["name"] == "alignment"]
+ self.assertEqual(1, len(annotations))
+ self.assertEqual("center", annotations[0]["value"])
+ node = self._get_by_tree_path(structure, "0.2")
+ self.assertEqual("Title\n", node["text"])
+ self.assertEqual("header", node["metadata"]["paragraph_type"])
+
+ # Test lists: numbered list with nested sub-lists (0.2.1.*), then bulleted and lettered lists (0.3.*)
+ self.assertEqual("list", self._get_by_tree_path(structure, "0.2.1")["metadata"]["paragraph_type"])
+ self.assertEqual("1. first item\n", self._get_by_tree_path(structure, "0.2.1.0")["text"])
+ self.assertEqual("2. second item\n", self._get_by_tree_path(structure, "0.2.1.1")["text"])
+ self.assertEqual("list", self._get_by_tree_path(structure, "0.2.1.1.0")["metadata"]["paragraph_type"])
+ self.assertEqual("a. subitem\n", self._get_by_tree_path(structure, "0.2.1.1.0.0")["text"])
+ self.assertEqual("3. third item\n", self._get_by_tree_path(structure, "0.2.1.2")["text"])
+ self.assertEqual("list", self._get_by_tree_path(structure, "0.2.1.2.0")["metadata"]["paragraph_type"])
+ self.assertEqual("a. \n", self._get_by_tree_path(structure, "0.2.1.2.0.0")["text"])
+
+ self.assertEqual("❏ first bullet item\n", self._get_by_tree_path(structure, "0.3.0.0")["text"])
+ self.assertEqual("❏ second bullet item\n", self._get_by_tree_path(structure, "0.3.0.1")["text"])
+ self.assertEqual("❏ subitem\n", self._get_by_tree_path(structure, "0.3.0.1.0.0")["text"])
+ self.assertEqual("A. first letter item\n", self._get_by_tree_path(structure, "0.3.1.0")["text"])
+ self.assertEqual("B. second letter item\n", self._get_by_tree_path(structure, "0.3.1.1")["text"])
+ self.assertEqual("○ first subitem\n", self._get_by_tree_path(structure, "0.3.1.1.0.0")["text"])
+ self.assertEqual("○ second subitem\n", self._get_by_tree_path(structure, "0.3.1.1.0.1")["text"])
+
+ # Test annotations
+ node = self._get_by_tree_path(structure, "0.5")
+ self.assertEqual("Custom title\n", node["text"])
+ self.assertEqual("header", node["metadata"]["paragraph_type"])
+ annotations = [annotation for annotation in node["annotations"] if annotation["name"] == "size"]
+ self.assertEqual(30.0, float(annotations[0]["value"]))
+ annotations = [annotation for annotation in node["annotations"] if annotation["name"] == "bold"]
+ self.assertEqual("True", annotations[0]["value"])
+ annotations = [annotation for annotation in node["annotations"] if annotation["name"] == "alignment"]
+ self.assertEqual("center", annotations[0]["value"])
+
+ # node 0.5.0: three different font sizes and one annotation per text style are expected inside the same line
+ node = self._get_by_tree_path(structure, "0.5.0")
+ annotations = {float(annotation["value"]) for annotation in node["annotations"] if annotation["name"] == "size"}
+ self.assertSetEqual({18.0, 24.0, 10.0}, annotations)
+ self.assertIn({"start": 18, "end": 27, "name": "bold", "value": "True"}, node["annotations"])
+ self.assertIn({"start": 28, "end": 39, "name": "italic", "value": "True"}, node["annotations"])
+ self.assertIn({"start": 40, "end": 55, "name": "underlined", "value": "True"}, node["annotations"])
+ self.assertIn({"start": 56, "end": 67, "name": "strike", "value": "True"}, node["annotations"])
+ self.assertIn({"start": 68, "end": 79, "name": "superscript", "value": "True"}, node["annotations"])
+ self.assertIn({"start": 81, "end": 90, "name": "subscript", "value": "True"}, node["annotations"])
+
+ # node 0.6: several style annotations are expected on the same range of the line
+ node = self._get_by_tree_path(structure, "0.6")
+ self.assertIn({"start": 0, "end": 12, "name": "bold", "value": "True"}, node["annotations"])
+ self.assertIn({"start": 0, "end": 12, "name": "italic", "value": "True"}, node["annotations"])
+ self.assertIn({"start": 0, "end": 12, "name": "underlined", "value": "True"}, node["annotations"])
+ self.assertIn({"start": 0, "end": 12, "name": "size", "value": "20.0"}, node["annotations"])
+ self.assertIn({"start": 0, "end": 13, "name": "alignment", "value": "right"}, node["annotations"])
+
+ # Test tables: exactly one table is expected, the table annotation of node 0.4 should contain its uid
+ tables = result["content"]["tables"]
+ self.assertEqual(1, len(tables))
+ table = tables[0]
+ node = self._get_by_tree_path(structure, "0.4")
+ annotations = [annotation for annotation in node["annotations"] if annotation["name"] == "table"]
+ self.assertEqual(table["metadata"]["uid"], annotations[0]["value"])
+ # all rows should have the same number of cells
+ column_number = len(table["cells"][0])
+ for table_row in table["cells"]:
+ self.assertEqual(column_number, len(table_row))
+
+ # each merged area is checked as a visible cell that carries the span and an invisible cell with the same text
+ cell = table["cells"][0][0]
+ self.assertEqual("Horizontally merged cells\n", cell["lines"][0]["text"])
+ self.assertEqual(1, cell["rowspan"])
+ self.assertEqual(2, cell["colspan"])
+ self.assertEqual(False, cell["invisible"])
+ cell = table["cells"][0][1]
+ self.assertEqual("Horizontally merged cells\n", cell["lines"][0]["text"])
+ self.assertEqual(1, cell["rowspan"])
+ self.assertEqual(1, cell["colspan"])
+ self.assertEqual(True, cell["invisible"])
+
+ cell = table["cells"][1][2]
+ self.assertEqual("Vertically merged cells\n", cell["lines"][0]["text"])
+ self.assertEqual(2, cell["rowspan"])
+ self.assertEqual(1, cell["colspan"])
+ self.assertEqual(False, cell["invisible"])
+ cell = table["cells"][2][2]
+ self.assertEqual("Vertically merged cells\n", cell["lines"][0]["text"])
+ self.assertEqual(1, cell["rowspan"])
+ self.assertEqual(1, cell["colspan"])
+ self.assertEqual(True, cell["invisible"])
+
+ cell = table["cells"][2][0]
+ self.assertEqual("Vertically merged cells 2\n", cell["lines"][0]["text"])
+ self.assertEqual(2, cell["rowspan"])
+ self.assertEqual(1, cell["colspan"])
+ self.assertEqual(False, cell["invisible"])
+ cell = table["cells"][3][0]
+ self.assertEqual("Vertically merged cells 2\n", cell["lines"][0]["text"])
+ self.assertEqual(1, cell["rowspan"])
+ self.assertEqual(1, cell["colspan"])
+ self.assertEqual(True, cell["invisible"])
+
+ cell = table["cells"][3][2]
+ self.assertEqual("Horizontally merged cells 2\n", cell["lines"][0]["text"])
+ self.assertEqual(1, cell["rowspan"])
+ self.assertEqual(3, cell["colspan"])
+ self.assertEqual(False, cell["invisible"])
+ cell = table["cells"][3][3]
+ self.assertEqual("Horizontally merged cells 2\n", cell["lines"][0]["text"])
+ self.assertEqual(1, cell["rowspan"])
+ self.assertEqual(1, cell["colspan"])
+ self.assertEqual(True, cell["invisible"])
+
+ # Test attachments: attachment annotations of the lines should contain uids of the returned attachments
+ self.assertEqual(3, len(result["attachments"]))
+ attachment_uids = {attachment["metadata"]["uid"] for attachment in result["attachments"]}
+ node = self._get_by_tree_path(structure, "0.6")
+ annotations = [annotation["value"] for annotation in node["annotations"] if annotation["name"] == "attachment"]
+ self.assertIn(annotations[0], attachment_uids)
+ self.assertIn(annotations[1], attachment_uids)
+ node = self._get_by_tree_path(structure, "0.8.0")
+ self.assertEqual("Text text\n", node["text"])
+ annotations = [annotation["value"] for annotation in node["annotations"] if annotation["name"] == "attachment"]
+ self.assertIn(annotations[0], attachment_uids)
+
def __check_content(self, content: dict) -> None:
subparagraphs = content["structure"]["subparagraphs"]
self.assertEqual("A long time ago in a galaxy far far away", subparagraphs[0]["text"].strip())
@@ -31,8 +163,8 @@ def __check_content(self, content: dict) -> None:
self.assertEqual("This is simple table", subparagraphs[3]["text"].strip())
table = content["tables"][0]
- self.assertListEqual(["", "Header1", "Header2", "Header3"], self._get_text_of_row(table["cells"][0]))
- self.assertListEqual(["Some content", "A", "B", "C"], self._get_text_of_row(table["cells"][1]))
+ self.assertListEqual(["", "Header1\n", "Header2\n", "Header3\n"], self._get_text_of_row(table["cells"][0]))
+ self.assertListEqual(["Some content\n", "A\n", "B\n", "C\n"], self._get_text_of_row(table["cells"][1]))
table_annotations = [ann for ann in subparagraphs[2]["annotations"] if ann["name"] == TableAnnotation.name]
self.assertEqual(1, len(table_annotations))
diff --git a/tests/data/pptx/test-presentation.pptx b/tests/data/pptx/test-presentation.pptx
new file mode 100644
index 00000000..97eaf6a9
Binary files /dev/null and b/tests/data/pptx/test-presentation.pptx differ