diff --git a/VERSION b/VERSION index 7e541aec..6b4d1577 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -2.2.2 \ No newline at end of file +2.2.3 \ No newline at end of file diff --git a/dedoc/api/api_args.py b/dedoc/api/api_args.py index f139733f..20d01db1 100644 --- a/dedoc/api/api_args.py +++ b/dedoc/api/api_args.py @@ -17,7 +17,6 @@ class QueryParameters: need_content_analysis: str = Form("false", enum=["true", "false"], description="Enable parsing contents of the attached files") recursion_deep_attachments: str = Form("10", description="Depth on which nested attachments will be parsed if need_content_analysis=true") return_base64: str = Form("false", enum=["true", "false"], description="Save attached images to the document metadata in base64 format") - attachments_dir: Optional[str] = Form(None, description="Path to the directory where to save files' attachments") # tables handling need_pdf_table_analysis: str = Form("true", enum=["true", "false"], description="Enable table recognition for pdf") diff --git a/dedoc/api/api_utils.py b/dedoc/api/api_utils.py index 46a6ce6d..c942fefa 100644 --- a/dedoc/api/api_utils.py +++ b/dedoc/api/api_utils.py @@ -14,6 +14,7 @@ from dedoc.data_structures.parsed_document import ParsedDocument from dedoc.data_structures.table import Table from dedoc.data_structures.tree_node import TreeNode +from dedoc.extensions import converted_mimes, recognized_mimes def __prettify_text(text: str) -> Iterator[str]: @@ -148,11 +149,22 @@ def json2html(text: str, text += table2html(table, table2id) text += "

 

" + image_mimes = recognized_mimes.image_like_format.union(converted_mimes.image_like_format) + if attachments is not None and len(attachments) > 0: text += "

Attachments:

" for attachment_id, attachment in enumerate(attachments): attachment_text = json2html(text="", paragraph=attachment.content.structure, tables=attachment.content.tables, attachments=attachment.attachments) - text += f'

attachment {attachment_id} ({attachment.metadata.file_name}):

{attachment_text}
' + attachment_base64 = f'data:{attachment.metadata.file_type};base64,{attachment.metadata.base64}"' + attachment_link = f'{attachment.metadata.file_name}' + is_image = attachment.metadata.file_type in image_mimes + attachment_image = f'' if is_image else "" + + text += f"""
+

attachment {attachment_id} ({attachment_link}):

+ {attachment_image} + {attachment_text} +
""" return text @@ -193,12 +205,9 @@ def __annotations2html(paragraph: TreeNode, table2id: Dict[str, int], attach2id: name = annotation.name value = annotation.value - bool_annotations = [BoldAnnotation.name, - ItalicAnnotation.name, - StrikeAnnotation.name, - SubscriptAnnotation.name, - SuperscriptAnnotation.name, - UnderlinedAnnotation.name] + bool_annotations = [ + BoldAnnotation.name, ItalicAnnotation.name, StrikeAnnotation.name, SubscriptAnnotation.name, SuperscriptAnnotation.name, UnderlinedAnnotation.name + ] check_annotations = bool_annotations + [TableAnnotation.name, ReferenceAnnotation.name, AttachAnnotation.name] if name not in check_annotations and not value.startswith("heading "): continue diff --git a/dedoc/api/dedoc_api.py b/dedoc/api/dedoc_api.py index 4572ab05..9e7063f7 100644 --- a/dedoc/api/dedoc_api.py +++ b/dedoc/api/dedoc_api.py @@ -1,3 +1,4 @@ +import base64 import dataclasses import importlib import json @@ -62,41 +63,57 @@ def _get_static_file_path(request: Request) -> str: return os.path.abspath(os.path.join(directory, file)) +def __add_base64_info_to_attachments(document_tree: ParsedDocument, attachments_dir: str) -> None: + for attachment in document_tree.attachments: + with open(os.path.join(attachments_dir, attachment.metadata.temporary_file_name), "rb") as attachment_file: + attachment.metadata.add_attribute("base64", base64.b64encode(attachment_file.read()).decode("utf-8")) + + @app.post("/upload", response_model=ParsedDocument) async def upload(file: UploadFile = File(...), query_params: QueryParameters = Depends()) -> Response: # noqa parameters = dataclasses.asdict(query_params) if not file or file.filename == "": raise MissingFileError("Error: Missing content in request_post file parameter", version=dedoc.__version__) + return_format = str(parameters.get("return_format", "json")).lower() + with tempfile.TemporaryDirectory() as tmpdir: file_path = save_upload_file(file, tmpdir) - document_tree = manager.parse(file_path, 
parameters=dict(parameters)) + document_tree = manager.parse(file_path, parameters={**dict(parameters), "attachments_dir": tmpdir}) + + if return_format == "html": + __add_base64_info_to_attachments(document_tree, tmpdir) - return_format = str(parameters.get("return_format", "json")).lower() if return_format == "html": html_content = json2html( text="", paragraph=document_tree.content.structure, tables=document_tree.content.tables, - attachments=document_tree.attachments, tabs=0 + attachments=document_tree.attachments, + tabs=0 ) return HTMLResponse(content=html_content) - elif return_format == "plain_text": + + if return_format == "plain_text": txt_content = json2txt(paragraph=document_tree.content.structure) return PlainTextResponse(content=txt_content) - elif return_format == "tree": + + if return_format == "tree": html_content = json2tree(paragraph=document_tree.content.structure) return HTMLResponse(content=html_content) - elif return_format == "ujson": + + if return_format == "ujson": return UJSONResponse(content=document_tree.to_api_schema().model_dump()) - elif return_format == "collapsed_tree": + + if return_format == "collapsed_tree": html_content = json2collapsed_tree(paragraph=document_tree.content.structure) return HTMLResponse(content=html_content) - elif return_format == "pretty_json": + + if return_format == "pretty_json": return PlainTextResponse(content=json.dumps(document_tree.to_api_schema().model_dump(), ensure_ascii=False, indent=2)) - else: - logger.info(f"Send result. File {file.filename} with parameters {parameters}") - return ORJSONResponse(content=document_tree.to_api_schema().model_dump()) + + logger.info(f"Send result. 
File {file.filename} with parameters {parameters}") + return ORJSONResponse(content=document_tree.to_api_schema().model_dump()) @app.get("/upload_example") diff --git a/dedoc/api/web/index.html b/dedoc/api/web/index.html index 055ef58b..e045d483 100644 --- a/dedoc/api/web/index.html +++ b/dedoc/api/web/index.html @@ -70,7 +70,7 @@

Type of document structure parsing

Attachments handling

-
with_attachments, need_content_analysis, recursion_deep_attachments, return_base64, attachments_dir +
with_attachments, need_content_analysis, recursion_deep_attachments, return_base64

@@ -87,10 +87,6 @@

Attachments handling

- -

- -

diff --git a/dedoc/data_structures/document_metadata.py b/dedoc/data_structures/document_metadata.py index beec9c56..e93b2c16 100644 --- a/dedoc/data_structures/document_metadata.py +++ b/dedoc/data_structures/document_metadata.py @@ -1,5 +1,5 @@ import uuid -from typing import Dict, Union +from typing import Any, Dict, Union from dedoc.api.schema.document_metadata import DocumentMetadata as ApiDocumentMetadata from dedoc.data_structures.serializable import Serializable @@ -38,8 +38,11 @@ def __init__(self, self.access_time = access_time self.file_type = file_type for key, value in kwargs.items(): - setattr(self, key, value) + self.add_attribute(key, value) self.uid = f"doc_uid_auto_{uuid.uuid1()}" if uid is None else uid + def add_attribute(self, key: str, value: Any) -> None: # noqa + setattr(self, key, value) + def to_api_schema(self) -> ApiDocumentMetadata: return ApiDocumentMetadata(**vars(self)) diff --git a/dedoc/readers/docx_reader/data_structures/docx_document.py b/dedoc/readers/docx_reader/data_structures/docx_document.py index 49d402e8..95901ec4 100644 --- a/dedoc/readers/docx_reader/data_structures/docx_document.py +++ b/dedoc/readers/docx_reader/data_structures/docx_document.py @@ -1,14 +1,11 @@ import hashlib import logging -import os import re -import zipfile from collections import defaultdict -from typing import List, Optional +from typing import List from bs4 import BeautifulSoup, Tag -from dedoc.common.exceptions.bad_file_error import BadFileFormatError from dedoc.data_structures.attached_file import AttachedFile from dedoc.data_structures.concrete_annotations.attach_annotation import AttachAnnotation from dedoc.data_structures.concrete_annotations.table_annotation import TableAnnotation @@ -19,6 +16,7 @@ from dedoc.readers.docx_reader.line_with_meta_converter import LineWithMetaConverter from dedoc.readers.docx_reader.numbering_extractor import NumberingExtractor from dedoc.readers.docx_reader.styles_extractor import StylesExtractor +from 
dedoc.utils.office_utils import get_bs_from_zip from dedoc.utils.utils import calculate_file_hash @@ -28,8 +26,8 @@ def __init__(self, path: str, attachments: List[AttachedFile], logger: logging.L self.path = path self.attachment_name2uid = {attachment.original_name: attachment.uid for attachment in attachments} - self.document_bs_tree = self.__get_bs_tree("word/document.xml") - self.document_bs_tree = self.__get_bs_tree("word/document2.xml") if self.document_bs_tree is None else self.document_bs_tree + self.document_bs_tree = get_bs_from_zip(self.path, "word/document.xml") + self.document_bs_tree = get_bs_from_zip(self.path, "word/document2.xml") if self.document_bs_tree is None else self.document_bs_tree self.body = self.document_bs_tree.body if self.document_bs_tree else None self.paragraph_maker = self.__get_paragraph_maker() @@ -39,8 +37,8 @@ def __init__(self, path: str, attachments: List[AttachedFile], logger: logging.L self.lines = self.__get_lines() def __get_paragraph_maker(self) -> ParagraphMaker: - styles_extractor = StylesExtractor(self.__get_bs_tree("word/styles.xml"), self.logger) - num_tree = self.__get_bs_tree("word/numbering.xml") + styles_extractor = StylesExtractor(get_bs_from_zip(self.path, "word/styles.xml"), self.logger) + num_tree = get_bs_from_zip(self.path, "word/numbering.xml") numbering_extractor = NumberingExtractor(num_tree, styles_extractor) if num_tree else None styles_extractor.numbering_extractor = numbering_extractor @@ -49,8 +47,8 @@ def __get_paragraph_maker(self) -> ParagraphMaker: path_hash=calculate_file_hash(path=self.path), styles_extractor=styles_extractor, numbering_extractor=numbering_extractor, - footnote_extractor=FootnoteExtractor(self.__get_bs_tree("word/footnotes.xml")), - endnote_extractor=FootnoteExtractor(self.__get_bs_tree("word/endnotes.xml"), key="endnote") + footnote_extractor=FootnoteExtractor(get_bs_from_zip(self.path, "word/footnotes.xml")), + endnote_extractor=FootnoteExtractor(get_bs_from_zip(self.path, 
"word/endnotes.xml"), key="endnote") ) def __get_lines(self) -> List[LineWithMeta]: @@ -120,23 +118,6 @@ def __paragraphs2lines(self, image_refs: dict, table_refs: dict, diagram_refs: d return lines_with_meta - def __get_bs_tree(self, filename: str) -> Optional[BeautifulSoup]: - """ - Gets xml bs tree from the given file inside the self.path. - :param filename: name of file to extract the tree - :return: BeautifulSoup tree or None if file wasn't found - """ - try: - with zipfile.ZipFile(self.path) as document: - content = document.read(filename) - content = re.sub(br"\n[\t ]*", b"", content) - soup = BeautifulSoup(content, "xml") - return soup - except KeyError: - return None - except zipfile.BadZipFile: - raise BadFileFormatError(f"Bad docx file:\n file_name = {os.path.basename(self.path)}. Seems docx is broken") - def __handle_table_xml(self, xml: Tag, table_refs: dict) -> None: table = DocxTable(xml, self.paragraph_maker) self.tables.append(table.to_table()) @@ -150,9 +131,9 @@ def __handle_table_xml(self, xml: Tag, table_refs: dict) -> None: table_refs[len(self.paragraph_list) - 1].append(table_uid) def __handle_images_xml(self, xmls: List[Tag], image_refs: dict) -> None: - rels = self.__get_bs_tree("word/_rels/document.xml.rels") + rels = get_bs_from_zip(self.path, "word/_rels/document.xml.rels") if rels is None: - rels = self.__get_bs_tree("word/_rels/document2.xml.rels") + rels = get_bs_from_zip(self.path, "word/_rels/document2.xml.rels") images_rels = dict() for rel in rels.find_all("Relationship"): diff --git a/dedoc/readers/pptx_reader/numbering_extractor.py b/dedoc/readers/pptx_reader/numbering_extractor.py new file mode 100644 index 00000000..42da3557 --- /dev/null +++ b/dedoc/readers/pptx_reader/numbering_extractor.py @@ -0,0 +1,51 @@ +class NumberingExtractor: + """ + This class is used to compute numbering text for list items. 
+ For example: "1.", (i), "○" + """ + def __init__(self) -> None: + # Mapping according to the ST_TextAutonumberScheme + # NOTE we ignore chinese, japanese, hindi, thai + self.numbering_types = dict( + arabic="1", # 1, 2, 3, ..., 10, 11, 12, ... + alphaLc="a", # a, b, c, ..., y, z, aa, bb, cc, ..., yy, zz, aaa, bbb, ccc, ... + alphaUc="A", # A, B, C, ..., Y, Z, AA, BB, CC, ..., YY, ZZ, AAA, BBB, CCC, ... + romanLc="i", # i, ii, iii, iv, ..., xviii, xix, xx, xxi, ... + romanUc="I" # I, II, III, IV, ..., XVIII, XIX, XX, XXI, ... + ) + + self.numbering_formatting = dict( + ParenBoth="({}) ", + ParenR="{}) ", + Period="{}. ", + Plain="{} " + ) + + self.combined_types = { + num_type + num_formatting: (num_type, num_formatting) for num_type in self.numbering_types for num_formatting in self.numbering_formatting + } + self.roman_mapping = [(1000, "m"), (500, "d"), (100, "c"), (50, "l"), (10, "x"), (5, "v"), (1, "i")] + + def get_text(self, numbering: str, shift: int) -> str: + """ + Computes the next item of the list sequence. + :param numbering: type of the numbering, e.g. 
"arabicPeriod" + :param shift: shift from the beginning of list numbering + :return: string representation of the next numbering item + """ + num_type, num_formatting = self.combined_types.get(numbering, ("arabic", "Period")) + + if num_type in ("alphaLc", "alphaUc"): + shift1, shift2 = shift % 26, shift // 26 + 1 + num_char = chr(ord(self.numbering_types[num_type]) + shift1) * shift2 + elif num_type in ("romanLc", "romanUc"): + num_char = "" + for number, letter in self.roman_mapping: + cnt, shift = shift // number, shift % number + if num_type == "romanUc": + letter = chr(ord(letter) + ord("A") - ord("a")) + num_char += letter * cnt + else: + num_char = str(int(self.numbering_types["arabic"]) + shift) + + return self.numbering_formatting[num_formatting].format(num_char) diff --git a/dedoc/readers/pptx_reader/paragraph.py b/dedoc/readers/pptx_reader/paragraph.py new file mode 100644 index 00000000..2dfcb952 --- /dev/null +++ b/dedoc/readers/pptx_reader/paragraph.py @@ -0,0 +1,55 @@ +from bs4 import Tag + +from dedoc.data_structures import AlignmentAnnotation, BoldAnnotation, HierarchyLevel, ItalicAnnotation, LineMetadata, LineWithMeta, SizeAnnotation, \ + StrikeAnnotation, SubscriptAnnotation, SuperscriptAnnotation, UnderlinedAnnotation +from dedoc.readers.pptx_reader.numbering_extractor import NumberingExtractor +from dedoc.readers.pptx_reader.properties_extractor import PropertiesExtractor +from dedoc.utils.annotation_merger import AnnotationMerger + + +class PptxParagraph: + """ + This class corresponds to one textual paragraph of some entity, e.g. shape or table cell (tag ). 
+ """ + def __init__(self, xml: Tag, numbering_extractor: NumberingExtractor, properties_extractor: PropertiesExtractor) -> None: + self.xml = xml + self.numbered_list_type = self.xml.buAutoNum.get("type", "arabicPeriod") if self.xml.buAutoNum else None + self.level = int(self.xml.pPr.get("lvl", 0)) + 1 if self.xml.pPr else 1 + self.numbering_extractor = numbering_extractor + self.properties_extractor = properties_extractor + self.annotation_merger = AnnotationMerger() + annotations = [BoldAnnotation, ItalicAnnotation, UnderlinedAnnotation, StrikeAnnotation, SuperscriptAnnotation, SubscriptAnnotation] + self.dict2annotation = {annotation.name: annotation for annotation in annotations} + + def get_line_with_meta(self, page_id: int, line_id: int, is_title: bool, shift: int = 0) -> LineWithMeta: + text = "" + paragraph_properties = self.properties_extractor.get_properties(self.xml.pPr, level=self.level) + hierarchy_level = HierarchyLevel.create_raw_text() + + if is_title or paragraph_properties.title: + hierarchy_level = HierarchyLevel(line_type=HierarchyLevel.header, level_1=1, level_2=self.level, can_be_multiline=False) + elif self.numbered_list_type: # numbered list + text += self.numbering_extractor.get_text(self.numbered_list_type, shift) + hierarchy_level = HierarchyLevel(line_type=HierarchyLevel.list_item, level_1=2, level_2=self.level, can_be_multiline=False) + elif self.xml.buChar: # bullet list + text += self.xml.buChar["char"] + " " + hierarchy_level = HierarchyLevel(line_type=HierarchyLevel.list_item, level_1=3, level_2=self.level, can_be_multiline=False) + + annotations = [] + if self.xml.r: + for run in self.xml.find_all("a:r"): + prev_text = text + for run_text in run: + if run_text.name == "t" and run.text: + text += run.text + + run_properties = self.properties_extractor.get_properties(run.rPr, level=self.level, properties=paragraph_properties) + annotations.append(SizeAnnotation(start=len(prev_text), end=len(text), value=str(run_properties.size))) + 
for property_name in self.dict2annotation: + if getattr(run_properties, property_name): + annotations.append(self.dict2annotation[property_name](start=len(prev_text), end=len(text), value="True")) + + text = f"{text}\n" + annotations = self.annotation_merger.merge_annotations(annotations, text) + annotations.append(AlignmentAnnotation(start=0, end=len(text), value=paragraph_properties.alignment)) + return LineWithMeta(text, metadata=LineMetadata(page_id=page_id, line_id=line_id, tag_hierarchy_level=hierarchy_level), annotations=annotations) diff --git a/dedoc/readers/pptx_reader/pptx_reader.py b/dedoc/readers/pptx_reader/pptx_reader.py index e109fae0..2d68b850 100644 --- a/dedoc/readers/pptx_reader/pptx_reader.py +++ b/dedoc/readers/pptx_reader/pptx_reader.py @@ -1,20 +1,20 @@ +import zipfile from typing import Dict, List, Optional -from bs4 import BeautifulSoup -from pptx import Presentation -from pptx.shapes.graphfrm import GraphicFrame -from pptx.shapes.picture import Picture -from pptx.slide import Slide +from bs4 import BeautifulSoup, Tag from dedoc.attachments_extractors.concrete_attachments_extractors.pptx_attachments_extractor import PptxAttachmentsExtractor from dedoc.data_structures import AttachAnnotation, Table, TableAnnotation -from dedoc.data_structures.cell_with_meta import CellWithMeta from dedoc.data_structures.line_metadata import LineMetadata from dedoc.data_structures.line_with_meta import LineWithMeta -from dedoc.data_structures.table_metadata import TableMetadata from dedoc.data_structures.unstructured_document import UnstructuredDocument from dedoc.extensions import recognized_extensions, recognized_mimes from dedoc.readers.base_reader import BaseReader +from dedoc.readers.pptx_reader.numbering_extractor import NumberingExtractor +from dedoc.readers.pptx_reader.properties_extractor import PropertiesExtractor +from dedoc.readers.pptx_reader.shape import PptxShape +from dedoc.readers.pptx_reader.table import PptxTable +from 
dedoc.utils.office_utils import get_bs_from_zip from dedoc.utils.parameter_utils import get_param_with_attachments @@ -27,6 +27,7 @@ class PptxReader(BaseReader): def __init__(self, *, config: Optional[dict] = None) -> None: super().__init__(config=config, recognized_extensions=recognized_extensions.pptx_like_format, recognized_mimes=recognized_mimes.pptx_like_format) self.attachments_extractor = PptxAttachmentsExtractor(config=self.config) + self.numbering_extractor = NumberingExtractor() def read(self, file_path: str, parameters: Optional[dict] = None) -> UnstructuredDocument: """ @@ -36,55 +37,73 @@ def read(self, file_path: str, parameters: Optional[dict] = None) -> Unstructure with_attachments = get_param_with_attachments(parameters) attachments = self.attachments_extractor.extract(file_path=file_path, parameters=parameters) if with_attachments else [] attachment_name2uid = {attachment.original_name: attachment.uid for attachment in attachments} - - prs = Presentation(file_path) - lines, tables = [], [] - - for page_id, slide in enumerate(prs.slides, start=1): - images_rels = self.__get_slide_images_rels(slide) - - for paragraph_id, shape in enumerate(slide.shapes, start=1): - - if shape.has_text_frame: - lines.append(LineWithMeta(line=f"{shape.text}\n", metadata=LineMetadata(page_id=page_id, line_id=paragraph_id))) - - if shape.has_table: - self.__add_table(lines, tables, page_id, paragraph_id, shape) - - if with_attachments and hasattr(shape, "image"): + images_rels = self.__get_slide_images_rels(file_path) + properties_extractor = PropertiesExtractor(file_path) + + slide_xml_list = self.__get_slides_bs(file_path, xml_prefix="ppt/slides/slide", xml_postfix=".xml") + lines = [] + tables = [] + + for slide_id, slide_xml in enumerate(slide_xml_list): + shape_tree_xml = slide_xml.spTree + + is_first_shape = True + for tag in shape_tree_xml: + if tag.name == "sp": + if not tag.txBody: + continue + + shape = PptxShape(tag, page_id=slide_id, 
init_line_id=len(lines), numbering_extractor=self.numbering_extractor, + properties_extractor=properties_extractor, is_title=is_first_shape) + shape_lines = shape.get_lines() + lines.extend(shape_lines) + if is_first_shape and len(shape_lines) > 0: + is_first_shape = False + + elif tag.tbl: + self.__add_table(lines=lines, tables=tables, page_id=slide_id, table_xml=tag.tbl, properties_extractor=properties_extractor) + elif tag.name == "pic" and tag.blip: if len(lines) == 0: - lines.append(LineWithMeta(line="", metadata=LineMetadata(page_id=page_id, line_id=paragraph_id))) - self.__add_attach_annotation(lines[-1], shape, attachment_name2uid, images_rels) + lines.append(LineWithMeta(line="", metadata=LineMetadata(page_id=slide_id, line_id=0))) + image_rel_id = str(slide_id) + tag.blip.get("r:embed", "") + self.__add_attach_annotation(lines[-1], image_rel_id, attachment_name2uid, images_rels) return UnstructuredDocument(lines=lines, tables=tables, attachments=attachments, warnings=[]) - def __add_table(self, lines: List[LineWithMeta], tables: List[Table], page_id: int, paragraph_id: int, shape: GraphicFrame) -> None: - cells = [ - [CellWithMeta(lines=[LineWithMeta(line=cell.text, metadata=LineMetadata(page_id=page_id, line_id=0))]) for cell in row.cells] - for row in shape.table.rows - ] - table = Table(cells=cells, metadata=TableMetadata(page_id=page_id)) + def __get_slides_bs(self, path: str, xml_prefix: str, xml_postfix: str) -> List[BeautifulSoup]: + with zipfile.ZipFile(path) as document: + xml_names = document.namelist() + filtered_names = [file_name for file_name in xml_names if file_name.startswith(xml_prefix) and file_name.endswith(xml_postfix)] + sorted_names = sorted(filtered_names, key=lambda x: int(x[len(xml_prefix):-len(xml_postfix)])) + slides_bs_list = [get_bs_from_zip(path, file_name, remove_spaces=True) for file_name in sorted_names] + return slides_bs_list - if len(lines) == 0: - lines.append(LineWithMeta(line="", 
metadata=LineMetadata(page_id=page_id, line_id=paragraph_id))) - lines[-1].annotations.append(TableAnnotation(start=0, end=len(lines[-1]), name=table.metadata.uid)) - tables.append(table) - - def __get_slide_images_rels(self, slide: Slide) -> Dict[str, str]: - rels = BeautifulSoup(slide.part.rels.xml, "xml") + def __get_slide_images_rels(self, path: str) -> Dict[str, str]: + """ + return mapping: {image Id -> image name} + """ + rels_xml_list = self.__get_slides_bs(path, xml_prefix="ppt/slides/_rels/slide", xml_postfix=".xml.rels") images_dir = "../media/" images_rels = dict() - for rel in rels.find_all("Relationship"): - if rel["Target"].startswith(images_dir): - images_rels[rel["Id"]] = rel["Target"][len(images_dir):] + for slide_id, rels_xml in enumerate(rels_xml_list): + for rel in rels_xml.find_all("Relationship"): + if rel["Target"].startswith(images_dir): + images_rels[str(slide_id) + rel["Id"]] = rel["Target"][len(images_dir):] return images_rels - def __add_attach_annotation(self, line: LineWithMeta, shape: Picture, attachment_name2uid: dict, images_rels: dict) -> None: + def __add_table(self, lines: List[LineWithMeta], tables: List[Table], page_id: int, table_xml: Tag, properties_extractor: PropertiesExtractor) -> None: + table = PptxTable(table_xml, page_id, self.numbering_extractor, properties_extractor).to_table() + + if len(lines) == 0: + lines.append(LineWithMeta(line="", metadata=LineMetadata(page_id=page_id, line_id=0))) + lines[-1].annotations.append(TableAnnotation(start=0, end=len(lines[-1]), name=table.metadata.uid)) + tables.append(table) + + def __add_attach_annotation(self, line: LineWithMeta, image_rel_id: str, attachment_name2uid: dict, images_rels: dict) -> None: try: - image_rels_id = shape.element.blip_rId - image_name = images_rels[image_rels_id] + image_name = images_rels[image_rel_id] image_uid = attachment_name2uid[image_name] line.annotations.append(AttachAnnotation(start=0, end=len(line), attach_uid=image_uid)) except KeyError as 
e: diff --git a/dedoc/readers/pptx_reader/properties_extractor.py b/dedoc/readers/pptx_reader/properties_extractor.py new file mode 100644 index 00000000..67c0c919 --- /dev/null +++ b/dedoc/readers/pptx_reader/properties_extractor.py @@ -0,0 +1,124 @@ +from copy import deepcopy +from dataclasses import dataclass +from typing import Dict, Optional + +from bs4 import Tag + +from dedoc.utils.office_utils import get_bs_from_zip + + +@dataclass +class Properties: + bold: bool = False + italic: bool = False + underlined: bool = False + superscript: bool = False + subscript: bool = False + strike: bool = False + size: int = 0 + alignment: str = "left" + title: bool = False + + +class PropertiesExtractor: + """ + This class allows to extract some text formatting properties (see class Properties) + + Properties hierarchy: + + - Run and paragraph properties (slide.xml) + - Slide layout properties (slideLayout.xml) TODO + - Master slide properties (slideMaster.xml) TODO + - Presentation default properties (presentation.xml -> defaultTextStyle) + """ + def __init__(self, file_path: str) -> None: + self.alignment_mapping = dict(l="left", r="right", ctr="center", just="both", dist="both", justLow="both", thaiDist="both") + self.lvl2default_properties = self.__get_default_properties_mapping(file_path) + + def get_properties(self, xml: Tag, level: int, properties: Optional[Properties] = None) -> Properties: + """ + xml examples: + + + + """ + properties = properties or self.lvl2default_properties.get(level, Properties()) + new_properties = deepcopy(properties) + if not xml: + return new_properties + + self.__update_properties(xml, new_properties) + return new_properties + + def __update_properties(self, xml: Tag, properties: Properties) -> None: + if int(xml.get("b", "0")): + properties.bold = True + if int(xml.get("i", "0")): + properties.italic = True + + underlined = xml.get("u", "none").lower() + if underlined != "none": + properties.underlined = True + + strike = 
xml.get("strike", "nostrike").lower() + if strike != "nostrike": + properties.strike = True + + size = xml.get("sz") + if size: + properties.size = float(size) / 100 + + baseline = xml.get("baseline") + if baseline: + if float(baseline) < 0: + properties.subscript = True + else: + properties.superscript = True + + self.__update_alignment(xml, properties) + + def __update_alignment(self, xml: Tag, properties: Properties) -> None: + alignment = xml.get("algn") + if alignment and alignment in self.alignment_mapping: + properties.alignment = self.alignment_mapping[alignment] + + def __get_default_properties_mapping(self, file_path: str) -> Dict[int, Properties]: + lvl2properties = {} + + presentation_xml = get_bs_from_zip(file_path, "ppt/presentation.xml", remove_spaces=True) + default_style = presentation_xml.defaultTextStyle + if not default_style: + return lvl2properties + + # lvl1pPr - lvl9pPr + for i in range(1, 10): + level_xml = getattr(default_style, f"lvl{i}pPr") + if level_xml: + self.__update_level_properties(level_xml, lvl2properties) + return lvl2properties + + def __update_level_properties(self, xml: Tag, lvl2properties: Dict[int, Properties]) -> None: + """ + Example: + + + + + + + + + + + + + + + """ + level = int(xml.get("lvl", "0")) + 1 + level_properties = lvl2properties.get(level, Properties()) + self.__update_alignment(xml, level_properties) + if xml.defRPr: + self.__update_properties(xml.defRPr, level_properties) + + lvl2properties[level] = level_properties diff --git a/dedoc/readers/pptx_reader/shape.py b/dedoc/readers/pptx_reader/shape.py new file mode 100644 index 00000000..b1c548d3 --- /dev/null +++ b/dedoc/readers/pptx_reader/shape.py @@ -0,0 +1,51 @@ +from collections import defaultdict +from typing import List + +from bs4 import Tag + +from dedoc.data_structures import LineWithMeta +from dedoc.readers.pptx_reader.numbering_extractor import NumberingExtractor +from dedoc.readers.pptx_reader.paragraph import PptxParagraph +from 
dedoc.readers.pptx_reader.properties_extractor import PropertiesExtractor + + +class PptxShape: + """ + This class corresponds to one textual block of the presentation (tag ). + """ + def __init__(self, xml: Tag, page_id: int, init_line_id: int, numbering_extractor: NumberingExtractor, properties_extractor: PropertiesExtractor, + is_title: bool = False) -> None: + self.xml = xml + self.page_id = page_id + self.init_line_id = init_line_id + self.numbering_extractor = numbering_extractor + self.properties_extractor = properties_extractor + self.is_title = is_title + + def get_lines(self) -> List[LineWithMeta]: + if not self.xml.get_text().strip(): + return [] + + if self.xml.ph and "title" in self.xml.ph.get("type", "").lower(): + self.is_title = True + + lines = [] + numbering2shift = defaultdict(int) + prev_list_level = None + + for line_id, paragraph_xml in enumerate(self.xml.find_all("a:p")): + paragraph = PptxParagraph(paragraph_xml, self.numbering_extractor, self.properties_extractor) + + if paragraph.numbered_list_type: + if prev_list_level and paragraph.level > prev_list_level: + numbering2shift[(paragraph.numbered_list_type, paragraph.level)] = 0 + + shift = numbering2shift[(paragraph.numbered_list_type, paragraph.level)] + numbering2shift[(paragraph.numbered_list_type, paragraph.level)] += 1 + prev_list_level = paragraph.level + else: + shift = 0 + + lines.append(paragraph.get_line_with_meta(line_id=self.init_line_id + line_id, page_id=self.page_id, is_title=self.is_title, shift=shift)) + + return lines diff --git a/dedoc/readers/pptx_reader/table.py b/dedoc/readers/pptx_reader/table.py new file mode 100644 index 00000000..cbe7febb --- /dev/null +++ b/dedoc/readers/pptx_reader/table.py @@ -0,0 +1,64 @@ +import hashlib + +from bs4 import Tag + +from dedoc.data_structures import CellWithMeta, Table, TableMetadata +from dedoc.readers.docx_reader.numbering_extractor import NumberingExtractor +from dedoc.readers.pptx_reader.properties_extractor import 
class PptxTable:
    """
    This class corresponds to the table (``a:tbl`` tag) in the slides xml files.
    """
    def __init__(self, xml: Tag, page_id: int, numbering_extractor: NumberingExtractor, properties_extractor: PropertiesExtractor) -> None:
        """
        Contains information about table properties.

        :param xml: BeautifulSoup tree with table properties
        :param page_id: page identifier that is stored in the table metadata and passed to the lines of the cells
        :param numbering_extractor: object forwarded to :class:`PptxShape` for each cell
        :param properties_extractor: object forwarded to :class:`PptxShape` for each cell
        """
        self.xml = xml
        self.page_id = page_id
        self.numbering_extractor = numbering_extractor
        self.properties_extractor = properties_extractor
        # the identifier is computed from the table xml content at construction time
        self.__uid = hashlib.md5(xml.encode()).hexdigest()

    @property
    def uid(self) -> str:
        """
        Identifier of the table: md5 hex digest of the encoded table xml.
        """
        return self.__uid

    @staticmethod
    def _is_flag_set(cell: Tag, attribute: str) -> bool:
        """
        Check if a boolean attribute of the cell is enabled.

        Both numeric ("1", "0") and literal ("true", "false") spellings are supported, a missing attribute means False.

        :param cell: object with the ``get`` method (cell tag)
        :param attribute: name of the attribute, e.g. "vMerge" or "hMerge"
        :return: True if the attribute is set
        :raises ValueError: if the attribute value is neither a number nor "true"/"false"
        """
        value = str(cell.get(attribute, 0)).strip().lower()
        if value in ("true", "false"):
            return value == "true"
        return bool(int(value))

    def to_table(self) -> Table:
        """
        Converts xml file with table to Table class.

        Nested tables are removed from ``self.xml`` before the conversion.
        Cells that continue a vertical or horizontal merge are returned as invisible cells sharing the lines of the cell they continue.

        :return: table with cells and metadata (page_id, uid)
        """
        # tbl -- table; tr -- table row, tc -- table cell
        # delete tables inside tables
        for tbl in self.xml.find_all("a:tbl"):
            tbl.extract()

        cell_list = []

        for row in self.xml.find_all("a:tr"):
            cell_row_list = []

            for col_index, cell in enumerate(row.find_all("a:tc")):
                if self._is_flag_set(cell, "vMerge"):  # vertical merge
                    # take the lines of the cell above, use empty lines if there is no such cell (malformed table)
                    previous_row = cell_list[-1] if cell_list else []
                    lines = previous_row[col_index].lines if col_index < len(previous_row) else []
                    cell_with_meta = CellWithMeta(lines=lines, colspan=1, rowspan=1, invisible=True)
                elif self._is_flag_set(cell, "hMerge"):  # horizontal merge
                    # take the lines of the cell on the left, use empty lines if there is no such cell (malformed table)
                    lines = cell_row_list[-1].lines if cell_row_list else []
                    cell_with_meta = CellWithMeta(lines=lines, colspan=1, rowspan=1, invisible=True)
                else:
                    colspan = int(cell.get("gridSpan", 1))  # gridSpan attribute describes number of horizontally merged cells
                    rowspan = int(cell.get("rowSpan", 1))  # rowSpan attribute for vertically merged set of cells (or horizontally split cells)
                    lines = PptxShape(xml=cell, page_id=self.page_id, numbering_extractor=self.numbering_extractor, init_line_id=0,
                                      properties_extractor=self.properties_extractor).get_lines()
                    cell_with_meta = CellWithMeta(lines=lines, colspan=colspan, rowspan=rowspan, invisible=False)

                cell_row_list.append(cell_with_meta)

            cell_list.append(cell_row_list)

        return Table(cells=cell_list, metadata=TableMetadata(page_id=self.page_id, uid=self.uid))
def get_bs_from_zip(zip_path: str, xml_path: str, remove_spaces: bool = False) -> Optional[BeautifulSoup]:
    """
    Utility for extracting xml from files of office formats (docx, pptx, xlsx).
    Gets xml BeautifulSoup tree from the given file inside the zip_path.

    :param zip_path: path to the file of the office format (docx, pptx, xlsx)
    :param xml_path: name of file to extract the tree
    :param remove_spaces: remove spaces between tags except <a:t> (for pptx)
    :return: BeautifulSoup tree or None if file wasn't found
    :raises BadFileFormatError: if the file located at zip_path is not a valid zip archive
    """
    try:
        with zipfile.ZipFile(zip_path) as document:
            content = document.read(xml_path)
            # drop line breaks together with the indentation that follows them
            content = re.sub(br"\n[\t ]*", b"", content)

            if remove_spaces:
                # remove spaces between tags, don't remove spaces inside pptx text fields: <a:t> </a:t>
                # (the negative look-behind skips whitespace that directly follows an opening <a:t tag)
                content = re.sub(br"(?<!<a:t)>\s+<", b"><", content)

            soup = BeautifulSoup(content, "xml")
            return soup
    except KeyError:
        # ZipFile.read raises KeyError if there is no xml_path inside the archive
        return None
    except zipfile.BadZipFile as error:
        raise BadFileFormatError(f"Bad office file:\n file_name = {os.path.basename(zip_path)}. Seems file is broken") from error
- * - :cspan:`3` **Tables handling** * - need_pdf_table_analysis diff --git a/docs/source/readers_output/annotations.rst b/docs/source/readers_output/annotations.rst index ee1785c2..2a13989d 100644 --- a/docs/source/readers_output/annotations.rst +++ b/docs/source/readers_output/annotations.rst @@ -11,11 +11,12 @@ Below the readers are enlisted that can return non-empty list of annotations for .. _table_annotations: .. list-table:: Annotations returned by each reader - :widths: 20 10 10 10 10 10 10 10 + :widths: 20 10 10 10 10 10 10 10 10 :class: tight-table * - **Annotation** - :class:`~dedoc.readers.DocxReader` + - :class:`~dedoc.readers.PptxReader` - :class:`~dedoc.readers.HtmlReader`, :class:`~dedoc.readers.MhtmlReader`, :class:`~dedoc.readers.EmailReader` - :class:`~dedoc.readers.RawTextReader` - :class:`~dedoc.readers.PdfImageReader` @@ -24,6 +25,7 @@ Below the readers are enlisted that can return non-empty list of annotations for - :class:`~dedoc.readers.ArticleReader` * - :class:`~dedoc.data_structures.AttachAnnotation` + - `+` - `+` - `-` - `-` @@ -33,6 +35,7 @@ Below the readers are enlisted that can return non-empty list of annotations for - `+` * - :class:`~dedoc.data_structures.TableAnnotation` + - `+` - `+` - `-` - `-` @@ -43,6 +46,7 @@ Below the readers are enlisted that can return non-empty list of annotations for * - :class:`~dedoc.data_structures.LinkedTextAnnotation` - `+` + - `-` - `+` - `-` - `-` @@ -54,12 +58,14 @@ Below the readers are enlisted that can return non-empty list of annotations for - `-` - `-` - `-` + - `-` - `+` - `+` - `+` - `-` * - :class:`~dedoc.data_structures.AlignmentAnnotation` + - `+` - `+` - `+` - `-` @@ -71,6 +77,7 @@ Below the readers are enlisted that can return non-empty list of annotations for * - :class:`~dedoc.data_structures.IndentationAnnotation` - `+` - `-` + - `-` - `+` - `+` - `+` @@ -80,6 +87,7 @@ Below the readers are enlisted that can return non-empty list of annotations for * - 
:class:`~dedoc.data_structures.SpacingAnnotation` - `+` - `-` + - `-` - `+` - `+` - `+` @@ -87,6 +95,7 @@ Below the readers are enlisted that can return non-empty list of annotations for - `-` * - :class:`~dedoc.data_structures.BoldAnnotation` + - `+` - `+` - `+` - `-` @@ -96,6 +105,7 @@ Below the readers are enlisted that can return non-empty list of annotations for - `-` * - :class:`~dedoc.data_structures.ItalicAnnotation` + - `+` - `+` - `+` - `-` @@ -105,6 +115,7 @@ Below the readers are enlisted that can return non-empty list of annotations for - `-` * - :class:`~dedoc.data_structures.UnderlinedAnnotation` + - `+` - `+` - `+` - `-` @@ -114,6 +125,7 @@ Below the readers are enlisted that can return non-empty list of annotations for - `-` * - :class:`~dedoc.data_structures.StrikeAnnotation` + - `+` - `+` - `+` - `-` @@ -123,6 +135,7 @@ Below the readers are enlisted that can return non-empty list of annotations for - `-` * - :class:`~dedoc.data_structures.SubscriptAnnotation` + - `+` - `+` - `+` - `-` @@ -132,6 +145,7 @@ Below the readers are enlisted that can return non-empty list of annotations for - `-` * - :class:`~dedoc.data_structures.SuperscriptAnnotation` + - `+` - `+` - `+` - `-` @@ -144,12 +158,14 @@ Below the readers are enlisted that can return non-empty list of annotations for - `-` - `-` - `-` + - `-` - `+` - `-` - `+` - `-` * - :class:`~dedoc.data_structures.SizeAnnotation` + - `+` - `+` - `+` - `-` @@ -160,6 +176,7 @@ Below the readers are enlisted that can return non-empty list of annotations for * - :class:`~dedoc.data_structures.StyleAnnotation` - `+` + - `-` - `+` - `-` - `-` @@ -171,6 +188,7 @@ Below the readers are enlisted that can return non-empty list of annotations for - `-` - `-` - `-` + - `-` - `+` - `-` - `-` @@ -183,4 +201,5 @@ Below the readers are enlisted that can return non-empty list of annotations for - `-` - `-` - `-` + - `-` - `+` diff --git a/docs/source/readers_output/line_types.rst 
def test_structure_and_annotations(self) -> None:
    """
    Check the tree structure, annotations, tables and attachments returned by the API for "test-presentation.pptx".

    The file is sent with with_attachments="True"; nodes of the resulting tree are addressed by their tree paths (e.g. "0.2.1").
    """
    file_name = "test-presentation.pptx"
    result = self._send_request(file_name, data=dict(with_attachments="True"))
    structure = result["content"]["structure"]

    # Test headers
    node = self._get_by_tree_path(structure, "0.0")
    self.assertEqual("Title\n", node["text"])
    self.assertEqual("header", node["metadata"]["paragraph_type"])
    # exactly one size annotation and one alignment annotation are expected for the first header
    annotations = [annotation for annotation in node["annotations"] if annotation["name"] == "size"]
    self.assertEqual(1, len(annotations))
    self.assertEqual(50.0, float(annotations[0]["value"]))
    annotations = [annotation for annotation in node["annotations"] if annotation["name"] == "alignment"]
    self.assertEqual(1, len(annotations))
    self.assertEqual("center", annotations[0]["value"])
    node = self._get_by_tree_path(structure, "0.2")
    self.assertEqual("Title\n", node["text"])
    self.assertEqual("header", node["metadata"]["paragraph_type"])

    # Test lists
    # numbered list with nested lettered sub-lists: the nested numbering should start from "a." in each sub-list
    self.assertEqual("list", self._get_by_tree_path(structure, "0.2.1")["metadata"]["paragraph_type"])
    self.assertEqual("1. first item\n", self._get_by_tree_path(structure, "0.2.1.0")["text"])
    self.assertEqual("2. second item\n", self._get_by_tree_path(structure, "0.2.1.1")["text"])
    self.assertEqual("list", self._get_by_tree_path(structure, "0.2.1.1.0")["metadata"]["paragraph_type"])
    self.assertEqual("a. subitem\n", self._get_by_tree_path(structure, "0.2.1.1.0.0")["text"])
    self.assertEqual("3. third item\n", self._get_by_tree_path(structure, "0.2.1.2")["text"])
    self.assertEqual("list", self._get_by_tree_path(structure, "0.2.1.2.0")["metadata"]["paragraph_type"])
    self.assertEqual("a. \n", self._get_by_tree_path(structure, "0.2.1.2.0.0")["text"])

    # bullet and letter lists
    self.assertEqual("❏ first bullet item\n", self._get_by_tree_path(structure, "0.3.0.0")["text"])
    self.assertEqual("❏ second bullet item\n", self._get_by_tree_path(structure, "0.3.0.1")["text"])
    self.assertEqual("❏ subitem\n", self._get_by_tree_path(structure, "0.3.0.1.0.0")["text"])
    self.assertEqual("A. first letter item\n", self._get_by_tree_path(structure, "0.3.1.0")["text"])
    self.assertEqual("B. second letter item\n", self._get_by_tree_path(structure, "0.3.1.1")["text"])
    self.assertEqual("○ first subitem\n", self._get_by_tree_path(structure, "0.3.1.1.0.0")["text"])
    self.assertEqual("○ second subitem\n", self._get_by_tree_path(structure, "0.3.1.1.0.1")["text"])

    # Test annotations
    node = self._get_by_tree_path(structure, "0.5")
    self.assertEqual("Custom title\n", node["text"])
    self.assertEqual("header", node["metadata"]["paragraph_type"])
    annotations = [annotation for annotation in node["annotations"] if annotation["name"] == "size"]
    self.assertEqual(30.0, float(annotations[0]["value"]))
    annotations = [annotation for annotation in node["annotations"] if annotation["name"] == "bold"]
    self.assertEqual("True", annotations[0]["value"])
    annotations = [annotation for annotation in node["annotations"] if annotation["name"] == "alignment"]
    self.assertEqual("center", annotations[0]["value"])

    # a line with several differently formatted fragments: each annotation is checked with its exact character span
    node = self._get_by_tree_path(structure, "0.5.0")
    annotations = {float(annotation["value"]) for annotation in node["annotations"] if annotation["name"] == "size"}
    self.assertSetEqual({18.0, 24.0, 10.0}, annotations)
    self.assertIn({"start": 18, "end": 27, "name": "bold", "value": "True"}, node["annotations"])
    self.assertIn({"start": 28, "end": 39, "name": "italic", "value": "True"}, node["annotations"])
    self.assertIn({"start": 40, "end": 55, "name": "underlined", "value": "True"}, node["annotations"])
    self.assertIn({"start": 56, "end": 67, "name": "strike", "value": "True"}, node["annotations"])
    self.assertIn({"start": 68, "end": 79, "name": "superscript", "value": "True"}, node["annotations"])
    self.assertIn({"start": 81, "end": 90, "name": "subscript", "value": "True"}, node["annotations"])

    # a line where several annotations cover the same span (the alignment annotation is one character longer)
    node = self._get_by_tree_path(structure, "0.6")
    self.assertIn({"start": 0, "end": 12, "name": "bold", "value": "True"}, node["annotations"])
    self.assertIn({"start": 0, "end": 12, "name": "italic", "value": "True"}, node["annotations"])
    self.assertIn({"start": 0, "end": 12, "name": "underlined", "value": "True"}, node["annotations"])
    self.assertIn({"start": 0, "end": 12, "name": "size", "value": "20.0"}, node["annotations"])
    self.assertIn({"start": 0, "end": 13, "name": "alignment", "value": "right"}, node["annotations"])

    # Test tables
    tables = result["content"]["tables"]
    self.assertEqual(1, len(tables))
    table = tables[0]
    # the line at path "0.4" should refer to the table by its uid
    node = self._get_by_tree_path(structure, "0.4")
    annotations = [annotation for annotation in node["annotations"] if annotation["name"] == "table"]
    self.assertEqual(table["metadata"]["uid"], annotations[0]["value"])
    # all rows should have the same number of cells
    column_number = len(table["cells"][0])
    for table_row in table["cells"]:
        self.assertEqual(column_number, len(table_row))

    # merged cells: the first cell of a merged group is visible and holds the span,
    # the continuation cell repeats the text and is invisible with rowspan=colspan=1
    cell = table["cells"][0][0]
    self.assertEqual("Horizontally merged cells\n", cell["lines"][0]["text"])
    self.assertEqual(1, cell["rowspan"])
    self.assertEqual(2, cell["colspan"])
    self.assertEqual(False, cell["invisible"])
    cell = table["cells"][0][1]
    self.assertEqual("Horizontally merged cells\n", cell["lines"][0]["text"])
    self.assertEqual(1, cell["rowspan"])
    self.assertEqual(1, cell["colspan"])
    self.assertEqual(True, cell["invisible"])

    cell = table["cells"][1][2]
    self.assertEqual("Vertically merged cells\n", cell["lines"][0]["text"])
    self.assertEqual(2, cell["rowspan"])
    self.assertEqual(1, cell["colspan"])
    self.assertEqual(False, cell["invisible"])
    cell = table["cells"][2][2]
    self.assertEqual("Vertically merged cells\n", cell["lines"][0]["text"])
    self.assertEqual(1, cell["rowspan"])
    self.assertEqual(1, cell["colspan"])
    self.assertEqual(True, cell["invisible"])

    cell = table["cells"][2][0]
    self.assertEqual("Vertically merged cells 2\n", cell["lines"][0]["text"])
    self.assertEqual(2, cell["rowspan"])
    self.assertEqual(1, cell["colspan"])
    self.assertEqual(False, cell["invisible"])
    cell = table["cells"][3][0]
    self.assertEqual("Vertically merged cells 2\n", cell["lines"][0]["text"])
    self.assertEqual(1, cell["rowspan"])
    self.assertEqual(1, cell["colspan"])
    self.assertEqual(True, cell["invisible"])

    cell = table["cells"][3][2]
    self.assertEqual("Horizontally merged cells 2\n", cell["lines"][0]["text"])
    self.assertEqual(1, cell["rowspan"])
    self.assertEqual(3, cell["colspan"])
    self.assertEqual(False, cell["invisible"])
    cell = table["cells"][3][3]
    self.assertEqual("Horizontally merged cells 2\n", cell["lines"][0]["text"])
    self.assertEqual(1, cell["rowspan"])
    self.assertEqual(1, cell["colspan"])
    self.assertEqual(True, cell["invisible"])

    # Test attachments
    self.assertEqual(3, len(result["attachments"]))
    # every attachment annotation of a line should refer to the uid of one of the returned attachments
    attachment_uids = {attachment["metadata"]["uid"] for attachment in result["attachments"]}
    node = self._get_by_tree_path(structure, "0.6")
    annotations = [annotation["value"] for annotation in node["annotations"] if annotation["name"] == "attachment"]
    self.assertIn(annotations[0], attachment_uids)
    self.assertIn(annotations[1], attachment_uids)
    node = self._get_by_tree_path(structure, "0.8.0")
    self.assertEqual("Text text\n", node["text"])
    annotations = [annotation["value"] for annotation in node["annotations"] if annotation["name"] == "attachment"]
    self.assertIn(annotations[0], attachment_uids)
self.assertListEqual(["Some content\n", "A\n", "B\n", "C\n"], self._get_text_of_row(table["cells"][1])) table_annotations = [ann for ann in subparagraphs[2]["annotations"] if ann["name"] == TableAnnotation.name] self.assertEqual(1, len(table_annotations)) diff --git a/tests/data/pptx/test-presentation.pptx b/tests/data/pptx/test-presentation.pptx new file mode 100644 index 00000000..97eaf6a9 Binary files /dev/null and b/tests/data/pptx/test-presentation.pptx differ