-
Notifications
You must be signed in to change notification settings - Fork 22
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Co-authored-by: Andrew Perminov <[email protected]>
- Loading branch information
1 parent
6b84563
commit 370f6ef
Showing
22 changed files
with
672 additions
and
111 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1 +1 @@ | ||
2.2.2 | ||
2.2.3 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,51 @@ | ||
class NumberingExtractor:
    """
    This class is used to compute numbering text for list items.
    For example: "1.", (i), "○"
    """
    def __init__(self) -> None:
        # Mapping according to the ST_TextAutonumberScheme
        # NOTE we ignore chinese, japanese, hindi, thai
        # The value of each entry is the first symbol of the corresponding sequence.
        self.numbering_types = dict(
            arabic="1",  # 1, 2, 3, ..., 10, 11, 12, ...
            alphaLc="a",  # a, b, c, ..., y, z, aa, bb, cc, ..., yy, zz, aaa, bbb, ccc, ...
            alphaUc="A",  # A, B, C, ..., Y, Z, AA, BB, CC, ..., YY, ZZ, AAA, BBB, CCC, ...
            romanLc="i",  # i, ii, iii, iv, ..., xviii, xix, xx, xxi, ...
            romanUc="I"  # I, II, III, IV, ..., XVIII, XIX, XX, XXI, ...
        )

        # Templates that wrap the computed symbol; every template ends with a space
        self.numbering_formatting = dict(
            ParenBoth="({}) ",
            ParenR="{}) ",
            Period="{}. ",
            Plain="{} "
        )

        # Full scheme name (e.g. "arabicPeriod") -> (numbering type, formatting) pair
        self.combined_types = {
            num_type + num_formatting: (num_type, num_formatting)
            for num_type in self.numbering_types
            for num_formatting in self.numbering_formatting
        }

        # Value/symbol pairs in descending order. The subtractive pairs (900, 400, 90, 40, 9, 4)
        # are required to get "iv" instead of "iiii", "ix" instead of "viiii", etc.
        self.roman_mapping = [
            (1000, "m"), (900, "cm"), (500, "d"), (400, "cd"),
            (100, "c"), (90, "xc"), (50, "l"), (40, "xl"),
            (10, "x"), (9, "ix"), (5, "v"), (4, "iv"), (1, "i")
        ]

    def get_text(self, numbering: str, shift: int) -> str:
        """
        Computes the next item of the list sequence.

        :param numbering: type of the numbering, e.g. "arabicPeriod"; unknown types are treated as "arabicPeriod"
        :param shift: zero-based shift from the beginning of list numbering (0 corresponds to "1", "a", "i")
        :return: string representation of the next numbering item including formatting and a trailing space, e.g. "iv. "
        """
        num_type, num_formatting = self.combined_types.get(numbering, ("arabic", "Period"))

        if num_type in ("alphaLc", "alphaUc"):
            # after "z" the letter is repeated: aa, bb, ..., zz, aaa, ...
            repeats, letter_index = shift // 26 + 1, shift % 26
            num_char = chr(ord(self.numbering_types[num_type]) + letter_index) * repeats
        elif num_type in ("romanLc", "romanUc"):
            # shift is zero-based like in the other branches, so the first item has value 1 ("i")
            remainder = shift + 1
            symbols = []
            for number, letter in self.roman_mapping:
                cnt, remainder = divmod(remainder, number)
                symbols.append(letter * cnt)
            num_char = "".join(symbols)
            if num_type == "romanUc":
                num_char = num_char.upper()
        else:
            num_char = str(int(self.numbering_types["arabic"]) + shift)

        return self.numbering_formatting[num_formatting].format(num_char)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,55 @@ | ||
from bs4 import Tag | ||
|
||
from dedoc.data_structures import AlignmentAnnotation, BoldAnnotation, HierarchyLevel, ItalicAnnotation, LineMetadata, LineWithMeta, SizeAnnotation, \ | ||
StrikeAnnotation, SubscriptAnnotation, SuperscriptAnnotation, UnderlinedAnnotation | ||
from dedoc.readers.pptx_reader.numbering_extractor import NumberingExtractor | ||
from dedoc.readers.pptx_reader.properties_extractor import PropertiesExtractor | ||
from dedoc.utils.annotation_merger import AnnotationMerger | ||
|
||
|
||
class PptxParagraph:
    """
    This class corresponds to one textual paragraph of some entity, e.g. shape or table cell (tag <a:p>).
    """
    def __init__(self, xml: Tag, numbering_extractor: NumberingExtractor, properties_extractor: PropertiesExtractor) -> None:
        """
        :param xml: tag of the paragraph
        :param numbering_extractor: object used to compute the text of numbered list items
        :param properties_extractor: object used to get properties of the paragraph and its runs
        """
        self.xml = xml
        # numbering scheme of the list item, None if the paragraph has no <buAutoNum> tag
        self.numbered_list_type = self.xml.buAutoNum.get("type", "arabicPeriod") if self.xml.buAutoNum else None
        # "lvl" attribute is zero-based, the stored level starts from 1
        self.level = int(self.xml.pPr.get("lvl", 0)) + 1 if self.xml.pPr else 1
        self.numbering_extractor = numbering_extractor
        self.properties_extractor = properties_extractor
        self.annotation_merger = AnnotationMerger()
        annotations = [BoldAnnotation, ItalicAnnotation, UnderlinedAnnotation, StrikeAnnotation, SuperscriptAnnotation, SubscriptAnnotation]
        # annotation names are also used as attribute names of the run properties (see get_line_with_meta)
        self.dict2annotation = {annotation.name: annotation for annotation in annotations}

    def get_line_with_meta(self, page_id: int, line_id: int, is_title: bool, shift: int = 0) -> LineWithMeta:
        """
        Make a line with text, metadata and annotations from the paragraph.

        :param page_id: value for the page_id field of the line metadata
        :param line_id: value for the line_id field of the line metadata
        :param is_title: if True, the line is marked as a header regardless of the paragraph properties
        :param shift: shift from the beginning of list numbering, used only for numbered list items
        :return: line with text ending with a line break, list marker (if any) is included into the text
        """
        text = ""
        paragraph_properties = self.properties_extractor.get_properties(self.xml.pPr, level=self.level)
        hierarchy_level = HierarchyLevel.create_raw_text()

        if is_title or paragraph_properties.title:
            hierarchy_level = HierarchyLevel(line_type=HierarchyLevel.header, level_1=1, level_2=self.level, can_be_multiline=False)
        elif self.numbered_list_type:  # numbered list
            text += self.numbering_extractor.get_text(self.numbered_list_type, shift)
            hierarchy_level = HierarchyLevel(line_type=HierarchyLevel.list_item, level_1=2, level_2=self.level, can_be_multiline=False)
        elif self.xml.buChar:  # bullet list
            text += self.xml.buChar["char"] + " "
            hierarchy_level = HierarchyLevel(line_type=HierarchyLevel.list_item, level_1=3, level_2=self.level, can_be_multiline=False)

        annotations = []
        for run in self.xml.find_all("a:r"):
            run_start = len(text)
            for child in run:
                # take the text of the <a:t> child itself: the text of the whole run would be duplicated
                # for several <a:t> children and may include text nodes located outside <a:t>
                if child.name == "t" and child.text:
                    text += child.text
            run_end = len(text)

            run_properties = self.properties_extractor.get_properties(run.rPr, level=self.level, properties=paragraph_properties)
            annotations.append(SizeAnnotation(start=run_start, end=run_end, value=str(run_properties.size)))
            for property_name, annotation_class in self.dict2annotation.items():
                if getattr(run_properties, property_name):
                    annotations.append(annotation_class(start=run_start, end=run_end, value="True"))

        text = f"{text}\n"
        annotations = self.annotation_merger.merge_annotations(annotations, text)
        # alignment is added after merging and covers the whole line including the line break
        annotations.append(AlignmentAnnotation(start=0, end=len(text), value=paragraph_properties.alignment))
        return LineWithMeta(text, metadata=LineMetadata(page_id=page_id, line_id=line_id, tag_hierarchy_level=hierarchy_level), annotations=annotations)
Oops, something went wrong.