Skip to content

Commit

Permalink
bunch of fixes for deep/illegal nesting
Browse files (browse the repository at this point in the history)
  • Loading branch information
drfho committed Dec 5, 2024
1 parent 536da29 commit 6a11368
Show file tree
Hide file tree
Showing 2 changed files with 30 additions and 24 deletions.
Original file line number | Diff line number | Diff line change
Expand Up @@ -44,7 +44,8 @@
# DOCX-Document will be set in main function manage_export_pydocx()
doc = None
# Set local path for docx-template
docx_tmpl = open("/home/zope/src/zms-publishing/ZMS5/Products/zms/conf/metacmd_manager/manage_export_pydocx/neon.docx", "rb")
# docx_tmpl = open("/home/zope/src/zms-publishing/ZMS5/Products/zms/conf/metacmd_manager/manage_export_pydocx/neon.docx", "rb")
docx_tmpl = open("/home/zope/instance/zms4_gez/neon-entw/Extensions/neon.docx", "rb")
# Set initial numbering.num_id for restarting decimal num-lists
num_id = 5

Expand Down Expand Up @@ -223,7 +224,7 @@ def add_hyperlink(docx_block, link_text, url):
# #############################################

# Clean HTML
def clean_html(html):
def clean_html(html, wrap_trailling_text=False):
"""
Clean comments, styles, empty tags
and handle special characters: left-to-right, triangle
Expand All @@ -245,8 +246,9 @@ def clean_html(html):
html = html.replace(left_to_right_char,'')
html = html.replace('[[', triangle_char)
html = html.replace(']]', '')
# Wrap untagged text following a block element into a paragraph
html = re.sub(r'(?i)(?m)(<div.*?>.*?<\/div>)\s*(?=\w)(.*?)', r'\g<1>\n<p>\g<2><p>', html)
if wrap_trailling_text:
# Wrap untagged text following a block element into a paragraph
html = re.sub(r'(?i)(?m)(<div.*?>.*?<\/div>)\s*(?=\w)(.*?)', r'\g<1>\n<p>\g<2><p>', html)
return html

# ADD RUNS TO DOCX-BLOCK
Expand Down Expand Up @@ -279,6 +281,8 @@ def add_runs(docx_block, bs_element):
docx_block.add_run(u'\U0000F021', style='Icon')
elif elrun.has_attr('class') and 'fa-phone' in elrun['class']:
docx_block.add_run(u'\U0000F028', style='Icon')
elif elrun.has_attr('class') and 'fa-exclamation-triangle' in elrun['class']:
docx_block.add_run(u'\U0000F045', style='Icon')
elif elrun.text != '':
docx_block.add_run(elrun.text).italic = True
elif elrun.text != '':
Expand All @@ -297,6 +301,8 @@ def add_runs(docx_block, bs_element):
docx_block.add_run(elrun.text).font.subscript = True
elif elrun.name == 'sup':
docx_block.add_run(elrun.text).font.superscript = True
elif elrun.name == 'u':
docx_block.add_run(elrun.text).underline = True
elif elrun.name == 'a':
if elrun.has_attr('href'):
add_hyperlink(docx_block = docx_block, link_text = elrun.text, url = elrun.get('href'))
Expand Down Expand Up @@ -378,21 +384,12 @@ def add_htmlblock_to_docx(zmscontext, docx_doc, htmlblock, zmsid=None, zmsmetaid
if c==0 and zmsid:
prepend_bookmark(p, zmsid)
else:
# #############################################
# INLINE ELEMENTS not nested by a block element
# just following a text element
# #############################################
# orphaned_inline_elements = ['strong','b','em','i','q','quote','samp','code','tt','var','kbd','sub','sup']
# if element.name in orphaned_inline_elements and element.find_parent() == soup:
# p.add_run(' ')
# add_runs(p, element)

# #############################################
# BLOCK-Elements, element.name != None
# ---------------------------------------------
# HEADINGS
# #############################################
if element.name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
if element.name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'h7', 'h8']:
heading_level = int(element.name[1])
heading_text = standard.pystr(element.text).strip()
p = add_heading(docx_doc, heading_text, level=heading_level)
Expand Down Expand Up @@ -508,7 +505,7 @@ def add_list(docx_obj, element, level=0, c=0):
# ------------------------------------------------
def convert_cell_html_to_docx(zmscontext, docx_cell, text_style='Normal'):
'''Convert cell html to docx'''
cl_html = clean_html(docx_cell.text)
cl_html = clean_html(docx_cell.text, wrap_trailling_text=True)
cl_type = cl_html.startswith('[th:') and 'th' or 'td'
cl_html = re.sub(r'\[(th|td):\d:\d\] ','',cl_html)
cl = BeautifulSoup(cl_html, 'html.parser')
Expand Down Expand Up @@ -639,7 +636,7 @@ def add_list(docx_obj, element, level=0, c=0):
add_runs(docx_block = p, bs_element = element)
else:
child_tags = [e.name for e in element.children if e.name]
if {'em','strong','i', 'span'} & set(child_tags):
if {'em','strong','i','span','u'} & set(child_tags):
p = docx_doc.add_paragraph()
if c==1 and zmsid:
prepend_bookmark(p, zmsid)
Expand All @@ -659,8 +656,15 @@ def add_list(docx_obj, element, level=0, c=0):
# p.add_run('Routing: ')
p.add_run(u'\U0000F028', style='Icon')
p.add_run(' ')
elif 'fa-exclamation-triangle' in class_name:
# p.add_run('Kommentar: ')
p.add_run(u'\U0000F045', style='Icon')
p.add_run(' ')
if list(e.children)!=[]:
add_runs(docx_block = p, bs_element = e)
if [ch.name for ch in e.children if ch.name in ['p', 'ol', 'ul', 'div']]:
add_htmlblock_to_docx(zmscontext, docx_doc, standard.pystr(e), zmsid)
else:
add_runs(docx_block = p, bs_element = e)
else:
p.add_run(standard.pystr(e.text))
elif e.name:
Expand Down Expand Up @@ -1267,7 +1271,7 @@ def manage_export_pydocx(self, save_file=True, file_name=None):
prepend_bookmark(p, block['id'])
# #############################################
# [5] TEXT-BLOCK with given block format (style)
elif v and block['docx_format'] in [e.name for e in doc.styles]:
elif v and ( block['docx_format'] in [e.name for e in doc.styles] or block['docx_format'] in [e.name.replace(' ','') for e in doc.styles] ):
p = doc.add_paragraph(v, style=block['docx_format'])
prepend_bookmark(p, block['id'])
elif v:
Expand Down
Original file line number | Diff line number | Diff line change
Expand Up @@ -28,12 +28,14 @@
"body" : "Normal",
"blockquote" : "Quote",
"caption" : "Caption",
"headline_1" : "Heading1",
"headline_2" : "Heading2",
"headline_3" : "Heading3",
"headline_4" : "Heading4",
"headline_5" : "Heading5",
"headline_6" : "Heading6",
"headline_1" : "Heading 1",
"headline_2" : "Heading 2",
"headline_3" : "Heading 3",
"headline_4" : "Heading 4",
"headline_5" : "Heading 5",
"headline_6" : "Heading 6",
"headline_7" : "Heading 7",
"headline_8" : "Heading 8",
"ordered_list" : "ListBullet",
"unordered_list" : "ListBullet",
"plain_html": "html",
Expand Down

0 comments on commit 6a11368

Please sign in to comment.