From b1b82a5e1a36b77c889e4e5709fb83eac24fbaac Mon Sep 17 00:00:00 2001 From: Paul Craig Date: Fri, 13 Sep 2024 09:53:42 -0400 Subject: [PATCH] Improve preserve_heading_links function MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previously, it would only grab a single link preceding a heading, but now it will work with any number of links preceding a heading. For example, this would not have worked previously: ```
<a id="_Sustainability"></a><a id="_Sustainability_2"></a><h3>Sustainability</h3>
``` And now it does. 👍 --- CHANGELOG.md | 1 + bloom_nofos/nofos/nofo.py | 36 +++++++++++++++++----------------- bloom_nofos/nofos/test_nofo.py | 16 ++++++++++++--- bloom_nofos/nofos/views.py | 7 +++++++ 4 files changed, 39 insertions(+), 21 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 38893aa1..1b80f0a6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -27,6 +27,7 @@ The format is based on Keep a Changelog, and this project adheres to Semantic Ve - "Preview" link in markdown editor has "USA Blue" link text - Specify exact Python version in Dockerfile - Fix 2 broken cover images +- preserve_heading_links function now accounts for multiple links preceding headings ## [1.25.0] - 2023-08-27 diff --git a/bloom_nofos/nofos/nofo.py b/bloom_nofos/nofos/nofo.py index 82931dd7..fc99bdcd 100644 --- a/bloom_nofos/nofos/nofo.py +++ b/bloom_nofos/nofos/nofo.py @@ -1347,10 +1347,6 @@ def _transfer_id_and_decompose(heading, anchor): # List of all heading tags to check heading_tags = ["h1", "h2", "h3", "h4", "h5", "h6"] - # Store found empty anchors - heading_id_anchors = [] - preceding_id_anchors = [] - # Check each heading tag for tag in heading_tags: # Find all heading tags in the document @@ -1358,34 +1354,38 @@ def _transfer_id_and_decompose(heading, anchor): # Check each heading for heading in headings: + # Store found empty anchors + heading_id_anchors = [] + preceding_id_anchors = [] + # Find empty tags immediately preceding the heading previous_sibling = heading.find_previous_sibling() - if ( + + # Traverse back through preceding siblings + while ( previous_sibling and previous_sibling.name == "a" and not previous_sibling.get_text(strip=True) and not previous_sibling.contents ): preceding_id_anchors.append(previous_sibling) + previous_sibling = previous_sibling.find_previous_sibling() + + # Process all preceding empty tags + for preceding_anchor in reversed(preceding_id_anchors): + if not preceding_anchor.text.strip(): + 
_transfer_id_and_decompose(heading=heading, anchor=preceding_anchor) # Find direct child anchor tags that are empty for a in heading.find_all("a", recursive=False): if a.get_text(strip=True) == "" and not a.contents: heading_id_anchors.append(a) - for preceding_anchor in preceding_id_anchors: - # Check if the tag is empty - if not preceding_anchor.text.strip(): - _transfer_id_and_decompose( - heading=preceding_anchor.find_next_sibling(), anchor=preceding_anchor - ) - - for heading_anchor in heading_id_anchors: - # Check if the tag is empty - if not heading_anchor.text.strip(): - _transfer_id_and_decompose( - heading=heading_anchor.parent, anchor=heading_anchor - ) + # Process all internal empty tags + for heading_anchor in heading_id_anchors: + # Check if the tag is empty + if not heading_anchor.text.strip(): + _transfer_id_and_decompose(heading=heading, anchor=heading_anchor) def preserve_table_heading_links(soup): diff --git a/bloom_nofos/nofos/test_nofo.py b/bloom_nofos/nofos/test_nofo.py index 4b918301..afd53758 100644 --- a/bloom_nofos/nofos/test_nofo.py +++ b/bloom_nofos/nofos/test_nofo.py @@ -3567,14 +3567,24 @@ def test_empty_anchor_wrapped_does_not_become_heading_id(self): result = str(soup) self.assertEqual(result, html) - def test_two_empty_anchors_preceding_heading_id_only_uses_first_anchor( + def test_two_empty_anchors_preceding_heading_id( self, ): - html = '

About priority populations

' + html = '

About priority populations

About 1

About 2

' soup = BeautifulSoup(html, "html.parser") preserve_heading_links(soup) result = str(soup) - expected = '

About priority populations

' + expected = '

About priority populations

About 1

About 2

' + self.assertEqual(result, expected) + + def test_three_empty_anchors_preceding_heading_id( + self, + ): + html = '

About priority populations

About 1

About 2

About 3

' + soup = BeautifulSoup(html, "html.parser") + preserve_heading_links(soup) + result = str(soup) + expected = '

About priority populations

About 1

About 2

About 3

' self.assertEqual(result, expected) def test_empty_anchor_preceding_heading_id_and_inside_heading_only_uses_inside( diff --git a/bloom_nofos/nofos/views.py b/bloom_nofos/nofos/views.py index 374fdeef..b3ba5864 100644 --- a/bloom_nofos/nofos/views.py +++ b/bloom_nofos/nofos/views.py @@ -275,6 +275,13 @@ def nofo_import(request, pk=None): soup = BeautifulSoup(cleaned_content, "html.parser") # Parse the cleaned HTML soup = add_body_if_no_body(soup) + # # Specify the output file path + # output_file_path = "debug_output.html" + + # # Write the HTML content to the file + # with open(output_file_path, "w", encoding="utf-8") as file: + # file.write(str(soup)) + # if there are no h1s, then h2s are the new top top_heading_level = "h1" if soup.find("h1") else "h2"