Skip to content

Commit

Permalink
Improve preserve_heading_links function
Browse files Browse the repository at this point in the history
Previously, it would only grab a single link preceding a heading, but
now it will work with any number of links preceding a heading.

For example, this would not have worked previously:

```
<a id="_Sustainability_1"></a>
<a id="_Sustainability_2"></a>
<h5>Sustainability</h5>
```

And now it does. 👍
  • Loading branch information
pcraig3 committed Sep 13, 2024
1 parent e95c805 commit b1b82a5
Show file tree
Hide file tree
Showing 4 changed files with 39 additions and 21 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ The format is based on Keep a Changelog, and this project adheres to Semantic Ve
- "Preview" link in markdown editor has "USA Blue" link text
- Specify exact Python version in Dockerfile
- Fix 2 broken cover images
- `preserve_heading_links` now handles any number of empty anchor links preceding a heading (previously only one)

## [1.25.0] - 2023-08-27

Expand Down
36 changes: 18 additions & 18 deletions bloom_nofos/nofos/nofo.py
Original file line number Diff line number Diff line change
Expand Up @@ -1347,45 +1347,45 @@ def _transfer_id_and_decompose(heading, anchor):
# List of all heading tags to check
heading_tags = ["h1", "h2", "h3", "h4", "h5", "h6"]

# Store found empty anchors
heading_id_anchors = []
preceding_id_anchors = []

# Check each heading tag
for tag in heading_tags:
# Find all heading tags in the document
headings = soup.find_all(tag)

# Check each heading
for heading in headings:
# Store found empty anchors
heading_id_anchors = []
preceding_id_anchors = []

# Find empty <a> tags immediately preceding the heading
previous_sibling = heading.find_previous_sibling()
if (

# Traverse back through preceding siblings
while (
previous_sibling
and previous_sibling.name == "a"
and not previous_sibling.get_text(strip=True)
and not previous_sibling.contents
):
preceding_id_anchors.append(previous_sibling)
previous_sibling = previous_sibling.find_previous_sibling()

# Process all preceding empty <a> tags
for preceding_anchor in reversed(preceding_id_anchors):
if not preceding_anchor.text.strip():
_transfer_id_and_decompose(heading=heading, anchor=preceding_anchor)

# Find direct child anchor tags that are empty
for a in heading.find_all("a", recursive=False):
if a.get_text(strip=True) == "" and not a.contents:
heading_id_anchors.append(a)

for preceding_anchor in preceding_id_anchors:
# Check if the <a> tag is empty
if not preceding_anchor.text.strip():
_transfer_id_and_decompose(
heading=preceding_anchor.find_next_sibling(), anchor=preceding_anchor
)

for heading_anchor in heading_id_anchors:
# Check if the <a> tag is empty
if not heading_anchor.text.strip():
_transfer_id_and_decompose(
heading=heading_anchor.parent, anchor=heading_anchor
)
# Process all internal empty <a> tags
for heading_anchor in heading_id_anchors:
# Check if the <a> tag is empty
if not heading_anchor.text.strip():
_transfer_id_and_decompose(heading=heading, anchor=heading_anchor)


def preserve_table_heading_links(soup):
Expand Down
16 changes: 13 additions & 3 deletions bloom_nofos/nofos/test_nofo.py
Original file line number Diff line number Diff line change
Expand Up @@ -3567,14 +3567,24 @@ def test_empty_anchor_wrapped_does_not_become_heading_id(self):
result = str(soup)
self.assertEqual(result, html)

def test_two_empty_anchors_preceding_heading_id_only_uses_first_anchor(
def test_two_empty_anchors_preceding_heading_id(
self,
):
html = '<a id="_About_Priority_Populations_1"></a><a id="_About_Priority_Populations_2"></a><h4>About priority populations</h4>'
html = '<a id="_About_Priority_Populations_1"></a><a id="_About_Priority_Populations_2"></a><h4>About priority populations</h4><p><a href="#_About_Priority_Populations_1">About 1</a></p><p><a href="#_About_Priority_Populations_2">About 2</a></p>'
soup = BeautifulSoup(html, "html.parser")
preserve_heading_links(soup)
result = str(soup)
expected = '<a id="_About_Priority_Populations_1"></a><h4 id="_About_Priority_Populations_2">About priority populations</h4>'
expected = '<h4 id="_About_Priority_Populations_2">About priority populations</h4><p><a href="#_About_Priority_Populations_2">About 1</a></p><p><a href="#_About_Priority_Populations_2">About 2</a></p>'
self.assertEqual(result, expected)

def test_three_empty_anchors_preceding_heading_id(
    self,
):
    """Three empty anchors before a heading all collapse onto the heading.

    Input has three empty ``<a id=...>`` tags immediately preceding an
    ``<h4>``, plus three body links, one targeting each anchor id.
    After ``preserve_heading_links`` runs, the empty anchors are removed,
    the heading carries the last anchor's id (``..._3``), and all three
    body links are rewritten to point at that surviving id.
    """
    # Three consecutive empty anchors precede the heading; each has a
    # distinct id that is referenced by a link later in the document.
    html = '<a id="_About_Priority_Populations_1"></a><a id="_About_Priority_Populations_2"></a><a id="_About_Priority_Populations_3"></a><h4>About priority populations</h4><p><a href="#_About_Priority_Populations_1">About 1</a></p><p><a href="#_About_Priority_Populations_2">About 2</a></p><p><a href="#_About_Priority_Populations_3">About 3</a></p>'
    soup = BeautifulSoup(html, "html.parser")
    preserve_heading_links(soup)
    result = str(soup)
    # Expected: anchors gone, heading id is "_3", and every link — even the
    # ones that originally targeted "_1" and "_2" — now points at "_3".
    expected = '<h4 id="_About_Priority_Populations_3">About priority populations</h4><p><a href="#_About_Priority_Populations_3">About 1</a></p><p><a href="#_About_Priority_Populations_3">About 2</a></p><p><a href="#_About_Priority_Populations_3">About 3</a></p>'
    self.assertEqual(result, expected)

def test_empty_anchor_preceding_heading_id_and_inside_heading_only_uses_inside(
Expand Down
7 changes: 7 additions & 0 deletions bloom_nofos/nofos/views.py
Original file line number Diff line number Diff line change
Expand Up @@ -275,6 +275,13 @@ def nofo_import(request, pk=None):
soup = BeautifulSoup(cleaned_content, "html.parser") # Parse the cleaned HTML
soup = add_body_if_no_body(soup)

# # Specify the output file path
# output_file_path = "debug_output.html"

# # Write the HTML content to the file
# with open(output_file_path, "w", encoding="utf-8") as file:
# file.write(str(soup))

# if there are no h1s, then h2s are the new top
top_heading_level = "h1" if soup.find("h1") else "h2"

Expand Down

0 comments on commit b1b82a5

Please sign in to comment.