Skip to content

Commit

Permalink
Improve preserve_heading_links function
Browse files Browse the repository at this point in the history
Previously, it would only grab a single link preceding a heading, but
now it will work with any number of links preceding a heading.

For example, this would not have worked previously:

```
<a id="_Sustainability_1"></a>
<a id="_Sustainability_2"></a>
<h5>Sustainability</h5>
```

And now it does. 👍
  • Loading branch information
pcraig3 committed Sep 13, 2024
1 parent e95c805 commit b1b82a5
Show file tree
Hide file tree
Showing 4 changed files with 39 additions and 21 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ The format is based on Keep a Changelog, and this project adheres to Semantic Ve
- "Preview" link in markdown editor has "USA Blue" link text
- Specify exact Python version in Dockerfile
- Fix 2 broken cover images
- `preserve_heading_links` now handles any number of empty anchor links preceding a heading (previously only one)

## [1.25.0] - 2023-08-27

Expand Down
36 changes: 18 additions & 18 deletions bloom_nofos/nofos/nofo.py
Original file line number Diff line number Diff line change
Expand Up @@ -1347,45 +1347,45 @@ def _transfer_id_and_decompose(heading, anchor):
# List of all heading tags to check
heading_tags = ["h1", "h2", "h3", "h4", "h5", "h6"]

# Store found empty anchors
heading_id_anchors = []
preceding_id_anchors = []

# Check each heading tag
for tag in heading_tags:
# Find all heading tags in the document
headings = soup.find_all(tag)

# Check each heading
for heading in headings:
# Store found empty anchors
heading_id_anchors = []
preceding_id_anchors = []

# Find empty <a> tags immediately preceding the heading
previous_sibling = heading.find_previous_sibling()
if (

# Traverse back through preceding siblings
while (
previous_sibling
and previous_sibling.name == "a"
and not previous_sibling.get_text(strip=True)
and not previous_sibling.contents
):
preceding_id_anchors.append(previous_sibling)
previous_sibling = previous_sibling.find_previous_sibling()

# Process all preceding empty <a> tags
for preceding_anchor in reversed(preceding_id_anchors):
if not preceding_anchor.text.strip():
_transfer_id_and_decompose(heading=heading, anchor=preceding_anchor)

# Find direct child anchor tags that are empty
for a in heading.find_all("a", recursive=False):
if a.get_text(strip=True) == "" and not a.contents:
heading_id_anchors.append(a)

for preceding_anchor in preceding_id_anchors:
# Check if the <a> tag is empty
if not preceding_anchor.text.strip():
_transfer_id_and_decompose(
heading=preceding_anchor.find_next_sibling(), anchor=preceding_anchor
)

for heading_anchor in heading_id_anchors:
# Check if the <a> tag is empty
if not heading_anchor.text.strip():
_transfer_id_and_decompose(
heading=heading_anchor.parent, anchor=heading_anchor
)
# Process all internal empty <a> tags
for heading_anchor in heading_id_anchors:
# Check if the <a> tag is empty
if not heading_anchor.text.strip():
_transfer_id_and_decompose(heading=heading, anchor=heading_anchor)


def preserve_table_heading_links(soup):
Expand Down
16 changes: 13 additions & 3 deletions bloom_nofos/nofos/test_nofo.py
Original file line number Diff line number Diff line change
Expand Up @@ -3567,14 +3567,24 @@ def test_empty_anchor_wrapped_does_not_become_heading_id(self):
result = str(soup)
self.assertEqual(result, html)

def test_two_empty_anchors_preceding_heading_id_only_uses_first_anchor(
def test_two_empty_anchors_preceding_heading_id(
self,
):
html = '<a id="_About_Priority_Populations_1"></a><a id="_About_Priority_Populations_2"></a><h4>About priority populations</h4>'
html = '<a id="_About_Priority_Populations_1"></a><a id="_About_Priority_Populations_2"></a><h4>About priority populations</h4><p><a href="#_About_Priority_Populations_1">About 1</a></p><p><a href="#_About_Priority_Populations_2">About 2</a></p>'
soup = BeautifulSoup(html, "html.parser")
preserve_heading_links(soup)
result = str(soup)
expected = '<a id="_About_Priority_Populations_1"></a><h4 id="_About_Priority_Populations_2">About priority populations</h4>'
expected = '<h4 id="_About_Priority_Populations_2">About priority populations</h4><p><a href="#_About_Priority_Populations_2">About 1</a></p><p><a href="#_About_Priority_Populations_2">About 2</a></p>'
self.assertEqual(result, expected)

def test_three_empty_anchors_preceding_heading_id(
    self,
):
    """Three empty anchors before a heading all collapse onto the heading.

    Input has three empty ``<a id=...>`` tags immediately preceding an
    ``<h4>``, plus three body links, one targeting each anchor id.
    After ``preserve_heading_links`` runs, the empty anchors are removed,
    the heading carries the last anchor's id (``..._3``), and all three
    body links are rewritten to point at that surviving id.
    """
    # Three consecutive empty anchors precede the heading; each has a
    # distinct id that is referenced by a link later in the document.
    html = '<a id="_About_Priority_Populations_1"></a><a id="_About_Priority_Populations_2"></a><a id="_About_Priority_Populations_3"></a><h4>About priority populations</h4><p><a href="#_About_Priority_Populations_1">About 1</a></p><p><a href="#_About_Priority_Populations_2">About 2</a></p><p><a href="#_About_Priority_Populations_3">About 3</a></p>'
    soup = BeautifulSoup(html, "html.parser")
    preserve_heading_links(soup)
    result = str(soup)
    # Expected: anchors gone, heading id is "_3", and every link — even the
    # ones that originally targeted "_1" and "_2" — now points at "_3".
    expected = '<h4 id="_About_Priority_Populations_3">About priority populations</h4><p><a href="#_About_Priority_Populations_3">About 1</a></p><p><a href="#_About_Priority_Populations_3">About 2</a></p><p><a href="#_About_Priority_Populations_3">About 3</a></p>'
    self.assertEqual(result, expected)

def test_empty_anchor_preceding_heading_id_and_inside_heading_only_uses_inside(
Expand Down
7 changes: 7 additions & 0 deletions bloom_nofos/nofos/views.py
Original file line number Diff line number Diff line change
Expand Up @@ -275,6 +275,13 @@ def nofo_import(request, pk=None):
soup = BeautifulSoup(cleaned_content, "html.parser") # Parse the cleaned HTML
soup = add_body_if_no_body(soup)

# # Specify the output file path
# output_file_path = "debug_output.html"

# # Write the HTML content to the file
# with open(output_file_path, "w", encoding="utf-8") as file:
# file.write(str(soup))

# if there are no h1s, then h2s are the new top
top_heading_level = "h1" if soup.find("h1") else "h2"

Expand Down

0 comments on commit b1b82a5

Please sign in to comment.