Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Speed up NOFO importing #134

Merged
merged 9 commits into from
Dec 24, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 11 additions & 2 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,14 @@ The format is based on Keep a Changelog, and this project adheres to Semantic Ve

### Added

### Changed

### Fixed

## [1.42.0] - 2024-12-24

### Added

- Add cover image for CDC-RFA-IP-25-0007
- Add cover image for CMS-2V2-25-001
- Also add inline images
Expand All @@ -20,8 +28,9 @@ The format is based on Keep a Changelog, and this project adheres to Semantic Ve
- "Standard" icons are blue
- Tighter line-height for h5s
- Smaller, bolder h7s
- TEMP: Double the app's timeout time
- TODO: Remove once import bottleneck is improved
- Speed up "add_headings_to_nofo" function
- Precompile regex patterns for heading ID substitution
- Batch update sections and subsections

### Fixed

Expand Down
2 changes: 1 addition & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -36,4 +36,4 @@ RUN /root/.local/bin/poetry run python bloom_nofos/manage.py collectstatic --noi

EXPOSE $PORT

CMD ["sh", "-c", "/root/.local/bin/poetry run gunicorn --workers 8 --timeout 179 --chdir bloom_nofos --bind 0.0.0.0:$PORT bloom_nofos.wsgi:application"]
CMD ["sh", "-c", "/root/.local/bin/poetry run gunicorn --workers 8 --timeout 89 --chdir bloom_nofos --bind 0.0.0.0:$PORT bloom_nofos.wsgi:application"]
1 change: 1 addition & 0 deletions bloom_nofos/bloom_nofos/urls.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
from django.urls import include, path, re_path
from django.views.generic.base import RedirectView
from nofos.api.api import api

from . import views

handler404 = views.page_not_found
Expand Down
11 changes: 6 additions & 5 deletions bloom_nofos/nofos/api/api.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
from ninja import NinjaAPI, Schema
from ninja.security import HttpBearer
from django.conf import settings
from django.core.exceptions import ValidationError
from .schemas import NofoSchema, ErrorSchema, SuccessSchema
from ninja import NinjaAPI
from ninja.security import HttpBearer
from nofos.models import Nofo
from nofos.views import _build_nofo
from django.conf import settings
from nofos.nofo import _build_nofo

from .schemas import ErrorSchema, NofoSchema, SuccessSchema


class BearerAuth(HttpBearer):
Expand Down
3 changes: 2 additions & 1 deletion bloom_nofos/nofos/api/schemas.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from ninja import ModelSchema, Schema
from typing import List, Optional

from ninja import ModelSchema, Schema
from nofos.models import Nofo, Section, Subsection


Expand Down
7 changes: 4 additions & 3 deletions bloom_nofos/nofos/api/tests.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
from django.test import TestCase, override_settings
from django.conf import settings
from nofos.models import Nofo, Section, Subsection
import json
import os

from django.conf import settings
from django.test import TestCase, override_settings
from nofos.models import Nofo, Section, Subsection


@override_settings(API_TOKEN="test-token-for-ci")
class NofoAPITest(TestCase):
Expand Down
52 changes: 35 additions & 17 deletions bloom_nofos/nofos/nofo.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,9 @@ def replace_links(file_content):
@transaction.atomic
def add_headings_to_nofo(nofo):
new_ids = []
# collect sections and subsections in lists to facilitate bulk updating
sections_to_update = []
subsections_to_update = []
# add counter because subheading titles can repeat, resulting in duplicate IDs
counter = 1

Expand All @@ -87,7 +90,7 @@ def add_headings_to_nofo(nofo):
if not section.html_id or len(section.html_id) == 0:
raise ValueError("html_id blank for section: {}".format(section.name))

section.save()
sections_to_update.append(section)

# add ids to all subsection headings
for subsection in section.subsections.all():
Expand All @@ -104,31 +107,46 @@ def add_headings_to_nofo(nofo):
)

subsection.html_id = subsection_id
subsection.save()
subsections_to_update.append(subsection)
counter += 1

# Bulk update sections and subsections
Section.objects.bulk_update(sections_to_update, ["html_id"])
Subsection.objects.bulk_update(subsections_to_update, ["html_id"])
# Reset the subsections list to avoid duplication
subsections_to_update = []

# Precompile regex patterns for all new_ids
compiled_patterns = [
{
# Case-insensitive match to replace old_id value with new_id in hrefs
"href_pattern": re.compile(
r'href="#{}"'.format(re.escape(ids["old_id"])), re.IGNORECASE
),
# Case-insensitive pattern matching old_id in parenthesised hash links, i.e. "(#old_id)"
"hash_pattern": re.compile(
r"\(#{}\)".format(re.escape(ids["old_id"])), re.IGNORECASE
),
"new_id": ids["new_id"],
}
for ids in new_ids
]

# replace all old ids with new ids
for section in nofo.sections.all():
for subsection in section.subsections.all():
for ids in new_ids:
href_pattern = re.compile(
r'href="#{}"'.format(re.escape(ids["old_id"])), re.IGNORECASE
for patterns in compiled_patterns:
# Use precompiled patterns
subsection.body = patterns["href_pattern"].sub(
'href="#{}"'.format(patterns["new_id"]), subsection.body
)
# Case-insensitive match to replace old_id value with new_id in hrefs
subsection.body = href_pattern.sub(
'href="#{}"'.format(ids["new_id"]), subsection.body
subsection.body = patterns["hash_pattern"].sub(
"(#{})".format(patterns["new_id"]), subsection.body
)

# Pattern to match old_id in hash links (like anchor links) case insensitively
hash_pattern = re.compile(
r"\(#{}\)".format(re.escape(ids["old_id"])), re.IGNORECASE
)
# Replace old_id with new_id in hash links
subsection.body = hash_pattern.sub(
"(#{})".format(ids["new_id"]), subsection.body
)
subsections_to_update.append(subsection)

subsection.save()
Subsection.objects.bulk_update(subsections_to_update, ["body"])


def add_page_breaks_to_headings(nofo):
Expand Down
9 changes: 2 additions & 7 deletions bloom_nofos/nofos/views.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
import io
import json

import docraptor
import mammoth
Expand All @@ -11,10 +10,9 @@
from django.core.exceptions import PermissionDenied, ValidationError
from django.db import transaction
from django.db.models import F
from django.forms.models import model_to_dict
from django.http import Http404, HttpResponse, HttpResponseBadRequest, JsonResponse
from django.http import HttpResponse, HttpResponseBadRequest
from django.shortcuts import get_object_or_404, redirect, render
from django.urls import reverse, reverse_lazy
from django.urls import reverse_lazy
from django.utils import dateformat, timezone
from django.views.generic import (
CreateView,
Expand All @@ -23,7 +21,6 @@
FormView,
ListView,
UpdateView,
View,
)

from bloom_nofos.utils import cast_to_boolean, is_docraptor_live_mode_active
Expand Down Expand Up @@ -57,7 +54,6 @@
)
from .models import THEME_CHOICES, Nofo, Section, Subsection
from .nofo import (
_build_nofo,
add_body_if_no_body,
add_em_to_de_minimis,
add_endnotes_header_if_exists,
Expand Down Expand Up @@ -306,7 +302,6 @@ def nofo_import(request, pk=None):

# replace problematic characters/links on import
cleaned_content = replace_links(replace_chars(file_content))

soup = BeautifulSoup(cleaned_content, "html.parser") # Parse the cleaned HTML
soup = add_body_if_no_body(soup)

Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "bloom-nofos"
version = "1.41.0"
version = "1.42.0"
description = "the no-code solo nofo web flow"
authors = ["Paul Craig <[email protected]>"]
readme = "README.md"
Expand Down
Loading