Skip to content

Commit

Permalink
Change detection of libretexts.org special pages and add log
Browse files Browse the repository at this point in the history
  • Loading branch information
benoit74 committed Nov 26, 2024
1 parent dfe4246 commit a86df2c
Showing 1 changed file with 15 additions and 9 deletions.
24 changes: 15 additions & 9 deletions scraper/src/mindtouch2zim/processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -488,22 +488,28 @@ def _process_page(
rewriten = None
# Handle special rewriting of special libretexts.org pages
if CONTEXT.library_url.endswith(".libretexts.org"):
# back-matter special pages on libretexts.org, e.g. "Courses/California_Stat
# e_University_Los_Angeles/Book:_An_Introduction_to_Geology_(Johnson_Affolte
# r_Inkenbrandt_and_Mosher)/zz:_Back_Matter/20:_Glossary", running at https:
# //geo.libretexts.org/Courses/California_State_University_Los_Angeles/Book%
# 3A_An_Introduction_to_Geology_(Johnson_Affolter_Inkenbrandt_and_Mosher)/zz
# %3A_Back_Matter/20%3A_Glossary
# same kind of pattern works for glossary, index, ... pages
# Detect back-matter special pages on libretexts.org (index, glossary) by
# looking for their characteristic script markers in the page HTML body
try:
if re.match(r"^.*\/zz:_[^\/]*?\/10:_[^\/]*$", page.path):
if (
"https://cdn.libretexts.net/github/LibreTextsMain/Leo "
"Jayachandran/DynamicIndex/dynamicIndexMaker.js"
in page_content.html_body
):
logger.debug(

Check warning on line 499 in scraper/src/mindtouch2zim/processor.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/processor.py#L499

Added line #L499 was not covered by tests
f"Rewriting {CONTEXT.processing_step} as libretexts.org index"
)
rewriten = rewrite_index(
rewriter=rewriter,
jinja2_template=self.libretexts_index_template,
mindtouch_client=self.mindtouch_client,
page=page,
)
elif re.match(r"^.*\/zz:_[^\/]*?\/20:_[^\/]*$", page.path):
elif "new LibreTextsGlossarizer()" in page_content.html_body:
logger.debug(

Check warning on line 509 in scraper/src/mindtouch2zim/processor.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/processor.py#L509

Added line #L509 was not covered by tests
f"Rewriting {CONTEXT.processing_step} as libretexts.org "
"glossary"
)
rewriten = rewrite_glossary(
jinja2_template=self.libretexts_glossary_template,
original_content=page_content.html_body,
Expand Down

0 comments on commit a86df2c

Please sign in to comment.