Skip to content

Commit

Permalink
Add support for redirects in meta http-equiv tag
Browse files Browse the repository at this point in the history
  • Loading branch information
benoit74 committed Jul 18, 2024
1 parent 54098f6 commit f25b51e
Show file tree
Hide file tree
Showing 5 changed files with 207 additions and 4 deletions.
32 changes: 32 additions & 0 deletions src/warc2zim/content_rewriting/html.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import io
import re
from collections import namedtuple
from collections.abc import Callable
from dataclasses import dataclass
Expand All @@ -19,6 +20,10 @@

# Named (title, content) pair. The typename must match the name it is bound to
# so that repr() and pickling refer to a name that actually exists.
RewritenHtml = namedtuple("RewritenHtml", ["title", "content"])

# Parses the `content` value of a `<meta http-equiv="refresh">` redirect, e.g.
# "3;url=https://example.com/": group `interval` is the delay and group `url`
# the redirect target, both with surrounding whitespace removed.
# The `url` keyword is matched case-insensitively because pages commonly write
# "URL=" and browsers accept any casing.
HTTP_EQUIV_REDIRECT_RE = re.compile(
    r"^\s*(?P<interval>.*?)\s*;\s*url\s*=\s*(?P<url>.*?)\s*$",
    re.IGNORECASE,
)


def extract_attr(attrs: AttrsList, name: str, default: str | None = None) -> str | None:
"""Get one HTML attribute value if present, else return default value"""
Expand Down Expand Up @@ -624,3 +629,30 @@ def rewrite_js_data(
data,
opts={"isModule": html_rewrite_context == "js-module"},
)


@rules.rewrite_attribute()
def rewrite_meta_http_equiv_redirect(
    tag: str,
    attr_name: str,
    attr_value: str | None,
    attrs: AttrsList,
    url_rewriter: ArticleUrlRewriter,
    base_href: str | None,
) -> AttrNameAndValue | None:
    """Rewrite redirect URL in meta http-equiv refresh

    Only the `content` attribute of a `meta` tag whose `http-equiv` attribute
    is `refresh` (in any letter case) is handled. Its value is parsed with
    HTTP_EQUIV_REDIRECT_RE and the URL part is passed through `url_rewriter`.

    Returns the `(attr_name, new_value)` pair, where `new_value` is
    `<interval>;url=<rewritten url>`, or None when this rule does not apply
    (other tag or attribute, missing or empty value, `http-equiv` absent or
    not `refresh`, or a value that does not parse as a redirect).
    """
    if tag != "meta" or attr_name != "content" or not attr_value:
        return None

    http_equiv = extract_attr(attrs, "http-equiv")
    # http-equiv keywords are ASCII case-insensitive in HTML, so "Refresh" or
    # "REFRESH" must be treated exactly like "refresh".
    if http_equiv is None or http_equiv.lower() != "refresh":
        return None

    match = HTTP_EQUIV_REDIRECT_RE.match(attr_value)
    if match is None:
        # e.g. content="5": a plain page reload, there is no URL to rewrite
        return None

    rewritten_url = url_rewriter(match["url"], base_href=base_href)
    return (attr_name, f"{match['interval']};url={rewritten_url}")
23 changes: 23 additions & 0 deletions test-website/content/http-equiv-redirect.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
<!DOCTYPE html>
<html lang="en">

<head>
<meta charset="utf-8">
<title>Test website</title>
<link rel="apple-touch-icon" sizes="180x180" href="./icons/apple-touch-icon.png">
<link rel="icon" type="image/png" sizes="32x32" href="./icons/favicon-32x32.png">
<link rel="icon" type="image/png" sizes="16x16" href="./icons/favicon-16x16.png">
<link rel="manifest" href="./icons/site.webmanifest">
<link rel="shortcut icon" href="./icons/favicon.ico">
<meta http-equiv="refresh" content="3;url=https://standard_netloc/index.html" />
</head>

<body>

<h2>Redirect with http-equiv meta directive</h2>

<p>You should be redirected to home page in 3 seconds</p>

</body>

</html>
1 change: 1 addition & 0 deletions test-website/content/index.html
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@
<li><a href="./href-to-folder/">links to folder instead of file</a></li>
<li><a href="./redirection-loops.html">Bad redirections loops</a></li>
<li><a href="./content-types/index.html">Handling of content types</a></li>
<li><a href="./http-equiv-redirect.html">Redirect with http-equiv meta directive</a></li>
</ul>
</body>

Expand Down
9 changes: 5 additions & 4 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,9 @@ def no_js_notify_handler(_: str):
class SimpleUrlRewriter(ArticleUrlRewriter):
"""Basic URL rewriter mocking most calls"""

def __init__(self, article_url: HttpUrl):
def __init__(self, article_url: HttpUrl, suffix: str = ""):
self.article_url = article_url
self.suffix = suffix

def __call__(
self,
Expand All @@ -31,7 +32,7 @@ def __call__(
*,
rewrite_all_url: bool = True, # noqa: ARG002
) -> str:
return item_url
return item_url + self.suffix

def get_item_path(
self, item_url: str, base_href: str | None # noqa: ARG002
Expand All @@ -48,8 +49,8 @@ def get_document_uri(
def simple_url_rewriter():
"""Fixture to create a basic url rewriter returning URLs as-is"""

def get_simple_url_rewriter(url: str):
return SimpleUrlRewriter(HttpUrl(url))
def get_simple_url_rewriter(url: str, suffix: str = ""):
return SimpleUrlRewriter(HttpUrl(url), suffix=suffix)

yield get_simple_url_rewriter

Expand Down
146 changes: 146 additions & 0 deletions tests/test_html_rewriting.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
extract_attr,
extract_base_href,
format_attr,
rewrite_meta_http_equiv_redirect,
)
from warc2zim.url_rewriting import ArticleUrlRewriter, HttpUrl, ZimPath

Expand Down Expand Up @@ -903,6 +904,43 @@ def test_rewrite_meta_charset(rewrite_meta_charset_content, no_js_notify):
)


# Input / expected HTML pairs for the end-to-end meta refresh rewriting test.
_META_HTTP_EQUIV_REDIRECT_FULL_CASES = [
    ContentForTests(
        '<html><head><meta http-equiv="refresh" '
        'content="3;url=https://kiwix.org/somepage" />'
        "</head><body>whatever</body></html>",
        '<html><head><meta http-equiv="refresh" '
        'content="3;url=somepage" />'
        "</head><body>whatever</body></html>",
    ),
]


@pytest.fixture(params=_META_HTTP_EQUIV_REDIRECT_FULL_CASES)
def rewrite_meta_http_equiv_redirect_full_content(request):
    """Provide each meta http-equiv refresh document case in turn"""
    return request.param


def test_rewrite_meta_http_equiv_redirect_full(
    rewrite_meta_http_equiv_redirect_full_content, no_js_notify
):
    """Check meta http-equiv refresh rewriting through the whole HtmlRewriter"""
    case = rewrite_meta_http_equiv_redirect_full_content
    url_rewriter = ArticleUrlRewriter(
        HttpUrl(f"http://{case.article_url}"),
        {ZimPath("kiwix.org/somepage")},
    )
    html_rewriter = HtmlRewriter(url_rewriter, "", "", no_js_notify)

    rewritten = html_rewriter.rewrite(case.input_str)

    assert rewritten.content == case.expected_str


rules = HTMLRewritingRules()


Expand Down Expand Up @@ -1355,3 +1393,111 @@ def test_bad_html_data_rewrite_rules_argument_type():
@bad_rules.rewrite_data()
def bad_signature(data: int) -> str | None:
return f"{data}"


# Attribute value shared by most cases, and the result expected once the test
# URL rewriter has appended its "rewritten" suffix to the URL.
_REDIRECT_CONTENT = "1;url=http://www.example.com/somewhere"
_REDIRECT_REWRITTEN = ("content", "1;url=http://www.example.com/somewhererewritten")


@pytest.mark.parametrize(
    "tag, attr_name, attr_value, attrs, expected_result",
    [
        pytest.param(
            "meta",
            "content",
            _REDIRECT_CONTENT,
            [("http-equiv", "refresh")],
            _REDIRECT_REWRITTEN,
            id="nomimal_case",
        ),
        pytest.param(
            "meta",
            "content",
            " 1 ; url = http://www.example.com/somewhere ",
            [("http-equiv", "refresh")],
            _REDIRECT_REWRITTEN,
            id="nomimal_case_with_spaces",
        ),
        pytest.param(
            "foo",
            "content",
            _REDIRECT_CONTENT,
            [("http-equiv", "refresh")],
            None,
            id="do_not_rewrite_foo_tag",
        ),
        pytest.param(
            "meta",
            "foo",
            _REDIRECT_CONTENT,
            [("http-equiv", "refresh")],
            None,
            id="do_not_rewrite_foo_attribute",
        ),
        pytest.param(
            "meta",
            "content",
            _REDIRECT_CONTENT,
            [("http-equiv", "foo")],
            None,
            id="do_not_rewrite_http_equiv_not_refresh",
        ),
        pytest.param(
            "meta",
            "content",
            _REDIRECT_CONTENT,
            [],
            None,
            id="do_not_rewrite_no_http_equiv",
        ),
        pytest.param(
            "meta",
            "content",
            None,
            [("http-equiv", "refresh")],
            None,
            id="do_not_rewrite_missing_attribute",
        ),
        pytest.param(
            "meta",
            "content",
            "",
            [("http-equiv", "refresh")],
            None,
            id="do_not_rewrite_empty_attribute",
        ),
        pytest.param(
            "meta",
            "content",
            "1",
            [("http-equiv", "refresh")],
            None,
            id="do_not_rewrite_attribute_without_url",
        ),
        pytest.param(
            "meta",
            "content",
            "1;foo=http://www.example.com/somewhere",
            [("http-equiv", "refresh")],
            None,
            id="do_not_rewrite_bad_attribute",
        ),
    ],
)
def test_rewrite_meta_http_equiv_redirect_rule(
    tag: str,
    attr_name: str,
    attr_value: str | None,
    attrs: AttrsList,
    expected_result: AttrNameAndValue | None,
    simple_url_rewriter,
):
    """Check the meta http-equiv refresh rule alone, with a suffixing rewriter"""
    # The suffix makes it visible in the result that the URL went through the
    # rewriter rather than being copied verbatim.
    url_rewriter = simple_url_rewriter("http://www.example.com", suffix="rewritten")

    actual_result = rewrite_meta_http_equiv_redirect(
        tag=tag,
        attr_name=attr_name,
        attr_value=attr_value,
        attrs=attrs,
        url_rewriter=url_rewriter,
        base_href=None,
    )

    assert actual_result == expected_result

0 comments on commit f25b51e

Please sign in to comment.