Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Revisit handling of special characters in ZIM / HTML URLs #218

Merged
merged 10 commits into from
Apr 5, 2024
12 changes: 8 additions & 4 deletions src/warc2zim/content_rewriting/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from warc2zim.content_rewriting.html import HtmlRewriter
from warc2zim.content_rewriting.js import JsRewriter
from warc2zim.content_rewriting.rx_replacer import RxRewriter
from warc2zim.url_rewriting import ArticleUrlRewriter
from warc2zim.url_rewriting import ArticleUrlRewriter, HttpUrl, ZimPath
from warc2zim.utils import (
get_record_content,
get_record_encoding,
Expand Down Expand Up @@ -59,7 +59,7 @@ def __init__(
self,
path: str,
record: ArcWarcRecord,
known_urls: set[str],
existing_zim_paths: set[ZimPath],
):
self.content = get_record_content(record)

Expand All @@ -69,7 +69,9 @@ def __init__(

self.path = path
self.orig_url_str = get_record_url(record)
self.url_rewriter = ArticleUrlRewriter(self.orig_url_str, known_urls)
self.url_rewriter = ArticleUrlRewriter(
HttpUrl(self.orig_url_str), existing_zim_paths
)

self.rewrite_mode = self.get_rewrite_mode(record, mimetype)

Expand Down Expand Up @@ -133,7 +135,9 @@ def get_rewrite_mode(self, record, mimetype):
def rewrite_html(self, head_template: Template, css_insert: str | None):
orig_url = urlsplit(self.orig_url_str)

rel_static_prefix = self.url_rewriter.from_normalized("_zim_static/")
rel_static_prefix = self.url_rewriter.get_document_uri(
ZimPath("_zim_static/"), ""
)
head_insert = head_template.render(
path=self.path,
static_prefix=rel_static_prefix,
Expand Down
4 changes: 2 additions & 2 deletions src/warc2zim/content_rewriting/html.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,12 +134,12 @@ def handle_data(self, data: str):
elif self._active_tag == "style":
data = self.css_rewriter.rewrite(data)
elif self._active_tag == "script":
rules = get_ds_rules(self.url_rewriter.article_url)
rules = get_ds_rules(self.url_rewriter.article_url.value)
if data.strip():
data = JsRewriter(self.url_rewriter, rules).rewrite(data)
elif self._active_tag == "json":
if data.strip():
rules = get_ds_rules(self.url_rewriter.article_url)
rules = get_ds_rules(self.url_rewriter.article_url.value)
if rules:
data = RxRewriter(rules).rewrite(data, {})
self.send(data)
Expand Down
5 changes: 3 additions & 2 deletions src/warc2zim/content_rewriting/js.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
replace,
replace_prefix_from,
)
from warc2zim.url_rewriting import ZimPath

# Regex used to check whether we are importing or exporting things in the string
# i.e. whether the script is a module
Expand Down Expand Up @@ -236,8 +237,8 @@ def _get_module_decl(self, local_decls: Iterable[str]) -> str:

This will be added to script only if the script is a module script.
"""
wb_module_decl_url = self.url_rewriter.from_normalized(
"_zim_static/__wb_module_decl.js"
wb_module_decl_url = self.url_rewriter.get_document_uri(
ZimPath("_zim_static/__wb_module_decl.js"), ""
)
return (
f"""import {{ {", ".join(local_decls)} }} from "{wb_module_decl_url}";\n"""
Expand Down
83 changes: 47 additions & 36 deletions src/warc2zim/converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
import time
from http import HTTPStatus
from pathlib import Path
from urllib.parse import urldefrag, urljoin, urlsplit, urlunsplit
from urllib.parse import urljoin, urlsplit, urlunsplit

import requests

Expand All @@ -46,7 +46,7 @@

from warc2zim.constants import logger
from warc2zim.items import StaticArticle, WARCPayloadItem
from warc2zim.url_rewriting import FUZZY_RULES, normalize
from warc2zim.url_rewriting import FUZZY_RULES, HttpUrl, ZimPath, normalize
from warc2zim.utils import (
get_record_content,
get_record_mime_type,
Expand Down Expand Up @@ -88,14 +88,15 @@
for handler in logger.handlers:
handler.setLevel(logging.DEBUG)

main_url: str = args.url
main_url: str | None = str(args.url) if args.url else None
# ensure trailing slash is added if missing
parts = urlsplit(main_url)
if parts.path == "":
parts = list(parts)
# set path
parts[2] = "/"
main_url = urlunsplit(parts)
if main_url:
parts = urlsplit(main_url)
if parts.path == "":
parts = list(parts)
# set path
parts[2] = "/"
main_url = urlunsplit(parts)

self.name = args.name
self.title = args.title
Expand All @@ -107,10 +108,10 @@
self.creator_metadata = args.creator
self.publisher = args.publisher
self.tags = DEFAULT_TAGS + (args.tags or [])
self.source: str = args.source or main_url
self.source: str | None = str(args.source) if args.source else None or main_url
self.scraper = "warc2zim " + get_version()
self.illustration = b""
self.main_path = normalize(main_url)
self.main_path = normalize(HttpUrl(main_url)) if main_url else None

self.output = Path(args.output)
self.zim_file = args.zim_file
Expand All @@ -132,8 +133,8 @@
self.custom_css = args.custom_css

self.indexed_urls = set({})
self.revisits = {}
self.warc_urls = set({})
self.revisits: dict[ZimPath, ZimPath] = {}
self.existing_zim_paths: set[ZimPath] = set()

# progress file handling
self.stats_filename = (
Expand Down Expand Up @@ -202,6 +203,8 @@
return 100

self.gather_information_from_warc()
if not self.main_path:
raise ValueError("Unable to find main path, aborting")

Check warning on line 207 in src/warc2zim/converter.py

View check run for this annotation

Codecov / codecov/patch

src/warc2zim/converter.py#L207

Added line #L207 was not covered by tests
self.title = self.title or "Untitled"
if len(self.title) > RECOMMENDED_MAX_TITLE_LENGTH:
self.title = f"{self.title[0:29]}…"
Expand Down Expand Up @@ -229,7 +232,7 @@

self.creator = Creator(
self.full_filename,
main_path=self.main_path,
main_path=self.main_path.value,
)

self.creator.config_metadata(
Expand All @@ -250,7 +253,7 @@
for filename in importlib.resources.files("warc2zim.statics").iterdir():
with importlib.resources.as_file(filename) as file:
self.creator.add_item(
StaticArticle(filename=file, main_path=self.main_path)
StaticArticle(filename=file, main_path=self.main_path.value)
)

# Add wombat_setup.js
Expand All @@ -272,7 +275,9 @@
if normalized_url not in self.indexed_urls:
logger.debug(f"Adding alias {normalized_url} -> {target_url}")
try:
self.creator.add_alias(normalized_url, "", target_url, {})
self.creator.add_alias(
normalized_url.value, "", target_url.value, {}
)
except RuntimeError as exc:
if not ALIAS_EXC_STR.match(str(exc)):
raise exc
Expand All @@ -299,9 +304,9 @@
continue

url = get_record_url(record)
normalized_url = normalize(url)
zim_path = normalize(HttpUrl(url))

self.warc_urls.add(normalized_url)
self.existing_zim_paths.add(zim_path)

if main_page_found:
continue
Expand All @@ -322,9 +327,9 @@
or record.http_headers.get_statuscode() == "200"
)
):
self.main_path = normalized_url
self.main_path = zim_path

if urldefrag(self.main_path).url != normalized_url:
if self.main_path != zim_path:
continue

# if we get here, found record for the main page
Expand All @@ -338,14 +343,16 @@
]:
original_path = self.main_path
self.main_path = normalize(
urljoin(
get_record_url(record),
record.http_headers.get_header("Location").strip(),
HttpUrl(
urljoin(
get_record_url(record),
record.http_headers.get_header("Location").strip(),
)
)
)
logger.warning(
f"HTTP {status_code} occurred on main page; "
f"replacing {original_path} with '{self.main_path}'"
f"replacing {original_path} with {self.main_path}"
)
continue

Expand Down Expand Up @@ -399,9 +406,11 @@

# transform icon URL into WARC path
self.favicon_path = normalize(
urljoin(
get_record_url(record),
icon_url,
HttpUrl(
urljoin(
get_record_url(record),
icon_url,
)
)
)

Expand Down Expand Up @@ -443,7 +452,7 @@
if record.rec_type != "response":
continue
url = get_record_url(record)
path = normalize(url)
path = normalize(HttpUrl(url))
if path == self.favicon_path or url == self.favicon_url:
logger.debug("Found WARC record for favicon")
if (
Expand Down Expand Up @@ -503,22 +512,20 @@
return False

location = record.http_headers.get("Location", "")
return normalize(url) == normalize(location)
location = urljoin(url, location)
return normalize(HttpUrl(url)) == normalize(HttpUrl(location))

def add_items_for_warc_record(self, record):

if record.rec_type not in ("response", "revisit"):
return

url = get_record_url(record)
normalized_url = normalize(url)
if not url:
logger.debug(f"Skipping record with empty WARC-Target-URI {record}")
return

if normalized_url in self.indexed_urls:
logger.debug(f"Skipping duplicate {url}, already added to ZIM")
return
normalized_url = normalize(HttpUrl(url))
benoit74 marked this conversation as resolved.
Show resolved Hide resolved

# if include_domains is set, only include urls from those domains
if self.include_domains:
Expand All @@ -529,17 +536,21 @@
logger.debug(f"Skipping url {url}, outside included domains")
return

if normalized_url in self.indexed_urls:
logger.debug(f"Skipping duplicate {url}, already added to ZIM")
return

if record.rec_type == "response":
if self.is_self_redirect(record, url):
logger.debug("Skipping self-redirect: " + url)
return

payload_item = WARCPayloadItem(
normalized_url,
normalized_url.value,
record,
self.head_template,
self.css_insert,
self.warc_urls,
self.existing_zim_paths,
)

if len(payload_item.content) != 0:
Expand All @@ -559,7 +570,7 @@
and normalized_url not in self.revisits
): # pragma: no branch
self.revisits[normalized_url] = normalize(
record.rec_headers["WARC-Refers-To-Target-URI"]
HttpUrl(record.rec_headers["WARC-Refers-To-Target-URI"])
)


Expand Down
5 changes: 3 additions & 2 deletions src/warc2zim/items.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
from zimscraperlib.zim.items import StaticItem

from warc2zim.content_rewriting.generic import Rewriter
from warc2zim.url_rewriting import ZimPath
from warc2zim.utils import get_record_mime_type


Expand All @@ -29,13 +30,13 @@ def __init__(
record: ArcWarcRecord,
head_template: Template,
css_insert: str | None,
known_urls: set[str],
existing_zim_paths: set[ZimPath],
):
super().__init__()

self.path = path
self.mimetype = get_record_mime_type(record)
(self.title, self.content) = Rewriter(path, record, known_urls).rewrite(
(self.title, self.content) = Rewriter(path, record, existing_zim_paths).rewrite(
head_template, css_insert
)

Expand Down
Loading
Loading