Skip to content

Commit

Permalink
Merge pull request #218 from openzim/url_handling
Browse files Browse the repository at this point in the history
Revisit handling of special characters in ZIM / HTML URLs
  • Loading branch information
benoit74 authored Apr 5, 2024
2 parents beeb5d2 + 83438d6 commit 7809a6d
Show file tree
Hide file tree
Showing 15 changed files with 880 additions and 308 deletions.
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -213,3 +213,7 @@ pyrightconfig.json

# installed at build time
wombat.js

# temporary directories using during development
output
tmp
12 changes: 8 additions & 4 deletions src/warc2zim/content_rewriting/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from warc2zim.content_rewriting.html import HtmlRewriter
from warc2zim.content_rewriting.js import JsRewriter
from warc2zim.content_rewriting.rx_replacer import RxRewriter
from warc2zim.url_rewriting import ArticleUrlRewriter
from warc2zim.url_rewriting import ArticleUrlRewriter, HttpUrl, ZimPath
from warc2zim.utils import (
get_record_content,
get_record_encoding,
Expand Down Expand Up @@ -59,7 +59,7 @@ def __init__(
self,
path: str,
record: ArcWarcRecord,
known_urls: set[str],
existing_zim_paths: set[ZimPath],
):
self.content = get_record_content(record)

Expand All @@ -69,7 +69,9 @@ def __init__(

self.path = path
self.orig_url_str = get_record_url(record)
self.url_rewriter = ArticleUrlRewriter(self.orig_url_str, known_urls)
self.url_rewriter = ArticleUrlRewriter(
HttpUrl(self.orig_url_str), existing_zim_paths
)

self.rewrite_mode = self.get_rewrite_mode(record, mimetype)

Expand Down Expand Up @@ -133,7 +135,9 @@ def get_rewrite_mode(self, record, mimetype):
def rewrite_html(self, head_template: Template, css_insert: str | None):
orig_url = urlsplit(self.orig_url_str)

rel_static_prefix = self.url_rewriter.from_normalized("_zim_static/")
rel_static_prefix = self.url_rewriter.get_document_uri(
ZimPath("_zim_static/"), ""
)
head_insert = head_template.render(
path=self.path,
static_prefix=rel_static_prefix,
Expand Down
4 changes: 2 additions & 2 deletions src/warc2zim/content_rewriting/html.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,12 +134,12 @@ def handle_data(self, data: str):
elif self._active_tag == "style":
data = self.css_rewriter.rewrite(data)
elif self._active_tag == "script":
rules = get_ds_rules(self.url_rewriter.article_url)
rules = get_ds_rules(self.url_rewriter.article_url.value)
if data.strip():
data = JsRewriter(self.url_rewriter, rules).rewrite(data)
elif self._active_tag == "json":
if data.strip():
rules = get_ds_rules(self.url_rewriter.article_url)
rules = get_ds_rules(self.url_rewriter.article_url.value)
if rules:
data = RxRewriter(rules).rewrite(data, {})
self.send(data)
Expand Down
5 changes: 3 additions & 2 deletions src/warc2zim/content_rewriting/js.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
replace,
replace_prefix_from,
)
from warc2zim.url_rewriting import ZimPath

# Regex used to check if we ar import or exporting things in the string
# ie : If we are a module
Expand Down Expand Up @@ -236,8 +237,8 @@ def _get_module_decl(self, local_decls: Iterable[str]) -> str:
This will be added to script only if the script is a module script.
"""
wb_module_decl_url = self.url_rewriter.from_normalized(
"_zim_static/__wb_module_decl.js"
wb_module_decl_url = self.url_rewriter.get_document_uri(
ZimPath("_zim_static/__wb_module_decl.js"), ""
)
return (
f"""import {{ {", ".join(local_decls)} }} from "{wb_module_decl_url}";\n"""
Expand Down
105 changes: 63 additions & 42 deletions src/warc2zim/converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
import time
from http import HTTPStatus
from pathlib import Path
from urllib.parse import urldefrag, urljoin, urlsplit, urlunsplit
from urllib.parse import urljoin, urlsplit, urlunsplit

import requests

Expand All @@ -46,7 +46,7 @@

from warc2zim.constants import logger
from warc2zim.items import StaticArticle, WARCPayloadItem
from warc2zim.url_rewriting import FUZZY_RULES, normalize
from warc2zim.url_rewriting import FUZZY_RULES, HttpUrl, ZimPath, normalize
from warc2zim.utils import (
get_record_content,
get_record_mime_type,
Expand Down Expand Up @@ -88,14 +88,15 @@ def __init__(self, args):
for handler in logger.handlers:
handler.setLevel(logging.DEBUG)

main_url: str = args.url
main_url: str | None = str(args.url) if args.url else None
# ensure trailing slash is added if missing
parts = urlsplit(main_url)
if parts.path == "":
parts = list(parts)
# set path
parts[2] = "/"
main_url = urlunsplit(parts)
if main_url:
parts = urlsplit(main_url)
if parts.path == "":
parts = list(parts)
# set path
parts[2] = "/"
main_url = urlunsplit(parts)

self.name = args.name
self.title = args.title
Expand All @@ -107,10 +108,10 @@ def __init__(self, args):
self.creator_metadata = args.creator
self.publisher = args.publisher
self.tags = DEFAULT_TAGS + (args.tags or [])
self.source: str = args.source or main_url
self.source: str | None = str(args.source) if args.source else None or main_url
self.scraper = "warc2zim " + get_version()
self.illustration = b""
self.main_path = normalize(main_url)
self.main_path = normalize(HttpUrl(main_url)) if main_url else None

self.output = Path(args.output)
self.zim_file = args.zim_file
Expand All @@ -131,9 +132,9 @@ def __init__(self, args):

self.custom_css = args.custom_css

self.indexed_urls = set({})
self.revisits = {}
self.warc_urls = set({})
self.added_zim_items: set[ZimPath] = set()
self.revisits: dict[ZimPath, ZimPath] = {}
self.expected_zim_items: set[ZimPath] = set()

# progress file handling
self.stats_filename = (
Expand Down Expand Up @@ -202,6 +203,8 @@ def run(self):
return 100

self.gather_information_from_warc()
if not self.main_path:
raise ValueError("Unable to find main path, aborting")
self.title = self.title or "Untitled"
if len(self.title) > RECOMMENDED_MAX_TITLE_LENGTH:
self.title = f"{self.title[0:29]}…"
Expand Down Expand Up @@ -229,7 +232,7 @@ def run(self):

self.creator = Creator(
self.full_filename,
main_path=self.main_path,
main_path=self.main_path.value,
)

self.creator.config_metadata(
Expand All @@ -250,7 +253,7 @@ def run(self):
for filename in importlib.resources.files("warc2zim.statics").iterdir():
with importlib.resources.as_file(filename) as file:
self.creator.add_item(
StaticArticle(filename=file, main_path=self.main_path)
StaticArticle(filename=file, main_path=self.main_path.value)
)

# Add wombat_setup.js
Expand All @@ -269,14 +272,16 @@ def run(self):

# process revisits
for normalized_url, target_url in self.revisits.items():
if normalized_url not in self.indexed_urls:
if normalized_url not in self.added_zim_items:
logger.debug(f"Adding alias {normalized_url} -> {target_url}")
try:
self.creator.add_alias(normalized_url, "", target_url, {})
self.creator.add_alias(
normalized_url.value, "", target_url.value, {}
)
except RuntimeError as exc:
if not ALIAS_EXC_STR.match(str(exc)):
raise exc
self.indexed_urls.add(normalized_url)
self.added_zim_items.add(normalized_url)

logger.debug(f"Found {self.total_records} records in WARCs")

Expand All @@ -292,10 +297,16 @@ def iter_all_warc_records(self):
def gather_information_from_warc(self):
main_page_found = False
for record in iter_warc_records(self.inputs):

# only response records can be considered as main_path and as existing ZIM
# path
if record.rec_type not in ("response", "revisit"):
continue

url = get_record_url(record)
normalized_url = normalize(url)
zim_path = normalize(HttpUrl(url))

self.warc_urls.add(normalized_url)
self.expected_zim_items.add(zim_path)

if main_page_found:
continue
Expand All @@ -316,9 +327,9 @@ def gather_information_from_warc(self):
or record.http_headers.get_statuscode() == "200"
)
):
self.main_path = normalized_url
self.main_path = zim_path

if urldefrag(self.main_path).url != normalized_url:
if self.main_path != zim_path:
continue

# if we get here, found record for the main page
Expand All @@ -332,14 +343,16 @@ def gather_information_from_warc(self):
]:
original_path = self.main_path
self.main_path = normalize(
urljoin(
get_record_url(record),
record.http_headers.get_header("Location").strip(),
HttpUrl(
urljoin(
get_record_url(record),
record.http_headers.get_header("Location").strip(),
)
)
)
logger.warning(
f"HTTP {status_code} occurred on main page; "
f"replacing {original_path} with '{self.main_path}'"
f"replacing {original_path} with {self.main_path}"
)
continue

Expand Down Expand Up @@ -393,9 +406,11 @@ def find_icon_and_language(self, record, content):

# transform icon URL into WARC path
self.favicon_path = normalize(
urljoin(
get_record_url(record),
icon_url,
HttpUrl(
urljoin(
get_record_url(record),
icon_url,
)
)
)

Expand Down Expand Up @@ -437,7 +452,7 @@ def retrieve_illustration(self):
if record.rec_type != "response":
continue
url = get_record_url(record)
path = normalize(url)
path = normalize(HttpUrl(url))
if path == self.favicon_path or url == self.favicon_url:
logger.debug("Found WARC record for favicon")
if (
Expand Down Expand Up @@ -497,18 +512,20 @@ def is_self_redirect(self, record, url):
return False

location = record.http_headers.get("Location", "")
return normalize(url) == normalize(location)
location = urljoin(url, location)
return normalize(HttpUrl(url)) == normalize(HttpUrl(location))

def add_items_for_warc_record(self, record):

if record.rec_type not in ("response", "revisit"):
return

url = get_record_url(record)
normalized_url = normalize(url)
if not url:
logger.debug(f"Skipping record with empty WARC-Target-URI {record}")
return

if normalized_url in self.indexed_urls:
logger.debug(f"Skipping duplicate {url}, already added to ZIM")
return
item_zim_path = normalize(HttpUrl(url))

# if include_domains is set, only include urls from those domains
if self.include_domains:
Expand All @@ -519,17 +536,21 @@ def add_items_for_warc_record(self, record):
logger.debug(f"Skipping url {url}, outside included domains")
return

if item_zim_path in self.added_zim_items:
logger.debug(f"Skipping duplicate {url}, already added to ZIM")
return

if record.rec_type == "response":
if self.is_self_redirect(record, url):
logger.debug("Skipping self-redirect: " + url)
return

payload_item = WARCPayloadItem(
normalized_url,
item_zim_path.value,
record,
self.head_template,
self.css_insert,
self.warc_urls,
self.expected_zim_items,
)

if len(payload_item.content) != 0:
Expand All @@ -541,15 +562,15 @@ def add_items_for_warc_record(self, record):
self.total_records += 1
self.update_stats()

self.indexed_urls.add(normalized_url)
self.added_zim_items.add(item_zim_path)

elif (
record.rec_type == "revisit"
and record.rec_headers["WARC-Refers-To-Target-URI"] != url
and normalized_url not in self.revisits
and item_zim_path not in self.revisits
): # pragma: no branch
self.revisits[normalized_url] = normalize(
record.rec_headers["WARC-Refers-To-Target-URI"]
self.revisits[item_zim_path] = normalize(
HttpUrl(record.rec_headers["WARC-Refers-To-Target-URI"])
)


Expand Down
5 changes: 3 additions & 2 deletions src/warc2zim/items.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
from zimscraperlib.zim.items import StaticItem

from warc2zim.content_rewriting.generic import Rewriter
from warc2zim.url_rewriting import ZimPath
from warc2zim.utils import get_record_mime_type


Expand All @@ -29,13 +30,13 @@ def __init__(
record: ArcWarcRecord,
head_template: Template,
css_insert: str | None,
known_urls: set[str],
existing_zim_paths: set[ZimPath],
):
super().__init__()

self.path = path
self.mimetype = get_record_mime_type(record)
(self.title, self.content) = Rewriter(path, record, known_urls).rewrite(
(self.title, self.content) = Rewriter(path, record, existing_zim_paths).rewrite(
head_template, css_insert
)

Expand Down
Loading

0 comments on commit 7809a6d

Please sign in to comment.