From 44a54d440e6d6714a15f27044c75a84de0ddb0ae Mon Sep 17 00:00:00 2001 From: benoit74 Date: Fri, 15 Mar 2024 20:27:04 +0000 Subject: [PATCH 01/10] Fix improperly escaped character in test-website --- test-website/url-encoding.html | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test-website/url-encoding.html b/test-website/url-encoding.html index 6a983537..46bb15bb 100644 --- a/test-website/url-encoding.html +++ b/test-website/url-encoding.html @@ -33,7 +33,7 @@

Image with non-encoded chars in name and a mix of non-encoded and already en

Image with non-encoded chars in name with non-encoded special chars in link and a query parameter and fragment with reserved chars

An image should be displayed below as well

- +
From def7bc1d67561b5009ccd4feb43aec1f2a33f275 Mon Sep 17 00:00:00 2001 From: benoit74 Date: Sat, 16 Mar 2024 06:38:33 +0000 Subject: [PATCH 02/10] Rewrite fuzzy rule to not contain the trailing ? when querystring is empty --- src/warc2zim/url_rewriting.py | 2 +- tests/test_fuzzy_rules.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/warc2zim/url_rewriting.py b/src/warc2zim/url_rewriting.py index 7a62fa78..96488f8c 100644 --- a/src/warc2zim/url_rewriting.py +++ b/src/warc2zim/url_rewriting.py @@ -71,7 +71,7 @@ r"=[^&]+).*", "replace": r"youtube.fuzzy.replayweb.page/\1\2", }, - {"pattern": r"([^?]+\?)[\d]+$", "replace": r"\1"}, + {"pattern": r"([^?]+)\?[\d]+$", "replace": r"\1"}, { "pattern": r"(?:www\.)?youtube(?:-nocookie)?\.com\/(youtubei\/[^?]+).*(videoId[" r"^&]+).*", diff --git a/tests/test_fuzzy_rules.py b/tests/test_fuzzy_rules.py index 92dd0368..3fe12bcb 100644 --- a/tests/test_fuzzy_rules.py +++ b/tests/test_fuzzy_rules.py @@ -87,7 +87,7 @@ def test_fuzzyrules_google_video_infos(google_video_info_case): params=[ ContentForTests( "www.example.com/page?1234", - "www.example.com/page?", + "www.example.com/page", ), ContentForTests( "www.example.com/page?foo=1234", From 40edf3c49124de02471b2972d9b0b7e1d5576ca1 Mon Sep 17 00:00:00 2001 From: benoit74 Date: Mon, 18 Mar 2024 09:56:11 +0000 Subject: [PATCH 03/10] Really do not consider 'resource' WARC record for all operations --- src/warc2zim/converter.py | 10 ++++++++++ tests/data/example-resource.warc.gz | Bin 1779 -> 0 bytes 2 files changed, 10 insertions(+) delete mode 100644 tests/data/example-resource.warc.gz diff --git a/src/warc2zim/converter.py b/src/warc2zim/converter.py index 8625af9b..10089930 100644 --- a/src/warc2zim/converter.py +++ b/src/warc2zim/converter.py @@ -292,6 +292,12 @@ def iter_all_warc_records(self): def gather_information_from_warc(self): main_page_found = False for record in iter_warc_records(self.inputs): + + # only response records can be considered as main_path and as existing ZIM + # path + if record.rec_type not in ("response", "revisit"): + continue + url = get_record_url(record) normalized_url = normalize(url) @@ -500,6 +506,10 @@ def is_self_redirect(self, record, url): return normalize(url) == normalize(location) def add_items_for_warc_record(self, record): + + if record.rec_type not in ("response", "revisit"): + return + url = get_record_url(record) normalized_url = normalize(url) if not url: diff --git a/tests/data/example-resource.warc.gz b/tests/data/example-resource.warc.gz deleted file mode 100644 index 91737eeebb0e62e5308753786cfe9373f5653f2f..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1779 zcmVYr-%TeJ}Jsgu>nyP1-nX7=xiW27`$>1z$#*rkc?v zCAn_s*neLlqHi}jU-#V2^Kv?*1Cl12FyYm)R~-EaC$(wp<#VZ|bv1sSarAg_h93^P z=Hn5ORWT}YF2@z7RE=?2Rt+XO%Y~#lA*6h|7|eC6jOc>12LW1QZ$$6+7Cg=g9TS#M z2rUSsgP1(noi4hWfS`#KOc>55D?)ieIHTJ%nOXzNz-436_pO$}+nQnYz>QXI?QgU< zWp=O+W^9rsY2s}Ifftk;Rk?FN<ABzY8000000t0PPU2EJh5PUcIKL|nIDqFH$d={k;?vhd{G+eGZNGL@o%RU8L zGSXh#B;>zG@?H7?dDG6!j&^6a=j&z8GMXma@FJUsNpbW`IH^r%PvBZf>ss=8#nFd@ zGyHJSEwA5G**27tLe@LNn5xNQvFiveszOLsP)ZjcPX?EIP)3Zw*@FNri8o^MdkdZv zl-0B<8p=vqu`K?a^^d2%m4MI_C4^R_Xf~AblJbh(rpeM8PzFvDi)k9P4Bpn{(Svka z4Xytx`=U$_dydMcNs~13wu7Jxom;hY^M1;qt3g0#-H3jc*&oUCs!a z^O8;pu#O{8<3v8Kx3@RnHaBUa{grU=wc}_L25v!UqWK@;(9l~Bes(>|h9g0cx%7^Oi34ejZ_J-`eRp9wbd(080|(oQeVf03VA80000000IM*RBcb=NDTg7iT~iu zeF3R?YoRS6jgAEbXyM?xWqJ9snkG&&YLdy$O!{)C{qO6fp|l4l-L=(p#$!LWACLWb zIy$pm%h_u^G8vj9j8UTWAkn%|o^3C1A~6&)LTPbfYs?Dsa`gD}?P2__KGCAIz1FcS zD>A|*%Ze1OP-J!k{Gd>pr&G+e2e;xco~Fz`vK-jIWg*YCP{|2g=NeNW0o;9sCvctb zuKV3RfYXA&9&z%8Yx}6%x4J#p|8h0Hy=gN=fcYv96H z7Bk5kD+-0OHOH$0J&>qGDMM_8qdEce`rHG*l(|=yJo0+jk9$tU%ud*!n68U`ld-sO zI-SD~3*C<6IQ`%imB}e;^I>$oCht8BmDV}anGcbshS>Gz#}@l?oP_ z=X^@MdY}^Kde2WcR~IjLz31zT` zb2T&?9u|v*wdh(xPHp%2_}E@nItHmCi-J}Q=;LdyQw%QFjKR6c7|(rsEp1dX)NB)# zQHN#N!PZ^n&cq)jpAU`Ka5bf6K)5~)jepB;7bs^L4vjg-MIof#GGFjWC&LKmJVdiT zw1MYbbC#MaWGN2a4U*C;%B1q&xDAyu)^`&Tt>Eu^TOV0Cn@UmUkr|3qQ1|^7JM`_G zY*aFOT|D=|c@q^Zig-TVk;N3K8MBP1D-Vpj0(0tsTp4XJt}vaW<{^VWutbt<1Z_B# zlt3F4shZTMJbpv0H$Uq&;`6Pb>+C(~IZm;BALkKfz)CGX2={2m;G^8*E&r2x^q7)G zyj1k+Fgw`_iFql{XKnhQbHz1ATLHepbq9xo!4_gw3p1+PooOy8)?DPc4f=bA5ofTU zv8CB$*{`xZ*e=;_1^-JXKR3HwBDcugWo)PJAKd+e8fW)*!}o@=w;{AU z5t`YksC~#fEHmmaWx=y3Dd$yX1e%c753fPb->eo~C!*9)NKVM1AS!rX*AwW>)QnQE zofarFt`uEZzFlmh_Gwy3jCV#9XF02f)Y_kVo!>@qD Date: Mon, 18 Mar 2024 09:56:32 +0000 Subject: [PATCH 04/10] Rework transformation of WARC record url to ZIM path and URL normalization to support encoded URL and query strings --- src/warc2zim/content_rewriting/generic.py | 12 +- src/warc2zim/content_rewriting/html.py | 4 +- src/warc2zim/content_rewriting/js.py | 5 +- src/warc2zim/converter.py | 83 ++-- src/warc2zim/items.py | 5 +- src/warc2zim/url_rewriting.py | 322 +++++++++++---- tests/test_css_rewriting.py | 36 +- tests/test_fuzzy_rules.py | 23 +- tests/test_html_rewriting.py | 35 +- tests/test_js_rewriting.py | 6 +- tests/test_url_rewriting.py | 455 +++++++++++++++++----- tests/test_warc_to_zim.py | 156 ++++++-- 12 files changed, 852 insertions(+), 290 deletions(-) diff --git a/src/warc2zim/content_rewriting/generic.py b/src/warc2zim/content_rewriting/generic.py index 280338c8..4a35aeaa 100644 --- a/src/warc2zim/content_rewriting/generic.py +++ b/src/warc2zim/content_rewriting/generic.py @@ -11,7 +11,7 @@ from warc2zim.content_rewriting.html import HtmlRewriter from warc2zim.content_rewriting.js import JsRewriter from warc2zim.content_rewriting.rx_replacer import RxRewriter -from warc2zim.url_rewriting import ArticleUrlRewriter +from warc2zim.url_rewriting import ArticleUrlRewriter, HttpUrl, ZimPath from warc2zim.utils import ( get_record_content, get_record_encoding, @@ -59,7 +59,7 @@ def __init__( self, path: str, record: ArcWarcRecord, - known_urls: set[str], + existing_zim_paths: set[ZimPath], ): self.content = get_record_content(record) @@ -69,7 +69,9 @@ def __init__( self.path = path self.orig_url_str = get_record_url(record) - self.url_rewriter = ArticleUrlRewriter(self.orig_url_str, known_urls) + self.url_rewriter = ArticleUrlRewriter( + HttpUrl(self.orig_url_str), existing_zim_paths + ) self.rewrite_mode = self.get_rewrite_mode(record, mimetype) @@ -133,7 +135,9 @@ def get_rewrite_mode(self, record, mimetype): def rewrite_html(self, head_template: Template, css_insert: str | None): orig_url = urlsplit(self.orig_url_str) - rel_static_prefix = self.url_rewriter.from_normalized("_zim_static/") + rel_static_prefix = self.url_rewriter.get_document_uri( + ZimPath("_zim_static/"), "" + ) head_insert = head_template.render( path=self.path, static_prefix=rel_static_prefix, diff --git a/src/warc2zim/content_rewriting/html.py b/src/warc2zim/content_rewriting/html.py index e938f2b1..1e724a12 100644 --- a/src/warc2zim/content_rewriting/html.py +++ b/src/warc2zim/content_rewriting/html.py @@ -134,12 +134,12 @@ def handle_data(self, data: str): elif self._active_tag == "style": data = self.css_rewriter.rewrite(data) elif self._active_tag == "script": - rules = get_ds_rules(self.url_rewriter.article_url) + rules = get_ds_rules(self.url_rewriter.article_url.value) if data.strip(): data = JsRewriter(self.url_rewriter, rules).rewrite(data) elif self._active_tag == "json": if data.strip(): - rules = get_ds_rules(self.url_rewriter.article_url) + rules = get_ds_rules(self.url_rewriter.article_url.value) if rules: data = RxRewriter(rules).rewrite(data, {}) self.send(data) diff --git a/src/warc2zim/content_rewriting/js.py b/src/warc2zim/content_rewriting/js.py index 24baba46..9c93a743 100644 --- a/src/warc2zim/content_rewriting/js.py +++ b/src/warc2zim/content_rewriting/js.py @@ -12,6 +12,7 @@ replace, replace_prefix_from, ) +from warc2zim.url_rewriting import ZimPath # Regex used to check if we ar import or exporting things in the string # ie : If we are a module @@ -236,8 +237,8 @@ def _get_module_decl(self, local_decls: Iterable[str]) -> str: This will be added to script only if the script is a module script. """ - wb_module_decl_url = self.url_rewriter.from_normalized( - "_zim_static/__wb_module_decl.js" + wb_module_decl_url = self.url_rewriter.get_document_uri( + ZimPath("_zim_static/__wb_module_decl.js"), "" ) return ( f"""import {{ {", ".join(local_decls)} }} from "{wb_module_decl_url}";\n""" diff --git a/src/warc2zim/converter.py b/src/warc2zim/converter.py index 10089930..9d231999 100644 --- a/src/warc2zim/converter.py +++ b/src/warc2zim/converter.py @@ -23,7 +23,7 @@ import time from http import HTTPStatus from pathlib import Path -from urllib.parse import urldefrag, urljoin, urlsplit, urlunsplit +from urllib.parse import urljoin, urlsplit, urlunsplit import requests @@ -46,7 +46,7 @@ from warc2zim.constants import logger from warc2zim.items import StaticArticle, WARCPayloadItem -from warc2zim.url_rewriting import FUZZY_RULES, normalize +from warc2zim.url_rewriting import FUZZY_RULES, HttpUrl, ZimPath, normalize from warc2zim.utils import ( get_record_content, get_record_mime_type, @@ -88,14 +88,15 @@ def __init__(self, args): for handler in logger.handlers: handler.setLevel(logging.DEBUG) - main_url: str = args.url + main_url: str | None = str(args.url) if args.url else None # ensure trailing slash is added if missing - parts = urlsplit(main_url) - if parts.path == "": - parts = list(parts) - # set path - parts[2] = "/" - main_url = urlunsplit(parts) + if main_url: + parts = urlsplit(main_url) + if parts.path == "": + parts = list(parts) + # set path + parts[2] = "/" + main_url = urlunsplit(parts) self.name = args.name self.title = args.title @@ -107,10 +108,10 @@ def __init__(self, args): self.creator_metadata = args.creator self.publisher = args.publisher self.tags = DEFAULT_TAGS + (args.tags or []) - self.source: str = args.source or main_url + self.source: str | None = str(args.source) if args.source else None or main_url self.scraper = "warc2zim " + get_version() self.illustration = b"" - self.main_path = normalize(main_url) + self.main_path = normalize(HttpUrl(main_url)) if main_url else None self.output = Path(args.output) self.zim_file = args.zim_file @@ -132,8 +133,8 @@ def __init__(self, args): self.custom_css = args.custom_css self.indexed_urls = set({}) - self.revisits = {} - self.warc_urls = set({}) + self.revisits: dict[ZimPath, ZimPath] = {} + self.existing_zim_paths: set[ZimPath] = set() # progress file handling self.stats_filename = ( @@ -202,6 +203,8 @@ def run(self): return 100 self.gather_information_from_warc() + if not self.main_path: + raise ValueError("Unable to find main path, aborting") self.title = self.title or "Untitled" if len(self.title) > RECOMMENDED_MAX_TITLE_LENGTH: self.title = f"{self.title[0:29]}…" @@ -229,7 +232,7 @@ def run(self): self.creator = Creator( self.full_filename, - main_path=self.main_path, + main_path=self.main_path.value, ) self.creator.config_metadata( @@ -250,7 +253,7 @@ def run(self): for filename in importlib.resources.files("warc2zim.statics").iterdir(): with importlib.resources.as_file(filename) as file: self.creator.add_item( - StaticArticle(filename=file, main_path=self.main_path) + StaticArticle(filename=file, main_path=self.main_path.value) ) # Add wombat_setup.js @@ -272,7 +275,9 @@ def run(self): if normalized_url not in self.indexed_urls: logger.debug(f"Adding alias {normalized_url} -> {target_url}") try: - self.creator.add_alias(normalized_url, "", target_url, {}) + self.creator.add_alias( + normalized_url.value, "", target_url.value, {} + ) except RuntimeError as exc: if not ALIAS_EXC_STR.match(str(exc)): raise exc @@ -299,9 +304,9 @@ def gather_information_from_warc(self): continue url = get_record_url(record) - normalized_url = normalize(url) + zim_path = normalize(HttpUrl(url)) - self.warc_urls.add(normalized_url) + self.existing_zim_paths.add(zim_path) if main_page_found: continue @@ -322,9 +327,9 @@ def gather_information_from_warc(self): or record.http_headers.get_statuscode() == "200" ) ): - self.main_path = normalized_url + self.main_path = zim_path - if urldefrag(self.main_path).url != normalized_url: + if self.main_path != zim_path: continue # if we get here, found record for the main page @@ -338,14 +343,16 @@ def gather_information_from_warc(self): ]: original_path = self.main_path self.main_path = normalize( - urljoin( - get_record_url(record), - record.http_headers.get_header("Location").strip(), + HttpUrl( + urljoin( + get_record_url(record), + record.http_headers.get_header("Location").strip(), + ) ) ) logger.warning( f"HTTP {status_code} occurred on main page; " - f"replacing {original_path} with '{self.main_path}'" + f"replacing {original_path} with {self.main_path}" ) continue @@ -399,9 +406,11 @@ def find_icon_and_language(self, record, content): # transform icon URL into WARC path self.favicon_path = normalize( - urljoin( - get_record_url(record), - icon_url, + HttpUrl( + urljoin( + get_record_url(record), + icon_url, + ) ) ) @@ -443,7 +452,7 @@ def retrieve_illustration(self): if record.rec_type != "response": continue url = get_record_url(record) - path = normalize(url) + path = normalize(HttpUrl(url)) if path == self.favicon_path or url == self.favicon_url: logger.debug("Found WARC record for favicon") if ( @@ -503,7 +512,8 @@ def is_self_redirect(self, record, url): return False location = record.http_headers.get("Location", "") - return normalize(url) == normalize(location) + location = urljoin(url, location) + return normalize(HttpUrl(url)) == normalize(HttpUrl(location)) def add_items_for_warc_record(self, record): @@ -511,14 +521,11 @@ def add_items_for_warc_record(self, record): return url = get_record_url(record) - normalized_url = normalize(url) if not url: logger.debug(f"Skipping record with empty WARC-Target-URI {record}") return - if normalized_url in self.indexed_urls: - logger.debug(f"Skipping duplicate {url}, already added to ZIM") - return + normalized_url = normalize(HttpUrl(url)) # if include_domains is set, only include urls from those domains if self.include_domains: @@ -529,17 +536,21 @@ def add_items_for_warc_record(self, record): logger.debug(f"Skipping url {url}, outside included domains") return + if normalized_url in self.indexed_urls: + logger.debug(f"Skipping duplicate {url}, already added to ZIM") + return + if record.rec_type == "response": if self.is_self_redirect(record, url): logger.debug("Skipping self-redirect: " + url) return payload_item = WARCPayloadItem( - normalized_url, + normalized_url.value, record, self.head_template, self.css_insert, - self.warc_urls, + self.existing_zim_paths, ) if len(payload_item.content) != 0: @@ -559,7 +570,7 @@ def add_items_for_warc_record(self, record): and normalized_url not in self.revisits ): # pragma: no branch self.revisits[normalized_url] = normalize( - record.rec_headers["WARC-Refers-To-Target-URI"] + HttpUrl(record.rec_headers["WARC-Refers-To-Target-URI"]) ) diff --git a/src/warc2zim/items.py b/src/warc2zim/items.py index 30d63a4d..9168c6e4 100644 --- a/src/warc2zim/items.py +++ b/src/warc2zim/items.py @@ -15,6 +15,7 @@ from zimscraperlib.zim.items import StaticItem from warc2zim.content_rewriting.generic import Rewriter +from warc2zim.url_rewriting import ZimPath from warc2zim.utils import get_record_mime_type @@ -29,13 +30,13 @@ def __init__( record: ArcWarcRecord, head_template: Template, css_insert: str | None, - known_urls: set[str], + existing_zim_paths: set[ZimPath], ): super().__init__() self.path = path self.mimetype = get_record_mime_type(record) - (self.title, self.content) = Rewriter(path, record, known_urls).rewrite( + (self.title, self.content) = Rewriter(path, record, existing_zim_paths).rewrite( head_template, css_insert ) diff --git a/src/warc2zim/url_rewriting.py b/src/warc2zim/url_rewriting.py index 96488f8c..f066ef34 100644 --- a/src/warc2zim/url_rewriting.py +++ b/src/warc2zim/url_rewriting.py @@ -7,42 +7,38 @@ The global scheme is the following: -Entries are stored in the zim file using their urldecoded full path properly urlencoded -(yes!): -- The full path is the full url without the scheme (ie : - "/(?/(?, - quote_via=quote)` - -On top of that, paths are "reduced" using fuzzy rules: -A path "https://www.youtube.com/youtubei/v1/foo/baz/things?key=value&other_key= -other_value&videoId=xxxx&yet_another_key=yet_another_value" -is reduced to "youtube.fuzzy.replayweb.page/youtubei/v1/foo/baz/things?videoId=xxxx" + . This is NOT valid : + "foo/part/file with %3F and +?who=Chip%26Dale&quer=Is%20there%20any%20%2B%20here%3F" +- space in query string must be stored as ` `, not `%2B`, `%20` or `+`, the `+` in a ZIM + path means a `%2B in web resource (HTML document, ...): + . This is valid : "foo/part/file?question=Is there any + here?" + . This is NOT valid : "foo/part/file?question%3DIs%20there%20any%20%2B%20here%3F" + +On top of that, fuzzy rules are applied on the ZIM path: +For instance a path "https://www.youtube.com/youtubei/v1/foo/baz/things?key=value +&other_key=other_value&videoId=xxxx&yet_another_key=yet_another_value" +is transformed to "youtube.fuzzy.replayweb.page/youtubei/v1/foo/baz/things?videoId=xxxx" by slightly simplifying the path and keeping only the usefull arguments in the querystring. + +When rewriting documents (HTML, CSS, JS, ...), every time we find a URI to rewrite we +start by resolving it into an absolute URL (based on the containing document absolute +URI), applying the transformation to compute the corresponding ZIM path and we +url-encode the whole ZIM path, so that readers will have one single blob to process, +url-decode and find corresponding ZIM entry. Only '/' separators are considered safe +and not url-encoded. """ from __future__ import annotations @@ -52,14 +48,18 @@ import re from urllib.parse import ( quote, + unquote, urljoin, urlsplit, urlunsplit, ) +import idna + # Shared logger logger = logging.getLogger("warc2zim.url_rewriting") +known_bad_hostnames: set[str] = set() FUZZY_RULES = [ { @@ -99,49 +99,186 @@ }, ] + COMPILED_FUZZY_RULES = [ {"match": re.compile(rule["pattern"]), "replace": rule["replace"]} for rule in FUZZY_RULES ] -def reduce(path: str) -> str: - """Reduce a path""" +class HttpUrl: + """A utility class representing an HTTP url, usefull to pass this data around + + Includes a basic validation, ensuring that URL is encoded, scheme is provided. + """ + + def __init__(self, value: str) -> None: + HttpUrl.check_validity(value) + self._value = value + + def __eq__(self, __value: object) -> bool: + return isinstance(__value, HttpUrl) and __value.value == self.value + + def __hash__(self) -> int: + return self.value.__hash__() + + def __str__(self) -> str: + return f"HttpUrl({self.value})" + + @property + def value(self) -> str: + return self._value + + @classmethod + def check_validity(cls, value: str) -> None: + parts = urlsplit(value) + + if parts.scheme.lower() not in ["http", "https"]: + raise ValueError( + f"Incorrect HttpUrl scheme in value: {value} {parts.scheme}" + ) + + if not parts.hostname: + raise ValueError(f"Unsupported empty hostname in value: {value}") + + if parts.hostname.lower() != parts.hostname: + raise ValueError(f"Unsupported upper-case chars in hostname : {value}") + + +class ZimPath: + """A utility class representing a ZIM path, usefull to pass this data around + + Includes a basic validation, ensuring that path does start with scheme, hostname,... + """ + + def __init__(self, value: str) -> None: + ZimPath.check_validity(value) + self._value = value + + def __eq__(self, __value: object) -> bool: + return isinstance(__value, ZimPath) and __value.value == self.value + + def __hash__(self) -> int: + return self.value.__hash__() + + def __str__(self) -> str: + return f"ZimPath({self.value})" + + @property + def value(self) -> str: + return self._value + + @classmethod + def check_validity(cls, value: str) -> None: + parts = urlsplit(value) + + if parts.scheme: + raise ValueError(f"Unexpected scheme in value: {value} {parts.scheme}") + + if parts.hostname: + raise ValueError(f"Unexpected hostname in value: {value} {parts.hostname}") + + if parts.username: + raise ValueError(f"Unexpected username in value: {value} {parts.username}") + + if parts.password: + raise ValueError(f"Unexpected password in value: {value} {parts.password}") + + +def apply_fuzzy_rules(uri: HttpUrl | str) -> HttpUrl | str: + """Apply fuzzy rules on a URL or relative path + + First matching fuzzy rule matching the input value is applied and its result + is returned. + + If no fuzzy rule is matching, the input is returned as-is. + """ + value = uri.value if isinstance(uri, HttpUrl) else uri for rule in COMPILED_FUZZY_RULES: - if match := rule["match"].match(path): + if match := rule["match"].match(value): return match.expand(rule["replace"]) - return path + return value + + +def normalize(url: HttpUrl) -> ZimPath: + """Transform a HTTP URL into a ZIM path to use as a entry's key. + According to RFC 3986, a URL allows only a very limited set of characters, so we + assume by default that the url is encoded to match this specification. -def normalize(url: str | None) -> str: - """Normalize a properly contructed url to a path to use as a entry's key. + The transformation rewrites the hostname, the path and the querystring. - >>> normalize("http://exemple.com/path/to/article?foo=bar") - "exemple.com/path/to/article?foo=bar" - >>> normalize("http://other.com/path to strange ar+t%3Ficle?foo=bar+baz") - "other.com/path to strange ar+t%3Ficle?foo=bar%20baz" - >>> normalize("http://youtube.com/youtubei/bar?key=value&videoId=xxxx&otherKey= - otherValue") - "youtube.fuzzy.replayweb.page/youtubei/bar?videoId=xxxx" + The transformation drops the URL scheme, username, password, port and fragment: + - we suppose there is no conflict of URL scheme or port: there is no two ressources + with same hostname, path and querystring but different URL schemeor port leading + to different content + - we consider username/password port are purely authentication mechanism which have + no impact on the content to server + - we know that the fragment is never passed to the server, it stays in the + User-Agent, so if we encounter a fragment while normalizing a URL found in a + document, it won't make its way to the ZIM anyway and will stay client-side + + The transformation consists mainly in decoding the three components so that ZIM path + is not encoded at all, as required by the ZIM specification. + + Decoding is done differently for the hostname (decoded with puny encoding) and the + path and querystring (both decoded with url decoding). + + The final transformation is the application of fuzzy rules (sourced from wabac) to + transform some URLs into replay URLs and drop some useless stuff. + + Returned value is a ZIM path, without any puny/url encoding applied, ready to be + passed to python-libzim for UTF-8 encoding. """ - if not url: - return url # pyright: ignore[reportGeneralTypeIssues, reportReturnType] + url_parts = urlsplit(url.value) + + hostname = url_parts.hostname + + if hostname and hostname not in known_bad_hostnames: + try: + # try to decode the hostname + hostname = idna.decode(hostname) + except idna.IDNAError as exc: + # exception might happen if illegal character are found in hostname like the + # `_` in `host_ip` ; we keep the set of bad hostname in memory to not fill + # the logs with these warnings + logger.warning(f"Bad hostname found, kept as-is: {hostname}", exc_info=exc) + known_bad_hostnames.add(hostname) - url_parts = urlsplit(url) - url_parts = url_parts._replace(scheme="") + path = url_parts.path - # Remove the netloc (by moving it into path) - if url_parts.netloc: - new_path = url_parts.netloc + url_parts.path - url_parts = url_parts._replace(netloc="", path=new_path) - if url_parts.path and url_parts.path[0] == "/": - url_parts = url_parts._replace(path=url_parts.path[1:]) + if path: + # unquote the path so that it is stored unencoded in the ZIM as required by ZIM + # specification + path = unquote(path) + else: + # if path is empty, we need a "/" to remove ambiguities, e.g. https://example.com + # and https://example.com/ must all lead to the same ZIM entry to match RFC 3986 + # section 6.2.3 : https://www.rfc-editor.org/rfc/rfc3986#section-6.2.3 + path = "/" - path = urlunsplit(url_parts) - path = reduce(path) + query = url_parts.query - return path + # if query is missing, we do not add it at all, not even a trailing ? without + # anything after it + if url_parts.query: + # `+`` in query parameter must be decoded as space first to remove ambiguities + # between a space (encoded as `+` in url query parameter) and a real plus sign + # (encoded as %2B but soon decoded in ZIM path) + query = query.replace("+", " ") + # unquote the query so that it is stored unencoded in the ZIM as required by ZIM + # specification + query = "?" + unquote(query) + else: + query = "" + + fuzzified_url = apply_fuzzy_rules(f"{hostname}{path}{query}") + + if not isinstance(fuzzified_url, str): + raise ValueError("Inappropriate value returned, should be str") + + return ZimPath(fuzzified_url) def get_without_fragment(url: str) -> str: @@ -152,46 +289,69 @@ def get_without_fragment(url: str) -> str: class ArticleUrlRewriter: """Rewrite urls in article.""" - def __init__(self, article_url: str, known_urls: set[str]): + def __init__(self, article_url: HttpUrl, existing_zim_paths: set[ZimPath]): + self.article_path = normalize(article_url) self.article_url = article_url - self.known_urls = known_urls - self.base_path = f"/{urlsplit(normalize(article_url)).path}" - if self.base_path[-1] != "/": - # We want a directory - self.base_path = posixpath.dirname(self.base_path) + self.existing_zim_paths = existing_zim_paths + self.missing_zim_paths: set[ZimPath] = set() - def __call__(self, url: str, *, rewrite_all_url: bool = True) -> str: + def __call__(self, item_url: str, *, rewrite_all_url: bool = True) -> str: """Rewrite a url contained in a article. The url is "fully" rewrited to point to a normalized entry path """ - url = url.strip() + item_url = item_url.strip() - if url.startswith("data:") or url.startswith("blob:"): - return url + item_scheme = urlsplit(item_url).scheme + if item_scheme and item_scheme not in ("http", "https"): + return item_url - absolute_url = urljoin(self.article_url, url) + item_absolute_url = urljoin(self.article_url.value, item_url) + item_fragment = urlsplit(item_absolute_url).fragment - normalized_url = normalize(absolute_url) + item_path = normalize(HttpUrl(item_absolute_url)) - if rewrite_all_url or get_without_fragment(normalized_url) in self.known_urls: - return self.from_normalized(normalized_url) + if rewrite_all_url or item_path in self.existing_zim_paths: + return self.get_document_uri(item_path, item_fragment) else: - logger.debug(f"WARNING {normalized_url} ({url}) not in archive.") + if item_path not in self.missing_zim_paths: + logger.debug(f"WARNING {item_path} ({item_url}) not in archive.") + # maintain a collection of missing Zim Path to not fill the logs with + # duplicate messages + self.missing_zim_paths.add(item_path) # The url doesn't point to a known entry - return url + return item_absolute_url - def from_normalized(self, normalized_url_str: str) -> str: - normalized_url = urlsplit(f"/{normalized_url_str}") + def get_document_uri(self, item_path: ZimPath, item_fragment: str) -> str: + """Given an ZIM item path and its fragment, get the URI to use in document - # relative_to will lost our potential last '/' - slash_ending = normalized_url.path[-1] == "/" - relative_path = posixpath.relpath(normalized_url.path, self.base_path) + This function transforms the path of a ZIM item we want to adress from current + document (HTML / JS / ...) and returns the corresponding URI to use. - if slash_ending: + It computes the relative path based on current document location and escape + everything which needs to be to transform the ZIM path into a valid RFC 3986 URI + + It also append a potential trailing item fragment at the end of the resulting + URI. + + """ + item_parts = urlsplit(item_path.value) + + # item_path is both path + querystring, both will be url-encoded in the document + # so that readers consider them as a whole and properly pass them to libzim + item_url = item_parts.path + if item_parts.query: + item_url += "?" + item_parts.query + + relative_path = posixpath.relpath( + item_url, posixpath.dirname(self.article_path.value) + ) + # relpath removes a potential last '/' in the path, we add it back + if item_path.value.endswith("/"): relative_path += "/" - normalized_url = normalized_url._replace(path=relative_path) - normalized_url = urlunsplit(normalized_url) - return quote(normalized_url, safe="/#") + return ( + f"{quote(relative_path, safe='/')}" + f"{'#' + item_fragment if item_fragment else ''}" + ) diff --git a/tests/test_css_rewriting.py b/tests/test_css_rewriting.py index 1375ce9a..6f4f358c 100644 --- a/tests/test_css_rewriting.py +++ b/tests/test_css_rewriting.py @@ -3,7 +3,7 @@ import pytest from warc2zim.content_rewriting.css import CssRewriter -from warc2zim.url_rewriting import ArticleUrlRewriter +from warc2zim.url_rewriting import ArticleUrlRewriter, HttpUrl from .utils import ContentForTests @@ -19,15 +19,15 @@ ), ContentForTests( b"p { width= } div { background: url(http://exemple.com/img.png)}", - b"p { width= } div { background: url(exemple.com/img.png)}", + b"p { width= } div { background: url(../exemple.com/img.png)}", ), ContentForTests( b"p { width= } div { background: url('http://exemple.com/img.png')}", - b'p { width= } div { background: url("exemple.com/img.png")}', + b'p { width= } div { background: url("../exemple.com/img.png")}', ), ContentForTests( b'p { width= } div { background: url("http://exemple.com/img.png")}', - b'p { width= } div { background: url("exemple.com/img.png")}', + b'p { width= } div { background: url("../exemple.com/img.png")}', ), ] ) @@ -37,9 +37,11 @@ def no_rewrite_content(request): def test_no_rewrite(no_rewrite_content): assert ( - CssRewriter(ArticleUrlRewriter(no_rewrite_content.article_url, set())).rewrite( - no_rewrite_content.input_bytes - ) + CssRewriter( + ArticleUrlRewriter( + HttpUrl(f"http://{no_rewrite_content.article_url}"), set() + ) + ).rewrite(no_rewrite_content.input_bytes) == no_rewrite_content.expected_bytes.decode() ) @@ -53,7 +55,7 @@ def test_no_rewrite(no_rewrite_content): ContentForTests("border-bottom-width: 1px;border-bottom-color: #c0c0c0;w"), ContentForTests( 'background: url("http://exemple.com/foo.png"); width=', - 'background: url("exemple.com/foo.png"); width=', + 'background: url("../exemple.com/foo.png"); width=', ), ] ) @@ -64,7 +66,9 @@ def invalid_content_inline(request): def test_invalid_css_inline(invalid_content_inline): assert ( CssRewriter( - ArticleUrlRewriter(invalid_content_inline.article_url, set()) + ArticleUrlRewriter( + HttpUrl(f"http://{invalid_content_inline.article_url}"), set() + ) ).rewrite_inline(invalid_content_inline.input_str) == invalid_content_inline.expected_str ) @@ -83,7 +87,7 @@ def test_invalid_css_inline(invalid_content_inline): ), ContentForTests( b'p { background: url("http://exemple.com/foo.png"); width= }', - b'p { background: url("exemple.com/foo.png"); width= }', + b'p { background: url("../exemple.com/foo.png"); width= }', ), ] ) @@ -93,9 +97,9 @@ def invalid_content(request): def test_invalid_cssl(invalid_content): assert ( - CssRewriter(ArticleUrlRewriter(invalid_content.article_url, set())).rewrite( - invalid_content.input_bytes - ) + CssRewriter( + ArticleUrlRewriter(HttpUrl(f"http://{invalid_content.article_url}"), set()) + ).rewrite(invalid_content.input_bytes) == invalid_content.expected_bytes.decode() ) @@ -123,7 +127,7 @@ def test_rewrite(): expected = """ /* A comment with a link : http://foo.com */ - @import url(../fonts.googleapis.com/icon%3Ffamily%3DMaterial%2BIcons); + @import url(../fonts.googleapis.com/icon%3Ffamily%3DMaterial%20Icons); p, input { color: rbg(1, 2, 3); @@ -143,6 +147,8 @@ def test_rewrite(): expected = dedent(expected) assert ( - CssRewriter(ArticleUrlRewriter("kiwix.org/article", set())).rewrite(content) + CssRewriter( + ArticleUrlRewriter(HttpUrl("http://kiwix.org/article"), set()) + ).rewrite(content) == expected ) diff --git a/tests/test_fuzzy_rules.py b/tests/test_fuzzy_rules.py index 3fe12bcb..66d3750f 100644 --- a/tests/test_fuzzy_rules.py +++ b/tests/test_fuzzy_rules.py @@ -1,6 +1,6 @@ import pytest -from warc2zim.url_rewriting import reduce +from warc2zim.url_rewriting import apply_fuzzy_rules from .utils import ContentForTests @@ -34,7 +34,10 @@ def google_videos_case(request): def test_fuzzyrules_google_videos(google_videos_case): - assert reduce(google_videos_case.input_str) == google_videos_case.expected_str + assert ( + apply_fuzzy_rules(google_videos_case.input_str) + == google_videos_case.expected_str + ) @pytest.fixture( @@ -79,7 +82,8 @@ def google_video_info_case(request): def test_fuzzyrules_google_video_infos(google_video_info_case): assert ( - reduce(google_video_info_case.input_str) == google_video_info_case.expected_str + apply_fuzzy_rules(google_video_info_case.input_str) + == google_video_info_case.expected_str ) @@ -112,7 +116,7 @@ def trim_digits_only_query_case(request): def test_fuzzyrules_trim_digits_only_query(trim_digits_only_query_case): assert ( - reduce(trim_digits_only_query_case.input_str) + apply_fuzzy_rules(trim_digits_only_query_case.input_str) == trim_digits_only_query_case.expected_str ) @@ -165,7 +169,7 @@ def youtubei_case(request): def test_fuzzyrules_youtubei(youtubei_case): - assert reduce(youtubei_case.input_str) == youtubei_case.expected_str + assert apply_fuzzy_rules(youtubei_case.input_str) == youtubei_case.expected_str @pytest.fixture( @@ -205,7 +209,10 @@ def youtube_embed_case(request): def test_fuzzyrules_youtube_embed(youtube_embed_case): - assert reduce(youtube_embed_case.input_str) == youtube_embed_case.expected_str + assert ( + apply_fuzzy_rules(youtube_embed_case.input_str) + == youtube_embed_case.expected_str + ) @pytest.fixture( @@ -254,7 +261,7 @@ def vimeo_cdn_case(request): def test_fuzzyrules_vimeo_cdn(vimeo_cdn_case): - assert reduce(vimeo_cdn_case.input_str) == vimeo_cdn_case.expected_str + assert apply_fuzzy_rules(vimeo_cdn_case.input_str) == vimeo_cdn_case.expected_str @pytest.fixture( @@ -283,4 +290,4 @@ def vimeo_host_case(request): def test_fuzzyrules_vimeo_host(vimeo_host_case): - assert reduce(vimeo_host_case.input_str) == vimeo_host_case.expected_str + assert apply_fuzzy_rules(vimeo_host_case.input_str) == vimeo_host_case.expected_str diff --git a/tests/test_html_rewriting.py b/tests/test_html_rewriting.py index fd56d912..cfac82c7 100644 --- a/tests/test_html_rewriting.py +++ b/tests/test_html_rewriting.py @@ -3,7 +3,7 @@ import pytest from warc2zim.content_rewriting.html import HtmlRewriter -from warc2zim.url_rewriting import ArticleUrlRewriter +from warc2zim.url_rewriting import ArticleUrlRewriter, HttpUrl, ZimPath from .utils import ContentForTests @@ -44,7 +44,13 @@ def no_rewrite_content(request): def test_no_rewrite(no_rewrite_content): assert ( - HtmlRewriter(ArticleUrlRewriter(no_rewrite_content.article_url, set()), "", "") + HtmlRewriter( + ArticleUrlRewriter( + HttpUrl(f"http://{no_rewrite_content.article_url}"), set() + ), + "", + "", + ) .rewrite(no_rewrite_content.input_str) .content == no_rewrite_content.expected_str @@ -101,7 +107,11 @@ def escaped_content(request): def test_escaped_content(escaped_content): transformed = ( - HtmlRewriter(ArticleUrlRewriter(escaped_content.article_url, set()), "", "") + HtmlRewriter( + ArticleUrlRewriter(HttpUrl(f"http://{escaped_content.article_url}"), set()), + "", + "", + ) .rewrite(escaped_content.input_str) .content ) @@ -121,12 +131,12 @@ def long_path_replace_test_content(input_: str, rewriten_url: str, article_url: # Normalized path is "exemple.com/a/long/path" lprtc( 'A link to rewrite', - "exemple.com/a/long/path", + "a/long/path", "exemple.com", ), lprtc( 'A link to rewrite', - "exemple.com/a/long/path", + "../exemple.com/a/long/path", "kiwix.org", ), lprtc( @@ -188,7 +198,10 @@ def rewrite_url(request): def test_rewrite(rewrite_url): assert ( HtmlRewriter( - ArticleUrlRewriter(rewrite_url.article_url, {"exemple.com/a/long/path"}), + ArticleUrlRewriter( + HttpUrl(f"http://{rewrite_url.article_url}"), + {ZimPath("exemple.com/a/long/path")}, + ), "", "", ) @@ -222,7 +235,11 @@ def test_extract_title(): def test_rewrite_attributes(): - rewriter = HtmlRewriter(ArticleUrlRewriter("kiwix.org/", {"kiwix.org/foo"}), "", "") + rewriter = HtmlRewriter( + ArticleUrlRewriter(HttpUrl("http://kiwix.org/"), {ZimPath("kiwix.org/foo")}), + "", + "", + ) assert ( rewriter.rewrite("A link").content @@ -245,7 +262,7 @@ def test_rewrite_attributes(): def test_rewrite_css(): output = ( - HtmlRewriter(ArticleUrlRewriter("", set()), "", "") + HtmlRewriter(ArticleUrlRewriter(HttpUrl("http://kiwix.org/"), set()), "", "") .rewrite( "", @@ -268,7 +285,7 @@ def test_head_insert(): content = dedent(content) - url_rewriter = ArticleUrlRewriter("foo", set()) + url_rewriter = ArticleUrlRewriter(HttpUrl("http://kiwix.org/"), set()) assert HtmlRewriter(url_rewriter, "", "").rewrite(content).content == content assert HtmlRewriter(url_rewriter, "PRE_HEAD_INSERT", "").rewrite( diff --git a/tests/test_js_rewriting.py b/tests/test_js_rewriting.py index fc77290f..0c425274 100644 --- a/tests/test_js_rewriting.py +++ b/tests/test_js_rewriting.py @@ -1,7 +1,7 @@ import pytest from warc2zim.content_rewriting.js import JsRewriter -from warc2zim.url_rewriting import ArticleUrlRewriter +from warc2zim.url_rewriting import ArticleUrlRewriter, HttpUrl from .utils import ContentForTests @@ -218,7 +218,9 @@ def rewrite_import_content(request): def test_import_rewrite(rewrite_import_content): - url_rewriter = ArticleUrlRewriter(rewrite_import_content.article_url, set()) + url_rewriter = ArticleUrlRewriter( + HttpUrl(rewrite_import_content.article_url), set() + ) assert ( JsRewriter(url_rewriter).rewrite(rewrite_import_content.input_str) == rewrite_import_content.expected_str diff --git a/tests/test_url_rewriting.py b/tests/test_url_rewriting.py index 3fe17b48..e4633e2b 100644 --- a/tests/test_url_rewriting.py +++ b/tests/test_url_rewriting.py @@ -1,103 +1,384 @@ -from urllib.parse import urljoin - import pytest -from warc2zim.url_rewriting import ArticleUrlRewriter +from warc2zim.url_rewriting import ArticleUrlRewriter, HttpUrl, ZimPath -@pytest.fixture( - params=[ - "https://kiwix.org/a/article/path", - "https://kiwix.org/a/article/path/", - "https://kiwix.org/a/path", - "https://kiwix.org/a/path/", - ] +@pytest.mark.parametrize( + "article_url, original_content_url, expected_rewriten_content_url, know_paths, " + "rewrite_all_url", + [ + ( + "https://kiwix.org/a/article/document.html", + "foo.html", + "foo.html", + ["kiwix.org/a/article/foo.html"], + False, + ), + ( + "https://kiwix.org/a/article/document.html", + "foo.html#anchor1", + "foo.html#anchor1", + ["kiwix.org/a/article/foo.html"], + False, + ), + ( + "https://kiwix.org/a/article/document.html", + "foo.html?foo=bar", + "foo.html%3Ffoo%3Dbar", + ["kiwix.org/a/article/foo.html?foo=bar"], + False, + ), + ( + "https://kiwix.org/a/article/document.html", + "foo.html?foo=b%24ar", + "foo.html%3Ffoo%3Db%24ar", + ["kiwix.org/a/article/foo.html?foo=b$ar"], + False, + ), + ( + "https://kiwix.org/a/article/document.html", + "foo.html?foo=b%3Far", # a query string with an encoded ? char in value + "foo.html%3Ffoo%3Db%3Far", + ["kiwix.org/a/article/foo.html?foo=b?ar"], + False, + ), + ( + "https://kiwix.org/a/article/document.html", + "fo%o.html", + "fo%25o.html", + ["kiwix.org/a/article/fo%o.html"], + False, + ), + ( + "https://kiwix.org/a/article/document.html", + "foé.html", # URL not matching RFC 3986 (many HTML documents are invalid) + "fo%C3%A9.html", # character is encoded so that URL match RFC 3986 + ["kiwix.org/a/article/foé.html"], # but ZIM path is non-encoded + False, + ), + ( + "https://kiwix.org/a/article/document.html", + "./foo.html", + "foo.html", + ["kiwix.org/a/article/foo.html"], + False, + ), + ( + "https://kiwix.org/a/article/document.html", + "../foo.html", + "https://kiwix.org/a/foo.html", # Full URL since not in known URLs + ["kiwix.org/a/article/foo.html"], + False, + ), + ( + "https://kiwix.org/a/article/document.html", + "../foo.html", + "../foo.html", # all URLs rewrite activated + ["kiwix.org/a/article/foo.html"], + True, + ), + ( + "https://kiwix.org/a/article/document.html", + "../foo.html", + "../foo.html", + ["kiwix.org/a/foo.html"], + False, + ), + ( + "https://kiwix.org/a/article/document.html", + "../bar/foo.html", + "https://kiwix.org/a/bar/foo.html", # Full URL since not in known URLs + ["kiwix.org/a/article/foo.html"], + False, + ), + ( + "https://kiwix.org/a/article/document.html", + "../bar/foo.html", + "../bar/foo.html", # all URLs rewrite activated + ["kiwix.org/a/article/foo.html"], + True, + ), + ( + "https://kiwix.org/a/article/document.html", + "../bar/foo.html", + "../bar/foo.html", + ["kiwix.org/a/bar/foo.html"], + False, + ), + ( # we cannot go upper than host, so '../' in excess are removed + "https://kiwix.org/a/article/document.html", + "../../../../../foo.html", + "../../foo.html", + ["kiwix.org/foo.html"], + False, + ), + ( + "https://kiwix.org/a/article/document.html", + "foo?param=value%2F", + "foo%3Fparam%3Dvalue/", + ["kiwix.org/a/article/foo?param=value/"], + False, + ), + ( + "https://kiwix.org/a/article/document.html", + "foo?param=value%2Fend", + "foo%3Fparam%3Dvalue/end", + ["kiwix.org/a/article/foo?param=value/end"], + False, + ), + ( + "https://kiwix.org/a/article/document.html", + "foo/", + "foo/", + ["kiwix.org/a/article/foo/"], + False, + ), + ], ) -def rewriter(request): - yield ArticleUrlRewriter(request.param, {"kiwix.org/bar/foo"}) - - -def test_relative_url(rewriter): - for url in ["foo", "bar/foo", "foo/", "bar/foo/", "../baz"]: - assert rewriter(url) == url - assert rewriter("./" + url) == url - assert urljoin(rewriter.article_url, rewriter(url)) == urljoin( - rewriter.article_url, url - ) - - if rewriter.article_url == "https://kiwix.org/a/path": - # While it may seems wrong from a path point of view, - # it is valid from a url side when going up to the `/` is "no op". - assert rewriter("../../biz") == "../biz" - else: - assert rewriter("../../biz") == "../../biz" - assert rewriter("./../../biz") == "../../biz" - - -def test_absolute_path_url(rewriter): - for url in ["/foo", "/foo/bar"]: - for input_url in [url, f" {url}", f"{url} ", f" {url} "]: - rewriten = rewriter(input_url) - # Must produce a relative link. - assert not rewriten.startswith("/") - # Relative link must be resolved to a absolute url in the same domain than - # article_url. - assert urljoin(rewriter.article_url, rewriten) == "https://kiwix.org" + url - - -def test_absolute_scheme_url(rewriter): - # We will serve our content from serving.com (moving kiwix.org as the first part of - # the path). - serving_address = rewriter.article_url.replace("kiwix.org", "serving.com/kiwix.org") - - for url in ["//exemple.com/foo", "//exemple.com/foo/bar", "//kiwix.org/baz"]: - rewriten = rewriter(url) - # Must produce a relative link. - assert not rewriten.startswith("/") - # Relative link must be resolved to a absolute url in the serving domain with a - # path containing article_url's host. - assert urljoin(serving_address, rewriten) == "https://serving.com" + url[1:] +def test_relative_url( + article_url, + know_paths, + original_content_url, + expected_rewriten_content_url, + rewrite_all_url, +): + article_url = HttpUrl(article_url) + rewriter = ArticleUrlRewriter( + article_url, + {ZimPath(path) for path in know_paths}, + ) + assert ( + rewriter(original_content_url, rewrite_all_url=rewrite_all_url) + == expected_rewriten_content_url + ) -def test_absolute_url(rewriter): - # We will serve our content from serving.com (moving kiwix.org as the first part of - # the path). - serving_address = rewriter.article_url.replace("kiwix.org", "serving.com/kiwix.org") +@pytest.mark.parametrize( + "article_url, original_content_url, expected_rewriten_content_url, know_paths, " + "rewrite_all_url", + [ + ( + "https://kiwix.org/a/article/document.html", + "/foo.html", + "../../foo.html", + ["kiwix.org/foo.html"], + False, + ), + ( + "https://kiwix.org/a/article/document.html", + "/bar.html", + "https://kiwix.org/bar.html", # Full URL since not in known URLs + ["kiwix.org/foo.html"], + False, + ), + ( + "https://kiwix.org/a/article/document.html", + "/bar.html", + "../../bar.html", # all URLs rewrite activated + ["kiwix.org/foo.html"], + True, + ), + ], +) +def test_absolute_path_url( + article_url, + know_paths, + original_content_url, + expected_rewriten_content_url, + rewrite_all_url, +): + article_url = HttpUrl(article_url) + rewriter = ArticleUrlRewriter( + article_url, + {ZimPath(path) for path in know_paths}, + ) + assert ( + rewriter(original_content_url, rewrite_all_url=rewrite_all_url) + == expected_rewriten_content_url + ) - for url in [ - "https://exemple.com/foo", - "http://exemple.com/foo/bar", - "http://kiwix.org/baz", - ]: - rewriten = rewriter(url) - # Must produce a relative link - assert not rewriten.startswith("/") - # Relative link must be resolved to a absolute url in the serving domain with a - # path containing article_url's host. - # We don't care about scheme, always use what we are serving from. - assert ( - urljoin(serving_address, rewriten) - == "https://serving.com" + url.split(":", 1)[1][1:] - ) +@pytest.mark.parametrize( + "article_url, original_content_url, expected_rewriten_content_url, know_paths, " + "rewrite_all_url", + [ + ( + "https://kiwix.org/a/article/document.html", + "//kiwix.org/foo.html", + "../../foo.html", + ["kiwix.org/foo.html"], + False, + ), + ( + "https://kiwix.org/a/article/document.html", + "//kiwix.org/bar.html", + "https://kiwix.org/bar.html", # Full URL since not in known URLs + ["kiwix.org/foo.html"], + False, + ), + ( + "https://kiwix.org/a/article/document.html", + "//kiwix.org/bar.html", + "../../bar.html", # all URLs rewrite activated + ["kiwix.org/foo.html"], + True, + ), + ( + "https://kiwix.org/a/article/document.html", + "//acme.com/foo.html", + "../../../acme.com/foo.html", + ["acme.com/foo.html"], + False, + ), + ( + "http://kiwix.org/a/article/document.html", + "//acme.com/bar.html", + "http://acme.com/bar.html", # Full URL since not in known URLs + ["kiwix.org/foo.html"], + False, + ), + ( + "https://kiwix.org/a/article/document.html", + "//acme.com/bar.html", + "../../../acme.com/bar.html", # all URLs rewrite activated + ["kiwix.org/foo.html"], + True, + ), + ( # puny-encoded host is transformed into url-encoded value + "https://kiwix.org/a/article/document.html", + "//xn--exmple-cva.com/a/article/document.html", + "../../../ex%C3%A9mple.com/a/article/document.html", + ["exémple.com/a/article/document.html"], + False, + ), + ( # host who should be puny-encoded ir transformed into url-encoded value + "https://kiwix.org/a/article/document.html", + "//exémple.com/a/article/document.html", + "../../../ex%C3%A9mple.com/a/article/document.html", + ["exémple.com/a/article/document.html"], + False, + ), + ], +) +def test_absolute_scheme_url( + article_url, + know_paths, + original_content_url, + expected_rewriten_content_url, + rewrite_all_url, +): + article_url = HttpUrl(article_url) + rewriter = ArticleUrlRewriter( + article_url, + {ZimPath(path) for path in know_paths}, + ) + assert ( + rewriter(original_content_url, rewrite_all_url=rewrite_all_url) + == expected_rewriten_content_url + ) -def test_no_rewrite_blob_data(rewriter): - for url in ["data:0548datacontent", "blob:exemple.com/url"]: - assert rewriter(url) == url +@pytest.mark.parametrize( + "article_url, original_content_url, expected_rewriten_content_url, know_paths, " + "rewrite_all_url", + [ + ( + "https://kiwix.org/a/article/document.html", + "https://foo.org/a/article/document.html", + "../../../foo.org/a/article/document.html", + ["foo.org/a/article/document.html"], + False, + ), + ( + "https://kiwix.org/a/article/document.html", + "http://foo.org/a/article/document.html", + "../../../foo.org/a/article/document.html", + ["foo.org/a/article/document.html"], + False, + ), + ( + "http://kiwix.org/a/article/document.html", + "https://foo.org/a/article/document.html", + "../../../foo.org/a/article/document.html", + ["foo.org/a/article/document.html"], + False, + ), + ( + "http://kiwix.org/a/article/document.html", + "https://user:password@foo.org:8080/a/article/document.html", + "../../../foo.org/a/article/document.html", + ["foo.org/a/article/document.html"], + False, + ), + ( # Full URL since not in known URLs + "https://kiwix.org/a/article/document.html", + "https://foo.org/a/article/document.html", + "https://foo.org/a/article/document.html", + ["kiwix.org/a/article/foo/"], + False, + ), + ( # all URLs rewrite activated + "https://kiwix.org/a/article/document.html", + "https://foo.org/a/article/document.html", + "../../../foo.org/a/article/document.html", + ["kiwix.org/a/article/foo/"], + True, + ), + ( # puny-encoded host is transformed into url-encoded value + "https://kiwix.org/a/article/document.html", + "https://xn--exmple-cva.com/a/article/document.html", + "../../../ex%C3%A9mple.com/a/article/document.html", + ["exémple.com/a/article/document.html"], + False, + ), + ( # host who should be puny-encoded is transformed into url-encoded value + "https://kiwix.org/a/article/document.html", + "https://exémple.com/a/article/document.html", + "../../../ex%C3%A9mple.com/a/article/document.html", + ["exémple.com/a/article/document.html"], + False, + ), + ], +) +def test_absolute_url( + article_url, + know_paths, + original_content_url, + expected_rewriten_content_url, + rewrite_all_url, +): + article_url = HttpUrl(article_url) + rewriter = ArticleUrlRewriter( + article_url, + {ZimPath(path) for path in know_paths}, + ) + assert ( + rewriter(original_content_url, rewrite_all_url=rewrite_all_url) + == expected_rewriten_content_url + ) -def test_no_rewrite_external_link(rewriter): - for rewrite_all_url in [True, False]: - # We always rewrite "internal" urls - assert "kiwix.org" not in rewriter( - "https://kiwix.org/bar/foo", rewrite_all_url=rewrite_all_url - ) - # External urls are only rewriten if 'rewrite_all_url' is True - assert "kiwix.org" not in rewriter( - "https://kiwix.org/external/link", rewrite_all_url=True +@pytest.mark.parametrize( + "original_content_url, rewrite_all_url", + [ + ("data:0548datacontent", False), + ("blob:exemple.com/url", False), + ("mailto:bob@acme.com", False), + ("tel:+33.1.12.12.23", False), + ("data:0548datacontent", True), + ("blob:exemple.com/url", True), + ("mailto:bob@acme.com", True), + ("tel:+33.1.12.12.23", True), + ], +) +# other schemes are never rewritten, even when rewrite_all_url is true +def test_no_rewrite_other_schemes(original_content_url, rewrite_all_url): + article_url = HttpUrl("https://kiwix.org/a/article/document.html") + rewriter = ArticleUrlRewriter( + article_url, + set(), ) assert ( - rewriter("https://kiwix.org/external/link", rewrite_all_url=False) - == "https://kiwix.org/external/link" + rewriter(original_content_url, rewrite_all_url=rewrite_all_url) + == original_content_url ) diff --git a/tests/test_warc_to_zim.py b/tests/test_warc_to_zim.py index a48c4fe1..7855873c 100644 --- a/tests/test_warc_to_zim.py +++ b/tests/test_warc_to_zim.py @@ -6,6 +6,7 @@ import os import re import time +from urllib.parse import unquote import pytest import requests @@ -15,7 +16,7 @@ from warc2zim.__about__ import __version__ from warc2zim.converter import iter_warc_records from warc2zim.main import main -from warc2zim.url_rewriting import normalize +from warc2zim.url_rewriting import HttpUrl, ZimPath, normalize from warc2zim.utils import get_record_url TEST_DATA_DIR = os.path.join(os.path.dirname(os.path.realpath(__file__)), "data") @@ -27,8 +28,6 @@ ["example-response.warc"], ["example-response.warc", "--progress-file", "progress.json"], ["example-response.warc", "--scraper-suffix", SCRAPER_SUFFIX], - ["example-resource.warc.gz", "--favicon", "https://example.com/some/favicon.ico"], - ["example-resource.warc.gz", "--favicon", "https://www.google.com/favicon.ico"], ["example-revisit.warc.gz"], [ "example-revisit.warc.gz", @@ -154,6 +153,24 @@ def verify_warc_and_zim(self, warcfile, zimfile, verify_scraper_suffix): url_no_scheme = re.sub(r"\?\d+$", "?", url_no_scheme) + # remove user/password + if "@" in url_no_scheme: + at_index = url_no_scheme.index("@") + if at_index >= 0: + if "/" in url_no_scheme: + slash_index = url_no_scheme.index("/") + if at_index < slash_index: + url_no_scheme = url_no_scheme[at_index + 1 :] + else: + url_no_scheme = url_no_scheme[at_index + 1 :] + + # remove trailing ? + if url_no_scheme.endswith("?"): + url_no_scheme = url_no_scheme[:-1] + + # unquote url since everything is not encoded in ZIM + url_no_scheme = unquote(url_no_scheme) + # ensure payloads match try: payload = zim_fh.get_item(url_no_scheme) @@ -199,38 +216,97 @@ def rebuild_favicon_bytes(self, zim, favicon_path) -> bytes: resize_image(dst, width=48, height=48, method="cover") return dst.getvalue() - def test_normalize(self): - assert normalize(None) is None - assert normalize("") == "" - assert normalize("https://exemple.com") == "exemple.com" - assert normalize("https://exemple.com/") == "exemple.com/" - assert normalize("http://example.com/?foo=bar") == "example.com/?foo=bar" - - assert normalize("https://example.com/?foo=bar") == "example.com/?foo=bar" - - assert ( - normalize("https://example.com/some/path/http://example.com/?foo=bar") - == "example.com/some/path/http://example.com/?foo=bar" - ) - - assert ( - normalize("example.com/some/path/http://example.com/?foo=bar") - == "example.com/some/path/http://example.com/?foo=bar" - ) - - assert ( - normalize("http://example.com/path/with/final/slash/") - == "example.com/path/with/final/slash/" - ) - - assert normalize("http://test@example.com/") == "test@example.com/" - - assert ( - normalize( - "http://lesfondamentaux.reseau-canope.fr/fileadmin/template/css/main.css?1588230493" - ) - == "lesfondamentaux.reseau-canope.fr/fileadmin/template/css/main.css?" - ) + @pytest.mark.parametrize( + "url,zim_path", + [ + ("https://exemple.com", "exemple.com/"), + ("https://exemple.com/", "exemple.com/"), + ("http://example.com/resource", "example.com/resource"), + ("http://example.com/resource/", "example.com/resource/"), + ( + "http://example.com/resource/folder/sub.txt", + "example.com/resource/folder/sub.txt", + ), + ( + "http://example.com/resource/folder/sub", + "example.com/resource/folder/sub", + ), + ( + "http://example.com/resource/folder/sub?foo=bar", + "example.com/resource/folder/sub?foo=bar", + ), + ( + "http://example.com/resource/folder/sub?foo=bar#anchor1", + "example.com/resource/folder/sub?foo=bar", + ), + ("http://example.com/resource/#anchor1", "example.com/resource/"), + ("http://example.com/resource/?foo=bar", "example.com/resource/?foo=bar"), + ("http://example.com#anchor1", "example.com/"), + ("http://example.com?foo=bar#anchor1", "example.com/?foo=bar"), + ("http://example.com/?foo=bar", "example.com/?foo=bar"), + ("http://example.com/?foo=ba+r", "example.com/?foo=ba r"), + ( + "http://example.com/?foo=ba r", + "example.com/?foo=ba r", + ), # situation where the ` ` has not been properly escaped in document + ("http://example.com/?foo=ba%2Br", "example.com/?foo=ba+r"), + ("http://example.com/?foo=ba+%2B+r", "example.com/?foo=ba + r"), + ("http://example.com/#anchor1", "example.com/"), + ( + "http://example.com/some/path/http://example.com//some/path", + "example.com/some/path/http://example.com//some/path", + ), + ( + "http://example.com/some/pa?th/http://example.com//some/path", + "example.com/some/pa?th/http://example.com//some/path", + ), + ( + "http://example.com/so?me/pa?th/http://example.com//some/path", + "example.com/so?me/pa?th/http://example.com//some/path", + ), + ("http://example.com/resource?", "example.com/resource"), + ("http://example.com/resource#", "example.com/resource"), + ("http://user@example.com/resource", "example.com/resource"), + ("http://user:password@example.com/resource", "example.com/resource"), + ("http://example.com:8080/resource", "example.com/resource"), + ( + "http://foobargooglevideo.com/videoplayback?id=1576&key=value", + "youtube.fuzzy.replayweb.page/videoplayback?id=1576", + ), # Fuzzy rule is applied in addition to path transformations + ("https://xn--exmple-cva.com", "exémple.com/"), + ("https://xn--exmple-cva.com/", "exémple.com/"), + ("https://xn--exmple-cva.com/resource", "exémple.com/resource"), + ("https://exémple.com/", "exémple.com/"), + ("https://exémple.com/resource", "exémple.com/resource"), + ("https://host_ip/", "host_ip/"), + ("https://host_ip/resource", "host_ip/resource"), + ("http://example.com/res%24urce", "example.com/res$urce"), + ( + "http://example.com/resource?foo=b%24r", + "example.com/resource?foo=b$r", + ), + ("http://example.com/resource@300x", "example.com/resource@300x"), + ("http://example.com:8080/resource", "example.com/resource"), + ("http://user@example.com:8080/resource", "example.com/resource"), + ("http://user:password@example.com:8080/resource", "example.com/resource"), + # the two URI below are an illustration of a potential collision (two + # differents URI leading to the same ZIM path) + ( + "http://tmp.kiwix.org/ci/test-website/images/urlencoding1_ico%CC%82ne-" + "de%CC%81buter-Solidarite%CC%81-Nume%CC%81rique_1%40300x.png", + "tmp.kiwix.org/ci/test-website/images/urlencoding1_icône-débuter-" + "Solidarité-Numérique_1@300x.png", + ), + ( + "https://tmp.kiwix.org/ci/test-website/images/urlencoding1_ico%CC%82ne-" + "de%CC%81buter-Solidarite%CC%81-Nume%CC%81rique_1@300x.png", + "tmp.kiwix.org/ci/test-website/images/urlencoding1_icône-débuter-" + "Solidarité-Numérique_1@300x.png", + ), + ], + ) + def test_normalize(self, url, zim_path): + assert normalize(HttpUrl(url)) == ZimPath(zim_path) def test_warc_to_zim_specify_params_and_metadata(self, tmp_path): zim_output = "zim-out-filename.zim" @@ -299,7 +375,7 @@ def test_warc_to_zim_specify_params_and_metadata(self, tmp_path): ) assert self.get_metadata(zim_output, "Title") == b"Some Title" - def test_warc_to_zim(self, cmdline, tmp_path): + def test_warc_to_zim_main(self, cmdline, tmp_path): # intput filename filename = cmdline[0] @@ -479,7 +555,7 @@ def test_all_warcs_root_dir(self, tmp_path): ) # timestamp fuzzy match from example-with-timestamp.warc - assert self.get_article(zim_output, "example.com/path.txt?") != b"" + assert self.get_article(zim_output, "example.com/path.txt") != b"" def test_fuzzy_urls(self, tmp_path, fuzzycheck): zim_output = fuzzycheck["filename"] + ".zim" @@ -503,11 +579,7 @@ def test_fuzzy_urls(self, tmp_path, fuzzycheck): def test_error_bad_main_page(self, tmp_path): zim_output_not_created = "zim-out-not-created.zim" - with pytest.raises( - KeyError, - match="Unable to find WARC record for main page: no-such-url.example.com/," - " aborting", - ): + with pytest.raises(KeyError, match="Unable to find WARC record for main page:"): main( [ "-v", From 95492e5b1a815f7477294dd96a4cb1319f3f6540 Mon Sep 17 00:00:00 2001 From: benoit74 Date: Tue, 26 Mar 2024 10:26:38 +0000 Subject: [PATCH 05/10] Add development directories to .gitignore --- .gitignore | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.gitignore b/.gitignore index 9901ece4..f6522214 100644 --- a/.gitignore +++ b/.gitignore @@ -213,3 +213,7 @@ pyrightconfig.json # installed at build time wombat.js + +# temporary directories using during development +output +tmp \ No newline at end of file From c6d593709279b1c59b8858e00173350af74d33fe Mon Sep 17 00:00:00 2001 From: benoit74 Date: Wed, 27 Mar 2024 09:13:06 +0000 Subject: [PATCH 06/10] apply_fuzzy_rules always returns a str indeed --- src/warc2zim/url_rewriting.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/warc2zim/url_rewriting.py b/src/warc2zim/url_rewriting.py index f066ef34..e120299a 100644 --- a/src/warc2zim/url_rewriting.py +++ b/src/warc2zim/url_rewriting.py @@ -185,7 +185,7 @@ def check_validity(cls, value: str) -> None: raise ValueError(f"Unexpected password in value: {value} {parts.password}") -def apply_fuzzy_rules(uri: HttpUrl | str) -> HttpUrl | str: +def apply_fuzzy_rules(uri: HttpUrl | str) -> str: """Apply fuzzy rules on a URL or relative path First matching fuzzy rule matching the input value is applied and its result @@ -275,9 +275,6 @@ def normalize(url: HttpUrl) -> ZimPath: fuzzified_url = apply_fuzzy_rules(f"{hostname}{path}{query}") - if not isinstance(fuzzified_url, str): - raise ValueError("Inappropriate value returned, should be str") - return ZimPath(fuzzified_url) From eeafb62c460f8a0b49a99d938a316bcbae6c40a7 Mon Sep 17 00:00:00 2001 From: benoit74 Date: Wed, 27 Mar 2024 09:13:12 +0000 Subject: [PATCH 07/10] Fix typo --- src/warc2zim/url_rewriting.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/warc2zim/url_rewriting.py b/src/warc2zim/url_rewriting.py index e120299a..e9a9ebb2 100644 --- a/src/warc2zim/url_rewriting.py +++ b/src/warc2zim/url_rewriting.py @@ -210,7 +210,7 @@ def normalize(url: HttpUrl) -> ZimPath: The transformation drops the URL scheme, username, password, port and fragment: - we suppose there is no conflict of URL scheme or port: there is no two ressources - with same hostname, path and querystring but different URL schemeor port leading + with same hostname, path and querystring but different URL scheme or port leading to different content - we consider username/password port are purely authentication mechanism which have no impact on the content to server From 0b64bd4dd47acc753cdde11438ff4e56c1e4c1f3 Mon Sep 17 00:00:00 2001 From: benoit74 Date: Wed, 27 Mar 2024 09:30:51 +0000 Subject: [PATCH 08/10] Use PurePosixPath now that it can walk_up in Python 3.12 --- src/warc2zim/url_rewriting.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/src/warc2zim/url_rewriting.py b/src/warc2zim/url_rewriting.py index e9a9ebb2..99564b29 100644 --- a/src/warc2zim/url_rewriting.py +++ b/src/warc2zim/url_rewriting.py @@ -44,8 +44,8 @@ from __future__ import annotations import logging -import posixpath import re +from pathlib import PurePosixPath from urllib.parse import ( quote, unquote, @@ -340,11 +340,17 @@ def get_document_uri(self, item_path: ZimPath, item_fragment: str) -> str: item_url = item_parts.path if item_parts.query: item_url += "?" + item_parts.query - - relative_path = posixpath.relpath( - item_url, posixpath.dirname(self.article_path.value) + relative_path = str( + PurePosixPath(item_url).relative_to( + ( + PurePosixPath(self.article_path.value) + if self.article_path.value.endswith("/") + else PurePosixPath(self.article_path.value).parent + ), + walk_up=True, + ) ) - # relpath removes a potential last '/' in the path, we add it back + # relative_to removes a potential last '/' in the path, we add it back if item_path.value.endswith("/"): relative_path += "/" From 18cf34f49f72f6032dfa8bd605be1770e870a9c6 Mon Sep 17 00:00:00 2001 From: benoit74 Date: Wed, 27 Mar 2024 09:35:38 +0000 Subject: [PATCH 09/10] Rename variables for clarity --- src/warc2zim/converter.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/src/warc2zim/converter.py b/src/warc2zim/converter.py index 9d231999..04908459 100644 --- a/src/warc2zim/converter.py +++ b/src/warc2zim/converter.py @@ -132,9 +132,9 @@ def __init__(self, args): self.custom_css = args.custom_css - self.indexed_urls = set({}) + self.added_zim_items: set[ZimPath] = set() self.revisits: dict[ZimPath, ZimPath] = {} - self.existing_zim_paths: set[ZimPath] = set() + self.expected_zim_items: set[ZimPath] = set() # progress file handling self.stats_filename = ( @@ -272,7 +272,7 @@ def run(self): # process revisits for normalized_url, target_url in self.revisits.items(): - if normalized_url not in self.indexed_urls: + if normalized_url not in self.added_zim_items: logger.debug(f"Adding alias {normalized_url} -> {target_url}") try: self.creator.add_alias( @@ -281,7 +281,7 @@ def run(self): except RuntimeError as exc: if not ALIAS_EXC_STR.match(str(exc)): raise exc - self.indexed_urls.add(normalized_url) + self.added_zim_items.add(normalized_url) logger.debug(f"Found {self.total_records} records in WARCs") @@ -306,7 +306,7 @@ def gather_information_from_warc(self): url = get_record_url(record) zim_path = normalize(HttpUrl(url)) - self.existing_zim_paths.add(zim_path) + self.expected_zim_items.add(zim_path) if main_page_found: continue @@ -525,7 +525,7 @@ def add_items_for_warc_record(self, record): logger.debug(f"Skipping record with empty WARC-Target-URI {record}") return - normalized_url = normalize(HttpUrl(url)) + item_zim_path = normalize(HttpUrl(url)) # if include_domains is set, only include urls from those domains if self.include_domains: @@ -536,7 +536,7 @@ def add_items_for_warc_record(self, record): logger.debug(f"Skipping url {url}, outside included domains") return - if normalized_url in self.indexed_urls: + if item_zim_path in self.added_zim_items: logger.debug(f"Skipping duplicate {url}, already added to ZIM") return @@ -546,11 +546,11 @@ def add_items_for_warc_record(self, record): return payload_item = WARCPayloadItem( - normalized_url.value, + item_zim_path.value, record, self.head_template, self.css_insert, - self.existing_zim_paths, + self.expected_zim_items, ) if len(payload_item.content) != 0: @@ -562,14 +562,14 @@ def add_items_for_warc_record(self, record): self.total_records += 1 self.update_stats() - self.indexed_urls.add(normalized_url) + self.added_zim_items.add(item_zim_path) elif ( record.rec_type == "revisit" and record.rec_headers["WARC-Refers-To-Target-URI"] != url - and normalized_url not in self.revisits + and item_zim_path not in self.revisits ): # pragma: no branch - self.revisits[normalized_url] = normalize( + self.revisits[item_zim_path] = normalize( HttpUrl(record.rec_headers["WARC-Refers-To-Target-URI"]) ) From 83438d6f9ff71ec85d2ab72c4fb283c518916dd5 Mon Sep 17 00:00:00 2001 From: benoit74 Date: Wed, 27 Mar 2024 14:13:19 +0000 Subject: [PATCH 10/10] Remove extra Youtube rule which is not needed anymore This rule was needed most probably only because of a trailing ? in some URLs --- src/warc2zim/url_rewriting.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/src/warc2zim/url_rewriting.py b/src/warc2zim/url_rewriting.py index 99564b29..1adacd01 100644 --- a/src/warc2zim/url_rewriting.py +++ b/src/warc2zim/url_rewriting.py @@ -81,13 +81,6 @@ "pattern": r"(?:www\.)?youtube(?:-nocookie)?\.com/embed/([^?]+).*", "replace": r"youtube.fuzzy.replayweb.page/embed/\1", }, - # This is an extra rule for JS rewriting only. - # Youtube replayer may add thing to an already rewrited url. We have to clean it - # again - { - "pattern": r"youtube.fuzzy.replayweb.page/embed/([^?&]+).*", - "replace": r"youtube.fuzzy.replayweb.page/embed/\1", - }, { "pattern": r".*(?:gcs-vimeo|vod|vod-progressive)\.akamaized\.net.*?/([\d/]+" r".mp4)$",