From 44a54d440e6d6714a15f27044c75a84de0ddb0ae Mon Sep 17 00:00:00 2001
From: benoit74 <benoit74@users.noreply.github.com>
Date: Fri, 15 Mar 2024 20:27:04 +0000
Subject: [PATCH 01/10] Fix improperly escaped character in test-website

---
 test-website/url-encoding.html | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/test-website/url-encoding.html b/test-website/url-encoding.html
index 6a983537..46bb15bb 100644
--- a/test-website/url-encoding.html
+++ b/test-website/url-encoding.html
@@ -33,7 +33,7 @@ <h2>Image with non-encoded chars in name and a mix of non-encoded and already en
     <h2>Image with non-encoded chars in name with non-encoded special chars in link and a query parameter and fragment with reserved chars</h2>
 
     <p>An image should be displayed below as well</p>
-    <img width="200" src="./images/urlencoding1_icône-débuter-Solidarité-Numérique_1@300x.png?param1=value1&param2=value%262#anchor%233" />
+    <img width="200" src="./images/urlencoding1_icône-débuter-Solidarité-Numérique_1@300x.png?param1=value1&amp;param2=value%262#anchor%233" />
 
     <hr/>
 

From def7bc1d67561b5009ccd4feb43aec1f2a33f275 Mon Sep 17 00:00:00 2001
From: benoit74 <benoit74@users.noreply.github.com>
Date: Sat, 16 Mar 2024 06:38:33 +0000
Subject: [PATCH 02/10] Rewrite fuzzy rule to not contain the trailing ? when
 querystring is empty

---
 src/warc2zim/url_rewriting.py | 2 +-
 tests/test_fuzzy_rules.py     | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/warc2zim/url_rewriting.py b/src/warc2zim/url_rewriting.py
index 7a62fa78..96488f8c 100644
--- a/src/warc2zim/url_rewriting.py
+++ b/src/warc2zim/url_rewriting.py
@@ -71,7 +71,7 @@
         r"=[^&]+).*",
         "replace": r"youtube.fuzzy.replayweb.page/\1\2",
     },
-    {"pattern": r"([^?]+\?)[\d]+$", "replace": r"\1"},
+    {"pattern": r"([^?]+)\?[\d]+$", "replace": r"\1"},
     {
         "pattern": r"(?:www\.)?youtube(?:-nocookie)?\.com\/(youtubei\/[^?]+).*(videoId["
         r"^&]+).*",
diff --git a/tests/test_fuzzy_rules.py b/tests/test_fuzzy_rules.py
index 92dd0368..3fe12bcb 100644
--- a/tests/test_fuzzy_rules.py
+++ b/tests/test_fuzzy_rules.py
@@ -87,7 +87,7 @@ def test_fuzzyrules_google_video_infos(google_video_info_case):
     params=[
         ContentForTests(
             "www.example.com/page?1234",
-            "www.example.com/page?",
+            "www.example.com/page",
         ),
         ContentForTests(
             "www.example.com/page?foo=1234",

From 40edf3c49124de02471b2972d9b0b7e1d5576ca1 Mon Sep 17 00:00:00 2001
From: benoit74 <benoit74@users.noreply.github.com>
Date: Mon, 18 Mar 2024 09:56:11 +0000
Subject: [PATCH 03/10] Really do not consider 'resource' WARC record for all
 operations

---
 src/warc2zim/converter.py           |  10 ++++++++++
 tests/data/example-resource.warc.gz | Bin 1779 -> 0 bytes
 2 files changed, 10 insertions(+)
 delete mode 100644 tests/data/example-resource.warc.gz

diff --git a/src/warc2zim/converter.py b/src/warc2zim/converter.py
index 8625af9b..10089930 100644
--- a/src/warc2zim/converter.py
+++ b/src/warc2zim/converter.py
@@ -292,6 +292,12 @@ def iter_all_warc_records(self):
     def gather_information_from_warc(self):
         main_page_found = False
         for record in iter_warc_records(self.inputs):
+
+            # only response records can be considered as main_path and as existing ZIM
+            # path
+            if record.rec_type not in ("response", "revisit"):
+                continue
+
             url = get_record_url(record)
             normalized_url = normalize(url)
 
@@ -500,6 +506,10 @@ def is_self_redirect(self, record, url):
         return normalize(url) == normalize(location)
 
     def add_items_for_warc_record(self, record):
+
+        if record.rec_type not in ("response", "revisit"):
+            return
+
         url = get_record_url(record)
         normalized_url = normalize(url)
         if not url:
diff --git a/tests/data/example-resource.warc.gz b/tests/data/example-resource.warc.gz
deleted file mode 100644
index 91737eeebb0e62e5308753786cfe9373f5653f2f..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 1779
zcmV<P1`PQhiwFP!0000214U3>Yr-%TeJ}Jsgu>nyP1-nX7=xiW27`$>1z$#*rkc?v
zCAn_s*neLlqHi}jU-#V2^Kv?*1Cl12FyYm)R~-EaC$(wp<#VZ|bv1sSarAg_h93^P
z=Hn5ORWT}YF2@z7RE=?2Rt+XO%Y~#lA*6h|7|eC6jOc>12LW1QZ$$6+7Cg=g9TS#M
z2rUSsgP1(noi4hWfS`#KOc>55D?)ieIHTJ%nOXzNz-436_pO$}+nQnYz>QXI?QgU<
zWp=O+W^9rsY2s}Ifftk;Rk?FN<<O!P&{)@@?_yA<NgM?PO4tR(4rqS<LFao(rvzBX
z5vZ=mFU!q#{jpl7iS`TO;Jx8!6#(mmJKp7FYTH)HxL=xld24W|K-2;RN52__4jD(8
zyppo}@QMMd{uB@}$|glP6Qd*`wem9ZBg^Yk)Ut<BL_kA(wz_7cTXOantEb67U{$k@
z^Z@_>ABzY8000000t0PPU2EJh5PUcIKL|nIDqFH$d={k;?vhd{G+eGZNGL@o%RU8L
zGSXh#B;>zG@?H7?dDG6!j&^6a=j&z8GMXma@FJUsNpbW`IH^r%PvBZf>ss=8#nFd@
zGyHJSEwA5G**27tLe@LNn5xNQvFiveszOLsP)ZjcPX?EIP)3Zw*@FNri8o^MdkdZv
zl-0B<8p=vqu`K?a^^d2%m4MI_C4^R_Xf~AblJbh(rpeM8PzFvDi)k9P4Bpn{(Svka
z4Xytx`=U$_dydMcNs~13wu7Jxom;hY^M1;qt3g0#-H3jc*&<EiQ$Ub}Sy0>oUCs!a
z^O8;pu#O{8<3v8Kx3@RnHaBUa{grU=wc}_L2<wD<I4kJV4ug{OeAjaZ0lR)pAx-YR
zHDpvES^<KiCp^2W_J;t>5v!UqWK@;(9l~Bes(>|h9g0cx%7^Oi34<P0h<BVqiT{nY
zq&O%KZivJlVB&eMeh1;8GHJ&=vLO1|2ESege~+k!A^wFgj&iiJqtFIF|3@D8L4UTH
z{lZlucZ^n>ejZ_J-`eRp9wbd(080|(oQeVf03VA80000000IM*RBcb=NDTg7iT~iu
zeF3R?YoRS6jgAEbXyM?xWqJ9snkG&&YLdy$O!{)C{qO6fp|l4l-L=(p#$!LWACLWb
zIy$pm%h_u^G8vj9j8UTWAkn%|o^3C1A~6&)LTPbfYs?Dsa`gD}?P2__KGCAIz1FcS
zD>A|*%Ze1OP-J!k{Gd>pr&G+e2e;xco~Fz`vK-jIWg*YCP{|2g=NeNW0o;9sCvctb
zuKV3RfYXA&9&z%8Yx}6%x4J#p|8h0Hy=g<rX9$-V&cp$nB~oN)9}O(W>N=fcYv96H
z7Bk5kD+-0OHOH$0J&>qGDMM_8qdEce`rHG*l(|=yJo0+jk9$tU%ud*!n68U`ld-sO
zI-SD~3*C<6IQ`%imB}e;^I>$oCht8BmDV}a<h$dz1afG0j>nGcbshS>Gz#}@l?oP_
z=X^@MdY}^Kde2WcR~IjLz31zT`<tuN&zEO^cE-1z!^hFx{ZsGdva$V~ig5OSEzbn8
zp6iAcjY~~c%+eDG6DAevVX0$t@TPZz`BWz!xLv2a*V=3OpU&^j#?SW`P?-gOwZbw>
zb2T&?9u|v*wdh(xPHp%2_}E@nItHmCi-J}Q=;LdyQw%QFjKR6c7|(rsEp1dX)NB)#
zQHN#N!PZ^n&cq)jpAU`Ka5bf6K)5~)jepB;7bs^L4vjg-MIof#GGFjWC&LKmJVdiT
zw1MYbbC#MaWGN2a4U*C;%B1q&xDAyu)^`&Tt>Eu^TOV0Cn@UmUkr|3qQ1|^7JM`_G
zY*aFOT|D=|c@q^Zig-TVk;N3K8MBP1D-Vpj0(0tsTp4XJt}vaW<{^VWutbt<1Z_B#
zlt3F4shZTMJbpv0H$Uq&;`6Pb>+C(~IZm;BALkKfz)CGX2={2m;G^8*E&r2x^q7)G
zyj1k+Fgw`_iFql{XKnhQbHz1ATLHepbq9xo!4_gw3p1+PooOy8)?DPc4f=bA5ofTU
zv8CB$*{`xZ*e=;_1^-JXKR3HwBDcugWo)PJAKd+e8fW)*!}o@=w;<N5Z`V&WsLSlz
zO*i^gYC+4VAe)DrxbOOsMC-dE7$;moRLg;OG<w)1<tm|CXaN(1QqeXR5@_a^N>{AU
z5t`YksC~#fEHmmaWx=y3Dd$yX1e%c753fPb->eo~C!*9)NKVM1AS!rX*AwW>)QnQE
zofarFt`uEZzFlmh_Gwy3jCV#9XF02f)Y_kVo!>@qD<qP8!pmwOmPKGJr~!SuW{QZj
VYv%lVyad%~`~{1=zu{a5007e*U=siU


From 63bf7a9fd9c71766d416964aff0f45eeac304003 Mon Sep 17 00:00:00 2001
From: benoit74 <benoit74@users.noreply.github.com>
Date: Mon, 18 Mar 2024 09:56:32 +0000
Subject: [PATCH 04/10] Rework transformation of WARC record url to ZIM path
 and URL normalization to support encoded URL and query strings

---
 src/warc2zim/content_rewriting/generic.py |  12 +-
 src/warc2zim/content_rewriting/html.py    |   4 +-
 src/warc2zim/content_rewriting/js.py      |   5 +-
 src/warc2zim/converter.py                 |  83 ++--
 src/warc2zim/items.py                     |   5 +-
 src/warc2zim/url_rewriting.py             | 322 +++++++++++----
 tests/test_css_rewriting.py               |  36 +-
 tests/test_fuzzy_rules.py                 |  23 +-
 tests/test_html_rewriting.py              |  35 +-
 tests/test_js_rewriting.py                |   6 +-
 tests/test_url_rewriting.py               | 455 +++++++++++++++++-----
 tests/test_warc_to_zim.py                 | 156 ++++++--
 12 files changed, 852 insertions(+), 290 deletions(-)

diff --git a/src/warc2zim/content_rewriting/generic.py b/src/warc2zim/content_rewriting/generic.py
index 280338c8..4a35aeaa 100644
--- a/src/warc2zim/content_rewriting/generic.py
+++ b/src/warc2zim/content_rewriting/generic.py
@@ -11,7 +11,7 @@
 from warc2zim.content_rewriting.html import HtmlRewriter
 from warc2zim.content_rewriting.js import JsRewriter
 from warc2zim.content_rewriting.rx_replacer import RxRewriter
-from warc2zim.url_rewriting import ArticleUrlRewriter
+from warc2zim.url_rewriting import ArticleUrlRewriter, HttpUrl, ZimPath
 from warc2zim.utils import (
     get_record_content,
     get_record_encoding,
@@ -59,7 +59,7 @@ def __init__(
         self,
         path: str,
         record: ArcWarcRecord,
-        known_urls: set[str],
+        existing_zim_paths: set[ZimPath],
     ):
         self.content = get_record_content(record)
 
@@ -69,7 +69,9 @@ def __init__(
 
         self.path = path
         self.orig_url_str = get_record_url(record)
-        self.url_rewriter = ArticleUrlRewriter(self.orig_url_str, known_urls)
+        self.url_rewriter = ArticleUrlRewriter(
+            HttpUrl(self.orig_url_str), existing_zim_paths
+        )
 
         self.rewrite_mode = self.get_rewrite_mode(record, mimetype)
 
@@ -133,7 +135,9 @@ def get_rewrite_mode(self, record, mimetype):
     def rewrite_html(self, head_template: Template, css_insert: str | None):
         orig_url = urlsplit(self.orig_url_str)
 
-        rel_static_prefix = self.url_rewriter.from_normalized("_zim_static/")
+        rel_static_prefix = self.url_rewriter.get_document_uri(
+            ZimPath("_zim_static/"), ""
+        )
         head_insert = head_template.render(
             path=self.path,
             static_prefix=rel_static_prefix,
diff --git a/src/warc2zim/content_rewriting/html.py b/src/warc2zim/content_rewriting/html.py
index e938f2b1..1e724a12 100644
--- a/src/warc2zim/content_rewriting/html.py
+++ b/src/warc2zim/content_rewriting/html.py
@@ -134,12 +134,12 @@ def handle_data(self, data: str):
         elif self._active_tag == "style":
             data = self.css_rewriter.rewrite(data)
         elif self._active_tag == "script":
-            rules = get_ds_rules(self.url_rewriter.article_url)
+            rules = get_ds_rules(self.url_rewriter.article_url.value)
             if data.strip():
                 data = JsRewriter(self.url_rewriter, rules).rewrite(data)
         elif self._active_tag == "json":
             if data.strip():
-                rules = get_ds_rules(self.url_rewriter.article_url)
+                rules = get_ds_rules(self.url_rewriter.article_url.value)
                 if rules:
                     data = RxRewriter(rules).rewrite(data, {})
         self.send(data)
diff --git a/src/warc2zim/content_rewriting/js.py b/src/warc2zim/content_rewriting/js.py
index 24baba46..9c93a743 100644
--- a/src/warc2zim/content_rewriting/js.py
+++ b/src/warc2zim/content_rewriting/js.py
@@ -12,6 +12,7 @@
     replace,
     replace_prefix_from,
 )
+from warc2zim.url_rewriting import ZimPath
 
 # Regex used to check if we ar import or exporting things in the string
 # ie : If we are a module
@@ -236,8 +237,8 @@ def _get_module_decl(self, local_decls: Iterable[str]) -> str:
 
         This will be added to script only if the script is a module script.
         """
-        wb_module_decl_url = self.url_rewriter.from_normalized(
-            "_zim_static/__wb_module_decl.js"
+        wb_module_decl_url = self.url_rewriter.get_document_uri(
+            ZimPath("_zim_static/__wb_module_decl.js"), ""
         )
         return (
             f"""import {{ {", ".join(local_decls)} }} from "{wb_module_decl_url}";\n"""
diff --git a/src/warc2zim/converter.py b/src/warc2zim/converter.py
index 10089930..9d231999 100644
--- a/src/warc2zim/converter.py
+++ b/src/warc2zim/converter.py
@@ -23,7 +23,7 @@
 import time
 from http import HTTPStatus
 from pathlib import Path
-from urllib.parse import urldefrag, urljoin, urlsplit, urlunsplit
+from urllib.parse import urljoin, urlsplit, urlunsplit
 
 import requests
 
@@ -46,7 +46,7 @@
 
 from warc2zim.constants import logger
 from warc2zim.items import StaticArticle, WARCPayloadItem
-from warc2zim.url_rewriting import FUZZY_RULES, normalize
+from warc2zim.url_rewriting import FUZZY_RULES, HttpUrl, ZimPath, normalize
 from warc2zim.utils import (
     get_record_content,
     get_record_mime_type,
@@ -88,14 +88,15 @@ def __init__(self, args):
             for handler in logger.handlers:
                 handler.setLevel(logging.DEBUG)
 
-        main_url: str = args.url
+        main_url: str | None = str(args.url) if args.url else None
         # ensure trailing slash is added if missing
-        parts = urlsplit(main_url)
-        if parts.path == "":
-            parts = list(parts)
-            # set path
-            parts[2] = "/"
-            main_url = urlunsplit(parts)
+        if main_url:
+            parts = urlsplit(main_url)
+            if parts.path == "":
+                parts = list(parts)
+                # set path
+                parts[2] = "/"
+                main_url = urlunsplit(parts)
 
         self.name = args.name
         self.title = args.title
@@ -107,10 +108,10 @@ def __init__(self, args):
         self.creator_metadata = args.creator
         self.publisher = args.publisher
         self.tags = DEFAULT_TAGS + (args.tags or [])
-        self.source: str = args.source or main_url
+        self.source: str | None = str(args.source) if args.source else None or main_url
         self.scraper = "warc2zim " + get_version()
         self.illustration = b""
-        self.main_path = normalize(main_url)
+        self.main_path = normalize(HttpUrl(main_url)) if main_url else None
 
         self.output = Path(args.output)
         self.zim_file = args.zim_file
@@ -132,8 +133,8 @@ def __init__(self, args):
         self.custom_css = args.custom_css
 
         self.indexed_urls = set({})
-        self.revisits = {}
-        self.warc_urls = set({})
+        self.revisits: dict[ZimPath, ZimPath] = {}
+        self.existing_zim_paths: set[ZimPath] = set()
 
         # progress file handling
         self.stats_filename = (
@@ -202,6 +203,8 @@ def run(self):
             return 100
 
         self.gather_information_from_warc()
+        if not self.main_path:
+            raise ValueError("Unable to find main path, aborting")
         self.title = self.title or "Untitled"
         if len(self.title) > RECOMMENDED_MAX_TITLE_LENGTH:
             self.title = f"{self.title[0:29]}…"
@@ -229,7 +232,7 @@ def run(self):
 
         self.creator = Creator(
             self.full_filename,
-            main_path=self.main_path,
+            main_path=self.main_path.value,
         )
 
         self.creator.config_metadata(
@@ -250,7 +253,7 @@ def run(self):
         for filename in importlib.resources.files("warc2zim.statics").iterdir():
             with importlib.resources.as_file(filename) as file:
                 self.creator.add_item(
-                    StaticArticle(filename=file, main_path=self.main_path)
+                    StaticArticle(filename=file, main_path=self.main_path.value)
                 )
 
         # Add wombat_setup.js
@@ -272,7 +275,9 @@ def run(self):
             if normalized_url not in self.indexed_urls:
                 logger.debug(f"Adding alias {normalized_url} -> {target_url}")
                 try:
-                    self.creator.add_alias(normalized_url, "", target_url, {})
+                    self.creator.add_alias(
+                        normalized_url.value, "", target_url.value, {}
+                    )
                 except RuntimeError as exc:
                     if not ALIAS_EXC_STR.match(str(exc)):
                         raise exc
@@ -299,9 +304,9 @@ def gather_information_from_warc(self):
                 continue
 
             url = get_record_url(record)
-            normalized_url = normalize(url)
+            zim_path = normalize(HttpUrl(url))
 
-            self.warc_urls.add(normalized_url)
+            self.existing_zim_paths.add(zim_path)
 
             if main_page_found:
                 continue
@@ -322,9 +327,9 @@ def gather_information_from_warc(self):
                     or record.http_headers.get_statuscode() == "200"
                 )
             ):
-                self.main_path = normalized_url
+                self.main_path = zim_path
 
-            if urldefrag(self.main_path).url != normalized_url:
+            if self.main_path != zim_path:
                 continue
 
             # if we get here, found record for the main page
@@ -338,14 +343,16 @@ def gather_information_from_warc(self):
                 ]:
                     original_path = self.main_path
                     self.main_path = normalize(
-                        urljoin(
-                            get_record_url(record),
-                            record.http_headers.get_header("Location").strip(),
+                        HttpUrl(
+                            urljoin(
+                                get_record_url(record),
+                                record.http_headers.get_header("Location").strip(),
+                            )
                         )
                     )
                     logger.warning(
                         f"HTTP {status_code} occurred on main page; "
-                        f"replacing {original_path} with '{self.main_path}'"
+                        f"replacing {original_path} with {self.main_path}"
                     )
                     continue
 
@@ -399,9 +406,11 @@ def find_icon_and_language(self, record, content):
 
             # transform icon URL into WARC path
             self.favicon_path = normalize(
-                urljoin(
-                    get_record_url(record),
-                    icon_url,
+                HttpUrl(
+                    urljoin(
+                        get_record_url(record),
+                        icon_url,
+                    )
                 )
             )
 
@@ -443,7 +452,7 @@ def retrieve_illustration(self):
                 if record.rec_type != "response":
                     continue
                 url = get_record_url(record)
-                path = normalize(url)
+                path = normalize(HttpUrl(url))
                 if path == self.favicon_path or url == self.favicon_url:
                     logger.debug("Found WARC record for favicon")
                     if (
@@ -503,7 +512,8 @@ def is_self_redirect(self, record, url):
             return False
 
         location = record.http_headers.get("Location", "")
-        return normalize(url) == normalize(location)
+        location = urljoin(url, location)
+        return normalize(HttpUrl(url)) == normalize(HttpUrl(location))
 
     def add_items_for_warc_record(self, record):
 
@@ -511,14 +521,11 @@ def add_items_for_warc_record(self, record):
             return
 
         url = get_record_url(record)
-        normalized_url = normalize(url)
         if not url:
             logger.debug(f"Skipping record with empty WARC-Target-URI {record}")
             return
 
-        if normalized_url in self.indexed_urls:
-            logger.debug(f"Skipping duplicate {url}, already added to ZIM")
-            return
+        normalized_url = normalize(HttpUrl(url))
 
         # if include_domains is set, only include urls from those domains
         if self.include_domains:
@@ -529,17 +536,21 @@ def add_items_for_warc_record(self, record):
                 logger.debug(f"Skipping url {url}, outside included domains")
                 return
 
+        if normalized_url in self.indexed_urls:
+            logger.debug(f"Skipping duplicate {url}, already added to ZIM")
+            return
+
         if record.rec_type == "response":
             if self.is_self_redirect(record, url):
                 logger.debug("Skipping self-redirect: " + url)
                 return
 
             payload_item = WARCPayloadItem(
-                normalized_url,
+                normalized_url.value,
                 record,
                 self.head_template,
                 self.css_insert,
-                self.warc_urls,
+                self.existing_zim_paths,
             )
 
             if len(payload_item.content) != 0:
@@ -559,7 +570,7 @@ def add_items_for_warc_record(self, record):
             and normalized_url not in self.revisits
         ):  # pragma: no branch
             self.revisits[normalized_url] = normalize(
-                record.rec_headers["WARC-Refers-To-Target-URI"]
+                HttpUrl(record.rec_headers["WARC-Refers-To-Target-URI"])
             )
 
 
diff --git a/src/warc2zim/items.py b/src/warc2zim/items.py
index 30d63a4d..9168c6e4 100644
--- a/src/warc2zim/items.py
+++ b/src/warc2zim/items.py
@@ -15,6 +15,7 @@
 from zimscraperlib.zim.items import StaticItem
 
 from warc2zim.content_rewriting.generic import Rewriter
+from warc2zim.url_rewriting import ZimPath
 from warc2zim.utils import get_record_mime_type
 
 
@@ -29,13 +30,13 @@ def __init__(
         record: ArcWarcRecord,
         head_template: Template,
         css_insert: str | None,
-        known_urls: set[str],
+        existing_zim_paths: set[ZimPath],
     ):
         super().__init__()
 
         self.path = path
         self.mimetype = get_record_mime_type(record)
-        (self.title, self.content) = Rewriter(path, record, known_urls).rewrite(
+        (self.title, self.content) = Rewriter(path, record, existing_zim_paths).rewrite(
             head_template, css_insert
         )
 
diff --git a/src/warc2zim/url_rewriting.py b/src/warc2zim/url_rewriting.py
index 96488f8c..f066ef34 100644
--- a/src/warc2zim/url_rewriting.py
+++ b/src/warc2zim/url_rewriting.py
@@ -7,42 +7,38 @@
 
 The global scheme is the following:
 
-Entries are stored in the zim file using their urldecoded full path properly urlencoded
-(yes!):
-- The full path is the full url without the scheme (ie :
-  "<host>/<path>(?<query_string)"). The scheme information is lost. We will serve the
-  content using the scheme of the real server, whatever was the scheme of the original
-  url. We probably don't care about different content served from different scheme but
-  with same `host/path`.
-- urldecoded: As most as possible the path itself must not be urlencoded:
+Entries are stored in the ZIM file using their decoded fully decoded path:
+- The full path is the full url without the scheme, username, password, port, fragment
+ (ie : "<host>/<path>(?<query_string)"). See documentation of the `normalize` function
+ for more details.
+- urldecoded: the path itself must not be urlencoded or it would conflict with ZIM
+  specification and readers won't be able to retrieve it, some parts (e.g. querystring)
+  might be absorbed by a web server, ...
   . This is valid : "foo/part with space/bar?key=value"
   . This is NOT valid : "foo/part%20with%20space/bar%3Fkey%3Dvalue"
-- Properly urlencoded: However, for correct parsing, some character may still need to be
-  encoded.
-  The querystring components (and others) must be url encoded as needed:
+- even having multiple ? in a ZIM path is valid
   . This is valid :
-    "foo/part/file with %3F and +?who=Chip%26Dale&quer=Is%20there%20any%20%2B%20here%3F"
-  . This is NOT valid :
     "foo/part/file with ? and +?who=Chip&Dale&question=It there any + here?"
-- Space in query string must be encoded with `%20` not `+`:
-  . This is valid : "foo/part/file?question=Is%20there%20any%20%2B%20here%3F"
-  . This is NOT valid : "foo/part/file?question=Is+there+any+%2B+here%3F"
-
-In python words :
-- full path are `urllib.parse.ParseResults` with `scheme==''`
-- `urllib.parse.urlparse` must correctly parse the path (generating `ParseResults` with
-  empty scheme)
-- The querystring part must be parsable by `urllib.parse.parse_qs` (even if we don't do
-  it here)
-- The querystring must be generated as by `urllib.parse.urlencode(<query>,
-  quote_via=quote)`
-
-On top of that, paths are "reduced" using fuzzy rules:
-A path "https://www.youtube.com/youtubei/v1/foo/baz/things?key=value&other_key=
-other_value&videoId=xxxx&yet_another_key=yet_another_value"
-is reduced to "youtube.fuzzy.replayweb.page/youtubei/v1/foo/baz/things?videoId=xxxx"
+  . This is NOT valid :
+    "foo/part/file with %3F and +?who=Chip%26Dale&quer=Is%20there%20any%20%2B%20here%3F"
+- space in query string must be stored as ` `, not `%2B`, `%20` or `+`, the `+` in a ZIM
+  path means a `%2B in web resource (HTML document, ...):
+  . This is valid : "foo/part/file?question=Is there any + here?"
+  . This is NOT valid : "foo/part/file?question%3DIs%20there%20any%20%2B%20here%3F"
+
+On top of that, fuzzy rules are applied on the ZIM path:
+For instance a path "https://www.youtube.com/youtubei/v1/foo/baz/things?key=value
+&other_key=other_value&videoId=xxxx&yet_another_key=yet_another_value"
+is transformed to "youtube.fuzzy.replayweb.page/youtubei/v1/foo/baz/things?videoId=xxxx"
 by slightly simplifying the path and keeping only the usefull arguments in the
 querystring.
+
+When rewriting documents (HTML, CSS, JS, ...), every time we find a URI to rewrite we
+start by resolving it into an absolute URL (based on the containing document absolute
+URI), applying the transformation to compute the corresponding ZIM path and we
+url-encode the whole ZIM path, so that readers will have one single blob to process,
+url-decode and find corresponding ZIM entry. Only '/' separators are considered safe
+and not url-encoded.
 """
 
 from __future__ import annotations
@@ -52,14 +48,18 @@
 import re
 from urllib.parse import (
     quote,
+    unquote,
     urljoin,
     urlsplit,
     urlunsplit,
 )
 
+import idna
+
 # Shared logger
 logger = logging.getLogger("warc2zim.url_rewriting")
 
+known_bad_hostnames: set[str] = set()
 
 FUZZY_RULES = [
     {
@@ -99,49 +99,186 @@
     },
 ]
 
+
 COMPILED_FUZZY_RULES = [
     {"match": re.compile(rule["pattern"]), "replace": rule["replace"]}
     for rule in FUZZY_RULES
 ]
 
 
-def reduce(path: str) -> str:
-    """Reduce a path"""
+class HttpUrl:
+    """A utility class representing an HTTP url, usefull to pass this data around
+
+    Includes a basic validation, ensuring that URL is encoded, scheme is provided.
+    """
+
+    def __init__(self, value: str) -> None:
+        HttpUrl.check_validity(value)
+        self._value = value
+
+    def __eq__(self, __value: object) -> bool:
+        return isinstance(__value, HttpUrl) and __value.value == self.value
+
+    def __hash__(self) -> int:
+        return self.value.__hash__()
+
+    def __str__(self) -> str:
+        return f"HttpUrl({self.value})"
+
+    @property
+    def value(self) -> str:
+        return self._value
+
+    @classmethod
+    def check_validity(cls, value: str) -> None:
+        parts = urlsplit(value)
+
+        if parts.scheme.lower() not in ["http", "https"]:
+            raise ValueError(
+                f"Incorrect HttpUrl scheme in value: {value} {parts.scheme}"
+            )
+
+        if not parts.hostname:
+            raise ValueError(f"Unsupported empty hostname in value: {value}")
+
+        if parts.hostname.lower() != parts.hostname:
+            raise ValueError(f"Unsupported upper-case chars in hostname : {value}")
+
+
+class ZimPath:
+    """A utility class representing a ZIM path, usefull to pass this data around
+
+    Includes a basic validation, ensuring that path does start with scheme, hostname,...
+    """
+
+    def __init__(self, value: str) -> None:
+        ZimPath.check_validity(value)
+        self._value = value
+
+    def __eq__(self, __value: object) -> bool:
+        return isinstance(__value, ZimPath) and __value.value == self.value
+
+    def __hash__(self) -> int:
+        return self.value.__hash__()
+
+    def __str__(self) -> str:
+        return f"ZimPath({self.value})"
+
+    @property
+    def value(self) -> str:
+        return self._value
+
+    @classmethod
+    def check_validity(cls, value: str) -> None:
+        parts = urlsplit(value)
+
+        if parts.scheme:
+            raise ValueError(f"Unexpected scheme in value: {value} {parts.scheme}")
+
+        if parts.hostname:
+            raise ValueError(f"Unexpected hostname in value: {value} {parts.hostname}")
+
+        if parts.username:
+            raise ValueError(f"Unexpected username in value: {value} {parts.username}")
+
+        if parts.password:
+            raise ValueError(f"Unexpected password in value: {value} {parts.password}")
+
+
+def apply_fuzzy_rules(uri: HttpUrl | str) -> HttpUrl | str:
+    """Apply fuzzy rules on a URL or relative path
+
+    First matching fuzzy rule matching the input value is applied and its result
+    is returned.
+
+    If no fuzzy rule is matching, the input is returned as-is.
+    """
+    value = uri.value if isinstance(uri, HttpUrl) else uri
     for rule in COMPILED_FUZZY_RULES:
-        if match := rule["match"].match(path):
+        if match := rule["match"].match(value):
             return match.expand(rule["replace"])
-    return path
+    return value
+
+
+def normalize(url: HttpUrl) -> ZimPath:
+    """Transform a HTTP URL into a ZIM path to use as a entry's key.
 
+    According to RFC 3986, a URL allows only a very limited set of characters, so we
+    assume by default that the url is encoded to match this specification.
 
-def normalize(url: str | None) -> str:
-    """Normalize a properly contructed url to a path to use as a entry's key.
+    The transformation rewrites the hostname, the path and the querystring.
 
-    >>> normalize("http://exemple.com/path/to/article?foo=bar")
-    "exemple.com/path/to/article?foo=bar"
-    >>> normalize("http://other.com/path to strange ar+t%3Ficle?foo=bar+baz")
-    "other.com/path to strange ar+t%3Ficle?foo=bar%20baz"
-    >>> normalize("http://youtube.com/youtubei/bar?key=value&videoId=xxxx&otherKey=
-    otherValue")
-    "youtube.fuzzy.replayweb.page/youtubei/bar?videoId=xxxx"
+    The transformation drops the URL scheme, username, password, port and fragment:
+    - we suppose there is no conflict of URL scheme or port: there is no two ressources
+     with same hostname, path and querystring but different URL schemeor port leading
+     to different content
+    - we consider username/password port are purely authentication mechanism which have
+    no impact on the content to server
+    - we know that the fragment is never passed to the server, it stays in the
+    User-Agent, so if we encounter a fragment while normalizing a URL found in a
+    document, it won't make its way to the ZIM anyway and will stay client-side
+
+    The transformation consists mainly in decoding the three components so that ZIM path
+    is not encoded at all, as required by the ZIM specification.
+
+    Decoding is done differently for the hostname (decoded with puny encoding) and the
+    path and querystring (both decoded with url decoding).
+
+    The final transformation is the application of fuzzy rules (sourced from wabac) to
+    transform some URLs into replay URLs and drop some useless stuff.
+
+    Returned value is a ZIM path, without any puny/url encoding applied, ready to be
+    passed to python-libzim for UTF-8 encoding.
     """
 
-    if not url:
-        return url  # pyright: ignore[reportGeneralTypeIssues, reportReturnType]
+    url_parts = urlsplit(url.value)
+
+    hostname = url_parts.hostname
+
+    if hostname and hostname not in known_bad_hostnames:
+        try:
+            # try to decode the hostname
+            hostname = idna.decode(hostname)
+        except idna.IDNAError as exc:
+            # exception might happen if illegal character are found in hostname like the
+            # `_` in `host_ip` ; we keep the set of bad hostname in memory to not fill
+            # the logs with these warnings
+            logger.warning(f"Bad hostname found, kept as-is: {hostname}", exc_info=exc)
+            known_bad_hostnames.add(hostname)
 
-    url_parts = urlsplit(url)
-    url_parts = url_parts._replace(scheme="")
+    path = url_parts.path
 
-    # Remove the netloc (by moving it into path)
-    if url_parts.netloc:
-        new_path = url_parts.netloc + url_parts.path
-        url_parts = url_parts._replace(netloc="", path=new_path)
-    if url_parts.path and url_parts.path[0] == "/":
-        url_parts = url_parts._replace(path=url_parts.path[1:])
+    if path:
+        # unquote the path so that it is stored unencoded in the ZIM as required by ZIM
+        # specification
+        path = unquote(path)
+    else:
+        # if path is empty, we need a "/" to remove ambiguities, e.g. https://example.com
+        # and https://example.com/ must all lead to the same ZIM entry to match RFC 3986
+        # section 6.2.3 : https://www.rfc-editor.org/rfc/rfc3986#section-6.2.3
+        path = "/"
 
-    path = urlunsplit(url_parts)
-    path = reduce(path)
+    query = url_parts.query
 
-    return path
+    # if query is missing, we do not add it at all, not even a trailing ? without
+    # anything after it
+    if url_parts.query:
+        # `+`` in query parameter must be decoded as space first to remove ambiguities
+        # between a space (encoded as `+` in url query parameter) and a real plus sign
+        # (encoded as %2B but soon decoded in ZIM path)
+        query = query.replace("+", " ")
+        # unquote the query so that it is stored unencoded in the ZIM as required by ZIM
+        # specification
+        query = "?" + unquote(query)
+    else:
+        query = ""
+
+    fuzzified_url = apply_fuzzy_rules(f"{hostname}{path}{query}")
+
+    if not isinstance(fuzzified_url, str):
+        raise ValueError("Inappropriate value returned, should be str")
+
+    return ZimPath(fuzzified_url)
 
 
 def get_without_fragment(url: str) -> str:
@@ -152,46 +289,69 @@ def get_without_fragment(url: str) -> str:
 class ArticleUrlRewriter:
     """Rewrite urls in article."""
 
-    def __init__(self, article_url: str, known_urls: set[str]):
+    def __init__(self, article_url: HttpUrl, existing_zim_paths: set[ZimPath]):
+        self.article_path = normalize(article_url)
         self.article_url = article_url
-        self.known_urls = known_urls
-        self.base_path = f"/{urlsplit(normalize(article_url)).path}"
-        if self.base_path[-1] != "/":
-            # We want a directory
-            self.base_path = posixpath.dirname(self.base_path)
+        self.existing_zim_paths = existing_zim_paths
+        self.missing_zim_paths: set[ZimPath] = set()
 
-    def __call__(self, url: str, *, rewrite_all_url: bool = True) -> str:
+    def __call__(self, item_url: str, *, rewrite_all_url: bool = True) -> str:
         """Rewrite a url contained in a article.
 
         The url is "fully" rewrited to point to a normalized entry path
         """
 
-        url = url.strip()
+        item_url = item_url.strip()
 
-        if url.startswith("data:") or url.startswith("blob:"):
-            return url
+        item_scheme = urlsplit(item_url).scheme
+        if item_scheme and item_scheme not in ("http", "https"):
+            return item_url
 
-        absolute_url = urljoin(self.article_url, url)
+        item_absolute_url = urljoin(self.article_url.value, item_url)
+        item_fragment = urlsplit(item_absolute_url).fragment
 
-        normalized_url = normalize(absolute_url)
+        item_path = normalize(HttpUrl(item_absolute_url))
 
-        if rewrite_all_url or get_without_fragment(normalized_url) in self.known_urls:
-            return self.from_normalized(normalized_url)
+        if rewrite_all_url or item_path in self.existing_zim_paths:
+            return self.get_document_uri(item_path, item_fragment)
         else:
-            logger.debug(f"WARNING {normalized_url} ({url}) not in archive.")
+            if item_path not in self.missing_zim_paths:
+                logger.debug(f"WARNING {item_path} ({item_url}) not in archive.")
+                # maintain a collection of missing Zim Path to not fill the logs with
+                # duplicate messages
+                self.missing_zim_paths.add(item_path)
             # The url doesn't point to a known entry
-            return url
+            return item_absolute_url
 
-    def from_normalized(self, normalized_url_str: str) -> str:
-        normalized_url = urlsplit(f"/{normalized_url_str}")
+    def get_document_uri(self, item_path: ZimPath, item_fragment: str) -> str:
+        """Given an ZIM item path and its fragment, get the URI to use in document
 
-        # relative_to will lost our potential last '/'
-        slash_ending = normalized_url.path[-1] == "/"
-        relative_path = posixpath.relpath(normalized_url.path, self.base_path)
+        This function transforms the path of a ZIM item we want to adress from current
+        document (HTML / JS / ...) and returns the corresponding URI to use.
 
-        if slash_ending:
+        It computes the relative path based on current document location and escape
+        everything which needs to be to transform the ZIM path into a valid RFC 3986 URI
+
+        It also append a potential trailing item fragment at the end of the resulting
+        URI.
+
+        """
+        item_parts = urlsplit(item_path.value)
+
+        # item_path is both path + querystring, both will be url-encoded in the document
+        # so that readers consider them as a whole and properly pass them to libzim
+        item_url = item_parts.path
+        if item_parts.query:
+            item_url += "?" + item_parts.query
+
+        relative_path = posixpath.relpath(
+            item_url, posixpath.dirname(self.article_path.value)
+        )
+        # relpath removes a potential last '/' in the path, we add it back
+        if item_path.value.endswith("/"):
             relative_path += "/"
-        normalized_url = normalized_url._replace(path=relative_path)
-        normalized_url = urlunsplit(normalized_url)
 
-        return quote(normalized_url, safe="/#")
+        return (
+            f"{quote(relative_path, safe='/')}"
+            f"{'#' + item_fragment if item_fragment else ''}"
+        )
diff --git a/tests/test_css_rewriting.py b/tests/test_css_rewriting.py
index 1375ce9a..6f4f358c 100644
--- a/tests/test_css_rewriting.py
+++ b/tests/test_css_rewriting.py
@@ -3,7 +3,7 @@
 import pytest
 
 from warc2zim.content_rewriting.css import CssRewriter
-from warc2zim.url_rewriting import ArticleUrlRewriter
+from warc2zim.url_rewriting import ArticleUrlRewriter, HttpUrl
 
 from .utils import ContentForTests
 
@@ -19,15 +19,15 @@
         ),
         ContentForTests(
             b"p { width= } div { background: url(http://exemple.com/img.png)}",
-            b"p { width= } div { background: url(exemple.com/img.png)}",
+            b"p { width= } div { background: url(../exemple.com/img.png)}",
         ),
         ContentForTests(
             b"p { width= } div { background: url('http://exemple.com/img.png')}",
-            b'p { width= } div { background: url("exemple.com/img.png")}',
+            b'p { width= } div { background: url("../exemple.com/img.png")}',
         ),
         ContentForTests(
             b'p { width= } div { background: url("http://exemple.com/img.png")}',
-            b'p { width= } div { background: url("exemple.com/img.png")}',
+            b'p { width= } div { background: url("../exemple.com/img.png")}',
         ),
     ]
 )
@@ -37,9 +37,11 @@ def no_rewrite_content(request):
 
 def test_no_rewrite(no_rewrite_content):
     assert (
-        CssRewriter(ArticleUrlRewriter(no_rewrite_content.article_url, set())).rewrite(
-            no_rewrite_content.input_bytes
-        )
+        CssRewriter(
+            ArticleUrlRewriter(
+                HttpUrl(f"http://{no_rewrite_content.article_url}"), set()
+            )
+        ).rewrite(no_rewrite_content.input_bytes)
         == no_rewrite_content.expected_bytes.decode()
     )
 
@@ -53,7 +55,7 @@ def test_no_rewrite(no_rewrite_content):
         ContentForTests("border-bottom-width: 1px;border-bottom-color: #c0c0c0;w"),
         ContentForTests(
             'background: url("http://exemple.com/foo.png"); width=',
-            'background: url("exemple.com/foo.png"); width=',
+            'background: url("../exemple.com/foo.png"); width=',
         ),
     ]
 )
@@ -64,7 +66,9 @@ def invalid_content_inline(request):
 def test_invalid_css_inline(invalid_content_inline):
     assert (
         CssRewriter(
-            ArticleUrlRewriter(invalid_content_inline.article_url, set())
+            ArticleUrlRewriter(
+                HttpUrl(f"http://{invalid_content_inline.article_url}"), set()
+            )
         ).rewrite_inline(invalid_content_inline.input_str)
         == invalid_content_inline.expected_str
     )
@@ -83,7 +87,7 @@ def test_invalid_css_inline(invalid_content_inline):
         ),
         ContentForTests(
             b'p { background: url("http://exemple.com/foo.png"); width= }',
-            b'p { background: url("exemple.com/foo.png"); width= }',
+            b'p { background: url("../exemple.com/foo.png"); width= }',
         ),
     ]
 )
@@ -93,9 +97,9 @@ def invalid_content(request):
 
 def test_invalid_cssl(invalid_content):
     assert (
-        CssRewriter(ArticleUrlRewriter(invalid_content.article_url, set())).rewrite(
-            invalid_content.input_bytes
-        )
+        CssRewriter(
+            ArticleUrlRewriter(HttpUrl(f"http://{invalid_content.article_url}"), set())
+        ).rewrite(invalid_content.input_bytes)
         == invalid_content.expected_bytes.decode()
     )
 
@@ -123,7 +127,7 @@ def test_rewrite():
 
     expected = """
     /* A comment with a link : http://foo.com */
-    @import url(../fonts.googleapis.com/icon%3Ffamily%3DMaterial%2BIcons);
+    @import url(../fonts.googleapis.com/icon%3Ffamily%3DMaterial%20Icons);
 
     p, input {
         color: rbg(1, 2, 3);
@@ -143,6 +147,8 @@ def test_rewrite():
     expected = dedent(expected)
 
     assert (
-        CssRewriter(ArticleUrlRewriter("kiwix.org/article", set())).rewrite(content)
+        CssRewriter(
+            ArticleUrlRewriter(HttpUrl("http://kiwix.org/article"), set())
+        ).rewrite(content)
         == expected
     )
diff --git a/tests/test_fuzzy_rules.py b/tests/test_fuzzy_rules.py
index 3fe12bcb..66d3750f 100644
--- a/tests/test_fuzzy_rules.py
+++ b/tests/test_fuzzy_rules.py
@@ -1,6 +1,6 @@
 import pytest
 
-from warc2zim.url_rewriting import reduce
+from warc2zim.url_rewriting import apply_fuzzy_rules
 
 from .utils import ContentForTests
 
@@ -34,7 +34,10 @@ def google_videos_case(request):
 
 
 def test_fuzzyrules_google_videos(google_videos_case):
-    assert reduce(google_videos_case.input_str) == google_videos_case.expected_str
+    assert (
+        apply_fuzzy_rules(google_videos_case.input_str)
+        == google_videos_case.expected_str
+    )
 
 
 @pytest.fixture(
@@ -79,7 +82,8 @@ def google_video_info_case(request):
 
 def test_fuzzyrules_google_video_infos(google_video_info_case):
     assert (
-        reduce(google_video_info_case.input_str) == google_video_info_case.expected_str
+        apply_fuzzy_rules(google_video_info_case.input_str)
+        == google_video_info_case.expected_str
     )
 
 
@@ -112,7 +116,7 @@ def trim_digits_only_query_case(request):
 
 def test_fuzzyrules_trim_digits_only_query(trim_digits_only_query_case):
     assert (
-        reduce(trim_digits_only_query_case.input_str)
+        apply_fuzzy_rules(trim_digits_only_query_case.input_str)
         == trim_digits_only_query_case.expected_str
     )
 
@@ -165,7 +169,7 @@ def youtubei_case(request):
 
 
 def test_fuzzyrules_youtubei(youtubei_case):
-    assert reduce(youtubei_case.input_str) == youtubei_case.expected_str
+    assert apply_fuzzy_rules(youtubei_case.input_str) == youtubei_case.expected_str
 
 
 @pytest.fixture(
@@ -205,7 +209,10 @@ def youtube_embed_case(request):
 
 
 def test_fuzzyrules_youtube_embed(youtube_embed_case):
-    assert reduce(youtube_embed_case.input_str) == youtube_embed_case.expected_str
+    assert (
+        apply_fuzzy_rules(youtube_embed_case.input_str)
+        == youtube_embed_case.expected_str
+    )
 
 
 @pytest.fixture(
@@ -254,7 +261,7 @@ def vimeo_cdn_case(request):
 
 
 def test_fuzzyrules_vimeo_cdn(vimeo_cdn_case):
-    assert reduce(vimeo_cdn_case.input_str) == vimeo_cdn_case.expected_str
+    assert apply_fuzzy_rules(vimeo_cdn_case.input_str) == vimeo_cdn_case.expected_str
 
 
 @pytest.fixture(
@@ -283,4 +290,4 @@ def vimeo_host_case(request):
 
 
 def test_fuzzyrules_vimeo_host(vimeo_host_case):
-    assert reduce(vimeo_host_case.input_str) == vimeo_host_case.expected_str
+    assert apply_fuzzy_rules(vimeo_host_case.input_str) == vimeo_host_case.expected_str
diff --git a/tests/test_html_rewriting.py b/tests/test_html_rewriting.py
index fd56d912..cfac82c7 100644
--- a/tests/test_html_rewriting.py
+++ b/tests/test_html_rewriting.py
@@ -3,7 +3,7 @@
 import pytest
 
 from warc2zim.content_rewriting.html import HtmlRewriter
-from warc2zim.url_rewriting import ArticleUrlRewriter
+from warc2zim.url_rewriting import ArticleUrlRewriter, HttpUrl, ZimPath
 
 from .utils import ContentForTests
 
@@ -44,7 +44,13 @@ def no_rewrite_content(request):
 
 def test_no_rewrite(no_rewrite_content):
     assert (
-        HtmlRewriter(ArticleUrlRewriter(no_rewrite_content.article_url, set()), "", "")
+        HtmlRewriter(
+            ArticleUrlRewriter(
+                HttpUrl(f"http://{no_rewrite_content.article_url}"), set()
+            ),
+            "",
+            "",
+        )
         .rewrite(no_rewrite_content.input_str)
         .content
         == no_rewrite_content.expected_str
@@ -101,7 +107,11 @@ def escaped_content(request):
 
 def test_escaped_content(escaped_content):
     transformed = (
-        HtmlRewriter(ArticleUrlRewriter(escaped_content.article_url, set()), "", "")
+        HtmlRewriter(
+            ArticleUrlRewriter(HttpUrl(f"http://{escaped_content.article_url}"), set()),
+            "",
+            "",
+        )
         .rewrite(escaped_content.input_str)
         .content
     )
@@ -121,12 +131,12 @@ def long_path_replace_test_content(input_: str, rewriten_url: str, article_url:
         # Normalized path is "exemple.com/a/long/path"
         lprtc(
             '<a href="http://exemple.com/a/long/path">A link to rewrite</a>',
-            "exemple.com/a/long/path",
+            "a/long/path",
             "exemple.com",
         ),
         lprtc(
             '<a href="http://exemple.com/a/long/path">A link to rewrite</a>',
-            "exemple.com/a/long/path",
+            "../exemple.com/a/long/path",
             "kiwix.org",
         ),
         lprtc(
@@ -188,7 +198,10 @@ def rewrite_url(request):
 def test_rewrite(rewrite_url):
     assert (
         HtmlRewriter(
-            ArticleUrlRewriter(rewrite_url.article_url, {"exemple.com/a/long/path"}),
+            ArticleUrlRewriter(
+                HttpUrl(f"http://{rewrite_url.article_url}"),
+                {ZimPath("exemple.com/a/long/path")},
+            ),
             "",
             "",
         )
@@ -222,7 +235,11 @@ def test_extract_title():
 
 
 def test_rewrite_attributes():
-    rewriter = HtmlRewriter(ArticleUrlRewriter("kiwix.org/", {"kiwix.org/foo"}), "", "")
+    rewriter = HtmlRewriter(
+        ArticleUrlRewriter(HttpUrl("http://kiwix.org/"), {ZimPath("kiwix.org/foo")}),
+        "",
+        "",
+    )
 
     assert (
         rewriter.rewrite("<a href='https://kiwix.org/foo'>A link</a>").content
@@ -245,7 +262,7 @@ def test_rewrite_attributes():
 
 def test_rewrite_css():
     output = (
-        HtmlRewriter(ArticleUrlRewriter("", set()), "", "")
+        HtmlRewriter(ArticleUrlRewriter(HttpUrl("http://kiwix.org/"), set()), "", "")
         .rewrite(
             "<style>p { /* A comment with a http://link.org/ */ "
             "background: url('some/image.png') ; }</style>",
@@ -268,7 +285,7 @@ def test_head_insert():
 
     content = dedent(content)
 
-    url_rewriter = ArticleUrlRewriter("foo", set())
+    url_rewriter = ArticleUrlRewriter(HttpUrl("http://kiwix.org/"), set())
     assert HtmlRewriter(url_rewriter, "", "").rewrite(content).content == content
 
     assert HtmlRewriter(url_rewriter, "PRE_HEAD_INSERT", "").rewrite(
diff --git a/tests/test_js_rewriting.py b/tests/test_js_rewriting.py
index fc77290f..0c425274 100644
--- a/tests/test_js_rewriting.py
+++ b/tests/test_js_rewriting.py
@@ -1,7 +1,7 @@
 import pytest
 
 from warc2zim.content_rewriting.js import JsRewriter
-from warc2zim.url_rewriting import ArticleUrlRewriter
+from warc2zim.url_rewriting import ArticleUrlRewriter, HttpUrl
 
 from .utils import ContentForTests
 
@@ -218,7 +218,9 @@ def rewrite_import_content(request):
 
 
 def test_import_rewrite(rewrite_import_content):
-    url_rewriter = ArticleUrlRewriter(rewrite_import_content.article_url, set())
+    url_rewriter = ArticleUrlRewriter(
+        HttpUrl(rewrite_import_content.article_url), set()
+    )
     assert (
         JsRewriter(url_rewriter).rewrite(rewrite_import_content.input_str)
         == rewrite_import_content.expected_str
diff --git a/tests/test_url_rewriting.py b/tests/test_url_rewriting.py
index 3fe17b48..e4633e2b 100644
--- a/tests/test_url_rewriting.py
+++ b/tests/test_url_rewriting.py
@@ -1,103 +1,384 @@
-from urllib.parse import urljoin
-
 import pytest
 
-from warc2zim.url_rewriting import ArticleUrlRewriter
+from warc2zim.url_rewriting import ArticleUrlRewriter, HttpUrl, ZimPath
 
 
-@pytest.fixture(
-    params=[
-        "https://kiwix.org/a/article/path",
-        "https://kiwix.org/a/article/path/",
-        "https://kiwix.org/a/path",
-        "https://kiwix.org/a/path/",
-    ]
+@pytest.mark.parametrize(
+    "article_url, original_content_url, expected_rewriten_content_url, know_paths, "
+    "rewrite_all_url",
+    [
+        (
+            "https://kiwix.org/a/article/document.html",
+            "foo.html",
+            "foo.html",
+            ["kiwix.org/a/article/foo.html"],
+            False,
+        ),
+        (
+            "https://kiwix.org/a/article/document.html",
+            "foo.html#anchor1",
+            "foo.html#anchor1",
+            ["kiwix.org/a/article/foo.html"],
+            False,
+        ),
+        (
+            "https://kiwix.org/a/article/document.html",
+            "foo.html?foo=bar",
+            "foo.html%3Ffoo%3Dbar",
+            ["kiwix.org/a/article/foo.html?foo=bar"],
+            False,
+        ),
+        (
+            "https://kiwix.org/a/article/document.html",
+            "foo.html?foo=b%24ar",
+            "foo.html%3Ffoo%3Db%24ar",
+            ["kiwix.org/a/article/foo.html?foo=b$ar"],
+            False,
+        ),
+        (
+            "https://kiwix.org/a/article/document.html",
+            "foo.html?foo=b%3Far",  # a query string with an encoded ? char in value
+            "foo.html%3Ffoo%3Db%3Far",
+            ["kiwix.org/a/article/foo.html?foo=b?ar"],
+            False,
+        ),
+        (
+            "https://kiwix.org/a/article/document.html",
+            "fo%o.html",
+            "fo%25o.html",
+            ["kiwix.org/a/article/fo%o.html"],
+            False,
+        ),
+        (
+            "https://kiwix.org/a/article/document.html",
+            "foé.html",  # URL not matching RFC 3986 (many HTML documents are invalid)
+            "fo%C3%A9.html",  # character is encoded so that URL match RFC 3986
+            ["kiwix.org/a/article/foé.html"],  # but ZIM path is non-encoded
+            False,
+        ),
+        (
+            "https://kiwix.org/a/article/document.html",
+            "./foo.html",
+            "foo.html",
+            ["kiwix.org/a/article/foo.html"],
+            False,
+        ),
+        (
+            "https://kiwix.org/a/article/document.html",
+            "../foo.html",
+            "https://kiwix.org/a/foo.html",  # Full URL since not in known URLs
+            ["kiwix.org/a/article/foo.html"],
+            False,
+        ),
+        (
+            "https://kiwix.org/a/article/document.html",
+            "../foo.html",
+            "../foo.html",  # all URLs rewrite activated
+            ["kiwix.org/a/article/foo.html"],
+            True,
+        ),
+        (
+            "https://kiwix.org/a/article/document.html",
+            "../foo.html",
+            "../foo.html",
+            ["kiwix.org/a/foo.html"],
+            False,
+        ),
+        (
+            "https://kiwix.org/a/article/document.html",
+            "../bar/foo.html",
+            "https://kiwix.org/a/bar/foo.html",  # Full URL since not in known URLs
+            ["kiwix.org/a/article/foo.html"],
+            False,
+        ),
+        (
+            "https://kiwix.org/a/article/document.html",
+            "../bar/foo.html",
+            "../bar/foo.html",  # all URLs rewrite activated
+            ["kiwix.org/a/article/foo.html"],
+            True,
+        ),
+        (
+            "https://kiwix.org/a/article/document.html",
+            "../bar/foo.html",
+            "../bar/foo.html",
+            ["kiwix.org/a/bar/foo.html"],
+            False,
+        ),
+        (  # we cannot go upper than host, so '../' in excess are removed
+            "https://kiwix.org/a/article/document.html",
+            "../../../../../foo.html",
+            "../../foo.html",
+            ["kiwix.org/foo.html"],
+            False,
+        ),
+        (
+            "https://kiwix.org/a/article/document.html",
+            "foo?param=value%2F",
+            "foo%3Fparam%3Dvalue/",
+            ["kiwix.org/a/article/foo?param=value/"],
+            False,
+        ),
+        (
+            "https://kiwix.org/a/article/document.html",
+            "foo?param=value%2Fend",
+            "foo%3Fparam%3Dvalue/end",
+            ["kiwix.org/a/article/foo?param=value/end"],
+            False,
+        ),
+        (
+            "https://kiwix.org/a/article/document.html",
+            "foo/",
+            "foo/",
+            ["kiwix.org/a/article/foo/"],
+            False,
+        ),
+    ],
 )
-def rewriter(request):
-    yield ArticleUrlRewriter(request.param, {"kiwix.org/bar/foo"})
-
-
-def test_relative_url(rewriter):
-    for url in ["foo", "bar/foo", "foo/", "bar/foo/", "../baz"]:
-        assert rewriter(url) == url
-        assert rewriter("./" + url) == url
-        assert urljoin(rewriter.article_url, rewriter(url)) == urljoin(
-            rewriter.article_url, url
-        )
-
-    if rewriter.article_url == "https://kiwix.org/a/path":
-        # While it may seems wrong from a path point of view,
-        # it is valid from a url side when going up to the `<host>/` is "no op".
-        assert rewriter("../../biz") == "../biz"
-    else:
-        assert rewriter("../../biz") == "../../biz"
-        assert rewriter("./../../biz") == "../../biz"
-
-
-def test_absolute_path_url(rewriter):
-    for url in ["/foo", "/foo/bar"]:
-        for input_url in [url, f" {url}", f"{url} ", f" {url} "]:
-            rewriten = rewriter(input_url)
-            # Must produce a relative link.
-            assert not rewriten.startswith("/")
-            # Relative link must be resolved to a absolute url in the same domain than
-            # article_url.
-            assert urljoin(rewriter.article_url, rewriten) == "https://kiwix.org" + url
-
-
-def test_absolute_scheme_url(rewriter):
-    # We will serve our content from serving.com (moving kiwix.org as the first part of
-    # the path).
-    serving_address = rewriter.article_url.replace("kiwix.org", "serving.com/kiwix.org")
-
-    for url in ["//exemple.com/foo", "//exemple.com/foo/bar", "//kiwix.org/baz"]:
-        rewriten = rewriter(url)
-        # Must produce a relative link.
-        assert not rewriten.startswith("/")
-        # Relative link must be resolved to a absolute url in the serving domain with a
-        # path containing article_url's host.
-        assert urljoin(serving_address, rewriten) == "https://serving.com" + url[1:]
+def test_relative_url(
+    article_url,
+    know_paths,
+    original_content_url,
+    expected_rewriten_content_url,
+    rewrite_all_url,
+):
+    article_url = HttpUrl(article_url)
+    rewriter = ArticleUrlRewriter(
+        article_url,
+        {ZimPath(path) for path in know_paths},
+    )
+    assert (
+        rewriter(original_content_url, rewrite_all_url=rewrite_all_url)
+        == expected_rewriten_content_url
+    )
 
 
-def test_absolute_url(rewriter):
-    # We will serve our content from serving.com (moving kiwix.org as the first part of
-    # the path).
-    serving_address = rewriter.article_url.replace("kiwix.org", "serving.com/kiwix.org")
+@pytest.mark.parametrize(
+    "article_url, original_content_url, expected_rewriten_content_url, know_paths, "
+    "rewrite_all_url",
+    [
+        (
+            "https://kiwix.org/a/article/document.html",
+            "/foo.html",
+            "../../foo.html",
+            ["kiwix.org/foo.html"],
+            False,
+        ),
+        (
+            "https://kiwix.org/a/article/document.html",
+            "/bar.html",
+            "https://kiwix.org/bar.html",  # Full URL since not in known URLs
+            ["kiwix.org/foo.html"],
+            False,
+        ),
+        (
+            "https://kiwix.org/a/article/document.html",
+            "/bar.html",
+            "../../bar.html",  # all URLs rewrite activated
+            ["kiwix.org/foo.html"],
+            True,
+        ),
+    ],
+)
+def test_absolute_path_url(
+    article_url,
+    know_paths,
+    original_content_url,
+    expected_rewriten_content_url,
+    rewrite_all_url,
+):
+    article_url = HttpUrl(article_url)
+    rewriter = ArticleUrlRewriter(
+        article_url,
+        {ZimPath(path) for path in know_paths},
+    )
+    assert (
+        rewriter(original_content_url, rewrite_all_url=rewrite_all_url)
+        == expected_rewriten_content_url
+    )
 
-    for url in [
-        "https://exemple.com/foo",
-        "http://exemple.com/foo/bar",
-        "http://kiwix.org/baz",
-    ]:
-        rewriten = rewriter(url)
-        # Must produce a relative link
-        assert not rewriten.startswith("/")
-        # Relative link must be resolved to a absolute url in the serving domain with a
-        # path containing article_url's host.
-        # We don't care about scheme, always use what we are serving from.
-        assert (
-            urljoin(serving_address, rewriten)
-            == "https://serving.com" + url.split(":", 1)[1][1:]
-        )
 
+@pytest.mark.parametrize(
+    "article_url, original_content_url, expected_rewriten_content_url, know_paths, "
+    "rewrite_all_url",
+    [
+        (
+            "https://kiwix.org/a/article/document.html",
+            "//kiwix.org/foo.html",
+            "../../foo.html",
+            ["kiwix.org/foo.html"],
+            False,
+        ),
+        (
+            "https://kiwix.org/a/article/document.html",
+            "//kiwix.org/bar.html",
+            "https://kiwix.org/bar.html",  # Full URL since not in known URLs
+            ["kiwix.org/foo.html"],
+            False,
+        ),
+        (
+            "https://kiwix.org/a/article/document.html",
+            "//kiwix.org/bar.html",
+            "../../bar.html",  # all URLs rewrite activated
+            ["kiwix.org/foo.html"],
+            True,
+        ),
+        (
+            "https://kiwix.org/a/article/document.html",
+            "//acme.com/foo.html",
+            "../../../acme.com/foo.html",
+            ["acme.com/foo.html"],
+            False,
+        ),
+        (
+            "http://kiwix.org/a/article/document.html",
+            "//acme.com/bar.html",
+            "http://acme.com/bar.html",  # Full URL since not in known URLs
+            ["kiwix.org/foo.html"],
+            False,
+        ),
+        (
+            "https://kiwix.org/a/article/document.html",
+            "//acme.com/bar.html",
+            "../../../acme.com/bar.html",  # all URLs rewrite activated
+            ["kiwix.org/foo.html"],
+            True,
+        ),
+        (  # puny-encoded host is transformed into url-encoded value
+            "https://kiwix.org/a/article/document.html",
+            "//xn--exmple-cva.com/a/article/document.html",
+            "../../../ex%C3%A9mple.com/a/article/document.html",
+            ["exémple.com/a/article/document.html"],
+            False,
+        ),
+        (  # host who should be puny-encoded ir transformed into url-encoded value
+            "https://kiwix.org/a/article/document.html",
+            "//exémple.com/a/article/document.html",
+            "../../../ex%C3%A9mple.com/a/article/document.html",
+            ["exémple.com/a/article/document.html"],
+            False,
+        ),
+    ],
+)
+def test_absolute_scheme_url(
+    article_url,
+    know_paths,
+    original_content_url,
+    expected_rewriten_content_url,
+    rewrite_all_url,
+):
+    article_url = HttpUrl(article_url)
+    rewriter = ArticleUrlRewriter(
+        article_url,
+        {ZimPath(path) for path in know_paths},
+    )
+    assert (
+        rewriter(original_content_url, rewrite_all_url=rewrite_all_url)
+        == expected_rewriten_content_url
+    )
 
-def test_no_rewrite_blob_data(rewriter):
-    for url in ["data:0548datacontent", "blob:exemple.com/url"]:
-        assert rewriter(url) == url
 
+@pytest.mark.parametrize(
+    "article_url, original_content_url, expected_rewriten_content_url, know_paths, "
+    "rewrite_all_url",
+    [
+        (
+            "https://kiwix.org/a/article/document.html",
+            "https://foo.org/a/article/document.html",
+            "../../../foo.org/a/article/document.html",
+            ["foo.org/a/article/document.html"],
+            False,
+        ),
+        (
+            "https://kiwix.org/a/article/document.html",
+            "http://foo.org/a/article/document.html",
+            "../../../foo.org/a/article/document.html",
+            ["foo.org/a/article/document.html"],
+            False,
+        ),
+        (
+            "http://kiwix.org/a/article/document.html",
+            "https://foo.org/a/article/document.html",
+            "../../../foo.org/a/article/document.html",
+            ["foo.org/a/article/document.html"],
+            False,
+        ),
+        (
+            "http://kiwix.org/a/article/document.html",
+            "https://user:password@foo.org:8080/a/article/document.html",
+            "../../../foo.org/a/article/document.html",
+            ["foo.org/a/article/document.html"],
+            False,
+        ),
+        (  # Full URL since not in known URLs
+            "https://kiwix.org/a/article/document.html",
+            "https://foo.org/a/article/document.html",
+            "https://foo.org/a/article/document.html",
+            ["kiwix.org/a/article/foo/"],
+            False,
+        ),
+        (  # all URLs rewrite activated
+            "https://kiwix.org/a/article/document.html",
+            "https://foo.org/a/article/document.html",
+            "../../../foo.org/a/article/document.html",
+            ["kiwix.org/a/article/foo/"],
+            True,
+        ),
+        (  # puny-encoded host is transformed into url-encoded value
+            "https://kiwix.org/a/article/document.html",
+            "https://xn--exmple-cva.com/a/article/document.html",
+            "../../../ex%C3%A9mple.com/a/article/document.html",
+            ["exémple.com/a/article/document.html"],
+            False,
+        ),
+        (  # host who should be puny-encoded is transformed into url-encoded value
+            "https://kiwix.org/a/article/document.html",
+            "https://exémple.com/a/article/document.html",
+            "../../../ex%C3%A9mple.com/a/article/document.html",
+            ["exémple.com/a/article/document.html"],
+            False,
+        ),
+    ],
+)
+def test_absolute_url(
+    article_url,
+    know_paths,
+    original_content_url,
+    expected_rewriten_content_url,
+    rewrite_all_url,
+):
+    article_url = HttpUrl(article_url)
+    rewriter = ArticleUrlRewriter(
+        article_url,
+        {ZimPath(path) for path in know_paths},
+    )
+    assert (
+        rewriter(original_content_url, rewrite_all_url=rewrite_all_url)
+        == expected_rewriten_content_url
+    )
 
-def test_no_rewrite_external_link(rewriter):
-    for rewrite_all_url in [True, False]:
-        # We always rewrite "internal" urls
-        assert "kiwix.org" not in rewriter(
-            "https://kiwix.org/bar/foo", rewrite_all_url=rewrite_all_url
-        )
 
-    # External urls are only rewriten if 'rewrite_all_url' is True
-    assert "kiwix.org" not in rewriter(
-        "https://kiwix.org/external/link", rewrite_all_url=True
+@pytest.mark.parametrize(
+    "original_content_url, rewrite_all_url",
+    [
+        ("data:0548datacontent", False),
+        ("blob:exemple.com/url", False),
+        ("mailto:bob@acme.com", False),
+        ("tel:+33.1.12.12.23", False),
+        ("data:0548datacontent", True),
+        ("blob:exemple.com/url", True),
+        ("mailto:bob@acme.com", True),
+        ("tel:+33.1.12.12.23", True),
+    ],
+)
+# other schemes are never rewritten, even when rewrite_all_url is true
+def test_no_rewrite_other_schemes(original_content_url, rewrite_all_url):
+    article_url = HttpUrl("https://kiwix.org/a/article/document.html")
+    rewriter = ArticleUrlRewriter(
+        article_url,
+        set(),
     )
     assert (
-        rewriter("https://kiwix.org/external/link", rewrite_all_url=False)
-        == "https://kiwix.org/external/link"
+        rewriter(original_content_url, rewrite_all_url=rewrite_all_url)
+        == original_content_url
     )
diff --git a/tests/test_warc_to_zim.py b/tests/test_warc_to_zim.py
index a48c4fe1..7855873c 100644
--- a/tests/test_warc_to_zim.py
+++ b/tests/test_warc_to_zim.py
@@ -6,6 +6,7 @@
 import os
 import re
 import time
+from urllib.parse import unquote
 
 import pytest
 import requests
@@ -15,7 +16,7 @@
 from warc2zim.__about__ import __version__
 from warc2zim.converter import iter_warc_records
 from warc2zim.main import main
-from warc2zim.url_rewriting import normalize
+from warc2zim.url_rewriting import HttpUrl, ZimPath, normalize
 from warc2zim.utils import get_record_url
 
 TEST_DATA_DIR = os.path.join(os.path.dirname(os.path.realpath(__file__)), "data")
@@ -27,8 +28,6 @@
     ["example-response.warc"],
     ["example-response.warc", "--progress-file", "progress.json"],
     ["example-response.warc", "--scraper-suffix", SCRAPER_SUFFIX],
-    ["example-resource.warc.gz", "--favicon", "https://example.com/some/favicon.ico"],
-    ["example-resource.warc.gz", "--favicon", "https://www.google.com/favicon.ico"],
     ["example-revisit.warc.gz"],
     [
         "example-revisit.warc.gz",
@@ -154,6 +153,24 @@ def verify_warc_and_zim(self, warcfile, zimfile, verify_scraper_suffix):
 
             url_no_scheme = re.sub(r"\?\d+$", "?", url_no_scheme)
 
+            # remove user/password
+            if "@" in url_no_scheme:
+                at_index = url_no_scheme.index("@")
+                if at_index >= 0:
+                    if "/" in url_no_scheme:
+                        slash_index = url_no_scheme.index("/")
+                        if at_index < slash_index:
+                            url_no_scheme = url_no_scheme[at_index + 1 :]
+                    else:
+                        url_no_scheme = url_no_scheme[at_index + 1 :]
+
+            # remove trailing ?
+            if url_no_scheme.endswith("?"):
+                url_no_scheme = url_no_scheme[:-1]
+
+            # unquote url since everything is not encoded in ZIM
+            url_no_scheme = unquote(url_no_scheme)
+
             # ensure payloads match
             try:
                 payload = zim_fh.get_item(url_no_scheme)
@@ -199,38 +216,97 @@ def rebuild_favicon_bytes(self, zim, favicon_path) -> bytes:
         resize_image(dst, width=48, height=48, method="cover")
         return dst.getvalue()
 
-    def test_normalize(self):
-        assert normalize(None) is None
-        assert normalize("") == ""
-        assert normalize("https://exemple.com") == "exemple.com"
-        assert normalize("https://exemple.com/") == "exemple.com/"
-        assert normalize("http://example.com/?foo=bar") == "example.com/?foo=bar"
-
-        assert normalize("https://example.com/?foo=bar") == "example.com/?foo=bar"
-
-        assert (
-            normalize("https://example.com/some/path/http://example.com/?foo=bar")
-            == "example.com/some/path/http://example.com/?foo=bar"
-        )
-
-        assert (
-            normalize("example.com/some/path/http://example.com/?foo=bar")
-            == "example.com/some/path/http://example.com/?foo=bar"
-        )
-
-        assert (
-            normalize("http://example.com/path/with/final/slash/")
-            == "example.com/path/with/final/slash/"
-        )
-
-        assert normalize("http://test@example.com/") == "test@example.com/"
-
-        assert (
-            normalize(
-                "http://lesfondamentaux.reseau-canope.fr/fileadmin/template/css/main.css?1588230493"
-            )
-            == "lesfondamentaux.reseau-canope.fr/fileadmin/template/css/main.css?"
-        )
+    @pytest.mark.parametrize(
+        "url,zim_path",
+        [
+            ("https://exemple.com", "exemple.com/"),
+            ("https://exemple.com/", "exemple.com/"),
+            ("http://example.com/resource", "example.com/resource"),
+            ("http://example.com/resource/", "example.com/resource/"),
+            (
+                "http://example.com/resource/folder/sub.txt",
+                "example.com/resource/folder/sub.txt",
+            ),
+            (
+                "http://example.com/resource/folder/sub",
+                "example.com/resource/folder/sub",
+            ),
+            (
+                "http://example.com/resource/folder/sub?foo=bar",
+                "example.com/resource/folder/sub?foo=bar",
+            ),
+            (
+                "http://example.com/resource/folder/sub?foo=bar#anchor1",
+                "example.com/resource/folder/sub?foo=bar",
+            ),
+            ("http://example.com/resource/#anchor1", "example.com/resource/"),
+            ("http://example.com/resource/?foo=bar", "example.com/resource/?foo=bar"),
+            ("http://example.com#anchor1", "example.com/"),
+            ("http://example.com?foo=bar#anchor1", "example.com/?foo=bar"),
+            ("http://example.com/?foo=bar", "example.com/?foo=bar"),
+            ("http://example.com/?foo=ba+r", "example.com/?foo=ba r"),
+            (
+                "http://example.com/?foo=ba r",
+                "example.com/?foo=ba r",
+            ),  # situation where the ` ` has not been properly escaped in document
+            ("http://example.com/?foo=ba%2Br", "example.com/?foo=ba+r"),
+            ("http://example.com/?foo=ba+%2B+r", "example.com/?foo=ba + r"),
+            ("http://example.com/#anchor1", "example.com/"),
+            (
+                "http://example.com/some/path/http://example.com//some/path",
+                "example.com/some/path/http://example.com//some/path",
+            ),
+            (
+                "http://example.com/some/pa?th/http://example.com//some/path",
+                "example.com/some/pa?th/http://example.com//some/path",
+            ),
+            (
+                "http://example.com/so?me/pa?th/http://example.com//some/path",
+                "example.com/so?me/pa?th/http://example.com//some/path",
+            ),
+            ("http://example.com/resource?", "example.com/resource"),
+            ("http://example.com/resource#", "example.com/resource"),
+            ("http://user@example.com/resource", "example.com/resource"),
+            ("http://user:password@example.com/resource", "example.com/resource"),
+            ("http://example.com:8080/resource", "example.com/resource"),
+            (
+                "http://foobargooglevideo.com/videoplayback?id=1576&key=value",
+                "youtube.fuzzy.replayweb.page/videoplayback?id=1576",
+            ),  # Fuzzy rule is applied in addition to path transformations
+            ("https://xn--exmple-cva.com", "exémple.com/"),
+            ("https://xn--exmple-cva.com/", "exémple.com/"),
+            ("https://xn--exmple-cva.com/resource", "exémple.com/resource"),
+            ("https://exémple.com/", "exémple.com/"),
+            ("https://exémple.com/resource", "exémple.com/resource"),
+            ("https://host_ip/", "host_ip/"),
+            ("https://host_ip/resource", "host_ip/resource"),
+            ("http://example.com/res%24urce", "example.com/res$urce"),
+            (
+                "http://example.com/resource?foo=b%24r",
+                "example.com/resource?foo=b$r",
+            ),
+            ("http://example.com/resource@300x", "example.com/resource@300x"),
+            ("http://example.com:8080/resource", "example.com/resource"),
+            ("http://user@example.com:8080/resource", "example.com/resource"),
+            ("http://user:password@example.com:8080/resource", "example.com/resource"),
+            # the two URI below are an illustration of a potential collision (two
+            # differents URI leading to the same ZIM path)
+            (
+                "http://tmp.kiwix.org/ci/test-website/images/urlencoding1_ico%CC%82ne-"
+                "de%CC%81buter-Solidarite%CC%81-Nume%CC%81rique_1%40300x.png",
+                "tmp.kiwix.org/ci/test-website/images/urlencoding1_icône-débuter-"
+                "Solidarité-Numérique_1@300x.png",
+            ),
+            (
+                "https://tmp.kiwix.org/ci/test-website/images/urlencoding1_ico%CC%82ne-"
+                "de%CC%81buter-Solidarite%CC%81-Nume%CC%81rique_1@300x.png",
+                "tmp.kiwix.org/ci/test-website/images/urlencoding1_icône-débuter-"
+                "Solidarité-Numérique_1@300x.png",
+            ),
+        ],
+    )
+    def test_normalize(self, url, zim_path):
+        assert normalize(HttpUrl(url)) == ZimPath(zim_path)
 
     def test_warc_to_zim_specify_params_and_metadata(self, tmp_path):
         zim_output = "zim-out-filename.zim"
@@ -299,7 +375,7 @@ def test_warc_to_zim_specify_params_and_metadata(self, tmp_path):
         )
         assert self.get_metadata(zim_output, "Title") == b"Some Title"
 
-    def test_warc_to_zim(self, cmdline, tmp_path):
+    def test_warc_to_zim_main(self, cmdline, tmp_path):
         # intput filename
         filename = cmdline[0]
 
@@ -479,7 +555,7 @@ def test_all_warcs_root_dir(self, tmp_path):
         )
 
         # timestamp fuzzy match from example-with-timestamp.warc
-        assert self.get_article(zim_output, "example.com/path.txt?") != b""
+        assert self.get_article(zim_output, "example.com/path.txt") != b""
 
     def test_fuzzy_urls(self, tmp_path, fuzzycheck):
         zim_output = fuzzycheck["filename"] + ".zim"
@@ -503,11 +579,7 @@ def test_fuzzy_urls(self, tmp_path, fuzzycheck):
 
     def test_error_bad_main_page(self, tmp_path):
         zim_output_not_created = "zim-out-not-created.zim"
-        with pytest.raises(
-            KeyError,
-            match="Unable to find WARC record for main page: no-such-url.example.com/,"
-            " aborting",
-        ):
+        with pytest.raises(KeyError, match="Unable to find WARC record for main page:"):
             main(
                 [
                     "-v",

From 95492e5b1a815f7477294dd96a4cb1319f3f6540 Mon Sep 17 00:00:00 2001
From: benoit74 <benoit74@users.noreply.github.com>
Date: Tue, 26 Mar 2024 10:26:38 +0000
Subject: [PATCH 05/10] Add development directories to .gitignore

---
 .gitignore | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/.gitignore b/.gitignore
index 9901ece4..f6522214 100644
--- a/.gitignore
+++ b/.gitignore
@@ -213,3 +213,7 @@ pyrightconfig.json
 
 # installed at build time
 wombat.js
+
+# temporary directories using during development
+output
+tmp
\ No newline at end of file

From c6d593709279b1c59b8858e00173350af74d33fe Mon Sep 17 00:00:00 2001
From: benoit74 <benoit74@users.noreply.github.com>
Date: Wed, 27 Mar 2024 09:13:06 +0000
Subject: [PATCH 06/10] apply_fuzzy_rules always returns a str indeed

---
 src/warc2zim/url_rewriting.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/src/warc2zim/url_rewriting.py b/src/warc2zim/url_rewriting.py
index f066ef34..e120299a 100644
--- a/src/warc2zim/url_rewriting.py
+++ b/src/warc2zim/url_rewriting.py
@@ -185,7 +185,7 @@ def check_validity(cls, value: str) -> None:
             raise ValueError(f"Unexpected password in value: {value} {parts.password}")
 
 
-def apply_fuzzy_rules(uri: HttpUrl | str) -> HttpUrl | str:
+def apply_fuzzy_rules(uri: HttpUrl | str) -> str:
     """Apply fuzzy rules on a URL or relative path
 
     First matching fuzzy rule matching the input value is applied and its result
@@ -275,9 +275,6 @@ def normalize(url: HttpUrl) -> ZimPath:
 
     fuzzified_url = apply_fuzzy_rules(f"{hostname}{path}{query}")
 
-    if not isinstance(fuzzified_url, str):
-        raise ValueError("Inappropriate value returned, should be str")
-
     return ZimPath(fuzzified_url)
 
 

From eeafb62c460f8a0b49a99d938a316bcbae6c40a7 Mon Sep 17 00:00:00 2001
From: benoit74 <benoit74@users.noreply.github.com>
Date: Wed, 27 Mar 2024 09:13:12 +0000
Subject: [PATCH 07/10] Fix typo

---
 src/warc2zim/url_rewriting.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/warc2zim/url_rewriting.py b/src/warc2zim/url_rewriting.py
index e120299a..e9a9ebb2 100644
--- a/src/warc2zim/url_rewriting.py
+++ b/src/warc2zim/url_rewriting.py
@@ -210,7 +210,7 @@ def normalize(url: HttpUrl) -> ZimPath:
 
     The transformation drops the URL scheme, username, password, port and fragment:
     - we suppose there is no conflict of URL scheme or port: there is no two ressources
-     with same hostname, path and querystring but different URL schemeor port leading
+     with same hostname, path and querystring but different URL scheme or port leading
      to different content
     - we consider username/password port are purely authentication mechanism which have
     no impact on the content to server

From 0b64bd4dd47acc753cdde11438ff4e56c1e4c1f3 Mon Sep 17 00:00:00 2001
From: benoit74 <benoit74@users.noreply.github.com>
Date: Wed, 27 Mar 2024 09:30:51 +0000
Subject: [PATCH 08/10] Use PurePosixPath now that it can walk_up in Python
 3.12

---
 src/warc2zim/url_rewriting.py | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/src/warc2zim/url_rewriting.py b/src/warc2zim/url_rewriting.py
index e9a9ebb2..99564b29 100644
--- a/src/warc2zim/url_rewriting.py
+++ b/src/warc2zim/url_rewriting.py
@@ -44,8 +44,8 @@
 from __future__ import annotations
 
 import logging
-import posixpath
 import re
+from pathlib import PurePosixPath
 from urllib.parse import (
     quote,
     unquote,
@@ -340,11 +340,17 @@ def get_document_uri(self, item_path: ZimPath, item_fragment: str) -> str:
         item_url = item_parts.path
         if item_parts.query:
             item_url += "?" + item_parts.query
-
-        relative_path = posixpath.relpath(
-            item_url, posixpath.dirname(self.article_path.value)
+        relative_path = str(
+            PurePosixPath(item_url).relative_to(
+                (
+                    PurePosixPath(self.article_path.value)
+                    if self.article_path.value.endswith("/")
+                    else PurePosixPath(self.article_path.value).parent
+                ),
+                walk_up=True,
+            )
         )
-        # relpath removes a potential last '/' in the path, we add it back
+        # relative_to removes a potential last '/' in the path, we add it back
         if item_path.value.endswith("/"):
             relative_path += "/"
 

From 18cf34f49f72f6032dfa8bd605be1770e870a9c6 Mon Sep 17 00:00:00 2001
From: benoit74 <benoit74@users.noreply.github.com>
Date: Wed, 27 Mar 2024 09:35:38 +0000
Subject: [PATCH 09/10] Rename variables for clarity

---
 src/warc2zim/converter.py | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/src/warc2zim/converter.py b/src/warc2zim/converter.py
index 9d231999..04908459 100644
--- a/src/warc2zim/converter.py
+++ b/src/warc2zim/converter.py
@@ -132,9 +132,9 @@ def __init__(self, args):
 
         self.custom_css = args.custom_css
 
-        self.indexed_urls = set({})
+        self.added_zim_items: set[ZimPath] = set()
         self.revisits: dict[ZimPath, ZimPath] = {}
-        self.existing_zim_paths: set[ZimPath] = set()
+        self.expected_zim_items: set[ZimPath] = set()
 
         # progress file handling
         self.stats_filename = (
@@ -272,7 +272,7 @@ def run(self):
 
         # process revisits
         for normalized_url, target_url in self.revisits.items():
-            if normalized_url not in self.indexed_urls:
+            if normalized_url not in self.added_zim_items:
                 logger.debug(f"Adding alias {normalized_url} -> {target_url}")
                 try:
                     self.creator.add_alias(
@@ -281,7 +281,7 @@ def run(self):
                 except RuntimeError as exc:
                     if not ALIAS_EXC_STR.match(str(exc)):
                         raise exc
-                self.indexed_urls.add(normalized_url)
+                self.added_zim_items.add(normalized_url)
 
         logger.debug(f"Found {self.total_records} records in WARCs")
 
@@ -306,7 +306,7 @@ def gather_information_from_warc(self):
             url = get_record_url(record)
             zim_path = normalize(HttpUrl(url))
 
-            self.existing_zim_paths.add(zim_path)
+            self.expected_zim_items.add(zim_path)
 
             if main_page_found:
                 continue
@@ -525,7 +525,7 @@ def add_items_for_warc_record(self, record):
             logger.debug(f"Skipping record with empty WARC-Target-URI {record}")
             return
 
-        normalized_url = normalize(HttpUrl(url))
+        item_zim_path = normalize(HttpUrl(url))
 
         # if include_domains is set, only include urls from those domains
         if self.include_domains:
@@ -536,7 +536,7 @@ def add_items_for_warc_record(self, record):
                 logger.debug(f"Skipping url {url}, outside included domains")
                 return
 
-        if normalized_url in self.indexed_urls:
+        if item_zim_path in self.added_zim_items:
             logger.debug(f"Skipping duplicate {url}, already added to ZIM")
             return
 
@@ -546,11 +546,11 @@ def add_items_for_warc_record(self, record):
                 return
 
             payload_item = WARCPayloadItem(
-                normalized_url.value,
+                item_zim_path.value,
                 record,
                 self.head_template,
                 self.css_insert,
-                self.existing_zim_paths,
+                self.expected_zim_items,
             )
 
             if len(payload_item.content) != 0:
@@ -562,14 +562,14 @@ def add_items_for_warc_record(self, record):
                 self.total_records += 1
                 self.update_stats()
 
-            self.indexed_urls.add(normalized_url)
+            self.added_zim_items.add(item_zim_path)
 
         elif (
             record.rec_type == "revisit"
             and record.rec_headers["WARC-Refers-To-Target-URI"] != url
-            and normalized_url not in self.revisits
+            and item_zim_path not in self.revisits
         ):  # pragma: no branch
-            self.revisits[normalized_url] = normalize(
+            self.revisits[item_zim_path] = normalize(
                 HttpUrl(record.rec_headers["WARC-Refers-To-Target-URI"])
             )
 

From 83438d6f9ff71ec85d2ab72c4fb283c518916dd5 Mon Sep 17 00:00:00 2001
From: benoit74 <benoit74@users.noreply.github.com>
Date: Wed, 27 Mar 2024 14:13:19 +0000
Subject: [PATCH 10/10] Remove extra Youtube rule which is not needed anymore

This rule was needed most probably only because of a trailing ? in some
URLs
---
 src/warc2zim/url_rewriting.py | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/src/warc2zim/url_rewriting.py b/src/warc2zim/url_rewriting.py
index 99564b29..1adacd01 100644
--- a/src/warc2zim/url_rewriting.py
+++ b/src/warc2zim/url_rewriting.py
@@ -81,13 +81,6 @@
         "pattern": r"(?:www\.)?youtube(?:-nocookie)?\.com/embed/([^?]+).*",
         "replace": r"youtube.fuzzy.replayweb.page/embed/\1",
     },
-    # This is an extra rule for JS rewriting only.
-    # Youtube replayer may add thing to an already rewrited url. We have to clean it
-    # again
-    {
-        "pattern": r"youtube.fuzzy.replayweb.page/embed/([^?&]+).*",
-        "replace": r"youtube.fuzzy.replayweb.page/embed/\1",
-    },
     {
         "pattern": r".*(?:gcs-vimeo|vod|vod-progressive)\.akamaized\.net.*?/([\d/]+"
         r".mp4)$",