Merge pull request #218 from openzim/url_handling

Revisit handling of special characters in ZIM / HTML URLs
openzim · Apr 5, 2024 · 7809a6d · 7809a6d
2 parents beeb5d2 + 83438d6
commit 7809a6d
Show file tree

Hide file tree

Showing 15 changed files with 880 additions and 308 deletions.
diff --git a/.gitignore b/.gitignore
@@ -213,3 +213,7 @@ pyrightconfig.json
 
 # installed at build time
 wombat.js
+
+# temporary directories using during development
+output
+tmp
diff --git a/src/warc2zim/content_rewriting/generic.py b/src/warc2zim/content_rewriting/generic.py
@@ -11,7 +11,7 @@
 from warc2zim.content_rewriting.html import HtmlRewriter
 from warc2zim.content_rewriting.js import JsRewriter
 from warc2zim.content_rewriting.rx_replacer import RxRewriter
-from warc2zim.url_rewriting import ArticleUrlRewriter
+from warc2zim.url_rewriting import ArticleUrlRewriter, HttpUrl, ZimPath
 from warc2zim.utils import (
     get_record_content,
     get_record_encoding,
@@ -59,7 +59,7 @@ def __init__(
         self,
         path: str,
         record: ArcWarcRecord,
-        known_urls: set[str],
+        existing_zim_paths: set[ZimPath],
     ):
         self.content = get_record_content(record)
 
@@ -69,7 +69,9 @@ def __init__(
 
         self.path = path
         self.orig_url_str = get_record_url(record)
-        self.url_rewriter = ArticleUrlRewriter(self.orig_url_str, known_urls)
+        self.url_rewriter = ArticleUrlRewriter(
+            HttpUrl(self.orig_url_str), existing_zim_paths
+        )
 
         self.rewrite_mode = self.get_rewrite_mode(record, mimetype)
 
@@ -133,7 +135,9 @@ def get_rewrite_mode(self, record, mimetype):
     def rewrite_html(self, head_template: Template, css_insert: str | None):
         orig_url = urlsplit(self.orig_url_str)
 
-        rel_static_prefix = self.url_rewriter.from_normalized("_zim_static/")
+        rel_static_prefix = self.url_rewriter.get_document_uri(
+            ZimPath("_zim_static/"), ""
+        )
         head_insert = head_template.render(
             path=self.path,
             static_prefix=rel_static_prefix,

diff --git a/src/warc2zim/content_rewriting/html.py b/src/warc2zim/content_rewriting/html.py
@@ -134,12 +134,12 @@ def handle_data(self, data: str):
         elif self._active_tag == "style":
             data = self.css_rewriter.rewrite(data)
         elif self._active_tag == "script":
-            rules = get_ds_rules(self.url_rewriter.article_url)
+            rules = get_ds_rules(self.url_rewriter.article_url.value)
             if data.strip():
                 data = JsRewriter(self.url_rewriter, rules).rewrite(data)
         elif self._active_tag == "json":
             if data.strip():
-                rules = get_ds_rules(self.url_rewriter.article_url)
+                rules = get_ds_rules(self.url_rewriter.article_url.value)
                 if rules:
                     data = RxRewriter(rules).rewrite(data, {})
         self.send(data)

diff --git a/src/warc2zim/content_rewriting/js.py b/src/warc2zim/content_rewriting/js.py
@@ -12,6 +12,7 @@
     replace,
     replace_prefix_from,
 )
+from warc2zim.url_rewriting import ZimPath
 
 # Regex used to check if we ar import or exporting things in the string
 # ie : If we are a module
@@ -236,8 +237,8 @@ def _get_module_decl(self, local_decls: Iterable[str]) -> str:
 
         This will be added to script only if the script is a module script.
         """
-        wb_module_decl_url = self.url_rewriter.from_normalized(
-            "_zim_static/__wb_module_decl.js"
+        wb_module_decl_url = self.url_rewriter.get_document_uri(
+            ZimPath("_zim_static/__wb_module_decl.js"), ""
         )
         return (
             f"""import {{ {", ".join(local_decls)} }} from "{wb_module_decl_url}";\n"""

diff --git a/src/warc2zim/converter.py b/src/warc2zim/converter.py
@@ -23,7 +23,7 @@
 import time
 from http import HTTPStatus
 from pathlib import Path
-from urllib.parse import urldefrag, urljoin, urlsplit, urlunsplit
+from urllib.parse import urljoin, urlsplit, urlunsplit
 
 import requests
 
@@ -46,7 +46,7 @@
 
 from warc2zim.constants import logger
 from warc2zim.items import StaticArticle, WARCPayloadItem
-from warc2zim.url_rewriting import FUZZY_RULES, normalize
+from warc2zim.url_rewriting import FUZZY_RULES, HttpUrl, ZimPath, normalize
 from warc2zim.utils import (
     get_record_content,
     get_record_mime_type,
@@ -88,14 +88,15 @@ def __init__(self, args):
             for handler in logger.handlers:
                 handler.setLevel(logging.DEBUG)
 
-        main_url: str = args.url
+        main_url: str | None = str(args.url) if args.url else None
         # ensure trailing slash is added if missing
-        parts = urlsplit(main_url)
-        if parts.path == "":
-            parts = list(parts)
-            # set path
-            parts[2] = "/"
-            main_url = urlunsplit(parts)
+        if main_url:
+            parts = urlsplit(main_url)
+            if parts.path == "":
+                parts = list(parts)
+                # set path
+                parts[2] = "/"
+                main_url = urlunsplit(parts)
 
         self.name = args.name
         self.title = args.title
@@ -107,10 +108,10 @@ def __init__(self, args):
         self.creator_metadata = args.creator
         self.publisher = args.publisher
         self.tags = DEFAULT_TAGS + (args.tags or [])
-        self.source: str = args.source or main_url
+        self.source: str | None = str(args.source) if args.source else None or main_url
         self.scraper = "warc2zim " + get_version()
         self.illustration = b""
-        self.main_path = normalize(main_url)
+        self.main_path = normalize(HttpUrl(main_url)) if main_url else None
 
         self.output = Path(args.output)
         self.zim_file = args.zim_file
@@ -131,9 +132,9 @@ def __init__(self, args):
 
         self.custom_css = args.custom_css
 
-        self.indexed_urls = set({})
-        self.revisits = {}
-        self.warc_urls = set({})
+        self.added_zim_items: set[ZimPath] = set()
+        self.revisits: dict[ZimPath, ZimPath] = {}
+        self.expected_zim_items: set[ZimPath] = set()
 
         # progress file handling
         self.stats_filename = (
@@ -202,6 +203,8 @@ def run(self):
             return 100
 
         self.gather_information_from_warc()
+        if not self.main_path:
+            raise ValueError("Unable to find main path, aborting")
         self.title = self.title or "Untitled"
         if len(self.title) > RECOMMENDED_MAX_TITLE_LENGTH:
             self.title = f"{self.title[0:29]}…"
@@ -229,7 +232,7 @@ def run(self):
 
         self.creator = Creator(
             self.full_filename,
-            main_path=self.main_path,
+            main_path=self.main_path.value,
         )
 
         self.creator.config_metadata(
@@ -250,7 +253,7 @@ def run(self):
         for filename in importlib.resources.files("warc2zim.statics").iterdir():
             with importlib.resources.as_file(filename) as file:
                 self.creator.add_item(
-                    StaticArticle(filename=file, main_path=self.main_path)
+                    StaticArticle(filename=file, main_path=self.main_path.value)
                 )
 
         # Add wombat_setup.js
@@ -269,14 +272,16 @@ def run(self):
 
         # process revisits
         for normalized_url, target_url in self.revisits.items():
-            if normalized_url not in self.indexed_urls:
+            if normalized_url not in self.added_zim_items:
                 logger.debug(f"Adding alias {normalized_url} -> {target_url}")
                 try:
-                    self.creator.add_alias(normalized_url, "", target_url, {})
+                    self.creator.add_alias(
+                        normalized_url.value, "", target_url.value, {}
+                    )
                 except RuntimeError as exc:
                     if not ALIAS_EXC_STR.match(str(exc)):
                         raise exc
-                self.indexed_urls.add(normalized_url)
+                self.added_zim_items.add(normalized_url)
 
         logger.debug(f"Found {self.total_records} records in WARCs")
 
@@ -292,10 +297,16 @@ def iter_all_warc_records(self):
     def gather_information_from_warc(self):
         main_page_found = False
         for record in iter_warc_records(self.inputs):
+
+            # only response records can be considered as main_path and as existing ZIM
+            # path
+            if record.rec_type not in ("response", "revisit"):
+                continue
+
             url = get_record_url(record)
-            normalized_url = normalize(url)
+            zim_path = normalize(HttpUrl(url))
 
-            self.warc_urls.add(normalized_url)
+            self.expected_zim_items.add(zim_path)
 
             if main_page_found:
                 continue
@@ -316,9 +327,9 @@ def gather_information_from_warc(self):
                     or record.http_headers.get_statuscode() == "200"
                 )
             ):
-                self.main_path = normalized_url
+                self.main_path = zim_path
 
-            if urldefrag(self.main_path).url != normalized_url:
+            if self.main_path != zim_path:
                 continue
 
             # if we get here, found record for the main page
@@ -332,14 +343,16 @@ def gather_information_from_warc(self):
                 ]:
                     original_path = self.main_path
                     self.main_path = normalize(
-                        urljoin(
-                            get_record_url(record),
-                            record.http_headers.get_header("Location").strip(),
+                        HttpUrl(
+                            urljoin(
+                                get_record_url(record),
+                                record.http_headers.get_header("Location").strip(),
+                            )
                         )
                     )
                     logger.warning(
                         f"HTTP {status_code} occurred on main page; "
-                        f"replacing {original_path} with '{self.main_path}'"
+                        f"replacing {original_path} with {self.main_path}"
                     )
                     continue
 
@@ -393,9 +406,11 @@ def find_icon_and_language(self, record, content):
 
             # transform icon URL into WARC path
             self.favicon_path = normalize(
-                urljoin(
-                    get_record_url(record),
-                    icon_url,
+                HttpUrl(
+                    urljoin(
+                        get_record_url(record),
+                        icon_url,
+                    )
                 )
             )
 
@@ -437,7 +452,7 @@ def retrieve_illustration(self):
                 if record.rec_type != "response":
                     continue
                 url = get_record_url(record)
-                path = normalize(url)
+                path = normalize(HttpUrl(url))
                 if path == self.favicon_path or url == self.favicon_url:
                     logger.debug("Found WARC record for favicon")
                     if (
@@ -497,18 +512,20 @@ def is_self_redirect(self, record, url):
             return False
 
         location = record.http_headers.get("Location", "")
-        return normalize(url) == normalize(location)
+        location = urljoin(url, location)
+        return normalize(HttpUrl(url)) == normalize(HttpUrl(location))
 
     def add_items_for_warc_record(self, record):
+
+        if record.rec_type not in ("response", "revisit"):
+            return
+
         url = get_record_url(record)
-        normalized_url = normalize(url)
         if not url:
             logger.debug(f"Skipping record with empty WARC-Target-URI {record}")
             return
 
-        if normalized_url in self.indexed_urls:
-            logger.debug(f"Skipping duplicate {url}, already added to ZIM")
-            return
+        item_zim_path = normalize(HttpUrl(url))
 
         # if include_domains is set, only include urls from those domains
         if self.include_domains:
@@ -519,17 +536,21 @@ def add_items_for_warc_record(self, record):
                 logger.debug(f"Skipping url {url}, outside included domains")
                 return
 
+        if item_zim_path in self.added_zim_items:
+            logger.debug(f"Skipping duplicate {url}, already added to ZIM")
+            return
+
         if record.rec_type == "response":
             if self.is_self_redirect(record, url):
                 logger.debug("Skipping self-redirect: " + url)
                 return
 
             payload_item = WARCPayloadItem(
-                normalized_url,
+                item_zim_path.value,
                 record,
                 self.head_template,
                 self.css_insert,
-                self.warc_urls,
+                self.expected_zim_items,
             )
 
             if len(payload_item.content) != 0:
@@ -541,15 +562,15 @@ def add_items_for_warc_record(self, record):
                 self.total_records += 1
                 self.update_stats()
 
-            self.indexed_urls.add(normalized_url)
+            self.added_zim_items.add(item_zim_path)
 
         elif (
             record.rec_type == "revisit"
             and record.rec_headers["WARC-Refers-To-Target-URI"] != url
-            and normalized_url not in self.revisits
+            and item_zim_path not in self.revisits
         ):  # pragma: no branch
-            self.revisits[normalized_url] = normalize(
-                record.rec_headers["WARC-Refers-To-Target-URI"]
+            self.revisits[item_zim_path] = normalize(
+                HttpUrl(record.rec_headers["WARC-Refers-To-Target-URI"])
             )
 
 

diff --git a/src/warc2zim/items.py b/src/warc2zim/items.py
@@ -15,6 +15,7 @@
 from zimscraperlib.zim.items import StaticItem
 
 from warc2zim.content_rewriting.generic import Rewriter
+from warc2zim.url_rewriting import ZimPath
 from warc2zim.utils import get_record_mime_type
 
 
@@ -29,13 +30,13 @@ def __init__(
         record: ArcWarcRecord,
         head_template: Template,
         css_insert: str | None,
-        known_urls: set[str],
+        existing_zim_paths: set[ZimPath],
     ):
         super().__init__()
 
         self.path = path
         self.mimetype = get_record_mime_type(record)
-        (self.title, self.content) = Rewriter(path, record, known_urls).rewrite(
+        (self.title, self.content) = Rewriter(path, record, existing_zim_paths).rewrite(
             head_template, css_insert
         )