From dd3da130c4f4fb2ab38cc6127e9d988a2a8c9811 Mon Sep 17 00:00:00 2001 From: benoit74 Date: Wed, 17 Jul 2024 12:52:51 +0000 Subject: [PATCH] Ignore non HTTP(S) WARC records --- CHANGELOG.md | 1 + src/warc2zim/converter.py | 15 +++++++++++++++ src/warc2zim/utils.py | 4 ++-- 3 files changed, 18 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 339c178..fdb6df0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,6 +12,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Handle case where the redirect target is bad / unsupported (#332 and #356) - Fixed WARC files handling order to follow creation order (#366) - Remove subsequent slashes in URLs, both in Python and JS (#365) +- Ignore non HTTP(S) WARC records (#351) ## [2.0.3] - 2024-07-24 diff --git a/src/warc2zim/converter.py b/src/warc2zim/converter.py index bf666f3..07adad2 100644 --- a/src/warc2zim/converter.py +++ b/src/warc2zim/converter.py @@ -407,6 +407,11 @@ def gather_information_from_warc(self): continue url = get_record_url(record) + + # ignore non HTTP(S) URLs (intent:// for instance, see #332) + if not (url.startswith("http://") or url.startswith("https://")): + continue + zim_path = normalize(HttpUrl(url)) status_code = get_status_code(record) @@ -675,6 +680,9 @@ def retrieve_illustration(self): if record.rec_type != "response": continue url = get_record_url(record) + # ignore non HTTP(S) URLs (intent:// for instance, see #332) + if not (url.startswith("http://") or url.startswith("https://")): + continue path = normalize(HttpUrl(url)) if path == self.favicon_path or url == self.favicon_url: logger.debug("Found WARC record for favicon") @@ -748,6 +756,13 @@ def add_items_for_warc_record(self, record): logger.debug(f"Skipping record with empty WARC-Target-URI {record}") return + url = str(url) + + # ignore non HTTP(S) URLs (intent:// for instance, see #332) + if not (url.startswith("http://") or url.startswith("https://")): + logger.debug(f"Skipping record with non HTTP(S) WARC-Target-URI {url}") + return + item_zim_path = normalize(HttpUrl(url)) # if include_domains is set, only include urls from those domains diff --git a/src/warc2zim/utils.py b/src/warc2zim/utils.py index 707d307..8e38166 100644 --- a/src/warc2zim/utils.py +++ b/src/warc2zim/utils.py @@ -21,12 +21,12 @@ def get_version(): return __version__ -def get_record_url(record): +def get_record_url(record) -> str: """Check if record has url converted from POST/PUT, and if so, use that otherwise return the target url""" if hasattr(record, "urlkey"): return record.urlkey - return record.rec_headers["WARC-Target-URI"] + return str(record.rec_headers["WARC-Target-URI"]) def get_status_code(record: ArcWarcRecord) -> HTTPStatus | int | None: