Skip to content

Commit

Permalink
Merge pull request #351 from openzim/ignore_non_http_records
Browse files: browse the repository at this point in the history
Ignore non HTTP(S) WARC records
  • Loading branch information
benoit74 authored Jul 30, 2024
2 parents 030f91a + dd3da13 commit 95790fa
Show file tree
Hide file tree
Showing 3 changed files with 18 additions and 2 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Handle case where the redirect target is bad / unsupported (#332 and #356)
- Fixed WARC files handling order to follow creation order (#366)
- Remove subsequent slashes in URLs, both in Python and JS (#365)
- Ignore non HTTP(S) WARC records (#351)

## [2.0.3] - 2024-07-24

Expand Down
15 changes: 15 additions & 0 deletions src/warc2zim/converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -407,6 +407,11 @@ def gather_information_from_warc(self):
continue

url = get_record_url(record)

# ignore non HTTP(S) URLs (intent:// for instance, see #332)
if not (url.startswith("http://") or url.startswith("https://")):
continue

zim_path = normalize(HttpUrl(url))

status_code = get_status_code(record)
Expand Down Expand Up @@ -675,6 +680,9 @@ def retrieve_illustration(self):
if record.rec_type != "response":
continue
url = get_record_url(record)
# ignore non HTTP(S) URLs (intent:// for instance, see #332)
if not (url.startswith("http://") or url.startswith("https://")):
continue
path = normalize(HttpUrl(url))
if path == self.favicon_path or url == self.favicon_url:
logger.debug("Found WARC record for favicon")
Expand Down Expand Up @@ -748,6 +756,13 @@ def add_items_for_warc_record(self, record):
logger.debug(f"Skipping record with empty WARC-Target-URI {record}")
return

url = str(url)

# ignore non HTTP(S) URLs (intent:// for instance, see #332)
if not (url.startswith("http://") or url.startswith("https://")):
logger.debug(f"Skipping record with non HTTP(S) WARC-Target-URI {url}")
return

item_zim_path = normalize(HttpUrl(url))

# if include_domains is set, only include urls from those domains
Expand Down
4 changes: 2 additions & 2 deletions src/warc2zim/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,12 +21,12 @@ def get_version():
return __version__


def get_record_url(record):
def get_record_url(record) -> str:
"""Check if record has a URL converted from POST/PUT and, if so, use that;
otherwise return the target URL."""
if hasattr(record, "urlkey"):
return record.urlkey
return record.rec_headers["WARC-Target-URI"]
return str(record.rec_headers["WARC-Target-URI"])


def get_status_code(record: ArcWarcRecord) -> HTTPStatus | int | None:
Expand Down

0 comments on commit 95790fa

Please sign in to comment.