Skip to content

Commit

Permalink
Try several encodings.
Browse files Browse the repository at this point in the history
Even chardet may return an invalid encoding.

By looping over all potential encodings detected by chardet, we are more
tolerant of such detection failures.
  • Loading branch information
mgautierfr committed Feb 13, 2024
1 parent 5ec4b38 commit 71f6aa2
Showing 1 changed file with 10 additions and 4 deletions.
14 changes: 10 additions & 4 deletions src/warc2zim/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,13 +77,19 @@ def to_string(input_: str | bytes, encoding: str | None) -> str:
# First try declared encoding in content.
content_start = input_[:1024].decode("ascii", errors="replace")
if m := ENCODING_RE.search(content_start):
encoding = m.group("encoding")
encodings = [m.group("encoding")]
else:
encoding = chardet.detect(input_)["encoding"]
all_encodings = chardet.detect_all(input_)
encodings = [e["encoding"] for e in all_encodings]

if not encoding:
if not encodings:
raise ValueError(f"Impossible to detect encoding of content {input_[:200]}")
return input_.decode(encoding)
for encoding in encodings:
try:
return input_.decode(encoding)
except ValueError:
pass
raise ValueError(f"Impossible to decode content {input_[:200]}")


def get_record_content(record: ArcWarcRecord):
Expand Down

0 comments on commit 71f6aa2

Please sign in to comment.