Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix detection of encoding again #314

Merged
merged 1 commit into from
Jun 17, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 0 additions & 4 deletions docs/software_architecture.md
Original file line number Diff line number Diff line change
Expand Up @@ -35,10 +35,6 @@ It provide two main features:

Except that, scraper directly uses WarcRecord (returned by cdxj_indexer, implemented in warcio) to access metadata and such.

## chardet

[chardet Python library](https://pypi.org/project/chardet/) is used to detect the character encoding of files when it is absent (only HTML files typically specify their encoding) or incoherent.

## zimscraperlib

[zimscraperlib Python library](https://pypi.org/project/zimscraperlib) is used for ZIM operations.
Expand Down
1 change: 0 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@ dependencies = [
"requests==2.32.3",
"zimscraperlib==3.3.2",
"jinja2==3.1.4",
"chardet==5.2.0",
# to support possible brotli content in warcs, must be added separately
"brotlipy==0.7.0",
"cdxj_indexer==1.4.5",
Expand Down
18 changes: 3 additions & 15 deletions src/warc2zim/content_rewriting/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ def __init__(
existing_zim_paths: set[ZimPath],
missing_zim_paths: set[ZimPath] | None,
js_modules: set[ZimPath],
charsets_to_try: list[str],
):
self.content = get_record_content(record)

Expand All @@ -78,24 +79,11 @@ def __init__(

self.rewrite_mode = self.get_rewrite_mode(record, mimetype)
self.js_modules = js_modules
self.charsets_to_try = charsets_to_try

@property
def content_str(self) -> str:
try:
result = to_string(self.content, self.encoding)
if self.encoding and result.encoding and result.encoding != self.encoding:
logger.warning(
f"Encoding issue, '{result.encoding}' has been used instead of "
f"'{self.encoding}' to decode content of '{self.orig_url_str}'"
)
if result.chars_ignored:
logger.warning(
"Encoding issue, some chars had to be ignored to properly decode "
f"content of '{self.orig_url_str}' with '{result.encoding}'"
)
return result.value
except ValueError as e:
raise RuntimeError(f"Impossible to decode item {self.path.value}") from e
return to_string(self.content, self.encoding, self.charsets_to_try)

def rewrite(
self, pre_head_template: Template, post_head_template: Template
Expand Down
4 changes: 4 additions & 0 deletions src/warc2zim/converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -192,6 +192,9 @@ def __init__(self, args):
self.redirections: dict[ZimPath, ZimPath] = {}
self.missing_zim_paths: set[ZimPath] | None = set() if args.verbose else None
self.js_modules: set[ZimPath] = set()
self.charsets_to_try: list[str] = [
charset_to_try.strip() for charset_to_try in args.charsets_to_try.split(",")
]

# progress file handling
self.stats_filename = (
Expand Down Expand Up @@ -747,6 +750,7 @@ def add_items_for_warc_record(self, record):
self.expected_zim_items,
self.missing_zim_paths,
self.js_modules,
self.charsets_to_try,
)

if len(payload_item.content) != 0:
Expand Down
8 changes: 7 additions & 1 deletion src/warc2zim/items.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,13 +33,19 @@ def __init__(
existing_zim_paths: set[ZimPath],
missing_zim_paths: set[ZimPath] | None,
js_modules: set[ZimPath],
charsets_to_try: list[str],
):
super().__init__()

self.path = path.value
self.mimetype = get_record_mime_type(record)
(self.title, self.content) = Rewriter(
path, record, existing_zim_paths, missing_zim_paths, js_modules
path,
record,
existing_zim_paths,
missing_zim_paths,
js_modules,
charsets_to_try,
).rewrite(pre_head_template, post_head_template)

def get_hints(self):
Expand Down
8 changes: 8 additions & 0 deletions src/warc2zim/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,14 @@ def main(raw_args=None):
dest="disable_metadata_checks",
)

parser.add_argument(
"--charsets-to-try",
help="List of charsets to try decode content when charset is not defined at "
"document or HTTP level. Single string, values separated by a comma. Default: "
"UTF-8,ISO-8859-1",
default="UTF-8,ISO-8859-1",
)

args = parser.parse_args(args=raw_args)
converter = Converter(args)
return converter.run()
Expand Down
102 changes: 34 additions & 68 deletions src/warc2zim/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,7 @@

import re
from http import HTTPStatus
from typing import NamedTuple

import chardet
from bs4 import BeautifulSoup
from warcio.recordloader import ArcWarcRecord

Expand All @@ -19,12 +17,6 @@
)


class StringConversionResult(NamedTuple):
value: str
encoding: str | None
chars_ignored: bool


def get_version():
return __version__

Expand Down Expand Up @@ -132,84 +124,58 @@ def get_record_encoding(record: ArcWarcRecord) -> str | None:
return m.group("encoding")


def to_string(input_: str | bytes, encoding: str | None) -> StringConversionResult:
def to_string(
input_: str | bytes, http_encoding: str | None, charsets_to_try: list[str]
) -> str:
"""
Decode content to string, trying to be the more tolerant possible to invalid
declared encoding.
Decode content to string based on charset declared in content or fallback.
This method tries to not be smarter than necessary.
This try to decode the content using 3 methods:
- From http headers in the warc record (given as `encoding` argument)
- From encoding declaration inside the content (hopping that content can be
losely decode using ascii to something usable)
- From statistical analysis of the content (made by chardet)
First, it tries to find a charset declaration inside the first bytes of the content
(hoping that the content's first bytes can be loosely decoded using a few known
encodings to something usable). If found, it is used to decode and any bad character
is automatically replaced, assuming the document editor is right.
If all these methods fails, try again with the encoding passed via http headers but
ignore all unrecognized characters.
Second, if no charset declaration has been found in the content, it uses the charset
declared in the HTTP `Content-Type` header. This is passed to this method as the
`http_encoding` argument. If present, it is used to decode and any bad character is
automatically replaced, assuming the web server is right.
Returns the decoded content, the encoding used (or None if the input was already
decoded) and a boolean indicating wether unrecognized characters had to been ignored
or not.
Finally, we fall back to the `charsets_to_try` argument, which is a list of charsets
to try. Each charset is tried in order, but any bad character found raises an
error. If none of these charsets succeeds in decoding the content, an exception is
raised.
Returns the decoded content.
"""
http_encoding = encoding

tried_encodings: set[str] = set()
if isinstance(input_, str):
return StringConversionResult(input_, None, False)
return input_

if not input_:
# Empty bytes are easy to decode
return StringConversionResult("", None, False)

if encoding:
try:
return StringConversionResult(input_.decode(encoding), encoding, False)
except (ValueError, LookupError):
tried_encodings.add(encoding)
pass
return ""

# Search for encoding from content first bytes based on regexp
content_start = input_[:1024].decode("ascii", errors="replace")
if m := ENCODING_RE.search(content_start):
encoding = m.group("encoding")
if encoding and encoding not in tried_encodings:
try:
return StringConversionResult(input_.decode(encoding), encoding, False)
except (ValueError, LookupError):
tried_encodings.add(encoding)
pass

# Try to detect the most probable encoding with chardet (and only most probable
# one, since otherwise we will likely find an encoding which pass but produces only
# garbage with most characters badly decoded just due to a wrongly encoded character
# see https://github.com/openzim/warc2zim/issues/221)
# Nota: we use the detect_all method of chardet even if we are interesting only in
# the most probable encoding, because (as-of chardet 5.2.0 at least) the detect
# chardet method seems to be more naive, and detect_all gives better results in our
# tests
chardet_encodings = chardet.detect_all(input_)
if len(chardet_encodings):
chardet_encoding = chardet_encodings[0]["encoding"]
if chardet_encoding and chardet_encoding not in tried_encodings:
try:
return StringConversionResult(
input_.decode(chardet_encoding), chardet_encoding, False
)
except (ValueError, LookupError):
tried_encodings.add(chardet_encoding)
pass

# Try again encoding detected by chardet (most probable one), but this time ignore
# all bad chars
for encoding in ["ascii", "utf-16", "utf-32"]:
content_start = input_[:1024].decode(encoding, errors="replace")
if m := ENCODING_RE.search(content_start):
head_encoding = m.group("encoding")
return input_.decode(head_encoding, errors="replace")

if http_encoding:
return input_.decode(http_encoding, errors="replace")

# Try all charsets_to_try passed
for charset_to_try in charsets_to_try:
try:
return StringConversionResult(
input_.decode(http_encoding, errors="ignore"), http_encoding, True
)
return input_.decode(charset_to_try)
except (ValueError, LookupError):
pass

raise ValueError(f"Impossible to decode content {input_[:200]}")
raise ValueError(f"No suitable charset found to decode content {input_[:200]}")


def get_record_content(record: ArcWarcRecord):
Expand Down
87 changes: 87 additions & 0 deletions tests/encodings/definition.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
{
"files": [
{
"filename": "file01.js",
"source": "https://www.marxists.org/espanol/menu.js",
"date": "2024-06",
"probable_charset": "ISO-8859-1",
"expected_strings": [
"Afanásiev, Víktor",
"Andrópov, Yuri",
"Amaguaña, Tránsito",
"Cunhal, Álvaro",
"De la Cruz, Juana Inés",
"Faure, Sèbastien"
]
},
{
"filename": "file02.js",
"source": "https://www.cloudflare.com/vendor/onetrust/scripttemplates/202308.2.0/otBannerSdk.js",
"date": "2024-06",
"probable_charset": "UTF-8",
"expected_strings": [
"_Container:\"#ot-ven-lst\",P_Ven_Bx:\"ot-ven-box\",P_Ven_Name:\".ot-ven-name\"",
"ist,IabType:e.IabType,InactiveText:e.InactiveText,IsConsentLoggingEnabled:e.IsConsentLoggingEnabl",
"0;\\n transition: visibility 0s \"+e+\"ms, opacity \"+e+\"ms linear;\\n \",!0);var",
"r.prototype.escapeRegExp=function(e){return e.replace(/[-/\\\\^$*+?.()|[\\]{}]/g,\"\\\\$&\")}"
]
},
{
"filename": "file03.html",
"source": "https://www.solidarite-numerique.fr/tutoriels/comprendre-les-cookies/?thematique=internet",
"date": "2024-06",
"probable_charset": "UTF-8",
"contains_bad_chars": true,
"expected_strings": [
"Vous souhaitez changer de navigateur et utiliser Firefox ? Ce tutoriel vous détaille la procédure d'installation et la configuration pour une premi�..."
]
},
{
"filename": "file04.js",
"source": "https://static.mailerlite.com/js/w/ml_jQuery.inputmask.bundle.min.js?v3.3.1",
"date": "2024-06",
"probable_charset": "ascii",
"expected_strings": [
"1,this.isOptional=b||!1,this.isQuantifier=c||!1,this.isAlterna",
"is;if(na=!1,g.clearMaskOnLostFocus&&document.activeElement!==b){var c=x().slice(),d=b.inputmask._v"
]
},
{
"filename": "file05.js",
"source": "https://static.sketchfab.com/static/builds/web/dist/ac0f732c4fc1a30c77920d75c1a9be83-v2.js",
"date": "2024-06",
"probable_charset": "ascii",
"expected_strings": [
"isTickUsed||(this._isTickUsed=!0,this._schedule(this.drainQueues))},s.prototype._reset=function(){this._is"
]
},
{
"filename": "file06.html",
"source": "https://website.test.openzim.org/chinese-encoding.html",
"date": "2024-06",
"known_charset": "gb2312",
"expected_strings": [
"simplified chinese characters: 汉字"
]
},
{
"filename": "file07.html",
"source": "https://website.test.openzim.org/chinese-encoding.html without <meta> header",
"date": "2024-06",
"known_charset": "gb2312",
"http_charset": "gb2312",
"expected_strings": [
"simplified chinese characters: 汉字"
]
},
{
"filename": "file08.js",
"source": "https://community.mozilla.org/wp-content/plugins/events-manager/includes/js/events-manager.min.js?ver=6.4.1",
"date": "2024-06",
"probable_charset": "UTF-8",
"expected_strings": [
"t Array]\"===Object.prototype.toString.call(e)},s={a:\"[aḀḁĂăÂâǍǎȺⱥȦȧẠạÄäÀàÁáĀāÃãÅåąĄÃąĄ]\",b:\"[b␢β"
]
}
]
}
Loading