
Commit

Decode content bytes only with supplied charset or static list of charsets to try

benoit74 committed Jun 17, 2024
1 parent 4c12681 commit b1c8a35
Showing 18 changed files with 1,343 additions and 271 deletions.
4 changes: 0 additions & 4 deletions docs/software_architecture.md
@@ -35,10 +35,6 @@ It provide two main features:

Except that, scraper directly uses WarcRecord (returned by cdxj_indexer, implemented in warcio) to access metadata and such.

## chardet

[chardet Python library](https://pypi.org/project/chardet/) is used to detect character encoding of files when it is absent (only HTML file typically specify its encoding) or incoherent.

## zimscraperlib

[zimscraperlib Python library](https://pypi.org/project/zimscraperlib) is used for ZIM operations.
1 change: 0 additions & 1 deletion pyproject.toml
@@ -13,7 +13,6 @@ dependencies = [
"requests==2.32.3",
"zimscraperlib==3.3.2",
"jinja2==3.1.4",
"chardet==5.2.0",
# to support possible brotli content in warcs, must be added separately
"brotlipy==0.7.0",
"cdxj_indexer==1.4.5",
18 changes: 3 additions & 15 deletions src/warc2zim/content_rewriting/generic.py
@@ -63,6 +63,7 @@ def __init__(
existing_zim_paths: set[ZimPath],
missing_zim_paths: set[ZimPath] | None,
js_modules: set[ZimPath],
charsets_to_try: list[str],
):
self.content = get_record_content(record)

@@ -78,24 +79,11 @@ def __init__(

self.rewrite_mode = self.get_rewrite_mode(record, mimetype)
self.js_modules = js_modules
self.charsets_to_try = charsets_to_try

@property
def content_str(self) -> str:
try:
result = to_string(self.content, self.encoding)
if self.encoding and result.encoding and result.encoding != self.encoding:
logger.warning(
f"Encoding issue, '{result.encoding}' has been used instead of "
f"'{self.encoding}' to decode content of '{self.orig_url_str}'"
)
if result.chars_ignored:
logger.warning(
"Encoding issue, some chars had to be ignored to properly decode "
f"content of '{self.orig_url_str}' with '{result.encoding}'"
)
return result.value
except ValueError as e:
raise RuntimeError(f"Impossible to decode item {self.path.value}") from e
return to_string(self.content, self.encoding, self.charsets_to_try)

def rewrite(
self, pre_head_template: Template, post_head_template: Template
4 changes: 4 additions & 0 deletions src/warc2zim/converter.py
@@ -192,6 +192,9 @@ def __init__(self, args):
self.redirections: dict[ZimPath, ZimPath] = {}
self.missing_zim_paths: set[ZimPath] | None = set() if args.verbose else None
self.js_modules: set[ZimPath] = set()
self.charsets_to_try: list[str] = [
charset_to_try.strip() for charset_to_try in args.charsets_to_try.split(",")
]

# progress file handling
self.stats_filename = (
@@ -747,6 +750,7 @@ def add_items_for_warc_record(self, record):
self.expected_zim_items,
self.missing_zim_paths,
self.js_modules,
self.charsets_to_try,
)

if len(payload_item.content) != 0:
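For reference, the comma-splitting added above turns the single string received from the new `--charsets-to-try` option (added in main.py below) into the list handed down to the rewriters. A minimal illustration of that parsing, using made-up variable names rather than the converter's attributes:

```python
# Illustrative only: reproduces the split-and-strip parsing added in converter.py.
raw_value = "UTF-8, ISO-8859-1"  # what --charsets-to-try might receive
charsets_to_try = [charset.strip() for charset in raw_value.split(",")]
assert charsets_to_try == ["UTF-8", "ISO-8859-1"]
```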
8 changes: 7 additions & 1 deletion src/warc2zim/items.py
@@ -33,13 +33,19 @@ def __init__(
existing_zim_paths: set[ZimPath],
missing_zim_paths: set[ZimPath] | None,
js_modules: set[ZimPath],
charsets_to_try: list[str],
):
super().__init__()

self.path = path.value
self.mimetype = get_record_mime_type(record)
(self.title, self.content) = Rewriter(
path, record, existing_zim_paths, missing_zim_paths, js_modules
path,
record,
existing_zim_paths,
missing_zim_paths,
js_modules,
charsets_to_try,
).rewrite(pre_head_template, post_head_template)

def get_hints(self):
8 changes: 8 additions & 0 deletions src/warc2zim/main.py
@@ -110,6 +110,14 @@ def main(raw_args=None):
dest="disable_metadata_checks",
)

parser.add_argument(
"--charsets-to-try",
help="List of charsets to try to decode content when charset is not defined at "
"document or HTTP level. Single string, values separated by a comma. Default: "
"UTF-8,ISO-8859-1",
default="UTF-8,ISO-8859-1",
)

args = parser.parse_args(args=raw_args)
converter = Converter(args)
return converter.run()
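The new option takes a single comma-separated string, as the help text above describes. A hypothetical way to exercise it through the `main(raw_args=...)` entrypoint shown in this diff; the positional WARC path and the `--name` flag are assumptions based on typical warc2zim usage and are not part of this commit:

```python
# Hypothetical usage sketch; running it performs a full conversion.
from warc2zim.main import main

main(raw_args=[
    "example.warc.gz",    # assumed positional WARC input
    "--name", "example",  # assumed ZIM name argument
    # New option from this commit; omitting it falls back to "UTF-8,ISO-8859-1".
    "--charsets-to-try", "UTF-8,ISO-8859-1,windows-1252",
])
```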
102 changes: 34 additions & 68 deletions src/warc2zim/utils.py
@@ -5,9 +5,7 @@

import re
from http import HTTPStatus
from typing import NamedTuple

import chardet
from bs4 import BeautifulSoup
from warcio.recordloader import ArcWarcRecord

Expand All @@ -19,12 +17,6 @@
)


class StringConversionResult(NamedTuple):
value: str
encoding: str | None
chars_ignored: bool


def get_version():
return __version__

@@ -132,84 +124,58 @@ def get_record_encoding(record: ArcWarcRecord) -> str | None:
return m.group("encoding")


def to_string(input_: str | bytes, encoding: str | None) -> StringConversionResult:
def to_string(
input_: str | bytes, http_encoding: str | None, charsets_to_try: list[str]
) -> str:
"""
Decode content to string, trying to be the more tolerant possible to invalid
declared encoding.
Decode content to string based on charset declared in content or fallback.
This method tries not to be smarter than necessary.
This try to decode the content using 3 methods:
- From http headers in the warc record (given as `encoding` argument)
- From encoding declaration inside the content (hopping that content can be
losely decode using ascii to something usable)
- From statistical analysis of the content (made by chardet)
First, it tries to find a charset declaration inside the first bytes of the content
(hoping that the first bytes can be loosely decoded with a few known encodings into
something usable). If found, it is used to decode and any bad character is
automatically replaced, assuming the document editor is right.
If all these methods fails, try again with the encoding passed via http headers but
ignore all unrecognized characters.
Second, if no charset declaration has been found in the content, it uses the charset
declared in the HTTP `Content-Type` header, passed to this method as the
`http_encoding` argument. If present, it is used to decode and any bad character is
automatically replaced, assuming the web server is right.
Returns the decoded content, the encoding used (or None if the input was already
decoded) and a boolean indicating wether unrecognized characters had to been ignored
or not.
Finally, we fall back to the `charsets_to_try` argument, which is a list of charsets
to try. Each charset is tried in order, but any bad character found raises an
error. If none of these charsets manages to decode the content, an exception is
raised.
Returns the decoded content.
"""
http_encoding = encoding

tried_encodings: set[str] = set()
if isinstance(input_, str):
return StringConversionResult(input_, None, False)
return input_

if not input_:
# Empty bytes are easy to decode
return StringConversionResult("", None, False)

if encoding:
try:
return StringConversionResult(input_.decode(encoding), encoding, False)
except (ValueError, LookupError):
tried_encodings.add(encoding)
pass
return ""

# Search for encoding from content first bytes based on regexp
content_start = input_[:1024].decode("ascii", errors="replace")
if m := ENCODING_RE.search(content_start):
encoding = m.group("encoding")
if encoding and encoding not in tried_encodings:
try:
return StringConversionResult(input_.decode(encoding), encoding, False)
except (ValueError, LookupError):
tried_encodings.add(encoding)
pass

# Try to detect the most probable encoding with chardet (and only most probable
# one, since otherwise we will likely find an encoding which pass but produces only
# garbage with most characters badly decoded just due to a wrongly encoded character
# see https://github.com/openzim/warc2zim/issues/221)
# Nota: we use the detect_all method of chardet even if we are interesting only in
# the most probable encoding, because (as-of chardet 5.2.0 at least) the detect
# chardet method seems to be more naive, and detect_all gives better results in our
# tests
chardet_encodings = chardet.detect_all(input_)
if len(chardet_encodings):
chardet_encoding = chardet_encodings[0]["encoding"]
if chardet_encoding and chardet_encoding not in tried_encodings:
try:
return StringConversionResult(
input_.decode(chardet_encoding), chardet_encoding, False
)
except (ValueError, LookupError):
tried_encodings.add(chardet_encoding)
pass

# Try again encoding detected by chardet (most probable one), but this time ignore
# all bad chars
for encoding in ["ascii", "utf-16", "utf-32"]:
content_start = input_[:1024].decode(encoding, errors="replace")
if m := ENCODING_RE.search(content_start):
head_encoding = m.group("encoding")
return input_.decode(head_encoding, errors="replace")

if http_encoding:
return input_.decode(http_encoding, errors="replace")

# Try all charsets_to_try passed
for charset_to_try in charsets_to_try:
try:
return StringConversionResult(
input_.decode(http_encoding, errors="ignore"), http_encoding, True
)
return input_.decode(charset_to_try)
except (ValueError, LookupError):
pass

raise ValueError(f"Impossible to decode content {input_[:200]}")
raise ValueError(f"No suitable charset found to decode content {input_[:200]}")


def get_record_content(record: ArcWarcRecord):
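Taken together, the rewritten `to_string` is a three-step cascade: a charset declared in the first bytes of the document wins, then the charset from the HTTP `Content-Type` header, and finally the configured fallback list, tried strictly and in order. A condensed, self-contained sketch of that flow; the regex is a simplified stand-in for the module's `ENCODING_RE` and the function name is illustrative, not the actual module code:

```python
import re

# Simplified stand-in for ENCODING_RE; the real pattern in utils.py is more thorough.
ENCODING_RE = re.compile(r'charset=["\']?(?P<encoding>[A-Za-z0-9_\-]+)', re.I)


def decode_content(
    content: bytes, http_encoding: str | None, charsets_to_try: list[str]
) -> str:
    # 1. A charset declared inside the document wins; bad chars are replaced.
    for probe in ("ascii", "utf-16", "utf-32"):
        start = content[:1024].decode(probe, errors="replace")
        if m := ENCODING_RE.search(start):
            return content.decode(m.group("encoding"), errors="replace")
    # 2. Otherwise trust the HTTP-level charset, again replacing bad chars.
    if http_encoding:
        return content.decode(http_encoding, errors="replace")
    # 3. Finally try the configured fallbacks strictly, in declared order.
    for charset in charsets_to_try:
        try:
            return content.decode(charset)
        except (ValueError, LookupError):
            pass
    raise ValueError("No suitable charset found to decode content")
```

Note that ISO-8859-1 maps every possible byte value, so with the default fallback list the strict third step always yields a string (possibly mojibake) once UTF-8 fails; the final exception is only reachable with a stricter custom list.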
87 changes: 87 additions & 0 deletions tests/encodings/definition.json
@@ -0,0 +1,87 @@
{
"files": [
{
"filename": "file01.js",
"source": "https://www.marxists.org/espanol/menu.js",
"date": "2024-06",
"probable_charset": "ISO-8859-1",
"expected_strings": [
"Afanásiev, Víktor",
"Andrópov, Yuri",
"Amaguaña, Tránsito",
"Cunhal, Álvaro",
"De la Cruz, Juana Inés",
"Faure, Sèbastien"
]
},
{
"filename": "file02.js",
"source": "https://www.cloudflare.com/vendor/onetrust/scripttemplates/202308.2.0/otBannerSdk.js",
"date": "2024-06",
"probable_charset": "UTF-8",
"expected_strings": [
"_Container:\"#ot-ven-lst\",P_Ven_Bx:\"ot-ven-box\",P_Ven_Name:\".ot-ven-name\"",
"ist,IabType:e.IabType,InactiveText:e.InactiveText,IsConsentLoggingEnabled:e.IsConsentLoggingEnabl",
"0;\\n transition: visibility 0s \"+e+\"ms, opacity \"+e+\"ms linear;\\n \",!0);var",
"r.prototype.escapeRegExp=function(e){return e.replace(/[-/\\\\^$*+?.()|[\\]{}]/g,\"\\\\$&\")}"
]
},
{
"filename": "file03.html",
"source": "https://www.solidarite-numerique.fr/tutoriels/comprendre-les-cookies/?thematique=internet",
"date": "2024-06",
"probable_charset": "UTF-8",
"contains_bad_chars": true,
"expected_strings": [
"Vous souhaitez changer de navigateur et utiliser Firefox ? Ce tutoriel vous détaille la procédure d'installation et la configuration pour une premi�..."
]
},
{
"filename": "file04.js",
"source": "https://static.mailerlite.com/js/w/ml_jQuery.inputmask.bundle.min.js?v3.3.1",
"date": "2024-06",
"probable_charset": "ascii",
"expected_strings": [
"1,this.isOptional=b||!1,this.isQuantifier=c||!1,this.isAlterna",
"is;if(na=!1,g.clearMaskOnLostFocus&&document.activeElement!==b){var c=x().slice(),d=b.inputmask._v"
]
},
{
"filename": "file05.js",
"source": "https://static.sketchfab.com/static/builds/web/dist/ac0f732c4fc1a30c77920d75c1a9be83-v2.js",
"date": "2024-06",
"probable_charset": "ascii",
"expected_strings": [
"isTickUsed||(this._isTickUsed=!0,this._schedule(this.drainQueues))},s.prototype._reset=function(){this._is"
]
},
{
"filename": "file06.html",
"source": "https://website.test.openzim.org/chinese-encoding.html",
"date": "2024-06",
"known_charset": "gb2312",
"expected_strings": [
"simplified chinese characters: 汉字"
]
},
{
"filename": "file07.html",
"source": "https://website.test.openzim.org/chinese-encoding.html without <meta> header",
"date": "2024-06",
"known_charset": "gb2312",
"http_charset": "gb2312",
"expected_strings": [
"simplified chinese characters: 汉字"
]
},
{
"filename": "file08.js",
"source": "https://community.mozilla.org/wp-content/plugins/events-manager/includes/js/events-manager.min.js?ver=6.4.1",
"date": "2024-06",
"probable_charset": "UTF-8",
"expected_strings": [
"t Array]\"===Object.prototype.toString.call(e)},s={a:\"[aḀḁĂăÂâǍǎȺⱥȦȧẠạÄäÀàÁáĀāÃãÅåąĄÃąĄ]\",b:\"[b␢β"
]
}
]
}
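The new fixture file above drives the charset-decoding tests: each entry names a sample file, its origin, the charset it is known or presumed to use, and strings that must survive decoding. A sketch of how a parametrized test could consume it; the test module, fixture layout, and helper names are assumptions, since the corresponding test code is not shown in this excerpt:

```python
# Hypothetical test sketch; only tests/encodings/definition.json appears in this
# excerpt, so the file layout and test names here are assumptions.
import json
from pathlib import Path

import pytest

from warc2zim.utils import to_string

ENCODINGS_DIR = Path(__file__).parent / "encodings"
DEFINITIONS = json.loads((ENCODINGS_DIR / "definition.json").read_text())["files"]


@pytest.mark.parametrize("definition", DEFINITIONS, ids=lambda d: d["filename"])
def test_decode_sample_files(definition):
    raw = (ENCODINGS_DIR / definition["filename"]).read_bytes()
    result = to_string(
        raw,
        definition.get("http_charset"),  # charset from HTTP headers, when declared
        ["UTF-8", "ISO-8859-1"],         # the default fallback list
    )
    for expected in definition["expected_strings"]:
        assert expected in result
```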
