Skip to content

Commit

Permalink
use more sophisticated encoding detection when utf8 decoding fails. (#…
Browse files Browse the repository at this point in the history
…172)

Some websites, especially older ones, use an encoding other than UTF-8 or Latin-1. Previously, we simply tried different encodings in turn until one decoded the bytes successfully, but this approach can fail unexpectedly: bytes written in one encoding can often be decoded as a different encoding without any error being reported, which yields the wrong text.
We now use the encoding detection crate 'chardetng', which is also [used in Firefox](https://github.com/hsivonen/chardetng?tab=readme-ov-file#purpose).
  • Loading branch information
mikkeldenker authored Mar 5, 2024
1 parent c7e596a commit f8c58b3
Show file tree
Hide file tree
Showing 5 changed files with 55 additions and 25 deletions.
12 changes: 12 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

21 changes: 11 additions & 10 deletions Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
[workspace]
members = [
"crates/core",
"crates/optics",
"crates/kuchiki",
"crates/zimba",
"crates/core",
"crates/optics",
"crates/kuchiki",
"crates/zimba",
]
resolver = "2"

Expand All @@ -24,16 +24,20 @@ axum-extra = {version = "0.9.0"}
axum-macros = "0.4.0"
base64 = "0.21.4"
bincode = "1.3.3"
bindgen = "0.69.2"
bitvec = "1.0.1"
bytemuck = {version = "1.13.1", features = ["derive"]}
byteorder = "1.4.3"
bzip2 = "0.4.4"
candle-transformers = "0.3.3"
candle-nn = "0.3.3"
candle-core = "0.3.3"
candle-nn = "0.3.3"
candle-transformers = "0.3.3"
cc = {version = "1", features = ["parallel"]}
chardetng = "0.1.17"
chitchat = "0.5.0"
chrono = {version = "0.4.23", features = ["serde"]}
clap = {version = "4.4.6", features = ["derive"]}
cmake = "0.1"
criterion = "0.5.1"
crossbeam-channel = "0.5.6"
csv = "1.1.6"
Expand All @@ -46,7 +50,7 @@ fnv = "1.0.3"
fst = {version = "0.4.7", features = ["levenshtein"]}
futures = "0.3.21"
half = {version = "2.2.1", features = ["serde"]}
hashbrown = {version = "0.14.0", features = ["serde" ]}
hashbrown = {version = "0.14.0", features = ["serde"]}
http = "1.0.0"
image = "0.24.3"
indicatif = {version = "0.17.7", features = ["rayon"]}
Expand Down Expand Up @@ -110,9 +114,6 @@ utoipa-swagger-ui = {version = "5.0.0", features = ["axum"]}
uuid = "1.1.2"
whatlang = {version = "0.16.0", features = ["serde"]}
zstd = "0.13"
bindgen = "0.69.2"
cmake = "0.1"
cc = { version = "1", features = ["parallel"] }

[profile.test.package]
flate2.opt-level = 3
3 changes: 2 additions & 1 deletion assets/licenses.html
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ <h1>Third Party Licenses</h1>

<h2>Overview of licenses:</h2>
<ul class="licenses-overview">
<li><a href="#Apache-2.0">Apache License 2.0</a> (388)</li>
<li><a href="#Apache-2.0">Apache License 2.0</a> (389)</li>
<li><a href="#MIT">MIT License</a> (175)</li>
<li><a href="#MPL-2.0">Mozilla Public License 2.0</a> (9)</li>
<li><a href="#BSD-3-Clause">BSD 3-Clause &quot;New&quot; or &quot;Revised&quot; License</a> (8)</li>
Expand Down Expand Up @@ -1101,6 +1101,7 @@ <h4>Used by:</h4>
<h3 id="Apache-2.0">Apache License 2.0</h3>
<h4>Used by:</h4>
<ul class="license-used-by">
<li><a href=" https://github.com/hsivonen/chardetng ">chardetng 0.1.17</a></li>
<li><a href=" https://github.com/KyleMayes/clang-sys ">clang-sys 1.7.0</a></li>
<li><a href=" https://github.com/tormol/encode_unicode ">encode_unicode 0.3.6</a></li>
<li><a href=" https://github.com/hsivonen/encoding_rs ">encoding_rs 0.8.33</a></li>
Expand Down
7 changes: 4 additions & 3 deletions crates/core/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@ default-run = "stract"
edition = "2021"
license = "AGPL-3.0"
name = "stract"
version = "0.1.0"
publish = false
version = "0.1.0"

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

Expand Down Expand Up @@ -33,9 +33,10 @@ bitvec = {workspace = true}
bytemuck = {workspace = true}
byteorder = {workspace = true}
bzip2 = {workspace = true}
candle-transformers = {workspace = true}
candle-nn = {workspace = true}
candle-core = {workspace = true}
candle-nn = {workspace = true}
candle-transformers = {workspace = true}
chardetng = {workspace = true}
chitchat = {workspace = true}
chrono = {workspace = true}
clap = {workspace = true}
Expand Down
37 changes: 26 additions & 11 deletions crates/core/src/warc.rs
Original file line number Diff line number Diff line change
Expand Up @@ -42,19 +42,17 @@ fn rtrim(s: &mut String) {
s.truncate(s.trim_end().len());
}

fn decode(raw: &[u8]) -> String {
fn decode_string(raw: &[u8]) -> String {
if let Ok(res) = String::from_utf8(raw.to_owned()) {
res
} else {
let encodings = [
encoding_rs::WINDOWS_1251,
encoding_rs::GBK,
encoding_rs::SHIFT_JIS,
encoding_rs::EUC_JP,
encoding_rs::EUC_KR,
];

for enc in encodings {
let mut detector = chardetng::EncodingDetector::new();
detector.feed(raw, true);
let (enc, conf) = detector.guess_assess(None, true);

if conf {
dbg!(enc.name());

let (cow, _, had_errors) = enc.decode(raw);
if !had_errors {
return cow.to_string();
Expand Down Expand Up @@ -284,7 +282,7 @@ pub struct Response {

impl Response {
fn from_raw(record: RawWarcRecord) -> Result<Self> {
let content = decode(&record.content[..]);
let content = decode_string(&record.content[..]);

let (_header, content) = content
.split_once("\r\n\r\n")
Expand Down Expand Up @@ -852,6 +850,23 @@ mod tests {
assert_eq!(records[0].metadata.fetch_time_ms, 0);
}

/// Encodes sample strings with several legacy encodings and checks that
/// `decode_string` recovers the original text from the encoded bytes.
#[test]
fn character_encodings() {
    // Each case pairs an encoding with text that is representable in it.
    let cases = [
        (
            encoding_rs::WINDOWS_1252,
            "Groupe CROISEUR LEGER après 10 courses",
        ),
        (encoding_rs::EUC_JP, "あいうえお"),
        (encoding_rs::EUC_KR, "안녕하세요"),
    ];

    for (encoding, expected) in cases {
        // Only the encoded bytes are needed; the reported output encoding
        // and the unmappable-character flag are ignored.
        let (bytes, _, _) = encoding.encode(expected);
        let decoded = decode_string(&bytes);

        assert_eq!(
            expected,
            decoded,
            "Failed for encoding {:?}",
            encoding.name()
        );
    }
}

proptest! {
#[test]
fn write_read_invariant_prop(records: Vec<WarcRecord>) {
Expand Down

0 comments on commit f8c58b3

Please sign in to comment.