From 6841ca35da59ca9066e9dab9094702f2b6980f63 Mon Sep 17 00:00:00 2001 From: Gabe Berke-Williams Date: Sat, 31 Aug 2019 00:29:12 -0700 Subject: [PATCH] Re-encode if a website is not in UTF-8 It re-encodes in a particularly inefficient way: by parsing, searching for the meta charset tag, re-encoding the raw bytes using that encoding, and then re-parsing the re-encoded string. Hyperfine (https://github.com/sharkdp/hyperfine) shows that this is 36% slower than the previous version, but both versions complete nearly instantaneously, so I'm OK with it. --- Cargo.lock | 10 ++++++++++ Cargo.toml | 1 + src/main.rs | 30 ++++++++++++++++++++++++------ 3 files changed, 35 insertions(+), 6 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index a3261e9..9321006 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -27,6 +27,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" name = "candle" version = "0.2.0" dependencies = [ + "encoding_rs 0.8.17 (registry+https://github.com/rust-lang/crates.io-index)", "isatty 0.1.9 (registry+https://github.com/rust-lang/crates.io-index)", "regex 1.2.1 (registry+https://github.com/rust-lang/crates.io-index)", "scraper 0.10.1 (registry+https://github.com/rust-lang/crates.io-index)", @@ -93,6 +94,14 @@ name = "ego-tree" version = "0.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" +[[package]] +name = "encoding_rs" +version = "0.8.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "cfg-if 0.1.9 (registry+https://github.com/rust-lang/crates.io-index)", +] + [[package]] name = "fuchsia-cprng" version = "0.1.1" @@ -623,6 +632,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" "checksum dtoa 0.4.4 (registry+https://github.com/rust-lang/crates.io-index)" = "ea57b42383d091c85abcc2706240b94ab2a8fa1fc81c10ff23c4de06e2a90b5e" "checksum dtoa-short 0.3.2 (registry+https://github.com/rust-lang/crates.io-index)" = 
"59020b8513b76630c49d918c33db9f4c91638e7d3404a28084083b87e33f76f2" "checksum ego-tree 0.6.0 (registry+https://github.com/rust-lang/crates.io-index)" = "9733f6ada1734cb25b4033b2855ec78feb267971a2dd76012974906ba8780074" +"checksum encoding_rs 0.8.17 (registry+https://github.com/rust-lang/crates.io-index)" = "4155785c79f2f6701f185eb2e6b4caf0555ec03477cb4c70db67b465311620ed" "checksum fuchsia-cprng 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "a06f77d526c1a601b7c4cdd98f54b5eaabffc14d5f2f0296febdc7f357c6d3ba" "checksum futf 0.1.4 (registry+https://github.com/rust-lang/crates.io-index)" = "7c9c1ce3fa9336301af935ab852c437817d14cd33690446569392e65170aac3b" "checksum fxhash 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)" = "c31b6d751ae2c7f11320402d34e41349dd1016f8d5d45e48c4312bc8625af50c" diff --git a/Cargo.toml b/Cargo.toml index ab173a4..1319fc8 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -7,6 +7,7 @@ edition = "2018" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] +encoding_rs = "0.8" isatty = "0.1" regex = "1" scraper = "*" diff --git a/src/main.rs b/src/main.rs index bfa237c..f27d6e3 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,3 +1,4 @@ +use encoding_rs::Encoding; use isatty::stdin_isatty; use regex::Regex; use scraper::{Html, Selector}; @@ -7,20 +8,22 @@ use std::process; struct Inputs { selector: String, - html: String + html: String, + raw_bytes: Vec } -fn read_from_stdin() -> Option { +fn read_from_stdin() -> Option<(String, Vec)> { // It might not be valid UTF-8, so read to a vector of bytes and convert it to UTF-8, lossily let mut buffer: Vec = Vec::new(); io::stdin().read_to_end(&mut buffer).ok()?; - Some(String::from_utf8_lossy(&buffer).to_string()) + let string = String::from_utf8_lossy(&buffer).to_string(); + Some((string, buffer)) } fn read_inputs() -> Result { let selector = env::args().nth(1).ok_or("Usage: candle SELECTOR")?; - let html = 
read_from_stdin().ok_or("Error: couldn't read from STDIN")?; - Ok(Inputs { selector, html }) + let (html, raw_bytes) = read_from_stdin().ok_or("Error: couldn't read from STDIN")?; + Ok(Inputs { selector, html, raw_bytes }) } fn main() { @@ -66,8 +69,23 @@ fn select(document: scraper::Html, captures: regex::Captures) -> Result scraper::Html { + let meta_selector = Selector::parse("meta[charset]").unwrap(); + // If there's a `<meta charset="...">` tag, re-parse the HTML in that encoding. + // Otherwise, keep it exactly the same. + if let Some(meta_result) = document.select(&meta_selector).nth(0) { + let charset = meta_result.value().attr("charset").unwrap(); + match Encoding::for_label(charset.as_bytes()) { + Some(encoding) => Html::parse_document(&*encoding.decode(&inputs.raw_bytes).0), + None => document + } + } else { + document + } +} + fn parse(inputs: Inputs) -> Result, String> { - let document = Html::parse_document(&inputs.html); + let document = detect_encoding_and_re_parse(Html::parse_document(&inputs.html), &inputs); let re = Regex::new(r"(?P.+) (?:(?P\{text\})|(attr\{(?P[^}]+)\}))$").unwrap(); match re.captures(&inputs.selector) { Some(captures) => select(document, captures),