Skip to content

Commit

Permalink
Re-encode if a website is not in UTF-8
Browse files Browse the repository at this point in the history
It re-encodes in a particularly inefficient way: by parsing, searching
for the meta charset tag, re-encoding the raw bytes using that encoding,
and then re-parsing the re-encoded string.

Hyperfine (https://github.com/sharkdp/hyperfine) shows that this is 36%
slower than the previous version, but both versions complete nearly
instantaneously, so I'm OK with it.
  • Loading branch information
gabebw committed Aug 31, 2019
1 parent f76f44b commit 6841ca3
Show file tree
Hide file tree
Showing 3 changed files with 35 additions and 6 deletions.
10 changes: 10 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ edition = "2018"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[dependencies]
encoding_rs = "0.8"
isatty = "0.1"
regex = "1"
scraper = "*"
30 changes: 24 additions & 6 deletions src/main.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
use encoding_rs::Encoding;
use isatty::stdin_isatty;
use regex::Regex;
use scraper::{Html, Selector};
Expand All @@ -7,20 +8,22 @@ use std::process;

/// Everything the program needs to run: the CSS-like selector expression
/// from argv, plus the stdin payload in two forms — a lossy UTF-8 string
/// (for the first parse) and the untouched raw bytes (so the HTML can be
/// re-decoded if a `<meta charset>` tag names a non-UTF-8 encoding).
struct Inputs {
    selector: String,
    html: String,
    raw_bytes: Vec<u8>,
}

/// Reads all of stdin and returns it both as a lossily-decoded UTF-8 string
/// and as the original raw bytes.
///
/// Returns `None` if reading from stdin fails. The raw bytes are kept so a
/// caller can re-decode them later with a different encoding (e.g. one named
/// by a `<meta charset>` tag) without losing information to the lossy pass.
fn read_from_stdin() -> Option<(String, Vec<u8>)> {
    // Stdin might not be valid UTF-8, so read raw bytes first and convert
    // lossily; invalid sequences become U+FFFD replacement characters.
    let mut buffer: Vec<u8> = Vec::new();
    io::stdin().read_to_end(&mut buffer).ok()?;
    let string = String::from_utf8_lossy(&buffer).to_string();
    Some((string, buffer))
}

/// Gathers the program's inputs: the selector from the first CLI argument
/// and the HTML (string + raw bytes) from stdin.
///
/// Returns a human-readable error message if either is missing, suitable
/// for printing directly to the user.
fn read_inputs() -> Result<Inputs, String> {
    let selector = env::args().nth(1).ok_or("Usage: candle SELECTOR")?;
    let (html, raw_bytes) = read_from_stdin().ok_or("Error: couldn't read from STDIN")?;
    Ok(Inputs { selector, html, raw_bytes })
}

fn main() {
Expand Down Expand Up @@ -66,8 +69,23 @@ fn select(document: scraper::Html, captures: regex::Captures) -> Result<Vec<Stri
}
}

/// If the document declares a charset via `<meta charset="...">`, decode the
/// original raw bytes with that encoding and re-parse the result; otherwise
/// return the already-parsed document unchanged.
///
/// This is deliberately simple rather than fast: parse once, look for the
/// meta tag, and (when found) decode + parse a second time.
fn detect_encoding_and_re_parse(document: scraper::Html, inputs: &Inputs) -> scraper::Html {
    // "meta[charset]" is a valid CSS selector literal, so parsing cannot fail.
    let meta_selector = Selector::parse("meta[charset]").unwrap();
    // The selector itself guarantees the `charset` attribute exists on any
    // match, so `attr` returning `None` here is impossible in practice;
    // `and_then` keeps us panic-free regardless.
    let label = document
        .select(&meta_selector)
        .next()
        .and_then(|meta| meta.value().attr("charset"));
    match label.and_then(|charset| Encoding::for_label(charset.as_bytes())) {
        // Recognized encoding label: re-decode the raw bytes and re-parse.
        // `decode` returns (Cow<str>, encoding, had_errors); we only need
        // the text, and deref coercion turns `&Cow<str>` into `&str`.
        Some(encoding) => Html::parse_document(&encoding.decode(&inputs.raw_bytes).0),
        // No meta tag, or an unrecognized label: keep the original parse.
        None => document,
    }
}

fn parse(inputs: Inputs) -> Result<Vec<String>, String> {
let document = Html::parse_document(&inputs.html);
let document = detect_encoding_and_re_parse(Html::parse_document(&inputs.html), &inputs);
let re = Regex::new(r"(?P<selector>.+) (?:(?P<text>\{text\})|(attr\{(?P<attr>[^}]+)\}))$").unwrap();
match re.captures(&inputs.selector) {
Some(captures) => select(document, captures),
Expand Down

0 comments on commit 6841ca3

Please sign in to comment.