validate: Faster RFC4180 validation with byterecords and SIMD-accelerated utf8 validation #1440

Merged: 2 commits, Nov 26, 2023
108 changes: 57 additions & 51 deletions src/cmd/validate.rs
@@ -258,40 +258,27 @@
}
}

// now, let's validate the rest of the records
let mut record = csv::StringRecord::new();
let mut result = rdr.read_record(&mut record);
// now, let's validate the rest of the records the fastest way possible.
// We do that by using csv::ByteRecord, which does not validate utf8,
// making for higher throughput and lower memory usage compared to csv::StringRecord,
// which validates each field SEPARATELY as a utf8 string.
// Combined with simdutf8::basic::from_utf8() to validate the entire record as utf8 in one
// go as a slice of bytes, this approach is much faster than csv::StringRecord's
// per-field validation.
let mut record = csv::ByteRecord::new();
let mut result;
let mut record_idx: u64 = 0;
let flag_json = args.flag_json;
let flag_pretty_json = args.flag_pretty_json;

'rfc4180_check: loop {
result = rdr.read_byte_record(&mut record);
if let Err(e) = result {
// read_byte_record() does not validate utf8, so we know this is not a utf8 error
if flag_json || flag_pretty_json {
// we're returning a JSON error, so we have more machine-friendly details
// using the JSON API error format
if let csv::ErrorKind::Utf8 { pos, err } = e.kind() {
// it's a UTF-8 error, so we report utf8 error metadata
let validation_error = json!({
"errors": [{
"title" : "UTF-8 validation error",
"detail" : format!("{e}"),
"meta": {
"last_valid_record": format!("{record_idx}"),
"record_position": format!("{pos:?}"),
"record_error": format!("{err}"),
}
}]
});

let json_error = if flag_pretty_json {
serde_json::to_string_pretty(&validation_error).unwrap()
} else {
validation_error.to_string()
};
return fail_encoding_clierror!("{json_error}");
}
// it's not a UTF-8 error, so we report generic validation error
let validation_error = json!({
"errors": [{
"title" : "Validation error",
@@ -313,40 +300,59 @@

// we're not returning a JSON error, so we can use
// a user-friendly error message with suggestions
match e.kind() {
csv::ErrorKind::UnequalLengths {
expected_len: _,
len: _,
pos: _,
} => {
return fail_clierror!(
"Validation error: {e}.\nUse `qsv fixlengths` to fix record length \
issues."
);
},
csv::ErrorKind::Utf8 { pos, err } => {
return fail_encoding_clierror!(
"non-utf8 sequence at record {record_idx} position \
{pos:?}.\n{err}\nUse `qsv input` to fix formatting and to handle \
non-utf8 sequences.\nYou may also want to transcode your data to \
UTF-8 first using `iconv` or `recode`."
);
},
_ => {
return fail_clierror!(
"Validation error: {e}.\nLast valid record: {record_idx}"
);
},
if let csv::ErrorKind::UnequalLengths {
expected_len: _,
len: _,
pos: _,
} = e.kind()
{
return fail_clierror!(
"Validation error: {e}.\nUse `qsv fixlengths` to fix record length issues."
);
} else {
return fail_clierror!(
"Validation error: {e}.\nLast valid record: {record_idx}"
);
}
} else if result.is_ok_and(|more_data| !more_data) {
}

// use SIMD accelerated UTF-8 validation
if simdutf8::basic::from_utf8(&record.as_slice()).is_err() {

Code scanning / clippy warning: this expression creates a reference which is immediately dereferenced by the compiler.
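// NOTE: a possible fix for the clippy lint above (a suggestion, not part of this diff):
// ByteRecord::as_slice() already returns &[u8], so the extra borrow can be dropped:
//     if simdutf8::basic::from_utf8(record.as_slice()).is_err() {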
// there's a UTF-8 error, so we report utf8 error metadata
if flag_json || flag_pretty_json {
let validation_error = json!({
"errors": [{
"title" : "UTF-8 validation error",
"detail" : "Cannot parse CSV record as UTF-8",
"meta": {
"last_valid_record": format!("{record_idx}"),
}
}]
});

let json_error = if flag_pretty_json {
serde_json::to_string_pretty(&validation_error).unwrap()
} else {
validation_error.to_string()
};
return fail_encoding_clierror!("{json_error}");
} else {
return fail_encoding_clierror!(
"non-utf8 sequence at record {record_idx}.\nUse `qsv input` to fix \
formatting and to handle non-utf8 sequences.\nYou may also want to \
transcode your data to UTF-8 first using `iconv` or `recode`."
);
}
}

if result.is_ok_and(|more_data| !more_data) {
// we've read the CSV to the end, so break out of loop
break 'rfc4180_check;
} else {
result = rdr.read_record(&mut record);
}
record_idx += 1;
}

// if we're here, we know the CSV is valid
let msg = if flag_json || flag_pretty_json {
let rfc4180 = RFC4180Struct {
delimiter_char: rconfig.get_delimiter() as char,
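For context, here is a minimal, self-contained sketch of the approach this diff takes: read each record as a csv::ByteRecord (which performs the RFC 4180 structural checks but no per-field UTF-8 validation), then validate the whole record as one byte slice with simdutf8::basic::from_utf8(). The file path, error strings, and overall structure below are illustrative only, not the PR's actual code; it only assumes the csv and simdutf8 crates.

// Minimal sketch: validate a CSV by reading ByteRecords (structural checks)
// and checking each whole record for valid UTF-8 in one SIMD-accelerated pass.
// "data.csv" is a placeholder path.
use std::error::Error;

fn validate(path: &str) -> Result<u64, Box<dyn Error>> {
    let mut rdr = csv::Reader::from_path(path)?;
    let mut record = csv::ByteRecord::new();
    let mut record_idx: u64 = 0;

    // read_byte_record() surfaces structural errors (e.g. unequal field counts)
    // but does not validate UTF-8, so each record is checked as a single byte slice.
    while rdr.read_byte_record(&mut record)? {
        if simdutf8::basic::from_utf8(record.as_slice()).is_err() {
            return Err(format!("non-utf8 sequence at record {record_idx}").into());
        }
        record_idx += 1;
    }
    Ok(record_idx)
}

fn main() -> Result<(), Box<dyn Error>> {
    println!("Valid: {} records", validate("data.csv")?);
    Ok(())
}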
64 changes: 64 additions & 0 deletions tests/test_validate.rs
@@ -39,6 +39,22 @@ fn validate_good_csv_msg() {
assert_eq!(got, expected);
}

#[test]
fn validate_empty_csv_msg() {
let wrk = Workdir::new("validate_empty_csv_msg").flexible(true);
wrk.create(
"data.csv",
vec![svec!["title", "name", "real age (earth years)"]],
);
let mut cmd = wrk.command("validate");
cmd.arg("data.csv");

let got: String = wrk.stdout(&mut cmd);
let expected =
r#"Valid: 3 columns ("title", "name", "real age (earth years)") and 0 records detected."#;
assert_eq!(got, expected);
}

#[test]
fn validate_good_csv_pretty_json() {
let wrk = Workdir::new("validate_good_csv_pretty_json").flexible(true);
@@ -114,6 +130,54 @@ Use `qsv fixlengths` to fix record length issues.
wrk.assert_err(&mut cmd);
}

#[test]
fn validate_bad_csv_first_record() {
let wrk = Workdir::new("validate_bad_csv_first_record").flexible(true);
wrk.create(
"data.csv",
vec![
svec!["title", "name", "age"],
svec!["Professor", "Xaviers",],
svec!["Doctor", "Magneto", "90",],
svec!["First Class Student", "Iceman", "14"],
],
);
let mut cmd = wrk.command("validate");
cmd.arg("data.csv");

let got: String = wrk.output_stderr(&mut cmd);
let expected = r#"Validation error: CSV error: record 1 (line: 2, byte: 15): found record with 2 fields, but the previous record has 3 fields.
Use `qsv fixlengths` to fix record length issues.
"#;
assert_eq!(got, expected);

wrk.assert_err(&mut cmd);
}

#[test]
fn validate_bad_csv_last_record() {
let wrk = Workdir::new("validate_bad_csv_last_record").flexible(true);
wrk.create(
"data.csv",
vec![
svec!["title", "name", "age"],
svec!["Professor", "Xaviers", "60"],
svec!["Doctor", "Magneto", "90"],
svec!["First Class Student", "Iceman", "14", "extra field"],
],
);
let mut cmd = wrk.command("validate");
cmd.arg("data.csv");

let got: String = wrk.output_stderr(&mut cmd);
let expected = r#"Validation error: CSV error: record 3 (line: 4, byte: 54): found record with 4 fields, but the previous record has 3 fields.
Use `qsv fixlengths` to fix record length issues.
"#;
assert_eq!(got, expected);

wrk.assert_err(&mut cmd);
}

#[test]
fn validate_bad_csv_prettyjson() {
let wrk = Workdir::new("validate_bad_csv_prettyjson").flexible(true);