From e9dca7e80d9992e41d7e49c44734e688c5534f74 Mon Sep 17 00:00:00 2001
From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com>
Date: Sun, 26 Nov 2023 08:01:59 -0500
Subject: [PATCH 1/2] `validate`: faster RFC4180 validation using ByteRecords
 and SIMDutf8

---
 src/cmd/validate.rs | 108 +++++++++++++++++++++++---------------------
 1 file changed, 57 insertions(+), 51 deletions(-)

diff --git a/src/cmd/validate.rs b/src/cmd/validate.rs
index 9cdbdd1fe..8f1b84838 100644
--- a/src/cmd/validate.rs
+++ b/src/cmd/validate.rs
@@ -258,40 +258,27 @@ pub fn run(argv: &[&str]) -> CliResult<()> {
         }
     }
 
-    // now, let's validate the rest of the records
-    let mut record = csv::StringRecord::new();
-    let mut result = rdr.read_record(&mut record);
+    // now, let's validate the rest of the records the fastest way possible
+    // We do that by using csv::ByteRecord, which does not validate utf8,
+    // making for higher throughput and lower memory usage compared to csv::StringRecord,
+    // which validates each field SEPARATELY as a utf8 string.
+    // Combined with simdutf8::basic::from_utf8() to validate the entire record as utf8 in one
+    // go as a slice of bytes, this approach is much faster than csv::StringRecord's
+    // per-field validation.
+    let mut record = csv::ByteRecord::new();
+    let mut result;
     let mut record_idx: u64 = 0;
     let flag_json = args.flag_json;
     let flag_pretty_json = args.flag_pretty_json;
 
     'rfc4180_check: loop {
+        result = rdr.read_byte_record(&mut record);
         if let Err(e) = result {
+            // read_byte_record() does not validate utf8, so we know this is not a utf8 error
            if flag_json || flag_pretty_json {
                 // we're returning a JSON error, so we have more machine-friendly details
                 // using the JSON API error format
-                if let csv::ErrorKind::Utf8 { pos, err } = e.kind() {
-                    // it's a UTF-8 error, so we report utf8 error metadata
-                    let validation_error = json!({
-                        "errors": [{
-                            "title" : "UTF-8 validation error",
-                            "detail" : format!("{e}"),
-                            "meta": {
-                                "last_valid_record": format!("{record_idx}"),
-                                "record_position": format!("{pos:?}"),
-                                "record_error": format!("{err}"),
-                            }
-                        }]
-                    });
-                    let json_error = if flag_pretty_json {
-                        serde_json::to_string_pretty(&validation_error).unwrap()
-                    } else {
-                        validation_error.to_string()
-                    };
-                    return fail_encoding_clierror!("{json_error}");
-                }
-                // it's not a UTF-8 error, so we report generic validation error
                 let validation_error = json!({
                     "errors": [{
                         "title" : "Validation error",
@@ -313,40 +300,59 @@ pub fn run(argv: &[&str]) -> CliResult<()> {
 
             // we're not returning a JSON error, so we can use
             // a user-friendly error message with suggestions
-            match e.kind() {
-                csv::ErrorKind::UnequalLengths {
-                    expected_len: _,
-                    len: _,
-                    pos: _,
-                } => {
-                    return fail_clierror!(
-                        "Validation error: {e}.\nUse `qsv fixlengths` to fix record length \
-                         issues."
-                    );
-                },
-                csv::ErrorKind::Utf8 { pos, err } => {
-                    return fail_encoding_clierror!(
-                        "non-utf8 sequence at record {record_idx} position \
-                         {pos:?}.\n{err}\nUse `qsv input` to fix formatting and to handle \
-                         non-utf8 sequences.\nYou may also want to transcode your data to \
-                         UTF-8 first using `iconv` or `recode`."
-                    );
-                },
-                _ => {
-                    return fail_clierror!(
-                        "Validation error: {e}.\nLast valid record: {record_idx}"
-                    );
-                },
+            if let csv::ErrorKind::UnequalLengths {
+                expected_len: _,
+                len: _,
+                pos: _,
+            } = e.kind()
+            {
+                return fail_clierror!(
+                    "Validation error: {e}.\nUse `qsv fixlengths` to fix record length issues."
+                );
+            } else {
+                return fail_clierror!(
+                    "Validation error: {e}.\nLast valid record: {record_idx}"
+                );
             }
-        } else if result.is_ok_and(|more_data| !more_data) {
+        }
+
+        // use SIMD-accelerated UTF-8 validation
+        if simdutf8::basic::from_utf8(&record.as_slice()).is_err() {
+            // there's a UTF-8 error, so we report utf8 error metadata
+            if flag_json || flag_pretty_json {
+                let validation_error = json!({
+                    "errors": [{
+                        "title" : "UTF-8 validation error",
+                        "detail" : "Cannot parse CSV record as UTF-8",
+                        "meta": {
+                            "last_valid_record": format!("{record_idx}"),
+                        }
+                    }]
+                });
+
+                let json_error = if flag_pretty_json {
+                    serde_json::to_string_pretty(&validation_error).unwrap()
+                } else {
+                    validation_error.to_string()
+                };
+                return fail_encoding_clierror!("{json_error}");
+            } else {
+                return fail_encoding_clierror!(
+                    "non-utf8 sequence at record {record_idx}.\nUse `qsv input` to fix \
+                     formatting and to handle non-utf8 sequences.\nYou may also want to \
+                     transcode your data to UTF-8 first using `iconv` or `recode`."
+                );
+            }
+        }
+
+        if result.is_ok_and(|more_data| !more_data) {
             // we've read the CSV to the end, so break out of loop
             break 'rfc4180_check;
-        } else {
-            result = rdr.read_record(&mut record);
         }
         record_idx += 1;
     }
+
     // if we're here, we know the CSV is valid
     let msg = if flag_json || flag_pretty_json {
         let rfc4180 = RFC4180Struct {
             delimiter_char: rconfig.get_delimiter() as char,

From 1fbd9a04b80b865508f06fcb36343068f0d0e2b0 Mon Sep 17 00:00:00 2001
From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com>
Date: Sun, 26 Nov 2023 08:06:22 -0500
Subject: [PATCH 2/2] `validate`: add additional CSV tests for bad CSVs with
 bad first and last records, and a valid empty CSV with just a header

---
 tests/test_validate.rs | 64 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 64 insertions(+)

diff --git a/tests/test_validate.rs b/tests/test_validate.rs
index cbf378c80..c7d439ae3 100644
--- a/tests/test_validate.rs
+++ b/tests/test_validate.rs
@@ -39,6 +39,22 @@ fn validate_good_csv_msg() {
     assert_eq!(got, expected);
 }
 
+#[test]
+fn validate_empty_csv_msg() {
+    let wrk = Workdir::new("validate_empty_csv_msg").flexible(true);
+    wrk.create(
+        "data.csv",
+        vec![svec!["title", "name", "real age (earth years)"]],
+    );
+    let mut cmd = wrk.command("validate");
+    cmd.arg("data.csv");
+
+    let got: String = wrk.stdout(&mut cmd);
+    let expected =
+        r#"Valid: 3 columns ("title", "name", "real age (earth years)") and 0 records detected."#;
+    assert_eq!(got, expected);
+}
+
 #[test]
 fn validate_good_csv_pretty_json() {
     let wrk = Workdir::new("validate_good_csv_pretty_json").flexible(true);
@@ -114,6 +130,54 @@ Use `qsv fixlengths` to fix record length issues.
     wrk.assert_err(&mut cmd);
 }
 
+#[test]
+fn validate_bad_csv_first_record() {
+    let wrk = Workdir::new("validate_bad_csv_first_record").flexible(true);
+    wrk.create(
+        "data.csv",
+        vec![
+            svec!["title", "name", "age"],
+            svec!["Professor", "Xaviers",],
+            svec!["Doctor", "Magneto", "90",],
+            svec!["First Class Student", "Iceman", "14"],
+        ],
+    );
+    let mut cmd = wrk.command("validate");
+    cmd.arg("data.csv");
+
+    let got: String = wrk.output_stderr(&mut cmd);
+    let expected = r#"Validation error: CSV error: record 1 (line: 2, byte: 15): found record with 2 fields, but the previous record has 3 fields.
+Use `qsv fixlengths` to fix record length issues.
+"#; + assert_eq!(got, expected); + + wrk.assert_err(&mut cmd); +} + +#[test] +fn validate_bad_csv_last_record() { + let wrk = Workdir::new("validate_bad_csv_last_record").flexible(true); + wrk.create( + "data.csv", + vec![ + svec!["title", "name", "age"], + svec!["Professor", "Xaviers", "60"], + svec!["Doctor", "Magneto", "90"], + svec!["First Class Student", "Iceman", "14", "extra field"], + ], + ); + let mut cmd = wrk.command("validate"); + cmd.arg("data.csv"); + + let got: String = wrk.output_stderr(&mut cmd); + let expected = r#"Validation error: CSV error: record 3 (line: 4, byte: 54): found record with 4 fields, but the previous record has 3 fields. +Use `qsv fixlengths` to fix record length issues. +"#; + assert_eq!(got, expected); + + wrk.assert_err(&mut cmd); +} + #[test] fn validate_bad_csv_prettyjson() { let wrk = Workdir::new("validate_bad_csv_prettyjson").flexible(true);