From e9dca7e80d9992e41d7e49c44734e688c5534f74 Mon Sep 17 00:00:00 2001
From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com>
Date: Sun, 26 Nov 2023 08:01:59 -0500
Subject: [PATCH 1/2] `validate`: faster RFC4180 validation using ByteRecords
 and SIMDutf8

---
 src/cmd/validate.rs | 108 +++++++++++++++++++++++---------------------
 1 file changed, 57 insertions(+), 51 deletions(-)

diff --git a/src/cmd/validate.rs b/src/cmd/validate.rs
index 9cdbdd1fe..8f1b84838 100644
--- a/src/cmd/validate.rs
+++ b/src/cmd/validate.rs
@@ -258,40 +258,27 @@ pub fn run(argv: &[&str]) -> CliResult<()> {
         }
     }
 
-    // now, let's validate the rest of the records
-    let mut record = csv::StringRecord::new();
-    let mut result = rdr.read_record(&mut record);
+    // now, let's validate the rest of the records the fastest way possible
+    // We do that by using csv::ByteRecord, which does not validate utf8,
+    // making for higher throughput and lower memory usage compared to csv::StringRecord,
+    // which validates each field SEPARATELY as a utf8 string.
+    // Combined with simdutf8::basic::from_utf8() to validate the entire record as utf8 in one
+    // go as a slice of bytes, this approach is much faster than csv::StringRecord's
+    // per-field validation.
+    let mut record = csv::ByteRecord::new();
+    let mut result;
     let mut record_idx: u64 = 0;
     let flag_json = args.flag_json;
     let flag_pretty_json = args.flag_pretty_json;
 
     'rfc4180_check: loop {
+        result = rdr.read_byte_record(&mut record);
         if let Err(e) = result {
+            // read_byte_record() does not validate utf8, so we know this is not a utf8 error
            if flag_json || flag_pretty_json {
                 // we're returning a JSON error, so we have more machine-friendly details
                 // using the JSON API error format
-                if let csv::ErrorKind::Utf8 { pos, err } = e.kind() {
-                    // it's a UTF-8 error, so we report utf8 error metadata
-                    let validation_error = json!({
-                        "errors": [{
-                            "title" : "UTF-8 validation error",
-                            "detail" : format!("{e}"),
-                            "meta": {
-                                "last_valid_record": format!("{record_idx}"),
-                                "record_position": format!("{pos:?}"),
-                                "record_error": format!("{err}"),
-                            }
-                        }]
-                    });
-                    let json_error = if flag_pretty_json {
-                        serde_json::to_string_pretty(&validation_error).unwrap()
-                    } else {
-                        validation_error.to_string()
-                    };
-                    return fail_encoding_clierror!("{json_error}");
-                }
-                // it's not a UTF-8 error, so we report generic validation error
                 let validation_error = json!({
                     "errors": [{
                         "title" : "Validation error",
@@ -313,40 +300,59 @@ pub fn run(argv: &[&str]) -> CliResult<()> {
 
             // we're not returning a JSON error, so we can use
             // a user-friendly error message with suggestions
-            match e.kind() {
-                csv::ErrorKind::UnequalLengths {
-                    expected_len: _,
-                    len: _,
-                    pos: _,
-                } => {
-                    return fail_clierror!(
-                        "Validation error: {e}.\nUse `qsv fixlengths` to fix record length \
-                         issues."
-                    );
-                },
-                csv::ErrorKind::Utf8 { pos, err } => {
-                    return fail_encoding_clierror!(
-                        "non-utf8 sequence at record {record_idx} position \
-                         {pos:?}.\n{err}\nUse `qsv input` to fix formatting and to handle \
-                         non-utf8 sequences.\nYou may also want to transcode your data to \
-                         UTF-8 first using `iconv` or `recode`."
-                    );
-                },
-                _ => {
-                    return fail_clierror!(
-                        "Validation error: {e}.\nLast valid record: {record_idx}"
-                    );
-                },
+            if let csv::ErrorKind::UnequalLengths {
+                expected_len: _,
+                len: _,
+                pos: _,
+            } = e.kind()
+            {
+                return fail_clierror!(
+                    "Validation error: {e}.\nUse `qsv fixlengths` to fix record length issues."
+                );
+            } else {
+                return fail_clierror!(
+                    "Validation error: {e}.\nLast valid record: {record_idx}"
+                );
             }
-        } else if result.is_ok_and(|more_data| !more_data) {
+        }
+
+        // use SIMD-accelerated UTF-8 validation
+        if simdutf8::basic::from_utf8(&record.as_slice()).is_err() {
+            // there's a UTF-8 error, so we report utf8 error metadata
+            if flag_json || flag_pretty_json {
+                let validation_error = json!({
+                    "errors": [{
+                        "title" : "UTF-8 validation error",
+                        "detail" : "Cannot parse CSV record as UTF-8",
+                        "meta": {
+                            "last_valid_record": format!("{record_idx}"),
+                        }
+                    }]
+                });
+
+                let json_error = if flag_pretty_json {
+                    serde_json::to_string_pretty(&validation_error).unwrap()
+                } else {
+                    validation_error.to_string()
+                };
+                return fail_encoding_clierror!("{json_error}");
+            } else {
+                return fail_encoding_clierror!(
+                    "non-utf8 sequence at record {record_idx}.\nUse `qsv input` to fix \
+                     formatting and to handle non-utf8 sequences.\nYou may also want to \
+                     transcode your data to UTF-8 first using `iconv` or `recode`."
+                );
+            }
+        }
+
+        if result.is_ok_and(|more_data| !more_data) {
             // we've read the CSV to the end, so break out of loop
             break 'rfc4180_check;
-        } else {
-            result = rdr.read_record(&mut record);
         }
         record_idx += 1;
     }
+
     // if we're here, we know the CSV is valid
     let msg = if flag_json || flag_pretty_json {
         let rfc4180 = RFC4180Struct {
             delimiter_char: rconfig.get_delimiter() as char,

From 1fbd9a04b80b865508f06fcb36343068f0d0e2b0 Mon Sep 17 00:00:00 2001
From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com>
Date: Sun, 26 Nov 2023 08:06:22 -0500
Subject: [PATCH 2/2] `validate`: add additional CSV tests for bad CSVs with
 bad first and last records, and a valid empty CSV with just a header

---
 tests/test_validate.rs | 64 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 64 insertions(+)

diff --git a/tests/test_validate.rs b/tests/test_validate.rs
index cbf378c80..c7d439ae3 100644
--- a/tests/test_validate.rs
+++ b/tests/test_validate.rs
@@ -39,6 +39,22 @@ fn validate_good_csv_msg() {
     assert_eq!(got, expected);
 }
 
+#[test]
+fn validate_empty_csv_msg() {
+    let wrk = Workdir::new("validate_empty_csv_msg").flexible(true);
+    wrk.create(
+        "data.csv",
+        vec![svec!["title", "name", "real age (earth years)"]],
+    );
+    let mut cmd = wrk.command("validate");
+    cmd.arg("data.csv");
+
+    let got: String = wrk.stdout(&mut cmd);
+    let expected =
+        r#"Valid: 3 columns ("title", "name", "real age (earth years)") and 0 records detected."#;
+    assert_eq!(got, expected);
+}
+
 #[test]
 fn validate_good_csv_pretty_json() {
     let wrk = Workdir::new("validate_good_csv_pretty_json").flexible(true);
@@ -114,6 +130,54 @@ Use `qsv fixlengths` to fix record length issues.
     wrk.assert_err(&mut cmd);
 }
 
+#[test]
+fn validate_bad_csv_first_record() {
+    let wrk = Workdir::new("validate_bad_csv_first_record").flexible(true);
+    wrk.create(
+        "data.csv",
+        vec![
+            svec!["title", "name", "age"],
+            svec!["Professor", "Xaviers",],
+            svec!["Doctor", "Magneto", "90",],
+            svec!["First Class Student", "Iceman", "14"],
+        ],
+    );
+    let mut cmd = wrk.command("validate");
+    cmd.arg("data.csv");
+
+    let got: String = wrk.output_stderr(&mut cmd);
+    let expected = r#"Validation error: CSV error: record 1 (line: 2, byte: 15): found record with 2 fields, but the previous record has 3 fields.
+Use `qsv fixlengths` to fix record length issues.
+"#; + assert_eq!(got, expected); + + wrk.assert_err(&mut cmd); +} + +#[test] +fn validate_bad_csv_last_record() { + let wrk = Workdir::new("validate_bad_csv_last_record").flexible(true); + wrk.create( + "data.csv", + vec![ + svec!["title", "name", "age"], + svec!["Professor", "Xaviers", "60"], + svec!["Doctor", "Magneto", "90"], + svec!["First Class Student", "Iceman", "14", "extra field"], + ], + ); + let mut cmd = wrk.command("validate"); + cmd.arg("data.csv"); + + let got: String = wrk.output_stderr(&mut cmd); + let expected = r#"Validation error: CSV error: record 3 (line: 4, byte: 54): found record with 4 fields, but the previous record has 3 fields. +Use `qsv fixlengths` to fix record length issues. +"#; + assert_eq!(got, expected); + + wrk.assert_err(&mut cmd); +} + #[test] fn validate_bad_csv_prettyjson() { let wrk = Workdir::new("validate_bad_csv_prettyjson").flexible(true);