Skip to content

Commit

Permalink
validate: Add --trim option
Browse files Browse the repository at this point in the history
  • Loading branch information
jqnatividad committed Dec 5, 2023
1 parent 506145f commit d590f7d
Show file tree
Hide file tree
Showing 2 changed files with 99 additions and 10 deletions.
10 changes: 6 additions & 4 deletions src/cmd/validate.rs
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,7 @@ Validate arguments:
The file can be a local file or a URL.
Validate options:
--trim Trim leading and trailing whitespace from fields before validating.
--fail-fast Stops on first error.
--valid <suffix> Valid record output file suffix. [default: valid]
--invalid <suffix> Invalid record output file suffix. [default: invalid]
Expand Down Expand Up @@ -139,6 +140,7 @@ static TIMEOUT_SECS: AtomicU16 = AtomicU16::new(15);
#[derive(Deserialize)]
#[allow(dead_code)]
struct Args {
flag_trim: bool,
flag_fail_fast: bool,
flag_valid: Option<String>,
flag_invalid: Option<String>,
Expand Down Expand Up @@ -466,6 +468,7 @@ pub fn run(argv: &[&str]) -> CliResult<()> {
let mut validation_results = Vec::with_capacity(batch_size);
let mut valid_flags: Vec<bool> = Vec::with_capacity(batch_size);
let mut validation_error_messages: Vec<String> = Vec::with_capacity(50);
let flag_trim = args.flag_trim;

// set RAYON_NUM_THREADS
util::njobs(args.flag_jobs);
Expand All @@ -481,10 +484,9 @@ pub fn run(argv: &[&str]) -> CliResult<()> {
if has_data {
row_number += 1;
record.push_field(buffer.format(row_number).as_bytes());

// Trimming the whole record in place is non-allocating with our csv fork, and is
// much faster than the per-field std::str::trim approach, which allocates.
record.trim();
if flag_trim {
record.trim();
}
batch.push(record.clone());
} else {
// nothing else to add to batch
Expand Down
99 changes: 93 additions & 6 deletions tests/test_schema.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,10 @@ use crate::workdir::Workdir;

#[test]
#[file_serial]
fn generate_schema_with_defaults_and_validate_with_no_errors() {
fn generate_schema_with_defaults_and_validate_trim_with_no_errors() {
// create workspace and invoke schema command with value constraints flag
let wrk =
Workdir::new("fn generate_schema_with_defaults_and_validate_with_no_errors").flexible(true);
let wrk = Workdir::new("fn generate_schema_with_defaults_and_validate_trim_with_no_errors")
.flexible(true);
wrk.clear_contents().unwrap();

// copy csv file to workdir
Expand Down Expand Up @@ -44,6 +44,7 @@ fn generate_schema_with_defaults_and_validate_with_no_errors() {
// invoke validate command from schema created above
let mut cmd2 = wrk.command("validate");
cmd2.arg("adur-public-toilets.csv");
cmd2.arg("--trim");

Check failure

Code scanning / devskim

A weak or broken hash algorithm was detected. Error test

Weak/Broken Hash Algorithm
cmd2.arg("adur-public-toilets.csv.schema.json");
wrk.output(&mut cmd2);

Expand All @@ -58,10 +59,10 @@ fn generate_schema_with_defaults_and_validate_with_no_errors() {

#[test]
#[file_serial]
fn generate_schema_with_optional_flags_and_validate_with_errors() {
fn generate_schema_with_optional_flags_notrim_and_validate_with_errors() {
// create workspace and invoke schema command with value constraints flag
let wrk =
Workdir::new("generate_schema_with_optional_flags_and_validate_with_errors").flexible(true);
let wrk = Workdir::new("generate_schema_with_optional_flags_notrim_and_validate_with_errors")
.flexible(true);
wrk.clear_contents().unwrap();

// copy csv file to workdir
Expand Down Expand Up @@ -101,6 +102,92 @@ fn generate_schema_with_optional_flags_and_validate_with_errors() {
cmd2.arg("adur-public-toilets.csv.schema.json");
wrk.output(&mut cmd2);

// validation report
let validation_errors_expected = r#"row_number field error
1 OpeningHours "S = 09:00 - 21:00 W = 09:00 - 17:00 " is not one of ["09.00 - 17.00","S = 08:00 - 21:00 W = 08:00 - 17:00","S = 09:00 - 15:00 W = 09:00 - 15:00","S = 09:00 - 21:00 W = 09:00 - 17:00",null]
2 ExtractDate "07/07/2014 00:00" is not a "date"
3 ExtractDate "2014-07-07 00:00" is not a "date"
4 ExtractDate "07/07/2014 00:00" is not a "date"
5 ExtractDate "07/07/2014 00:00" is not a "date"
6 ExtractDate "07/07/2014 00:00" is not a "date"
7 ExtractDate "07/07/2014 00:00" is not a "date"
8 ExtractDate "07/07/2014 00:00" is not a "date"
9 ExtractDate "07/07/2014 00:00" is not a "date"
10 ExtractDate "07/07/2014 00:00" is not a "date"
11 ExtractDate "07/07/2014 00:00" is not a "date"
12 ExtractDate "07/07/2014 00:00" is not a "date"
13 ExtractDate "07/07/2014 00:00" is not a "date"
14 ExtractDate "07/07/2014 00:00" is not a "date"
15 ExtractDate "07/07/2014 00:00" is not a "date"
"#;

// expecting invalid rows, so confirm there ARE output files generated
let validation_error_path = &wrk.path("adur-public-toilets.csv.validation-errors.tsv");
println!("expecting validation error file at: {validation_error_path:?}");

assert!(Path::new(validation_error_path).exists());
assert!(Path::new(&wrk.path("adur-public-toilets.csv.valid")).exists());
assert!(Path::new(&wrk.path("adur-public-toilets.csv.invalid")).exists());

// check validation error output
let validation_error_output: String =
wrk.from_str(&wrk.path("adur-public-toilets.csv.validation-errors.tsv"));

assert!(!validation_error_output.is_empty());

assert_eq!(
validation_errors_expected.to_string(),
validation_error_output
);
wrk.assert_err(&mut cmd2);

Check failure

Code scanning / devskim

A weak or broken hash algorithm was detected. Error test

Weak/Broken Hash Algorithm
}

#[test]
#[file_serial]
fn generate_schema_with_optional_flags_trim_and_validate_with_errors() {
// create workspace and invoke schema command with value constraints flag
let wrk = Workdir::new("generate_schema_with_optional_flags_trim_and_validate_with_errors")
.flexible(true);
wrk.clear_contents().unwrap();

// copy csv file to workdir
let csv = wrk.load_test_resource("adur-public-toilets.csv");
wrk.create_from_string("adur-public-toilets.csv", &csv);

// run schema command with value constraints option
let mut cmd = wrk.command("schema");
cmd.arg("adur-public-toilets.csv");
cmd.arg("--enum-threshold");
cmd.arg("13");
cmd.arg("--pattern-columns");
cmd.arg("ReportEmail,OpeningHours");
cmd.arg("--strict-dates");
wrk.output(&mut cmd);

// load output schema file
let output_schema_string: String =
wrk.from_str(&wrk.path("adur-public-toilets.csv.schema.json"));
let output_schema_json =
serde_json::from_str(&output_schema_string).expect("parse schema json");

// make sure it's a valid JSON Schema by compiling with jsonschema library
jsonschema::JSONSchema::options()
.compile(&output_schema_json)
.expect("valid JSON Schema");

// diff output json with expected json
let expected_schema: String =
wrk.load_test_resource("adur-public-toilets.csv.schema-strict.expected.json");
let expected_schema_json: Value = serde_json::from_str(&expected_schema).unwrap();
assert_json_eq!(expected_schema_json, output_schema_json);

// invoke validate command from schema created above
let mut cmd2 = wrk.command("validate");

Check failure

Code scanning / devskim

A weak or broken hash algorithm was detected. Error test

Weak/Broken Hash Algorithm
cmd2.arg("adur-public-toilets.csv");

Check failure

Code scanning / devskim

A weak or broken hash algorithm was detected. Error test

Weak/Broken Hash Algorithm
cmd2.arg("--trim");

Check failure

Code scanning / devskim

A weak or broken hash algorithm was detected. Error test

Weak/Broken Hash Algorithm
cmd2.arg("adur-public-toilets.csv.schema.json");

Check failure

Code scanning / devskim

A weak or broken hash algorithm was detected. Error test

Weak/Broken Hash Algorithm
wrk.output(&mut cmd2);

// validation report
let validation_errors_expected = r#"row_number field error
2 ExtractDate "07/07/2014 00:00" is not a "date"
Expand Down

0 comments on commit d590f7d

Please sign in to comment.