From facf4f92a4cba9055395ea4c32d0afeaf3fd4f82 Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Mon, 11 Dec 2023 08:53:35 -0500 Subject: [PATCH 1/2] `apply` & `applydp`: setting regex_replace --replacement to removes matches --- src/cmd/apply.rs | 38 +++++++++++++++++++++++++++----------- src/cmd/applydp.rs | 34 +++++++++++++++++++++++++--------- 2 files changed, 52 insertions(+), 20 deletions(-) diff --git a/src/cmd/apply.rs b/src/cmd/apply.rs index 19fd6ba8d..2ab8f3c47 100644 --- a/src/cmd/apply.rs +++ b/src/cmd/apply.rs @@ -50,6 +50,7 @@ It has 36 supported operations: * replace: Replace all matches of a pattern (using --comparand) with a string (using --replacement) (Rust replace) * regex_replace: Replace all regex matches in --comparand w/ --replacement. + Specify as --replacement to remove matches. * titlecase - capitalizes English text using Daring Fireball titlecase style https://daringfireball.net/2008/05/title_case * censor: profanity filter. Add additional comma-delimited profanities with --comparand. @@ -387,7 +388,7 @@ use crate::{ CliResult, }; -#[derive(Clone, EnumString)] +#[derive(Clone, EnumString, PartialEq)] #[strum(use_phf)] #[strum(ascii_case_insensitive)] #[allow(non_camel_case_types)] @@ -477,6 +478,7 @@ static INDIANCOMMA_POLICY: SeparatorPolicy = SeparatorPolicy { }; // valid subcommands +#[derive(PartialEq)] enum ApplySubCmd { Operations, DateFmt, @@ -582,6 +584,20 @@ pub fn run(argv: &[&str]) -> CliResult<()> { wtr.write_record(&headers)?; } + // if there is a regex_replace operation and replacement is case-insensitive, + // we set it to empty string + let flag_replacement = if apply_cmd == ApplySubCmd::Operations + && ops_vec.contains(&Operations::Regex_Replace) + && args.flag_replacement.to_lowercase() == "" + { + String::new() + } else { + args.flag_replacement + }; + let flag_comparand = args.flag_comparand; + let flag_formatstr = args.flag_formatstr; + let flag_new_column = args.flag_new_column; + // prep progress bar let show_progress = (args.flag_progressbar || util::get_envvar_flag("QSV_PROGRESSBAR")) && !rconfig.is_stdin(); @@ -645,11 +661,11 @@ pub fn run(argv: &[&str]) -> CliResult<()> { apply_operations( &ops_vec, &mut cell, - &args.flag_comparand, - &args.flag_replacement, - &args.flag_formatstr, + &flag_comparand, + &flag_replacement, + &flag_formatstr, ); - if args.flag_new_column.is_some() { + if flag_new_column.is_some() { record.push_field(&cell); } else { record = replace_column_value(&record, *col_index, &cell); @@ -661,9 +677,9 @@ pub fn run(argv: &[&str]) -> CliResult<()> { for col_index in &*sel { record[*col_index].clone_into(&mut cell); if cell.trim().is_empty() { - cell = args.flag_replacement.clone(); + cell = flag_replacement.clone(); } - if args.flag_new_column.is_some() { + if flag_new_column.is_some() { record.push_field(&cell); } else { record = replace_column_value(&record, *col_index, &cell); @@ -678,7 +694,7 @@ pub fn run(argv: &[&str]) -> CliResult<()> { let parsed_date = parse_with_preference(&cell, prefer_dmy); if let Ok(format_date) = parsed_date { let formatted_date = - format_date.format(&args.flag_formatstr).to_string(); + format_date.format(&flag_formatstr).to_string(); if !args.flag_keep_zero_time && formatted_date.ends_with("T00:00:00+00:00") { @@ -688,7 +704,7 @@ pub fn run(argv: &[&str]) -> CliResult<()> { } } } - if args.flag_new_column.is_some() { + if flag_new_column.is_some() { record.push_field(&cell); } else { record = replace_column_value(&record, *col_index, &cell); @@ -708,7 +724,7 @@ pub fn run(argv: &[&str]) -> CliResult<()> { cell = formatted.to_string(); } } - if args.flag_new_column.is_some() { + if flag_new_column.is_some() { record.push_field(&cell); } else { record = replace_column_value(&record, column_index, &cell); @@ -750,7 +766,7 @@ pub fn run(argv: &[&str]) -> CliResult<()> { } }; - if args.flag_new_column.is_some() { + if flag_new_column.is_some() { record.push_field(&result); } else { record = replace_column_value(&record, column_index, &result); diff --git a/src/cmd/applydp.rs b/src/cmd/applydp.rs index 50dd4db45..9259f731b 100644 --- a/src/cmd/applydp.rs +++ b/src/cmd/applydp.rs @@ -43,6 +43,7 @@ It has 18 supported operations: * replace: Replace all matches of a pattern (using --comparand) with a string (using --replacement) (Rust replace) * regex_replace: Replace all regex matches in --comparand w/ --replacement. + Specify as --replacement to remove matches. * round: Round numeric values to the specified number of decimal places using Midpoint Nearest Even Rounding Strategy AKA "Bankers Rounding." Specify the number of decimal places with --formatstr (default: 3). @@ -264,7 +265,7 @@ use crate::{ CliResult, }; -#[derive(Clone, EnumString)] +#[derive(Clone, EnumString, PartialEq)] #[strum(use_phf)] #[strum(ascii_case_insensitive)] #[allow(non_camel_case_types)] @@ -382,6 +383,7 @@ pub fn run(argv: &[&str]) -> CliResult<()> { String::new() }; + #[derive(PartialEq)] enum ApplydpSubCmd { Operations, DateFmt, @@ -420,6 +422,20 @@ pub fn run(argv: &[&str]) -> CliResult<()> { wtr.write_record(&headers)?; } + // if there is a regex_replace operation and replacement is case-insensitive, + // we set it to empty string + let flag_replacement = if applydp_cmd == ApplydpSubCmd::Operations + && ops_vec.contains(&Operations::Regex_Replace) + && args.flag_replacement.to_lowercase() == "" + { + String::new() + } else { + args.flag_replacement + }; + let flag_comparand = args.flag_comparand; + let flag_formatstr = args.flag_formatstr; + let flag_new_column = args.flag_new_column; + let prefer_dmy = args.flag_prefer_dmy || rconfig.get_dmy_preference(); // amortize memory allocation by reusing record @@ -472,10 +488,10 @@ pub fn run(argv: &[&str]) -> CliResult<()> { applydp_operations( &ops_vec, &mut cell, - &args.flag_comparand, - &args.flag_replacement, + &flag_comparand, + &flag_replacement, ); - if args.flag_new_column.is_some() { + if flag_new_column.is_some() { record.push_field(&cell); } else { record = replace_column_value(&record, *col_index, &cell); @@ -487,9 +503,9 @@ pub fn run(argv: &[&str]) -> CliResult<()> { for col_index in sel.iter() { record[*col_index].clone_into(&mut cell); if cell.trim().is_empty() { - cell = args.flag_replacement.clone(); + cell = flag_replacement.clone(); } - if args.flag_new_column.is_some() { + if flag_new_column.is_some() { record.push_field(&cell); } else { record = replace_column_value(&record, *col_index, &cell); @@ -504,7 +520,7 @@ pub fn run(argv: &[&str]) -> CliResult<()> { let parsed_date = parse_with_preference(&cell, prefer_dmy); if let Ok(format_date) = parsed_date { let formatted_date = - format_date.format(&args.flag_formatstr).to_string(); + format_date.format(&flag_formatstr).to_string(); if !args.flag_keep_zero_time && formatted_date.ends_with("T00:00:00+00:00") { @@ -514,7 +530,7 @@ pub fn run(argv: &[&str]) -> CliResult<()> { } } } - if args.flag_new_column.is_some() { + if flag_new_column.is_some() { record.push_field(&cell); } else { record = replace_column_value(&record, *col_index, &cell); @@ -534,7 +550,7 @@ pub fn run(argv: &[&str]) -> CliResult<()> { cell = formatted.to_string(); } } - if args.flag_new_column.is_some() { + if flag_new_column.is_some() { record.push_field(&cell); } else { record = replace_column_value(&record, column_index, &cell); From c67c23d90887cb8062ad4035bb3a061ce209984e Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Mon, 11 Dec 2023 08:55:57 -0500 Subject: [PATCH 2/2] `tests`: add tests for apply regex_replace with --replacement set to --- tests/test_apply.rs | 32 ++++++++++++++++++++++++++++++++ tests/test_applydp.rs | 32 ++++++++++++++++++++++++++++++++ 2 files changed, 64 insertions(+) diff --git a/tests/test_apply.rs b/tests/test_apply.rs index 486e45079..ed8b261aa 100644 --- a/tests/test_apply.rs +++ b/tests/test_apply.rs @@ -440,6 +440,38 @@ fn apply_dynfmt_issue1458() { assert_eq!(got, expected); } +#[test] +fn apply_regex_replace_issue1469() { + let wrk = Workdir::new("apply_regex_replace_issue1469"); + wrk.create( + "data.csv", + vec![ + svec!["col1", "col2", "col3",], + svec!["(Adam)", "B", "Case(hello)Name "], + svec!["Derek(foo)", "(bar)E", "Fos(this needs to go)ter"], + svec!["Gordon", "H", "(cmon)Irvin"], + svec!["Jack(ie)", "K", "Lynch(-Chan)"], + ], + ); + let mut cmd = wrk.command("apply"); + cmd.arg("operations") + .arg("regex_replace") + .arg("col1,col2,col3") + .args(["--comparand", r"\([^)]+\)"]) + .args(["--replacement", ""]) + .arg("data.csv"); + + let got: Vec> = wrk.read_stdout(&mut cmd); + let expected = vec![ + svec!["col1", "col2", "col3"], + svec!["", "B", "CaseName "], + svec!["Derek", "E", "Foster"], + svec!["Gordon", "H", "Irvin"], + svec!["Jack", "K", "Lynch"], + ]; + assert_eq!(got, expected); +} + #[test] fn apply_calcconv() { let wrk = Workdir::new("apply"); diff --git a/tests/test_applydp.rs b/tests/test_applydp.rs index 31c886e79..cf45ed902 100644 --- a/tests/test_applydp.rs +++ b/tests/test_applydp.rs @@ -366,6 +366,38 @@ fn applydp_ops_regex_replace() { assert_eq!(got, expected); } +#[test] +fn applydp_regex_replace_issue1469() { + let wrk = Workdir::new("applydp_regex_replace_issue1469"); + wrk.create( + "data.csv", + vec![ + svec!["col1", "col2", "col3",], + svec!["(Adam)", "B", "Case(hello)Name "], + svec!["Derek(foo)", "(bar)E", "Fos(this needs to go)ter"], + svec!["Gordon", "H", "(cmon)Irvin"], + svec!["Jack(ie)", "K", "Lynch(-Chan)"], + ], + ); + let mut cmd = wrk.command("applydp"); + cmd.arg("operations") + .arg("regex_replace") + .arg("col1,col2,col3") + .args(["--comparand", r"\([^)]+\)"]) + .args(["--replacement", ""]) + .arg("data.csv"); + + let got: Vec> = wrk.read_stdout(&mut cmd); + let expected = vec![ + svec!["col1", "col2", "col3"], + svec!["", "B", "CaseName "], + svec!["Derek", "E", "Foster"], + svec!["Gordon", "H", "Irvin"], + svec!["Jack", "K", "Lynch"], + ]; + assert_eq!(got, expected); +} + #[test] fn applydp_ops_regex_replace_validation_error() { let wrk = Workdir::new("applydp");