Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

apply & applydp: operations regex_replace now supports empty --replacement with the "<EMPTY>" special value #1470

Merged
merged 2 commits into from
Dec 11, 2023
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 27 additions & 11 deletions src/cmd/apply.rs
Original file line number Diff line number Diff line change
@@ -50,6 +50,7 @@ It has 36 supported operations:
* replace: Replace all matches of a pattern (using --comparand)
with a string (using --replacement) (Rust replace)
* regex_replace: Replace all regex matches in --comparand w/ --replacement.
Specify <EMPTY> as --replacement to remove matches.
* titlecase - capitalizes English text using Daring Fireball titlecase style
https://daringfireball.net/2008/05/title_case
* censor: profanity filter. Add additional comma-delimited profanities with --comparand.
@@ -387,7 +388,7 @@ use crate::{
CliResult,
};

#[derive(Clone, EnumString)]
#[derive(Clone, EnumString, PartialEq)]
#[strum(use_phf)]
#[strum(ascii_case_insensitive)]
#[allow(non_camel_case_types)]
@@ -477,6 +478,7 @@ static INDIANCOMMA_POLICY: SeparatorPolicy = SeparatorPolicy {
};

// valid subcommands
#[derive(PartialEq)]
enum ApplySubCmd {
Operations,
DateFmt,
@@ -582,6 +584,20 @@ pub fn run(argv: &[&str]) -> CliResult<()> {
wtr.write_record(&headers)?;
}

// if there is a regex_replace operation and replacement is <empty> case-insensitive,
// we set it to empty string
let flag_replacement = if apply_cmd == ApplySubCmd::Operations
&& ops_vec.contains(&Operations::Regex_Replace)
&& args.flag_replacement.to_lowercase() == "<empty>"
{
String::new()
} else {
args.flag_replacement
};
let flag_comparand = args.flag_comparand;
let flag_formatstr = args.flag_formatstr;
let flag_new_column = args.flag_new_column;

// prep progress bar
let show_progress =
(args.flag_progressbar || util::get_envvar_flag("QSV_PROGRESSBAR")) && !rconfig.is_stdin();
@@ -645,11 +661,11 @@ pub fn run(argv: &[&str]) -> CliResult<()> {
apply_operations(
&ops_vec,
&mut cell,
&args.flag_comparand,
&args.flag_replacement,
&args.flag_formatstr,
&flag_comparand,
&flag_replacement,
&flag_formatstr,
);
if args.flag_new_column.is_some() {
if flag_new_column.is_some() {
record.push_field(&cell);
} else {
record = replace_column_value(&record, *col_index, &cell);
@@ -661,9 +677,9 @@ pub fn run(argv: &[&str]) -> CliResult<()> {
for col_index in &*sel {
record[*col_index].clone_into(&mut cell);
if cell.trim().is_empty() {
cell = args.flag_replacement.clone();
cell = flag_replacement.clone();
}
if args.flag_new_column.is_some() {
if flag_new_column.is_some() {
record.push_field(&cell);
} else {
record = replace_column_value(&record, *col_index, &cell);
@@ -678,7 +694,7 @@ pub fn run(argv: &[&str]) -> CliResult<()> {
let parsed_date = parse_with_preference(&cell, prefer_dmy);
if let Ok(format_date) = parsed_date {
let formatted_date =
format_date.format(&args.flag_formatstr).to_string();
format_date.format(&flag_formatstr).to_string();
if !args.flag_keep_zero_time
&& formatted_date.ends_with("T00:00:00+00:00")
{
@@ -688,7 +704,7 @@ pub fn run(argv: &[&str]) -> CliResult<()> {
}
}
}
if args.flag_new_column.is_some() {
if flag_new_column.is_some() {
record.push_field(&cell);
} else {
record = replace_column_value(&record, *col_index, &cell);
@@ -708,7 +724,7 @@ pub fn run(argv: &[&str]) -> CliResult<()> {
cell = formatted.to_string();
}
}
if args.flag_new_column.is_some() {
if flag_new_column.is_some() {
record.push_field(&cell);
} else {
record = replace_column_value(&record, column_index, &cell);
@@ -750,7 +766,7 @@ pub fn run(argv: &[&str]) -> CliResult<()> {
}
};

if args.flag_new_column.is_some() {
if flag_new_column.is_some() {
record.push_field(&result);
} else {
record = replace_column_value(&record, column_index, &result);
34 changes: 25 additions & 9 deletions src/cmd/applydp.rs
Original file line number Diff line number Diff line change
@@ -43,6 +43,7 @@ It has 18 supported operations:
* replace: Replace all matches of a pattern (using --comparand)
with a string (using --replacement) (Rust replace)
* regex_replace: Replace all regex matches in --comparand w/ --replacement.
Specify <EMPTY> as --replacement to remove matches.
* round: Round numeric values to the specified number of decimal places using
Midpoint Nearest Even Rounding Strategy AKA "Bankers Rounding."
Specify the number of decimal places with --formatstr (default: 3).
@@ -264,7 +265,7 @@ use crate::{
CliResult,
};

#[derive(Clone, EnumString)]
#[derive(Clone, EnumString, PartialEq)]
#[strum(use_phf)]
#[strum(ascii_case_insensitive)]
#[allow(non_camel_case_types)]
@@ -382,6 +383,7 @@ pub fn run(argv: &[&str]) -> CliResult<()> {
String::new()
};

#[derive(PartialEq)]
enum ApplydpSubCmd {
Operations,
DateFmt,
@@ -420,6 +422,20 @@ pub fn run(argv: &[&str]) -> CliResult<()> {
wtr.write_record(&headers)?;
}

// if there is a regex_replace operation and replacement is <empty> case-insensitive,
// we set it to empty string
let flag_replacement = if applydp_cmd == ApplydpSubCmd::Operations
&& ops_vec.contains(&Operations::Regex_Replace)
&& args.flag_replacement.to_lowercase() == "<empty>"
{
String::new()
} else {
args.flag_replacement
};
let flag_comparand = args.flag_comparand;
let flag_formatstr = args.flag_formatstr;
let flag_new_column = args.flag_new_column;

let prefer_dmy = args.flag_prefer_dmy || rconfig.get_dmy_preference();

// amortize memory allocation by reusing record
@@ -472,10 +488,10 @@ pub fn run(argv: &[&str]) -> CliResult<()> {
applydp_operations(
&ops_vec,
&mut cell,
&args.flag_comparand,
&args.flag_replacement,
&flag_comparand,
&flag_replacement,
);
if args.flag_new_column.is_some() {
if flag_new_column.is_some() {
record.push_field(&cell);
} else {
record = replace_column_value(&record, *col_index, &cell);
@@ -487,9 +503,9 @@ pub fn run(argv: &[&str]) -> CliResult<()> {
for col_index in sel.iter() {
record[*col_index].clone_into(&mut cell);
if cell.trim().is_empty() {
cell = args.flag_replacement.clone();
cell = flag_replacement.clone();
}
if args.flag_new_column.is_some() {
if flag_new_column.is_some() {
record.push_field(&cell);
} else {
record = replace_column_value(&record, *col_index, &cell);
@@ -504,7 +520,7 @@ pub fn run(argv: &[&str]) -> CliResult<()> {
let parsed_date = parse_with_preference(&cell, prefer_dmy);
if let Ok(format_date) = parsed_date {
let formatted_date =
format_date.format(&args.flag_formatstr).to_string();
format_date.format(&flag_formatstr).to_string();
if !args.flag_keep_zero_time
&& formatted_date.ends_with("T00:00:00+00:00")
{
@@ -514,7 +530,7 @@ pub fn run(argv: &[&str]) -> CliResult<()> {
}
}
}
if args.flag_new_column.is_some() {
if flag_new_column.is_some() {
record.push_field(&cell);
} else {
record = replace_column_value(&record, *col_index, &cell);
@@ -534,7 +550,7 @@ pub fn run(argv: &[&str]) -> CliResult<()> {
cell = formatted.to_string();
}
}
if args.flag_new_column.is_some() {
if flag_new_column.is_some() {
record.push_field(&cell);
} else {
record = replace_column_value(&record, column_index, &cell);
32 changes: 32 additions & 0 deletions tests/test_apply.rs
Original file line number Diff line number Diff line change
@@ -440,6 +440,38 @@ fn apply_dynfmt_issue1458() {
assert_eq!(got, expected);
}

/// Regression test for issue 1469: `apply operations regex_replace` with
/// `--replacement` set to the `<EMPTY>` marker must strip every regex match
/// instead of substituting the literal marker text.
///
/// The marker is passed in mixed case (`<EmpTY>`) so the test also covers
/// the case-insensitive comparison of the special value.
#[test]
fn apply_regex_replace_issue1469() {
    let workdir = Workdir::new("apply_regex_replace_issue1469");

    // Fixture rows: parenthesized fragments appear at the start, middle and
    // end of cells; some cells have no match at all.
    let input_rows = vec![
        svec!["col1", "col2", "col3",],
        svec!["(Adam)", "B", "Case(hello)Name "],
        svec!["Derek(foo)", "(bar)E", "Fos(this needs to go)ter"],
        svec!["Gordon", "H", "(cmon)Irvin"],
        svec!["Jack(ie)", "K", "Lynch(-Chan)"],
    ];
    workdir.create("data.csv", input_rows);

    // Build the command line one argument group at a time.
    let mut apply_cmd = workdir.command("apply");
    apply_cmd.arg("operations");
    apply_cmd.arg("regex_replace");
    apply_cmd.arg("col1,col2,col3");
    apply_cmd.args(["--comparand", r"\([^)]+\)"]);
    apply_cmd.args(["--replacement", "<EmpTY>"]);
    apply_cmd.arg("data.csv");

    let got: Vec<Vec<String>> = workdir.read_stdout(&mut apply_cmd);

    // Every "(...)" group is removed; the surrounding text (including the
    // trailing space in "CaseName ") is left untouched.
    let expected = vec![
        svec!["col1", "col2", "col3"],
        svec!["", "B", "CaseName "],
        svec!["Derek", "E", "Foster"],
        svec!["Gordon", "H", "Irvin"],
        svec!["Jack", "K", "Lynch"],
    ];
    assert_eq!(got, expected);
}

#[test]
fn apply_calcconv() {
let wrk = Workdir::new("apply");
32 changes: 32 additions & 0 deletions tests/test_applydp.rs
Original file line number Diff line number Diff line change
@@ -366,6 +366,38 @@ fn applydp_ops_regex_replace() {
assert_eq!(got, expected);
}

/// Regression test for issue 1469 on the `applydp` command: running
/// `operations regex_replace` with `--replacement <EMPTY>` must delete every
/// regex match rather than insert the literal marker text.
#[test]
fn applydp_regex_replace_issue1469() {
    let workdir = Workdir::new("applydp_regex_replace_issue1469");

    // Fixture rows: parenthesized fragments appear at the start, middle and
    // end of cells; some cells have no match at all.
    let input_rows = vec![
        svec!["col1", "col2", "col3",],
        svec!["(Adam)", "B", "Case(hello)Name "],
        svec!["Derek(foo)", "(bar)E", "Fos(this needs to go)ter"],
        svec!["Gordon", "H", "(cmon)Irvin"],
        svec!["Jack(ie)", "K", "Lynch(-Chan)"],
    ];
    workdir.create("data.csv", input_rows);

    // Build the command line one argument group at a time.
    let mut applydp_cmd = workdir.command("applydp");
    applydp_cmd.arg("operations");
    applydp_cmd.arg("regex_replace");
    applydp_cmd.arg("col1,col2,col3");
    applydp_cmd.args(["--comparand", r"\([^)]+\)"]);
    applydp_cmd.args(["--replacement", "<EMPTY>"]);
    applydp_cmd.arg("data.csv");

    let got: Vec<Vec<String>> = workdir.read_stdout(&mut applydp_cmd);

    // Every "(...)" group is removed; the surrounding text (including the
    // trailing space in "CaseName ") is left untouched.
    let expected = vec![
        svec!["col1", "col2", "col3"],
        svec!["", "B", "CaseName "],
        svec!["Derek", "E", "Foster"],
        svec!["Gordon", "H", "Irvin"],
        svec!["Jack", "K", "Lynch"],
    ];
    assert_eq!(got, expected);
}

#[test]
fn applydp_ops_regex_replace_validation_error() {
let wrk = Workdir::new("applydp");