From f1cfb24f43233297f6b1dee7112d83d42ba74ffc Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Fri, 21 Jun 2024 09:12:27 -0400 Subject: [PATCH 1/3] `deps`: add xxhash-rust dependency for `enum --hash` option --- Cargo.lock | 1 + Cargo.toml | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/Cargo.lock b/Cargo.lock index 856e6045d..9606b675c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4793,6 +4793,7 @@ dependencies = [ "uuid", "vader_sentiment", "whatlang", + "xxhash-rust", ] [[package]] diff --git a/Cargo.toml b/Cargo.toml index 395bd044c..092027fe3 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -225,7 +225,7 @@ uuid = { version = "1", features = ["v4"] } url = "2.5" vader_sentiment = { version = "0.1", optional = true } whatlang = { version = "0.16", optional = true } - +xxhash-rust = { version = "0.8", features = ["xxh3"] } [target.'cfg(not(target_arch = "aarch64"))'.dependencies] simdutf8 = "0.1" From ef5f2b3aca48f182baaa3f5f1e77cce57ab8d90d Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Fri, 21 Jun 2024 09:20:24 -0400 Subject: [PATCH 2/3] `enum`: add `--hash` option --- src/cmd/enumerate.rs | 98 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 98 insertions(+) diff --git a/src/cmd/enumerate.rs b/src/cmd/enumerate.rs index 8c06e2cff..485879ab1 100644 --- a/src/cmd/enumerate.rs +++ b/src/cmd/enumerate.rs @@ -17,6 +17,13 @@ The enum function has four modes of operation: 4. COPY. Copy the contents of a column to a new one: $ qsv enum --copy names + 5. HASH. 
Create a new column filled with the hash of a given column/s: + $ qsv enum --hash 1- // hash all columns + $ qsv enum --hash col2,col3,col4 // hash specific columns + $ qsv enum --hash col2 // hash a single column + $ qsv enum --hash /record_id|name|address/ // hash columns that match a regex + $ qsv enum --hash !/record_id/ // hash all columns except the record_id column + Finally, note that you should also be able to shuffle the lines of a CSV file by sorting on the generated uuids: $ qsv enum --uuid file.csv | qsv sort -s uuid > shuffled.csv @@ -42,6 +49,16 @@ enum options: --uuid When set, the column will be populated with uuids (v4) instead of the incremental identifier. Changes the default column name to "uuid". + --hash <columns> Create a new column filled with the hash of the + given column/s. Use "1-" to hash all columns. + Changes the default column name to "hash". + Will remove an existing "hash" column if it exists. + + The columns argument specifies the columns to use + in the hash. Columns can be referenced by name or index, + starting at 1. Specify multiple columns by separating + them with a comma. Specify a range of columns with `-`. + (See 'qsv select --help' for the full syntax.) 
Common options: -h, --help Display this message @@ -54,6 +71,7 @@ Common options: use serde::Deserialize; use uuid::Uuid; +use xxhash_rust::xxh3::xxh3_64; use crate::{ config::{Config, Delimiter}, @@ -72,6 +90,7 @@ struct Args { flag_constant: Option, flag_copy: Option, flag_uuid: bool, + flag_hash: Option, flag_output: Option, flag_no_headers: bool, flag_delimiter: Option, @@ -82,6 +101,7 @@ enum EnumOperation { Uuid, Constant, Copy, + Hash, } pub fn run(argv: &[&str]) -> CliResult<()> { @@ -94,6 +114,7 @@ pub fn run(argv: &[&str]) -> CliResult<()> { let mut wtr = Config::new(&args.flag_output).writer()?; let mut headers = rdr.byte_headers()?.clone(); + let mut hash_index = None; let mut copy_index = 0; let mut copy_operation = false; @@ -105,6 +126,43 @@ pub fn run(argv: &[&str]) -> CliResult<()> { copy_operation = true; } + let mut hash_sel = None; + let mut hash_operation = false; + + if let Some(hash_columns) = &args.flag_hash { + // get the index of the column named "hash", if it exists + hash_index = headers.iter().position(|col| col == b"hash"); + + // get the original selection + rconfig = rconfig.select(hash_columns.clone()); + let original_selection = rconfig + .clone() + .select(hash_columns.clone()) + .selection(&headers)?; + + // Filter out the "hash" column from the original selection, if it exists + let filtered_selection = original_selection + .iter() + .filter(|&&index| index != hash_index.unwrap_or(usize::MAX)) + .collect::>(); + + // Construct selection string without "hash" column + let selection_string = filtered_selection + .iter() + .map(|&&index| (index + 1).to_string()) + .collect::>() + .join(","); + + // Parse the new selection without "hash" column + let no_hash_column_selection = SelectColumns::parse(&selection_string)?; + + // Update the configuration with the new selection + rconfig = rconfig.select(no_hash_column_selection); + hash_sel = Some(rconfig.selection(&headers)?); + + hash_operation = true; + } + if !rconfig.no_headers { if 
let Some(column_name) = &args.flag_new_column { headers.push_field(column_name.as_bytes()); @@ -118,6 +176,19 @@ pub fn run(argv: &[&str]) -> CliResult<()> { Err(e) => return fail_clierror!("Could not parse cell as utf-8!: {e}"), }; headers.push_field(format!("{current_header}_copy").as_bytes()); + } else if hash_operation { + // Remove an existing "hash" column from the header, if it exists + headers = if let Some(hash_index) = hash_index { + headers + .into_iter() + .enumerate() + .filter(|(i, _)| *i != hash_index) + .map(|(_, field)| field) + .collect() + } else { + headers + }; + headers.push_field(b"hash"); } else { headers.push_field(b"index"); }; @@ -137,6 +208,8 @@ pub fn run(argv: &[&str]) -> CliResult<()> { EnumOperation::Uuid } else if copy_operation { EnumOperation::Copy + } else if args.flag_hash.is_some() { + EnumOperation::Hash } else { EnumOperation::Increment }; @@ -148,6 +221,8 @@ pub fn run(argv: &[&str]) -> CliResult<()> { #[allow(unused_assignments)] let mut colcopy: Vec = Vec::with_capacity(20); let increment = args.flag_increment.unwrap_or(1); + let mut hash_string = String::new(); + let mut hash; while rdr.read_byte_record(&mut record)? 
{ match enum_operation { @@ -170,6 +245,29 @@ pub fn run(argv: &[&str]) -> CliResult<()> { colcopy = record[copy_index].to_vec(); record.push_field(&colcopy); }, + EnumOperation::Hash => { + hash_string.clear(); + + // build the hash string from the filtered selection + if let Some(ref sel) = hash_sel { + sel.iter() + .for_each(|i| hash_string.push_str(&String::from_utf8_lossy(&record[*i]))); + } + hash = xxh3_64(hash_string.as_bytes()); + + // Optionally remove the "hash" column if it already exists from the output + record = if let Some(hash_index) = hash_index { + record + .into_iter() + .enumerate() + .filter(|(i, _)| *i != hash_index) + .map(|(_, field)| field) + .collect() + } else { + record + }; + record.push_field(hash.to_string().as_bytes()); + }, } wtr.write_byte_record(&record)?; From 4597b1a1401652d19fab73c11cb1c326e29bd98a Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Fri, 21 Jun 2024 09:20:47 -0400 Subject: [PATCH 3/3] `tests`: add `enum --hash` tests --- tests/test_enumerate.rs | 146 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 146 insertions(+) diff --git a/tests/test_enumerate.rs b/tests/test_enumerate.rs index f375d6f9d..fdb994034 100644 --- a/tests/test_enumerate.rs +++ b/tests/test_enumerate.rs @@ -83,6 +83,152 @@ fn enumerate_counter_inc() { assert_eq!(got, expected); } +#[test] +fn enumerate_hash() { + let wrk = Workdir::new("enumerate_hash"); + wrk.create( + "data.csv", + vec![ + svec!["letter", "number", "random_text"], + svec!["a", "13", "this is a test"], + svec!["b", "24", "the quick brown fox"], + svec!["c", "72", "jumps over the lazy dog"], + svec!["d", "7", "I think, therefore I am"], + svec!["d", "7", "I think, therefore I am"], + ], + ); + let mut cmd = wrk.command("enum"); + cmd.args(&["--hash", "1-"]).arg("data.csv"); + + let got: Vec> = wrk.read_stdout(&mut cmd); + let expected = vec![ + svec!["letter", "number", "random_text", "hash"], + svec!["a", "13", "this is a 
test", "4649922201779202190"], + svec!["b", "24", "the quick brown fox", "10788366602312130446"], + svec!["c", "72", "jumps over the lazy dog", "6378567261782451553"], + svec!["d", "7", "I think, therefore I am", "14437068658547852882"], + svec!["d", "7", "I think, therefore I am", "14437068658547852882"], + ]; + assert_eq!(got, expected); +} + +#[test] +fn enumerate_hash_replace_old_hash() { + let wrk = Workdir::new("enumerate_replace_old_hash"); + wrk.create( + "data.csv", + vec![ + svec!["letter", "number", "random_text", "hash"], + svec!["a", "13", "this is a test", "1"], + svec!["b", "24", "the quick brown fox", "2"], + svec!["c", "72", "jumps over the lazy dog", "3"], + svec!["d", "7", "I think, therefore I am", "4"], + svec!["d", "7", "I think, therefore I am", "5"], + ], + ); + let mut cmd = wrk.command("enum"); + cmd.args(&["--hash", "!/hash/"]).arg("data.csv"); + + let got: Vec> = wrk.read_stdout(&mut cmd); + let expected = vec![ + svec!["letter", "number", "random_text", "hash"], + svec!["a", "13", "this is a test", "4649922201779202190"], + svec!["b", "24", "the quick brown fox", "10788366602312130446"], + svec!["c", "72", "jumps over the lazy dog", "6378567261782451553"], + svec!["d", "7", "I think, therefore I am", "14437068658547852882"], + svec!["d", "7", "I think, therefore I am", "14437068658547852882"], + ]; + assert_eq!(got, expected); +} + +#[test] +fn enumerate_hash_replace_old_hash2() { + let wrk = Workdir::new("enumerate_replace_old_hash2"); + wrk.create( + "data.csv", + vec![ + svec!["hash", "letter", "number", "random_text"], + svec!["1", "a", "13", "this is a test"], + svec!["2", "b", "24", "the quick brown fox"], + svec!["3", "c", "72", "jumps over the lazy dog"], + svec!["4", "d", "7", "I think, therefore I am"], + svec!["5", "d", "7", "I think, therefore I am"], + ], + ); + let mut cmd = wrk.command("enum"); + cmd.args(&["--hash", "1-"]).arg("data.csv"); + + let got: Vec> = wrk.read_stdout(&mut cmd); + let expected = vec![ + 
svec!["letter", "number", "random_text", "hash"], + svec!["a", "13", "this is a test", "4649922201779202190"], + svec!["b", "24", "the quick brown fox", "10788366602312130446"], + svec!["c", "72", "jumps over the lazy dog", "6378567261782451553"], + svec!["d", "7", "I think, therefore I am", "14437068658547852882"], + svec!["d", "7", "I think, therefore I am", "14437068658547852882"], + ]; + assert_eq!(got, expected); +} + +#[test] +fn enumerate_hash_regex() { + let wrk = Workdir::new("enumerate_replace_regex"); + wrk.create( + "data.csv", + vec![ + svec!["hash", "letter", "number", "random_text"], + svec!["1", "a", "13", "this is a test"], + svec!["2", "b", "24", "the quick brown fox"], + svec!["3", "c", "72", "jumps over the lazy dog"], + svec!["4", "d", "7", "I think, therefore I am"], + svec!["5", "d", "7", "I think, therefore I am"], + ], + ); + let mut cmd = wrk.command("enum"); + cmd.args(&["--hash", "/letter|number|random_text/"]) + .arg("data.csv"); + + let got: Vec> = wrk.read_stdout(&mut cmd); + let expected = vec![ + svec!["letter", "number", "random_text", "hash"], + svec!["a", "13", "this is a test", "4649922201779202190"], + svec!["b", "24", "the quick brown fox", "10788366602312130446"], + svec!["c", "72", "jumps over the lazy dog", "6378567261782451553"], + svec!["d", "7", "I think, therefore I am", "14437068658547852882"], + svec!["d", "7", "I think, therefore I am", "14437068658547852882"], + ]; + assert_eq!(got, expected); +} + +#[test] +fn enumerate_hash_regex_not() { + let wrk = Workdir::new("enumerate_replace_regex_not"); + wrk.create( + "data.csv", + vec![ + svec!["hash", "letter", "number", "random_text"], + svec!["1", "a", "13", "this is a test"], + svec!["2", "b", "24", "the quick brown fox"], + svec!["3", "c", "72", "jumps over the lazy dog"], + svec!["4", "d", "7", "I think, therefore I am"], + svec!["5", "d", "7", "I think, therefore I am"], + ], + ); + let mut cmd = wrk.command("enum"); + cmd.args(&["--hash", 
"!/hash/"]).arg("data.csv"); + + let got: Vec> = wrk.read_stdout(&mut cmd); + let expected = vec![ + svec!["letter", "number", "random_text", "hash"], + svec!["a", "13", "this is a test", "4649922201779202190"], + svec!["b", "24", "the quick brown fox", "10788366602312130446"], + svec!["c", "72", "jumps over the lazy dog", "6378567261782451553"], + svec!["d", "7", "I think, therefore I am", "14437068658547852882"], + svec!["d", "7", "I think, therefore I am", "14437068658547852882"], + ]; + assert_eq!(got, expected); +} + #[test] fn enumerate_column_name() { let wrk = Workdir::new("enum");