Merge pull request #1902 from jqnatividad/1888-enum-hash-option

`enum`: add `--hash` option to create a platform-independent deterministic id
dathere · Jun 21, 2024 · 0dc8973 · 0dc8973
2 parents 332544e + 4597b1a
commit 0dc8973
Show file tree

Hide file tree

Showing 4 changed files with 246 additions and 1 deletion.
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/Cargo.toml b/Cargo.toml
@@ -225,7 +225,7 @@ uuid = { version = "1", features = ["v4"] }
 url = "2.5"
 vader_sentiment = { version = "0.1", optional = true }
 whatlang = { version = "0.16", optional = true }
-
+xxhash-rust = { version = "0.8", features = ["xxh3"] }
 
 [target.'cfg(not(target_arch = "aarch64"))'.dependencies]
 simdutf8    = "0.1"

diff --git a/src/cmd/enumerate.rs b/src/cmd/enumerate.rs
@@ -17,6 +17,13 @@ The enum function has four modes of operation:
   4. COPY. Copy the contents of a column to a new one:
     $ qsv enum --copy names
 
+  5. HASH. Create a new column filled with the hash of a given column/s:
+    $ qsv enum --hash 1- // hash all columns
+    $ qsv enum --hash col2,col3,col4 // hash specific columns
+    $ qsv enum --hash col2 // hash a single column
+    $ qsv enum --hash /record_id|name|address/ // hash columns that match a regex
+    $ qsv enum --hash !/record_id/ // hash all columns except the record_id column
+
   Finally, note that you should also be able to shuffle the lines of a CSV file
   by sorting on the generated uuids:
     $ qsv enum --uuid file.csv | qsv sort -s uuid > shuffled.csv
@@ -42,6 +49,16 @@ enum options:
     --uuid                   When set, the column will be populated with
                              uuids (v4) instead of the incremental identifier.
                              Changes the default column name to "uuid".
+    --hash <columns>         Create a new column filled with the hash of the
+                             given column/s. Use "1-" to hash all columns.
+                             Changes the default column name to "hash".
+                             Any existing "hash" column is removed first.
+
+                             The columns argument specifies the columns to use
+                             in the hash. Columns can be referenced by name or index,
+                             starting at 1. Specify multiple columns by separating
+                             them with a comma. Specify a range of columns with `-`.
+                             (See 'qsv select --help' for the full syntax.)
 
 Common options:
     -h, --help               Display this message
@@ -54,6 +71,7 @@ Common options:
 
 use serde::Deserialize;
 use uuid::Uuid;
+use xxhash_rust::xxh3::xxh3_64;
 
 use crate::{
     config::{Config, Delimiter},
@@ -72,6 +90,7 @@ struct Args {
     flag_constant:   Option<String>,
     flag_copy:       Option<SelectColumns>,
     flag_uuid:       bool,
+    flag_hash:       Option<SelectColumns>,
     flag_output:     Option<String>,
     flag_no_headers: bool,
     flag_delimiter:  Option<Delimiter>,
@@ -82,6 +101,7 @@ enum EnumOperation {
     Uuid,
     Constant,
     Copy,
+    Hash,
 }
 
 pub fn run(argv: &[&str]) -> CliResult<()> {
@@ -94,6 +114,7 @@ pub fn run(argv: &[&str]) -> CliResult<()> {
     let mut wtr = Config::new(&args.flag_output).writer()?;
 
     let mut headers = rdr.byte_headers()?.clone();
+    let mut hash_index = None;
 
     let mut copy_index = 0;
     let mut copy_operation = false;
@@ -105,6 +126,43 @@ pub fn run(argv: &[&str]) -> CliResult<()> {
         copy_operation = true;
     }
 
+    let mut hash_sel = None;
+    let mut hash_operation = false;
+
+    if let Some(hash_columns) = &args.flag_hash {
+        // get the index of the column named "hash", if it exists
+        hash_index = headers.iter().position(|col| col == b"hash");
+
+        // get the original selection
+        rconfig = rconfig.select(hash_columns.clone());
+        let original_selection = rconfig
+            .clone()
+            .select(hash_columns.clone())
+            .selection(&headers)?;
+
+        // Filter out the "hash" column from the original selection, if it exists
+        let filtered_selection = original_selection
+            .iter()
+            .filter(|&&index| index != hash_index.unwrap_or(usize::MAX))
+            .collect::<Vec<_>>();
+
+        // Construct selection string without "hash" column
+        let selection_string = filtered_selection
+            .iter()
+            .map(|&&index| (index + 1).to_string())
+            .collect::<Vec<String>>()
+            .join(",");
+
+        // Parse the new selection without "hash" column
+        let no_hash_column_selection = SelectColumns::parse(&selection_string)?;
+
+        // Update the configuration with the new selection
+        rconfig = rconfig.select(no_hash_column_selection);
+        hash_sel = Some(rconfig.selection(&headers)?);
+
+        hash_operation = true;
+    }
+
     if !rconfig.no_headers {
         if let Some(column_name) = &args.flag_new_column {
             headers.push_field(column_name.as_bytes());
@@ -118,6 +176,19 @@ pub fn run(argv: &[&str]) -> CliResult<()> {
                 Err(e) => return fail_clierror!("Could not parse cell as utf-8!: {e}"),
             };
             headers.push_field(format!("{current_header}_copy").as_bytes());
+        } else if hash_operation {
+            // Remove an existing "hash" column from the header, if it exists
+            headers = if let Some(hash_index) = hash_index {
+                headers
+                    .into_iter()
+                    .enumerate()
+                    .filter(|(i, _)| *i != hash_index)
+                    .map(|(_, field)| field)
+                    .collect()
+            } else {
+                headers
+            };
+            headers.push_field(b"hash");
         } else {
             headers.push_field(b"index");
         };
@@ -137,6 +208,8 @@ pub fn run(argv: &[&str]) -> CliResult<()> {
         EnumOperation::Uuid
     } else if copy_operation {
         EnumOperation::Copy
+    } else if args.flag_hash.is_some() {
+        EnumOperation::Hash
     } else {
         EnumOperation::Increment
     };
@@ -148,6 +221,8 @@ pub fn run(argv: &[&str]) -> CliResult<()> {
     #[allow(unused_assignments)]
     let mut colcopy: Vec<u8> = Vec::with_capacity(20);
     let increment = args.flag_increment.unwrap_or(1);
+    let mut hash_string = String::new();
+    let mut hash;
 
     while rdr.read_byte_record(&mut record)? {
         match enum_operation {
@@ -170,6 +245,29 @@ pub fn run(argv: &[&str]) -> CliResult<()> {
                 colcopy = record[copy_index].to_vec();
                 record.push_field(&colcopy);
             },
+            EnumOperation::Hash => {
+                hash_string.clear();
+
+                // build the hash string from the filtered selection
+                if let Some(ref sel) = hash_sel {
+                    sel.iter()
+                        .for_each(|i| hash_string.push_str(&String::from_utf8_lossy(&record[*i])));
+                }
+                hash = xxh3_64(hash_string.as_bytes());
+
+                // Remove the pre-existing "hash" column, if any, from the output record
+                record = if let Some(hash_index) = hash_index {
+                    record
+                        .into_iter()
+                        .enumerate()
+                        .filter(|(i, _)| *i != hash_index)
+                        .map(|(_, field)| field)
+                        .collect()
+                } else {
+                    record
+                };
+                record.push_field(hash.to_string().as_bytes());
+            },
         }
 
         wtr.write_byte_record(&record)?;

diff --git a/tests/test_enumerate.rs b/tests/test_enumerate.rs
@@ -83,6 +83,152 @@ fn enumerate_counter_inc() {
     assert_eq!(got, expected);
 }
 
+#[test]
+fn enumerate_hash() { // `--hash 1-` on input without a "hash" column: a hash of all columns is appended
+    let wrk = Workdir::new("enumerate_hash");
+    wrk.create(
+        "data.csv",
+        vec![
+            svec!["letter", "number", "random_text"],
+            svec!["a", "13", "this is a test"],
+            svec!["b", "24", "the quick brown fox"],
+            svec!["c", "72", "jumps over the lazy dog"],
+            svec!["d", "7", "I think, therefore I am"],
+            svec!["d", "7", "I think, therefore I am"], // exact duplicate of the previous row
+        ],
+    );
+    let mut cmd = wrk.command("enum");
+    cmd.args(&["--hash", "1-"]).arg("data.csv"); // "1-" selects every column
+
+    let got: Vec<Vec<String>> = wrk.read_stdout(&mut cmd);
+    let expected = vec![
+        svec!["letter", "number", "random_text", "hash"], // "hash" is added as the last column
+        svec!["a", "13", "this is a test", "4649922201779202190"],
+        svec!["b", "24", "the quick brown fox", "10788366602312130446"],
+        svec!["c", "72", "jumps over the lazy dog", "6378567261782451553"],
+        svec!["d", "7", "I think, therefore I am", "14437068658547852882"],
+        svec!["d", "7", "I think, therefore I am", "14437068658547852882"], // duplicate rows get the same hash
+    ];
+    assert_eq!(got, expected);
+}
+
+#[test]
+fn enumerate_hash_replace_old_hash() { // a trailing "hash" column already in the input is replaced by the new one
+    let wrk = Workdir::new("enumerate_replace_old_hash");
+    wrk.create(
+        "data.csv",
+        vec![
+            svec!["letter", "number", "random_text", "hash"], // stale "hash" column in last position
+            svec!["a", "13", "this is a test", "1"],
+            svec!["b", "24", "the quick brown fox", "2"],
+            svec!["c", "72", "jumps over the lazy dog", "3"],
+            svec!["d", "7", "I think, therefore I am", "4"],
+            svec!["d", "7", "I think, therefore I am", "5"], // differs from the previous row only in the old hash
+        ],
+    );
+    let mut cmd = wrk.command("enum");
+    cmd.args(&["--hash", "!/hash/"]).arg("data.csv"); // select all columns except those matching /hash/
+
+    let got: Vec<Vec<String>> = wrk.read_stdout(&mut cmd);
+    let expected = vec![
+        svec!["letter", "number", "random_text", "hash"],
+        svec!["a", "13", "this is a test", "4649922201779202190"], // old values "1".."5" are gone
+        svec!["b", "24", "the quick brown fox", "10788366602312130446"],
+        svec!["c", "72", "jumps over the lazy dog", "6378567261782451553"],
+        svec!["d", "7", "I think, therefore I am", "14437068658547852882"],
+        svec!["d", "7", "I think, therefore I am", "14437068658547852882"], // same hash: old value was not hashed
+    ];
+    assert_eq!(got, expected);
+}
+
+#[test]
+fn enumerate_hash_replace_old_hash2() { // a leading "hash" column is dropped even though "1-" selects it
+    let wrk = Workdir::new("enumerate_replace_old_hash2");
+    wrk.create(
+        "data.csv",
+        vec![
+            svec!["hash", "letter", "number", "random_text"], // stale "hash" column in first position
+            svec!["1", "a", "13", "this is a test"],
+            svec!["2", "b", "24", "the quick brown fox"],
+            svec!["3", "c", "72", "jumps over the lazy dog"],
+            svec!["4", "d", "7", "I think, therefore I am"],
+            svec!["5", "d", "7", "I think, therefore I am"],
+        ],
+    );
+    let mut cmd = wrk.command("enum");
+    cmd.args(&["--hash", "1-"]).arg("data.csv"); // "1-" selects every column, the old "hash" included
+
+    let got: Vec<Vec<String>> = wrk.read_stdout(&mut cmd);
+    let expected = vec![
+        svec!["letter", "number", "random_text", "hash"], // old column removed from the front, new one added last
+        svec!["a", "13", "this is a test", "4649922201779202190"], // same values as for input with no "hash" column
+        svec!["b", "24", "the quick brown fox", "10788366602312130446"],
+        svec!["c", "72", "jumps over the lazy dog", "6378567261782451553"],
+        svec!["d", "7", "I think, therefore I am", "14437068658547852882"],
+        svec!["d", "7", "I think, therefore I am", "14437068658547852882"],
+    ];
+    assert_eq!(got, expected);
+}
+
+#[test]
+fn enumerate_hash_regex() { // columns to hash are chosen with a regex selector
+    let wrk = Workdir::new("enumerate_replace_regex");
+    wrk.create(
+        "data.csv",
+        vec![
+            svec!["hash", "letter", "number", "random_text"], // stale "hash" column in first position
+            svec!["1", "a", "13", "this is a test"],
+            svec!["2", "b", "24", "the quick brown fox"],
+            svec!["3", "c", "72", "jumps over the lazy dog"],
+            svec!["4", "d", "7", "I think, therefore I am"],
+            svec!["5", "d", "7", "I think, therefore I am"],
+        ],
+    );
+    let mut cmd = wrk.command("enum");
+    cmd.args(&["--hash", "/letter|number|random_text/"]) // regex naming the three data columns
+        .arg("data.csv");
+
+    let got: Vec<Vec<String>> = wrk.read_stdout(&mut cmd);
+    let expected = vec![
+        svec!["letter", "number", "random_text", "hash"], // old "hash" is removed although it was not selected
+        svec!["a", "13", "this is a test", "4649922201779202190"],
+        svec!["b", "24", "the quick brown fox", "10788366602312130446"],
+        svec!["c", "72", "jumps over the lazy dog", "6378567261782451553"],
+        svec!["d", "7", "I think, therefore I am", "14437068658547852882"],
+        svec!["d", "7", "I think, therefore I am", "14437068658547852882"],
+    ];
+    assert_eq!(got, expected);
+}
+
+#[test]
+fn enumerate_hash_regex_not() { // negated regex selector with a leading "hash" column in the input
+    let wrk = Workdir::new("enumerate_replace_regex_not");
+    wrk.create(
+        "data.csv",
+        vec![
+            svec!["hash", "letter", "number", "random_text"], // stale "hash" column in first position
+            svec!["1", "a", "13", "this is a test"],
+            svec!["2", "b", "24", "the quick brown fox"],
+            svec!["3", "c", "72", "jumps over the lazy dog"],
+            svec!["4", "d", "7", "I think, therefore I am"],
+            svec!["5", "d", "7", "I think, therefore I am"],
+        ],
+    );
+    let mut cmd = wrk.command("enum");
+    cmd.args(&["--hash", "!/hash/"]).arg("data.csv"); // select all columns except those matching /hash/
+
+    let got: Vec<Vec<String>> = wrk.read_stdout(&mut cmd);
+    let expected = vec![
+        svec!["letter", "number", "random_text", "hash"], // old column removed from the front, new one added last
+        svec!["a", "13", "this is a test", "4649922201779202190"],
+        svec!["b", "24", "the quick brown fox", "10788366602312130446"],
+        svec!["c", "72", "jumps over the lazy dog", "6378567261782451553"],
+        svec!["d", "7", "I think, therefore I am", "14437068658547852882"],
+        svec!["d", "7", "I think, therefore I am", "14437068658547852882"],
+    ];
+    assert_eq!(got, expected);
+}
+
 #[test]
 fn enumerate_column_name() {
     let wrk = Workdir::new("enum");