Skip to content

Commit

Permalink
Merge pull request #1902 from jqnatividad/1888-enum-hash-option
Browse files Browse the repository at this point in the history
`enum`: add  `--hash` option to create a platform-independent deterministic id
  • Loading branch information
jqnatividad authored Jun 21, 2024
2 parents 332544e + 4597b1a commit 0dc8973
Show file tree
Hide file tree
Showing 4 changed files with 246 additions and 1 deletion.
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -225,7 +225,7 @@ uuid = { version = "1", features = ["v4"] }
url = "2.5"
vader_sentiment = { version = "0.1", optional = true }
whatlang = { version = "0.16", optional = true }

xxhash-rust = { version = "0.8", features = ["xxh3"] }

[target.'cfg(not(target_arch = "aarch64"))'.dependencies]
simdutf8 = "0.1"
Expand Down
98 changes: 98 additions & 0 deletions src/cmd/enumerate.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,13 @@ The enum function has four modes of operation:
4. COPY. Copy the contents of a column to a new one:
$ qsv enum --copy names
5. HASH. Create a new column filled with the hash of a given column/s:
$ qsv enum --hash 1- // hash all columns
$ qsv enum --hash col2,col3,col4 // hash specific columns
$ qsv enum --hash col2 // hash a single column
$ qsv enum --hash /record_id|name|address/ // hash columns that match a regex
$ qsv enum --hash !/record_id/ // hash all columns except the record_id column
Finally, note that you should also be able to shuffle the lines of a CSV file
by sorting on the generated uuids:
$ qsv enum --uuid file.csv | qsv sort -s uuid > shuffled.csv
Expand All @@ -42,6 +49,16 @@ enum options:
--uuid When set, the column will be populated with
uuids (v4) instead of the incremental identifier.
Changes the default column name to "uuid".
--hash <columns> Create a new column filled with the hash of the
given column/s. Use "1-" to hash all columns.
Changes the default column name to "hash".
Will remove an existing "hash" column if it exists.
The columns argument specifies the columns to use
in the hash. Columns can be referenced by name or index,
starting at 1. Specify multiple columns by separating
them with a comma. Specify a range of columns with `-`.
(See 'qsv select --help' for the full syntax.)
Common options:
-h, --help Display this message
Expand All @@ -54,6 +71,7 @@ Common options:

use serde::Deserialize;
use uuid::Uuid;
use xxhash_rust::xxh3::xxh3_64;

use crate::{
config::{Config, Delimiter},
Expand All @@ -72,6 +90,7 @@ struct Args {
flag_constant: Option<String>,
flag_copy: Option<SelectColumns>,
flag_uuid: bool,
flag_hash: Option<SelectColumns>,
flag_output: Option<String>,
flag_no_headers: bool,
flag_delimiter: Option<Delimiter>,
Expand All @@ -82,6 +101,7 @@ enum EnumOperation {
Uuid,
Constant,
Copy,
Hash,
}

pub fn run(argv: &[&str]) -> CliResult<()> {
Expand All @@ -94,6 +114,7 @@ pub fn run(argv: &[&str]) -> CliResult<()> {
let mut wtr = Config::new(&args.flag_output).writer()?;

let mut headers = rdr.byte_headers()?.clone();
let mut hash_index = None;

let mut copy_index = 0;
let mut copy_operation = false;
Expand All @@ -105,6 +126,43 @@ pub fn run(argv: &[&str]) -> CliResult<()> {
copy_operation = true;
}

let mut hash_sel = None;
let mut hash_operation = false;

if let Some(hash_columns) = &args.flag_hash {
// get the index of the column named "hash", if it exists
hash_index = headers.iter().position(|col| col == b"hash");

// get the original selection
rconfig = rconfig.select(hash_columns.clone());
let original_selection = rconfig
.clone()
.select(hash_columns.clone())
.selection(&headers)?;

// Filter out the "hash" column from the original selection, if it exists
let filtered_selection = original_selection
.iter()
.filter(|&&index| index != hash_index.unwrap_or(usize::MAX))
.collect::<Vec<_>>();

// Construct selection string without "hash" column
let selection_string = filtered_selection
.iter()
.map(|&&index| (index + 1).to_string())
.collect::<Vec<String>>()
.join(",");

// Parse the new selection without "hash" column
let no_hash_column_selection = SelectColumns::parse(&selection_string)?;

// Update the configuration with the new selection
rconfig = rconfig.select(no_hash_column_selection);
hash_sel = Some(rconfig.selection(&headers)?);

hash_operation = true;
}

if !rconfig.no_headers {
if let Some(column_name) = &args.flag_new_column {
headers.push_field(column_name.as_bytes());
Expand All @@ -118,6 +176,19 @@ pub fn run(argv: &[&str]) -> CliResult<()> {
Err(e) => return fail_clierror!("Could not parse cell as utf-8!: {e}"),
};
headers.push_field(format!("{current_header}_copy").as_bytes());
} else if hash_operation {
// Remove an existing "hash" column from the header, if it exists
headers = if let Some(hash_index) = hash_index {
headers
.into_iter()
.enumerate()
.filter(|(i, _)| *i != hash_index)
.map(|(_, field)| field)
.collect()
} else {
headers
};
headers.push_field(b"hash");
} else {
headers.push_field(b"index");
};
Expand All @@ -137,6 +208,8 @@ pub fn run(argv: &[&str]) -> CliResult<()> {
EnumOperation::Uuid
} else if copy_operation {
EnumOperation::Copy
} else if args.flag_hash.is_some() {
EnumOperation::Hash
} else {
EnumOperation::Increment
};
Expand All @@ -148,6 +221,8 @@ pub fn run(argv: &[&str]) -> CliResult<()> {
#[allow(unused_assignments)]
let mut colcopy: Vec<u8> = Vec::with_capacity(20);
let increment = args.flag_increment.unwrap_or(1);
let mut hash_string = String::new();
let mut hash;

while rdr.read_byte_record(&mut record)? {
match enum_operation {
Expand All @@ -170,6 +245,29 @@ pub fn run(argv: &[&str]) -> CliResult<()> {
colcopy = record[copy_index].to_vec();
record.push_field(&colcopy);
},
EnumOperation::Hash => {
hash_string.clear();

// build the hash string from the filtered selection
if let Some(ref sel) = hash_sel {
sel.iter()
.for_each(|i| hash_string.push_str(&String::from_utf8_lossy(&record[*i])));
}
hash = xxh3_64(hash_string.as_bytes());

// Drop the pre-existing "hash" column (if there is one) from the output record
record = if let Some(hash_index) = hash_index {
record
.into_iter()
.enumerate()
.filter(|(i, _)| *i != hash_index)
.map(|(_, field)| field)
.collect()
} else {
record
};
record.push_field(hash.to_string().as_bytes());
},
}

wtr.write_byte_record(&record)?;
Expand Down
146 changes: 146 additions & 0 deletions tests/test_enumerate.rs
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,152 @@ fn enumerate_counter_inc() {
assert_eq!(got, expected);
}

/// `enum --hash 1-` on a file without a pre-existing `hash` column:
/// the output is expected to keep every input column and gain a trailing
/// `hash` column. The two identical `d` rows are expected to carry the
/// same hash value.
#[test]
fn enumerate_hash() {
    let workdir = Workdir::new("enumerate_hash");

    // Input fixture; the last two data rows are deliberate duplicates.
    let input_rows = vec![
        svec!["letter", "number", "random_text"],
        svec!["a", "13", "this is a test"],
        svec!["b", "24", "the quick brown fox"],
        svec!["c", "72", "jumps over the lazy dog"],
        svec!["d", "7", "I think, therefore I am"],
        svec!["d", "7", "I think, therefore I am"],
    ];
    workdir.create("data.csv", input_rows);

    // Expected output: original columns followed by the computed hash.
    let expected = vec![
        svec!["letter", "number", "random_text", "hash"],
        svec!["a", "13", "this is a test", "4649922201779202190"],
        svec!["b", "24", "the quick brown fox", "10788366602312130446"],
        svec!["c", "72", "jumps over the lazy dog", "6378567261782451553"],
        svec!["d", "7", "I think, therefore I am", "14437068658547852882"],
        svec!["d", "7", "I think, therefore I am", "14437068658547852882"],
    ];

    let mut enum_cmd = workdir.command("enum");
    enum_cmd.args(&["--hash", "1-"]).arg("data.csv");

    let actual: Vec<Vec<String>> = workdir.read_stdout(&mut enum_cmd);
    assert_eq!(actual, expected);
}

/// `enum --hash !/hash/` on a file whose LAST column is already named
/// `hash`: the stale values (`1`..`5`) are expected to be replaced by
/// freshly computed hashes, with the column layout otherwise unchanged.
#[test]
fn enumerate_hash_replace_old_hash() {
    let workdir = Workdir::new("enumerate_replace_old_hash");

    // Input fixture with a trailing, stale "hash" column.
    let input_rows = vec![
        svec!["letter", "number", "random_text", "hash"],
        svec!["a", "13", "this is a test", "1"],
        svec!["b", "24", "the quick brown fox", "2"],
        svec!["c", "72", "jumps over the lazy dog", "3"],
        svec!["d", "7", "I think, therefore I am", "4"],
        svec!["d", "7", "I think, therefore I am", "5"],
    ];
    workdir.create("data.csv", input_rows);

    // Expected output: the old hash values are gone, new ones take their place.
    let expected = vec![
        svec!["letter", "number", "random_text", "hash"],
        svec!["a", "13", "this is a test", "4649922201779202190"],
        svec!["b", "24", "the quick brown fox", "10788366602312130446"],
        svec!["c", "72", "jumps over the lazy dog", "6378567261782451553"],
        svec!["d", "7", "I think, therefore I am", "14437068658547852882"],
        svec!["d", "7", "I think, therefore I am", "14437068658547852882"],
    ];

    let mut enum_cmd = workdir.command("enum");
    enum_cmd.args(&["--hash", "!/hash/"]).arg("data.csv");

    let actual: Vec<Vec<String>> = workdir.read_stdout(&mut enum_cmd);
    assert_eq!(actual, expected);
}

/// `enum --hash 1-` on a file whose FIRST column is already named `hash`:
/// even though the selection covers every column, the expected output
/// drops the leading stale `hash` column and appends a new one whose
/// values match those computed from the remaining three columns.
#[test]
fn enumerate_hash_replace_old_hash2() {
    let workdir = Workdir::new("enumerate_replace_old_hash2");

    // Input fixture with a leading, stale "hash" column.
    let input_rows = vec![
        svec!["hash", "letter", "number", "random_text"],
        svec!["1", "a", "13", "this is a test"],
        svec!["2", "b", "24", "the quick brown fox"],
        svec!["3", "c", "72", "jumps over the lazy dog"],
        svec!["4", "d", "7", "I think, therefore I am"],
        svec!["5", "d", "7", "I think, therefore I am"],
    ];
    workdir.create("data.csv", input_rows);

    // Expected output: the stale column is removed, the new hash comes last.
    let expected = vec![
        svec!["letter", "number", "random_text", "hash"],
        svec!["a", "13", "this is a test", "4649922201779202190"],
        svec!["b", "24", "the quick brown fox", "10788366602312130446"],
        svec!["c", "72", "jumps over the lazy dog", "6378567261782451553"],
        svec!["d", "7", "I think, therefore I am", "14437068658547852882"],
        svec!["d", "7", "I think, therefore I am", "14437068658547852882"],
    ];

    let mut enum_cmd = workdir.command("enum");
    enum_cmd.args(&["--hash", "1-"]).arg("data.csv");

    let actual: Vec<Vec<String>> = workdir.read_stdout(&mut enum_cmd);
    assert_eq!(actual, expected);
}

/// `enum --hash /letter|number|random_text/`: columns are picked with a
/// regex that matches the three data columns by name. The leading stale
/// `hash` column is expected to be dropped and a new trailing `hash`
/// column written.
#[test]
fn enumerate_hash_regex() {
    let workdir = Workdir::new("enumerate_replace_regex");

    // Input fixture with a leading, stale "hash" column.
    let input_rows = vec![
        svec!["hash", "letter", "number", "random_text"],
        svec!["1", "a", "13", "this is a test"],
        svec!["2", "b", "24", "the quick brown fox"],
        svec!["3", "c", "72", "jumps over the lazy dog"],
        svec!["4", "d", "7", "I think, therefore I am"],
        svec!["5", "d", "7", "I think, therefore I am"],
    ];
    workdir.create("data.csv", input_rows);

    // Expected output: the stale column is removed, the new hash comes last.
    let expected = vec![
        svec!["letter", "number", "random_text", "hash"],
        svec!["a", "13", "this is a test", "4649922201779202190"],
        svec!["b", "24", "the quick brown fox", "10788366602312130446"],
        svec!["c", "72", "jumps over the lazy dog", "6378567261782451553"],
        svec!["d", "7", "I think, therefore I am", "14437068658547852882"],
        svec!["d", "7", "I think, therefore I am", "14437068658547852882"],
    ];

    let mut enum_cmd = workdir.command("enum");
    enum_cmd
        .args(&["--hash", "/letter|number|random_text/"])
        .arg("data.csv");

    let actual: Vec<Vec<String>> = workdir.read_stdout(&mut enum_cmd);
    assert_eq!(actual, expected);
}

/// `enum --hash !/hash/`: an inverted regex selects every column except
/// the one named `hash`. With a leading stale `hash` column in the input,
/// the expected output drops it and appends a new trailing `hash` column.
#[test]
fn enumerate_hash_regex_not() {
    let workdir = Workdir::new("enumerate_replace_regex_not");

    // Input fixture with a leading, stale "hash" column.
    let input_rows = vec![
        svec!["hash", "letter", "number", "random_text"],
        svec!["1", "a", "13", "this is a test"],
        svec!["2", "b", "24", "the quick brown fox"],
        svec!["3", "c", "72", "jumps over the lazy dog"],
        svec!["4", "d", "7", "I think, therefore I am"],
        svec!["5", "d", "7", "I think, therefore I am"],
    ];
    workdir.create("data.csv", input_rows);

    // Expected output: the stale column is removed, the new hash comes last.
    let expected = vec![
        svec!["letter", "number", "random_text", "hash"],
        svec!["a", "13", "this is a test", "4649922201779202190"],
        svec!["b", "24", "the quick brown fox", "10788366602312130446"],
        svec!["c", "72", "jumps over the lazy dog", "6378567261782451553"],
        svec!["d", "7", "I think, therefore I am", "14437068658547852882"],
        svec!["d", "7", "I think, therefore I am", "14437068658547852882"],
    ];

    let mut enum_cmd = workdir.command("enum");
    enum_cmd.args(&["--hash", "!/hash/"]).arg("data.csv");

    let actual: Vec<Vec<String>> = workdir.read_stdout(&mut enum_cmd);
    assert_eq!(actual, expected);
}

#[test]
fn enumerate_column_name() {
let wrk = Workdir::new("enum");
Expand Down

0 comments on commit 0dc8973

Please sign in to comment.