Skip to content

Commit

Permalink
joinp: handle snappy compressed right input
Browse files Browse the repository at this point in the history
  • Loading branch information
jqnatividad committed Jul 23, 2024
1 parent 28e3dcf commit 015e5a5
Show file tree
Hide file tree
Showing 2 changed files with 40 additions and 14 deletions.
43 changes: 31 additions & 12 deletions src/cmd/joinp.rs
Original file line number Diff line number Diff line change
Expand Up @@ -590,7 +590,18 @@ impl Args {
if input2_path.extension().and_then(std::ffi::OsStr::to_str) == Some("sz") {
let decompressed_path =
util::decompress_snappy_file(&input2_path.to_path_buf(), tmpdir)?;
self.arg_input2 = decompressed_path;
let decomp_path = if decompressed_path.ends_with("__qsv_temp_decompressed") {
// use a regular expression to extract the original file name
// the original file name is between "qsv__" and "__qsv_temp_decompressed"
let re =
regex::Regex::new(r"qsv__(?P<filename>.*)__qsv_temp_decompressed").unwrap();
let caps = re.captures(&decompressed_path).unwrap();
let filename = caps.name("filename").unwrap().as_str();
filename.to_string()
} else {
decompressed_path.clone()
};
self.arg_input2 = decomp_path;
}

LazyCsvReader::new(&self.arg_input2)
Expand Down Expand Up @@ -635,22 +646,30 @@ impl Args {
}
}

/// Picks the field delimiter for `file` based on its extension.
///
/// The extension is compared case-insensitively:
/// - `tsv` or `tab` selects a tab (`b'\t'`)
/// - `ssv` selects a semicolon (`b';'`)
/// - anything else (including no extension, or an extension that is not
///   valid UTF-8) falls back to `orig_delim`, the delimiter specified by
///   the user.
///
/// Only the final extension is considered, so a name such as
/// `places.ssv.sz` is treated as `sz` and yields `orig_delim`.
pub fn tsvssv_delim<P: AsRef<Path>>(file: P, orig_delim: u8) -> u8 {
    // Normalize once so the match below can use plain lowercase literals.
    let inputfile_extension = file
        .as_ref()
        .extension()
        .and_then(std::ffi::OsStr::to_str)
        .unwrap_or_default()
        .to_ascii_lowercase();

    match inputfile_extension.as_str() {
        "tsv" | "tab" => b'\t',
        "ssv" => b';',
        _ => orig_delim,
    }
}
11 changes: 9 additions & 2 deletions tests/test_joinp.rs
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ macro_rules! joinp_test_comments {
let wrk = setup(stringify!($name2));
let mut cmd = wrk.command("joinp");
cmd.env("QSV_COMMENT_CHAR", "#");
cmd.args(&["city", "cities_comments.csv", "city", "places.csv"]);
cmd.args(&["city", "cities_comments.csv", "city", "places.ssv"]);
$fun(wrk, cmd);
}
}
Expand All @@ -74,7 +74,7 @@ macro_rules! joinp_test_compressed {
fn headers() {
let wrk = setup(stringify!($name3));
let mut cmd = wrk.command("joinp");
cmd.args(&["city", "cities.csv.sz", "city", "places.csv.sz"]);
cmd.args(&["city", "cities.csv.sz", "city", "places.ssv.sz"]);
$fun(wrk, cmd);
}
}
Expand Down Expand Up @@ -131,6 +131,13 @@ fn setup(name: &str) -> Workdir {
.args(["--output", &out_file2]);
wrk.assert_success(&mut cmd2);

let out_file3 = wrk.path("places.ssv.sz").to_string_lossy().to_string();
let mut cmd3 = wrk.command("snappy");
cmd3.arg("compress")
.arg("places.ssv")
.args(["--output", &out_file3]);
wrk.assert_success(&mut cmd3);

wrk
}

Expand Down

0 comments on commit 015e5a5

Please sign in to comment.