Skip to content

Commit

Permalink
Merge pull request #1412 from jqnatividad/polars-csv-comments
Browse files Browse the repository at this point in the history
`sqlp` & `joinp`: both commands now recognize QSV_COMMENT_CHAR env var
  • Loading branch information
jqnatividad authored Nov 11, 2023
2 parents b1e2b05 + f9a1ea4 commit 7ced121
Show file tree
Hide file tree
Showing 4 changed files with 258 additions and 1 deletion.
9 changes: 9 additions & 0 deletions src/cmd/joinp.rs
Original file line number Diff line number Diff line change
Expand Up @@ -168,6 +168,7 @@ Common options:
"#;

use std::{
env,
fs::File,
io::{self, Write},
path::Path,
Expand Down Expand Up @@ -473,6 +474,12 @@ impl Args {
b','
};

// Optional comment-prefix byte, taken from the first byte of the QSV_COMMENT_CHAR
// environment variable. It is `None` when the variable is unset or unreadable, and
// also when it is set to an empty string (previously that case panicked on `unwrap`).
let comment_char: Option<u8> = env::var("QSV_COMMENT_CHAR")
    .ok()
    .and_then(|value| value.as_bytes().first().copied());

let num_rows = if infer_len == 0 {
None
} else {
Expand All @@ -482,6 +489,7 @@ impl Args {
let mut left_lf = LazyCsvReader::new(&self.arg_input1)
.has_header(true)
.with_missing_is_null(self.flag_nulls)
.with_comment_char(comment_char)
.with_separator(delim)
.with_infer_schema_length(num_rows)
.with_try_parse_dates(try_parsedates)
Expand All @@ -497,6 +505,7 @@ impl Args {
let mut right_lf = LazyCsvReader::new(&self.arg_input2)
.has_header(true)
.with_missing_is_null(self.flag_nulls)
.with_comment_char(comment_char)
.with_separator(delim)
.with_infer_schema_length(num_rows)
.with_try_parse_dates(try_parsedates)
Expand Down
7 changes: 7 additions & 0 deletions src/cmd/sqlp.rs
Original file line number Diff line number Diff line change
Expand Up @@ -403,6 +403,12 @@ pub fn run(argv: &[&str]) -> CliResult<()> {
b','
};

// Optional comment-prefix byte, taken from the first byte of the QSV_COMMENT_CHAR
// environment variable. It is `None` when the variable is unset or unreadable, and
// also when it is set to an empty string (previously that case panicked on `unwrap`).
let comment_char: Option<u8> = env::var("QSV_COMMENT_CHAR")
    .ok()
    .and_then(|value| value.as_bytes().first().copied());

let optimization_state = if args.flag_no_optimizations {
// use default optimization state
polars::lazy::frame::OptState {
Expand Down Expand Up @@ -467,6 +473,7 @@ pub fn run(argv: &[&str]) -> CliResult<()> {
let lf = LazyCsvReader::new(table)
.has_header(true)
.with_missing_is_null(true)
.with_comment_char(comment_char)
.with_null_values(Some(NullValues::AllColumns(rnull_values.clone())))
.with_separator(delim)
.with_infer_schema_length(args.flag_infer_len)
Expand Down
207 changes: 206 additions & 1 deletion tests/test_joinp.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,26 @@ macro_rules! joinp_test {
};
}

// Generates a test module named `$name2` holding a single `headers` test.
//
// The test builds the shared fixtures via `setup`, prepares a `joinp` command that
// joins `cities_comments.csv` with `places.csv` on the `city` column, and then hands
// the workdir and the command to `$fun`, which adds any extra flags and performs the
// assertions.
macro_rules! joinp_test_comments {
    ($name2:ident, $fun:expr) => {
        mod $name2 {
            use std::process;

            #[allow(unused_imports)]
            use super::{make_rows, setup};
            use crate::workdir::Workdir;

            #[test]
            fn headers() {
                let wrk = setup(stringify!($name2));
                let mut cmd = wrk.command("joinp");
                // Set the comment character explicitly so the `#`-prefixed rows in
                // cities_comments.csv are skipped regardless of the caller's environment.
                cmd.env("QSV_COMMENT_CHAR", "#");
                cmd.args(&["city", "cities_comments.csv", "city", "places.csv"]);
                $fun(wrk, cmd);
            }
        }
    };
}

fn setup(name: &str) -> Workdir {
let cities = vec![
svec!["city", "state"],
Expand All @@ -28,6 +48,15 @@ fn setup(name: &str) -> Workdir {
svec!["San Francisco", "CA"],
svec!["Buffalo", "NY"],
];
let cities_comments = vec![
svec!["#this is a comment", ""],
svec!["city", "state"],
svec!["Boston", "MA"],
svec!["New York", "NY"],
svec!["#Washington", "DC"],
svec!["San Francisco", "CA"],
svec!["Buffalo", "NY"],
];
let places = vec![
svec!["city", "place"],
svec!["Boston", "Logan Airport"],
Expand All @@ -38,6 +67,7 @@ fn setup(name: &str) -> Workdir {

let wrk = Workdir::new(name);
wrk.create("cities.csv", cities);
wrk.create("cities_comments.csv", cities_comments);
wrk.create("places.csv", places);
wrk
}
Expand Down Expand Up @@ -66,6 +96,22 @@ joinp_test!(joinp_inner, |wrk: Workdir, mut cmd: process::Command| {
assert_eq!(got, expected);
});

// Default join with the commented cities file on the left: only the Boston and
// Buffalo rows are expected in the output.
joinp_test_comments!(
    joinp_inner_comments,
    |workdir: Workdir, mut joinp: process::Command| {
        let actual: Vec<Vec<String>> = workdir.read_stdout(&mut joinp);

        let joined_rows = vec![
            svec!["Boston", "MA", "Logan Airport"],
            svec!["Boston", "MA", "Boston Garden"],
            svec!["Buffalo", "NY", "Ralph Wilson Stadium"],
        ];
        assert_eq!(actual, make_rows(false, joined_rows));
    }
);

joinp_test!(
joinp_outer_left,
|wrk: Workdir, mut cmd: process::Command| {
Expand All @@ -85,6 +131,25 @@ joinp_test!(
}
);

// `--left` join with the commented cities file on the left: every non-comment city
// is expected, with an empty place where nothing matched.
joinp_test_comments!(
    joinp_outer_left_comments,
    |workdir: Workdir, mut joinp: process::Command| {
        joinp.arg("--left");
        let actual: Vec<Vec<String>> = workdir.read_stdout(&mut joinp);

        let joined_rows = vec![
            svec!["Boston", "MA", "Logan Airport"],
            svec!["Boston", "MA", "Boston Garden"],
            svec!["New York", "NY", ""],
            svec!["San Francisco", "CA", ""],
            svec!["Buffalo", "NY", "Ralph Wilson Stadium"],
        ];
        assert_eq!(actual, make_rows(false, joined_rows));
    }
);

joinp_test!(
joinp_outer_left_filter_left,
|wrk: Workdir, mut cmd: process::Command| {
Expand All @@ -101,6 +166,22 @@ joinp_test!(
}
);

// `--left` join combined with `--filter-left "city = 'Boston'"`: only the two Boston
// rows are expected.
joinp_test_comments!(
    joinp_outer_left_filter_left_comments,
    |workdir: Workdir, mut joinp: process::Command| {
        joinp.arg("--left");
        joinp.args(["--filter-left", "city = 'Boston'"]);
        let actual: Vec<Vec<String>> = workdir.read_stdout(&mut joinp);

        let joined_rows = vec![
            svec!["Boston", "MA", "Logan Airport"],
            svec!["Boston", "MA", "Boston Garden"],
        ];
        assert_eq!(actual, make_rows(false, joined_rows));
    }
);

joinp_test!(
joinp_inner_filter_right,
|wrk: Workdir, mut cmd: process::Command| {
Expand All @@ -111,6 +192,16 @@ joinp_test!(
}
);

// Default join combined with `--filter-right "place ~* 'w'"`: only the Buffalo row is
// expected.
joinp_test_comments!(
    joinp_inner_filter_right_comments,
    |workdir: Workdir, mut joinp: process::Command| {
        joinp.args(["--filter-right", "place ~* 'w'"]);
        let actual: Vec<Vec<String>> = workdir.read_stdout(&mut joinp);

        let joined_rows = vec![svec!["Buffalo", "NY", "Ralph Wilson Stadium"]];
        assert_eq!(actual, make_rows(false, joined_rows));
    }
);

joinp_test!(
joinp_outer_left_validate_none,
|wrk: Workdir, mut cmd: process::Command| {
Expand All @@ -130,6 +221,25 @@ joinp_test!(
}
);

// `--left` join with `--validate none`: the expected rows are the same as for the
// plain `--left` case.
joinp_test_comments!(
    joinp_outer_left_validate_none_comments,
    |workdir: Workdir, mut joinp: process::Command| {
        joinp.arg("--left");
        joinp.args(["--validate", "none"]);
        let actual: Vec<Vec<String>> = workdir.read_stdout(&mut joinp);

        let joined_rows = vec![
            svec!["Boston", "MA", "Logan Airport"],
            svec!["Boston", "MA", "Boston Garden"],
            svec!["New York", "NY", ""],
            svec!["San Francisco", "CA", ""],
            svec!["Buffalo", "NY", "Ralph Wilson Stadium"],
        ];
        assert_eq!(actual, make_rows(false, joined_rows));
    }
);

// joinp_test!(
// joinp_outer_left_validate_manytoone,
// |wrk: Workdir, mut cmd: process::Command| {
Expand All @@ -142,7 +252,7 @@ joinp_test!(
// );
// wrk.assert_err(&mut cmd);
// }
// );x
// );

joinp_test!(joinp_full, |wrk: Workdir, mut cmd: process::Command| {
cmd.arg("--full");
Expand Down Expand Up @@ -172,6 +282,37 @@ joinp_test!(joinp_full, |wrk: Workdir, mut cmd: process::Command| {
assert!(got == expected1 || got == expected2);
});

// `--full` join with the commented cities file on the left. The two unmatched
// left-side rows (San Francisco, New York) are accepted in either order, so the
// output is checked against both permutations.
joinp_test_comments!(
    joinp_full_comments,
    |workdir: Workdir, mut joinp: process::Command| {
        joinp.arg("--full");
        let got: Vec<Vec<String>> = workdir.read_stdout(&mut joinp);

        // Variant 1: San Francisco listed before New York.
        let san_francisco_first = vec![
            svec!["Boston", "MA", "Logan Airport"],
            svec!["Boston", "MA", "Boston Garden"],
            svec!["Buffalo", "NY", "Ralph Wilson Stadium"],
            svec!["Orlando", "", "Disney World"],
            svec!["San Francisco", "CA", ""],
            svec!["New York", "NY", ""],
        ];
        let expected1 = make_rows(false, san_francisco_first);

        // Variant 2: New York listed before San Francisco.
        let new_york_first = vec![
            svec!["Boston", "MA", "Logan Airport"],
            svec!["Boston", "MA", "Boston Garden"],
            svec!["Buffalo", "NY", "Ralph Wilson Stadium"],
            svec!["Orlando", "", "Disney World"],
            svec!["New York", "NY", ""],
            svec!["San Francisco", "CA", ""],
        ];
        let expected2 = make_rows(false, new_york_first);

        assert!(got == expected1 || got == expected2);
    }
);

joinp_test!(
joinp_left_semi,
|wrk: Workdir, mut cmd: process::Command| {
Expand All @@ -182,6 +323,16 @@ joinp_test!(
}
);

// `--left-semi` join: only the Boston and Buffalo rows are expected, with just the
// city and state values.
joinp_test_comments!(
    joinp_left_semi_comments,
    |workdir: Workdir, mut joinp: process::Command| {
        joinp.arg("--left-semi");
        let actual: Vec<Vec<String>> = workdir.read_stdout(&mut joinp);

        let kept_rows = vec![svec!["Boston", "MA"], svec!["Buffalo", "NY"]];
        assert_eq!(actual, make_rows(true, kept_rows));
    }
);

joinp_test!(
joinp_left_anti,
|wrk: Workdir, mut cmd: process::Command| {
Expand All @@ -195,6 +346,19 @@ joinp_test!(
}
);

// `--left-anti` join: only the New York and San Francisco rows are expected, with
// just the city and state values.
joinp_test_comments!(
    joinp_left_anti_comments,
    |workdir: Workdir, mut joinp: process::Command| {
        joinp.arg("--left-anti");
        let actual: Vec<Vec<String>> = workdir.read_stdout(&mut joinp);

        let kept_rows = vec![svec!["New York", "NY"], svec!["San Francisco", "CA"]];
        assert_eq!(actual, make_rows(true, kept_rows));
    }
);

#[test]
fn joinp_cross() {
let wrk = Workdir::new("join_cross");
Expand Down Expand Up @@ -260,6 +424,47 @@ fn joinp_asof_date() {
assert_eq!(got, expected);
}

// Asof join on `date` where both inputs contain `#`-prefixed comment rows:
// gdp.csv has one before its header and population.csv has one between data rows.
// The expected output contains no trace of either comment row.
#[test]
fn joinp_asof_dat_comments() {
    let wrk = Workdir::new("join_asof_date_comments");
    wrk.create(
        "gdp.csv",
        vec![
            svec!["#comment", "here"],
            svec!["date", "gdp"],
            svec!["2016-01-01", "4164"],
            svec!["2017-01-01", "4411"],
            svec!["2018-01-01", "4566"],
            svec!["2019-01-01", "4696"],
        ],
    );
    wrk.create(
        "population.csv",
        vec![
            svec!["date", "population"],
            svec!["2016-05-12", "82.19"],
            svec!["2017-05-12", "82.66"],
            svec!["#comment", "in the middle"],
            svec!["2018-05-12", "83.12"],
            svec!["2019-05-12", "83.52"],
        ],
    );

    let mut cmd = wrk.command("joinp");
    // Set the comment character explicitly so the test does not depend on
    // QSV_COMMENT_CHAR already being defined in the caller's environment.
    cmd.env("QSV_COMMENT_CHAR", "#");
    cmd.arg("--asof")
        .args(["date", "population.csv", "date", "gdp.csv"]);

    let got: Vec<Vec<String>> = wrk.read_stdout(&mut cmd);
    let expected = vec![
        svec!["date", "population", "gdp"],
        svec!["2016-05-12", "82.19", "4164"],
        svec!["2017-05-12", "82.66", "4411"],
        svec!["2018-05-12", "83.12", "4566"],
        svec!["2019-05-12", "83.52", "4696"],
    ];
    assert_eq!(got, expected);
}

#[test]
fn joinp_asofby_1() {
let wrk = Workdir::new("join_asofby_timeseries");
Expand Down
36 changes: 36 additions & 0 deletions tests/test_sqlp.rs
Original file line number Diff line number Diff line change
Expand Up @@ -723,6 +723,42 @@ fn sqlp_boston311_try_parsedates_format() {
assert_eq!(got, expected);
}

// Runs `sqlp` with QSV_COMMENT_CHAR set to `#` over a CSV that has `#`-prefixed rows
// both before the header and between data rows. The query output is expected to hold
// only the `a`, `c` and `e` rows, ordered by `column2` descending.
#[test]
fn sqlp_comments() {
    let wrk = Workdir::new("sqlp_comments");

    let input_rows = vec![
        // `#`-prefixed rows ahead of the real header
        svec!["# test file to see how comments work", ""],
        svec!["# this is another comment before the header", ""],
        svec!["# DATA DICTIONARY", ""],
        svec!["# column1 - alphabetic; id of the column", ""],
        svec!["# column2 - numeric; just a number", ""],
        // header and data, with `#`-prefixed rows interleaved
        svec!["column1", "column2"],
        svec!["a", "1"],
        svec!["#b", "2"],
        svec!["c", "3"],
        svec!["#d - this row is corrupted skip", "extra col2"],
        svec!["e", "5"],
    ];
    wrk.create("comments.csv", input_rows);

    let mut sqlp = wrk.command("sqlp");
    sqlp.env("QSV_COMMENT_CHAR", "#")
        .arg("comments.csv")
        .arg("select column1, column2 from comments order by column2 desc");

    let actual: Vec<Vec<String>> = wrk.read_stdout(&mut sqlp);
    let wanted = vec![
        svec!["column1", "column2"],
        svec!["e", "5"],
        svec!["c", "3"],
        svec!["a", "1"],
    ];
    assert_eq!(actual, wanted);
}

#[test]
fn sqlp_boston311_explain() {
let wrk = Workdir::new("sqlp_boston311_explain");
Expand Down

0 comments on commit 7ced121

Please sign in to comment.