From 8317fcb01524bed09dc27e7005e8d5967bb79036 Mon Sep 17 00:00:00 2001
From: rzmk <30333942+rzmk@users.noreply.github.com>
Date: Fri, 14 Jun 2024 18:48:05 -0400
Subject: [PATCH 1/2] `jsonp`: add `jsonp` command allowing non-nested JSON to
CSV conversion with Polars
---
README.md | 1 +
src/cmd/jsonp.rs | 123 ++++++++++++++++++++++++++++++++++++++++++++
src/cmd/mod.rs | 2 +
src/main.rs | 8 +++
tests/test_jsonp.rs | 86 +++++++++++++++++++++++++++++++
tests/tests.rs | 2 +
6 files changed, 222 insertions(+)
create mode 100644 src/cmd/jsonp.rs
create mode 100644 tests/test_jsonp.rs
diff --git a/README.md b/README.md
index 6c81bc9e8..768bc6e6f 100644
--- a/README.md
+++ b/README.md
@@ -56,6 +56,7 @@
| [join](/src/cmd/join.rs#L2) | Inner, outer, right, cross, anti & semi joins. Automatically creates a simple, in-memory hash index to make it fast. |
| [joinp](/src/cmd/joinp.rs#L2)
✨🚀🐻❄️ | Inner, outer, cross, anti, semi & asof joins using the [Pola.rs](https://www.pola.rs) engine. Unlike the `join` command, `joinp` can process files larger than RAM, is multithreaded, has join key validation, pre-join filtering, supports [asof joins](https://pola-rs.github.io/polars/py-polars/html/reference/dataframe/api/polars.DataFrame.join_asof.html) (which is [particularly useful for time series data](https://github.com/jqnatividad/qsv/blob/30cc920d0812a854fcbfedc5db81788a0600c92b/tests/test_joinp.rs#L509-L983)) & its output doesn't have duplicate columns. However, `joinp` doesn't have an --ignore-case option & it doesn't support right outer joins. |
| [jsonl](/src/cmd/jsonl.rs#L2)
🚀🔣 | Convert newline-delimited JSON ([JSONL](https://jsonlines.org/)/[NDJSON](http://ndjson.org/)) to CSV. See `tojsonl` command to convert CSV to JSONL.
+| [jsonp](/src/cmd/jsonp.rs#L2) | Convert non-nested JSON to CSV. Only available with the polars feature enabled.
|
[luau](/src/cmd/luau.rs#L2) 👑
✨📇🌐🔣 ![CKAN](docs/images/ckan.png) | Create multiple new computed columns, filter rows, compute aggregations and build complex data pipelines by executing a [Luau](https://luau-lang.org) [0.625](https://github.com/Roblox/luau/releases/tag/0.625) expression/script for every row of a CSV file ([sequential mode](https://github.com/jqnatividad/qsv/blob/bb72c4ef369d192d85d8b7cc6e972c1b7df77635/tests/test_luau.rs#L254-L298)), or using [random access](https://www.webopedia.com/definitions/random-access/) with an index ([random access mode](https://github.com/jqnatividad/qsv/blob/bb72c4ef369d192d85d8b7cc6e972c1b7df77635/tests/test_luau.rs#L367-L415)).
Can process a single Luau expression or [full-fledged data-wrangling scripts using lookup tables](https://github.com/dathere/qsv-lookup-tables#example) with discrete BEGIN, MAIN and END sections.
It is not just another qsv command, it is qsv's [Domain-specific Language](https://en.wikipedia.org/wiki/Domain-specific_language) (DSL) with [numerous qsv-specific helper functions](https://github.com/jqnatividad/qsv/blob/113eee17b97882dc368b2e65fec52b86df09f78b/src/cmd/luau.rs#L1356-L2290) to build production data pipelines. |
| [partition](/src/cmd/partition.rs#L2) | Partition a CSV based on a column value. |
| [prompt](/src/cmd/prompt.rs#L2) | Open a file dialog to pick a file. |
diff --git a/src/cmd/jsonp.rs b/src/cmd/jsonp.rs
new file mode 100644
index 000000000..b3d9df38d
--- /dev/null
+++ b/src/cmd/jsonp.rs
@@ -0,0 +1,123 @@
+static USAGE: &str = r#"
+Convert non-nested JSON to CSV (polars feature only).
+
+You may provide JSON data either from stdin or a file path.
+This command may not work with nested JSON data.
+
+As a basic example, say we have a file fruits.json with contents:
+
+[
+    {
+        "fruit": "apple",
+        "price": 2.5
+    },
+    {
+        "fruit": "banana",
+        "price": 3.0
+    }
+]
+
+To convert it to CSV format, run:
+
+qsv jsonp fruits.json
+
+And the following is printed to the terminal:
+
+fruit,price
+apple,2.5
+banana,3.0
+
+If fruits.json was provided using stdin then either use - or do not provide a file path. For example:
+
+cat fruits.json | qsv jsonp -
+
+For more examples, see https://github.com/jqnatividad/qsv/blob/master/tests/test_jsonp.rs.
+
+Usage:
+    qsv jsonp [options] [<input>]
+    qsv jsonp --help
+
+jsonp options:
+    --datetime-format <fmt>   The datetime format to use writing datetimes.
+                              See https://docs.rs/chrono/latest/chrono/format/strftime/index.html
+                              for the list of valid format specifiers.
+    --date-format <fmt>       The date format to use writing dates.
+    --time-format <fmt>       The time format to use writing times.
+    --float-precision <arg>   The number of digits of precision to use when writing floats.
+    --wnull-value <arg>       The string to use when WRITING null values.
+
+Common options:
+    -h, --help                Display this message
+    -o, --output <file>       Write output to <file> instead of stdout.
+"#;
+
+use std::io::{Cursor, Read, Seek, SeekFrom, Write};
+
+use polars::prelude::*;
+use serde::Deserialize;
+
+use crate::{util, CliResult};
+
+/// Command-line arguments for `jsonp`, filled in by `util::get_args` from the `USAGE` text.
+#[derive(Deserialize)]
+struct Args {
+    /// Path of the JSON file to read; `-` or no value means stdin.
+    arg_input: Option<String>,
+    /// Format string handed to the CSV writer for datetime values.
+    flag_datetime_format: Option<String>,
+    /// Format string handed to the CSV writer for date values.
+    flag_date_format: Option<String>,
+    /// Format string handed to the CSV writer for time values.
+    flag_time_format: Option<String>,
+    /// Number of digits of precision used when writing floats.
+    flag_float_precision: Option<usize>,
+    /// Text written for null values; an empty string is used when unset.
+    flag_wnull_value: Option<String>,
+    /// Path of the file to write the CSV to; stdout is used when unset.
+    flag_output: Option<String>,
+}
+
+/// Runs the `jsonp` command: loads a JSON document into a Polars `DataFrame`
+/// and writes it back out as CSV.
+///
+/// The input is the file named by the positional argument, or stdin when that
+/// argument is `-` or missing. The CSV goes to the `--output` path when one is
+/// given and to stdout otherwise.
+///
+/// # Errors
+///
+/// Propagates (via `?`) any failure from argument parsing, reading or parsing
+/// the input, creating the output file, or writing the CSV.
+pub fn run(argv: &[&str]) -> CliResult<()> {
+    let args: Args = util::get_args(USAGE, argv)?;
+
+    // Input helpers: `df_from_stdin` reads all of stdin into memory before parsing it,
+    // while `df_from_path` hands the opened file straight to the JSON reader.
+    fn df_from_stdin() -> PolarsResult<DataFrame> {
+        // Create a buffer in memory for stdin
+        let mut buffer: Vec<u8> = Vec::new();
+        let stdin = std::io::stdin();
+        stdin.lock().read_to_end(&mut buffer)?;
+        Ok(JsonReader::new(Box::new(std::io::Cursor::new(buffer))).finish()?)
+    }
+
+    fn df_from_path(path: String) -> PolarsResult<DataFrame> {
+        Ok(JsonReader::new(std::fs::File::open(path)?).finish()?)
+    }
+
+    let df = match args.arg_input.clone() {
+        Some(path) => {
+            if path == "-" {
+                df_from_stdin()?
+            } else {
+                df_from_path(path)?
+            }
+        },
+        None => df_from_stdin()?,
+    };
+
+    // Writes `df` as CSV to `writer`, applying the formatting flags from `args`.
+    // A byte-order mark is included when `util::get_envvar_flag("QSV_OUTPUT_BOM")` is true.
+    fn df_to_csv<W: Write>(mut writer: W, mut df: DataFrame, args: &Args) -> PolarsResult<()> {
+        CsvWriter::new(&mut writer)
+            .with_datetime_format(args.flag_datetime_format.clone())
+            .with_date_format(args.flag_date_format.clone())
+            .with_time_format(args.flag_time_format.clone())
+            .with_float_precision(args.flag_float_precision.clone())
+            .with_null_value(args.flag_wnull_value.clone().unwrap_or("".to_string()))
+            .include_bom(util::get_envvar_flag("QSV_OUTPUT_BOM"))
+            .finish(&mut df)?;
+        Ok(())
+    }
+
+    if let Some(output_path) = args.flag_output.clone() {
+        let mut output = std::fs::File::create(output_path)?;
+        df_to_csv(&mut output, df, &args)?;
+    } else {
+        // Render the CSV into an in-memory cursor, rewind it, and print the text.
+        // Note: `println!` appends one more newline after the rendered CSV.
+        let mut res = Cursor::new(Vec::new());
+        df_to_csv(&mut res, df, &args)?;
+        res.seek(SeekFrom::Start(0))?;
+        let mut out = String::new();
+        res.read_to_string(&mut out)?;
+        println!("{out}");
+    }
+
+    Ok(())
+}
diff --git a/src/cmd/mod.rs b/src/cmd/mod.rs
index 844d26fe6..db3a7ba57 100644
--- a/src/cmd/mod.rs
+++ b/src/cmd/mod.rs
@@ -46,6 +46,8 @@ pub mod join;
pub mod joinp;
#[cfg(any(feature = "feature_capable", feature = "lite"))]
pub mod jsonl;
+#[cfg(feature = "polars")]
+pub mod jsonp;
#[cfg(feature = "luau")]
pub mod luau;
#[cfg(any(feature = "feature_capable", feature = "lite"))]
diff --git a/src/main.rs b/src/main.rs
index 70385103e..d3816f0e3 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -144,6 +144,10 @@ fn main() -> QsvExitCode {
enabled_commands.push_str(" jsonl Convert newline-delimited JSON files to CSV\n");
+ #[cfg(all(feature = "polars", feature = "feature_capable"))]
+ enabled_commands
+ .push_str(" jsonp Convert non-nested JSON to CSV (polars feature only)\n");
+
#[cfg(all(feature = "luau", feature = "feature_capable"))]
enabled_commands.push_str(" luau Execute Luau script on CSV data\n");
@@ -356,6 +360,8 @@ enum Command {
#[cfg(all(feature = "polars", feature = "feature_capable"))]
JoinP,
Jsonl,
+ #[cfg(all(feature = "polars", feature = "feature_capable"))]
+ JsonP,
#[cfg(all(feature = "luau", feature = "feature_capable"))]
Luau,
Partition,
@@ -445,6 +451,8 @@ impl Command {
#[cfg(all(feature = "polars", feature = "feature_capable"))]
Command::JoinP => cmd::joinp::run(argv),
Command::Jsonl => cmd::jsonl::run(argv),
+ #[cfg(all(feature = "polars", feature = "feature_capable"))]
+ Command::JsonP => cmd::jsonp::run(argv),
#[cfg(all(feature = "luau", feature = "feature_capable"))]
Command::Luau => cmd::luau::run(argv),
Command::Partition => cmd::partition::run(argv),
diff --git a/tests/test_jsonp.rs b/tests/test_jsonp.rs
new file mode 100644
index 000000000..a6c66b383
--- /dev/null
+++ b/tests/test_jsonp.rs
@@ -0,0 +1,86 @@
+use crate::workdir::Workdir;
+
+#[test]
+// Converts a flat JSON array of objects and checks the header row plus every data row.
+fn jsonp_simple() {
+    let wrk = Workdir::new("jsonp_simple");
+    wrk.create_from_string(
+        "data.json",
+        r#"[{"id":1,"father":"Mark","mother":"Charlotte","oldest_child":"Tom","boy":true},
+{"id":2,"father":"John","mother":"Ann","oldest_child":"Jessika","boy":false},
+{"id":3,"father":"Bob","mother":"Monika","oldest_child":"Jerry","boy":true}]"#,
+    );
+    let mut cmd = wrk.command("jsonp");
+    cmd.arg("data.json");
+
+    // Each CSV record is read back as a row of strings.
+    let got: Vec<Vec<String>> = wrk.read_stdout(&mut cmd);
+    let expected = vec![
+        svec!["id", "father", "mother", "oldest_child", "boy"],
+        svec!["1", "Mark", "Charlotte", "Tom", "true"],
+        svec!["2", "John", "Ann", "Jessika", "false"],
+        svec!["3", "Bob", "Monika", "Jerry", "true"],
+    ];
+    assert_eq!(got, expected);
+}
+
+#[test]
+// Feeds stats-shaped JSON records to `jsonp` and checks the CSV text, where the
+// JSON nulls in the input appear as empty fields in the expected output.
+fn jsonp_fruits_stats() {
+    const INPUT_JSON: &str = r#"[{"field":"fruit","type":"String","is_ascii":true,"sum":null,"min":"apple","max":"strawberry","range":null,"min_length":5,"max_length":10,"mean":null,"stddev":null,"variance":null,"nullcount":0,"max_precision":null,"sparsity":0},{"field":"price","type":"Float","is_ascii":null,"sum":7,"min":"1.5","max":"3.0","range":1.5,"min_length":4,"max_length":4,"mean":2.3333,"stddev":0.6236,"variance":0.3889,"nullcount":0,"max_precision":1,"sparsity":0}]"#;
+    const EXPECTED_CSV: &str = r#"field,type,is_ascii,sum,min,max,range,min_length,max_length,mean,stddev,variance,nullcount,max_precision,sparsity
+fruit,String,true,,apple,strawberry,,5,10,,,,0,,0
+price,Float,,7,1.5,3.0,1.5,4,4,2.3333,0.6236,0.3889,0,1,0"#;
+
+    let workdir = Workdir::new("jsonp_fruits_stats");
+    workdir.create_from_string("data.json", INPUT_JSON);
+
+    let mut jsonp = workdir.command("jsonp");
+    jsonp.arg("data.json");
+    let actual: String = workdir.stdout(&mut jsonp);
+
+    assert_eq!(actual, EXPECTED_CSV.to_string());
+}
+
+#[test]
+// Same stats-shaped input as `jsonp_fruits_stats`, run with `--float-precision 2`;
+// the expected float columns carry two decimal places.
+fn jsonp_fruits_stats_fp_2() {
+    const INPUT_JSON: &str = r#"[{"field":"fruit","type":"String","is_ascii":true,"sum":null,"min":"apple","max":"strawberry","range":null,"min_length":5,"max_length":10,"mean":null,"stddev":null,"variance":null,"nullcount":0,"max_precision":null,"sparsity":0},{"field":"price","type":"Float","is_ascii":null,"sum":7,"min":"1.5","max":"3.0","range":1.5,"min_length":4,"max_length":4,"mean":2.3333,"stddev":0.6236,"variance":0.3889,"nullcount":0,"max_precision":1,"sparsity":0}]"#;
+    const EXPECTED_CSV: &str = r#"field,type,is_ascii,sum,min,max,range,min_length,max_length,mean,stddev,variance,nullcount,max_precision,sparsity
+fruit,String,true,,apple,strawberry,,5,10,,,,0,,0
+price,Float,,7,1.5,3.0,1.50,4,4,2.33,0.62,0.39,0,1,0"#;
+
+    let workdir = Workdir::new("jsonp_fruits_stats_fp_2");
+    workdir.create_from_string("data.json", INPUT_JSON);
+
+    let mut jsonp = workdir.command("jsonp");
+    jsonp.arg("data.json");
+    jsonp.args(&["--float-precision", "2"]);
+    let actual: String = workdir.stdout(&mut jsonp);
+
+    assert_eq!(actual, EXPECTED_CSV.to_string());
+}
+
+#[test]
+// Round trip: the output of `qsv stats fruits.csv` must equal the output of
+// `qsv stats fruits.csv | qsv slice --json | qsv jsonp`.
+fn jsonp_fruits_stats_slice_jsonp() {
+    let workdir = Workdir::new("jsonp_fruits_stats_slice_jsonp");
+    let fruits_csv = workdir.load_test_file("fruits.csv");
+
+    // Step 1: `qsv stats fruits.csv`, saved as stats.csv for the next step.
+    let mut stats = workdir.command("stats");
+    stats.arg(fruits_csv);
+    let stats_csv: String = workdir.stdout(&mut stats);
+    workdir.create_from_string("stats.csv", stats_csv.as_str());
+
+    // Step 2: `qsv slice stats.csv --json`, saved as slice.json.
+    let mut slice = workdir.command("slice");
+    slice.arg("stats.csv");
+    slice.arg("--json");
+    let sliced_json: String = workdir.stdout(&mut slice);
+    workdir.create_from_string("slice.json", sliced_json.as_str());
+
+    // Step 3: `qsv jsonp slice.json` should reproduce the stats CSV.
+    let mut jsonp = workdir.command("jsonp");
+    jsonp.arg("slice.json");
+    let round_tripped: String = workdir.stdout(&mut jsonp);
+
+    assert_eq!(stats_csv, round_tripped);
+}
diff --git a/tests/tests.rs b/tests/tests.rs
index af644b1d1..62577f69d 100644
--- a/tests/tests.rs
+++ b/tests/tests.rs
@@ -79,6 +79,8 @@ mod test_join;
mod test_joinp;
#[cfg(any(feature = "feature_capable", feature = "lite"))]
mod test_jsonl;
+#[cfg(feature = "polars")]
+mod test_jsonp;
#[cfg(feature = "luau")]
mod test_luau;
#[cfg(any(feature = "feature_capable", feature = "lite"))]
From b01ab5edaeeaf1936f36ae99e11d94582ebb8d5b Mon Sep 17 00:00:00 2001
From: rzmk <30333942+rzmk@users.noreply.github.com>
Date: Fri, 14 Jun 2024 20:12:23 -0400
Subject: [PATCH 2/2] `jsonp`: fix clippy lints and disable for DP+
---
src/cmd/jsonp.rs | 6 +++---
tests/tests.rs | 2 +-
2 files changed, 4 insertions(+), 4 deletions(-)
diff --git a/src/cmd/jsonp.rs b/src/cmd/jsonp.rs
index b3d9df38d..3c3c83f7b 100644
--- a/src/cmd/jsonp.rs
+++ b/src/cmd/jsonp.rs
@@ -77,11 +77,11 @@ pub fn run(argv: &[&str]) -> CliResult<()> {
let mut buffer: Vec = Vec::new();
let stdin = std::io::stdin();
stdin.lock().read_to_end(&mut buffer)?;
- Ok(JsonReader::new(Box::new(std::io::Cursor::new(buffer))).finish()?)
+ JsonReader::new(Box::new(std::io::Cursor::new(buffer))).finish()
}
fn df_from_path(path: String) -> PolarsResult {
- Ok(JsonReader::new(std::fs::File::open(path)?).finish()?)
+ JsonReader::new(std::fs::File::open(path)?).finish()
}
let df = match args.arg_input.clone() {
@@ -100,7 +100,7 @@ pub fn run(argv: &[&str]) -> CliResult<()> {
.with_datetime_format(args.flag_datetime_format.clone())
.with_date_format(args.flag_date_format.clone())
.with_time_format(args.flag_time_format.clone())
- .with_float_precision(args.flag_float_precision.clone())
+ .with_float_precision(args.flag_float_precision)
.with_null_value(args.flag_wnull_value.clone().unwrap_or("".to_string()))
.include_bom(util::get_envvar_flag("QSV_OUTPUT_BOM"))
.finish(&mut df)?;
diff --git a/tests/tests.rs b/tests/tests.rs
index 62577f69d..9093aeca8 100644
--- a/tests/tests.rs
+++ b/tests/tests.rs
@@ -79,7 +79,7 @@ mod test_join;
mod test_joinp;
#[cfg(any(feature = "feature_capable", feature = "lite"))]
mod test_jsonl;
-#[cfg(feature = "polars")]
+#[cfg(all(feature = "polars", not(feature = "datapusher_plus")))]
mod test_jsonp;
#[cfg(feature = "luau")]
mod test_luau;