From 80e3ba86b81503b39178d28a095dd7d1fc6cb4f9 Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Mon, 18 Sep 2023 05:21:47 -0400 Subject: [PATCH 1/4] `deps`: bump polars to 0.33 from 0.32 https://github.com/pola-rs/polars/releases/tag/rs-0.33.0 --- Cargo.lock | 107 ++++++++++++++++++++++++++--------------------------- Cargo.toml | 2 +- 2 files changed, 54 insertions(+), 55 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 4e9644e4e..f7f3caeb3 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -474,9 +474,9 @@ dependencies = [ [[package]] name = "arrow2" -version = "0.17.4" +version = "0.18.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "59c468daea140b747d781a1da9f7db5f0a8e6636d4af20cc539e43d05b0604fa" +checksum = "963fef509b757bcbbf9e5ffa23bcb345614d99f4f6f531f97417b27b8604d389" dependencies = [ "ahash 0.8.3", "arrow-format", @@ -491,13 +491,14 @@ dependencies = [ "futures", "getrandom", "hash_hasher", + "hashbrown 0.14.0", "lexical-core", "lz4", "multiversion", "num-traits", "parquet2", "regex", - "regex-syntax 0.6.29", + "regex-syntax 0.7.5", "rustc_version", "simdutf8", "streaming-iterator", @@ -557,7 +558,7 @@ checksum = "16e62a023e7c117e27523144c5d2459f4397fcc3cab0085af8e2224f643a0193" dependencies = [ "proc-macro2", "quote", - "syn 2.0.35", + "syn 2.0.37", ] [[package]] @@ -568,7 +569,7 @@ checksum = "bc00ceb34980c03614e35a3a4e218276a0a824e911d07651cd0d858a51e8c0f0" dependencies = [ "proc-macro2", "quote", - "syn 2.0.35", + "syn 2.0.37", ] [[package]] @@ -647,7 +648,7 @@ dependencies = [ "regex", "rustc-hash", "shlex", - "syn 2.0.35", + "syn 2.0.37", "which", ] @@ -826,7 +827,7 @@ checksum = "965ab7eb5f8f97d2a083c799f3a1b994fc397b2fe2da5d1da1626ce15a39f2b1" dependencies = [ "proc-macro2", "quote", - "syn 2.0.35", + "syn 2.0.37", ] [[package]] @@ -1578,7 +1579,7 @@ dependencies = [ "once_cell", "proc-macro2", "quote", - "syn 2.0.35", + "syn 2.0.37", ] [[package]] @@ -1883,7 +1884,7 
@@ checksum = "89ca545a94061b6365f2c7355b4b32bd20df3ff95f02da9329b34ccc3bd6ee72" dependencies = [ "proc-macro2", "quote", - "syn 2.0.35", + "syn 2.0.37", ] [[package]] @@ -2697,13 +2698,12 @@ checksum = "1a9bad9f94746442c783ca431b22403b519cd7fbeed0533fdd6328b2f2212128" [[package]] name = "local-channel" -version = "0.1.3" +version = "0.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7f303ec0e94c6c54447f84f3b0ef7af769858a9c4ef56ef2a986d3dcd4c3fc9c" +checksum = "e0a493488de5f18c8ffcba89eebb8532ffc562dc400490eb65b84893fae0b178" dependencies = [ "futures-core", "futures-sink", - "futures-util", "local-waker", ] @@ -3308,7 +3308,7 @@ checksum = "4359fd9c9171ec6e8c62926d6faaf553a8dc3f64e1507e76da7911b4f6a04405" dependencies = [ "proc-macro2", "quote", - "syn 2.0.35", + "syn 2.0.37", ] [[package]] @@ -3340,9 +3340,9 @@ dependencies = [ [[package]] name = "polars" -version = "0.32.1" +version = "0.33.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b1362d4a136c0ebacb40d88a37ba361738b222fd8a2ee9340a3d8642f698c52b" +checksum = "3030de163b9ff2c9dac9a12dcb9be25cc0f2bc7c8e7cd2e4b2592ebed458ce6a" dependencies = [ "getrandom", "polars-core", @@ -3356,9 +3356,9 @@ dependencies = [ [[package]] name = "polars-arrow" -version = "0.32.1" +version = "0.33.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f967c901fa5da4ca7f64e813d1268488ba97e9b3004cefc579ff851c197a1138" +checksum = "35cd38a64fb389fd990e4efd433a36331c995c981d353bfef83b5de4d87f1828" dependencies = [ "arrow2", "hashbrown 0.14.0", @@ -3371,9 +3371,9 @@ dependencies = [ [[package]] name = "polars-core" -version = "0.32.1" +version = "0.33.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b24f92fc5b167f668ff85ab9607dfa72e2c09664cacef59297ee8601dee60126" +checksum = "08367c014c07fa8f141680e024f926cab3a1fe839605a8fcf2223647eb45ca71" dependencies = [ "ahash 0.8.3", "arrow2", @@ -3402,9 +3402,9 @@ 
dependencies = [ [[package]] name = "polars-error" -version = "0.32.1" +version = "0.33.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "40d09c3a7337e53b38c37b57999038440fa39c6801b9ba48afaecd8e16f7ac0a" +checksum = "9b20a09651a299979354945819dc2ce017964b80b916954e9d2ce39002a5f949" dependencies = [ "arrow2", "regex", @@ -3413,17 +3413,15 @@ dependencies = [ [[package]] name = "polars-io" -version = "0.32.1" +version = "0.33.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "92cab0df9f2a35702fa5aec99edfaabf9ae8e9cdd0acf69e143ad2d132f34f9c" +checksum = "88cf4a89c18a90ac20dfbcdfd19ab50ad4ac5a76fc7bb775d3c28bb738cf1f34" dependencies = [ "ahash 0.8.3", "arrow2", - "async-trait", "bytes", "chrono", "fast-float", - "futures", "home", "lexical", "lexical-core", @@ -3442,14 +3440,13 @@ dependencies = [ "serde_json", "simd-json", "simdutf8", - "tokio", ] [[package]] name = "polars-json" -version = "0.32.1" +version = "0.33.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9e58094557cf6446808c7348dcb797885db61815857f6ea02924b35505566e94" +checksum = "d6d5666176d681706aef5a06a57597c83391948b3d958f9fbe9b4cf016527eb8" dependencies = [ "ahash 0.8.3", "arrow2", @@ -3465,9 +3462,9 @@ dependencies = [ [[package]] name = "polars-lazy" -version = "0.32.1" +version = "0.33.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2c33762ec2a55e01c9f8776b34db86257c70a0a3b3929bd4eb91a52aacf61456" +checksum = "5110eab438848c981cc5f541fbc5b21bb263fd707000b4715233074fb2630fcf" dependencies = [ "ahash 0.8.3", "bitflags 2.4.0", @@ -3489,9 +3486,9 @@ dependencies = [ [[package]] name = "polars-ops" -version = "0.32.1" +version = "0.33.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e825575c96302d2daedfc205a0062180033c92c55bcd6aafc4e109d4d8849ed0" +checksum = "7740d7bc4c2ca08044f9ef599638e116fdd7d687e80d1974b698e390c6ce4252" dependencies = [ 
"argminmax", "arrow2", @@ -3501,15 +3498,16 @@ dependencies = [ "polars-arrow", "polars-core", "polars-utils", + "regex", "smartstring", "version_check", ] [[package]] name = "polars-pipe" -version = "0.32.1" +version = "0.33.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1f2bc9a12da9ed043fb0cb51dbcb87b365e4845b7ab6399d7a81e838460c6974" +checksum = "1f30c5e77c5594ddc958a46fe2e021da2feba9c94e767e1d798bd82ac5a33c3b" dependencies = [ "crossbeam-channel", "crossbeam-queue", @@ -3530,9 +3528,9 @@ dependencies = [ [[package]] name = "polars-plan" -version = "0.32.1" +version = "0.33.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fb67b014f0295e8e9dbb84404a91d666d477b3bc248a2ed51bc442833b16da35" +checksum = "678cbeb730e29e50f0f8d844102d15454fc6113a74c667eab046c0e4a4322a9e" dependencies = [ "ahash 0.8.3", "arrow2", @@ -3552,9 +3550,9 @@ dependencies = [ [[package]] name = "polars-row" -version = "0.32.1" +version = "0.33.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "27f54c1956027bf6301948fb4f2837cf6d6b638d8dd1edf3aaeaa19906a986be" +checksum = "c52ef8885b9d13f848839594fbab21ad79fc63f7e11c19cdc2cfe9bb03c313ac" dependencies = [ "arrow2", "polars-error", @@ -3563,9 +3561,9 @@ dependencies = [ [[package]] name = "polars-sql" -version = "0.32.1" +version = "0.33.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dbfcb15cf8eebd25ea1724109d0153817cd484c6326290585f0736b4e7fcf2f4" +checksum = "4d716855267e3516f722287f68cf10e650e33f7197df83a79e680602471456fc" dependencies = [ "polars-arrow", "polars-core", @@ -3578,9 +3576,9 @@ dependencies = [ [[package]] name = "polars-time" -version = "0.32.1" +version = "0.33.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "53f42d2632f5971c9575041d33cbcfb1f996900c40bbf58bc6eb0a0c5efbecea" +checksum = "a2eb75a24f11b55a400b52dc19a2a3e949aaaa46a911f99496de4485b1127063" dependencies = [ 
"arrow2", "atoi", @@ -3597,11 +3595,12 @@ dependencies = [ [[package]] name = "polars-utils" -version = "0.32.1" +version = "0.33.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c326708a370d71dc6e11a8f4bbc10a8479e1c314dc048ba73543b815cd0bf339" +checksum = "2a4a5e743509096322cad39104d56e329fe2748483a3354a0f0c354724f3cef6" dependencies = [ "ahash 0.8.3", + "bytemuck", "hashbrown 0.14.0", "num-traits", "once_cell", @@ -3674,7 +3673,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ae005bd773ab59b4725093fd7df83fd7892f7d8eafb48dbd7de6e024e4215f9d" dependencies = [ "proc-macro2", - "syn 2.0.35", + "syn 2.0.37", ] [[package]] @@ -4535,7 +4534,7 @@ checksum = "4eca7ac642d82aa35b60049a6eccb4be6be75e599bd2e9adb5f875a737654af2" dependencies = [ "proc-macro2", "quote", - "syn 2.0.35", + "syn 2.0.37", ] [[package]] @@ -4607,7 +4606,7 @@ checksum = "91d129178576168c589c9ec973feedf7d3126c01ac2bf08795109aa35b69fb8f" dependencies = [ "proc-macro2", "quote", - "syn 2.0.35", + "syn 2.0.37", ] [[package]] @@ -4920,7 +4919,7 @@ dependencies = [ "proc-macro2", "quote", "rustversion", - "syn 2.0.35", + "syn 2.0.37", ] [[package]] @@ -4942,9 +4941,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.35" +version = "2.0.37" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "59bf04c28bee9043ed9ea1e41afc0552288d3aba9c6efdd78903b802926f4879" +checksum = "7303ef2c05cd654186cb250d29049a24840ca25d2747c25c0381c8d9e2f582e8" dependencies = [ "proc-macro2", "quote", @@ -5044,7 +5043,7 @@ checksum = "49922ecae66cc8a249b77e68d1d0623c1b2c514f0060c27cdc68bd62a1219d35" dependencies = [ "proc-macro2", "quote", - "syn 2.0.35", + "syn 2.0.37", ] [[package]] @@ -5152,7 +5151,7 @@ checksum = "630bdcf245f78637c13ec01ffae6187cca34625e8c63150d424b59e55af2675e" dependencies = [ "proc-macro2", "quote", - "syn 2.0.35", + "syn 2.0.37", ] [[package]] @@ -5474,7 +5473,7 @@ dependencies = [ "once_cell", "proc-macro2", 
"quote", - "syn 2.0.35", + "syn 2.0.37", "wasm-bindgen-shared", ] @@ -5508,7 +5507,7 @@ checksum = "54681b18a46765f095758388f2d0cf16eb8d4169b639ab575a8f5693af210c7b" dependencies = [ "proc-macro2", "quote", - "syn 2.0.35", + "syn 2.0.37", "wasm-bindgen-backend", "wasm-bindgen-shared", ] diff --git a/Cargo.toml b/Cargo.toml index 8fae1799a..2431a4465 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -129,7 +129,7 @@ mlua = { version = "0.9", features = [ ], optional = true } num_cpus = "1" odht = "0.3" -polars = { version = "0.32", features = [ +polars = { version = "0.33", features = [ "lazy", "streaming", "object", From 3930294de908b7e1b882612aba099abf12aa21fd Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Mon, 18 Sep 2023 05:23:09 -0400 Subject: [PATCH 2/4] `sqlp` & `joinp`: OptState struct has two new members `fast_projection: true` and `eager: false` --- src/cmd/joinp.rs | 2 ++ src/cmd/sqlp.rs | 2 ++ 2 files changed, 4 insertions(+) diff --git a/src/cmd/joinp.rs b/src/cmd/joinp.rs index 8ab2807cb..380308d4d 100644 --- a/src/cmd/joinp.rs +++ b/src/cmd/joinp.rs @@ -389,6 +389,8 @@ impl JoinStruct { comm_subplan_elim: true, comm_subexpr_elim: true, streaming: self.streaming, + fast_projection: true, + eager: false, } }; log::debug!("Optimization state: {optimization_state:?}"); diff --git a/src/cmd/sqlp.rs b/src/cmd/sqlp.rs index 460b4dc65..f7e73063a 100644 --- a/src/cmd/sqlp.rs +++ b/src/cmd/sqlp.rs @@ -393,6 +393,8 @@ pub fn run(argv: &[&str]) -> CliResult<()> { comm_subplan_elim: !args.flag_low_memory, comm_subexpr_elim: true, streaming: args.flag_low_memory, + fast_projection: true, + eager: false, } }; // gated by log::log_enabled!(log::Level::Debug) to avoid the From 94ca07bd009d6a7dcebb6269675b756901f939f9 Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Mon, 18 Sep 2023 05:24:15 -0400 Subject: [PATCH 3/4] `sqlp`: --try-parsedates is stricter now, needed to add 
`--ignore-errors` to pass tests see https://github.com/pola-rs/polars/pull/10877 --- tests/test_sqlp.rs | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/tests/test_sqlp.rs b/tests/test_sqlp.rs index 8fc13c3cd..c08c1cf83 100644 --- a/tests/test_sqlp.rs +++ b/tests/test_sqlp.rs @@ -582,10 +582,13 @@ fn sqlp_boston311_try_parsedates() { let test_file = wrk.load_test_file("boston311-100.csv"); let mut cmd = wrk.command("sqlp"); - cmd.arg(&test_file).arg("--try-parsedates").arg( - "select ward, cast(avg(closed_dt - open_dt) as float) as avg_tat from _t_1 where \ - case_status = 'Closed' group by ward order by avg_tat desc, ward asc", - ); + cmd.arg(&test_file) + .arg("--try-parsedates") + .arg( + "select ward, cast(avg(closed_dt - open_dt) as float) as avg_tat from _t_1 where \ + case_status = 'Closed' group by ward order by avg_tat desc, ward asc", + ) + .arg("--ignore-errors"); let got: Vec<Vec<String>> = wrk.read_stdout(&mut cmd); let expected = vec![ @@ -644,7 +647,8 @@ fn sqlp_boston311_try_parsedates_precision() { .arg( "select ward, cast(avg(closed_dt - open_dt) as float) as avg_tat from _t_1 where \ case_status = 'Closed' group by ward order by avg_tat desc, ward asc limit 5", - ); + ) + .arg("--ignore-errors"); let got: Vec<Vec<String>> = wrk.read_stdout(&mut cmd); let expected = vec![ @@ -667,7 +671,8 @@ fn sqlp_boston311_try_parsedates_format() { cmd.arg(&test_file) .arg("--try-parsedates") .args(["--datetime-format", "%a %Y-%m-%d %H:%M:%S"]) - .arg("select closed_dt, open_dt from _t_1 where case_status = 'Closed' limit 5"); + .arg("select closed_dt, open_dt from _t_1 where case_status = 'Closed' limit 5") + .arg("--ignore-errors"); let got: Vec<Vec<String>> = wrk.read_stdout(&mut cmd); let expected = vec![ From d7e643f4f61c2c6927756a3c34ee08965d19e558 Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Mon, 18 Sep 2023 05:31:08 -0400 Subject: [PATCH 4/4] `sqlp`: add note about `--try-parsedates` and
`--ignore-errors` --- src/cmd/sqlp.rs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/cmd/sqlp.rs b/src/cmd/sqlp.rs index f7e73063a..19b563a0e 100644 --- a/src/cmd/sqlp.rs +++ b/src/cmd/sqlp.rs @@ -111,6 +111,9 @@ sqlp options: POLARS CSV PARSING OPTIONS: --try-parsedates Automatically try to parse dates/datetimes and time. If parsing fails, columns remain as strings. + Note that if dates are not well-formatted in your CSV, + you may want to try to set `--ignore-errors` to relax + the CSV parsing of dates. --infer-len The number of rows to scan when inferring the schema of the CSV. Set to 0 to do a full table scan (warning: very slow). (default: 250)