Merge remote-tracking branch 'apache/main' into private_make_scalar_f…

…unction
viirya · Jan 22, 2024 · 8e559a8 · 8e559a8
2 parents 64a7642 + edec418
commit 8e559a8
Show file tree

Hide file tree

Showing 143 changed files with 6,500 additions and 2,415 deletions.
diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml
@@ -53,7 +53,7 @@ jobs:
           rust-version: stable
 
       - name: Cache Cargo
-        uses: actions/cache@v3
+        uses: actions/cache@v4
         with:
           path: |
             ~/.cargo/bin/
@@ -383,7 +383,7 @@ jobs:
   #         rustup default stable
   #         rustup component add rustfmt clippy
   #     - name: Cache Cargo
-  #       uses: actions/cache@v3
+  #       uses: actions/cache@v4
   #       with:
   #         path: /home/runner/.cargo
   #         # this key is not equal because the user is different than on a container (runner vs github)

diff --git a/Cargo.toml b/Cargo.toml
@@ -29,7 +29,7 @@ license = "Apache-2.0"
 readme = "README.md"
 repository = "https://github.com/apache/arrow-datafusion"
 rust-version = "1.70"
-version = "34.0.0"
+version = "35.0.0"
 
 [workspace.dependencies]
 arrow = { version = "50.0.0", features = ["prettyprint"] }
@@ -45,17 +45,17 @@ bytes = "1.4"
 chrono = { version = "0.4.31", default-features = false }
 ctor = "0.2.0"
 dashmap = "5.4.0"
-datafusion = { path = "datafusion/core", version = "34.0.0" }
-datafusion-common = { path = "datafusion/common", version = "34.0.0" }
-datafusion-execution = { path = "datafusion/execution", version = "34.0.0" }
-datafusion-expr = { path = "datafusion/expr", version = "34.0.0" }
-datafusion-optimizer = { path = "datafusion/optimizer", version = "34.0.0" }
-datafusion-physical-expr = { path = "datafusion/physical-expr", version = "34.0.0" }
-datafusion-physical-plan = { path = "datafusion/physical-plan", version = "34.0.0" }
-datafusion-proto = { path = "datafusion/proto", version = "34.0.0" }
-datafusion-sql = { path = "datafusion/sql", version = "34.0.0" }
-datafusion-sqllogictest = { path = "datafusion/sqllogictest", version = "34.0.0" }
-datafusion-substrait = { path = "datafusion/substrait", version = "34.0.0" }
+datafusion = { path = "datafusion/core", version = "35.0.0" }
+datafusion-common = { path = "datafusion/common", version = "35.0.0" }
+datafusion-execution = { path = "datafusion/execution", version = "35.0.0" }
+datafusion-expr = { path = "datafusion/expr", version = "35.0.0" }
+datafusion-optimizer = { path = "datafusion/optimizer", version = "35.0.0" }
+datafusion-physical-expr = { path = "datafusion/physical-expr", version = "35.0.0" }
+datafusion-physical-plan = { path = "datafusion/physical-plan", version = "35.0.0" }
+datafusion-proto = { path = "datafusion/proto", version = "35.0.0" }
+datafusion-sql = { path = "datafusion/sql", version = "35.0.0" }
+datafusion-sqllogictest = { path = "datafusion/sqllogictest", version = "35.0.0" }
+datafusion-substrait = { path = "datafusion/substrait", version = "35.0.0" }
 doc-comment = "0.3"
 env_logger = "0.10"
 futures = "0.3"

diff --git a/benchmarks/Cargo.toml b/benchmarks/Cargo.toml
@@ -18,7 +18,7 @@
 [package]
 name = "datafusion-benchmarks"
 description = "DataFusion Benchmarks"
-version = "34.0.0"
+version = "35.0.0"
 edition = { workspace = true }
 authors = ["Apache Arrow <[email protected]>"]
 homepage = "https://github.com/apache/arrow-datafusion"
@@ -33,8 +33,8 @@ snmalloc = ["snmalloc-rs"]
 
 [dependencies]
 arrow = { workspace = true }
-datafusion = { path = "../datafusion/core", version = "34.0.0" }
-datafusion-common = { path = "../datafusion/common", version = "34.0.0" }
+datafusion = { path = "../datafusion/core", version = "35.0.0" }
+datafusion-common = { path = "../datafusion/common", version = "35.0.0" }
 env_logger = { workspace = true }
 futures = { workspace = true }
 log = { workspace = true }
@@ -49,4 +49,4 @@ test-utils = { path = "../test-utils/", version = "0.1.0" }
 tokio = { version = "^1.0", features = ["macros", "rt", "rt-multi-thread", "parking_lot"] }
 
 [dev-dependencies]
-datafusion-proto = { path = "../datafusion/proto", version = "34.0.0" }
+datafusion-proto = { path = "../datafusion/proto", version = "35.0.0" }
diff --git a/benchmarks/bench.sh b/benchmarks/bench.sh
@@ -74,6 +74,7 @@ parquet:                Benchmark of parquet reader's filtering speed
 sort:                   Benchmark of sorting speed
 clickbench_1:           ClickBench queries against a single parquet file
 clickbench_partitioned: ClickBench queries against a partitioned (100 files) parquet
+clickbench_extended:    ClickBench "inspired" queries against a single parquet (DataFusion specific)
 
 **********
 * Supported Configuration (Environment Variables)
@@ -155,6 +156,9 @@ main() {
                 clickbench_partitioned)
                     data_clickbench_partitioned
                     ;;
+                clickbench_extended)
+                    data_clickbench_1
+                    ;;
                 *)
                     echo "Error: unknown benchmark '$BENCHMARK' for data generation"
                     usage
@@ -193,6 +197,7 @@ main() {
                     run_sort
                     run_clickbench_1
                     run_clickbench_partitioned
+                    run_clickbench_extended
                     ;;
                 tpch)
                     run_tpch "1"
@@ -218,6 +223,9 @@ main() {
                 clickbench_partitioned)
                     run_clickbench_partitioned
                     ;;
+                clickbench_extended)
+                    run_clickbench_extended
+                    ;;
                 *)
                     echo "Error: unknown benchmark '$BENCHMARK' for run"
                     usage
@@ -401,6 +409,15 @@ run_clickbench_partitioned() {
     $CARGO_COMMAND --bin dfbench -- clickbench  --iterations 5 --path "${DATA_DIR}/hits_partitioned" --queries-path "${SCRIPT_DIR}/queries/clickbench/queries.sql" -o ${RESULTS_FILE}
 }
 
+# Runs the clickbench "extended" benchmark with a single large parquet file
+run_clickbench_extended() {
+    RESULTS_FILE="${RESULTS_DIR}/clickbench_extended.json"
+    echo "RESULTS_FILE: ${RESULTS_FILE}"
+    echo "Running clickbench (1 file) extended benchmark..."
+    $CARGO_COMMAND --bin dfbench -- clickbench  --iterations 5 --path "${DATA_DIR}/hits.parquet" --queries-path "${SCRIPT_DIR}/queries/clickbench/extended.sql" -o ${RESULTS_FILE}
+}
+
+
 compare_benchmarks() {
     BASE_RESULTS_DIR="${SCRIPT_DIR}/results"
     BRANCH1="${ARG2}"

diff --git a/benchmarks/queries/clickbench/README.md b/benchmarks/queries/clickbench/README.md
@@ -0,0 +1,33 @@
+# ClickBench queries
+
+This directory contains queries for the ClickBench benchmark https://benchmark.clickhouse.com/
+
+ClickBench is focused on aggregation and filtering performance (though it has no Joins)
+
+## Files:
+* `queries.sql` - Actual ClickBench queries, downloaded from the [ClickBench repository]
+* `extended.sql` - "Extended" DataFusion specific queries. 
+
+[ClickBench repository]: https://github.com/ClickHouse/ClickBench/blob/main/datafusion/queries.sql
+
+## "Extended" Queries 
+The "extended" queries are not part of the official ClickBench benchmark. 
+Instead they are used to test other DataFusion features that are not 
+covered by the standard benchmark
+
+Each description below is for the corresponding line in `extended.sql` (line 1
+is `Q0`, line 2 is `Q1`, etc.)  
+
+### Q0
+Models initial Data exploration, to understand some statistics of data. 
+Import Query Properties: multiple `COUNT DISTINCT` on strings
+
+```sql
+SELECT 
+    COUNT(DISTINCT "SearchPhrase"), COUNT(DISTINCT "MobilePhone"), COUNT(DISTINCT "MobilePhoneModel") 
+FROM hits;
+```
+
+
+
+
diff --git a/benchmarks/queries/clickbench/README.txt b/benchmarks/queries/clickbench/README.txt
diff --git a/benchmarks/queries/clickbench/extended.sql b/benchmarks/queries/clickbench/extended.sql
@@ -0,0 +1 @@
+SELECT COUNT(DISTINCT "SearchPhrase"), COUNT(DISTINCT "MobilePhone"), COUNT(DISTINCT "MobilePhoneModel") FROM hits;
diff --git a/benchmarks/src/clickbench.rs b/benchmarks/src/clickbench.rs
@@ -15,13 +15,14 @@
 // specific language governing permissions and limitations
 // under the License.
 
+use std::path::Path;
 use std::{path::PathBuf, time::Instant};
 
 use datafusion::{
-    common::exec_err,
     error::{DataFusionError, Result},
     prelude::SessionContext,
 };
+use datafusion_common::exec_datafusion_err;
 use structopt::StructOpt;
 
 use crate::{BenchmarkRun, CommonOpt};
@@ -69,15 +70,49 @@ pub struct RunOpt {
     output_path: Option<PathBuf>,
 }
 
-const CLICKBENCH_QUERY_START_ID: usize = 0;
-const CLICKBENCH_QUERY_END_ID: usize = 42;
+struct AllQueries {
+    queries: Vec<String>,
+}
+
+impl AllQueries {
+    fn try_new(path: &Path) -> Result<Self> {
+        // ClickBench has all queries in a single file identified by line number
+        let all_queries = std::fs::read_to_string(path)
+            .map_err(|e| exec_datafusion_err!("Could not open {path:?}: {e}"))?;
+        Ok(Self {
+            queries: all_queries.lines().map(|s| s.to_string()).collect(),
+        })
+    }
+
+    /// Returns the text of query `query_id`
+    fn get_query(&self, query_id: usize) -> Result<&str> {
+        self.queries
+            .get(query_id)
+            .ok_or_else(|| {
+                let min_id = self.min_query_id();
+                let max_id = self.max_query_id();
+                exec_datafusion_err!(
+                    "Invalid query id {query_id}. Must be between {min_id} and {max_id}"
+                )
+            })
+            .map(|s| s.as_str())
+    }
+
+    fn min_query_id(&self) -> usize {
+        0
+    }
 
+    fn max_query_id(&self) -> usize {
+        self.queries.len() - 1
+    }
+}
 impl RunOpt {
     pub async fn run(self) -> Result<()> {
         println!("Running benchmarks with the following options: {self:?}");
+        let queries = AllQueries::try_new(self.queries_path.as_path())?;
         let query_range = match self.query {
             Some(query_id) => query_id..=query_id,
-            None => CLICKBENCH_QUERY_START_ID..=CLICKBENCH_QUERY_END_ID,
+            None => queries.min_query_id()..=queries.max_query_id(),
         };
 
         let config = self.common.config();
@@ -88,12 +123,12 @@ impl RunOpt {
         let mut benchmark_run = BenchmarkRun::new();
         for query_id in query_range {
             benchmark_run.start_new_case(&format!("Query {query_id}"));
-            let sql = self.get_query(query_id)?;
+            let sql = queries.get_query(query_id)?;
             println!("Q{query_id}: {sql}");
 
             for i in 0..iterations {
                 let start = Instant::now();
-                let results = ctx.sql(&sql).await?.collect().await?;
+                let results = ctx.sql(sql).await?.collect().await?;
                 let elapsed = start.elapsed();
                 let ms = elapsed.as_secs_f64() * 1000.0;
                 let row_count: usize = results.iter().map(|b| b.num_rows()).sum();
@@ -120,23 +155,4 @@ impl RunOpt {
                 )
             })
     }
-
-    /// Returns the text of query `query_id`
-    fn get_query(&self, query_id: usize) -> Result<String> {
-        if query_id > CLICKBENCH_QUERY_END_ID {
-            return exec_err!(
-                "Invalid query id {query_id}. Must be between {CLICKBENCH_QUERY_START_ID} and {CLICKBENCH_QUERY_END_ID}"
-            );
-        }
-
-        let path = self.queries_path.as_path();
-
-        // ClickBench has all queries in a single file identified by line number
-        let all_queries = std::fs::read_to_string(path).map_err(|e| {
-            DataFusionError::Execution(format!("Could not open {path:?}: {e}"))
-        })?;
-        let all_queries: Vec<_> = all_queries.lines().collect();
-
-        Ok(all_queries.get(query_id).map(|s| s.to_string()).unwrap())
-    }
 }
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		SELECT COUNT(DISTINCT "SearchPhrase"), COUNT(DISTINCT "MobilePhone"), COUNT(DISTINCT "MobilePhoneModel") FROM hits;