Skip to content

Commit

Permalink
Merge remote-tracking branch 'apache/main' into private_make_scalar_f…
Browse files Browse the repository at this point in the history
…unction
  • Loading branch information
alamb committed Jan 22, 2024
2 parents 64a7642 + edec418 commit 8e559a8
Show file tree
Hide file tree
Showing 143 changed files with 6,500 additions and 2,415 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/rust.yml
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ jobs:
rust-version: stable

- name: Cache Cargo
uses: actions/cache@v3
uses: actions/cache@v4
with:
path: |
~/.cargo/bin/
Expand Down Expand Up @@ -383,7 +383,7 @@ jobs:
# rustup default stable
# rustup component add rustfmt clippy
# - name: Cache Cargo
# uses: actions/cache@v3
# uses: actions/cache@v4
# with:
# path: /home/runner/.cargo
# # this key is not equal because the user is different than on a container (runner vs github)
Expand Down
24 changes: 12 additions & 12 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ license = "Apache-2.0"
readme = "README.md"
repository = "https://github.com/apache/arrow-datafusion"
rust-version = "1.70"
version = "34.0.0"
version = "35.0.0"

[workspace.dependencies]
arrow = { version = "50.0.0", features = ["prettyprint"] }
Expand All @@ -45,17 +45,17 @@ bytes = "1.4"
chrono = { version = "0.4.31", default-features = false }
ctor = "0.2.0"
dashmap = "5.4.0"
datafusion = { path = "datafusion/core", version = "34.0.0" }
datafusion-common = { path = "datafusion/common", version = "34.0.0" }
datafusion-execution = { path = "datafusion/execution", version = "34.0.0" }
datafusion-expr = { path = "datafusion/expr", version = "34.0.0" }
datafusion-optimizer = { path = "datafusion/optimizer", version = "34.0.0" }
datafusion-physical-expr = { path = "datafusion/physical-expr", version = "34.0.0" }
datafusion-physical-plan = { path = "datafusion/physical-plan", version = "34.0.0" }
datafusion-proto = { path = "datafusion/proto", version = "34.0.0" }
datafusion-sql = { path = "datafusion/sql", version = "34.0.0" }
datafusion-sqllogictest = { path = "datafusion/sqllogictest", version = "34.0.0" }
datafusion-substrait = { path = "datafusion/substrait", version = "34.0.0" }
datafusion = { path = "datafusion/core", version = "35.0.0" }
datafusion-common = { path = "datafusion/common", version = "35.0.0" }
datafusion-execution = { path = "datafusion/execution", version = "35.0.0" }
datafusion-expr = { path = "datafusion/expr", version = "35.0.0" }
datafusion-optimizer = { path = "datafusion/optimizer", version = "35.0.0" }
datafusion-physical-expr = { path = "datafusion/physical-expr", version = "35.0.0" }
datafusion-physical-plan = { path = "datafusion/physical-plan", version = "35.0.0" }
datafusion-proto = { path = "datafusion/proto", version = "35.0.0" }
datafusion-sql = { path = "datafusion/sql", version = "35.0.0" }
datafusion-sqllogictest = { path = "datafusion/sqllogictest", version = "35.0.0" }
datafusion-substrait = { path = "datafusion/substrait", version = "35.0.0" }
doc-comment = "0.3"
env_logger = "0.10"
futures = "0.3"
Expand Down
8 changes: 4 additions & 4 deletions benchmarks/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
[package]
name = "datafusion-benchmarks"
description = "DataFusion Benchmarks"
version = "34.0.0"
version = "35.0.0"
edition = { workspace = true }
authors = ["Apache Arrow <[email protected]>"]
homepage = "https://github.com/apache/arrow-datafusion"
Expand All @@ -33,8 +33,8 @@ snmalloc = ["snmalloc-rs"]

[dependencies]
arrow = { workspace = true }
datafusion = { path = "../datafusion/core", version = "34.0.0" }
datafusion-common = { path = "../datafusion/common", version = "34.0.0" }
datafusion = { path = "../datafusion/core", version = "35.0.0" }
datafusion-common = { path = "../datafusion/common", version = "35.0.0" }
env_logger = { workspace = true }
futures = { workspace = true }
log = { workspace = true }
Expand All @@ -49,4 +49,4 @@ test-utils = { path = "../test-utils/", version = "0.1.0" }
tokio = { version = "^1.0", features = ["macros", "rt", "rt-multi-thread", "parking_lot"] }

[dev-dependencies]
datafusion-proto = { path = "../datafusion/proto", version = "34.0.0" }
datafusion-proto = { path = "../datafusion/proto", version = "35.0.0" }
17 changes: 17 additions & 0 deletions benchmarks/bench.sh
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,7 @@ parquet: Benchmark of parquet reader's filtering speed
sort: Benchmark of sorting speed
clickbench_1: ClickBench queries against a single parquet file
clickbench_partitioned: ClickBench queries against a partitioned (100 files) parquet
clickbench_extended: ClickBench "inspired" queries against a single parquet (DataFusion specific)
**********
* Supported Configuration (Environment Variables)
Expand Down Expand Up @@ -155,6 +156,9 @@ main() {
clickbench_partitioned)
data_clickbench_partitioned
;;
clickbench_extended)
data_clickbench_1
;;
*)
echo "Error: unknown benchmark '$BENCHMARK' for data generation"
usage
Expand Down Expand Up @@ -193,6 +197,7 @@ main() {
run_sort
run_clickbench_1
run_clickbench_partitioned
run_clickbench_extended
;;
tpch)
run_tpch "1"
Expand All @@ -218,6 +223,9 @@ main() {
clickbench_partitioned)
run_clickbench_partitioned
;;
clickbench_extended)
run_clickbench_extended
;;
*)
echo "Error: unknown benchmark '$BENCHMARK' for run"
usage
Expand Down Expand Up @@ -401,6 +409,15 @@ run_clickbench_partitioned() {
$CARGO_COMMAND --bin dfbench -- clickbench --iterations 5 --path "${DATA_DIR}/hits_partitioned" --queries-path "${SCRIPT_DIR}/queries/clickbench/queries.sql" -o ${RESULTS_FILE}
}

# Runs the clickbench "extended" benchmark with a single large parquet file
run_clickbench_extended() {
RESULTS_FILE="${RESULTS_DIR}/clickbench_extended.json"
echo "RESULTS_FILE: ${RESULTS_FILE}"
echo "Running clickbench (1 file) extended benchmark..."
$CARGO_COMMAND --bin dfbench -- clickbench --iterations 5 --path "${DATA_DIR}/hits.parquet" --queries-path "${SCRIPT_DIR}/queries/clickbench/extended.sql" -o ${RESULTS_FILE}
}


compare_benchmarks() {
BASE_RESULTS_DIR="${SCRIPT_DIR}/results"
BRANCH1="${ARG2}"
Expand Down
33 changes: 33 additions & 0 deletions benchmarks/queries/clickbench/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
# ClickBench queries

This directory contains queries for the ClickBench benchmark https://benchmark.clickhouse.com/

ClickBench is focused on aggregation and filtering performance (though it has no Joins)

## Files:
* `queries.sql` - Actual ClickBench queries, downloaded from the [ClickBench repository]
* `extended.sql` - "Extended" DataFusion specific queries.

[ClickBench repository]: https://github.com/ClickHouse/ClickBench/blob/main/datafusion/queries.sql

## "Extended" Queries
The "extended" queries are not part of the official ClickBench benchmark.
Instead they are used to test other DataFusion features that are not
covered by the standard benchmark

Each description below is for the corresponding line in `extended.sql` (line 1
is `Q0`, line 2 is `Q1`, etc.)

### Q0
Models initial Data exploration, to understand some statistics of data.
Import Query Properties: multiple `COUNT DISTINCT` on strings

```sql
SELECT
COUNT(DISTINCT "SearchPhrase"), COUNT(DISTINCT "MobilePhone"), COUNT(DISTINCT "MobilePhoneModel")
FROM hits;
```




1 change: 0 additions & 1 deletion benchmarks/queries/clickbench/README.txt

This file was deleted.

1 change: 1 addition & 0 deletions benchmarks/queries/clickbench/extended.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
SELECT COUNT(DISTINCT "SearchPhrase"), COUNT(DISTINCT "MobilePhone"), COUNT(DISTINCT "MobilePhoneModel") FROM hits;
66 changes: 41 additions & 25 deletions benchmarks/src/clickbench.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,13 +15,14 @@
// specific language governing permissions and limitations
// under the License.

use std::path::Path;
use std::{path::PathBuf, time::Instant};

use datafusion::{
common::exec_err,
error::{DataFusionError, Result},
prelude::SessionContext,
};
use datafusion_common::exec_datafusion_err;
use structopt::StructOpt;

use crate::{BenchmarkRun, CommonOpt};
Expand Down Expand Up @@ -69,15 +70,49 @@ pub struct RunOpt {
output_path: Option<PathBuf>,
}

const CLICKBENCH_QUERY_START_ID: usize = 0;
const CLICKBENCH_QUERY_END_ID: usize = 42;
struct AllQueries {
queries: Vec<String>,
}

impl AllQueries {
fn try_new(path: &Path) -> Result<Self> {
// ClickBench has all queries in a single file identified by line number
let all_queries = std::fs::read_to_string(path)
.map_err(|e| exec_datafusion_err!("Could not open {path:?}: {e}"))?;
Ok(Self {
queries: all_queries.lines().map(|s| s.to_string()).collect(),
})
}

/// Returns the text of query `query_id`
fn get_query(&self, query_id: usize) -> Result<&str> {
self.queries
.get(query_id)
.ok_or_else(|| {
let min_id = self.min_query_id();
let max_id = self.max_query_id();
exec_datafusion_err!(
"Invalid query id {query_id}. Must be between {min_id} and {max_id}"
)
})
.map(|s| s.as_str())
}

fn min_query_id(&self) -> usize {
0
}

fn max_query_id(&self) -> usize {
self.queries.len() - 1
}
}
impl RunOpt {
pub async fn run(self) -> Result<()> {
println!("Running benchmarks with the following options: {self:?}");
let queries = AllQueries::try_new(self.queries_path.as_path())?;
let query_range = match self.query {
Some(query_id) => query_id..=query_id,
None => CLICKBENCH_QUERY_START_ID..=CLICKBENCH_QUERY_END_ID,
None => queries.min_query_id()..=queries.max_query_id(),
};

let config = self.common.config();
Expand All @@ -88,12 +123,12 @@ impl RunOpt {
let mut benchmark_run = BenchmarkRun::new();
for query_id in query_range {
benchmark_run.start_new_case(&format!("Query {query_id}"));
let sql = self.get_query(query_id)?;
let sql = queries.get_query(query_id)?;
println!("Q{query_id}: {sql}");

for i in 0..iterations {
let start = Instant::now();
let results = ctx.sql(&sql).await?.collect().await?;
let results = ctx.sql(sql).await?.collect().await?;
let elapsed = start.elapsed();
let ms = elapsed.as_secs_f64() * 1000.0;
let row_count: usize = results.iter().map(|b| b.num_rows()).sum();
Expand All @@ -120,23 +155,4 @@ impl RunOpt {
)
})
}

/// Returns the text of query `query_id`
fn get_query(&self, query_id: usize) -> Result<String> {
if query_id > CLICKBENCH_QUERY_END_ID {
return exec_err!(
"Invalid query id {query_id}. Must be between {CLICKBENCH_QUERY_START_ID} and {CLICKBENCH_QUERY_END_ID}"
);
}

let path = self.queries_path.as_path();

// ClickBench has all queries in a single file identified by line number
let all_queries = std::fs::read_to_string(path).map_err(|e| {
DataFusionError::Execution(format!("Could not open {path:?}: {e}"))
})?;
let all_queries: Vec<_> = all_queries.lines().collect();

Ok(all_queries.get(query_id).map(|s| s.to_string()).unwrap())
}
}
Loading

0 comments on commit 8e559a8

Please sign in to comment.