Merge remote-tracking branch 'upstream/main' into hash_join_build_right
viirya committed Jun 6, 2024
2 parents: c4cb10e + c1cdf46 · commit 1bd7549
Showing 345 changed files with 64,027 additions and 4,915 deletions.
2 changes: 1 addition & 1 deletion .github/actions/setup-builder/action.yaml
@@ -21,7 +21,7 @@ inputs:
rust-version:
description: 'version of rust to install (e.g. nightly)'
required: true
- default: 'nightly'
+ default: 'stable'
jdk-version:
description: 'jdk version to install (e.g., 17)'
required: true
2 changes: 1 addition & 1 deletion .github/actions/setup-macos-builder/action.yaml
@@ -21,7 +21,7 @@ inputs:
rust-version:
description: 'version of rust to install (e.g. nightly)'
required: true
- default: 'nightly'
+ default: 'stable'
jdk-version:
description: 'jdk version to install (e.g., 17)'
required: true
2 changes: 1 addition & 1 deletion .github/workflows/benchmark-tpch.yml
@@ -37,7 +37,7 @@ on:
workflow_dispatch:

env:
- RUST_VERSION: nightly
+ RUST_VERSION: stable

jobs:
prepare:
2 changes: 1 addition & 1 deletion .github/workflows/benchmark.yml
@@ -37,7 +37,7 @@ on:
workflow_dispatch:

env:
- RUST_VERSION: nightly
+ RUST_VERSION: stable

jobs:
prepare:
47 changes: 7 additions & 40 deletions .github/workflows/pr_build.yml
@@ -37,7 +37,7 @@ on:
workflow_dispatch:

env:
- RUST_VERSION: nightly
+ RUST_VERSION: stable

jobs:
linux-test:
@@ -97,22 +97,11 @@ jobs:
with:
rust-version: ${{env.RUST_VERSION}}
jdk-version: ${{ matrix.java_version }}
- - name: Clone Spark
- uses: actions/checkout@v4
- with:
- repository: "apache/spark"
- path: "apache-spark"
- - name: Install Spark
- shell: bash
- working-directory: ./apache-spark
- run: build/mvn install -Phive -Phadoop-cloud -DskipTests
- name: Java test steps
uses: ./.github/actions/java-test
with:
- # TODO: remove -DskipTests after fixing tests
- maven_opts: "-Pspark-${{ matrix.spark-version }} -DskipTests"
- # TODO: upload test reports after enabling tests
- upload-test-reports: false
+ maven_opts: -Pspark-${{ matrix.spark-version }}
+ upload-test-reports: true

linux-test-with-old-spark:
strategy:
@@ -225,22 +214,11 @@ jobs:
with:
rust-version: ${{env.RUST_VERSION}}
jdk-version: ${{ matrix.java_version }}
- - name: Clone Spark
- uses: actions/checkout@v4
- with:
- repository: "apache/spark"
- path: "apache-spark"
- - name: Install Spark
- shell: bash
- working-directory: ./apache-spark
- run: build/mvn install -Phive -Phadoop-cloud -DskipTests
- name: Java test steps
uses: ./.github/actions/java-test
with:
- # TODO: remove -DskipTests after fixing tests
- maven_opts: "-Pspark-${{ matrix.spark-version }} -DskipTests"
- # TODO: upload test reports after enabling tests
- upload-test-reports: false
+ maven_opts: -Pspark-${{ matrix.spark-version }}
+ upload-test-reports: true

macos-aarch64-test-with-spark4_0:
strategy:
@@ -265,22 +243,11 @@
jdk-version: ${{ matrix.java_version }}
jdk-architecture: aarch64
protoc-architecture: aarch_64
- - name: Clone Spark
- uses: actions/checkout@v4
- with:
- repository: "apache/spark"
- path: "apache-spark"
- - name: Install Spark
- shell: bash
- working-directory: ./apache-spark
- run: build/mvn install -Phive -Phadoop-cloud -DskipTests
- name: Java test steps
uses: ./.github/actions/java-test
with:
- # TODO: remove -DskipTests after fixing tests
- maven_opts: "-Pspark-${{ matrix.spark-version }} -DskipTests"
- # TODO: upload test reports after enabling tests
- upload-test-reports: false
+ maven_opts: -Pspark-${{ matrix.spark-version }}
+ upload-test-reports: true

macos-aarch64-test-with-old-spark:
strategy:
2 changes: 1 addition & 1 deletion .github/workflows/spark_sql_test.yml
@@ -37,7 +37,7 @@ on:
workflow_dispatch:

env:
- RUST_VERSION: nightly
+ RUST_VERSION: stable

jobs:
spark-sql-catalyst:
2 changes: 1 addition & 1 deletion .github/workflows/spark_sql_test_ansi.yml
@@ -37,7 +37,7 @@ on:
workflow_dispatch:

env:
- RUST_VERSION: nightly
+ RUST_VERSION: stable

jobs:
spark-sql-catalyst:
19 changes: 11 additions & 8 deletions Makefile
@@ -44,10 +44,10 @@ format:

core-amd64:
rustup target add x86_64-apple-darwin
cd core && RUSTFLAGS="-Ctarget-cpu=skylake -Ctarget-feature=-prefer-256-bit" CC=o64-clang CXX=o64-clang++ cargo build --target x86_64-apple-darwin --features nightly --release
cd core && RUSTFLAGS="-Ctarget-cpu=skylake -Ctarget-feature=-prefer-256-bit" CC=o64-clang CXX=o64-clang++ cargo build --target x86_64-apple-darwin --release
mkdir -p common/target/classes/org/apache/comet/darwin/x86_64
cp core/target/x86_64-apple-darwin/release/libcomet.dylib common/target/classes/org/apache/comet/darwin/x86_64
cd core && RUSTFLAGS="-Ctarget-cpu=haswell -Ctarget-feature=-prefer-256-bit" cargo build --features nightly --release
cd core && RUSTFLAGS="-Ctarget-cpu=haswell -Ctarget-feature=-prefer-256-bit" cargo build --release
mkdir -p common/target/classes/org/apache/comet/linux/amd64
cp core/target/release/libcomet.so common/target/classes/org/apache/comet/linux/amd64
jar -cf common/target/comet-native-x86_64.jar \
@@ -57,10 +57,10 @@ core-amd64:

core-arm64:
rustup target add aarch64-apple-darwin
cd core && RUSTFLAGS="-Ctarget-cpu=apple-m1" CC=arm64-apple-darwin21.4-clang CXX=arm64-apple-darwin21.4-clang++ CARGO_FEATURE_NEON=1 cargo build --target aarch64-apple-darwin --features nightly --release
cd core && RUSTFLAGS="-Ctarget-cpu=apple-m1" CC=arm64-apple-darwin21.4-clang CXX=arm64-apple-darwin21.4-clang++ CARGO_FEATURE_NEON=1 cargo build --target aarch64-apple-darwin --release
mkdir -p common/target/classes/org/apache/comet/darwin/aarch64
cp core/target/aarch64-apple-darwin/release/libcomet.dylib common/target/classes/org/apache/comet/darwin/aarch64
cd core && RUSTFLAGS="-Ctarget-cpu=native" cargo build --features nightly --release
cd core && RUSTFLAGS="-Ctarget-cpu=native" cargo build --release
mkdir -p common/target/classes/org/apache/comet/linux/aarch64
cp core/target/release/libcomet.so common/target/classes/org/apache/comet/linux/aarch64
jar -cf common/target/comet-native-aarch64.jar \
@@ -70,13 +70,16 @@

release-linux: clean
rustup target add aarch64-apple-darwin x86_64-apple-darwin
cd core && RUSTFLAGS="-Ctarget-cpu=apple-m1" CC=arm64-apple-darwin21.4-clang CXX=arm64-apple-darwin21.4-clang++ CARGO_FEATURE_NEON=1 cargo build --target aarch64-apple-darwin --features nightly --release
cd core && RUSTFLAGS="-Ctarget-cpu=skylake -Ctarget-feature=-prefer-256-bit" CC=o64-clang CXX=o64-clang++ cargo build --target x86_64-apple-darwin --features nightly --release
cd core && RUSTFLAGS="-Ctarget-cpu=native -Ctarget-feature=-prefer-256-bit" cargo build --features nightly --release
cd core && RUSTFLAGS="-Ctarget-cpu=apple-m1" CC=arm64-apple-darwin21.4-clang CXX=arm64-apple-darwin21.4-clang++ CARGO_FEATURE_NEON=1 cargo build --target aarch64-apple-darwin --release
cd core && RUSTFLAGS="-Ctarget-cpu=skylake -Ctarget-feature=-prefer-256-bit" CC=o64-clang CXX=o64-clang++ cargo build --target x86_64-apple-darwin --release
cd core && RUSTFLAGS="-Ctarget-cpu=native -Ctarget-feature=-prefer-256-bit" cargo build --release
./mvnw install -Prelease -DskipTests $(PROFILES)
release:
cd core && RUSTFLAGS="-Ctarget-cpu=native" cargo build --features nightly --release
cd core && RUSTFLAGS="-Ctarget-cpu=native" cargo build --release
./mvnw install -Prelease -DskipTests $(PROFILES)
+ release-nogit:
+ cd core && RUSTFLAGS="-Ctarget-cpu=native" cargo build --features nightly --release
+ ./mvnw install -Prelease -DskipTests $(PROFILES) -Dmaven.gitcommitid.skip=true
benchmark-%: clean release
cd spark && COMET_CONF_DIR=$(shell pwd)/conf MAVEN_OPTS='-Xmx20g' ../mvnw exec:java -Dexec.mainClass="$*" -Dexec.classpathScope="test" -Dexec.cleanupDaemonThreads="false" -Dexec.args="$(filter-out $@,$(MAKECMDGOALS))" $(PROFILES)
.DEFAULT:
98 changes: 63 additions & 35 deletions README.md
@@ -19,58 +19,86 @@ under the License.

# Apache DataFusion Comet

- Apache DataFusion Comet is an Apache Spark plugin that uses [Apache DataFusion](https://datafusion.apache.org/)
- as native runtime to achieve improvement in terms of query efficiency and query runtime.
+ Apache DataFusion Comet is a high-performance accelerator for Apache Spark, built on top of the powerful
+ [Apache DataFusion](https://datafusion.apache.org) query engine. Comet is designed to significantly enhance the
+ performance of Apache Spark workloads while leveraging commodity hardware and seamlessly integrating with the
+ Spark ecosystem without requiring any code changes.

- Comet runs Spark SQL queries using the native DataFusion runtime, which is
- typically faster and more resource efficient than JVM based runtimes.
+ # Benefits of Using Comet

- <a href="docs/source/_static/images/comet-overview.png"><img src="docs/source/_static/images/comet-system-diagram.png" align="center" width="500" ></a>
+ ## Run Spark Queries at DataFusion Speeds

- Comet aims to support:
+ Comet delivers a performance speedup for many queries, enabling faster data processing and shorter time-to-insights.

- - a native Parquet implementation, including both reader and writer
- - full implementation of Spark operators, including
-   Filter/Project/Aggregation/Join/Exchange etc.
- - full implementation of Spark built-in expressions
- - a UDF framework for users to migrate their existing UDF to native
+ The following chart shows the time it takes to run the 22 TPC-H queries against 100 GB of data in Parquet format
+ using a single executor with 8 cores. See the [Comet Benchmarking Guide](https://datafusion.apache.org/comet/contributor-guide/benchmarking.html)
+ for details of the environment used for these benchmarks.

- ## Architecture
+ When using Comet, the overall run time is reduced from 649 seconds to 440 seconds, a 1.5x speedup.

- The following diagram illustrates the architecture of Comet:
+ Running the same queries with DataFusion standalone (without Spark) using the same number of cores results in a 3.9x
+ speedup compared to Spark.

- <a href="docs/source/_static/images/comet-overview.png"><img src="docs/source/_static/images/comet-overview.png" align="center" height="600" width="750" ></a>
+ Comet is not yet achieving full DataFusion speeds in all cases, but with future work we aim to provide a 2x-4x speedup
+ for many use cases.
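As an editorial aside (not part of this commit), the quoted multipliers are simply ratios of the total wall-clock times reported above:

$$\text{speedup} = \frac{649\ \text{s}}{440\ \text{s}} \approx 1.48 \approx 1.5\times$$

The 3.9x figure for standalone DataFusion is computed the same way from its own total run time.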

- ## Current Status
+ ![](docs/source/_static/images/tpch_allqueries.png)

- The project is currently integrated into Apache Spark 3.2, 3.3, and 3.4.
+ Here is a breakdown showing relative performance of Spark, Comet, and DataFusion for each TPC-H query.

- ## Feature Parity with Apache Spark
+ ![](docs/source/_static/images/tpch_queries_compare.png)

- The project strives to keep feature parity with Apache Spark, that is,
- users should expect the same behavior (w.r.t features, configurations,
- query results, etc) with Comet turned on or turned off in their Spark
- jobs. In addition, Comet extension should automatically detect unsupported
- features and fallback to Spark engine.
+ The following chart shows how much Comet currently accelerates each query from the benchmark. Performance optimization
+ is an ongoing task, and we welcome contributions from the community to help achieve even greater speedups in the future.

- To achieve this, besides unit tests within Comet itself, we also re-use
- Spark SQL tests and make sure they all pass with Comet extension
- enabled.
+ ![](docs/source/_static/images/tpch_queries_speedup.png)

- ## Supported Platforms
+ These benchmarks can be reproduced in any environment using the documentation in the
+ [Comet Benchmarking Guide](https://datafusion.apache.org/comet/contributor-guide/benchmarking.html). We encourage
+ you to run your own benchmarks.

- Linux, Apple OSX (Intel and M1)
+ ## Use Commodity Hardware

- ## Requirements
+ Comet leverages commodity hardware, eliminating the need for costly hardware upgrades or
+ specialized hardware accelerators, such as GPUs or FPGAs. By maximizing the utilization of commodity hardware, Comet
+ ensures cost-effectiveness and scalability for your Spark deployments.

- - Apache Spark 3.2, 3.3, or 3.4
- - JDK 8, 11 and 17 (JDK 11 recommended because Spark 3.2 doesn't support 17)
- - GLIBC 2.17 (Centos 7) and up
+ ## Spark Compatibility

- ## Getting started
+ Comet aims for 100% compatibility with all supported versions of Apache Spark, allowing you to integrate Comet into
+ your existing Spark deployments and workflows seamlessly. With no code changes required, you can immediately harness
+ the benefits of Comet's acceleration capabilities without disrupting your Spark applications.
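As an illustrative aside (not part of this commit): the "no code changes" claim means Comet is enabled purely through Spark configuration. Below is a minimal sketch; the plugin class and `spark.comet.*` keys are recalled from the Comet user guide of this period and should be verified against the linked installation docs, and the Parquet path is a placeholder.

```scala
import org.apache.spark.sql.SparkSession

// Minimal sketch: assumes the Comet jar is already on the driver and executor
// classpath (for example via --jars). Verify config key names against the
// Comet installation guide for your version.
object CometQuickstart {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("comet-quickstart")
      // Load Comet as a Spark plugin; the application code itself is unchanged.
      .config("spark.plugins", "org.apache.spark.CometPlugin")
      // Turn the accelerator on; unsupported plans fall back to regular Spark.
      .config("spark.comet.enabled", "true")
      .config("spark.comet.exec.enabled", "true")
      .getOrCreate()

    // Existing Spark SQL runs as-is; supported operators execute natively.
    spark.read.parquet("/path/to/data.parquet").createOrReplaceTempView("t")
    spark.sql("SELECT COUNT(*) FROM t").show()

    spark.stop()
  }
}
```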

- See the [DataFusion Comet User Guide](https://datafusion.apache.org/comet/user-guide/installation.html) for installation instructions.
+ ## Tight Integration with Apache DataFusion

+ Comet tightly integrates with the core Apache DataFusion project, leveraging its powerful execution engine. With
+ seamless interoperability between Comet and DataFusion, you can achieve optimal performance and efficiency in your
+ Spark workloads.

+ ## Active Community

+ Comet boasts a vibrant and active community of developers, contributors, and users dedicated to advancing the
+ capabilities of Apache DataFusion and accelerating the performance of Apache Spark.

+ ## Getting Started

+ To get started with Apache DataFusion Comet, follow the
+ [installation instructions](https://datafusion.apache.org/comet/user-guide/installation.html). Join the
+ [DataFusion Slack and Discord channels](https://datafusion.apache.org/contributor-guide/communication.html) to connect
+ with other users, ask questions, and share your experiences with Comet.

## Contributing
- See the [DataFusion Comet Contribution Guide](https://datafusion.apache.org/comet/contributor-guide/contributing.html)
- for information on how to get started contributing to the project.

+ We welcome contributions from the community to help improve and enhance Apache DataFusion Comet. Whether it's fixing
+ bugs, adding new features, writing documentation, or optimizing performance, your contributions are invaluable in
+ shaping the future of Comet. Check out our
+ [contributor guide](https://datafusion.apache.org/comet/contributor-guide/contributing.html) to get started.

+ ## License

+ Apache DataFusion Comet is licensed under the Apache License 2.0. See the [LICENSE.txt](LICENSE.txt) file for details.

+ ## Acknowledgments

+ We would like to express our gratitude to the Apache DataFusion community for their support and contributions to
+ Comet. Together, we're building a faster, more efficient future for big data processing with Apache Spark.
9 changes: 6 additions & 3 deletions common/src/main/scala/org/apache/comet/CometConf.scala
@@ -29,6 +29,8 @@ import org.apache.spark.network.util.JavaUtils
import org.apache.spark.sql.comet.util.Utils
import org.apache.spark.sql.internal.SQLConf

+ import org.apache.comet.shims.ShimCometConf

/**
* Configurations for a Comet application. Mostly inspired by [[SQLConf]] in Spark.
*
@@ -41,7 +43,7 @@ import org.apache.spark.sql.internal.SQLConf
* which retrieves the config value from the thread-local [[SQLConf]] object. Alternatively, you
* can also explicitly pass a [[SQLConf]] object to the `get` method.
*/
- object CometConf {
+ object CometConf extends ShimCometConf {

/** List of all configs that is used for generating documentation */
val allConfs = new ListBuffer[ConfigEntry[_]]
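As a hedged illustration of the class comment above (the two flavours of `get` are described there; the exact signatures are assumed from that description, and the entry used is one that appears later in this diff):

```scala
import org.apache.spark.sql.internal.SQLConf
import org.apache.comet.CometConf

object ConfigReadSketch {
  def main(args: Array[String]): Unit = {
    // get() reads from the thread-local SQLConf, per the comment above.
    val fromThreadLocal: Boolean = CometConf.COMET_ANSI_MODE_ENABLED.get()

    // Alternatively, pass a SQLConf instance explicitly.
    val conf = new SQLConf
    conf.setConfString("spark.comet.ansi.enabled", "true")
    val fromExplicit: Boolean = CometConf.COMET_ANSI_MODE_ENABLED.get(conf)

    println(s"thread-local: $fromThreadLocal, explicit: $fromExplicit")
  }
}
```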
@@ -361,7 +363,7 @@ object CometConf {
"column to a long column, a float column to a double column, etc. This is automatically" +
"enabled when reading from Iceberg tables.")
.booleanConf
- .createWithDefault(false)
+ .createWithDefault(COMET_SCHEMA_EVOLUTION_ENABLED_DEFAULT)

val COMET_ROW_TO_COLUMNAR_ENABLED: ConfigEntry[Boolean] =
conf("spark.comet.rowToColumnar.enabled")
@@ -382,12 +384,13 @@
.createWithDefault(Seq("Range,InMemoryTableScan"))

val COMET_ANSI_MODE_ENABLED: ConfigEntry[Boolean] = conf("spark.comet.ansi.enabled")
+ .internal()
.doc(
"Comet does not respect ANSI mode in most cases and by default will not accelerate " +
"queries when ansi mode is enabled. Enable this setting to test Comet's experimental " +
"support for ANSI mode. This should not be used in production.")
.booleanConf
- .createWithDefault(false)
+ .createWithDefault(COMET_ANSI_MODE_ENABLED_DEFAULT)

val COMET_CAST_ALLOW_INCOMPATIBLE: ConfigEntry[Boolean] =
conf("spark.comet.cast.allowIncompatible")
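The entries in this file all follow the builder pattern visible above: `conf(key)`, optional `.internal()`, `.doc(...)`, a type such as `.booleanConf`, then `.createWithDefault(...)`. As a sketch of how a new flag would be declared inside `object CometConf` under that pattern (the key and field name below are invented purely for illustration and are not part of this commit):

```scala
// Hypothetical entry, declared inside object CometConf alongside the ones above.
val COMET_EXAMPLE_FLAG_ENABLED: ConfigEntry[Boolean] =
  conf("spark.comet.example.enabled")
    .internal() // keep it out of generated user docs, like spark.comet.ansi.enabled
    .doc("Illustrative flag only; not a real Comet configuration.")
    .booleanConf
    .createWithDefault(false)
```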
@@ -0,0 +1,25 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

package org.apache.comet.shims

trait ShimCometConf {
protected val COMET_SCHEMA_EVOLUTION_ENABLED_DEFAULT = false
protected val COMET_ANSI_MODE_ENABLED_DEFAULT = false
}
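Routing `createWithDefault` through these shim constants (see the CometConf changes above) lets a per-Spark-version source tree supply a ShimCometConf with different defaults while CometConf itself stays unchanged. A small hedged sketch of the mix-in follows; the demo object is hypothetical and not part of this commit.

```scala
package org.apache.comet.shims

// Hypothetical demo: any object that mixes in ShimCometConf sees the defaults
// as ordinary members, which is how `object CometConf extends ShimCometConf`
// consumes them in its createWithDefault(...) calls.
object ShimDefaultsDemo extends ShimCometConf {
  def main(args: Array[String]): Unit = {
    println(s"schema evolution default = $COMET_SCHEMA_EVOLUTION_ENABLED_DEFAULT")
    println(s"ansi mode default        = $COMET_ANSI_MODE_ENABLED_DEFAULT")
  }
}
```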