Merge remote-tracking branch 'upstream/main' into hash_join_build_right
viirya committed Jun 6, 2024
2 parents: c4cb10e + c1cdf46 · commit 1bd7549
Showing 345 changed files with 64,027 additions and 4,915 deletions.
2 changes: 1 addition & 1 deletion .github/actions/setup-builder/action.yaml
@@ -21,7 +21,7 @@ inputs:
rust-version:
description: 'version of rust to install (e.g. nightly)'
required: true
- default: 'nightly'
+ default: 'stable'
jdk-version:
description: 'jdk version to install (e.g., 17)'
required: true
2 changes: 1 addition & 1 deletion .github/actions/setup-macos-builder/action.yaml
@@ -21,7 +21,7 @@ inputs:
rust-version:
description: 'version of rust to install (e.g. nightly)'
required: true
- default: 'nightly'
+ default: 'stable'
jdk-version:
description: 'jdk version to install (e.g., 17)'
required: true
2 changes: 1 addition & 1 deletion .github/workflows/benchmark-tpch.yml
@@ -37,7 +37,7 @@ on:
workflow_dispatch:

env:
- RUST_VERSION: nightly
+ RUST_VERSION: stable

jobs:
prepare:
2 changes: 1 addition & 1 deletion .github/workflows/benchmark.yml
@@ -37,7 +37,7 @@ on:
workflow_dispatch:

env:
- RUST_VERSION: nightly
+ RUST_VERSION: stable

jobs:
prepare:
47 changes: 7 additions & 40 deletions .github/workflows/pr_build.yml
@@ -37,7 +37,7 @@ on:
workflow_dispatch:

env:
- RUST_VERSION: nightly
+ RUST_VERSION: stable

jobs:
linux-test:
@@ -97,22 +97,11 @@ jobs:
with:
rust-version: ${{env.RUST_VERSION}}
jdk-version: ${{ matrix.java_version }}
- - name: Clone Spark
- uses: actions/checkout@v4
- with:
- repository: "apache/spark"
- path: "apache-spark"
- - name: Install Spark
- shell: bash
- working-directory: ./apache-spark
- run: build/mvn install -Phive -Phadoop-cloud -DskipTests
- name: Java test steps
uses: ./.github/actions/java-test
with:
- # TODO: remove -DskipTests after fixing tests
- maven_opts: "-Pspark-${{ matrix.spark-version }} -DskipTests"
- # TODO: upload test reports after enabling tests
- upload-test-reports: false
+ maven_opts: -Pspark-${{ matrix.spark-version }}
+ upload-test-reports: true

linux-test-with-old-spark:
strategy:
@@ -225,22 +214,11 @@ jobs:
with:
rust-version: ${{env.RUST_VERSION}}
jdk-version: ${{ matrix.java_version }}
- - name: Clone Spark
- uses: actions/checkout@v4
- with:
- repository: "apache/spark"
- path: "apache-spark"
- - name: Install Spark
- shell: bash
- working-directory: ./apache-spark
- run: build/mvn install -Phive -Phadoop-cloud -DskipTests
- name: Java test steps
uses: ./.github/actions/java-test
with:
- # TODO: remove -DskipTests after fixing tests
- maven_opts: "-Pspark-${{ matrix.spark-version }} -DskipTests"
- # TODO: upload test reports after enabling tests
- upload-test-reports: false
+ maven_opts: -Pspark-${{ matrix.spark-version }}
+ upload-test-reports: true

macos-aarch64-test-with-spark4_0:
strategy:
@@ -265,22 +243,11 @@
jdk-version: ${{ matrix.java_version }}
jdk-architecture: aarch64
protoc-architecture: aarch_64
- - name: Clone Spark
- uses: actions/checkout@v4
- with:
- repository: "apache/spark"
- path: "apache-spark"
- - name: Install Spark
- shell: bash
- working-directory: ./apache-spark
- run: build/mvn install -Phive -Phadoop-cloud -DskipTests
- name: Java test steps
uses: ./.github/actions/java-test
with:
- # TODO: remove -DskipTests after fixing tests
- maven_opts: "-Pspark-${{ matrix.spark-version }} -DskipTests"
- # TODO: upload test reports after enabling tests
- upload-test-reports: false
+ maven_opts: -Pspark-${{ matrix.spark-version }}
+ upload-test-reports: true

macos-aarch64-test-with-old-spark:
strategy:
2 changes: 1 addition & 1 deletion .github/workflows/spark_sql_test.yml
@@ -37,7 +37,7 @@ on:
workflow_dispatch:

env:
- RUST_VERSION: nightly
+ RUST_VERSION: stable

jobs:
spark-sql-catalyst:
2 changes: 1 addition & 1 deletion .github/workflows/spark_sql_test_ansi.yml
@@ -37,7 +37,7 @@ on:
workflow_dispatch:

env:
- RUST_VERSION: nightly
+ RUST_VERSION: stable

jobs:
spark-sql-catalyst:
19 changes: 11 additions & 8 deletions Makefile
@@ -44,10 +44,10 @@ format:

core-amd64:
rustup target add x86_64-apple-darwin
cd core && RUSTFLAGS="-Ctarget-cpu=skylake -Ctarget-feature=-prefer-256-bit" CC=o64-clang CXX=o64-clang++ cargo build --target x86_64-apple-darwin --features nightly --release
cd core && RUSTFLAGS="-Ctarget-cpu=skylake -Ctarget-feature=-prefer-256-bit" CC=o64-clang CXX=o64-clang++ cargo build --target x86_64-apple-darwin --release
mkdir -p common/target/classes/org/apache/comet/darwin/x86_64
cp core/target/x86_64-apple-darwin/release/libcomet.dylib common/target/classes/org/apache/comet/darwin/x86_64
cd core && RUSTFLAGS="-Ctarget-cpu=haswell -Ctarget-feature=-prefer-256-bit" cargo build --features nightly --release
cd core && RUSTFLAGS="-Ctarget-cpu=haswell -Ctarget-feature=-prefer-256-bit" cargo build --release
mkdir -p common/target/classes/org/apache/comet/linux/amd64
cp core/target/release/libcomet.so common/target/classes/org/apache/comet/linux/amd64
jar -cf common/target/comet-native-x86_64.jar \
@@ -57,10 +57,10 @@ core-amd64:

core-arm64:
rustup target add aarch64-apple-darwin
cd core && RUSTFLAGS="-Ctarget-cpu=apple-m1" CC=arm64-apple-darwin21.4-clang CXX=arm64-apple-darwin21.4-clang++ CARGO_FEATURE_NEON=1 cargo build --target aarch64-apple-darwin --features nightly --release
cd core && RUSTFLAGS="-Ctarget-cpu=apple-m1" CC=arm64-apple-darwin21.4-clang CXX=arm64-apple-darwin21.4-clang++ CARGO_FEATURE_NEON=1 cargo build --target aarch64-apple-darwin --release
mkdir -p common/target/classes/org/apache/comet/darwin/aarch64
cp core/target/aarch64-apple-darwin/release/libcomet.dylib common/target/classes/org/apache/comet/darwin/aarch64
cd core && RUSTFLAGS="-Ctarget-cpu=native" cargo build --features nightly --release
cd core && RUSTFLAGS="-Ctarget-cpu=native" cargo build --release
mkdir -p common/target/classes/org/apache/comet/linux/aarch64
cp core/target/release/libcomet.so common/target/classes/org/apache/comet/linux/aarch64
jar -cf common/target/comet-native-aarch64.jar \
@@ -70,13 +70,16 @@

release-linux: clean
rustup target add aarch64-apple-darwin x86_64-apple-darwin
cd core && RUSTFLAGS="-Ctarget-cpu=apple-m1" CC=arm64-apple-darwin21.4-clang CXX=arm64-apple-darwin21.4-clang++ CARGO_FEATURE_NEON=1 cargo build --target aarch64-apple-darwin --features nightly --release
cd core && RUSTFLAGS="-Ctarget-cpu=skylake -Ctarget-feature=-prefer-256-bit" CC=o64-clang CXX=o64-clang++ cargo build --target x86_64-apple-darwin --features nightly --release
cd core && RUSTFLAGS="-Ctarget-cpu=native -Ctarget-feature=-prefer-256-bit" cargo build --features nightly --release
cd core && RUSTFLAGS="-Ctarget-cpu=apple-m1" CC=arm64-apple-darwin21.4-clang CXX=arm64-apple-darwin21.4-clang++ CARGO_FEATURE_NEON=1 cargo build --target aarch64-apple-darwin --release
cd core && RUSTFLAGS="-Ctarget-cpu=skylake -Ctarget-feature=-prefer-256-bit" CC=o64-clang CXX=o64-clang++ cargo build --target x86_64-apple-darwin --release
cd core && RUSTFLAGS="-Ctarget-cpu=native -Ctarget-feature=-prefer-256-bit" cargo build --release
./mvnw install -Prelease -DskipTests $(PROFILES)
release:
cd core && RUSTFLAGS="-Ctarget-cpu=native" cargo build --features nightly --release
cd core && RUSTFLAGS="-Ctarget-cpu=native" cargo build --release
./mvnw install -Prelease -DskipTests $(PROFILES)
+ release-nogit:
+ cd core && RUSTFLAGS="-Ctarget-cpu=native" cargo build --features nightly --release
+ ./mvnw install -Prelease -DskipTests $(PROFILES) -Dmaven.gitcommitid.skip=true
benchmark-%: clean release
cd spark && COMET_CONF_DIR=$(shell pwd)/conf MAVEN_OPTS='-Xmx20g' ../mvnw exec:java -Dexec.mainClass="$*" -Dexec.classpathScope="test" -Dexec.cleanupDaemonThreads="false" -Dexec.args="$(filter-out $@,$(MAKECMDGOALS))" $(PROFILES)
.DEFAULT:
98 changes: 63 additions & 35 deletions README.md
@@ -19,58 +19,86 @@ under the License.

# Apache DataFusion Comet

- Apache DataFusion Comet is an Apache Spark plugin that uses [Apache DataFusion](https://datafusion.apache.org/)
- as native runtime to achieve improvement in terms of query efficiency and query runtime.
+ Apache DataFusion Comet is a high-performance accelerator for Apache Spark, built on top of the powerful
+ [Apache DataFusion](https://datafusion.apache.org) query engine. Comet is designed to significantly enhance the
+ performance of Apache Spark workloads while leveraging commodity hardware and seamlessly integrating with the
+ Spark ecosystem without requiring any code changes.

- Comet runs Spark SQL queries using the native DataFusion runtime, which is
- typically faster and more resource efficient than JVM based runtimes.
+ # Benefits of Using Comet

- <a href="docs/source/_static/images/comet-overview.png"><img src="docs/source/_static/images/comet-system-diagram.png" align="center" width="500" ></a>
+ ## Run Spark Queries at DataFusion Speeds

- Comet aims to support:
+ Comet delivers a performance speedup for many queries, enabling faster data processing and shorter time-to-insights.

- - a native Parquet implementation, including both reader and writer
- - full implementation of Spark operators, including
-   Filter/Project/Aggregation/Join/Exchange etc.
- - full implementation of Spark built-in expressions
- - a UDF framework for users to migrate their existing UDF to native
+ The following chart shows the time it takes to run the 22 TPC-H queries against 100 GB of data in Parquet format
+ using a single executor with 8 cores. See the [Comet Benchmarking Guide](https://datafusion.apache.org/comet/contributor-guide/benchmarking.html)
+ for details of the environment used for these benchmarks.

- ## Architecture
+ When using Comet, the overall run time is reduced from 649 seconds to 440 seconds, a 1.5x speedup.

- The following diagram illustrates the architecture of Comet:
+ Running the same queries with DataFusion standalone (without Spark) using the same number of cores results in a 3.9x
+ speedup compared to Spark.

- <a href="docs/source/_static/images/comet-overview.png"><img src="docs/source/_static/images/comet-overview.png" align="center" height="600" width="750" ></a>
+ Comet is not yet achieving full DataFusion speeds in all cases, but with future work we aim to provide a 2x-4x speedup
+ for many use cases.
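As an editorial aside (not part of this commit), the quoted multipliers are simply ratios of the total wall-clock times reported above:

$$\text{speedup} = \frac{649\ \text{s}}{440\ \text{s}} \approx 1.48 \approx 1.5\times$$

The 3.9x figure for standalone DataFusion is computed the same way from its own total run time.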

- ## Current Status
+ ![](docs/source/_static/images/tpch_allqueries.png)

- The project is currently integrated into Apache Spark 3.2, 3.3, and 3.4.
+ Here is a breakdown showing relative performance of Spark, Comet, and DataFusion for each TPC-H query.

- ## Feature Parity with Apache Spark
+ ![](docs/source/_static/images/tpch_queries_compare.png)

- The project strives to keep feature parity with Apache Spark, that is,
- users should expect the same behavior (w.r.t features, configurations,
- query results, etc) with Comet turned on or turned off in their Spark
- jobs. In addition, Comet extension should automatically detect unsupported
- features and fallback to Spark engine.
+ The following chart shows how much Comet currently accelerates each query from the benchmark. Performance optimization
+ is an ongoing task, and we welcome contributions from the community to help achieve even greater speedups in the future.

- To achieve this, besides unit tests within Comet itself, we also re-use
- Spark SQL tests and make sure they all pass with Comet extension
- enabled.
+ ![](docs/source/_static/images/tpch_queries_speedup.png)

- ## Supported Platforms
+ These benchmarks can be reproduced in any environment using the documentation in the
+ [Comet Benchmarking Guide](https://datafusion.apache.org/comet/contributor-guide/benchmarking.html). We encourage
+ you to run your own benchmarks.

- Linux, Apple OSX (Intel and M1)
+ ## Use Commodity Hardware

- ## Requirements
+ Comet leverages commodity hardware, eliminating the need for costly hardware upgrades or
+ specialized hardware accelerators, such as GPUs or FPGAs. By maximizing the utilization of commodity hardware, Comet
+ ensures cost-effectiveness and scalability for your Spark deployments.

- - Apache Spark 3.2, 3.3, or 3.4
- - JDK 8, 11 and 17 (JDK 11 recommended because Spark 3.2 doesn't support 17)
- - GLIBC 2.17 (Centos 7) and up
+ ## Spark Compatibility

- ## Getting started
+ Comet aims for 100% compatibility with all supported versions of Apache Spark, allowing you to integrate Comet into
+ your existing Spark deployments and workflows seamlessly. With no code changes required, you can immediately harness
+ the benefits of Comet's acceleration capabilities without disrupting your Spark applications.
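As an illustrative aside (not part of this commit): the "no code changes" claim means Comet is enabled purely through Spark configuration. Below is a minimal sketch; the plugin class and `spark.comet.*` keys are recalled from the Comet user guide of this period and should be verified against the linked installation docs, and the Parquet path is a placeholder.

```scala
import org.apache.spark.sql.SparkSession

// Minimal sketch: assumes the Comet jar is already on the driver and executor
// classpath (for example via --jars). Verify config key names against the
// Comet installation guide for your version.
object CometQuickstart {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("comet-quickstart")
      // Load Comet as a Spark plugin; the application code itself is unchanged.
      .config("spark.plugins", "org.apache.spark.CometPlugin")
      // Turn the accelerator on; unsupported plans fall back to regular Spark.
      .config("spark.comet.enabled", "true")
      .config("spark.comet.exec.enabled", "true")
      .getOrCreate()

    // Existing Spark SQL runs as-is; supported operators execute natively.
    spark.read.parquet("/path/to/data.parquet").createOrReplaceTempView("t")
    spark.sql("SELECT COUNT(*) FROM t").show()

    spark.stop()
  }
}
```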

- See the [DataFusion Comet User Guide](https://datafusion.apache.org/comet/user-guide/installation.html) for installation instructions.
+ ## Tight Integration with Apache DataFusion

+ Comet tightly integrates with the core Apache DataFusion project, leveraging its powerful execution engine. With
+ seamless interoperability between Comet and DataFusion, you can achieve optimal performance and efficiency in your
+ Spark workloads.

+ ## Active Community

+ Comet boasts a vibrant and active community of developers, contributors, and users dedicated to advancing the
+ capabilities of Apache DataFusion and accelerating the performance of Apache Spark.

+ ## Getting Started

+ To get started with Apache DataFusion Comet, follow the
+ [installation instructions](https://datafusion.apache.org/comet/user-guide/installation.html). Join the
+ [DataFusion Slack and Discord channels](https://datafusion.apache.org/contributor-guide/communication.html) to connect
+ with other users, ask questions, and share your experiences with Comet.

## Contributing
- See the [DataFusion Comet Contribution Guide](https://datafusion.apache.org/comet/contributor-guide/contributing.html)
- for information on how to get started contributing to the project.

+ We welcome contributions from the community to help improve and enhance Apache DataFusion Comet. Whether it's fixing
+ bugs, adding new features, writing documentation, or optimizing performance, your contributions are invaluable in
+ shaping the future of Comet. Check out our
+ [contributor guide](https://datafusion.apache.org/comet/contributor-guide/contributing.html) to get started.

+ ## License

+ Apache DataFusion Comet is licensed under the Apache License 2.0. See the [LICENSE.txt](LICENSE.txt) file for details.

+ ## Acknowledgments

+ We would like to express our gratitude to the Apache DataFusion community for their support and contributions to
+ Comet. Together, we're building a faster, more efficient future for big data processing with Apache Spark.
9 changes: 6 additions & 3 deletions common/src/main/scala/org/apache/comet/CometConf.scala
@@ -29,6 +29,8 @@ import org.apache.spark.network.util.JavaUtils
import org.apache.spark.sql.comet.util.Utils
import org.apache.spark.sql.internal.SQLConf

+ import org.apache.comet.shims.ShimCometConf

/**
* Configurations for a Comet application. Mostly inspired by [[SQLConf]] in Spark.
*
@@ -41,7 +43,7 @@ import org.apache.spark.sql.internal.SQLConf
* which retrieves the config value from the thread-local [[SQLConf]] object. Alternatively, you
* can also explicitly pass a [[SQLConf]] object to the `get` method.
*/
- object CometConf {
+ object CometConf extends ShimCometConf {

/** List of all configs that is used for generating documentation */
val allConfs = new ListBuffer[ConfigEntry[_]]
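As a hedged illustration of the class comment above (the two flavours of `get` are described there; the exact signatures are assumed from that description, and the entry used is one that appears later in this diff):

```scala
import org.apache.spark.sql.internal.SQLConf
import org.apache.comet.CometConf

object ConfigReadSketch {
  def main(args: Array[String]): Unit = {
    // get() reads from the thread-local SQLConf, per the comment above.
    val fromThreadLocal: Boolean = CometConf.COMET_ANSI_MODE_ENABLED.get()

    // Alternatively, pass a SQLConf instance explicitly.
    val conf = new SQLConf
    conf.setConfString("spark.comet.ansi.enabled", "true")
    val fromExplicit: Boolean = CometConf.COMET_ANSI_MODE_ENABLED.get(conf)

    println(s"thread-local: $fromThreadLocal, explicit: $fromExplicit")
  }
}
```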
@@ -361,7 +363,7 @@ object CometConf {
"column to a long column, a float column to a double column, etc. This is automatically" +
"enabled when reading from Iceberg tables.")
.booleanConf
- .createWithDefault(false)
+ .createWithDefault(COMET_SCHEMA_EVOLUTION_ENABLED_DEFAULT)

val COMET_ROW_TO_COLUMNAR_ENABLED: ConfigEntry[Boolean] =
conf("spark.comet.rowToColumnar.enabled")
@@ -382,12 +384,13 @@
.createWithDefault(Seq("Range,InMemoryTableScan"))

val COMET_ANSI_MODE_ENABLED: ConfigEntry[Boolean] = conf("spark.comet.ansi.enabled")
+ .internal()
.doc(
"Comet does not respect ANSI mode in most cases and by default will not accelerate " +
"queries when ansi mode is enabled. Enable this setting to test Comet's experimental " +
"support for ANSI mode. This should not be used in production.")
.booleanConf
- .createWithDefault(false)
+ .createWithDefault(COMET_ANSI_MODE_ENABLED_DEFAULT)

val COMET_CAST_ALLOW_INCOMPATIBLE: ConfigEntry[Boolean] =
conf("spark.comet.cast.allowIncompatible")
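The entries in this file all follow the builder pattern visible above: `conf(key)`, optional `.internal()`, `.doc(...)`, a type such as `.booleanConf`, then `.createWithDefault(...)`. As a sketch of how a new flag would be declared inside `object CometConf` under that pattern (the key and field name below are invented purely for illustration and are not part of this commit):

```scala
// Hypothetical entry, declared inside object CometConf alongside the ones above.
val COMET_EXAMPLE_FLAG_ENABLED: ConfigEntry[Boolean] =
  conf("spark.comet.example.enabled")
    .internal() // keep it out of generated user docs, like spark.comet.ansi.enabled
    .doc("Illustrative flag only; not a real Comet configuration.")
    .booleanConf
    .createWithDefault(false)
```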
@@ -0,0 +1,25 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

package org.apache.comet.shims

trait ShimCometConf {
protected val COMET_SCHEMA_EVOLUTION_ENABLED_DEFAULT = false
protected val COMET_ANSI_MODE_ENABLED_DEFAULT = false
}
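Routing `createWithDefault` through these shim constants (see the CometConf changes above) lets a per-Spark-version source tree supply a ShimCometConf with different defaults while CometConf itself stays unchanged. A small hedged sketch of the mix-in follows; the demo object is hypothetical and not part of this commit.

```scala
package org.apache.comet.shims

// Hypothetical demo: any object that mixes in ShimCometConf sees the defaults
// as ordinary members, which is how `object CometConf extends ShimCometConf`
// consumes them in its createWithDefault(...) calls.
object ShimDefaultsDemo extends ShimCometConf {
  def main(args: Array[String]): Unit = {
    println(s"schema evolution default = $COMET_SCHEMA_EVOLUTION_ENABLED_DEFAULT")
    println(s"ansi mode default        = $COMET_ANSI_MODE_ENABLED_DEFAULT")
  }
}
```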