
Commit 8a95de9

Merge branch 'main' into issue-521

vaibhawvipul committed Jul 8, 2024
2 parents 6e843da + a8433b5
Showing 475 changed files with 84,563 additions and 2,223 deletions.
2 changes: 1 addition & 1 deletion .github/actions/java-test/action.yaml
@@ -33,7 +33,7 @@ runs:
     - name: Run Cargo build
       shell: bash
       run: |
-        cd core
+        cd native
         cargo build
     - name: Cache Maven dependencies

8 changes: 4 additions & 4 deletions .github/actions/rust-test/action.yaml
@@ -22,19 +22,19 @@ runs:
     - name: Check Cargo fmt
       shell: bash
       run: |
-        cd core
+        cd native
         cargo fmt --all -- --check --color=never
     - name: Check Cargo clippy
       shell: bash
       run: |
-        cd core
+        cd native
         cargo clippy --color=never -- -D warnings
     - name: Check compilation
       shell: bash
       run: |
-        cd core
+        cd native
         cargo check --benches
     - name: Cache Maven dependencies
@@ -56,5 +56,5 @@ runs:
     - name: Run Cargo test
       shell: bash
      run: |
-        cd core
+        cd native
         RUST_BACKTRACE=1 cargo test
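
The four renamed steps above map one-to-one onto commands that can be run locally. A minimal sketch, assuming a checkout that includes the core-to-native rename and a working Rust toolchain:

    # Local equivalent of the rust-test action steps (paths as renamed in this diff)
    cd native
    cargo fmt --all -- --check --color=never    # formatting gate
    cargo clippy --color=never -- -D warnings   # lint gate; any warning fails
    cargo check --benches                       # compile check, including benchmarks
    RUST_BACKTRACE=1 cargo test                 # unit tests with backtraces on panic
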
6 changes: 3 additions & 3 deletions .github/workflows/benchmark-tpch.yml
@@ -76,8 +76,8 @@ jobs:
         with:
           name: libcomet-${{ github.run_id }}
           path: |
-            core/target/release/libcomet.so
-            core/target/release/libcomet.dylib
+            native/target/release/libcomet.so
+            native/target/release/libcomet.dylib
           retention-days: 1 # remove the artifact after 1 day, only valid for this workflow
           overwrite: true
       - name: Generate TPC-H (SF=1) table data
@@ -119,7 +119,7 @@ jobs:
         uses: actions/download-artifact@v4
         with:
           name: libcomet-${{ github.run_id }}
-          path: core/target/release
+          path: native/target/release
       - name: Run TPC-H queries
         run: |
           SPARK_HOME=`pwd` SPARK_TPCH_DATA=`pwd`/tpch/sf1_parquet ./mvnw -B -Prelease -Dsuites=org.apache.spark.sql.CometTPCHQuerySuite test

6 changes: 3 additions & 3 deletions .github/workflows/benchmark.yml
@@ -83,8 +83,8 @@ jobs:
         with:
           name: libcomet-${{ github.run_id }}
           path: |
-            core/target/release/libcomet.so
-            core/target/release/libcomet.dylib
+            native/target/release/libcomet.so
+            native/target/release/libcomet.dylib
           retention-days: 1 # remove the artifact after 1 day, only valid for this workflow
           overwrite: true
       - name: Build tpcds-kit
@@ -134,7 +134,7 @@ jobs:
         uses: actions/download-artifact@v4
         with:
           name: libcomet-${{ github.run_id }}
-          path: core/target/release
+          path: native/target/release
       - name: Run TPC-DS queries (Sort merge join)
         if: matrix.join == 'sort_merge'
         run: |

21 changes: 5 additions & 16 deletions .github/workflows/pr_build.yml
@@ -46,7 +46,7 @@ jobs:
         os: [ubuntu-latest]
         java_version: [8, 11, 17]
         test-target: [rust, java]
-        spark-version: ['3.4']
+        spark-version: ['3.5']
         scala-version: ['2.12', '2.13']
         is_push_event:
           - ${{ github.event_name == 'push' }}
@@ -109,15 +109,8 @@ jobs:
         os: [ubuntu-latest]
         java_version: [8, 11, 17]
         test-target: [java]
-        spark-version: ['3.2', '3.3']
+        spark-version: ['3.3', '3.4']
         scala-version: ['2.12', '2.13']
-        exclude:
-          - java_version: 17
-            spark-version: '3.2'
-          - java_version: 11
-            spark-version: '3.2'
-          - spark-version: '3.2'
-            scala-version: '2.13'
       fail-fast: false
     name: ${{ matrix.os }}/java ${{ matrix.java_version }}-spark-${{matrix.spark-version}}-scala-${{matrix.scala-version}}/${{ matrix.test-target }}
     runs-on: ${{ matrix.os }}
@@ -141,7 +134,7 @@ jobs:
         os: [macos-13]
         java_version: [8, 11, 17]
         test-target: [rust, java]
-        spark-version: ['3.4']
+        spark-version: ['3.4', '3.5']
         scala-version: ['2.12', '2.13']
       fail-fast: false
     if: github.event_name == 'push'
@@ -168,7 +161,7 @@ jobs:
       matrix:
         java_version: [8, 11, 17]
         test-target: [rust, java]
-        spark-version: ['3.4']
+        spark-version: ['3.4', '3.5']
         scala-version: ['2.12', '2.13']
         is_push_event:
           - ${{ github.event_name == 'push' }}
@@ -254,15 +247,11 @@ jobs:
       matrix:
         java_version: [8, 17]
         test-target: [java]
-        spark-version: ['3.2', '3.3']
+        spark-version: ['3.3', '3.4']
         scala-version: ['2.12', '2.13']
         exclude:
-          - java_version: 17
-            spark-version: '3.2'
           - java_version: 8
             spark-version: '3.3'
-          - spark-version: '3.2'
-            scala-version: '2.13'
       fail-fast: false
     name: macos-14(Silicon)/java ${{ matrix.java_version }}-spark-${{matrix.spark-version}}-scala-${{matrix.scala-version}}/${{ matrix.test-target }}
     runs-on: macos-14

3 changes: 2 additions & 1 deletion .github/workflows/spark_sql_test.yml
@@ -45,7 +45,7 @@ jobs:
       matrix:
         os: [ubuntu-latest]
         java-version: [11]
-        spark-version: [{short: '3.4', full: '3.4.3'}]
+        spark-version: [{short: '3.4', full: '3.4.3'}, {short: '3.5', full: '3.5.1'}]
         module:
           - {name: "catalyst", args1: "catalyst/test", args2: ""}
           - {name: "sql/core-1", args1: "", args2: sql/testOnly * -- -l org.apache.spark.tags.ExtendedSQLTest -l org.apache.spark.tags.SlowSQLTest}
@@ -75,6 +75,7 @@ jobs:
       - name: Run Spark tests
         run: |
           cd apache-spark
+          rm -rf /root/.m2/repository/org/apache/parquet # somehow parquet cache requires cleanups
           ENABLE_COMET=true build/sbt ${{ matrix.module.args1 }} "${{ matrix.module.args2 }}"
         env:
           LC_ALL: "C.UTF-8"

27 changes: 13 additions & 14 deletions .github/workflows/spark_sql_test_ansi.yml
@@ -22,17 +22,15 @@ concurrency:
   cancel-in-progress: true
 
 on:
-  # enable the following once Ansi support is completed
-  # push:
-  #   paths-ignore:
-  #     - "doc/**"
-  #     - "**.md"
-  # pull_request:
-  #   paths-ignore:
-  #     - "doc/**"
-  #     - "**.md"
-
-  # manual trigger ONLY
+  push:
+    paths-ignore:
+      - "docs/**"
+      - "**.md"
+  pull_request:
+    paths-ignore:
+      - "docs/**"
+      - "**.md"
+  # manual trigger
   # https://docs.github.com/en/actions/managing-workflow-runs/manually-running-a-workflow
   workflow_dispatch:
 
@@ -44,8 +42,8 @@ jobs:
   strategy:
     matrix:
       os: [ubuntu-latest]
-      java-version: [11]
-      spark-version: [{short: '3.4', full: '3.4.2'}]
+      java-version: [17]
+      spark-version: [{short: '4.0', full: '4.0.0-preview1'}]
       module:
         - {name: "catalyst", args1: "catalyst/test", args2: ""}
         - {name: "sql/core-1", args1: "", args2: sql/testOnly * -- -l org.apache.spark.tags.ExtendedSQLTest -l org.apache.spark.tags.SlowSQLTest}
@@ -75,7 +73,8 @@ jobs:
     - name: Run Spark tests
       run: |
        cd apache-spark
-        ENABLE_COMET=true ENABLE_COMET_ANSI_MODE=true build/sbt ${{ matrix.module.args1 }} "${{ matrix.module.args2 }}"
+        rm -rf /root/.m2/repository/org/apache/parquet # somehow parquet cache requires cleanups
+        RUST_BACKTRACE=1 ENABLE_COMET=true ENABLE_COMET_ANSI_MODE=true build/sbt ${{ matrix.module.args1 }} "${{ matrix.module.args2 }}"
       env:
         LC_ALL: "C.UTF-8"
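
The essential change here is the new sbt invocation. A hypothetical local equivalent of one matrix entry, assuming an apache-spark checkout prepared the same way the workflow prepares it (the catalyst/test argument comes from the catalyst module in the matrix):

    cd apache-spark
    RUST_BACKTRACE=1 ENABLE_COMET=true ENABLE_COMET_ANSI_MODE=true \
      build/sbt catalyst/test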

2 changes: 1 addition & 1 deletion .gitignore
@@ -8,7 +8,7 @@ derby.log
 metastore_db/
 spark-warehouse/
 dependency-reduced-pom.xml
-core/src/execution/generated
+native/core/src/execution/generated
 prebuild
 .flattened-pom.xml
 rat.txt

28 changes: 28 additions & 0 deletions LICENSE.txt
@@ -201,6 +201,7 @@
    See the License for the specific language governing permissions and
    limitations under the License.
 
+--------------------------------------------------------------------------------
 
 This project includes code from Apache Aurora.
 
@@ -210,3 +211,30 @@ This project includes code from Apache Aurora.
 Copyright: 2016 The Apache Software Foundation.
 Home page: https://aurora.apache.org/
 License: http://www.apache.org/licenses/LICENSE-2.0
+
+--------------------------------------------------------------------------------
+
+This project includes software from the twox-hash project
+https://github.com/shepmaster/twox-hash
+
+The MIT License (MIT)
+
+Copyright (c) 2015 Jake Goulding
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.

36 changes: 18 additions & 18 deletions Makefile
@@ -20,65 +20,65 @@
 all: core jvm
 
 core:
-        cd core && cargo build
+        cd native && cargo build
 test-rust:
         # We need to compile CometException so that the cargo test can pass
         ./mvnw compile -pl common -DskipTests $(PROFILES)
-        cd core && cargo build && \
+        cd native && cargo build && \
         RUST_BACKTRACE=1 cargo test
 jvm:
         ./mvnw clean package -DskipTests $(PROFILES)
 test-jvm: core
         SPARK_HOME=`pwd` COMET_CONF_DIR=$(shell pwd)/conf RUST_BACKTRACE=1 ./mvnw verify $(PROFILES)
 test: test-rust test-jvm
 clean:
-        cd core && cargo clean
+        cd native && cargo clean
         ./mvnw clean $(PROFILES)
         rm -rf .dist
 bench:
-        cd core && RUSTFLAGS="-Ctarget-cpu=native" cargo bench $(filter-out $@,$(MAKECMDGOALS))
+        cd native && RUSTFLAGS="-Ctarget-cpu=native" cargo bench $(filter-out $@,$(MAKECMDGOALS))
 format:
-        cd core && cargo fmt
+        cd native && cargo fmt
         ./mvnw compile test-compile scalafix:scalafix -Psemanticdb $(PROFILES)
         ./mvnw spotless:apply $(PROFILES)
 
 core-amd64:
         rustup target add x86_64-apple-darwin
-        cd core && RUSTFLAGS="-Ctarget-cpu=skylake -Ctarget-feature=-prefer-256-bit" CC=o64-clang CXX=o64-clang++ cargo build --target x86_64-apple-darwin --release
+        cd native && RUSTFLAGS="-Ctarget-cpu=skylake -Ctarget-feature=-prefer-256-bit" CC=o64-clang CXX=o64-clang++ cargo build --target x86_64-apple-darwin --release
         mkdir -p common/target/classes/org/apache/comet/darwin/x86_64
-        cp core/target/x86_64-apple-darwin/release/libcomet.dylib common/target/classes/org/apache/comet/darwin/x86_64
-        cd core && RUSTFLAGS="-Ctarget-cpu=haswell -Ctarget-feature=-prefer-256-bit" cargo build --release
+        cp native/target/x86_64-apple-darwin/release/libcomet.dylib common/target/classes/org/apache/comet/darwin/x86_64
+        cd native && RUSTFLAGS="-Ctarget-cpu=haswell -Ctarget-feature=-prefer-256-bit" cargo build --release
         mkdir -p common/target/classes/org/apache/comet/linux/amd64
-        cp core/target/release/libcomet.so common/target/classes/org/apache/comet/linux/amd64
+        cp native/target/release/libcomet.so common/target/classes/org/apache/comet/linux/amd64
         jar -cf common/target/comet-native-x86_64.jar \
                 -C common/target/classes/org/apache/comet darwin \
                 -C common/target/classes/org/apache/comet linux
         ./dev/deploy-file common/target/comet-native-x86_64.jar comet-native-x86_64${COMET_CLASSIFIER} jar
 
 core-arm64:
         rustup target add aarch64-apple-darwin
-        cd core && RUSTFLAGS="-Ctarget-cpu=apple-m1" CC=arm64-apple-darwin21.4-clang CXX=arm64-apple-darwin21.4-clang++ CARGO_FEATURE_NEON=1 cargo build --target aarch64-apple-darwin --release
+        cd native && RUSTFLAGS="-Ctarget-cpu=apple-m1" CC=arm64-apple-darwin21.4-clang CXX=arm64-apple-darwin21.4-clang++ CARGO_FEATURE_NEON=1 cargo build --target aarch64-apple-darwin --release
         mkdir -p common/target/classes/org/apache/comet/darwin/aarch64
-        cp core/target/aarch64-apple-darwin/release/libcomet.dylib common/target/classes/org/apache/comet/darwin/aarch64
-        cd core && RUSTFLAGS="-Ctarget-cpu=native" cargo build --release
+        cp native/target/aarch64-apple-darwin/release/libcomet.dylib common/target/classes/org/apache/comet/darwin/aarch64
+        cd native && RUSTFLAGS="-Ctarget-cpu=native" cargo build --release
         mkdir -p common/target/classes/org/apache/comet/linux/aarch64
-        cp core/target/release/libcomet.so common/target/classes/org/apache/comet/linux/aarch64
+        cp native/target/release/libcomet.so common/target/classes/org/apache/comet/linux/aarch64
         jar -cf common/target/comet-native-aarch64.jar \
                 -C common/target/classes/org/apache/comet darwin \
                 -C common/target/classes/org/apache/comet linux
         ./dev/deploy-file common/target/comet-native-aarch64.jar comet-native-aarch64${COMET_CLASSIFIER} jar
 
 release-linux: clean
         rustup target add aarch64-apple-darwin x86_64-apple-darwin
-        cd core && RUSTFLAGS="-Ctarget-cpu=apple-m1" CC=arm64-apple-darwin21.4-clang CXX=arm64-apple-darwin21.4-clang++ CARGO_FEATURE_NEON=1 cargo build --target aarch64-apple-darwin --release
-        cd core && RUSTFLAGS="-Ctarget-cpu=skylake -Ctarget-feature=-prefer-256-bit" CC=o64-clang CXX=o64-clang++ cargo build --target x86_64-apple-darwin --release
-        cd core && RUSTFLAGS="-Ctarget-cpu=native -Ctarget-feature=-prefer-256-bit" cargo build --release
+        cd native && RUSTFLAGS="-Ctarget-cpu=apple-m1" CC=arm64-apple-darwin21.4-clang CXX=arm64-apple-darwin21.4-clang++ CARGO_FEATURE_NEON=1 cargo build --target aarch64-apple-darwin --release
+        cd native && RUSTFLAGS="-Ctarget-cpu=skylake -Ctarget-feature=-prefer-256-bit" CC=o64-clang CXX=o64-clang++ cargo build --target x86_64-apple-darwin --release
+        cd native && RUSTFLAGS="-Ctarget-cpu=native -Ctarget-feature=-prefer-256-bit" cargo build --release
         ./mvnw install -Prelease -DskipTests $(PROFILES)
 release:
-        cd core && RUSTFLAGS="-Ctarget-cpu=native" cargo build --release
+        cd native && RUSTFLAGS="-Ctarget-cpu=native" cargo build --release
         ./mvnw install -Prelease -DskipTests $(PROFILES)
 release-nogit:
-        cd core && RUSTFLAGS="-Ctarget-cpu=native" cargo build --features nightly --release
+        cd native && RUSTFLAGS="-Ctarget-cpu=native" cargo build --features nightly --release
        ./mvnw install -Prelease -DskipTests $(PROFILES) -Dmaven.gitcommitid.skip=true
 benchmark-%: clean release
         cd spark && COMET_CONF_DIR=$(shell pwd)/conf MAVEN_OPTS='-Xmx20g' ../mvnw exec:java -Dexec.mainClass="$*" -Dexec.classpathScope="test" -Dexec.cleanupDaemonThreads="false" -Dexec.args="$(filter-out $@,$(MAKECMDGOALS))" $(PROFILES)
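
Every recipe above now enters native/ instead of core/, while the target names themselves are unchanged. A brief sketch of common invocations, assuming GNU make and the toolchains the recipes already require:

    make core       # debug build of the Rust crate under native/
    make test       # test-rust (cargo test) followed by test-jvm (mvnw verify)
    make format     # cargo fmt plus scalafix/spotless for the JVM side
    make release    # optimized native build, then ./mvnw install -Prelease
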
8 changes: 8 additions & 0 deletions NOTICE.txt
@@ -0,0 +1,8 @@
+Apache DataFusion Comet
+Copyright 2024 The Apache Software Foundation
+
+This product includes software developed at
+The Apache Software Foundation (http://www.apache.org/).
+
+This product includes software from the twox-hash project (MIT License)
+https://github.com/shepmaster/twox-hash

5 changes: 3 additions & 2 deletions README.md
@@ -34,13 +34,14 @@ The following chart shows the time it takes to run the 22 TPC-H queries against
 using a single executor with 8 cores. See the [Comet Benchmarking Guide](https://datafusion.apache.org/comet/contributor-guide/benchmarking.html)
 for details of the environment used for these benchmarks.
 
-When using Comet, the overall run time is reduced from 649 seconds to 440 seconds, a 1.5x speedup.
+When using Comet, the overall run time is reduced from 649 seconds to 433 seconds, a 1.5x speedup, with some queries
+showing a 2x-3x speedup.
 
 Running the same queries with DataFusion standalone (without Spark) using the same number of cores results in a 3.9x
 speedup compared to Spark.
 
 Comet is not yet achieving full DataFusion speeds in all cases, but with future work we aim to provide a 2x-4x speedup
-for many use cases.
+for a broader set of queries.
 
 ![](docs/source/_static/images/tpch_allqueries.png)
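
As a quick check on the revised README figures: 649 s / 433 s ≈ 1.50, so the stated 1.5x overall speedup is consistent with the new 433-second total.
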
5 changes: 3 additions & 2 deletions common/pom.xml
@@ -181,6 +181,7 @@ under the License.
             <sources>
               <source>src/main/${shims.majorVerSrc}</source>
               <source>src/main/${shims.minorVerSrc}</source>
+              <source>src/main/${shims.pre35Src}</source>
             </sources>
           </configuration>
         </execution>
@@ -192,14 +193,14 @@ under the License.
       <directory>${project.basedir}/src/main/resources</directory>
     </resource>
     <resource>
-      <directory>${project.basedir}/../core/target/x86_64-apple-darwin/release</directory>
+      <directory>${project.basedir}/../native/target/x86_64-apple-darwin/release</directory>
       <includes>
         <include>libcomet.dylib</include>
       </includes>
       <targetPath>org/apache/comet/darwin/x86_64</targetPath>
     </resource>
     <resource>
-      <directory>${project.basedir}/../core/target/aarch64-apple-darwin/release</directory>
+      <directory>${project.basedir}/../native/target/aarch64-apple-darwin/release</directory>
       <includes>
         <include>libcomet.dylib</include>
       </includes>

3 changes: 2 additions & 1 deletion common/src/main/java/org/apache/arrow/c/ArrowImporter.java
@@ -55,7 +55,8 @@ public FieldVector importVector(
       ArrowArray array, ArrowSchema schema, CDataDictionaryProvider provider) {
     Field field = importField(schema, provider);
     FieldVector vector = field.createVector(allocator);
-    Data.importIntoVector(allocator, array, vector, provider);
+    CometArrayImporter importer = new CometArrayImporter(allocator, vector, provider);
+    importer.importArray(array);
     return vector;
   }
 }