diff --git a/.dockerignore b/.dockerignore
new file mode 100644
index 000000000..332acff44
--- /dev/null
+++ b/.dockerignore
@@ -0,0 +1,18 @@
+.git
+.github
+.idea
+bin
+conf
+docs/build
+docs/temp
+docs/venv
+metastore_db
+target
+common/target
+spark-integration/target
+fuzz-testing/target
+spark/target
+native/target
+core/target
+spark-warehouse
+venv
diff --git a/.github/workflows/docker-publish.yml b/.github/workflows/docker-publish.yml
new file mode 100644
index 000000000..daa6db324
--- /dev/null
+++ b/.github/workflows/docker-publish.yml
@@ -0,0 +1,67 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+name: Publish Docker images
+
+concurrency:
+  group: ${{ github.repository }}-${{ github.head_ref || github.sha }}-${{ github.workflow }}
+  cancel-in-progress: true
+
+on:
+  push:
+    tags:
+      - '*.*.*'
+      - '*.*.*-rc*'
+      - 'test-docker-publish-*'
+
+jobs:
+  docker:
+    name: Docker
+    runs-on: ubuntu-22.04
+    permissions:
+      contents: read
+      packages: write
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+      - name: Set up Java
+        uses: actions/setup-java@v3
+        with:
+          distribution: 'temurin'
+          java-version: '17'
+      - name: Extract Comet version
+        id: extract_version
+        run: |
+          COMET_VERSION=$(mvn help:evaluate -Dexpression=project.version -q -DforceStdout)
+          echo "COMET_VERSION=$COMET_VERSION" >> $GITHUB_ENV
+      - name: Echo Comet version
+        run: echo "The current Comet version is ${{ env.COMET_VERSION }}"
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+      - name: Login to GitHub Container Registry
+        uses: docker/login-action@v3
+        with:
+          registry: ghcr.io
+          username: ${{ github.actor }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+      - name: Build and push
+        uses: docker/build-push-action@v6
+        with:
+          platforms: linux/amd64,linux/arm64
+          push: true
+          tags: ghcr.io/apache/datafusion-comet:spark-3.4-scala-2.12-${{ env.COMET_VERSION }}
+          file: kube/Dockerfile
diff --git a/docs/source/user-guide/installation.md b/docs/source/user-guide/installation.md
index c17bf77bb..a87e41e7a 100644
--- a/docs/source/user-guide/installation.md
+++ b/docs/source/user-guide/installation.md
@@ -32,7 +32,11 @@ Make sure the following requirements are met and software installed on your mach
 
 - JDK 8 and up
 - GLIBC 2.17 (Centos 7) and up
 
-## Using a Published Binary Release
+## Using a Published Docker Image
+
+Docker images are available at https://github.com/orgs/apache/packages?repo_name=datafusion-comet
+
+## Using a Published JAR File
 
 There are no published binary releases yet.
@@ -151,3 +155,14 @@ To enable columnar shuffle which supports all partitioning and basic complex typ
 ```
 --conf spark.comet.exec.shuffle.mode=jvm
 ```
+
+### Memory tuning
+
+In addition to the Apache Spark memory configuration parameters, Comet introduces its own parameters to configure memory allocation for native execution.
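+For example, to allocate an additional 2 GB per executor for Comet's native execution (an illustrative value, not a recommendation):
+
+```
+--conf spark.comet.memoryOverhead=2g
+```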
+
+See [Comet Memory Tuning](./tuning.md) for more details.
\ No newline at end of file
diff --git a/docs/source/user-guide/tuning.md b/docs/source/user-guide/tuning.md
index 876f498d5..9704bade8 100644
--- a/docs/source/user-guide/tuning.md
+++ b/docs/source/user-guide/tuning.md
@@ -37,6 +37,27 @@ Comet will allocate at least `spark.comet.memory.overhead.min` memory. If both
 `spark.comet.memoryOverhead` and `spark.comet.memory.overhead.factor` are set,
 the former will be used.
 
+## Memory Tuning using CometPlugin
+
+Configuring memory for Spark and Comet can be a tedious task, as it requires tuning both the Spark executor memory overhead and the Comet memory overhead settings. Comet provides a Spark plugin, `CometPlugin`, which can be added to your Spark application to help with memory settings.
+
+For users running Comet on a cluster manager such as Kubernetes or YARN, `CometPlugin` can also make the resource manager correctly respect the Comet memory parameters (`spark.comet.memory*`).
+To enable it, pass the additional Spark configuration parameter `--conf spark.plugins=org.apache.spark.CometPlugin` on the command line, as shown in the example below.
+
+The resource manager respects the Apache Spark memory configuration before starting the containers.
+
+The `CometPlugin` plugin overrides `spark.executor.memoryOverhead`, adding the Comet memory configuration on top of it.
+
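+A minimal `spark-submit` example (the memory values shown are illustrative, and `my-app.jar` is a placeholder for your application):
+
+```
+spark-submit \
+    --conf spark.plugins=org.apache.spark.CometPlugin \
+    --conf spark.executor.memory=4g \
+    --conf spark.comet.memoryOverhead=2g \
+    my-app.jar
+```
+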
 ## Shuffle
 
 Comet provides Comet shuffle features that can be used to improve the performance of your queries.
diff --git a/kube/Dockerfile b/kube/Dockerfile
index d06651770..4fbdd89d1 100644
--- a/kube/Dockerfile
+++ b/kube/Dockerfile
@@ -21,7 +21,6 @@ USER root
 
 # Installing JDK11 as the image comes with JRE
 RUN apt update \
-    && apt install -y git \
     && apt install -y curl \
     && apt install -y openjdk-11-jdk \
     && apt clean
@@ -32,14 +31,37 @@ ENV RUSTFLAGS="-C debuginfo=line-tables-only -C incremental=false"
 ENV SPARK_VERSION=3.4
 ENV SCALA_VERSION=2.12
 
+# copy source files to Docker image
+RUN mkdir /comet
+WORKDIR /comet
+
+# build native code first so that this layer can be re-used
+# if only Scala code gets modified
+COPY rust-toolchain.toml /comet/rust-toolchain.toml
+COPY native /comet/native
+RUN cd native && RUSTFLAGS="-Ctarget-cpu=native" cargo build --release
+
+# copy the rest of the project
+COPY .mvn /comet/.mvn
+COPY mvnw /comet/mvnw
+COPY common /comet/common
+COPY dev /comet/dev
+COPY docs /comet/docs
+COPY fuzz-testing /comet/fuzz-testing
+COPY spark /comet/spark
+COPY spark-integration /comet/spark-integration
+COPY scalafmt.conf /comet/scalafmt.conf
+COPY .scalafix.conf /comet/.scalafix.conf
+COPY Makefile /comet/Makefile
+COPY pom.xml /comet/pom.xml
+
 # Pick the JDK instead of JRE to compile Comet
-RUN cd /opt \
-    && git clone https://github.com/apache/datafusion-comet.git \
-    && cd datafusion-comet \
-    && JAVA_HOME=$(readlink -f $(which javac) | sed "s/\/bin\/javac//") make release PROFILES="-Pspark-$SPARK_VERSION -Pscala-$SCALA_VERSION"
+RUN cd /comet \
+    && JAVA_HOME=$(readlink -f $(which javac) | sed "s/\/bin\/javac//") make release-nogit PROFILES="-Pspark-$SPARK_VERSION -Pscala-$SCALA_VERSION"
 
 FROM apache/spark:3.4.2
 ENV SPARK_VERSION=3.4
 ENV SCALA_VERSION=2.12
 USER root
-COPY --from=builder /opt/datafusion-comet/spark/target/comet-spark-spark${SPARK_VERSION}_$SCALA_VERSION-0.2.0-SNAPSHOT.jar $SPARK_HOME/jars
\ No newline at end of file
+# note the use of a wildcard in the file name so that this works with both snapshot and final release versions
+COPY --from=builder /comet/spark/target/comet-spark-spark${SPARK_VERSION}_$SCALA_VERSION-0.2.0*.jar $SPARK_HOME/jars
diff --git a/native/core/src/execution/datafusion/expressions/checkoverflow.rs b/native/core/src/execution/datafusion/expressions/checkoverflow.rs
index 044b366e3..e4f54a1b8 100644
--- a/native/core/src/execution/datafusion/expressions/checkoverflow.rs
+++ b/native/core/src/execution/datafusion/expressions/checkoverflow.rs
@@ -23,7 +23,7 @@ use std::{
 };
 
 use arrow::{
-    array::{as_primitive_array, Array, ArrayRef, Decimal128Array, PrimitiveArray},
+    array::{as_primitive_array, Array, ArrayRef, Decimal128Array},
     datatypes::{Decimal128Type, DecimalType},
     record_batch::RecordBatch,
 };
@@ -111,30 +111,14 @@ impl PhysicalExpr for CheckOverflow {
 
         let casted_array = if self.fail_on_error {
             // Returning error if overflow
-            let iter = decimal_array
-                .iter()
-                .map(|v| {
-                    v.map(|v| {
-                        Decimal128Type::validate_decimal_precision(v, *precision).map(|_| v)
-                    })
-                    .map_or(Ok(None), |r| r.map(Some))
-                })
-                .collect::<Result<Vec<_>, _>>()?
-                .into_iter();
-            unsafe { PrimitiveArray::<Decimal128Type>::from_trusted_len_iter(iter) }
+            decimal_array.validate_decimal_precision(*precision)?;
+            decimal_array
         } else {
             // Overflowing gets null value
-            let iter = decimal_array.iter().map(|v| {
-                v.and_then(|v| {
-                    Decimal128Type::validate_decimal_precision(v, *precision)
-                        .map(|_| v)
-                        .ok()
-                })
-            });
-            unsafe { PrimitiveArray::<Decimal128Type>::from_trusted_len_iter(iter) }
+            &decimal_array.null_if_overflow_precision(*precision)
         };
 
-        let new_array = Decimal128Array::from(casted_array.to_data())
+        let new_array = Decimal128Array::from(casted_array.into_data())
             .with_precision_and_scale(*precision, *scale)
             .map(|a| Arc::new(a) as ArrayRef)?;
 
diff --git a/pom.xml b/pom.xml
index 311437cc9..d41a57dbc 100644
--- a/pom.xml
+++ b/pom.xml
@@ -588,6 +588,10 @@ under the License.
       </properties>
     </profile>
 
+    <profile>
+      <id>scala-2.12</id>
+    </profile>
+
     <profile>
       <id>scala-2.13</id>
       <properties>
@@ -938,6 +942,7 @@ under the License.
             <exclude>**/build/**</exclude>
             <exclude>**/target/**</exclude>
             <exclude>**/apache-spark/**</exclude>
+            <exclude>.dockerignore</exclude>
            <exclude>.git/**</exclude>
             <exclude>.github/**</exclude>
             <exclude>.gitignore</exclude>
@@ -963,7 +968,7 @@ under the License.
             <exclude>docs/source/_static/images/**</exclude>
             <exclude>dev/release/rat_exclude_files.txt</exclude>
             <exclude>dev/release/requirements.txt</exclude>
-            <exclude>native/core/src/execution/generated/**</exclude>
+            <exclude>native/proto/src/generated/**</exclude>
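
For reference, a minimal standalone sketch (not part of the change above) showing the behavior of the two arrow-rs kernels that `CheckOverflow` now delegates to; the array contents are made up for illustration:

```rust
use arrow::array::{Array, Decimal128Array};

fn main() {
    // A decimal(10, 2) array holding 123.45, 0.99, and a null.
    let array = Decimal128Array::from(vec![Some(12345_i128), Some(99_i128), None])
        .with_precision_and_scale(10, 2)
        .unwrap();

    // fail_on_error = true path: validation returns an error because
    // 12345 needs 5 digits, which does not fit precision 3.
    assert!(array.validate_decimal_precision(3).is_err());

    // fail_on_error = false path: overflowing values are replaced with null.
    let nulled = array.null_if_overflow_precision(3);
    assert!(nulled.is_null(0)); // 123.45 overflows precision 3
    assert_eq!(nulled.value(1), 99); // 0.99 fits
    assert!(nulled.is_null(2)); // existing null is preserved
}
```

Delegating to these kernels removes the hand-rolled `unsafe { from_trusted_len_iter }` loops while keeping both the fail-fast and null-on-overflow semantics.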