upmerge

andygrove committed Aug 20, 2024
2 parents a868355 + 829656c · commit 20597c8
Showing 7 changed files with 139 additions and 29 deletions.
18 changes: 18 additions & 0 deletions .dockerignore
@@ -0,0 +1,18 @@
.git
.github
.idea
bin
conf
docs/build
docs/temp
docs/venv
metastore_db
target
common/target
spark-integration/target
fuzz-testing/target
spark/target
native/target
core/target
spark-warehouse
venv
63 changes: 63 additions & 0 deletions .github/workflows/docker-publish.yml
@@ -0,0 +1,63 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

name: Publish Docker images

concurrency:
  group: ${{ github.repository }}-${{ github.head_ref || github.sha }}-${{ github.workflow }}
  cancel-in-progress: true

on:
  push:
    tags:
      - '*.*.*'
      - '*.*.*-rc*'
      - 'test-docker-publish-*'

jobs:
  docker:
    name: Docker
    runs-on: ubuntu-22.04
    permissions:
      contents: read
      packages: write
    steps:
      # check out the repository so the Maven build metadata is available
      - uses: actions/checkout@v4
      - name: Set up Java
        uses: actions/setup-java@v3
        with:
          java-version: '17'
      - name: Extract Comet version
        id: extract_version
        run: |
          COMET_VERSION=$(mvn help:evaluate -Dexpression=project.version -q -DforceStdout)
          echo "COMET_VERSION=$COMET_VERSION" >> $GITHUB_ENV
      - name: Echo Comet version
        run: echo "The current Comet version is ${{ env.COMET_VERSION }}"
      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3
      - name: Login to GitHub Container Registry
        uses: docker/login-action@v3
        with:
          registry: ghcr.io
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}
      - name: Build and push
        uses: docker/build-push-action@v6
        with:
          platforms: linux/amd64,linux/arm64
          push: true
          tags: apache/datafusion-comet:spark-3.4-scala-2.12-${{ env.COMET_VERSION }}
          file: kube/Dockerfile
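Once a release tag triggers this workflow, the resulting image should be retrievable from GitHub Container Registry. A minimal sketch of pulling it — assuming the `ghcr.io/apache/datafusion-comet` path implied by the registry login above, and an illustrative `0.2.0` version rather than one taken from this commit:

```shell
# Pull the Spark 3.4 / Scala 2.12 image for an assumed 0.2.0 release.
docker pull ghcr.io/apache/datafusion-comet:spark-3.4-scala-2.12-0.2.0
```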
9 changes: 8 additions & 1 deletion docs/source/user-guide/installation.md
@@ -32,7 +32,11 @@ Make sure the following requirements are met and software installed on your machine
 - JDK 8 and up
 - GLIBC 2.17 (Centos 7) and up
 
-## Using a Published Binary Release
+## Using a Published Docker Image
+
+Docker images are available at https://github.com/orgs/apache/packages?repo_name=datafusion-comet
+
+## Using a Published JAR File
 
 There are no published binary releases yet.
 
@@ -151,3 +155,6 @@ To enable columnar shuffle which supports all partitioning and basic complex types
 ```
 --conf spark.comet.exec.shuffle.mode=jvm
 ```
+
+### Memory tuning
+In addition to the Apache Spark memory configuration parameters, Comet introduces its own parameters for configuring memory allocation for native execution. See [Comet Memory Tuning](./tuning.md) for more.
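To make the cross-reference concrete, here is a hedged sketch of the kind of setting the tuning guide covers, using the `spark.comet.memoryOverhead` parameter it documents; the `4g` value is purely illustrative:

```
--conf spark.comet.memoryOverhead=4g
```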
11 changes: 11 additions & 0 deletions docs/source/user-guide/tuning.md
@@ -37,6 +37,17 @@ Comet will allocate at least `spark.comet.memory.overhead.min` memory.
 
 If both `spark.comet.memoryOverhead` and `spark.comet.memory.overhead.factor` are set, the former will be used.
 
+## Memory Tuning using CometPlugin
+Configuring memory for Spark and Comet can be a tedious task, as it requires tuning the Spark executor memory overhead and the Comet memory overhead configs together. Comet provides a Spark plugin, `CometPlugin`, which can be added to your Spark application to help with these memory settings.
+
+For users running Comet on clusters such as Kubernetes or YARN, `CometPlugin` can also make the resource manager correctly respect the Comet memory parameters (`spark.comet.memory*`).
+To enable it, pass the additional Spark configuration parameter `--conf spark.plugins=org.apache.spark.CometPlugin` on the launch command line.
+
+The resource manager respects the Apache Spark memory configuration before starting the containers.
+
+The `CometPlugin` plugin overrides `spark.executor.memoryOverhead`, adding the Comet memory configuration on top of it.
+
+
 ## Shuffle
 
 Comet provides Comet shuffle features that can be used to improve the performance of your queries.
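As a worked illustration of the plugin wiring described above — a sketch only: `org.example.MyApp`, `my-app.jar`, and the memory sizes are hypothetical placeholders, while the `spark.plugins` setting is the one this page documents:

```shell
# Launch with CometPlugin so the resource manager accounts for Comet's
# native memory overhead on top of spark.executor.memoryOverhead.
spark-submit \
  --class org.example.MyApp \
  --conf spark.plugins=org.apache.spark.CometPlugin \
  --conf spark.executor.memory=8g \
  --conf spark.comet.memoryOverhead=2g \
  my-app.jar
```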
34 changes: 28 additions & 6 deletions kube/Dockerfile
@@ -21,7 +21,6 @@ USER root
 
 # Installing JDK11 as the image comes with JRE
 RUN apt update \
-    && apt install -y git \
     && apt install -y curl \
     && apt install -y openjdk-11-jdk \
     && apt clean
@@ -32,14 +31,37 @@ ENV RUSTFLAGS="-C debuginfo=line-tables-only -C incremental=false"
 ENV SPARK_VERSION=3.4
 ENV SCALA_VERSION=2.12
 
+# copy source files to Docker image
+RUN mkdir /comet
+WORKDIR /comet
+
+# build native code first so that this layer can be re-used
+# if only Scala code gets modified
+COPY rust-toolchain.toml /comet/rust-toolchain.toml
+COPY native /comet/native
+RUN cd native && RUSTFLAGS="-Ctarget-cpu=native" cargo build --release
+
+# copy the rest of the project
+COPY .mvn /comet/.mvn
+COPY mvnw /comet/mvnw
+COPY common /comet/common
+COPY dev /comet/dev
+COPY docs /comet/docs
+COPY fuzz-testing /comet/fuzz-testing
+COPY spark /comet/spark
+COPY spark-integration /comet/spark-integration
+COPY scalafmt.conf /comet/scalafmt.conf
+COPY .scalafix.conf /comet/.scalafix.conf
+COPY Makefile /comet/Makefile
+COPY pom.xml /comet/pom.xml
+
 # Pick the JDK instead of JRE to compile Comet
-RUN cd /opt \
-    && git clone https://github.com/apache/datafusion-comet.git \
-    && cd datafusion-comet \
-    && JAVA_HOME=$(readlink -f $(which javac) | sed "s/\/bin\/javac//") make release PROFILES="-Pspark-$SPARK_VERSION -Pscala-$SCALA_VERSION"
+RUN cd /comet \
+    && JAVA_HOME=$(readlink -f $(which javac) | sed "s/\/bin\/javac//") make release-nogit PROFILES="-Pspark-$SPARK_VERSION -Pscala-$SCALA_VERSION"
 
 FROM apache/spark:3.4.2
 ENV SPARK_VERSION=3.4
 ENV SCALA_VERSION=2.12
 USER root
-COPY --from=builder /opt/datafusion-comet/spark/target/comet-spark-spark${SPARK_VERSION}_$SCALA_VERSION-0.2.0-SNAPSHOT.jar $SPARK_HOME/jars
+# note the use of a wildcard in the file name so that this works with both snapshot and final release versions
+COPY --from=builder /comet/spark/target/comet-spark-spark${SPARK_VERSION}_$SCALA_VERSION-0.2.0*.jar $SPARK_HOME/jars
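The comments in this Dockerfile describe the layer-caching design: the native sources are copied and compiled before the rest of the project, so a change that touches only Scala code re-uses the cached `cargo build` layer. A sketch of building the image locally, with an illustrative tag name:

```shell
# Run from the repository root so the COPY paths above resolve.
docker build -t datafusion-comet:local -f kube/Dockerfile .
```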
26 changes: 5 additions & 21 deletions native/core/src/execution/datafusion/expressions/checkoverflow.rs
@@ -23,7 +23,7 @@ use std::{
 };
 
 use arrow::{
-    array::{as_primitive_array, Array, ArrayRef, Decimal128Array, PrimitiveArray},
+    array::{as_primitive_array, Array, ArrayRef, Decimal128Array},
     datatypes::{Decimal128Type, DecimalType},
     record_batch::RecordBatch,
 };
@@ -111,30 +111,14 @@ impl PhysicalExpr for CheckOverflow {
 
         let casted_array = if self.fail_on_error {
             // Returning error if overflow
-            let iter = decimal_array
-                .iter()
-                .map(|v| {
-                    v.map(|v| {
-                        Decimal128Type::validate_decimal_precision(v, *precision).map(|_| v)
-                    })
-                    .map_or(Ok(None), |r| r.map(Some))
-                })
-                .collect::<Result<Vec<_>, _>>()?
-                .into_iter();
-            unsafe { PrimitiveArray::<Decimal128Type>::from_trusted_len_iter(iter) }
+            decimal_array.validate_decimal_precision(*precision)?;
+            decimal_array
         } else {
             // Overflowing gets null value
-            let iter = decimal_array.iter().map(|v| {
-                v.and_then(|v| {
-                    Decimal128Type::validate_decimal_precision(v, *precision)
-                        .map(|_| v)
-                        .ok()
-                })
-            });
-            unsafe { PrimitiveArray::<Decimal128Type>::from_trusted_len_iter(iter) }
+            &decimal_array.null_if_overflow_precision(*precision)
         };
 
-        let new_array = Decimal128Array::from(casted_array.to_data())
+        let new_array = Decimal128Array::from(casted_array.into_data())
             .with_precision_and_scale(*precision, *scale)
             .map(|a| Arc::new(a) as ArrayRef)?;
 
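For readers unfamiliar with the arrow-rs helpers this change adopts, here is a standalone sketch of their behavior — assuming an arrow version that, like the one this commit builds against, provides `validate_decimal_precision` and `null_if_overflow_precision` on decimal arrays:

```rust
use arrow::array::{Array, Decimal128Array};

fn main() {
    // Three unscaled values at precision 5, scale 2: 123.45, 9.99, null.
    let arr = Decimal128Array::from(vec![Some(12345_i128), Some(999), None])
        .with_precision_and_scale(5, 2)
        .unwrap();

    // Fail-fast path (fail_on_error = true): 12345 needs 5 digits,
    // so validating against precision 4 returns an error.
    assert!(arr.validate_decimal_precision(4).is_err());

    // Lenient path (fail_on_error = false): overflowing values become null.
    let nulled = arr.null_if_overflow_precision(4);
    assert!(nulled.is_null(0));
    assert_eq!(nulled.value(1), 999);
}
```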
7 changes: 6 additions & 1 deletion pom.xml
@@ -588,6 +588,10 @@ under the License.
       </properties>
     </profile>
 
+    <profile>
+      <id>scala-2.12</id>
+    </profile>
+
     <profile>
       <id>scala-2.13</id>
       <properties>
@@ -938,6 +942,7 @@ under the License.
             <exclude>**/build/**</exclude>
             <exclude>**/target/**</exclude>
             <exclude>**/apache-spark/**</exclude>
+            <exclude>.dockerignore</exclude>
             <exclude>.git/**</exclude>
             <exclude>.github/**</exclude>
             <exclude>.gitignore</exclude>
@@ -963,7 +968,7 @@ under the License.
             <exclude>docs/source/_static/images/**</exclude>
             <exclude>dev/release/rat_exclude_files.txt</exclude>
             <exclude>dev/release/requirements.txt</exclude>
-            <exclude>native/core/src/execution/generated/**</exclude>
+            <exclude>native/proto/src/generated/**</exclude>
           </excludes>
         </configuration>
       </plugin>
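The new empty `scala-2.12` profile presumably exists so that builds can pass `-Pscala-2.12` explicitly (as the Dockerfile above does) without Maven rejecting an unknown profile. A usage sketch combining it with the build target from that Dockerfile:

```shell
# Build a release against Spark 3.4 / Scala 2.12, matching the Docker image.
make release PROFILES="-Pspark-3.4 -Pscala-2.12"
```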
