Merge branch 'master' into python-new-timer

apache · Nov 27, 2024 · f10348c · f10348c
2 parents 521722e + 6edcf43
commit f10348c
Show file tree

Hide file tree

Showing 543 changed files with 27,906 additions and 15,213 deletions.
diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
@@ -62,6 +62,8 @@ jobs:
       image_docs_url_link: ${{ steps.infra-image-link.outputs.image_docs_url_link }}
       image_lint_url: ${{ steps.infra-image-lint-outputs.outputs.image_lint_url }}
       image_lint_url_link: ${{ steps.infra-image-link.outputs.image_lint_url_link }}
+      image_sparkr_url: ${{ steps.infra-image-sparkr-outputs.outputs.image_sparkr_url }}
+      image_sparkr_url_link: ${{ steps.infra-image-link.outputs.image_sparkr_url_link }}
     steps:
     - name: Checkout Spark repository
       uses: actions/checkout@v4
@@ -154,6 +156,14 @@ jobs:
         IMG_NAME="apache-spark-ci-image-lint:${{ inputs.branch }}-${{ github.run_id }}"
         IMG_URL="ghcr.io/$REPO_OWNER/$IMG_NAME"
         echo "image_lint_url=$IMG_URL" >> $GITHUB_OUTPUT
+    - name: Generate infra image URL (SparkR)
+      id: infra-image-sparkr-outputs
+      run: |
+        # Convert to lowercase to meet Docker repo name requirement
+        REPO_OWNER=$(echo "${{ github.repository_owner }}" | tr '[:upper:]' '[:lower:]')
+        IMG_NAME="apache-spark-ci-image-sparkr:${{ inputs.branch }}-${{ github.run_id }}"
+        IMG_URL="ghcr.io/$REPO_OWNER/$IMG_NAME"
+        echo "image_sparkr_url=$IMG_URL" >> $GITHUB_OUTPUT
     - name: Link the docker images
       id: infra-image-link
       run: |
@@ -162,9 +172,11 @@ jobs:
         if [[ "${{ inputs.branch }}" == 'branch-3.5' ]]; then
           echo "image_docs_url_link=${{ steps.infra-image-outputs.outputs.image_url }}" >> $GITHUB_OUTPUT
           echo "image_lint_url_link=${{ steps.infra-image-outputs.outputs.image_url }}" >> $GITHUB_OUTPUT
+          echo "image_sparkr_url_link=${{ steps.infra-image-outputs.outputs.image_url }}" >> $GITHUB_OUTPUT
         else
           echo "image_docs_url_link=${{ steps.infra-image-docs-outputs.outputs.image_docs_url }}" >> $GITHUB_OUTPUT
           echo "image_lint_url_link=${{ steps.infra-image-lint-outputs.outputs.image_lint_url }}" >> $GITHUB_OUTPUT
+          echo "image_sparkr_url_link=${{ steps.infra-image-sparkr-outputs.outputs.image_sparkr_url }}" >> $GITHUB_OUTPUT
         fi
 
   # Build: build Spark and run the tests for specified modules.
@@ -405,6 +417,17 @@ jobs:
             ${{ needs.precondition.outputs.image_lint_url }}
           # Use the infra image cache to speed up
           cache-from: type=registry,ref=ghcr.io/apache/spark/apache-spark-github-action-image-lint-cache:${{ inputs.branch }}
+      - name: Build and push (SparkR)
+        if: hashFiles('dev/spark-test-image/sparkr/Dockerfile') != ''
+        id: docker_build_sparkr
+        uses: docker/build-push-action@v6
+        with:
+          context: ./dev/spark-test-image/sparkr/
+          push: true
+          tags: |
+            ${{ needs.precondition.outputs.image_sparkr_url }}
+          # Use the infra image cache to speed up
+          cache-from: type=registry,ref=ghcr.io/apache/spark/apache-spark-github-action-image-sparkr-cache:${{ inputs.branch }}
 
 
   pyspark:
@@ -564,7 +587,7 @@ jobs:
     runs-on: ubuntu-latest
     timeout-minutes: 180
     container:
-      image: ${{ needs.precondition.outputs.image_url }}
+      image: ${{ needs.precondition.outputs.image_sparkr_url_link }}
     env:
       HADOOP_PROFILE: ${{ inputs.hadoop }}
       HIVE_PROFILE: hive2.3

diff --git a/.github/workflows/build_infra_images_cache.yml b/.github/workflows/build_infra_images_cache.yml
@@ -29,6 +29,7 @@ on:
     - 'dev/infra/Dockerfile'
     - 'dev/spark-test-image/docs/Dockerfile'
     - 'dev/spark-test-image/lint/Dockerfile'
+    - 'dev/spark-test-image/sparkr/Dockerfile'
     - '.github/workflows/build_infra_images_cache.yml'
   # Create infra image when cutting down branches/tags
   create:
@@ -88,3 +89,16 @@ jobs:
       - name: Image digest (Linter)
         if: hashFiles('dev/spark-test-image/lint/Dockerfile') != ''
         run: echo ${{ steps.docker_build_lint.outputs.digest }}
+      - name: Build and push (SparkR)
+        if: hashFiles('dev/spark-test-image/sparkr/Dockerfile') != ''
+        id: docker_build_sparkr
+        uses: docker/build-push-action@v6
+        with:
+          context: ./dev/spark-test-image/sparkr/
+          push: true
+          tags: ghcr.io/apache/spark/apache-spark-github-action-image-sparkr-cache:${{ github.ref_name }}-static
+          cache-from: type=registry,ref=ghcr.io/apache/spark/apache-spark-github-action-image-sparkr-cache:${{ github.ref_name }}
+          cache-to: type=registry,ref=ghcr.io/apache/spark/apache-spark-github-action-image-sparkr-cache:${{ github.ref_name }},mode=max
+      - name: Image digest (SparkR)
+        if: hashFiles('dev/spark-test-image/sparkr/Dockerfile') != ''
+        run: echo ${{ steps.docker_build_sparkr.outputs.digest }}
diff --git a/.github/workflows/build_python_3.11_macos.yml b/.github/workflows/build_python_3.11_macos.yml
@@ -0,0 +1,32 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+name: "Build / Python-only (master, Python 3.11, MacOS)"
+
+on:
+  schedule:
+    - cron: '0 21 * * *'  # minute 0, hour 21, every day (GitHub evaluates schedules in UTC)
+
+jobs:
+  run-build:
+    permissions:
+      packages: write  # NOTE(review): confirm the called workflow really needs package write access
+    name: Run
+    uses: ./.github/workflows/python_macos_test.yml  # reusable workflow; no `with:` here, so its input defaults apply
+    if: github.repository == 'apache/spark'  # only run in the apache/spark repository itself
diff --git a/.github/workflows/build_python_connect.yml b/.github/workflows/build_python_connect.yml
@@ -93,7 +93,7 @@ jobs:
           # Several tests related to catalog need to run sequentially, e.g., writing a table in a listener.
           ./python/run-tests --parallelism=1 --python-executables=python3 --modules pyspark-connect,pyspark-ml-connect
           # None of the tests in Pandas API on Spark depend on each other, so they could run in parallel (parallelism is currently set to 1)
-          ./python/run-tests --parallelism=4 --python-executables=python3 --modules pyspark-pandas-connect-part0,pyspark-pandas-connect-part1,pyspark-pandas-connect-part2,pyspark-pandas-connect-part3
+          ./python/run-tests --parallelism=1 --python-executables=python3 --modules pyspark-pandas-connect-part0,pyspark-pandas-connect-part1,pyspark-pandas-connect-part2,pyspark-pandas-connect-part3
 
           # Stop Spark Connect server.
           ./sbin/stop-connect-server.sh

diff --git a/.github/workflows/build_python_connect35.yml b/.github/workflows/build_python_connect35.yml
@@ -98,7 +98,7 @@ jobs:
           # Run branch-3.5 tests
           ./python/run-tests --parallelism=1 --python-executables=python3 --modules pyspark-connect
           # None of the tests in Pandas API on Spark depend on each other, so they could run in parallel (parallelism is currently set to 1)
-          ./python/run-tests --parallelism=4 --python-executables=python3 --modules pyspark-pandas-connect,pyspark-pandas-slow-connect
+          ./python/run-tests --parallelism=1 --python-executables=python3 --modules pyspark-pandas-connect,pyspark-pandas-slow-connect
       - name: Upload test results to report
         if: always()
         uses: actions/upload-artifact@v4

diff --git a/.github/workflows/python_macos_test.yml b/.github/workflows/python_macos_test.yml
@@ -0,0 +1,162 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+name: Build and test PySpark on macOS
+
+on:
+  workflow_call:
+    inputs:
+      java:
+        required: false
+        type: string
+        default: '17'  # quoted: the input is declared as a string, a bare 17 is a YAML integer
+      python:
+        required: false
+        type: string
+        default: '3.11'  # quoted: a bare 3.11 is a YAML float (e.g. 3.10 would collapse to 3.1)
+      branch:
+        description: Branch to run the build against
+        required: false
+        type: string
+        default: master
+      hadoop:
+        description: Hadoop version to run with. HADOOP_PROFILE environment variable should accept it.
+        required: false
+        type: string
+        default: hadoop3
+      envs:
+        description: Additional environment variables to set when running the tests. Should be in JSON format.
+        required: false
+        type: string
+        default: '{}'
+jobs:
+  build:
+    name: "PySpark test on macos: ${{ matrix.modules }}"
+    runs-on: macos-15
+    strategy:
+      fail-fast: false  # keep the other module groups running when one of them fails
+      matrix:
+        java:
+          - ${{ inputs.java }}
+        python:
+          - ${{inputs.python}}
+        modules:  # one matrix job per entry; each entry is a comma-separated module list
+          - >-
+            pyspark-sql, pyspark-resource, pyspark-testing
+          - >-
+            pyspark-core, pyspark-errors, pyspark-streaming
+          - >-
+            pyspark-mllib, pyspark-ml, pyspark-ml-connect
+          - >-
+            pyspark-connect
+          - >-
+            pyspark-pandas
+          - >-
+            pyspark-pandas-slow
+          - >-
+            pyspark-pandas-connect-part0
+          - >-
+            pyspark-pandas-connect-part1
+          - >-
+            pyspark-pandas-connect-part2
+          - >-
+            pyspark-pandas-connect-part3
+    env:
+      MODULES_TO_TEST: ${{ matrix.modules }}
+      PYTHON_TO_TEST: python${{inputs.python}}
+      HADOOP_PROFILE: ${{ inputs.hadoop }}
+      HIVE_PROFILE: hive2.3
+      # GitHub Actions' default miniconda to use in pip packaging test. NOTE(review): confirm this path exists on the macos-15 runner.
+      CONDA_PREFIX: /usr/share/miniconda
+      GITHUB_PREV_SHA: ${{ github.event.before }}
+      SPARK_LOCAL_IP: localhost
+      SKIP_UNIDOC: true
+      SKIP_MIMA: true
+      SKIP_PACKAGING: true  # the "Run tests" step exports false when pyspark-errors is in the module list
+      METASPACE_SIZE: 1g
+      BRANCH: ${{ inputs.branch }}
+    steps:
+      - name: Checkout Spark repository
+        uses: actions/checkout@v4
+        # Full history (fetch-depth: 0) in order to fetch changed files
+        with:
+          fetch-depth: 0
+          repository: apache/spark  # always start from upstream; the next step merges in a fork's branch
+          ref: ${{ inputs.branch }}
+      - name: Sync the current branch with the latest in Apache Spark
+        if: github.repository != 'apache/spark'  # forks only: upstream is already what was checked out
+        run: |
+          echo "APACHE_SPARK_REF=$(git rev-parse HEAD)" >> "$GITHUB_ENV"
+          git fetch "https://github.com/$GITHUB_REPOSITORY.git" "${GITHUB_REF#refs/heads/}"
+          git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' merge --no-commit --progress --squash FETCH_HEAD
+          git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' commit -m "Merged commit" --allow-empty
+      # Cache local repositories (SBT/Maven launchers, Coursier). Note that GitHub Actions cache has a 10G limit.
+      - name: Cache SBT and Maven
+        uses: actions/cache@v4
+        with:
+          path: |
+            build/apache-maven-*
+            build/*.jar
+            ~/.sbt
+          key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }}
+          restore-keys: |
+            build-
+      - name: Cache Coursier local repository
+        uses: actions/cache@v4
+        with:
+          path: ~/.cache/coursier
+          key: pyspark-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }}
+          restore-keys: |
+            pyspark-coursier-
+      - name: Install Java ${{ matrix.java }}
+        uses: actions/setup-java@v4
+        with:
+          distribution: zulu
+          java-version: ${{ matrix.java }}
+      - name: Install Python packages (Python ${{matrix.python}})
+        run: |
+          python${{matrix.python}} -m pip install --ignore-installed 'blinker>=1.6.2'
+          python${{matrix.python}} -m pip install --ignore-installed 'six==1.16.0'
+          python${{matrix.python}} -m pip install numpy 'pyarrow>=15.0.0' 'six==1.16.0' 'pandas==2.2.3' scipy 'plotly>=4.8' 'mlflow>=2.8.1' coverage matplotlib openpyxl 'memory-profiler>=0.61.0' 'scikit-learn>=1.3.2' unittest-xml-reporting && \
+          python${{matrix.python}} -m pip install 'grpcio==1.67.0' 'grpcio-status==1.67.0' 'protobuf==5.28.3' 'googleapis-common-protos==1.65.0' 'graphviz==0.20.3' && \
+          python${{matrix.python}} -m pip cache purge && \
+          python${{matrix.python}} -m pip list
+      # Run the tests. Packaging tests are switched on only for the module group that contains pyspark-errors.
+      - name: Run tests
+        env: ${{ fromJSON(inputs.envs) }}
+        run: |
+          if [[ "$MODULES_TO_TEST" == *"pyspark-errors"* ]]; then
+            export SKIP_PACKAGING=false
+            echo "Python Packaging Tests Enabled!"
+          fi
+          ./dev/run-tests --parallelism 1 --modules "$MODULES_TO_TEST" --python-executables "$PYTHON_TO_TEST"
+      - name: Upload test results to report
+        env: ${{ fromJSON(inputs.envs) }}
+        if: always()  # upload the XML reports whether or not the tests passed
+        uses: actions/upload-artifact@v4
+        with:
+          name: test-results-${{ matrix.modules }}--${{ matrix.java }}-${{ inputs.hadoop }}-hive2.3-${{ env.PYTHON_TO_TEST }}
+          path: "**/target/test-reports/*.xml"
+      - name: Upload unit tests log files
+        env: ${{ fromJSON(inputs.envs) }}
+        if: ${{ !success() }}  # only when an earlier step did not succeed
+        uses: actions/upload-artifact@v4
+        with:
+          name: unit-tests-log-${{ matrix.modules }}--${{ matrix.java }}-${{ inputs.hadoop }}-hive2.3-${{ env.PYTHON_TO_TEST }}
+          path: "**/target/unit-tests.log"
diff --git a/assembly/README b/assembly/README
@@ -9,4 +9,4 @@ This module is off by default. To activate it specify the profile in the command
 
 If you need to build an assembly for a different version of Hadoop the
 hadoop-version system property needs to be set as in this example:
-  -Dhadoop.version=3.4.0
+  -Dhadoop.version=3.4.1