Skip to content

Commit

Permalink
Merge branch 'master' into python-new-timer
Browse files Browse the repository at this point in the history
  • Loading branch information
jingz-db authored Nov 27, 2024
2 parents 521722e + 6edcf43 commit f10348c
Show file tree
Hide file tree
Showing 543 changed files with 27,906 additions and 15,213 deletions.
25 changes: 24 additions & 1 deletion .github/workflows/build_and_test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,8 @@ jobs:
image_docs_url_link: ${{ steps.infra-image-link.outputs.image_docs_url_link }}
image_lint_url: ${{ steps.infra-image-lint-outputs.outputs.image_lint_url }}
image_lint_url_link: ${{ steps.infra-image-link.outputs.image_lint_url_link }}
image_sparkr_url: ${{ steps.infra-image-sparkr-outputs.outputs.image_sparkr_url }}
image_sparkr_url_link: ${{ steps.infra-image-link.outputs.image_sparkr_url_link }}
steps:
- name: Checkout Spark repository
uses: actions/checkout@v4
Expand Down Expand Up @@ -154,6 +156,14 @@ jobs:
IMG_NAME="apache-spark-ci-image-lint:${{ inputs.branch }}-${{ github.run_id }}"
IMG_URL="ghcr.io/$REPO_OWNER/$IMG_NAME"
echo "image_lint_url=$IMG_URL" >> $GITHUB_OUTPUT
- name: Generate infra image URL (SparkR)
id: infra-image-sparkr-outputs
run: |
# Convert to lowercase to meet Docker repo name requirement
REPO_OWNER=$(echo "${{ github.repository_owner }}" | tr '[:upper:]' '[:lower:]')
IMG_NAME="apache-spark-ci-image-sparkr:${{ inputs.branch }}-${{ github.run_id }}"
IMG_URL="ghcr.io/$REPO_OWNER/$IMG_NAME"
echo "image_sparkr_url=$IMG_URL" >> $GITHUB_OUTPUT
- name: Link the docker images
id: infra-image-link
run: |
Expand All @@ -162,9 +172,11 @@ jobs:
if [[ "${{ inputs.branch }}" == 'branch-3.5' ]]; then
echo "image_docs_url_link=${{ steps.infra-image-outputs.outputs.image_url }}" >> $GITHUB_OUTPUT
echo "image_lint_url_link=${{ steps.infra-image-outputs.outputs.image_url }}" >> $GITHUB_OUTPUT
echo "image_sparkr_url_link=${{ steps.infra-image-outputs.outputs.image_url }}" >> $GITHUB_OUTPUT
else
echo "image_docs_url_link=${{ steps.infra-image-docs-outputs.outputs.image_docs_url }}" >> $GITHUB_OUTPUT
echo "image_lint_url_link=${{ steps.infra-image-lint-outputs.outputs.image_lint_url }}" >> $GITHUB_OUTPUT
echo "image_sparkr_url_link=${{ steps.infra-image-sparkr-outputs.outputs.image_sparkr_url }}" >> $GITHUB_OUTPUT
fi
# Build: build Spark and run the tests for specified modules.
Expand Down Expand Up @@ -405,6 +417,17 @@ jobs:
${{ needs.precondition.outputs.image_lint_url }}
# Use the infra image cache to speed up
cache-from: type=registry,ref=ghcr.io/apache/spark/apache-spark-github-action-image-lint-cache:${{ inputs.branch }}
- name: Build and push (SparkR)
if: hashFiles('dev/spark-test-image/sparkr/Dockerfile') != ''
id: docker_build_sparkr
uses: docker/build-push-action@v6
with:
context: ./dev/spark-test-image/sparkr/
push: true
tags: |
${{ needs.precondition.outputs.image_sparkr_url }}
# Use the infra image cache to speed up
cache-from: type=registry,ref=ghcr.io/apache/spark/apache-spark-github-action-image-sparkr-cache:${{ inputs.branch }}


pyspark:
Expand Down Expand Up @@ -564,7 +587,7 @@ jobs:
runs-on: ubuntu-latest
timeout-minutes: 180
container:
image: ${{ needs.precondition.outputs.image_url }}
image: ${{ needs.precondition.outputs.image_sparkr_url_link }}
env:
HADOOP_PROFILE: ${{ inputs.hadoop }}
HIVE_PROFILE: hive2.3
Expand Down
14 changes: 14 additions & 0 deletions .github/workflows/build_infra_images_cache.yml
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ on:
- 'dev/infra/Dockerfile'
- 'dev/spark-test-image/docs/Dockerfile'
- 'dev/spark-test-image/lint/Dockerfile'
- 'dev/spark-test-image/sparkr/Dockerfile'
- '.github/workflows/build_infra_images_cache.yml'
# Create infra image when cutting down branches/tags
create:
Expand Down Expand Up @@ -88,3 +89,16 @@ jobs:
- name: Image digest (Linter)
if: hashFiles('dev/spark-test-image/lint/Dockerfile') != ''
run: echo ${{ steps.docker_build_lint.outputs.digest }}
- name: Build and push (SparkR)
if: hashFiles('dev/spark-test-image/sparkr/Dockerfile') != ''
id: docker_build_sparkr
uses: docker/build-push-action@v6
with:
context: ./dev/spark-test-image/sparkr/
push: true
tags: ghcr.io/apache/spark/apache-spark-github-action-image-sparkr-cache:${{ github.ref_name }}-static
cache-from: type=registry,ref=ghcr.io/apache/spark/apache-spark-github-action-image-sparkr-cache:${{ github.ref_name }}
cache-to: type=registry,ref=ghcr.io/apache/spark/apache-spark-github-action-image-sparkr-cache:${{ github.ref_name }},mode=max
- name: Image digest (SparkR)
if: hashFiles('dev/spark-test-image/sparkr/Dockerfile') != ''
run: echo ${{ steps.docker_build_sparkr.outputs.digest }}
32 changes: 32 additions & 0 deletions .github/workflows/build_python_3.11_macos.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#

# Scheduled entry point for the macOS PySpark build. It passes no `with:`
# inputs, so the reusable workflow runs with its own declared defaults.
name: "Build / Python-only (master, Python 3.11, MacOS)"

on:
  schedule:
    # Once a day at 21:00 (GitHub evaluates cron schedules in UTC).
    - cron: '0 21 * * *'

jobs:
  run-build:
    permissions:
      packages: write
    name: Run
    uses: ./.github/workflows/python_macos_test.yml
    # Restrict the scheduled run to the apache/spark repository.
    if: github.repository == 'apache/spark'
2 changes: 1 addition & 1 deletion .github/workflows/build_python_connect.yml
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,7 @@ jobs:
# Several tests related to catalog need to run sequentially, e.g., writing a table in a listener.
./python/run-tests --parallelism=1 --python-executables=python3 --modules pyspark-connect,pyspark-ml-connect
# None of the tests in Pandas API on Spark depend on each other, so they can safely run in parallel
./python/run-tests --parallelism=4 --python-executables=python3 --modules pyspark-pandas-connect-part0,pyspark-pandas-connect-part1,pyspark-pandas-connect-part2,pyspark-pandas-connect-part3
./python/run-tests --parallelism=1 --python-executables=python3 --modules pyspark-pandas-connect-part0,pyspark-pandas-connect-part1,pyspark-pandas-connect-part2,pyspark-pandas-connect-part3
# Stop Spark Connect server.
./sbin/stop-connect-server.sh
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/build_python_connect35.yml
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ jobs:
# Run branch-3.5 tests
./python/run-tests --parallelism=1 --python-executables=python3 --modules pyspark-connect
# None of the tests in Pandas API on Spark depend on each other, so they can safely run in parallel
./python/run-tests --parallelism=4 --python-executables=python3 --modules pyspark-pandas-connect,pyspark-pandas-slow-connect
./python/run-tests --parallelism=1 --python-executables=python3 --modules pyspark-pandas-connect,pyspark-pandas-slow-connect
- name: Upload test results to report
if: always()
uses: actions/upload-artifact@v4
Expand Down
162 changes: 162 additions & 0 deletions .github/workflows/python_macos_test.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,162 @@
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#

# Reusable workflow (invoked through `workflow_call`) that builds Spark and
# runs the PySpark test modules on a macOS runner.
name: Build and test PySpark on macOS

on:
  workflow_call:
    inputs:
      java:
        required: false
        type: string
        # Quoted so the default is a string, as `type: string` declares.
        default: '17'
      python:
        required: false
        type: string
        # Quoted: a bare 3.11 is a YAML float, and a future value such as
        # 3.10 would silently collapse to 3.1.
        default: '3.11'
      branch:
        description: Branch to run the build against
        required: false
        type: string
        default: master
      hadoop:
        description: Hadoop version to run with. HADOOP_PROFILE environment variable should accept it.
        required: false
        type: string
        default: hadoop3
      envs:
        description: Additional environment variables to set when running the tests. Should be in JSON format.
        required: false
        type: string
        default: '{}'
jobs:
  # One job per entry in `matrix.modules`; `java` and `python` each carry a
  # single value taken from the workflow inputs.
  build:
    name: "PySpark test on macos: ${{ matrix.modules }}"
    runs-on: macos-15
    strategy:
      # Let the remaining module groups finish even when one of them fails.
      fail-fast: false
      matrix:
        java:
          - ${{ inputs.java }}
        python:
          - ${{inputs.python}}
        # Each entry is a comma-separated module list handed to
        # ./dev/run-tests via MODULES_TO_TEST.
        modules:
          - >-
            pyspark-sql, pyspark-resource, pyspark-testing
          - >-
            pyspark-core, pyspark-errors, pyspark-streaming
          - >-
            pyspark-mllib, pyspark-ml, pyspark-ml-connect
          - >-
            pyspark-connect
          - >-
            pyspark-pandas
          - >-
            pyspark-pandas-slow
          - >-
            pyspark-pandas-connect-part0
          - >-
            pyspark-pandas-connect-part1
          - >-
            pyspark-pandas-connect-part2
          - >-
            pyspark-pandas-connect-part3
    env:
      MODULES_TO_TEST: ${{ matrix.modules }}
      PYTHON_TO_TEST: python${{inputs.python}}
      HADOOP_PROFILE: ${{ inputs.hadoop }}
      HIVE_PROFILE: hive2.3
      # GitHub Actions' default miniconda to use in pip packaging test.
      CONDA_PREFIX: /usr/share/miniconda
      GITHUB_PREV_SHA: ${{ github.event.before }}
      SPARK_LOCAL_IP: localhost
      SKIP_UNIDOC: true
      SKIP_MIMA: true
      # Default; the "Run tests" step flips this to false for the module
      # group that contains pyspark-errors.
      SKIP_PACKAGING: true
      METASPACE_SIZE: 1g
      BRANCH: ${{ inputs.branch }}
    steps:
      - name: Checkout Spark repository
        uses: actions/checkout@v4
        # In order to fetch changed files
        with:
          fetch-depth: 0
          repository: apache/spark
          ref: ${{ inputs.branch }}
      # Outside apache/spark: record the checked-out upstream HEAD as
      # APACHE_SPARK_REF, then squash-merge the ref that triggered the run
      # from the current repository on top of it.
      # NOTE(review): the user.email value below looks like an e-mail
      # obfuscation placeholder left by the page this was copied from;
      # confirm the intended address.
      - name: Sync the current branch with the latest in Apache Spark
        if: github.repository != 'apache/spark'
        run: |
          echo "APACHE_SPARK_REF=$(git rev-parse HEAD)" >> $GITHUB_ENV
          git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF#refs/heads/}
          git -c user.name='Apache Spark Test Account' -c user.email='[email protected]' merge --no-commit --progress --squash FETCH_HEAD
          git -c user.name='Apache Spark Test Account' -c user.email='[email protected]' commit -m "Merged commit" --allow-empty
      # Cache local repositories. Note that GitHub Actions cache has a 10G limit.
      - name: Cache SBT and Maven
        uses: actions/cache@v4
        with:
          path: |
            build/apache-maven-*
            build/*.jar
            ~/.sbt
          key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }}
          restore-keys: |
            build-
      - name: Cache Coursier local repository
        uses: actions/cache@v4
        with:
          path: ~/.cache/coursier
          key: pyspark-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }}
          restore-keys: |
            pyspark-coursier-
      - name: Install Java ${{ matrix.java }}
        uses: actions/setup-java@v4
        with:
          distribution: zulu
          java-version: ${{ matrix.java }}
      # blinker and six are force-reinstalled first (--ignore-installed);
      # the remaining installs are chained with && and end by purging the
      # pip cache and listing what was installed.
      - name: Install Python packages (Python ${{matrix.python}})
        run: |
          python${{matrix.python}} -m pip install --ignore-installed 'blinker>=1.6.2'
          python${{matrix.python}} -m pip install --ignore-installed 'six==1.16.0'
          python${{matrix.python}} -m pip install numpy 'pyarrow>=15.0.0' 'six==1.16.0' 'pandas==2.2.3' scipy 'plotly>=4.8' 'mlflow>=2.8.1' coverage matplotlib openpyxl 'memory-profiler>=0.61.0' 'scikit-learn>=1.3.2' unittest-xml-reporting && \
          python${{matrix.python}} -m pip install 'grpcio==1.67.0' 'grpcio-status==1.67.0' 'protobuf==5.28.3' 'googleapis-common-protos==1.65.0' 'graphviz==0.20.3' && \
          python${{matrix.python}} -m pip cache purge && \
          python${{matrix.python}} -m pip list
      # Run the tests.
      - name: Run tests
        # Caller-supplied JSON object expanded into step-level variables.
        env: ${{ fromJSON(inputs.envs) }}
        run: |
          if [[ "$MODULES_TO_TEST" == *"pyspark-errors"* ]]; then
            export SKIP_PACKAGING=false
            echo "Python Packaging Tests Enabled!"
          fi
          ./dev/run-tests --parallelism 1 --modules "$MODULES_TO_TEST" --python-executables "$PYTHON_TO_TEST"
      # Always publish the XML reports, whether the tests passed or failed.
      - name: Upload test results to report
        env: ${{ fromJSON(inputs.envs) }}
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: test-results-${{ matrix.modules }}--${{ matrix.java }}-${{ inputs.hadoop }}-hive2.3-${{ env.PYTHON_TO_TEST }}
          path: "**/target/test-reports/*.xml"
      # Keep the unit-test logs only when the job did not succeed.
      - name: Upload unit tests log files
        env: ${{ fromJSON(inputs.envs) }}
        if: ${{ !success() }}
        uses: actions/upload-artifact@v4
        with:
          name: unit-tests-log-${{ matrix.modules }}--${{ matrix.java }}-${{ inputs.hadoop }}-hive2.3-${{ env.PYTHON_TO_TEST }}
          path: "**/target/unit-tests.log"
2 changes: 1 addition & 1 deletion assembly/README
Original file line number Diff line number Diff line change
Expand Up @@ -9,4 +9,4 @@ This module is off by default. To activate it specify the profile in the command

If you need to build an assembly for a different version of Hadoop the
hadoop-version system property needs to be set as in this example:
-Dhadoop.version=3.4.0
-Dhadoop.version=3.4.1
Loading

0 comments on commit f10348c

Please sign in to comment.