[KYUUBI#5467] Integrate Intel Gluten with Spark engine
beryllw authored and wangjunbo committed Dec 1, 2023
1 parent 8f529aa commit 16d0ffb
Showing 19 changed files with 711 additions and 137 deletions.
122 changes: 122 additions & 0 deletions .github/workflows/gluten.yml
@@ -0,0 +1,122 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

name: Gluten CI

on:
  schedule:
    - cron: 0 4 * * *

env:
  MVN_OPT: -Dmaven.javadoc.skip=true -Drat.skip=true -Dscalastyle.skip=true -Dspotless.check.skip -Dorg.slf4j.simpleLogger.defaultLogLevel=warn -Pjdbc-shaded,gen-policy -Dmaven.plugin.download.cache.path=/tmp/engine-archives

jobs:
  gluten-build:
    name: Build Gluten
    runs-on: ubuntu-22.04
    steps:
      - uses: actions/checkout@v4
      - name: Tune Runner VM
        uses: ./.github/actions/tune-runner-vm
      - name: Update and Upgrade
        run: sudo apt-get update && sudo apt-get upgrade -y
      - name: Install dependencies
        run: |
          sudo apt-get install -y software-properties-common
          sudo apt-get install -y libunwind-dev build-essential cmake libssl-dev libre2-dev libcurl4-openssl-dev clang lldb lld libz-dev git ninja-build uuid-dev
      - name: Setup JDK 8
        uses: actions/setup-java@v3
        with:
          distribution: temurin
          java-version: 8
          cache: 'maven'
          check-latest: false
      - name: Setup Maven
        uses: ./.github/actions/setup-maven
      - name: Get gluten cache date
        id: date
        run: echo "date=$(date +'%Y-%m-%d')" >> $GITHUB_OUTPUT
      - name: Check gluten cache
        id: gluten-cache
        uses: actions/cache@v3
        with:
          path: gluten/package/target/
          key: gluten_package_${{ steps.date.outputs.date }}
      - name: Build gluten project
        run: |
          if [[ "${{ steps.gluten-cache.outputs.cache-hit }}" != 'true' ]]; then
            git clone https://github.com/oap-project/gluten.git
            cd gluten
            ./dev/buildbundle-veloxbe.sh
          fi
      - uses: actions/cache@v3
        if: steps.gluten-cache.outputs.cache-hit != 'true'
        with:
          path: gluten/package/target/
          key: gluten_package_${{ steps.date.outputs.date }}

  gluten-test:
    name: Gluten TPC-H/DS Test
    needs: gluten-build
    runs-on: ubuntu-22.04
    strategy:
      fail-fast: false
      matrix:
        module: [ "extensions/spark/kyuubi-spark-connector-tpcds", "extensions/spark/kyuubi-spark-connector-tpch" ]
    steps:
      - uses: actions/checkout@v4
      - name: Tune Runner VM
        uses: ./.github/actions/tune-runner-vm
      - name: Update and Upgrade
        run: sudo apt-get update && sudo apt-get upgrade -y
      - name: Install dependencies
        run: |
          sudo apt-get install -y software-properties-common
          sudo apt-get install -y libunwind-dev build-essential cmake libssl-dev libre2-dev libcurl4-openssl-dev clang lldb lld libz-dev git ninja-build uuid-dev
          sudo apt-get install -y libsnappy-dev libthrift-dev libboost-all-dev libgflags-dev libgoogle-glog-dev
      - name: Cache Engine Archives
        uses: ./.github/actions/cache-engine-archives
      - name: Get gluten cache date
        id: date
        run: echo "date=$(date +'%Y-%m-%d')" >> $GITHUB_OUTPUT
      - name: Check gluten cache
        id: gluten-cache
        uses: actions/cache@v3
        with:
          path: gluten/package/target/
          key: gluten_package_${{ steps.date.outputs.date }}
      - name: Cache Gluten Package
        uses: actions/cache@v3
        with:
          path: gluten/package/target/
          key: gluten_package
      - name: Setup JDK 8
        uses: actions/setup-java@v3
        with:
          distribution: temurin
          java-version: 8
          cache: 'maven'
          check-latest: false
      - name: Setup Maven
        uses: ./.github/actions/setup-maven
      - name: Run TPC-H/DS Test
        run: |
          TEST_MODULES=${{ matrix.module }}
          ./build/mvn ${MVN_OPT} -pl ${TEST_MODULES} -am clean install -DskipTests -Pgluten -Pspark-3.4
          ./build/mvn ${MVN_OPT} -pl ${TEST_MODULES} -am -Pgluten -Pspark-3.4 test \
            -Dmaven.plugin.scalatest.exclude.tags='' \
            -Dtest=none -Dmaven.plugin.scalatest.include.tags='org.apache.kyuubi.tags.GlutenTest'
8 changes: 4 additions & 4 deletions .github/workflows/master.yml
@@ -60,22 +60,22 @@ jobs:
          - java: 8
            spark: '3.4'
            spark-archive: '-Dspark.archive.mirror=https://archive.apache.org/dist/spark/spark-3.1.3 -Dspark.archive.name=spark-3.1.3-bin-hadoop3.2.tgz -Pzookeeper-3.6'
-           exclude-tags: '-Dmaven.plugin.scalatest.exclude.tags=org.scalatest.tags.Slow,org.apache.kyuubi.tags.DeltaTest,org.apache.kyuubi.tags.IcebergTest,org.apache.kyuubi.tags.DeltaTest,org.apache.kyuubi.tags.HudiTest,org.apache.kyuubi.tags.SparkLocalClusterTest'
+           exclude-tags: '-Dmaven.plugin.scalatest.exclude.tags=org.scalatest.tags.Slow,org.apache.kyuubi.tags.GlutenTest,org.apache.kyuubi.tags.DeltaTest,org.apache.kyuubi.tags.IcebergTest,org.apache.kyuubi.tags.DeltaTest,org.apache.kyuubi.tags.HudiTest,org.apache.kyuubi.tags.SparkLocalClusterTest'
            comment: 'verify-on-spark-3.1-binary'
          - java: 8
            spark: '3.4'
            spark-archive: '-Dspark.archive.mirror=https://archive.apache.org/dist/spark/spark-3.2.4 -Dspark.archive.name=spark-3.2.4-bin-hadoop3.2.tgz -Pzookeeper-3.6'
-           exclude-tags: '-Dmaven.plugin.scalatest.exclude.tags=org.scalatest.tags.Slow,org.apache.kyuubi.tags.DeltaTest,org.apache.kyuubi.tags.IcebergTest,org.apache.kyuubi.tags.DeltaTest,org.apache.kyuubi.tags.HudiTest,org.apache.kyuubi.tags.SparkLocalClusterTest'
+           exclude-tags: '-Dmaven.plugin.scalatest.exclude.tags=org.scalatest.tags.Slow,org.apache.kyuubi.tags.GlutenTest,org.apache.kyuubi.tags.DeltaTest,org.apache.kyuubi.tags.IcebergTest,org.apache.kyuubi.tags.DeltaTest,org.apache.kyuubi.tags.HudiTest,org.apache.kyuubi.tags.SparkLocalClusterTest'
            comment: 'verify-on-spark-3.2-binary'
          - java: 8
            spark: '3.4'
            spark-archive: '-Dspark.archive.mirror=https://archive.apache.org/dist/spark/spark-3.3.3 -Dspark.archive.name=spark-3.3.3-bin-hadoop3.tgz -Pzookeeper-3.6'
-           exclude-tags: '-Dmaven.plugin.scalatest.exclude.tags=org.scalatest.tags.Slow,org.apache.kyuubi.tags.DeltaTest,org.apache.kyuubi.tags.IcebergTest,org.apache.kyuubi.tags.DeltaTest,org.apache.kyuubi.tags.HudiTest,org.apache.kyuubi.tags.SparkLocalClusterTest'
+           exclude-tags: '-Dmaven.plugin.scalatest.exclude.tags=org.scalatest.tags.Slow,org.apache.kyuubi.tags.GlutenTest,org.apache.kyuubi.tags.DeltaTest,org.apache.kyuubi.tags.IcebergTest,org.apache.kyuubi.tags.DeltaTest,org.apache.kyuubi.tags.HudiTest,org.apache.kyuubi.tags.SparkLocalClusterTest'
            comment: 'verify-on-spark-3.3-binary'
          - java: 8
            spark: '3.4'
            spark-archive: '-Dspark.archive.mirror=https://archive.apache.org/dist/spark/spark-3.5.0 -Dspark.archive.name=spark-3.5.0-bin-hadoop3.tgz -Pzookeeper-3.6'
-           exclude-tags: '-Dmaven.plugin.scalatest.exclude.tags=org.scalatest.tags.Slow,org.apache.kyuubi.tags.DeltaTest,org.apache.kyuubi.tags.IcebergTest,org.apache.kyuubi.tags.PaimonTest,org.apache.kyuubi.tags.SparkLocalClusterTest'
+           exclude-tags: '-Dmaven.plugin.scalatest.exclude.tags=org.scalatest.tags.Slow,org.apache.kyuubi.tags.GlutenTest,org.apache.kyuubi.tags.DeltaTest,org.apache.kyuubi.tags.IcebergTest,org.apache.kyuubi.tags.PaimonTest,org.apache.kyuubi.tags.SparkLocalClusterTest'
            comment: 'verify-on-spark-3.5-binary'
        exclude:
          # SPARK-33772: Spark supports JDK 17 since 3.3.0
52 changes: 52 additions & 0 deletions docs/deployment/spark/gluten.md
@@ -0,0 +1,52 @@
<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-->
<!-- DO NOT MODIFY THIS FILE DIRECTLY, IT IS AUTO-GENERATED BY [org.apache.kyuubi.engine.spark.udf.KyuubiDefinedFunctionSuite] -->

# Gluten

Gluten is a Spark plugin developed by Intel that accelerates Apache Spark with native libraries. Currently, only CentOS 7/8 and Ubuntu 20.04/22.04 are supported, together with Spark 3.2/3.3/3.4. The following sections describe how to use Gluten with the Velox native backend.

## Building (with Velox Backend)

### Build the Gluten Velox backend package

Clone the Gluten project and run its build script `buildbundle-veloxbe.sh`; the resulting packages land in `/path/to/gluten/package/target/`:
```bash
git clone https://github.com/oap-project/gluten.git
cd gluten

# The script builds two jars for Spark 3.2.x, 3.3.x, and 3.4.x.
./dev/buildbundle-veloxbe.sh
```
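
After the build finishes, the bundle for your platform should appear under the package target directory (a quick check; the file name below assumes the Spark 3.4 / Ubuntu 22.04 build and varies by OS and Spark version):

```bash
# List the built Gluten bundle jars; this is the same directory the CI job caches.
ls /path/to/gluten/package/target/gluten-velox-bundle-spark3.4_2.12-*.jar
```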

## Usage

You can enable Gluten to accelerate Spark with the following steps.

### Installing

Add the Gluten bundle jar to Spark: either copy it into the Spark distribution (`cp /path/to/gluten/package/target/gluten-velox-bundle-spark3.x_2.12-*.jar $SPARK_HOME/jars/`) or specify it via the `spark.jars` configuration.
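
If you prefer not to copy the jar into the Spark distribution, here is a minimal sketch of the `spark.jars` alternative (the jar name assumes the Spark 3.4 / Ubuntu 22.04 bundle; adjust it to your build):

```bash
# Ship the Gluten bundle per application instead of installing it into $SPARK_HOME/jars.
# The plugin settings from the Configure section below are still required.
$SPARK_HOME/bin/spark-shell \
  --conf spark.jars=/path/to/gluten/package/target/gluten-velox-bundle-spark3.4_2.12-ubuntu_22.04-1.1.0-SNAPSHOT.jar
```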

### Configure

Add the following configuration to `spark-defaults.conf`:
```properties
spark.plugins=io.glutenproject.GlutenPlugin
spark.memory.offHeap.size=20g
spark.memory.offHeap.enabled=true
spark.shuffle.manager=org.apache.spark.shuffle.sort.ColumnarShuffleManager
```
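
To confirm that Gluten is active, you can inspect a physical plan; offloaded stages are expected to show Gluten/Velox operators rather than the vanilla Spark ones (a rough sanity check, assuming `spark-sql` picks up the `spark-defaults.conf` above; exact operator names depend on the Gluten version):

```bash
# If the plugin loaded, the EXPLAIN output should contain native (Velox) operators.
$SPARK_HOME/bin/spark-sql -e "EXPLAIN SELECT count(*) FROM range(10)"
```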
25 changes: 25 additions & 0 deletions extensions/spark/kyuubi-spark-connector-tpcds/pom.xml
@@ -213,4 +213,29 @@
        <outputDirectory>target/scala-${scala.binary.version}/classes</outputDirectory>
        <testOutputDirectory>target/scala-${scala.binary.version}/test-classes</testOutputDirectory>
    </build>

    <profiles>
        <profile>
            <id>gluten</id>
            <properties>
                <maven.plugin.scalatest.include.tags>org.apache.kyuubi.tags.GlutenTest</maven.plugin.scalatest.include.tags>
                <spark.version>3.4.1</spark.version>
                <spark.binary.version>3.4</spark.binary.version>
            </properties>
            <dependencies>
                <dependency>
                    <groupId>io.glutenproject</groupId>
                    <artifactId>gluten-velox-bundle-spark3.4_2.12-ubuntu_22.04</artifactId>
                    <version>1.1.0-SNAPSHOT</version>
                    <scope>system</scope>
                    <systemPath>${project.basedir}/../../../gluten/package/target/gluten-velox-bundle-spark3.4_2.12-ubuntu_22.04-1.1.0-SNAPSHOT.jar</systemPath>
                </dependency>
                <dependency>
                    <groupId>org.apache.spark</groupId>
                    <artifactId>spark-hive_${scala.binary.version}</artifactId>
                    <scope>test</scope>
                </dependency>
            </dependencies>
        </profile>
    </profiles>
</project>
@@ -0,0 +1,146 @@
-- Licensed to the Apache Software Foundation (ASF) under one or more
-- contributor license agreements. See the NOTICE file distributed with
-- this work for additional information regarding copyright ownership.
-- The ASF licenses this file to You under the Apache License, Version 2.0
-- (the "License"); you may not use this file except in compliance with
-- the License. You may obtain a copy of the License at
--
-- http://www.apache.org/licenses/LICENSE-2.0
--
-- Unless required by applicable law or agreed to in writing, software
-- distributed under the License is distributed on an "AS IS" BASIS,
-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-- See the License for the specific language governing permissions and
-- limitations under the License.
--

CREATE DATABASE IF NOT EXISTS spark_catalog.tpcds_tiny;

USE spark_catalog.tpcds_tiny;

--
-- Name: catalog_sales; Type: TABLE; Tablespace:
--
CREATE TABLE IF NOT EXISTS catalog_sales USING parquet PARTITIONED BY (cs_sold_date_sk)
AS SELECT * FROM tpcds.tiny.catalog_sales;

--
-- Name: catalog_returns; Type: TABLE; Tablespace:
--
CREATE TABLE IF NOT EXISTS catalog_returns USING parquet PARTITIONED BY (cr_returned_date_sk)
AS SELECT * FROM tpcds.tiny.catalog_returns;

--
-- Name: inventory; Type: TABLE; Tablespace:
--
CREATE TABLE IF NOT EXISTS inventory USING parquet PARTITIONED BY (inv_date_sk)
AS SELECT * FROM tpcds.tiny.inventory;

--
-- Name: store_sales; Type: TABLE; Tablespace:
--
CREATE TABLE IF NOT EXISTS store_sales USING parquet PARTITIONED BY (ss_sold_date_sk)
AS SELECT * FROM tpcds.tiny.store_sales;

--
-- Name: store_returns; Type: TABLE; Tablespace:
--
CREATE TABLE IF NOT EXISTS store_returns USING parquet PARTITIONED BY (sr_returned_date_sk)
AS SELECT * FROM tpcds.tiny.store_returns;

--
-- Name: web_sales; Type: TABLE; Tablespace:
--
CREATE TABLE IF NOT EXISTS web_sales USING parquet PARTITIONED BY (ws_sold_date_sk)
AS SELECT * FROM tpcds.tiny.web_sales;

--
-- Name: web_returns; Type: TABLE; Tablespace:
--
CREATE TABLE IF NOT EXISTS web_returns USING parquet PARTITIONED BY (wr_returned_date_sk)
AS SELECT * FROM tpcds.tiny.web_returns;

--
-- Name: call_center; Type: TABLE; Tablespace:
--
CREATE TABLE IF NOT EXISTS call_center USING parquet AS SELECT * FROM tpcds.tiny.call_center;

--
-- Name: catalog_page; Type: TABLE; Tablespace:
--
CREATE TABLE IF NOT EXISTS catalog_page USING parquet AS SELECT * FROM tpcds.tiny.catalog_page;

--
-- Name: customer; Type: TABLE; Tablespace:
--
CREATE TABLE IF NOT EXISTS customer USING parquet AS SELECT * FROM tpcds.tiny.customer;

--
-- Name: customer_address; Type: TABLE; Tablespace:
--
CREATE TABLE IF NOT EXISTS customer_address USING parquet AS SELECT * FROM tpcds.tiny.customer_address;

--
-- Name: customer_demographics; Type: TABLE; Tablespace:
--
CREATE TABLE IF NOT EXISTS customer_demographics USING parquet AS SELECT * FROM tpcds.tiny.customer_demographics;

--
-- Name: date_dim; Type: TABLE; Tablespace:
--
CREATE TABLE IF NOT EXISTS date_dim USING parquet AS SELECT * FROM tpcds.tiny.date_dim;

--
-- Name: household_demographics; Type: TABLE; Tablespace:
--
CREATE TABLE IF NOT EXISTS household_demographics USING parquet AS SELECT * FROM tpcds.tiny.household_demographics;

--
-- Name: income_band; Type: TABLE; Tablespace:
--
CREATE TABLE IF NOT EXISTS income_band USING parquet AS SELECT * FROM tpcds.tiny.income_band;

--
-- Name: item; Type: TABLE; Tablespace:
--
CREATE TABLE IF NOT EXISTS item USING parquet AS SELECT * FROM tpcds.tiny.item;

--
-- Name: promotion; Type: TABLE; Tablespace:
--
CREATE TABLE IF NOT EXISTS promotion USING parquet AS SELECT * FROM tpcds.tiny.promotion;

--
-- Name: reason; Type: TABLE; Tablespace:
--
CREATE TABLE IF NOT EXISTS reason USING parquet AS SELECT * FROM tpcds.tiny.reason;

--
-- Name: ship_mode; Type: TABLE; Tablespace:
--
CREATE TABLE IF NOT EXISTS ship_mode USING parquet AS SELECT * FROM tpcds.tiny.ship_mode;

--
-- Name: store; Type: TABLE; Tablespace:
--
CREATE TABLE IF NOT EXISTS store USING parquet AS SELECT * FROM tpcds.tiny.store;

--
-- Name: time_dim; Type: TABLE; Tablespace:
--
CREATE TABLE IF NOT EXISTS time_dim USING parquet AS SELECT * FROM tpcds.tiny.time_dim;

--
-- Name: warehouse; Type: TABLE; Tablespace:
--
CREATE TABLE IF NOT EXISTS warehouse USING parquet AS SELECT * FROM tpcds.tiny.warehouse;

--
-- Name: web_page; Type: TABLE; Tablespace:
--
CREATE TABLE IF NOT EXISTS web_page USING parquet AS SELECT * FROM tpcds.tiny.web_page;

--
-- Name: web_site; Type: TABLE; Tablespace:
--
CREATE TABLE IF NOT EXISTS web_site USING parquet AS SELECT * FROM tpcds.tiny.web_site;