Skip to content

Commit

Permalink
build: Add CI for TPCDS queries (#99)
Browse files: browse the repository at this point in the history
  • Loading branch information
viirya authored Feb 24, 2024
1 parent 51ec49b commit 38b0bfd
Show file tree
Hide file tree
Showing 3 changed files with 64 additions and 1 deletion.
61 changes: 61 additions & 0 deletions .github/workflows/pr_build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -131,3 +131,64 @@ jobs:
# Runs the repository-local java-test action, but only for matrix entries
# whose test-target is 'java'.
- name: Java test steps
  if: matrix.test-target == 'java'
  uses: ./.github/actions/java-test

tpcds-1g:
  # Builds Comet, generates the TPC-DS scale-factor-1 data set (or restores it
  # from the cache), then runs CometTPCDSQuerySuite once per join configuration.
  name: Run TPC-DS queries with SF=1
  runs-on: ubuntu-latest
  container:
    image: amd64/rust
  env:
    # Quoted so the value is an explicit string rather than a YAML integer.
    JAVA_VERSION: "11"
  steps:
    - uses: actions/checkout@v4

    - name: Setup Rust & Java toolchain
      uses: ./.github/actions/setup-builder
      with:
        # NOTE(review): RUST_VERSION is not defined inside this job; presumably
        # it comes from a workflow-level `env` block - confirm.
        rust-version: ${{env.RUST_VERSION}}
        jdk-version: 11

    # The key hashes this workflow file, so any edit to the workflow
    # invalidates the cached data and forces it to be regenerated.
    - name: Cache TPC-DS generated data
      id: cache-tpcds-sf-1
      uses: actions/cache@v4
      with:
        path: ./tpcds-sf-1
        key: tpcds-${{ hashFiles('.github/workflows/pr_build.yml') }}

    # tpcds-kit is only used to generate data, so it is skipped on a cache hit.
    - name: Checkout tpcds-kit repository
      if: steps.cache-tpcds-sf-1.outputs.cache-hit != 'true'
      uses: actions/checkout@v4
      with:
        repository: databricks/tpcds-kit
        path: ./tpcds-kit

    - name: Build Comet
      run: make release

    - name: Build tpcds-kit
      if: steps.cache-tpcds-sf-1.outputs.cache-hit != 'true'
      run: |
        # Refresh the package index first so the install does not depend on
        # whatever index state the image or earlier steps left behind.
        apt-get update
        apt-get install -y yacc bison flex
        cd tpcds-kit/tools && make OS=LINUX

    - name: Generate TPC-DS (SF=1) table data
      if: steps.cache-tpcds-sf-1.outputs.cache-hit != 'true'
      run: |
        # Capture the checkout root before changing into spark/ so the
        # generator receives absolute paths.
        workspace="$(pwd)"
        gen_args="--dsdgenDir ${workspace}/tpcds-kit/tools"
        gen_args="${gen_args} --location ${workspace}/tpcds-sf-1"
        gen_args="${gen_args} --scaleFactor 1 --numPartitions 1"
        cd spark
        # No trailing `cd ..` is needed: every `run` step starts a new shell.
        MAVEN_OPTS='-Xmx20g' ../mvnw exec:java \
          -Dexec.mainClass="org.apache.spark.sql.GenTPCDSData" \
          -Dexec.classpathScope="test" \
          -Dexec.cleanupDaemonThreads="false" \
          -Dexec.args="${gen_args}"

    # The three steps below run the same suite; only SPARK_TPCDS_JOIN_CONF
    # differs (presumably read by CometTPCDSQuerySuite - confirm).
    - name: Run TPC-DS queries (Sort merge join)
      run: |
        SPARK_HOME="$(pwd)" SPARK_TPCDS_DATA="$(pwd)/tpcds-sf-1" \
          ./mvnw -Dsuites=org.apache.spark.sql.CometTPCDSQuerySuite test
      env:
        SPARK_TPCDS_JOIN_CONF: |
          spark.sql.autoBroadcastJoinThreshold=-1
          spark.sql.join.preferSortMergeJoin=true

    - name: Run TPC-DS queries (Broadcast hash join)
      run: |
        SPARK_HOME="$(pwd)" SPARK_TPCDS_DATA="$(pwd)/tpcds-sf-1" \
          ./mvnw -Dsuites=org.apache.spark.sql.CometTPCDSQuerySuite test
      env:
        # 10485760 = 10 * 1024 * 1024.
        SPARK_TPCDS_JOIN_CONF: |
          spark.sql.autoBroadcastJoinThreshold=10485760

    - name: Run TPC-DS queries (Shuffled hash join)
      run: |
        SPARK_HOME="$(pwd)" SPARK_TPCDS_DATA="$(pwd)/tpcds-sf-1" \
          ./mvnw -Dsuites=org.apache.spark.sql.CometTPCDSQuerySuite test
      env:
        SPARK_TPCDS_JOIN_CONF: |
          spark.sql.autoBroadcastJoinThreshold=-1
          spark.sql.join.forceApplyShuffledHashJoin=true
1 change: 1 addition & 0 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -835,6 +835,7 @@ under the License.
<exclude>**/test/resources/**</exclude>
<exclude>**/benchmarks/*.txt</exclude>
<exclude>**/inspections/*.txt</exclude>
<exclude>tpcds-kit/**</exclude>
</excludes>
</configuration>
</plugin>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,8 @@ import org.apache.comet.CometConf
class CometTPCDSQuerySuite
extends {
// This is private in `TPCDSBase`.
val excludedTpcdsQueries: Seq[String] = Seq("q34", "q64")
val excludedTpcdsQueries: Seq[String] =
Seq("q34", "q66", "q64", "q71", "q88", "q90", "q96")

// This is private in `TPCDSBase` and `excludedTpcdsQueries` is private too.
// So we cannot override `excludedTpcdsQueries` to exclude the queries.
Expand Down

0 comments on commit 38b0bfd

Please sign in to comment.