From be6f4d834baa5c84477a3eb278040c7f5198332e Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Mon, 1 Jul 2024 20:18:04 -0600 Subject: [PATCH] feat: Enable Spark SQL tests for Spark 3.5.1 (#603) * Enable Spark SQL tests for Spark 3.5.1 * fix conflicts * finish creating diff * improve docs, update slf4j version * improve docs * fix diff * fix diff * address feedback * add IgnoreComet to patch * update test * remove unused imports * ignore 2 failing tests * scalastyle * ignore failing parquet metadata tests * add links to tracking issue --- .github/workflows/spark_sql_test.yml | 3 +- dev/diffs/3.5.1.diff | 2815 +++++++++++++++++ .../contributor-guide/spark-sql-tests.md | 128 + docs/source/index.rst | 1 + pom.xml | 2 +- 5 files changed, 2947 insertions(+), 2 deletions(-) create mode 100644 dev/diffs/3.5.1.diff create mode 100644 docs/source/contributor-guide/spark-sql-tests.md diff --git a/.github/workflows/spark_sql_test.yml b/.github/workflows/spark_sql_test.yml index 352e0ecbe3..1cc6a1ff4c 100644 --- a/.github/workflows/spark_sql_test.yml +++ b/.github/workflows/spark_sql_test.yml @@ -45,7 +45,7 @@ jobs: matrix: os: [ubuntu-latest] java-version: [11] - spark-version: [{short: '3.4', full: '3.4.3'}] + spark-version: [{short: '3.4', full: '3.4.3'}, {short: '3.5', full: '3.5.1'}] module: - {name: "catalyst", args1: "catalyst/test", args2: ""} - {name: "sql/core-1", args1: "", args2: sql/testOnly * -- -l org.apache.spark.tags.ExtendedSQLTest -l org.apache.spark.tags.SlowSQLTest} @@ -75,6 +75,7 @@ jobs: - name: Run Spark tests run: | cd apache-spark + rm -rf /root/.m2/repository/org/apache/parquet # somehow parquet cache requires cleanups ENABLE_COMET=true build/sbt ${{ matrix.module.args1 }} "${{ matrix.module.args2 }}" env: LC_ALL: "C.UTF-8" diff --git a/dev/diffs/3.5.1.diff b/dev/diffs/3.5.1.diff new file mode 100644 index 0000000000..ebc4cdb8ca --- /dev/null +++ b/dev/diffs/3.5.1.diff @@ -0,0 +1,2815 @@ +diff --git a/pom.xml b/pom.xml +index 0f504dbee85..71fd49a3744 
100644 +--- a/pom.xml ++++ b/pom.xml +@@ -152,6 +152,8 @@ + --> + 2.5.1 + 2.0.8 ++ 3.5 ++ 0.1.0-SNAPSHOT + + +# Running Spark SQL Tests + +Running Apache Spark's SQL tests with Comet enabled is a good way to ensure that Comet produces the same +results as that version of Spark. To enable this, we apply some changes to the Apache Spark source code so that +Comet is enabled when we run the tests. + +Here is an overview of the changes that we need to make to Spark: + +- Update the pom.xml to add a dependency on Comet +- Modify SparkSession to load the Comet extension +- Modify TestHive to load Comet +- Modify SQLTestUtilsBase to load Comet when `ENABLE_COMET` environment variable exists + +Here are the steps involved in running the Spark SQL tests with Comet, using Spark 3.4.3 for this example. + +## 1. Install Comet + +Run `make release` in Comet to install the Comet JAR into the local Maven repository, specifying the Spark version. + +```shell +PROFILES="-Pspark-3.4" make release +``` + +## 2. Clone Spark and Apply Diff + +Clone Apache Spark locally and apply the diff file from Comet. + +```shell +git clone git@github.com:apache/spark.git apache-spark +cd apache-spark +git checkout v3.4.3 +git apply ../datafusion-comet/dev/diffs/3.4.3.diff +``` + +## 3. Run Spark SQL Tests + +Use the following commands to run the SQL tests locally. 
+ +```shell +ENABLE_COMET=true build/sbt catalyst/test +ENABLE_COMET=true build/sbt "sql/testOnly * -- -l org.apache.spark.tags.ExtendedSQLTest -l org.apache.spark.tags.SlowSQLTest" +ENABLE_COMET=true build/sbt "sql/testOnly * -- -n org.apache.spark.tags.ExtendedSQLTest" +ENABLE_COMET=true build/sbt "sql/testOnly * -- -n org.apache.spark.tags.SlowSQLTest" +ENABLE_COMET=true build/sbt "hive/testOnly * -- -l org.apache.spark.tags.ExtendedHiveTest -l org.apache.spark.tags.SlowHiveTest" +ENABLE_COMET=true build/sbt "hive/testOnly * -- -n org.apache.spark.tags.ExtendedHiveTest" +ENABLE_COMET=true build/sbt "hive/testOnly * -- -n org.apache.spark.tags.SlowHiveTest" +``` + +## Creating a diff file for a new Spark version + +Once Comet has support for a new Spark version, we need to create a diff file that can be applied to that version +of Apache Spark to enable Comet when running tests. This is a highly manual process and the process can +vary depending on the changes in the new version of Spark, but here is a general guide to the process. + +We typically start by applying a patch from a previous version of Spark. For example, when enabling the tests +for Spark version 3.5.1 we may start by applying the existing diff for 3.4.3 first. + +```shell +cd git/apache/spark +git checkout v3.5.1 +git apply --reject --whitespace=fix ../datafusion-comet/dev/diffs/3.4.3.diff +``` + +Any changes that cannot be cleanly applied will instead be written out to reject files. For example, the above +command generated the following files. + +```shell +find . 
-name "*.rej" +./pom.xml.rej +./sql/core/src/test/scala/org/apache/spark/sql/FileBasedDataSourceSuite.scala.rej +./sql/core/src/test/scala/org/apache/spark/sql/SubquerySuite.scala.rej +./sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingJoinSuite.scala.rej +./sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamSuite.scala.rej +./sql/core/src/test/scala/org/apache/spark/sql/JoinSuite.scala.rej +./sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/SchemaPruningSuite.scala.rej +./sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/binaryfile/BinaryFileFormatSuite.scala.rej +./sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetRebaseDatetimeSuite.scala.rej +./sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaSuite.scala.rej +./sql/core/src/test/scala/org/apache/spark/sql/execution/WholeStageCodegenSuite.scala.rej +./sql/core/src/test/scala/org/apache/spark/sql/execution/adaptive/AdaptiveQueryExecSuite.scala.rej +./sql/core/src/test/scala/org/apache/spark/sql/sources/CreateTableAsSelectSuite.scala.rej +./sql/core/src/test/scala/org/apache/spark/sql/sources/BucketedReadSuite.scala.rej +./sql/core/src/test/scala/org/apache/spark/sql/sources/DisableUnnecessaryBucketedScanSuite.scala.rej +./sql/core/src/test/scala/org/apache/spark/sql/errors/QueryExecutionErrorsSuite.scala.rej +./sql/core/src/test/scala/org/apache/spark/sql/CachedTableSuite.scala.rej +./sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala.rej +``` + +The changes in these reject files need to be applied manually. + +One method is to use the [wiggle](https://github.com/neilbrown/wiggle) command (`brew install wiggle` on Mac). 
+ +For example: + +```shell +wiggle --replace ./sql/core/src/test/scala/org/apache/spark/sql/SubquerySuite.scala ./sql/core/src/test/scala/org/apache/spark/sql/SubquerySuite.scala.rej +``` + +## Generating The Diff File + +```shell +git diff v3.5.1 > ../datafusion-comet/dev/diffs/3.5.1.diff +``` + +## Running Tests in CI + +The easiest way to run the tests is to create a PR against Comet and let CI run the tests. When working with a +new Spark version, the `spark_sql_test.yml` and `spark_sql_test_ansi.yml` files will need updating with the +new version. \ No newline at end of file diff --git a/docs/source/index.rst b/docs/source/index.rst index 1e9eba21f6..100145a97c 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -61,6 +61,7 @@ as a native runtime to achieve improvement in terms of query efficiency and quer Benchmarking Guide Adding a New Expression Profiling Native Code + Spark SQL Tests Github and Issue Tracker .. _toc.asf-links: diff --git a/pom.xml b/pom.xml index 981c762367..d8d1bab6f1 100644 --- a/pom.xml +++ b/pom.xml @@ -555,13 +555,13 @@ under the License. - spark-3.5 2.12.18 3.5.1 3.5 1.13.1 + 2.0.7 spark-3.5 not-needed not-needed