From abe82529f90b8457256101b0ad110efced190cdb Mon Sep 17 00:00:00 2001
From: Liang-Chi Hsieh
Date: Wed, 21 Feb 2024 20:27:15 -0800
Subject: [PATCH] feat: Add struct/map as unsupported map key/value for columnar shuffle

---
 core/src/execution/shuffle/row.rs             |  6 +-
 .../apache/comet/serde/QueryPlanSerde.scala   |  5 ++
 .../apache/comet/exec/CometShuffleSuite.scala | 85 +++++++++++++++++++
 3 files changed, 93 insertions(+), 3 deletions(-)

diff --git a/core/src/execution/shuffle/row.rs b/core/src/execution/shuffle/row.rs
index 36a7b2424..d3efc16dc 100644
--- a/core/src/execution/shuffle/row.rs
+++ b/core/src/execution/shuffle/row.rs
@@ -1758,7 +1758,7 @@ pub(crate) fn append_columns(
                 }
                 _ => {
                     return Err(CometError::Internal(format!(
-                        "Unsupported type: {:?}",
+                        "Unsupported map type: {:?}",
                         field.data_type()
                     )))
                 }
@@ -3182,7 +3182,7 @@ fn make_builders(
 
                 _ => {
                     return Err(CometError::Internal(format!(
-                        "Unsupported type: {:?}",
+                        "Unsupported map type: {:?}",
                         field.data_type()
                     )))
                 }
@@ -3255,7 +3255,7 @@ fn make_builders(
                 }
                 _ => {
                     return Err(CometError::Internal(format!(
-                        "Unsupported type: {:?}",
+                        "Unsupported list type: {:?}",
                         field.data_type()
                     )))
                 }
diff --git a/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala b/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala
index 0a251d448..a365e7543 100644
--- a/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala
+++ b/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala
@@ -1830,10 +1830,15 @@ object QueryPlanSerde extends Logging with ShimQueryPlanSerde {
       case StructType(fields) =>
         fields.forall(f => supportedDataType(f.dataType))
       case ArrayType(ArrayType(_, _), _) => false // TODO: nested array is not supported
+      case ArrayType(MapType(_, _, _), _) => false // TODO: map array element is not supported
       case ArrayType(elementType, _) =>
         supportedDataType(elementType)
       case MapType(MapType(_, _, _), _, _) => false // TODO: nested map is not supported
       case MapType(_, 
MapType(_, _, _), _) => false
+      case MapType(StructType(_), _, _) => false // TODO: struct map key/value is not supported
+      case MapType(_, StructType(_), _) => false
+      case MapType(ArrayType(_, _), _, _) => false // TODO: array map key/value is not supported
+      case MapType(_, ArrayType(_, _), _) => false
       case MapType(keyType, valueType, _) =>
         supportedDataType(keyType) && supportedDataType(valueType)
       case _ =>
diff --git a/spark/src/test/scala/org/apache/comet/exec/CometShuffleSuite.scala b/spark/src/test/scala/org/apache/comet/exec/CometShuffleSuite.scala
index ac5664911..beb6dc860 100644
--- a/spark/src/test/scala/org/apache/comet/exec/CometShuffleSuite.scala
+++ b/spark/src/test/scala/org/apache/comet/exec/CometShuffleSuite.scala
@@ -112,6 +112,91 @@ abstract class CometShuffleSuiteBase extends CometTestBase with AdaptiveSparkPla
     }
   }
 
+  test("columnar shuffle on array/struct map key/value") {
+    Seq("false", "true").foreach { execEnabled =>
+      Seq(10, 201).foreach { numPartitions =>
+        Seq("1.0", "10.0").foreach { ratio =>
+          withSQLConf(
+            CometConf.COMET_EXEC_ENABLED.key -> execEnabled,
+            CometConf.COMET_COLUMNAR_SHUFFLE_ENABLED.key -> "true",
+            CometConf.COMET_EXEC_SHUFFLE_ENABLED.key -> "true",
+            CometConf.COMET_SHUFFLE_PREFER_DICTIONARY_RATIO.key -> ratio) {
+            withParquetTable((0 until 50).map(i => (Map(Seq(i, i + 1) -> i), i + 1)), "tbl") {
+              val df = sql("SELECT * FROM tbl")
+                .filter($"_2" > 10)
+                .repartition(numPartitions, $"_1", $"_2")
+                .sortWithinPartitions($"_2")
+
+              checkSparkAnswer(df)
+              // Array map key fallback to Spark shuffle for now
+              checkCometExchange(df, 0, false)
+            }
+
+            withParquetTable((0 until 50).map(i => (Map(i -> Seq(i, i + 1)), i + 1)), "tbl") {
+              val df = sql("SELECT * FROM tbl")
+                .filter($"_2" > 10)
+                .repartition(numPartitions, $"_1", $"_2")
+                .sortWithinPartitions($"_2")
+
+              checkSparkAnswer(df)
+              // Array map value fallback to Spark shuffle for now
+              checkCometExchange(df, 0, false)
+            }
+
+            
withParquetTable((0 until 50).map(i => (Map((i, i.toString) -> i), i + 1)), "tbl") {
+              val df = sql("SELECT * FROM tbl")
+                .filter($"_2" > 10)
+                .repartition(numPartitions, $"_1", $"_2")
+                .sortWithinPartitions($"_2")
+
+              checkSparkAnswer(df)
+              // Struct map key fallback to Spark shuffle for now
+              checkCometExchange(df, 0, false)
+            }
+
+            withParquetTable((0 until 50).map(i => (Map(i -> (i, i.toString)), i + 1)), "tbl") {
+              val df = sql("SELECT * FROM tbl")
+                .filter($"_2" > 10)
+                .repartition(numPartitions, $"_1", $"_2")
+                .sortWithinPartitions($"_2")
+
+              checkSparkAnswer(df)
+              // Struct map value fallback to Spark shuffle for now
+              checkCometExchange(df, 0, false)
+            }
+          }
+        }
+      }
+    }
+  }
+
+  test("columnar shuffle on map array element") {
+    Seq("false", "true").foreach { execEnabled =>
+      Seq(10, 201).foreach { numPartitions =>
+        Seq("1.0", "10.0").foreach { ratio =>
+          withSQLConf(
+            CometConf.COMET_EXEC_ENABLED.key -> execEnabled,
+            CometConf.COMET_COLUMNAR_SHUFFLE_ENABLED.key -> "true",
+            CometConf.COMET_EXEC_SHUFFLE_ENABLED.key -> "true",
+            CometConf.COMET_SHUFFLE_PREFER_DICTIONARY_RATIO.key -> ratio) {
+            withParquetTable(
+              (0 until 50).map(i => ((Seq(Map(1 -> i)), Map(2 -> i), Map(3 -> i)), i + 1)),
+              "tbl") {
+              val df = sql("SELECT * FROM tbl")
+                .filter($"_2" > 10)
+                .repartition(numPartitions, $"_1", $"_2")
+                .sortWithinPartitions($"_2")
+
+              checkSparkAnswer(df)
+              // Map array element fallback to Spark shuffle for now
+              checkCometExchange(df, 0, false)
+            }
+          }
+        }
+      }
+    }
+  }
+
   test("RoundRobinPartitioning is supported by columnar shuffle") {
     withSQLConf(
       // AQE has `ShuffleStage` which is a leaf node which blocks