From 81cc9de0dfb01600b7dcac7af2296493d7d85eec Mon Sep 17 00:00:00 2001
From: LantaoJin
Date: Thu, 10 Oct 2024 14:59:13 +0800
Subject: [PATCH] [SPARK-49919][SQL] Add special limits support for returning
 content as a JSON dataset

---
 .../spark/sql/execution/SparkStrategies.scala | 52 +++++++++++++------
 .../spark/sql/execution/PlannerSuite.scala    | 12 +++++
 2 files changed, 48 insertions(+), 16 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala
index 53c335c1eced6..103ece09f6425 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala
@@ -82,27 +82,47 @@ abstract class SparkStrategies extends QueryPlanner[SparkPlan] {
    * Plans special cases of limit operators.
    */
   object SpecialLimits extends Strategy {
     override def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match {
+      // Match the serialization pattern produced by Dataset.toJSON.
       // Call `planTakeOrdered` first which matches a larger plan.
-      case ReturnAnswer(rootPlan) => planTakeOrdered(rootPlan).getOrElse(rootPlan match {
-        // We should match the combination of limit and offset first, to get the optimal physical
-        // plan, instead of planning limit and offset separately.
-        case LimitAndOffset(limit, offset, child) =>
-          CollectLimitExec(limit = limit, child = planLater(child), offset = offset)
-        case OffsetAndLimit(offset, limit, child) =>
-          // 'Offset a' then 'Limit b' is the same as 'Limit a + b' then 'Offset a'.
-          CollectLimitExec(limit = offset + limit, child = planLater(child), offset = offset)
-        case Limit(IntegerLiteral(limit), child) =>
-          CollectLimitExec(limit = limit, child = planLater(child))
-        case logical.Offset(IntegerLiteral(offset), child) =>
-          CollectLimitExec(child = planLater(child), offset = offset)
-        case Tail(IntegerLiteral(limit), child) =>
-          CollectTailExec(limit, planLater(child))
-        case other => planLater(other)
-      }) :: Nil
+      case ReturnAnswer(
+          SerializeFromObject(
+            serializer,
+            MapPartitions(
+              f,
+              objAttr1,
+              DeserializeToObject(deserializer, objAttr2, child)))) =>
+        SerializeFromObjectExec(
+          serializer,
+          MapPartitionsExec(
+            f,
+            objAttr1,
+            DeserializeToObjectExec(
+              deserializer,
+              objAttr2,
+              planTakeOrdered(child).getOrElse(planCollectLimit(child))))) :: Nil
+      case ReturnAnswer(rootPlan) =>
+        planTakeOrdered(rootPlan).getOrElse(planCollectLimit(rootPlan)) :: Nil
       case other => planTakeOrdered(other).toSeq
     }
 
+    private def planCollectLimit(plan: LogicalPlan): SparkPlan = plan match {
+      // We should match the combination of limit and offset first, to get the optimal physical
+      // plan, instead of planning limit and offset separately.
+      case LimitAndOffset(limit, offset, child) =>
+        CollectLimitExec(limit = limit, child = planLater(child), offset = offset)
+      case OffsetAndLimit(offset, limit, child) =>
+        // 'Offset a' then 'Limit b' is the same as 'Limit a + b' then 'Offset a'.
+        CollectLimitExec(limit = offset + limit, child = planLater(child), offset = offset)
+      case Limit(IntegerLiteral(limit), child) =>
+        CollectLimitExec(limit = limit, child = planLater(child))
+      case logical.Offset(IntegerLiteral(offset), child) =>
+        CollectLimitExec(child = planLater(child), offset = offset)
+      case Tail(IntegerLiteral(limit), child) =>
+        CollectTailExec(limit, planLater(child))
+      case other => planLater(other)
+    }
+
     private def planTakeOrdered(plan: LogicalPlan): Option[SparkPlan] = plan match {
       // We should match the combination of limit and offset first, to get the optimal physical
       // plan, instead of planning limit and offset separately.
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala
index 1400ee25f4319..6584ed21be7d8 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala
@@ -262,6 +262,18 @@ class PlannerSuite extends SharedSparkSession with AdaptiveSparkPlanHelper {
     }
   }
 
+  test("SPARK-49919: CollectLimit can be applied with toJSON") {
+    val query = testData.select($"value").limit(2).toJSON
+    val planned = query.queryExecution.sparkPlan
+    assert(planned.exists(_.isInstanceOf[CollectLimitExec]))
+  }
+
+  test("SPARK-49919: TakeOrderedAndProject can be applied with toJSON") {
+    val query = testData.select($"key", $"value").sort($"key").limit(2).toJSON
+    val planned = query.queryExecution.sparkPlan
+    assert(planned.exists(_.isInstanceOf[execution.TakeOrderedAndProjectExec]))
+  }
+
   test("PartitioningCollection") {
     withTempView("normal", "small", "tiny") {
       testData.createOrReplaceTempView("normal")
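
For illustration, a minimal sketch (not part of the patch) of the behavior this change targets, assuming a Spark build that includes it and a local SparkSession; the object name ToJsonLimitSketch and the sample data are invented for the example. Dataset.toJSON wraps the logical plan in a DeserializeToObject/MapPartitions/SerializeFromObject chain, and the new `ReturnAnswer(SerializeFromObject(...))` case lets a limit underneath that chain still be planned as CollectLimitExec:

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.execution.CollectLimitExec

object ToJsonLimitSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .master("local[2]")
      .appName("toJSON-limit-sketch")
      .getOrCreate()
    import spark.implicits._

    val df = Seq((1, "a"), (2, "b"), (3, "c")).toDF("key", "value")
    // toJSON deserializes rows to objects, maps each partition to JSON strings,
    // and serializes back, so the limit sits below that chain in the plan.
    val planned = df.select($"value").limit(2).toJSON.queryExecution.sparkPlan
    // With this patch the limit is planned as CollectLimitExec; without it, the
    // strategy falls through to planLater and a regular local/global limit pair is used.
    println(planned.exists(_.isInstanceOf[CollectLimitExec]))

    spark.stop()
  }
}

The same reasoning applies to sort(...).limit(...).toJSON, where planTakeOrdered should produce a TakeOrderedAndProjectExec under the serialization nodes, as the second added test checks.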