Backport the native implementation of last join for Spark 3.2.1 #28

tobegit3hub · 2022-10-11T09:39:00Z

There are some patches that need to be back-ported for Spark 3.2.1. Here is the git diff file.

@@ -254,7 +257,7 @@ case class BroadcastHashJoinExec(
   /**
    * Generates the code for left or right outer join.
    */
-  private def codegenOuter(ctx: CodegenContext, input: Seq[ExprCode]): String = {
+  private def codegenOuter(ctx: CodegenContext, input: Seq[ExprCode], isLastJoin: Boolean = false): String = {
     val (broadcastRelation, relationTerm) = prepareBroadcast(ctx)
     val (keyEv, anyNull) = genStreamSideJoinKey(ctx, input)
     val matched = ctx.freshName("matched")
@@ -306,24 +309,48 @@ case class BroadcastHashJoinExec(
       val matches = ctx.freshName("matches")
       val iteratorCls = classOf[Iterator[UnsafeRow]].getName
       val found = ctx.freshName("found")
-      s"""
-         |// generate join key for stream side
-         |${keyEv.code}
-         |// find matches from HashRelation
-         |$iteratorCls $matches = $anyNull ? null : ($iteratorCls)$relationTerm.get(${keyEv.value});
-         |boolean $found = false;
-         |// the last iteration of this loop is to emit an empty row if there is no matched rows.
-         |while ($matches != null && $matches.hasNext() || !$found) {
-         |  UnsafeRow $matched = $matches != null && $matches.hasNext() ?
-         |    (UnsafeRow) $matches.next() : null;
-         |  ${checkCondition.trim}
-         |  if ($conditionPassed) {
-         |    $found = true;
-         |    $numOutput.add(1);
-         |    ${consume(ctx, resultVars)}
-         |  }
-         |}
+
+      // Add by 4Paradigm
+      if (isLastJoin) {
+        s"""
+           |// generate join key for stream side
+           |${keyEv.code}
+           |// find matches from HashRelation
+           |$iteratorCls $matches = $anyNull ? null : ($iteratorCls)$relationTerm.get(${keyEv.value});
+           |boolean $found = false;
+           |// the last iteration of this loop is to emit an empty row if there is no matched rows.
+           |if ($matches != null && $matches.hasNext() || !$found) {
+           |  UnsafeRow $matched = $matches != null && $matches.hasNext() ?
+           |    (UnsafeRow) $matches.next() : null;
+           |  ${checkCondition.trim}
+           |  if ($conditionPassed) {
+           |    $found = true;
+           |    $numOutput.add(1);
+           |    ${consume(ctx, resultVars)}
+           |  }
+           |}
        """.stripMargin
+      } else {
+        s"""
+           |// generate join key for stream side
+           |${keyEv.code}
+           |// find matches from HashRelation
+           |$iteratorCls $matches = $anyNull ? null : ($iteratorCls)$relationTerm.get(${keyEv.value});
+           |boolean $found = false;
+           |// the last iteration of this loop is to emit an empty row if there is no matched rows.
+           |while ($matches != null && $matches.hasNext() || !$found) {
+           |  UnsafeRow $matched = $matches != null && $matches.hasNext() ?
+           |    (UnsafeRow) $matches.next() : null;
+           |  ${checkCondition.trim}
+           |  if ($conditionPassed) {
+           |    $found = true;
+           |    $numOutput.add(1);
+           |    ${consume(ctx, resultVars)}
+           |  }
+           |}
+       """.stripMargin
+      }
+
     }
   }
 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashJoin.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashJoin.scala
index 305741e298..d4ea4e58c3 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashJoin.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashJoin.scala
@@ -61,6 +61,9 @@ trait HashJoin {
         left.output ++ right.output
       case LeftOuter =>
         left.output ++ right.output.map(_.withNullability(true))
+      // Add by 4Paradigm
+      case LastJoinType =>
+        left.output ++ right.output.map(_.withNullability(true))
       case RightOuter =>
         left.output.map(_.withNullability(true)) ++ right.output
       case j: ExistenceJoin =>
@@ -132,7 +135,8 @@ trait HashJoin {
 
   private def outerJoin(
       streamedIter: Iterator[InternalRow],
-    hashedRelation: HashedRelation): Iterator[InternalRow] = {
+    hashedRelation: HashedRelation,
+    isLastJoin: Boolean = false): Iterator[InternalRow] = {
     val joinedRow = new JoinedRow()
     val keyGenerator = streamSideKeyGenerator()
     val nullRow = new GenericInternalRow(buildPlan.output.length)
@@ -144,13 +148,31 @@ trait HashJoin {
       new RowIterator {
         private var found = false
         override def advanceNext(): Boolean = {
-          while (buildIter != null && buildIter.hasNext) {
-            val nextBuildRow = buildIter.next()
-            if (boundCondition(joinedRow.withRight(nextBuildRow))) {
-              found = true
-              return true
+
+          // Add by 4Paradigm to support last join
+          if (isLastJoin && found) {
+            return false
+          }
+
+          // Add by 4Paradigm to support last join
+          if (isLastJoin) {
+            if (buildIter != null && buildIter.hasNext) {
+              val nextBuildRow = buildIter.next()
+              if (boundCondition(joinedRow.withRight(nextBuildRow))) {
+                found = true
+                return true
+              }
+            }
+          } else {
+            while (buildIter != null && buildIter.hasNext) {
+              val nextBuildRow = buildIter.next()
+              if (boundCondition(joinedRow.withRight(nextBuildRow))) {
+                found = true
+                return true
+              }
             }
           }
+
           if (!found) {
             joinedRow.withRight(nullRow)
             found = true
@@ -218,6 +240,9 @@ trait HashJoin {
         innerJoin(streamedIter, hashed)
       case LeftOuter | RightOuter =>
         outerJoin(streamedIter, hashed)
+      case LastJoinType =>
+        // Add by 4Paradigm
+        outerJoin(streamedIter, hashed, true)
       case LeftSemi =>
         semiJoin(streamedIter, hashed)
       case LeftAnti =>
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/SortMergeJoinExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/SortMergeJoinExec.scala
index 5b5904f157..b3cc91cab1 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/SortMergeJoinExec.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/SortMergeJoinExec.scala
@@ -75,6 +75,9 @@ case class SortMergeJoinExec(
         left.output ++ right.output
       case LeftOuter =>
         left.output ++ right.output.map(_.withNullability(true))
+      // Add by 4Paradigm
+      case LastJoinType =>
+        left.output ++ right.output.map(_.withNullability(true))
       case RightOuter =>
         left.output.map(_.withNullability(true)) ++ right.output
       case FullOuter =>
@@ -94,6 +97,8 @@ case class SortMergeJoinExec(
       PartitioningCollection(Seq(left.outputPartitioning, right.outputPartitioning))
     // For left and right outer joins, the output is partitioned by the streamed input's join keys.
     case LeftOuter => left.outputPartitioning
+    // Add by 4Paradigm
+    case LastJoinType => left.outputPartitioning
     case RightOuter => right.outputPartitioning
     case FullOuter => UnknownPartitioning(left.outputPartitioning.numPartitions)
     case LeftExistence(_) => left.outputPartitioning
@@ -124,6 +129,8 @@ case class SortMergeJoinExec(
       }
     // For left and right outer joins, the output is ordered by the streamed input's join keys.
     case LeftOuter => getKeyOrdering(leftKeys, left.outputOrdering)
+    // Add by 4Paradigm
+    case LastJoinType => getKeyOrdering(leftKeys, left.outputOrdering)
     case RightOuter => getKeyOrdering(rightKeys, right.outputOrdering)
     // There are null rows in both streams, so there is no order.
     case FullOuter => Nil
@@ -257,6 +264,23 @@ case class SortMergeJoinExec(
           new LeftOuterIterator(
             smjScanner, rightNullRow, boundCondition, resultProj, numOutputRows).toScala
 
+        // Add by 4Paradigm
+        case LastJoinType =>
+          val smjScanner = new SortMergeJoinScanner(
+            streamedKeyGenerator = createLeftKeyGenerator(),
+            bufferedKeyGenerator = createRightKeyGenerator(),
+            keyOrdering,
+            streamedIter = RowIterator.fromScala(leftIter),
+            bufferedIter = RowIterator.fromScala(rightIter),
+            inMemoryThreshold,
+            spillThreshold,
+            cleanupResources,
+            true
+          )
+          val rightNullRow = new GenericInternalRow(right.output.length)
+          new LeftOuterIterator(
+            smjScanner, rightNullRow, boundCondition, resultProj, numOutputRows).toScala
+
         case RightOuter =>
           val smjScanner = new SortMergeJoinScanner(
             streamedKeyGenerator = createRightKeyGenerator(),
@@ -712,7 +736,8 @@ private[joins] class SortMergeJoinScanner(
     bufferedIter: RowIterator,
     inMemoryThreshold: Int,
     spillThreshold: Int,
-    eagerCleanupResources: () => Unit) {
+    eagerCleanupResources: () => Unit,
+    isLastJoin: Boolean = false) {
   private[this] var streamedRow: InternalRow = _
   private[this] var streamedRowKey: InternalRow = _
   private[this] var bufferedRow: InternalRow = _
@@ -883,10 +908,18 @@ private[joins] class SortMergeJoinScanner(
     // This join key may have been produced by a mutable projection, so we need to make a copy:
     matchJoinKey = streamedRowKey.copy()
     bufferedMatches.clear()
-    do {
+
+    // Add by 4Paradigm
+    if (isLastJoin) {
       bufferedMatches.add(bufferedRow.asInstanceOf[UnsafeRow])
       advancedBufferedToRowWithNullFreeJoinKey()
-    } while (bufferedRow != null && keyOrdering.compare(streamedRowKey, bufferedRowKey) == 0)
+    } else {
+      do {
+        bufferedMatches.add(bufferedRow.asInstanceOf[UnsafeRow])
+        advancedBufferedToRowWithNullFreeJoinKey()
+      } while (bufferedRow != null && keyOrdering.compare(streamedRowKey, bufferedRowKey) == 0)
+    }
+
   }
 }

The text was updated successfully, but these errors were encountered:

tobegit3hub self-assigned this Oct 11, 2022

This was referenced Oct 11, 2022

feat: merge openmldb patch to v321 #27

Merged

Disable native last join for Spark 3.2.1 4paradigm/OpenMLDB#2640

Closed

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Backport the native implementation of last join for Spark 3.2.1 #28

Backport the native implementation of last join for Spark 3.2.1 #28

tobegit3hub commented Oct 11, 2022

Backport the native implementation of last join for Spark 3.2.1 #28

Backport the native implementation of last join for Spark 3.2.1 #28

Comments

tobegit3hub commented Oct 11, 2022