Skip to content

Commit

Permalink
[SPARK-48263] Collate function support for non UTF8_BINARY strings
Browse files Browse the repository at this point in the history
### What changes were proposed in this pull request?
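`collate("xx", "<non default>")` does not work when the default collation config is set to a collation other than UTF8_BINARY. This PR makes `CollateExpressionBuilder` accept a collation-name argument of any `StringType` (matching on `_: StringType`), not only the default UTF8_BINARY `StringType`.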

### Why are the changes needed?
Fixing the compatibility issue with default collation config and collate function.

### Does this PR introduce _any_ user-facing change?
Yes. Users will be able to call the collate(<string>, <collation>) function even when the default collation config is set to a collation other than UTF8_BINARY. This expands the functionality available to users.

### How was this patch tested?
Added tests.

### Was this patch authored or co-authored using generative AI tooling?
No.

Closes apache#46574 from nebojsa-db/SPARK-48263.

Authored-by: Nebojsa Savic <[email protected]>
Signed-off-by: Wenchen Fan <[email protected]>
  • Loading branch information
nebojsa-db authored and cloud-fan committed May 14, 2024
1 parent 97bf1ee commit 91da2ca
Show file tree
Hide file tree
Showing 2 changed files with 14 additions and 4 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -57,14 +57,14 @@ object CollateExpressionBuilder extends ExpressionBuilder {
expressions match {
case Seq(e: Expression, collationExpr: Expression) =>
(collationExpr.dataType, collationExpr.foldable) match {
case (StringType, true) =>
case (_: StringType, true) =>
val evalCollation = collationExpr.eval()
if (evalCollation == null) {
throw QueryCompilationErrors.unexpectedNullError("collation", collationExpr)
} else {
Collate(e, evalCollation.toString)
}
case (StringType, false) => throw QueryCompilationErrors.nonFoldableArgumentError(
case (_: StringType, false) => throw QueryCompilationErrors.nonFoldableArgumentError(
funcName, "collationName", StringType)
case (_, _) => throw QueryCompilationErrors.unexpectedInputDataTypeError(
funcName, 1, StringType, collationExpr)
Expand Down
14 changes: 12 additions & 2 deletions sql/core/src/test/scala/org/apache/spark/sql/CollationSuite.scala
Original file line number Diff line number Diff line change
Expand Up @@ -67,8 +67,18 @@ class CollationSuite extends DatasourceV2SQLBase with AdaptiveSparkPlanHelper {
}

// Checks the result type of `collate(<string>, <collation name>)`: the collation names are
// passed in lower case in the SQL text and the resulting column type is compared against a
// StringType built from the upper-case collation name.
// NOTE(review): this span is a rendered diff hunk. The StringType(0) / StringType(1)
// assertions appear to be the removed lines and the StringType("...") assertions their
// replacements; confirm against the real file before treating all four as live code.
test("collate function syntax") {
assert(sql(s"select collate('aaa', 'utf8_binary')").schema(0).dataType == StringType(0))
assert(sql(s"select collate('aaa', 'utf8_binary_lcase')").schema(0).dataType == StringType(1))
assert(sql(s"select collate('aaa', 'utf8_binary')").schema(0).dataType ==
StringType("UTF8_BINARY"))
assert(sql(s"select collate('aaa', 'utf8_binary_lcase')").schema(0).dataType ==
StringType("UTF8_BINARY_LCASE"))
}

// With the session default collation overridden to UTF8_BINARY_LCASE, `collate` must still
// produce a StringType carrying the collation named in its second argument, both when that
// name equals the configured default and when it differs from it.
test("collate function syntax with default collation set") {
  withSQLConf(SqlApiConf.DEFAULT_COLLATION -> "UTF8_BINARY_LCASE") {
    // (collation name as written in SQL, collation name expected on the result type)
    val cases = Seq(
      "utf8_binary_lcase" -> "UTF8_BINARY_LCASE",
      "UNICODE" -> "UNICODE")
    cases.foreach { case (collationArg, expectedCollation) =>
      val resultType = sql(s"select collate('aaa', '$collationArg')").schema(0).dataType
      assert(resultType == StringType(expectedCollation))
    }
  }
}

test("collate function syntax invalid arg count") {
Expand Down

0 comments on commit 91da2ca

Please sign in to comment.