Skip to content

Commit

Permalink
fix: Optimize rpad
Browse files Browse the repository at this point in the history
  • Loading branch information
kazuyukitanimura committed Aug 3, 2024
1 parent ed1a846 commit efc6286
Show file tree
Hide file tree
Showing 3 changed files with 16 additions and 15 deletions.
23 changes: 8 additions & 15 deletions native/spark-expr/src/scalar_funcs.rs
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,6 @@ use num::{
};
use std::fmt::Write;
use std::{cmp::min, sync::Arc};
use unicode_segmentation::UnicodeSegmentation;

mod unhex;
pub use unhex::spark_unhex;
Expand Down Expand Up @@ -412,7 +411,6 @@ fn spark_rpad_internal<T: OffsetSizeTrait>(
) -> Result<ColumnarValue, DataFusionError> {
let string_array = as_generic_string_array::<T>(array)?;
let length = 0.max(length) as usize;
let empty_str = "";
let space_string = " ".repeat(length);

let mut builder =
Expand All @@ -421,21 +419,16 @@ fn spark_rpad_internal<T: OffsetSizeTrait>(
for string in string_array.iter() {
match string {
Some(string) => {
if length == 0 {
builder.append_value(empty_str);
} else if length == 1 && string.len() > 0 {
// Special case: when length == 1, no need to calculate expensive graphemes
// It looks like Spark's UTF8String is closer to chars than to graphemes
// https://stackoverflow.com/a/46290728
let char_len = string.chars().count();
if length <= char_len {
builder.append_value(string);
} else {
let graphemes_len = string.graphemes(true).count();
if length <= graphemes_len {
builder.append_value(string);
} else {
// write_str updates only the value buffer, not null nor offset buffer
// This is convenient for concatenating str(s)
builder.write_str(string)?;
builder.append_value(&space_string[graphemes_len..]);
}
// write_str updates only the value buffer, not the null or offset buffers
// This is convenient for concatenating str(s)
builder.write_str(string)?;
builder.append_value(&space_string[char_len..]);
}
}
_ => builder.append_null(),
Expand Down
7 changes: 7 additions & 0 deletions spark/src/test/resources/tpcds-micro-benchmarks/char_type.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
-- Micro-benchmark query: selects cd_gender from customer_demographics,
-- keeping only the rows that satisfy all three equality predicates below.
-- NOTE(review): presumably the filtered columns are CHAR-typed in the benchmark
-- schema (suggested by the file name char_type.sql) -- confirm against the table definition.
SELECT
cd_gender
FROM customer_demographics
WHERE
cd_gender = 'M' AND
cd_marital_status = 'S' AND
cd_education_status = 'College'
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ object CometTPCDSMicroBenchmark extends CometTPCQueryBenchmarkBase {
"agg_sum_integers_no_grouping",
"case_when_column_or_null",
"case_when_scalar",
"char_type",
"filter_highly_selective",
"filter_less_selective",
"if_column_or_null",
Expand Down

0 comments on commit efc6286

Please sign in to comment.