Skip to content

Commit

Permalink
feat(query): add string function: split and split_part (#13303)
Browse files Browse the repository at this point in the history
* feat(query): add string function: split and split_part

* add some test
  • Loading branch information
TCeason authored Oct 18, 2023
1 parent 8908d27 commit ce6e979
Show file tree
Hide file tree
Showing 5 changed files with 233 additions and 0 deletions.
68 changes: 68 additions & 0 deletions src/query/functions/src/scalars/string.rs
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ use common_expression::types::number::SimpleDomain;
use common_expression::types::number::UInt64Type;
use common_expression::types::string::StringColumn;
use common_expression::types::string::StringColumnBuilder;
use common_expression::types::ArrayType;
use common_expression::types::NumberType;
use common_expression::types::StringType;
use common_expression::vectorize_with_builder_1_arg;
Expand Down Expand Up @@ -757,6 +758,73 @@ pub fn register(registry: &mut FunctionRegistry) {
}
}),
);

// `split(str, sep) -> Array(String)`
//
// Cuts `str` at every occurrence of `sep` and returns the pieces, in order, as one array
// per input row. Splitting follows Rust's `str::split`, so adjacent or trailing separators
// yield empty strings (e.g. `split('sakila', 'a')` gives `['s', 'kil', '']`).
//
// Both arguments must be valid UTF-8. `str` is checked first, then `sep`; on failure the
// error is recorded for the current row via `ctx.set_error` and an empty array is committed
// so that the output stays aligned with the input rows.
registry
    .register_passthrough_nullable_2_arg::<StringType, StringType, ArrayType<StringType>, _, _>(
        "split",
        |_, _, _| FunctionDomain::Full,
        vectorize_with_builder_2_arg::<StringType, StringType, ArrayType<StringType>>(
            |str, sep, output, ctx| {
                // Validate the raw bytes in place: borrowing avoids copying both inputs
                // into fresh `String`s for every row.
                let inputs = std::str::from_utf8(str)
                    .and_then(|s| std::str::from_utf8(sep).map(|sep| (s, sep)));
                match inputs {
                    Ok((s, sep)) => {
                        let pieces: Vec<&str> = s.split(sep).collect();
                        // The pieces together can never hold more bytes than the input.
                        let mut builder =
                            StringColumnBuilder::with_capacity(pieces.len(), s.len());
                        for piece in pieces {
                            builder.put_slice(piece.as_bytes());
                            builder.commit_row();
                        }
                        output.builder.append_column(&builder.build());
                        output.commit_row();
                    }
                    Err(e) => {
                        ctx.set_error(output.len(), e.to_string());
                        output.commit_row();
                    }
                }
            },
        ),
    );

// `split_part(str, sep, part) -> String`
//
// Splits `str` on `sep` (same rules as Rust's `str::split`) and returns one piece:
//   * `part > 0`  selects the `part`-th piece counted from the start (1-based);
//   * `part < 0`  selects the `|part|`-th piece counted from the end;
//   * `part == 0` is treated like `1` and returns the first piece;
//   * a position outside the available pieces returns an empty string.
//
// Both string arguments must be valid UTF-8. `str` is checked first, then `sep`; on failure
// the error is recorded for the current row via `ctx.set_error` and an empty string is
// committed so that the output stays aligned with the input rows.
registry
    .register_passthrough_nullable_3_arg::<StringType, StringType, NumberType<i64>, StringType, _, _>(
        "split_part",
        |_, _, _, _| FunctionDomain::Full,
        vectorize_with_builder_3_arg::<StringType, StringType, NumberType<i64>, StringType>(
            |str, sep, part, output, ctx| {
                // Validate the raw bytes in place: borrowing avoids copying both inputs
                // into fresh `String`s for every row.
                let inputs = std::str::from_utf8(str)
                    .and_then(|s| std::str::from_utf8(sep).map(|sep| (s, sep)));
                match inputs {
                    Ok((s, sep)) => {
                        let pieces: Vec<&str> = s.split(sep).collect();
                        // `str::split` always yields at least one piece, so `len >= 1`.
                        let len = pieces.len() as i64;
                        // Translate the SQL position into a zero-based index. The result
                        // falls outside `0..len` exactly when `part` is out of range.
                        let idx = match part.cmp(&0i64) {
                            Ordering::Greater => part - 1,
                            Ordering::Less => len + part,
                            Ordering::Equal => 0,
                        };
                        if (0..len).contains(&idx) {
                            output.put_slice(pieces[idx as usize].as_bytes());
                        }
                        output.commit_row();
                    }
                    Err(e) => {
                        ctx.set_error(output.len(), e.to_string());
                        output.commit_row();
                    }
                }
            },
        ),
    )
}

pub(crate) mod soundex {
Expand Down
22 changes: 22 additions & 0 deletions src/query/functions/tests/it/scalars/string.rs
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@ fn test_string() {
test_left(file);
test_right(file);
test_substr(file);
test_split(file)
}

fn test_upper(file: &mut impl Write) {
Expand Down Expand Up @@ -679,3 +680,24 @@ fn test_substr(file: &mut impl Write) {
),
]);
}

/// Golden-file cases for the `split` scalar function: three constant-folded literal calls,
/// followed by one column-wise evaluation over nullable `str` / `sep` inputs whose first
/// row is NULL in both columns.
fn test_split(file: &mut impl Write) {
    // Literal arguments only, so no input columns are supplied.
    for expr in [
        "split('Sakila', 'il')",
        "split('sakila', 'a')",
        "split('abc','b')",
    ] {
        run_ast(file, expr, &[]);
    }

    // Row 0 is marked invalid (NULL) in both columns; rows 1..=3 carry real values.
    let validity = vec![false, true, true, true];
    let str_column = StringType::from_data_with_validity(
        &["127.0.0.1", "aaa--bbb-BBB--ccc", "cc", "aeeceedeef"],
        validity.clone(),
    );
    let sep_column = StringType::from_data_with_validity(&[".", "--", "cc", "ee"], validity);

    run_ast(file, "split(str, sep)", &[
        ("str", str_column),
        ("sep", sep_column),
    ]);
}
Original file line number Diff line number Diff line change
Expand Up @@ -3080,6 +3080,10 @@ Functions overloads:
1 soundex(String NULL) :: String NULL
0 space(UInt64) :: String
1 space(UInt64 NULL) :: String NULL
0 split(String, String) :: Array(String)
1 split(String NULL, String NULL) :: Array(String) NULL
0 split_part(String, String, Int64) :: String
1 split_part(String NULL, String NULL, Int64 NULL) :: String NULL
0 sqrt(UInt8) :: Float64
1 sqrt(UInt8 NULL) :: Float64 NULL
2 sqrt(UInt16) :: Float64
Expand Down
51 changes: 51 additions & 0 deletions src/query/functions/tests/it/scalars/testdata/string.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3282,3 +3282,54 @@ evaluation (internal):
+--------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+


ast : split('Sakila', 'il')
raw expr : split('Sakila', 'il')
checked expr : split<String, String>("Sakila", "il")
optimized expr : ['Sak', 'a']
output type : Array(String)
output domain : [{"Sak"..="a"}]
output : ['Sak', 'a']


ast : split('sakila', 'a')
raw expr : split('sakila', 'a')
checked expr : split<String, String>("sakila", "a")
optimized expr : ['s', 'kil', '']
output type : Array(String)
output domain : [{""..="s"}]
output : ['s', 'kil', '']


ast : split('abc','b')
raw expr : split('abc', 'b')
checked expr : split<String, String>("abc", "b")
optimized expr : ['a', 'c']
output type : Array(String)
output domain : [{"a"..="c"}]
output : ['a', 'c']


ast : split(str, sep)
raw expr : split(str::String NULL, sep::String NULL)
checked expr : split<String NULL, String NULL>(str, sep)
evaluation:
+--------+-------------------------------+------------------------+---------------------------+
| | str | sep | Output |
+--------+-------------------------------+------------------------+---------------------------+
| Type | String NULL | String NULL | Array(String) NULL |
| Domain | {"127.0.0.1"..="cc"} ∪ {NULL} | {"--"..="ee"} ∪ {NULL} | [{""..}] ∪ {NULL} |
| Row 0 | NULL | NULL | NULL |
| Row 1 | 'aaa--bbb-BBB--ccc' | '--' | ['aaa', 'bbb-BBB', 'ccc'] |
| Row 2 | 'cc' | 'cc' | ['', ''] |
| Row 3 | 'aeeceedeef' | 'ee' | ['a', 'c', 'd', 'f'] |
+--------+-------------------------------+------------------------+---------------------------+
evaluation (internal):
+--------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Column | Data |
+--------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| str | NullableColumn { column: StringColumn { data: 0x3132372e302e302e316161612d2d6262622d4242422d2d636363636361656563656564656566, offsets: [0, 9, 26, 28, 38] }, validity: [0b____1110] } |
| sep | NullableColumn { column: StringColumn { data: 0x2e2d2d63636565, offsets: [0, 1, 3, 5, 7] }, validity: [0b____1110] } |
| Output | NullableColumn { column: ArrayColumn { values: StringColumn { data: 0x3132373030316161616262622d42424263636361636466, offsets: [0, 3, 4, 5, 6, 9, 16, 19, 19, 19, 20, 21, 22, 23] }, offsets: [0, 4, 7, 9, 13] }, validity: [0b____1110] } |
+--------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+


Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
query T
select split('127.0.0.1', '.');
----
['127','0','0','1']

query T
select split('|', '|');
----
['','']

query T
select split('ab', '');
----
['','a','b','']

query T
select split_part('ab', '', 1);
----
(empty)

query T
select split_part('ab', '', 2);
----
a

query T
select split_part('|', '|', 1);
----
(empty)

query T
select split_part(null, null, 1);
----
NULL

query T
select split(null, null);
----
NULL


query TT
select * from
(select 0, split_part('11.22.33', '.', 0) UNION
select 1, split_part('11.22.33', '.', 1) UNION
select 2, split_part('11.22.33', '.', 2) UNION
select 3, split_part('11.22.33', '.', 3) UNION
select 4, split_part('11.22.33', '.', 4) UNION
select -1, split_part('11.22.33', '.', -1) UNION
select -2, split_part('11.22.33', '.', -2) UNION
select -3, split_part('11.22.33', '.', -3) UNION
select -4, split_part('11.22.33', '.', -4)) order by `0`;
----
-4 (empty)
-3 11
-2 22
-1 33
0 11
1 11
2 22
3 33
4 (empty)

statement ok
drop table if exists t;

statement ok
create table t(c1 string);

statement ok
insert into t values('127.0.0.1'), ('127.0.0.2'), ('192.168.1.3.2222')

query T
select split(c1, '.') from t order by c1
----
['127','0','0','1']
['127','0','0','2']
['192','168','1','3','2222']

query T
select split_part(c1, '.', -5), split_part(c1, '.', -4), split_part(c1, '.', 0), split_part(c1, '.', 1), split_part(c1, '.', 4), split_part(c1, '.', 5) from t order by c1
----
(empty) 127 127 127 1 (empty)
(empty) 127 127 127 2 (empty)
192 168 192 192 3 2222

statement ok
drop table t;

1 comment on commit ce6e979

@vercel
Copy link

@vercel vercel bot commented on ce6e979 Oct 18, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please sign in to comment.