From 538d484edf060913346f915dcfbfc0d1ac0c012f Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Wed, 11 Dec 2024 17:38:23 -0700 Subject: [PATCH] Move string kernels and expressions to spark-expr crate --- .../src/execution/datafusion/expressions/mod.rs | 1 - native/core/src/execution/datafusion/planner.rs | 15 ++++++--------- native/core/src/execution/kernels/mod.rs | 2 -- native/spark-expr/src/kernels/mod.rs | 1 + .../src}/kernels/strings.rs | 7 +++---- native/spark-expr/src/lib.rs | 2 ++ .../expressions => spark-expr/src}/strings.rs | 2 +- 7 files changed, 13 insertions(+), 17 deletions(-) rename native/{core/src/execution => spark-expr/src}/kernels/strings.rs (96%) rename native/{core/src/execution/datafusion/expressions => spark-expr/src}/strings.rs (99%) diff --git a/native/core/src/execution/datafusion/expressions/mod.rs b/native/core/src/execution/datafusion/expressions/mod.rs index 2bb14df36..5f9f322b2 100644 --- a/native/core/src/execution/datafusion/expressions/mod.rs +++ b/native/core/src/execution/datafusion/expressions/mod.rs @@ -23,7 +23,6 @@ use crate::errors::CometError; pub mod bloom_filter_agg; pub mod bloom_filter_might_contain; pub mod negative; -pub mod strings; pub mod subquery; pub mod unbound; diff --git a/native/core/src/execution/datafusion/planner.rs b/native/core/src/execution/datafusion/planner.rs index 5e77b3f65..0e64ed6af 100644 --- a/native/core/src/execution/datafusion/planner.rs +++ b/native/core/src/execution/datafusion/planner.rs @@ -25,12 +25,8 @@ use crate::{ datafusion::{ expressions::{ bloom_filter_agg::BloomFilterAgg, - bloom_filter_might_contain::BloomFilterMightContain, - checkoverflow::CheckOverflow, - negative, - strings::{Contains, EndsWith, Like, StartsWith, StringSpaceExpr, SubstringExpr}, - subquery::Subquery, - unbound::UnboundColumn, + bloom_filter_might_contain::BloomFilterMightContain, checkoverflow::CheckOverflow, + negative, subquery::Subquery, unbound::UnboundColumn, }, operators::expand::CometExpandExec, 
shuffle_writer::ShuffleWriterExec, @@ -90,9 +86,10 @@ use datafusion_comet_proto::{ spark_partitioning::{partitioning::PartitioningStruct, Partitioning as SparkPartitioning}, }; use datafusion_comet_spark_expr::{ - ArrayInsert, Avg, AvgDecimal, BitwiseNotExpr, Cast, Correlation, Covariance, CreateNamedStruct, - DateTruncExpr, GetArrayStructFields, GetStructField, HourExpr, IfExpr, ListExtract, MinuteExpr, - NormalizeNaNAndZero, RLike, SecondExpr, SparkCastOptions, Stddev, SumDecimal, + ArrayInsert, Avg, AvgDecimal, BitwiseNotExpr, Cast, Contains, Correlation, Covariance, + CreateNamedStruct, DateTruncExpr, EndsWith, GetArrayStructFields, GetStructField, HourExpr, + IfExpr, Like, ListExtract, MinuteExpr, NormalizeNaNAndZero, RLike, SecondExpr, + SparkCastOptions, StartsWith, Stddev, StringSpaceExpr, SubstringExpr, SumDecimal, TimestampTruncExpr, ToJson, Variance, }; use datafusion_common::scalar::ScalarStructBuilder; diff --git a/native/core/src/execution/kernels/mod.rs b/native/core/src/execution/kernels/mod.rs index 675dcd489..d72dd4060 100644 --- a/native/core/src/execution/kernels/mod.rs +++ b/native/core/src/execution/kernels/mod.rs @@ -19,5 +19,3 @@ mod hash; pub use hash::hash; - -pub(crate) mod strings; diff --git a/native/spark-expr/src/kernels/mod.rs b/native/spark-expr/src/kernels/mod.rs index 88aa34b1a..3669ff13a 100644 --- a/native/spark-expr/src/kernels/mod.rs +++ b/native/spark-expr/src/kernels/mod.rs @@ -17,4 +17,5 @@ //! 
Kernels +pub mod strings; pub(crate) mod temporal; diff --git a/native/core/src/execution/kernels/strings.rs b/native/spark-expr/src/kernels/strings.rs similarity index 96% rename from native/core/src/execution/kernels/strings.rs rename to native/spark-expr/src/kernels/strings.rs index d63b2c477..bb275fbb9 100644 --- a/native/core/src/execution/kernels/strings.rs +++ b/native/spark-expr/src/kernels/strings.rs @@ -25,15 +25,14 @@ use arrow::{ compute::kernels::substring::{substring as arrow_substring, substring_by_char}, datatypes::{DataType, Int32Type}, }; - -use crate::errors::ExpressionError; +use datafusion_common::DataFusionError; /// Returns an ArrayRef with a string consisting of `length` spaces. /// /// # Preconditions /// /// - elements in `length` must not be negative -pub fn string_space(length: &dyn Array) -> Result<ArrayRef, ExpressionError> { +pub fn string_space(length: &dyn Array) -> Result<ArrayRef, DataFusionError> { match length.data_type() { DataType::Int32 => { let array = length.as_any().downcast_ref::<Int32Array>().unwrap(); @@ -52,7 +51,7 @@ pub fn string_space(length: &dyn Array) -> Result<ArrayRef, ExpressionError> { } } -pub fn substring(array: &dyn Array, start: i64, length: u64) -> Result<ArrayRef, ExpressionError> { +pub fn substring(array: &dyn Array, start: i64, length: u64) -> Result<ArrayRef, DataFusionError> { match array.data_type() { DataType::LargeUtf8 => substring_by_char( array diff --git a/native/spark-expr/src/lib.rs b/native/spark-expr/src/lib.rs index 15f446ef3..5dff6e0b8 100644 --- a/native/spark-expr/src/lib.rs +++ b/native/spark-expr/src/lib.rs @@ -33,6 +33,8 @@ mod correlation; pub use correlation::Correlation; mod covariance; pub use covariance::Covariance; +mod strings; +pub use strings::{Contains, EndsWith, Like, StartsWith, StringSpaceExpr, SubstringExpr}; mod kernels; mod list; mod regexp; diff --git a/native/core/src/execution/datafusion/expressions/strings.rs b/native/spark-expr/src/strings.rs similarity index 99% rename from native/core/src/execution/datafusion/expressions/strings.rs rename to native/spark-expr/src/strings.rs index 200b4ec5a..a8aab6aee 
100644 --- a/native/core/src/execution/datafusion/expressions/strings.rs +++ b/native/spark-expr/src/strings.rs @@ -17,7 +17,7 @@ #![allow(deprecated)] -use crate::execution::kernels::strings::{string_space, substring}; +use crate::kernels::strings::{string_space, substring}; use arrow::{ compute::{ contains_dyn, contains_utf8_scalar_dyn, ends_with_dyn, ends_with_utf8_scalar_dyn, like_dyn,