Skip to content

Commit

Permalink
Improve regexp_match performance by avoiding cloning Regex
Browse files Browse the repository at this point in the history
  • Loading branch information
viirya committed Dec 22, 2023
1 parent bc013fc commit a47e771
Showing 1 changed file with 80 additions and 3 deletions.
83 changes: 80 additions & 3 deletions datafusion/physical-expr/src/regex_expressions.rs
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,8 @@ use arrow::array::{
new_null_array, Array, ArrayDataBuilder, ArrayRef, BufferBuilder, GenericStringArray,
OffsetSizeTrait,
};
use arrow::compute;
use arrow_array::builder::{GenericStringBuilder, ListBuilder};
use arrow_schema::ArrowError;
use datafusion_common::plan_err;
use datafusion_common::{
cast::as_generic_string_array, internal_err, DataFusionError, Result,
Expand Down Expand Up @@ -58,7 +59,7 @@ pub fn regexp_match<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
2 => {
let values = as_generic_string_array::<T>(&args[0])?;
let regex = as_generic_string_array::<T>(&args[1])?;
compute::regexp_match(values, regex, None).map_err(DataFusionError::ArrowError)
_regexp_match(values, regex, None).map_err(DataFusionError::ArrowError)
}
3 => {
let values = as_generic_string_array::<T>(&args[0])?;
Expand All @@ -69,7 +70,7 @@ pub fn regexp_match<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
Some(f) if f.iter().any(|s| s == Some("g")) => {
plan_err!("regexp_match() does not support the \"global\" option")
},
_ => compute::regexp_match(values, regex, flags).map_err(DataFusionError::ArrowError),
_ => _regexp_match(values, regex, flags).map_err(DataFusionError::ArrowError),
}
}
other => internal_err!(
Expand All @@ -78,6 +79,82 @@ pub fn regexp_match<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
}
}

/// TODO: Remove this once it is included in arrow-rs new release.
fn _regexp_match<OffsetSize: OffsetSizeTrait>(
array: &GenericStringArray<OffsetSize>,
regex_array: &GenericStringArray<OffsetSize>,
flags_array: Option<&GenericStringArray<OffsetSize>>,
) -> std::result::Result<ArrayRef, ArrowError> {
let mut patterns: std::collections::HashMap<String, Regex> =
std::collections::HashMap::new();
let builder: GenericStringBuilder<OffsetSize> =
GenericStringBuilder::with_capacity(0, 0);
let mut list_builder = ListBuilder::new(builder);

let complete_pattern = match flags_array {
Some(flags) => Box::new(regex_array.iter().zip(flags.iter()).map(
|(pattern, flags)| {
pattern.map(|pattern| match flags {
Some(value) => format!("(?{value}){pattern}"),
None => pattern.to_string(),
})
},
)) as Box<dyn Iterator<Item = Option<String>>>,
None => Box::new(
regex_array
.iter()
.map(|pattern| pattern.map(|pattern| pattern.to_string())),
),
};

array
.iter()
.zip(complete_pattern)
.map(|(value, pattern)| {
match (value, pattern) {
// Required for Postgres compatibility:
// SELECT regexp_match('foobarbequebaz', ''); = {""}
(Some(_), Some(pattern)) if pattern == *"" => {
list_builder.values().append_value("");
list_builder.append(true);
}
(Some(value), Some(pattern)) => {
let existing_pattern = patterns.get(&pattern);
let re = match existing_pattern {
Some(re) => re,
None => {
let re = Regex::new(pattern.as_str()).map_err(|e| {
ArrowError::ComputeError(format!(
"Regular expression did not compile: {e:?}"
))
})?;
patterns.insert(pattern.clone(), re);
patterns.get(&pattern).unwrap()
}
};
match re.captures(value) {
Some(caps) => {
let mut iter = caps.iter();
if caps.len() > 1 {
iter.next();
}
for m in iter.flatten() {
list_builder.values().append_value(m.as_str());
}

list_builder.append(true);
}
None => list_builder.append(false),
}
}
_ => list_builder.append(false),
}
Ok(())
})
.collect::<std::result::Result<Vec<()>, ArrowError>>()?;
Ok(Arc::new(list_builder.finish()))
}

/// replace POSIX capture groups (like \1) with Rust Regex group (like ${1})
/// used by regexp_replace
fn regex_replace_posix_groups(replacement: &str) -> String {
Expand Down

0 comments on commit a47e771

Please sign in to comment.