From dab104b04189d1e9054ff78a7144af4446a13f67 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=BC=A0=E5=AD=90=E6=81=92?=
Date: Tue, 28 May 2024 17:33:51 +0800
Subject: [PATCH] feat: support locate(substr, str[, pos]) function(#820)

---
 hybridse/src/codegen/udf_ir_builder_test.cc | 24 ++++++++++++
 hybridse/src/udf/default_udf_library.cc     | 41 +++++++++++++++++++++
 hybridse/src/udf/udf.cc                     | 36 ++++++++++++++++++
 hybridse/src/udf/udf.h                      |  2 +
 4 files changed, 103 insertions(+)

diff --git a/hybridse/src/codegen/udf_ir_builder_test.cc b/hybridse/src/codegen/udf_ir_builder_test.cc
index 6cd82be7859..91a56b83c49 100644
--- a/hybridse/src/codegen/udf_ir_builder_test.cc
+++ b/hybridse/src/codegen/udf_ir_builder_test.cc
@@ -766,6 +766,30 @@ TEST_F(UdfIRBuilderTest, SubstringPosUdfTest) {
                                             StringRef("1234567890"), -12);
 }
 
+TEST_F(UdfIRBuilderTest, LocateUdfTest) {
+    CheckUdf("locate", 1, StringRef("ab"), StringRef("abcab"));  // first of two matches wins
+    CheckUdf("locate", 3, StringRef("ab"), StringRef("bcab"));
+    CheckUdf("locate", 0, StringRef("ab"), StringRef("bcAb"));  // matching is case-sensitive
+    CheckUdf("locate", 1, StringRef(""), StringRef(""));  // "" is found at position 1
+}
+
+TEST_F(UdfIRBuilderTest, LocatePosUdfTest) {
+    CheckUdf("locate", 0, StringRef("ab"), StringRef("ab"), -1);  // non-positive pos yields 0
+    CheckUdf("locate", 0, StringRef("ab"), StringRef("Ab"), 1);
+
+    CheckUdf("locate", 4, StringRef("ab"), StringRef("abcab"), 2);  // match before pos is skipped
+    CheckUdf("locate", 0, StringRef("ab"), StringRef("abcAb"), 2);
+    CheckUdf("locate", 4, StringRef("ab"), StringRef("abcab"), 4);  // match starting exactly at pos
+    CheckUdf("locate", 0, StringRef("ab"), StringRef("abcab"), 6);  // no room left for substr
+
+    CheckUdf("locate", 5, StringRef(""), StringRef("abcab"), 5);
+    CheckUdf("locate", 6, StringRef(""), StringRef("abcab"), 6);  // pos == len(str) + 1 still matches ""
+    CheckUdf("locate", 0, StringRef(""), StringRef("abcab"), 7);
+
+    CheckUdf("locate", 1, StringRef(""), StringRef(""), 1);
+    CheckUdf("locate", 0, StringRef(""), StringRef(""), 2);
+}
+
 TEST_F(UdfIRBuilderTest, UpperUcase) {
     CheckUdf, Nullable>("upper", StringRef("SQL"), StringRef("Sql"));
     CheckUdf, Nullable>("ucase",
StringRef("SQL"), StringRef("Sql"));
diff --git a/hybridse/src/udf/default_udf_library.cc b/hybridse/src/udf/default_udf_library.cc
index d6fed696ab3..a6be6745917 100644
--- a/hybridse/src/udf/default_udf_library.cc
+++ b/hybridse/src/udf/default_udf_library.cc
@@ -911,6 +911,47 @@ void DefaultUdfLibrary::InitStringUdf() {
 
     RegisterAlias("substr", "substring");
 
+    RegisterExternal("locate")
+        .args(
+            static_cast(udf::v1::locate))
+        .doc(R"(
+            @brief Returns the position of the first occurrence of substr in str. The given pos and return value are 1-based.
+            This is a version of the `locate` function where `pos` has a default value of 1.
+
+            Example:
+
+            @code{.sql}
+
+                select locate("wo", "hello world");
+                --output 7
+
+            @endcode)");
+
+    RegisterExternal("locate")
+        .args(
+            static_cast(udf::v1::locate))
+        .doc(R"(
+            @brief Returns the position of the first occurrence of substr in str starting from position pos. The given pos and return value are 1-based.
+
+            Example:
+
+            @code{.sql}
+
+                select locate("wo", "hello world", 2);
+                --output 7
+
+                select locate("Wo", "hello world", 2);
+                --output 0
+
+            @endcode
+
+            @param substr
+            @param str
+            @param pos: define the beginning search position of the str.
+            - Zero or negative value is illegal and will return 0 directly;
+            - If substr is "" and pos <= len(str) + 1, return pos; otherwise return 0.
+        )");
+
     RegisterExternal("strcmp")
         .args(
             static_cast(
diff --git a/hybridse/src/udf/udf.cc b/hybridse/src/udf/udf.cc
index b32d75d4ac8..35dc82645cc 100644
--- a/hybridse/src/udf/udf.cc
+++ b/hybridse/src/udf/udf.cc
@@ -1083,6 +1083,42 @@ void sub_string(StringRef *str, int32_t from, int32_t len,
     output->size_ = static_cast(len);
     return;
 }
+
+int32_t locate(StringRef *substr, StringRef *str) {
+    return locate(substr, str, 1);
+}
+
+// Returns the 1-based position of the first occurrence of `substr` in `str`
+// found at or after the 1-based position `pos`, or 0 if there is none.
+int32_t locate(StringRef *substr, StringRef *str, int32_t pos) {
+    // null arguments and zero or negative pos are illegal, return 0 directly
+    if (nullptr == substr || nullptr == str || pos <= 0) {
+        return 0;
+    }
+    // use 64-bit math: `pos + sub_size - 1` may wrap around in 32 bits and
+    // let an oversized substr slip through the bounds check below
+    const uint64_t sub_size = substr->size_;
+    const uint64_t size = str->size_;
+    const uint64_t start = static_cast<uint64_t>(pos) - 1;
+    // no room for substr at pos; "" still matches while pos <= len(str) + 1
+    if (start + sub_size > size) {
+        return 0;
+    }
+    if (sub_size == 0) {
+        return pos;
+    }
+    for (uint64_t i = start; i + sub_size <= size; i++) {
+        uint64_t j = 0;
+        while (j < sub_size && str->data_[i + j] == substr->data_[j]) {
+            j++;
+        }
+        if (j == sub_size) {
+            return static_cast<int32_t>(i + 1);
+        }
+    }
+    return 0;
+}
+
 int32_t strcmp(StringRef *s1, StringRef *s2) {
     if (s1 == s2) {
         return 0;
diff --git a/hybridse/src/udf/udf.h b/hybridse/src/udf/udf.h
index b7f222433a7..9f41e3aac1d 100644
--- a/hybridse/src/udf/udf.h
+++ b/hybridse/src/udf/udf.h
@@ -390,6 +390,8 @@ void sub_string(StringRef *str, int32_t pos,
                 StringRef *output);
 void sub_string(StringRef *str, int32_t pos, int32_t len,
                 StringRef *output);
+int32_t locate(StringRef *substr, StringRef *str);
+int32_t locate(StringRef *substr, StringRef *str, int32_t pos);
 int32_t strcmp(StringRef *s1, StringRef *s2);
 
 void bool_to_string(bool v, StringRef *output);