From 8269fde63bef8a5bbb782ef4199acbac44bdc35b Mon Sep 17 00:00:00 2001 From: aoiasd Date: Tue, 22 Oct 2024 14:25:21 +0800 Subject: [PATCH 1/4] Tokenizer support build with params and clone for concurrency Signed-off-by: aoiasd --- internal/core/src/common/FieldMeta.cpp | 1 + internal/core/src/segcore/tokenizer_c.cpp | 12 ++ internal/core/src/segcore/tokenizer_c.h | 3 + .../tantivy/tantivy-binding/Cargo.lock | 6 +- .../tantivy/tantivy-binding/Cargo.toml | 1 + .../tantivy-binding/include/tantivy-binding.h | 2 + .../tantivy/tantivy-binding/src/lib.rs | 1 + .../tantivy/tantivy-binding/src/tokenizer.rs | 133 +++++++++++++-- .../tantivy-binding/src/tokenizer_c.rs | 7 + .../tantivy-binding/src/tokenizer_filter.rs | 159 ++++++++++++++++++ internal/core/thirdparty/tantivy/tokenizer.h | 9 + internal/core/unittest/test_text_match.cpp | 4 +- internal/util/ctokenizer/c_tokenizer.go | 9 + .../ctokenizer/text_schema_validator_test.go | 4 +- internal/util/function/bm25_function.go | 37 +++- internal/util/tokenizerapi/tokenizer.go | 1 + .../testcases/test_full_text_search.py | 5 +- 17 files changed, 368 insertions(+), 26 deletions(-) create mode 100644 internal/core/thirdparty/tantivy/tantivy-binding/src/tokenizer_filter.rs diff --git a/internal/core/src/common/FieldMeta.cpp b/internal/core/src/common/FieldMeta.cpp index ca55d45e67ac2..a90f768781fe2 100644 --- a/internal/core/src/common/FieldMeta.cpp +++ b/internal/core/src/common/FieldMeta.cpp @@ -16,6 +16,7 @@ #include #include "Consts.h" +#include "log/Log.h" namespace milvus { TokenizerParams diff --git a/internal/core/src/segcore/tokenizer_c.cpp b/internal/core/src/segcore/tokenizer_c.cpp index 85a3cc39d4f55..a33a6bd9bfd85 100644 --- a/internal/core/src/segcore/tokenizer_c.cpp +++ b/internal/core/src/segcore/tokenizer_c.cpp @@ -10,6 +10,7 @@ // or implied. 
See the License for the specific language governing permissions and limitations under the License #include "segcore/tokenizer_c.h" +#include #include "common/FieldMeta.h" #include "common/protobuf_utils.h" #include "pb/schema.pb.h" @@ -30,6 +31,17 @@ create_tokenizer(CMap m, CTokenizer* tokenizer) { } } +CStatus +clone_tokenizer(CTokenizer* tokenizer, CTokenizer* rst) { + try { + auto impl = reinterpret_cast(*tokenizer); + *rst = impl->Clone().release(); + return milvus::SuccessCStatus(); + } catch (std::exception& e) { + return milvus::FailureCStatus(&e); + } +} + void free_tokenizer(CTokenizer tokenizer) { auto impl = reinterpret_cast(tokenizer); diff --git a/internal/core/src/segcore/tokenizer_c.h b/internal/core/src/segcore/tokenizer_c.h index 901689c5337ef..3f84da729efaa 100644 --- a/internal/core/src/segcore/tokenizer_c.h +++ b/internal/core/src/segcore/tokenizer_c.h @@ -26,6 +26,9 @@ typedef void* CTokenizer; CStatus create_tokenizer(CMap m, CTokenizer* tokenizer); +CStatus +clone_tokenizer(CTokenizer* tokenizer, CTokenizer* rst); + void free_tokenizer(CTokenizer tokenizer); diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/Cargo.lock b/internal/core/thirdparty/tantivy/tantivy-binding/Cargo.lock index 47872ac8120b8..a72e056522e8d 100644 --- a/internal/core/thirdparty/tantivy/tantivy-binding/Cargo.lock +++ b/internal/core/thirdparty/tantivy/tantivy-binding/Cargo.lock @@ -1021,11 +1021,12 @@ dependencies = [ [[package]] name = "serde_json" -version = "1.0.115" +version = "1.0.128" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "12dc5c46daa8e9fdf4f5e71b6cf9a53f2487da0e86e55808e2d35539666497dd" +checksum = "6ff5456707a1de34e7e37f2a6fd3d3f808c318259cbd01ab6377795054b483d8" dependencies = [ "itoa", + "memchr", "ryu", "serde", ] @@ -1166,6 +1167,7 @@ dependencies = [ "libc", "log", "scopeguard", + "serde_json", "tantivy", "tantivy-jieba", "zstd-sys", diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/Cargo.toml b/internal/core/thirdparty/tantivy/tantivy-binding/Cargo.toml index 3bf9759d470f8..6b26b3ab67e7e 100644 --- a/internal/core/thirdparty/tantivy/tantivy-binding/Cargo.toml +++ b/internal/core/thirdparty/tantivy/tantivy-binding/Cargo.toml @@ -15,6 +15,7 @@ env_logger = "0.11.3" log = "0.4.21" tantivy-jieba = "0.10.0" lazy_static = "1.4.0" +serde_json = "1.0.128" [build-dependencies] cbindgen = "0.26.0" diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/include/tantivy-binding.h b/internal/core/thirdparty/tantivy/tantivy-binding/include/tantivy-binding.h index c443ec7fc7a0e..391cece60bccd 100644 --- a/internal/core/thirdparty/tantivy/tantivy-binding/include/tantivy-binding.h +++ b/internal/core/thirdparty/tantivy/tantivy-binding/include/tantivy-binding.h @@ -159,6 +159,8 @@ const char *tantivy_token_stream_get_token(void *token_stream); void *tantivy_create_tokenizer(void *tokenizer_params); +void *tantivy_clone_tokenizer(void *ptr); + void tantivy_free_tokenizer(void *tokenizer); bool tantivy_index_exist(const char *path); diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/lib.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/lib.rs index fd73108fd4954..f5df4dc10ff15 100644 --- a/internal/core/thirdparty/tantivy/tantivy-binding/src/lib.rs +++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/lib.rs @@ -15,6 +15,7 @@ mod log; mod string_c; mod token_stream_c; mod tokenizer; +mod tokenizer_filter; mod tokenizer_c; mod util; mod util_c; diff --git 
a/internal/core/thirdparty/tantivy/tantivy-binding/src/tokenizer.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/tokenizer.rs index 2e0d283947377..128ebe266f2cb 100644 --- a/internal/core/thirdparty/tantivy/tantivy-binding/src/tokenizer.rs +++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/tokenizer.rs @@ -1,7 +1,10 @@ use lazy_static::lazy_static; -use log::{info, warn}; +use log::warn; use std::collections::HashMap; -use tantivy::tokenizer::{TextAnalyzer, TokenizerManager}; +use tantivy::tokenizer::*; +use serde_json as json; + +use crate::tokenizer_filter::*; use crate::log::init_log; lazy_static! { @@ -12,32 +15,128 @@ pub(crate) fn default_tokenizer() -> TextAnalyzer { DEFAULT_TOKENIZER_MANAGER.get("default").unwrap() } -fn jieba_tokenizer() -> TextAnalyzer { - tantivy_jieba::JiebaTokenizer {}.into() +struct TantivyBuilder<'a>{ + // builder: TextAnalyzerBuilder + filters:HashMap, + params:&'a json::Map } -pub(crate) fn create_tokenizer(params: &HashMap) -> Option { - init_log(); +impl TantivyBuilder<'_>{ + fn new(params: &json::Map) -> TantivyBuilder{ + TantivyBuilder{ + filters: HashMap::new(), + params:params, + } + } + + fn add_costom_filter(&mut self, name: &String, params: &json::Map){ + match SystemFilter::try_from(params){ + Ok(filter) => {self.filters.insert(name.to_string(), filter);}, + Err(_e) => {}, + }; + } - match params.get("tokenizer") { - Some(tokenizer_name) => match tokenizer_name.as_str() { - "default" => { - Some(default_tokenizer()) + fn add_costom_filters(&mut self, params:&json::Map){ + for (name, value) in params{ + if !value.is_object(){ + continue; } + + self.add_costom_filter(name, value.as_object().unwrap()); + } + } + + fn build(mut self) -> Option{ + let tokenizer=self.params.get("tokenizer"); + if !tokenizer.is_none() && !tokenizer.unwrap().is_string(){ + return None; + } + + let tokenizer_name = { + if !tokenizer.is_none(){ + tokenizer.unwrap().as_str().unwrap() + }else{ + "standard" + } + }; + + match tokenizer_name { + "standard" => { + let mut builder = TextAnalyzer::builder(SimpleTokenizer::default()).dynamic(); + let filters= self.params.get("filter"); + if !filters.is_none() && filters.unwrap().is_array(){ + for filter in filters.unwrap().as_array().unwrap(){ + if filter.is_string(){ + let filter_name = filter.as_str().unwrap(); + let costum = self.filters.remove(filter_name); + if !costum.is_none(){ + builder = costum.unwrap().transform(builder); + continue; + } + // check if filter was system filter + let system = SystemFilter::from(filter_name); + match system { + SystemFilter::Invalid => { + log::warn!("build analyzer failed, filter not found :{}", filter_name); + return None + } + other => { + builder = other.transform(builder); + }, + } + } + } + } + Some(builder.build()) + } "jieba" => { - Some(jieba_tokenizer()) + Some(tantivy_jieba::JiebaTokenizer {}.into()) } s => { warn!("unsupported tokenizer: {}", s); None } - }, - None => { - Some(default_tokenizer()) } } } +pub(crate) fn create_tokenizer(params: &HashMap) -> Option { + init_log(); + + let analyzer_json_value = match params.get("analyzer"){ + Some(value) => { + let json_analyzer = json::from_str::(value); + if json_analyzer.is_err() { + return None; + } + let json_value = json_analyzer.unwrap(); + if !json_value.is_object(){ + return None + } + json_value + } + None => json::Value::Object(json::Map::::new()), + }; + + let analyzer_params= analyzer_json_value.as_object().unwrap(); + let mut builder = TantivyBuilder::new(analyzer_params); + let 
str_filter=params.get("filter"); + if !str_filter.is_none(){ + let json_filter = json::from_str::(str_filter.unwrap()); + if json_filter.is_err(){ + return None + } + + let filter_params = json_filter.unwrap(); + if !filter_params.is_object(){ + return None + } + + builder.add_costom_filters(filter_params.as_object().unwrap()); + } + builder.build() +} + #[cfg(test)] mod tests { use std::collections::HashMap; @@ -46,8 +145,12 @@ mod tests { #[test] fn test_create_tokenizer() { let mut params : HashMap = HashMap::new(); - params.insert("tokenizer".parse().unwrap(), "jieba".parse().unwrap()); + let analyzer_params = r#" + { + "tokenizer": "jieba" + }"#; + params.insert("analyzer".to_string(), analyzer_params.to_string()); let tokenizer = create_tokenizer(¶ms); assert!(tokenizer.is_some()); } diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/tokenizer_c.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/tokenizer_c.rs index c2caf097fc34c..ef572fcc4f2a6 100644 --- a/internal/core/thirdparty/tantivy/tantivy-binding/src/tokenizer_c.rs +++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/tokenizer_c.rs @@ -20,6 +20,13 @@ pub extern "C" fn tantivy_create_tokenizer(tokenizer_params: *mut c_void) -> *mu } } +#[no_mangle] +pub extern "C" fn tantivy_clone_tokenizer(ptr: *mut c_void) -> *mut c_void { + let analyzer=ptr as *mut TextAnalyzer; + let clone = unsafe {(*analyzer).clone()}; + create_binding(clone) +} + #[no_mangle] pub extern "C" fn tantivy_free_tokenizer(tokenizer: *mut c_void) { free_binding::(tokenizer); diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/tokenizer_filter.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/tokenizer_filter.rs new file mode 100644 index 0000000000000..9d4c27aa15ae7 --- /dev/null +++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/tokenizer_filter.rs @@ -0,0 +1,159 @@ +use tantivy::tokenizer::*; +use serde_json as json; + +pub(crate) enum SystemFilter{ + Invalid, + LowerCase(LowerCaser), + AsciiFolding(AsciiFoldingFilter), + AlphaNumOnly(AlphaNumOnlyFilter), + Length(RemoveLongFilter), + Stop(StopWordFilter), + Decompounder(SplitCompoundWords), + Stemmer(Stemmer) +} + +impl SystemFilter{ + pub(crate) fn transform(self, builder: TextAnalyzerBuilder) -> TextAnalyzerBuilder{ + match self{ + Self::LowerCase(filter) => builder.filter(filter).dynamic(), + Self::AsciiFolding(filter) => builder.filter(filter).dynamic(), + Self::AlphaNumOnly(filter) => builder.filter(filter).dynamic(), + Self::Length(filter) => builder.filter(filter).dynamic(), + Self::Stop(filter) => builder.filter(filter).dynamic(), + Self::Decompounder(filter) => builder.filter(filter).dynamic(), + Self::Stemmer(filter) => builder.filter(filter).dynamic(), + Self::Invalid => builder, + } + } +} + +// create length filter from params +// { +// "type": "length", +// "max": 10, // length +// } +// TODO support min length +fn get_length_filter(params: &json::Map) -> Result{ + let limit_str = params.get("max"); + if limit_str.is_none() || !limit_str.unwrap().is_u64(){ + return Err(()) + } + let limit = limit_str.unwrap().as_u64().unwrap() as usize; + Ok(SystemFilter::Length(RemoveLongFilter::limit(limit))) +} + +fn get_stop_filter(params: &json::Map)-> Result{ + let value = params.get("stop_words"); + if value.is_none() || !value.unwrap().is_array(){ + return Err(()) + } + + let stop_words= value.unwrap().as_array().unwrap(); + let mut str_list = Vec::::new(); + for element in stop_words{ + match element.as_str(){ + Some(word) => 
str_list.push(word.to_string()), + None => return Err(()) + } + }; + Ok(SystemFilter::Stop(StopWordFilter::remove(str_list))) +} + +fn get_decompounder_filter(params: &json::Map)-> Result{ + let value = params.get("word_list"); + if value.is_none() || !value.unwrap().is_array(){ + return Err(()) + } + + let stop_words= value.unwrap().as_array().unwrap(); + let mut str_list = Vec::::new(); + for element in stop_words{ + match element.as_str(){ + Some(word) => str_list.push(word.to_string()), + None => return Err(()) + } + }; + + match SplitCompoundWords::from_dictionary(str_list){ + Ok(f) => Ok(SystemFilter::Decompounder(f)), + Err(_e) => Err(()) + } +} + +fn get_stemmer_filter(params: &json::Map)-> Result{ + let value = params.get("language"); + if value.is_none() || !value.unwrap().is_string(){ + return Err(()) + } + + match value.unwrap().as_str().unwrap().into_language(){ + Ok(language) => Ok(SystemFilter::Stemmer(Stemmer::new(language))), + Err(_e) => Err(()), + } +} + +trait LanguageParser { + type Error; + fn into_language(self) -> Result; +} + +impl LanguageParser for &str { + type Error = (); + fn into_language(self) -> Result { + match self { + "arabig" => Ok(Language::Arabic), + "danish" => Ok(Language::Danish), + "dutch" => Ok(Language::Dutch), + "english" => Ok(Language::English), + "finnish" => Ok(Language::Finnish), + "french" => Ok(Language::French), + "german" => Ok(Language::German), + "greek" => Ok(Language::Greek), + "hungarian" => Ok(Language::Hungarian), + "italian" => Ok(Language::Italian), + "norwegian" => Ok(Language::Norwegian), + "portuguese" => Ok(Language::Portuguese), + "romanian" => Ok(Language::Romanian), + "russian" => Ok(Language::Russian), + "spanish" => Ok(Language::Spanish), + "swedish" => Ok(Language::Swedish), + "tamil" => Ok(Language::Tamil), + "turkish" => Ok(Language::Turkish), + _ => Err(()), + } + } +} + +impl From<&str> for SystemFilter{ + fn from(value: &str) -> Self { + match value{ + "lowercase" => Self::LowerCase(LowerCaser), + "asciifolding" => Self::AsciiFolding(AsciiFoldingFilter), + "alphanumonly" => Self::AlphaNumOnly(AlphaNumOnlyFilter), + _ => Self::Invalid, + } + } +} + +impl TryFrom<&json::Map> for SystemFilter { + type Error = (); + + fn try_from(params: &json::Map) -> Result { + match params.get(&"type".to_string()){ + Some(value) =>{ + if !value.is_string(){ + return Err(()); + }; + + match value.as_str().unwrap(){ + "length" => get_length_filter(params), + "stop" => get_stop_filter(params), + "decompounder" => get_decompounder_filter(params), + "stemmer" => get_stemmer_filter(params), + _other=>Err(()), + } + } + None => Err(()), + } + } +} diff --git a/internal/core/thirdparty/tantivy/tokenizer.h b/internal/core/thirdparty/tantivy/tokenizer.h index dd753205aa196..6f42eecbfcbe2 100644 --- a/internal/core/thirdparty/tantivy/tokenizer.h +++ b/internal/core/thirdparty/tantivy/tokenizer.h @@ -20,6 +20,9 @@ struct Tokenizer { } } + explicit Tokenizer(void* _ptr) : ptr_(_ptr) { + } + ~Tokenizer() { if (ptr_ != nullptr) { tantivy_free_tokenizer(ptr_); @@ -34,6 +37,12 @@ struct Tokenizer { return std::make_unique(token_stream, shared_text); } + std::unique_ptr + Clone() { + auto newptr = tantivy_clone_tokenizer(ptr_); + return std::make_unique(newptr); + } + // CreateTokenStreamCopyText will copy the text and then create token stream based on the text. 
std::unique_ptr CreateTokenStreamCopyText(const std::string& text) { diff --git a/internal/core/unittest/test_text_match.cpp b/internal/core/unittest/test_text_match.cpp index 55b85cad1d118..247a23c9bcbdc 100644 --- a/internal/core/unittest/test_text_match.cpp +++ b/internal/core/unittest/test_text_match.cpp @@ -253,7 +253,7 @@ TEST(TextMatch, GrowingJieBa) { auto schema = GenTestSchema({ {"enable_match", "true"}, {"enable_tokenizer", "true"}, - {"tokenizer_params", R"({"tokenizer": "jieba"})"}, + {"tokenizer_params", R"({"analyzer":{"tokenizer": "jieba"}})"}, }); auto seg = CreateGrowingSegment(schema, empty_index_meta); std::vector raw_str = {"青铜时代", "黄金时代"}; @@ -330,7 +330,7 @@ TEST(TextMatch, SealedJieBa) { auto schema = GenTestSchema({ {"enable_match", "true"}, {"enable_tokenizer", "true"}, - {"tokenizer_params", R"({"tokenizer": "jieba"})"}, + {"tokenizer_params", R"({"analyzer":{"tokenizer": "jieba"}})"}, }); auto seg = CreateSealedSegment(schema, empty_index_meta); std::vector raw_str = {"青铜时代", "黄金时代"}; diff --git a/internal/util/ctokenizer/c_tokenizer.go b/internal/util/ctokenizer/c_tokenizer.go index 915aa4cfa1938..e9f44aeb23a79 100644 --- a/internal/util/ctokenizer/c_tokenizer.go +++ b/internal/util/ctokenizer/c_tokenizer.go @@ -33,6 +33,15 @@ func (impl *CTokenizer) NewTokenStream(text string) tokenizerapi.TokenStream { return NewCTokenStream(ptr) } +func (impl *CTokenizer) Clone() (tokenizerapi.Tokenizer, error) { + var newptr C.CTokenizer + status := C.clone_tokenizer(&impl.ptr, &newptr) + if err := HandleCStatus(&status, "failed to clone tokenizer"); err != nil { + return nil, err + } + return NewCTokenizer(newptr), nil +} + func (impl *CTokenizer) Destroy() { C.free_tokenizer(impl.ptr) } diff --git a/internal/util/ctokenizer/text_schema_validator_test.go b/internal/util/ctokenizer/text_schema_validator_test.go index 56e3ba668c5cb..dd5bc78ce9800 100644 --- a/internal/util/ctokenizer/text_schema_validator_test.go +++ b/internal/util/ctokenizer/text_schema_validator_test.go @@ -33,7 +33,7 @@ func TestValidateTextSchema(t *testing.T) { DataType: schemapb.DataType_VarChar, TypeParams: []*commonpb.KeyValuePair{ {Key: "enable_match", Value: "true"}, - {Key: "tokenizer_params", Value: `{"tokenizer": "default"}`}, + {Key: "tokenizer_params", Value: `{"analyzer":{"tokenizer": "standard"}}`}, }, }, { @@ -41,7 +41,7 @@ func TestValidateTextSchema(t *testing.T) { DataType: schemapb.DataType_VarChar, TypeParams: []*commonpb.KeyValuePair{ {Key: "enable_match", Value: "true"}, - {Key: "tokenizer_params", Value: `{"tokenizer": "jieba"}`}, + {Key: "tokenizer_params", Value: `{"analyzer":{"tokenizer": "standard"}}`}, }, }, } diff --git a/internal/util/function/bm25_function.go b/internal/util/function/bm25_function.go index 275be8e412f29..225a3fa30893f 100644 --- a/internal/util/function/bm25_function.go +++ b/internal/util/function/bm25_function.go @@ -19,6 +19,7 @@ package function import ( + "encoding/json" "fmt" "sync" @@ -40,6 +41,28 @@ type BM25FunctionRunner struct { concurrency int } +// TODO Use json string instead map[string]string as tokenizer params +func getTokenizerParams(field *schemapb.FieldSchema) (map[string]string, error) { + result := map[string]string{} + for _, param := range field.GetTypeParams() { + if param.Key == "tokenizer_params" { + params := map[string]interface{}{} + err := json.Unmarshal([]byte(param.GetValue()), ¶ms) + if err != nil { + return nil, err + } + for key, param := range params { + bytes, err := json.Marshal(param) + if err != nil { + return nil, 
err + } + result[key] = string(bytes) + } + } + } + return result, nil +} + func NewBM25FunctionRunner(coll *schemapb.CollectionSchema, schema *schemapb.FunctionSchema) (*BM25FunctionRunner, error) { if len(schema.GetOutputFieldIds()) != 1 { return nil, fmt.Errorf("bm25 function should only have one output field, but now %d", len(schema.GetOutputFieldIds())) @@ -49,17 +72,26 @@ func NewBM25FunctionRunner(coll *schemapb.CollectionSchema, schema *schemapb.Fun schema: schema, concurrency: 8, } + var params map[string]string for _, field := range coll.GetFields() { if field.GetFieldID() == schema.GetOutputFieldIds()[0] { runner.outputField = field break } + + if field.GetFieldID() == schema.GetInputFieldIds()[0] { + var err error + params, err = getTokenizerParams(field) + if err != nil { + return nil, err + } + } } if runner.outputField == nil { return nil, fmt.Errorf("no output field") } - tokenizer, err := ctokenizer.NewTokenizer(map[string]string{}) + tokenizer, err := ctokenizer.NewTokenizer(params) if err != nil { return nil, err } @@ -69,8 +101,7 @@ func NewBM25FunctionRunner(coll *schemapb.CollectionSchema, schema *schemapb.Fun } func (v *BM25FunctionRunner) run(data []string, dst []map[uint32]float32) error { - // TODO AOIASD Support single Tokenizer concurrency - tokenizer, err := ctokenizer.NewTokenizer(map[string]string{}) + tokenizer, err := v.tokenizer.Clone() if err != nil { return err } diff --git a/internal/util/tokenizerapi/tokenizer.go b/internal/util/tokenizerapi/tokenizer.go index 2b6debbec71f6..6dab31257122c 100644 --- a/internal/util/tokenizerapi/tokenizer.go +++ b/internal/util/tokenizerapi/tokenizer.go @@ -3,5 +3,6 @@ package tokenizerapi //go:generate mockery --name=Tokenizer --with-expecter type Tokenizer interface { NewTokenStream(text string) TokenStream + Clone() (Tokenizer, error) Destroy() } diff --git a/tests/python_client/testcases/test_full_text_search.py b/tests/python_client/testcases/test_full_text_search.py index c54933c7e7373..76df79b8edd68 100644 --- a/tests/python_client/testcases/test_full_text_search.py +++ b/tests/python_client/testcases/test_full_text_search.py @@ -2317,7 +2317,6 @@ def test_full_text_search_default( @pytest.mark.parametrize("expr", ["text_match"]) @pytest.mark.parametrize("offset", [10]) @pytest.mark.parametrize("tokenizer", ["jieba"]) - @pytest.mark.xfail(reason="issue: https://github.com/milvus-io/milvus/issues/36751") def test_full_text_search_with_jieba_tokenizer( self, offset, tokenizer, expr, enable_inverted_index, enable_partition_key, empty_percent, index_type, nq ): @@ -2329,7 +2328,9 @@ def test_full_text_search_with_jieba_tokenizer( expected: full text search successfully and result is correct """ tokenizer_params = { - "tokenizer": tokenizer, + "analyzer":{ + "tokenizer": tokenizer, + } } dim = 128 fields = [ From f14cfe5c14fc1b288fc069ea74f581bd9e24a39c Mon Sep 17 00:00:00 2001 From: aoiasd Date: Wed, 23 Oct 2024 15:44:19 +0800 Subject: [PATCH 2/4] create tokenizer by string Signed-off-by: aoiasd --- internal/core/src/common/FieldMeta.cpp | 14 +--- internal/core/src/common/FieldMeta.h | 2 +- internal/core/src/index/TextMatchIndex.cpp | 8 +- internal/core/src/index/TextMatchIndex.h | 8 +- internal/core/src/segcore/tokenizer_c.cpp | 5 +- internal/core/src/segcore/tokenizer_c.h | 2 +- .../tantivy-binding/include/tantivy-binding.h | 8 +- .../tantivy-binding/src/index_reader_text.rs | 4 +- .../src/index_reader_text_c.rs | 15 ++-- .../src/index_writer_text_c.rs | 7 +- .../tantivy/tantivy-binding/src/tokenizer.rs | 76 
+++++++++---------- .../tantivy-binding/src/tokenizer_c.rs | 11 ++- .../core/thirdparty/tantivy/tantivy-wrapper.h | 14 ++-- internal/core/thirdparty/tantivy/tokenizer.h | 7 +- internal/core/unittest/test_text_match.cpp | 9 +-- .../util/ctokenizer/c_tokenizer_factory.go | 11 +-- internal/util/function/bm25_function.go | 28 ++----- 17 files changed, 98 insertions(+), 131 deletions(-) diff --git a/internal/core/src/common/FieldMeta.cpp b/internal/core/src/common/FieldMeta.cpp index a90f768781fe2..2e9cd8c5052b2 100644 --- a/internal/core/src/common/FieldMeta.cpp +++ b/internal/core/src/common/FieldMeta.cpp @@ -16,25 +16,15 @@ #include #include "Consts.h" -#include "log/Log.h" namespace milvus { TokenizerParams ParseTokenizerParams(const TypeParams& params) { auto iter = params.find("tokenizer_params"); if (iter == params.end()) { - return {}; + return "{}"; } - nlohmann::json j = nlohmann::json::parse(iter->second); - std::map ret; - for (const auto& [k, v] : j.items()) { - try { - ret[k] = v.get(); - } catch (std::exception& e) { - ret[k] = v.dump(); - } - } - return ret; + return iter ->second.c_str(); } bool diff --git a/internal/core/src/common/FieldMeta.h b/internal/core/src/common/FieldMeta.h index ed040902a54d6..0581cf9167551 100644 --- a/internal/core/src/common/FieldMeta.h +++ b/internal/core/src/common/FieldMeta.h @@ -25,7 +25,7 @@ namespace milvus { using TypeParams = std::map; -using TokenizerParams = std::map; +using TokenizerParams = const char*; TokenizerParams ParseTokenizerParams(const TypeParams& params); diff --git a/internal/core/src/index/TextMatchIndex.cpp b/internal/core/src/index/TextMatchIndex.cpp index f21e5b319e006..216b93007f4c9 100644 --- a/internal/core/src/index/TextMatchIndex.cpp +++ b/internal/core/src/index/TextMatchIndex.cpp @@ -22,7 +22,7 @@ constexpr const char* TMP_TEXT_LOG_PREFIX = "/tmp/milvus/text-log/"; TextMatchIndex::TextMatchIndex( int64_t commit_interval_in_ms, const char* tokenizer_name, - const std::map& tokenizer_params) + const char* tokenizer_params) : commit_interval_in_ms_(commit_interval_in_ms), last_commit_time_(stdclock::now()) { d_type_ = TantivyDataType::Text; @@ -34,7 +34,7 @@ TextMatchIndex::TextMatchIndex( TextMatchIndex::TextMatchIndex( const std::string& path, const char* tokenizer_name, - const std::map& tokenizer_params) + const char* tokenizer_params) : commit_interval_in_ms_(std::numeric_limits::max()), last_commit_time_(stdclock::now()) { path_ = path; @@ -50,7 +50,7 @@ TextMatchIndex::TextMatchIndex( TextMatchIndex::TextMatchIndex( const storage::FileManagerContext& ctx, const char* tokenizer_name, - const std::map& tokenizer_params) + const char* tokenizer_params) : commit_interval_in_ms_(std::numeric_limits::max()), last_commit_time_(stdclock::now()) { schema_ = ctx.fieldDataMeta.field_schema; @@ -176,7 +176,7 @@ TextMatchIndex::CreateReader() { void TextMatchIndex::RegisterTokenizer( const char* tokenizer_name, - const std::map& tokenizer_params) { + const char* tokenizer_params) { wrapper_->register_tokenizer(tokenizer_name, tokenizer_params); } diff --git a/internal/core/src/index/TextMatchIndex.h b/internal/core/src/index/TextMatchIndex.h index 570668a0304e0..2a52d2cf0e9d5 100644 --- a/internal/core/src/index/TextMatchIndex.h +++ b/internal/core/src/index/TextMatchIndex.h @@ -25,17 +25,17 @@ class TextMatchIndex : public InvertedIndexTantivy { explicit TextMatchIndex( int64_t commit_interval_in_ms, const char* tokenizer_name, - const std::map& tokenizer_params); + const char* tokenizer_params); // for sealed segment. 
explicit TextMatchIndex( const std::string& path, const char* tokenizer_name, - const std::map& tokenizer_params); + const char* tokenizer_params); // for building index. explicit TextMatchIndex( const storage::FileManagerContext& ctx, const char* tokenizer_name, - const std::map& tokenizer_params); + const char* tokenizer_params); // for loading index explicit TextMatchIndex(const storage::FileManagerContext& ctx); @@ -69,7 +69,7 @@ class TextMatchIndex : public InvertedIndexTantivy { void RegisterTokenizer( const char* tokenizer_name, - const std::map& tokenizer_params); + const char* tokenizer_params); TargetBitmap MatchQuery(const std::string& query); diff --git a/internal/core/src/segcore/tokenizer_c.cpp b/internal/core/src/segcore/tokenizer_c.cpp index a33a6bd9bfd85..781192f269c63 100644 --- a/internal/core/src/segcore/tokenizer_c.cpp +++ b/internal/core/src/segcore/tokenizer_c.cpp @@ -20,10 +20,9 @@ using Map = std::map; CStatus -create_tokenizer(CMap m, CTokenizer* tokenizer) { +create_tokenizer(const char* params, CTokenizer* tokenizer) { try { - auto mm = reinterpret_cast(m); - auto impl = std::make_unique(*mm); + auto impl = std::make_unique(params); *tokenizer = impl.release(); return milvus::SuccessCStatus(); } catch (std::exception& e) { diff --git a/internal/core/src/segcore/tokenizer_c.h b/internal/core/src/segcore/tokenizer_c.h index 3f84da729efaa..422449c0cab50 100644 --- a/internal/core/src/segcore/tokenizer_c.h +++ b/internal/core/src/segcore/tokenizer_c.h @@ -24,7 +24,7 @@ extern "C" { typedef void* CTokenizer; CStatus -create_tokenizer(CMap m, CTokenizer* tokenizer); +create_tokenizer(const char* params, CTokenizer* tokenizer); CStatus clone_tokenizer(CTokenizer* tokenizer, CTokenizer* rst); diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/include/tantivy-binding.h b/internal/core/thirdparty/tantivy/tantivy-binding/include/tantivy-binding.h index 391cece60bccd..5c494f215ece9 100644 --- a/internal/core/thirdparty/tantivy/tantivy-binding/include/tantivy-binding.h +++ b/internal/core/thirdparty/tantivy/tantivy-binding/include/tantivy-binding.h @@ -88,7 +88,9 @@ RustArray tantivy_regex_query(void *ptr, const char *pattern); RustArray tantivy_match_query(void *ptr, const char *query); -void tantivy_register_tokenizer(void *ptr, const char *tokenizer_name, void *tokenizer_params); +void tantivy_register_tokenizer(void *ptr, + const char *tokenizer_name, + const char *tokenizer_params); void *tantivy_create_index(const char *field_name, TantivyDataType data_type, @@ -142,7 +144,7 @@ void tantivy_index_add_multi_keywords(void *ptr, void *tantivy_create_text_writer(const char *field_name, const char *path, const char *tokenizer_name, - void *tokenizer_params, + const char *tokenizer_params, uintptr_t num_threads, uintptr_t overall_memory_budget_in_bytes, bool in_ram); @@ -157,7 +159,7 @@ bool tantivy_token_stream_advance(void *token_stream); const char *tantivy_token_stream_get_token(void *token_stream); -void *tantivy_create_tokenizer(void *tokenizer_params); +void *tantivy_create_tokenizer(const char *tokenizer_params); void *tantivy_clone_tokenizer(void *ptr); diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/index_reader_text.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/index_reader_text.rs index 654346fc868c4..b80039c00c9dd 100644 --- a/internal/core/thirdparty/tantivy/tantivy-binding/src/index_reader_text.rs +++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/index_reader_text.rs @@ -4,7 +4,7 @@ use tantivy::{ Term, }; 
-use crate::{index_reader::IndexReaderWrapper, tokenizer::default_tokenizer}; +use crate::{index_reader::IndexReaderWrapper, tokenizer::standard_tokenizer}; impl IndexReaderWrapper { // split the query string into multiple tokens using index's default tokenizer, @@ -14,7 +14,7 @@ impl IndexReaderWrapper { let mut tokenizer = self .index .tokenizer_for_field(self.field) - .unwrap_or(default_tokenizer()) + .unwrap_or(standard_tokenizer()) .clone(); let mut token_stream = tokenizer.token_stream(q); let mut terms: Vec = Vec::new(); diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/index_reader_text_c.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/index_reader_text_c.rs index eb0653c90357b..cc461c87deabd 100644 --- a/internal/core/thirdparty/tantivy/tantivy-binding/src/index_reader_text_c.rs +++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/index_reader_text_c.rs @@ -1,8 +1,13 @@ -use std::{collections::HashMap, ffi::CStr}; +use std::{ffi::CStr}; use libc::{c_char, c_void}; -use crate::{array::RustArray, index_reader::IndexReaderWrapper, tokenizer::create_tokenizer}; +use crate::{ + array::RustArray, + string_c::c_str_to_str, + index_reader::IndexReaderWrapper, + tokenizer::create_tokenizer, +}; #[no_mangle] pub extern "C" fn tantivy_match_query(ptr: *mut c_void, query: *const c_char) -> RustArray { @@ -18,13 +23,13 @@ pub extern "C" fn tantivy_match_query(ptr: *mut c_void, query: *const c_char) -> pub extern "C" fn tantivy_register_tokenizer( ptr: *mut c_void, tokenizer_name: *const c_char, - tokenizer_params: *mut c_void, + tokenizer_params: *const c_char, ) { let real = ptr as *mut IndexReaderWrapper; let tokenizer_name_str = unsafe { CStr::from_ptr(tokenizer_name) }; let analyzer = unsafe { - let m = tokenizer_params as *const HashMap; - create_tokenizer(&(*m)) + let params = c_str_to_str(tokenizer_params).to_string(); + create_tokenizer(¶ms) }; match analyzer { Some(text_analyzer) => unsafe { diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_text_c.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_text_c.rs index 1ca70ac232c9b..83c8dc34ad954 100644 --- a/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_text_c.rs +++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_text_c.rs @@ -6,13 +6,14 @@ use std::ffi::CStr; use crate::index_writer::IndexWriterWrapper; use crate::tokenizer::create_tokenizer; use crate::util::create_binding; +use crate::string_c::c_str_to_str; #[no_mangle] pub extern "C" fn tantivy_create_text_writer( field_name: *const c_char, path: *const c_char, tokenizer_name: *const c_char, - tokenizer_params: *mut c_void, + tokenizer_params: *const c_char, num_threads: usize, overall_memory_budget_in_bytes: usize, in_ram: bool, @@ -21,8 +22,8 @@ pub extern "C" fn tantivy_create_text_writer( let path_str = unsafe { CStr::from_ptr(path).to_str().unwrap() }; let tokenizer_name_str = unsafe { CStr::from_ptr(tokenizer_name).to_str().unwrap() }; let analyzer = unsafe { - let m = tokenizer_params as *const HashMap; - create_tokenizer(&(*m)) + let params = c_str_to_str(tokenizer_params).to_string(); + create_tokenizer(¶ms) }; match analyzer { Some(text_analyzer) => { diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/tokenizer.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/tokenizer.rs index 128ebe266f2cb..eae72e613ff4f 100644 --- a/internal/core/thirdparty/tantivy/tantivy-binding/src/tokenizer.rs +++ 
b/internal/core/thirdparty/tantivy/tantivy-binding/src/tokenizer.rs @@ -7,12 +7,8 @@ use serde_json as json; use crate::tokenizer_filter::*; use crate::log::init_log; -lazy_static! { - static ref DEFAULT_TOKENIZER_MANAGER: TokenizerManager = TokenizerManager::default(); -} - -pub(crate) fn default_tokenizer() -> TextAnalyzer { - DEFAULT_TOKENIZER_MANAGER.get("default").unwrap() +pub(crate) fn standard_tokenizer() -> TextAnalyzer { + TextAnalyzer::builder(SimpleTokenizer::default()).build() } struct TantivyBuilder<'a>{ @@ -89,6 +85,7 @@ impl TantivyBuilder<'_>{ } Some(builder.build()) } + // TODO support jieba filter and use same builder with standard. "jieba" => { Some(tantivy_jieba::JiebaTokenizer {}.into()) } @@ -100,58 +97,57 @@ impl TantivyBuilder<'_>{ } } -pub(crate) fn create_tokenizer(params: &HashMap) -> Option { +pub(crate) fn create_tokenizer(params: &String) -> Option { init_log(); - let analyzer_json_value = match params.get("analyzer"){ - Some(value) => { - let json_analyzer = json::from_str::(value); - if json_analyzer.is_err() { + match json::from_str::(¶ms){ + Ok(value) =>{ + if value.is_null(){ + return Some(standard_tokenizer()); + } + if !value.is_object(){ return None; } - let json_value = json_analyzer.unwrap(); - if !json_value.is_object(){ - return None + let json_params = value.as_object().unwrap(); + // create builder + let analyzer_params=json_params.get("analyzer"); + if analyzer_params.is_none(){ + return Some(standard_tokenizer()); } - json_value - } - None => json::Value::Object(json::Map::::new()), - }; - - let analyzer_params= analyzer_json_value.as_object().unwrap(); - let mut builder = TantivyBuilder::new(analyzer_params); - let str_filter=params.get("filter"); - if !str_filter.is_none(){ - let json_filter = json::from_str::(str_filter.unwrap()); - if json_filter.is_err(){ - return None - } + if !analyzer_params.unwrap().is_object(){ + return None; + } + let mut builder = TantivyBuilder::new(analyzer_params.unwrap().as_object().unwrap()); - let filter_params = json_filter.unwrap(); - if !filter_params.is_object(){ - return None - } + // build custom filter + let filter_params=json_params.get("filter"); + if !filter_params.is_none() && filter_params.unwrap().is_object(){ + builder.add_costom_filters(filter_params.unwrap().as_object().unwrap()); + } - builder.add_costom_filters(filter_params.as_object().unwrap()); + // build analyzer + builder.build() + }, + Err(_e) => None, } - builder.build() } #[cfg(test)] mod tests { - use std::collections::HashMap; use crate::tokenizer::create_tokenizer; #[test] fn test_create_tokenizer() { - let mut params : HashMap = HashMap::new(); - let analyzer_params = r#" + let params = r#" { - "tokenizer": "jieba" + "analyzer": + { + "tokenizer": "standard", + "filter": [""], + }, }"#; - params.insert("analyzer".to_string(), analyzer_params.to_string()); - let tokenizer = create_tokenizer(¶ms); + let tokenizer = create_tokenizer(¶ms.to_string()); assert!(tokenizer.is_some()); } -} +} \ No newline at end of file diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/tokenizer_c.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/tokenizer_c.rs index ef572fcc4f2a6..dedca3064e138 100644 --- a/internal/core/thirdparty/tantivy/tantivy-binding/src/tokenizer_c.rs +++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/tokenizer_c.rs @@ -1,18 +1,17 @@ -use std::collections::HashMap; - -use libc::c_void; +use libc::{c_void,c_char}; use tantivy::tokenizer::TextAnalyzer; use crate::{ + string_c::c_str_to_str, 
tokenizer::create_tokenizer, util::{create_binding, free_binding}, }; #[no_mangle] -pub extern "C" fn tantivy_create_tokenizer(tokenizer_params: *mut c_void) -> *mut c_void { +pub extern "C" fn tantivy_create_tokenizer(tokenizer_params: *const c_char) -> *mut c_void { let analyzer = unsafe { - let m = tokenizer_params as *const HashMap; - create_tokenizer(&(*m)) + let params = c_str_to_str(tokenizer_params).to_string(); + create_tokenizer(¶ms) }; match analyzer { Some(text_analyzer) => create_binding(text_analyzer), diff --git a/internal/core/thirdparty/tantivy/tantivy-wrapper.h b/internal/core/thirdparty/tantivy/tantivy-wrapper.h index 17822d1bbdfb3..2e576f5fe9162 100644 --- a/internal/core/thirdparty/tantivy/tantivy-wrapper.h +++ b/internal/core/thirdparty/tantivy/tantivy-wrapper.h @@ -14,7 +14,7 @@ namespace milvus::tantivy { using Map = std::map; static constexpr const char* DEFAULT_TOKENIZER_NAME = "milvus_tokenizer"; -static Map DEFAULT_TOKENIZER_PARAMS = {}; +static const char* DEFAULT_TOKENIZER_PARAMS = "{}"; static constexpr uintptr_t DEFAULT_NUM_THREADS = 4; static constexpr uintptr_t DEFAULT_OVERALL_MEMORY_BUDGET_IN_BYTES = DEFAULT_NUM_THREADS * 15 * 1024 * 1024; @@ -101,17 +101,15 @@ struct TantivyIndexWrapper { bool in_ram, const char* path, const char* tokenizer_name = DEFAULT_TOKENIZER_NAME, - const std::map& + const char* tokenizer_params = DEFAULT_TOKENIZER_PARAMS, uintptr_t num_threads = DEFAULT_NUM_THREADS, uintptr_t overall_memory_budget_in_bytes = DEFAULT_OVERALL_MEMORY_BUDGET_IN_BYTES) { - RustHashMap m; - m.from(tokenizer_params); writer_ = tantivy_create_text_writer(field_name, path, tokenizer_name, - m.get_pointer(), + tokenizer_params, num_threads, overall_memory_budget_in_bytes, in_ram); @@ -136,12 +134,10 @@ struct TantivyIndexWrapper { void register_tokenizer( const char* tokenizer_name, - const std::map& tokenizer_params) { - RustHashMap m; - m.from(tokenizer_params); + const char* tokenizer_params) { if (reader_ != nullptr) { tantivy_register_tokenizer( - reader_, tokenizer_name, m.get_pointer()); + reader_, tokenizer_name, tokenizer_params); } } diff --git a/internal/core/thirdparty/tantivy/tokenizer.h b/internal/core/thirdparty/tantivy/tokenizer.h index 6f42eecbfcbe2..eeeec4db6de3e 100644 --- a/internal/core/thirdparty/tantivy/tokenizer.h +++ b/internal/core/thirdparty/tantivy/tokenizer.h @@ -11,10 +11,9 @@ struct Tokenizer { public: NO_COPY_OR_ASSIGN(Tokenizer); - explicit Tokenizer(const std::map& params) { - RustHashMap m; - m.from(params); - ptr_ = tantivy_create_tokenizer(m.get_pointer()); + explicit Tokenizer(std::string&& params) { + auto shared_params = std::make_shared(std::move(params)); + ptr_ = tantivy_create_tokenizer(shared_params->c_str()); if (ptr_ == nullptr) { throw std::invalid_argument("invalid tokenizer parameters"); } diff --git a/internal/core/unittest/test_text_match.cpp b/internal/core/unittest/test_text_match.cpp index 247a23c9bcbdc..359c35ce367de 100644 --- a/internal/core/unittest/test_text_match.cpp +++ b/internal/core/unittest/test_text_match.cpp @@ -80,23 +80,20 @@ TEST(ParseJson, Naive) { TEST(ParseTokenizerParams, NoTokenizerParams) { TypeParams params{{"k", "v"}}; auto p = ParseTokenizerParams(params); - ASSERT_EQ(0, p.size()); + ASSERT_EQ("{}", p); } TEST(ParseTokenizerParams, Default) { TypeParams params{{"tokenizer_params", R"({"tokenizer": "default"})"}}; auto p = ParseTokenizerParams(params); - ASSERT_EQ(1, p.size()); - auto iter = p.find("tokenizer"); - ASSERT_NE(p.end(), iter); - ASSERT_EQ("default", iter->second); + 
ASSERT_EQ(params.at("tokenizer_params"), p); } TEST(TextMatch, Index) { using Index = index::TextMatchIndex; auto index = std::make_unique(std::numeric_limits::max(), "milvus_tokenizer", - std::map{}); + "{}"); index->CreateReader(); index->AddText("football, basketball, pingpang", 0); index->AddText("swimming, football", 1); diff --git a/internal/util/ctokenizer/c_tokenizer_factory.go b/internal/util/ctokenizer/c_tokenizer_factory.go index c5690d8861600..1bd3177917741 100644 --- a/internal/util/ctokenizer/c_tokenizer_factory.go +++ b/internal/util/ctokenizer/c_tokenizer_factory.go @@ -9,16 +9,17 @@ package ctokenizer import "C" import ( + "unsafe" + "github.com/milvus-io/milvus/internal/util/tokenizerapi" ) -func NewTokenizer(m map[string]string) (tokenizerapi.Tokenizer, error) { - mm := NewCMap() - defer mm.Destroy() - mm.From(m) +func NewTokenizer(param string) (tokenizerapi.Tokenizer, error) { + paramPtr := C.CString(param) + defer C.free(unsafe.Pointer(paramPtr)) var ptr C.CTokenizer - status := C.create_tokenizer(mm.GetPointer(), &ptr) + status := C.create_tokenizer(paramPtr, &ptr) if err := HandleCStatus(&status, "failed to create tokenizer"); err != nil { return nil, err } diff --git a/internal/util/function/bm25_function.go b/internal/util/function/bm25_function.go index 225a3fa30893f..ff8e046b1f504 100644 --- a/internal/util/function/bm25_function.go +++ b/internal/util/function/bm25_function.go @@ -19,7 +19,6 @@ package function import ( - "encoding/json" "fmt" "sync" @@ -41,26 +40,13 @@ type BM25FunctionRunner struct { concurrency int } -// TODO Use json string instead map[string]string as tokenizer params -func getTokenizerParams(field *schemapb.FieldSchema) (map[string]string, error) { - result := map[string]string{} +func getTokenizerParams(field *schemapb.FieldSchema) string { for _, param := range field.GetTypeParams() { if param.Key == "tokenizer_params" { - params := map[string]interface{}{} - err := json.Unmarshal([]byte(param.GetValue()), ¶ms) - if err != nil { - return nil, err - } - for key, param := range params { - bytes, err := json.Marshal(param) - if err != nil { - return nil, err - } - result[key] = string(bytes) - } + return param.Value } } - return result, nil + return "{}" } func NewBM25FunctionRunner(coll *schemapb.CollectionSchema, schema *schemapb.FunctionSchema) (*BM25FunctionRunner, error) { @@ -72,7 +58,7 @@ func NewBM25FunctionRunner(coll *schemapb.CollectionSchema, schema *schemapb.Fun schema: schema, concurrency: 8, } - var params map[string]string + var params string for _, field := range coll.GetFields() { if field.GetFieldID() == schema.GetOutputFieldIds()[0] { runner.outputField = field @@ -80,11 +66,7 @@ func NewBM25FunctionRunner(coll *schemapb.CollectionSchema, schema *schemapb.Fun } if field.GetFieldID() == schema.GetInputFieldIds()[0] { - var err error - params, err = getTokenizerParams(field) - if err != nil { - return nil, err - } + params = getTokenizerParams(field) } } From 76ceac891bf2324c8a23a2c46102a566ba300c94 Mon Sep 17 00:00:00 2001 From: aoiasd Date: Wed, 23 Oct 2024 19:39:47 +0800 Subject: [PATCH 3/4] add error struct for tantivy binding Signed-off-by: aoiasd --- internal/core/src/common/FieldMeta.cpp | 2 +- internal/core/src/common/FieldMeta.h | 2 +- internal/core/src/index/TextMatchIndex.cpp | 26 +-- internal/core/src/index/TextMatchIndex.h | 25 +- internal/core/src/indexbuilder/index_c.cpp | 2 +- .../src/segcore/ChunkedSegmentSealedImpl.cpp | 8 +- .../core/src/segcore/SegmentGrowingImpl.cpp | 4 +- 
.../core/src/segcore/SegmentSealedImpl.cpp | 8 +- .../tantivy/tantivy-binding/src/error.rs | 40 ++++ .../tantivy-binding/src/index_reader_text.rs | 4 +- .../src/index_reader_text_c.rs | 10 +- .../src/index_writer_text_c.rs | 10 +- .../tantivy/tantivy-binding/src/lib.rs | 1 + .../tantivy/tantivy-binding/src/tokenizer.rs | 213 +++++++++++------- .../tantivy-binding/src/tokenizer_c.rs | 9 +- .../tantivy-binding/src/tokenizer_filter.rs | 40 ++-- .../core/thirdparty/tantivy/tantivy-wrapper.h | 8 +- internal/core/unittest/test_c_tokenizer.cpp | 7 +- internal/core/unittest/test_text_match.cpp | 12 +- internal/proxy/task_query.go | 1 - internal/proxy/task_search.go | 1 - internal/util/ctokenizer/c_tokenizer_test.go | 5 +- tests/python_client/common/common_func.py | 2 +- .../testcases/test_full_text_search.py | 92 ++++---- tests/python_client/testcases/test_query.py | 47 ++-- tests/python_client/testcases/test_search.py | 4 +- .../testcases/test_vector_operations.py | 8 +- 27 files changed, 340 insertions(+), 251 deletions(-) create mode 100644 internal/core/thirdparty/tantivy/tantivy-binding/src/error.rs diff --git a/internal/core/src/common/FieldMeta.cpp b/internal/core/src/common/FieldMeta.cpp index 2e9cd8c5052b2..1506ddd507a92 100644 --- a/internal/core/src/common/FieldMeta.cpp +++ b/internal/core/src/common/FieldMeta.cpp @@ -24,7 +24,7 @@ ParseTokenizerParams(const TypeParams& params) { if (iter == params.end()) { return "{}"; } - return iter ->second.c_str(); + return iter->second; } bool diff --git a/internal/core/src/common/FieldMeta.h b/internal/core/src/common/FieldMeta.h index 0581cf9167551..0d920452bf10d 100644 --- a/internal/core/src/common/FieldMeta.h +++ b/internal/core/src/common/FieldMeta.h @@ -25,7 +25,7 @@ namespace milvus { using TypeParams = std::map; -using TokenizerParams = const char*; +using TokenizerParams = std::string; TokenizerParams ParseTokenizerParams(const TypeParams& params); diff --git a/internal/core/src/index/TextMatchIndex.cpp b/internal/core/src/index/TextMatchIndex.cpp index 216b93007f4c9..8b12e02280b82 100644 --- a/internal/core/src/index/TextMatchIndex.cpp +++ b/internal/core/src/index/TextMatchIndex.cpp @@ -19,10 +19,9 @@ namespace milvus::index { constexpr const char* TMP_TEXT_LOG_PREFIX = "/tmp/milvus/text-log/"; -TextMatchIndex::TextMatchIndex( - int64_t commit_interval_in_ms, - const char* tokenizer_name, - const char* tokenizer_params) +TextMatchIndex::TextMatchIndex(int64_t commit_interval_in_ms, + const char* tokenizer_name, + const char* tokenizer_params) : commit_interval_in_ms_(commit_interval_in_ms), last_commit_time_(stdclock::now()) { d_type_ = TantivyDataType::Text; @@ -31,10 +30,9 @@ TextMatchIndex::TextMatchIndex( field_name.c_str(), true, "", tokenizer_name, tokenizer_params); } -TextMatchIndex::TextMatchIndex( - const std::string& path, - const char* tokenizer_name, - const char* tokenizer_params) +TextMatchIndex::TextMatchIndex(const std::string& path, + const char* tokenizer_name, + const char* tokenizer_params) : commit_interval_in_ms_(std::numeric_limits::max()), last_commit_time_(stdclock::now()) { path_ = path; @@ -47,10 +45,9 @@ TextMatchIndex::TextMatchIndex( tokenizer_params); } -TextMatchIndex::TextMatchIndex( - const storage::FileManagerContext& ctx, - const char* tokenizer_name, - const char* tokenizer_params) +TextMatchIndex::TextMatchIndex(const storage::FileManagerContext& ctx, + const char* tokenizer_name, + const char* tokenizer_params) : commit_interval_in_ms_(std::numeric_limits::max()), 
last_commit_time_(stdclock::now()) { schema_ = ctx.fieldDataMeta.field_schema; @@ -174,9 +171,8 @@ TextMatchIndex::CreateReader() { } void -TextMatchIndex::RegisterTokenizer( - const char* tokenizer_name, - const char* tokenizer_params) { +TextMatchIndex::RegisterTokenizer(const char* tokenizer_name, + const char* tokenizer_params) { wrapper_->register_tokenizer(tokenizer_name, tokenizer_params); } diff --git a/internal/core/src/index/TextMatchIndex.h b/internal/core/src/index/TextMatchIndex.h index 2a52d2cf0e9d5..248bb63932712 100644 --- a/internal/core/src/index/TextMatchIndex.h +++ b/internal/core/src/index/TextMatchIndex.h @@ -22,20 +22,17 @@ using stdclock = std::chrono::high_resolution_clock; class TextMatchIndex : public InvertedIndexTantivy { public: // for growing segment. - explicit TextMatchIndex( - int64_t commit_interval_in_ms, - const char* tokenizer_name, - const char* tokenizer_params); + explicit TextMatchIndex(int64_t commit_interval_in_ms, + const char* tokenizer_name, + const char* tokenizer_params); // for sealed segment. - explicit TextMatchIndex( - const std::string& path, - const char* tokenizer_name, - const char* tokenizer_params); + explicit TextMatchIndex(const std::string& path, + const char* tokenizer_name, + const char* tokenizer_params); // for building index. - explicit TextMatchIndex( - const storage::FileManagerContext& ctx, - const char* tokenizer_name, - const char* tokenizer_params); + explicit TextMatchIndex(const storage::FileManagerContext& ctx, + const char* tokenizer_name, + const char* tokenizer_params); // for loading index explicit TextMatchIndex(const storage::FileManagerContext& ctx); @@ -67,9 +64,7 @@ class TextMatchIndex : public InvertedIndexTantivy { CreateReader(); void - RegisterTokenizer( - const char* tokenizer_name, - const char* tokenizer_params); + RegisterTokenizer(const char* tokenizer_name, const char* tokenizer_params); TargetBitmap MatchQuery(const std::string& query); diff --git a/internal/core/src/indexbuilder/index_c.cpp b/internal/core/src/indexbuilder/index_c.cpp index f4f4613c72259..d9029955bcaf3 100644 --- a/internal/core/src/indexbuilder/index_c.cpp +++ b/internal/core/src/indexbuilder/index_c.cpp @@ -284,7 +284,7 @@ BuildTextIndex(CBinarySet* c_binary_set, auto index = std::make_unique( fileManagerContext, "milvus_tokenizer", - field_schema.get_tokenizer_params()); + field_schema.get_tokenizer_params().c_str()); index->Build(config); auto binary = std::make_unique(index->Upload(config)); diff --git a/internal/core/src/segcore/ChunkedSegmentSealedImpl.cpp b/internal/core/src/segcore/ChunkedSegmentSealedImpl.cpp index a77579c1dbcc7..aefea1b28aa50 100644 --- a/internal/core/src/segcore/ChunkedSegmentSealedImpl.cpp +++ b/internal/core/src/segcore/ChunkedSegmentSealedImpl.cpp @@ -1613,13 +1613,13 @@ ChunkedSegmentSealedImpl::CreateTextIndex(FieldId field_id) { index = std::make_unique( std::numeric_limits::max(), "milvus_tokenizer", - field_meta.get_tokenizer_params()); + field_meta.get_tokenizer_params().c_str()); } else { // build text index using mmap. 
index = std::make_unique( cfg.GetMmapPath(), "milvus_tokenizer", - field_meta.get_tokenizer_params()); + field_meta.get_tokenizer_params().c_str()); } { @@ -1669,7 +1669,7 @@ ChunkedSegmentSealedImpl::CreateTextIndex(FieldId field_id) { index->Reload(); index->RegisterTokenizer("milvus_tokenizer", - field_meta.get_tokenizer_params()); + field_meta.get_tokenizer_params().c_str()); text_indexes_[field_id] = std::move(index); } @@ -1680,7 +1680,7 @@ ChunkedSegmentSealedImpl::LoadTextIndex( std::unique_lock lck(mutex_); const auto& field_meta = schema_->operator[](field_id); index->RegisterTokenizer("milvus_tokenizer", - field_meta.get_tokenizer_params()); + field_meta.get_tokenizer_params().c_str()); text_indexes_[field_id] = std::move(index); } diff --git a/internal/core/src/segcore/SegmentGrowingImpl.cpp b/internal/core/src/segcore/SegmentGrowingImpl.cpp index b90953c858066..bacfdab588774 100644 --- a/internal/core/src/segcore/SegmentGrowingImpl.cpp +++ b/internal/core/src/segcore/SegmentGrowingImpl.cpp @@ -859,11 +859,11 @@ SegmentGrowingImpl::CreateTextIndex(FieldId field_id) { "cannot create text index on non-string type"); // todo: make this(200) configurable. auto index = std::make_unique( - 200, "milvus_tokenizer", field_meta.get_tokenizer_params()); + 200, "milvus_tokenizer", field_meta.get_tokenizer_params().c_str()); index->Commit(); index->CreateReader(); index->RegisterTokenizer("milvus_tokenizer", - field_meta.get_tokenizer_params()); + field_meta.get_tokenizer_params().c_str()); text_indexes_[field_id] = std::move(index); } diff --git a/internal/core/src/segcore/SegmentSealedImpl.cpp b/internal/core/src/segcore/SegmentSealedImpl.cpp index 4371735fa57b5..b1dbfe46edf03 100644 --- a/internal/core/src/segcore/SegmentSealedImpl.cpp +++ b/internal/core/src/segcore/SegmentSealedImpl.cpp @@ -2043,13 +2043,13 @@ SegmentSealedImpl::CreateTextIndex(FieldId field_id) { index = std::make_unique( std::numeric_limits::max(), "milvus_tokenizer", - field_meta.get_tokenizer_params()); + field_meta.get_tokenizer_params().c_str()); } else { // build text index using mmap. 
index = std::make_unique( cfg.GetMmapPath(), "milvus_tokenizer", - field_meta.get_tokenizer_params()); + field_meta.get_tokenizer_params().c_str()); } { @@ -2098,7 +2098,7 @@ SegmentSealedImpl::CreateTextIndex(FieldId field_id) { index->Reload(); index->RegisterTokenizer("milvus_tokenizer", - field_meta.get_tokenizer_params()); + field_meta.get_tokenizer_params().c_str()); text_indexes_[field_id] = std::move(index); } @@ -2109,7 +2109,7 @@ SegmentSealedImpl::LoadTextIndex(FieldId field_id, std::unique_lock lck(mutex_); const auto& field_meta = schema_->operator[](field_id); index->RegisterTokenizer("milvus_tokenizer", - field_meta.get_tokenizer_params()); + field_meta.get_tokenizer_params().c_str()); text_indexes_[field_id] = std::move(index); } diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/error.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/error.rs new file mode 100644 index 0000000000000..d3ddb125cc8fe --- /dev/null +++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/error.rs @@ -0,0 +1,40 @@ +use serde_json as json; + +#[derive(Debug)] +pub struct TantivyError{ + reason: String, +} + +impl TantivyError{ + fn new(reason:String) -> Self{ + TantivyError{reason:reason} + } + + pub fn reason(&self) -> String{ + return self.reason.clone() + } +} + +impl From<&str> for TantivyError{ + fn from(value: &str) -> Self { + Self::new(value.to_string()) + } +} + +impl From for TantivyError{ + fn from(value: String) -> Self { + Self::new(value) + } +} + +impl From for TantivyError{ + fn from(value: json::Error) -> Self { + Self::new(value.to_string()) + } +} + +impl ToString for TantivyError{ + fn to_string(&self) -> String { + return self.reason() + } +} \ No newline at end of file diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/index_reader_text.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/index_reader_text.rs index b80039c00c9dd..960902410482c 100644 --- a/internal/core/thirdparty/tantivy/tantivy-binding/src/index_reader_text.rs +++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/index_reader_text.rs @@ -4,7 +4,7 @@ use tantivy::{ Term, }; -use crate::{index_reader::IndexReaderWrapper, tokenizer::standard_tokenizer}; +use crate::{index_reader::IndexReaderWrapper, tokenizer::standard_analyzer}; impl IndexReaderWrapper { // split the query string into multiple tokens using index's default tokenizer, @@ -14,7 +14,7 @@ impl IndexReaderWrapper { let mut tokenizer = self .index .tokenizer_for_field(self.field) - .unwrap_or(standard_tokenizer()) + .unwrap_or(standard_analyzer()) .clone(); let mut token_stream = tokenizer.token_stream(q); let mut terms: Vec = Vec::new(); diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/index_reader_text_c.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/index_reader_text_c.rs index cc461c87deabd..e96939f236472 100644 --- a/internal/core/thirdparty/tantivy/tantivy-binding/src/index_reader_text_c.rs +++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/index_reader_text_c.rs @@ -7,6 +7,7 @@ use crate::{ string_c::c_str_to_str, index_reader::IndexReaderWrapper, tokenizer::create_tokenizer, + log::init_log, }; #[no_mangle] @@ -25,6 +26,7 @@ pub extern "C" fn tantivy_register_tokenizer( tokenizer_name: *const c_char, tokenizer_params: *const c_char, ) { + init_log(); let real = ptr as *mut IndexReaderWrapper; let tokenizer_name_str = unsafe { CStr::from_ptr(tokenizer_name) }; let analyzer = unsafe { @@ -32,14 +34,14 @@ pub extern "C" fn tantivy_register_tokenizer( 
create_tokenizer(¶ms) }; match analyzer { - Some(text_analyzer) => unsafe { + Ok(text_analyzer) => unsafe { (*real).register_tokenizer( String::from(tokenizer_name_str.to_str().unwrap()), text_analyzer, ); }, - None => { - panic!("unsupported tokenizer"); - } + Err(err) => { + panic!("create tokenizer failed with error: {}", err.to_string()); + }, } } diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_text_c.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_text_c.rs index 83c8dc34ad954..d73bb8d733082 100644 --- a/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_text_c.rs +++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_text_c.rs @@ -1,4 +1,3 @@ -use std::collections::HashMap; use std::ffi::c_char; use std::ffi::c_void; use std::ffi::CStr; @@ -7,6 +6,7 @@ use crate::index_writer::IndexWriterWrapper; use crate::tokenizer::create_tokenizer; use crate::util::create_binding; use crate::string_c::c_str_to_str; +use crate::log::init_log; #[no_mangle] pub extern "C" fn tantivy_create_text_writer( @@ -18,6 +18,7 @@ pub extern "C" fn tantivy_create_text_writer( overall_memory_budget_in_bytes: usize, in_ram: bool, ) -> *mut c_void { + init_log(); let field_name_str = unsafe { CStr::from_ptr(field_name).to_str().unwrap() }; let path_str = unsafe { CStr::from_ptr(path).to_str().unwrap() }; let tokenizer_name_str = unsafe { CStr::from_ptr(tokenizer_name).to_str().unwrap() }; @@ -26,7 +27,7 @@ pub extern "C" fn tantivy_create_text_writer( create_tokenizer(¶ms) }; match analyzer { - Some(text_analyzer) => { + Ok(text_analyzer) => { let wrapper = IndexWriterWrapper::create_text_writer( String::from(field_name_str), String::from(path_str), @@ -38,8 +39,9 @@ pub extern "C" fn tantivy_create_text_writer( ); create_binding(wrapper) } - None => { + Err(err) => { + log::warn!("create tokenizer failed with error: {} param: {}", err.to_string(), unsafe{c_str_to_str(tokenizer_params).to_string()}); std::ptr::null_mut() - } + }, } } diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/lib.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/lib.rs index f5df4dc10ff15..90bfa80fd11c7 100644 --- a/internal/core/thirdparty/tantivy/tantivy-binding/src/lib.rs +++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/lib.rs @@ -18,6 +18,7 @@ mod tokenizer; mod tokenizer_filter; mod tokenizer_c; mod util; +mod error; mod util_c; mod vec_collector; diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/tokenizer.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/tokenizer.rs index eae72e613ff4f..e4b8314366c6d 100644 --- a/internal/core/thirdparty/tantivy/tantivy-binding/src/tokenizer.rs +++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/tokenizer.rs @@ -1,134 +1,182 @@ -use lazy_static::lazy_static; use log::warn; use std::collections::HashMap; use tantivy::tokenizer::*; use serde_json as json; use crate::tokenizer_filter::*; -use crate::log::init_log; +use crate::error::TantivyError; -pub(crate) fn standard_tokenizer() -> TextAnalyzer { - TextAnalyzer::builder(SimpleTokenizer::default()).build() +pub(crate) fn standard_analyzer() -> TextAnalyzer { + standard_builder().build() } -struct TantivyBuilder<'a>{ +fn standard_builder() -> TextAnalyzerBuilder{ + TextAnalyzer::builder(SimpleTokenizer::default()).dynamic() +} + +fn whitespace_builder()-> TextAnalyzerBuilder{ + TextAnalyzer::builder(WhitespaceTokenizer::default()).dynamic() +} + +fn get_builder_by_name(name:&String) -> Result{ + match 
name.as_str() { + "standard" => Ok(standard_builder()), + "whitespace" => Ok(whitespace_builder()), + other => { + warn!("unsupported tokenizer: {}", other); + Err(format!("unsupported tokenizer: {}", other).into()) + } + } +} + +struct AnalyzerBuilder<'a>{ // builder: TextAnalyzerBuilder filters:HashMap, params:&'a json::Map } -impl TantivyBuilder<'_>{ - fn new(params: &json::Map) -> TantivyBuilder{ - TantivyBuilder{ +impl AnalyzerBuilder<'_>{ + fn new(params: &json::Map) -> AnalyzerBuilder{ + AnalyzerBuilder{ filters: HashMap::new(), params:params, } } - fn add_costom_filter(&mut self, name: &String, params: &json::Map){ + fn get_tokenizer_name(&self) -> Result{ + let tokenizer=self.params.get("tokenizer"); + if tokenizer.is_none(){ + return Ok("standard".to_string()); + } + if !tokenizer.unwrap().is_string(){ + return Err(format!("tokenizer name should be string").into()); + } + + Ok(tokenizer.unwrap().as_str().unwrap().to_string()) + } + + fn add_custom_filter(&mut self, name: &String, params: &json::Map) -> Result<(),TantivyError>{ match SystemFilter::try_from(params){ - Ok(filter) => {self.filters.insert(name.to_string(), filter);}, - Err(_e) => {}, - }; + Ok(filter) => { + self.filters.insert(name.to_string(), filter); + Ok(()) + }, + Err(e) => {Err(e)}, + } } - fn add_costom_filters(&mut self, params:&json::Map){ + fn add_custom_filters(&mut self, params:&json::Map) -> Result<(),TantivyError>{ for (name, value) in params{ if !value.is_object(){ continue; } - - self.add_costom_filter(name, value.as_object().unwrap()); + self.add_custom_filter(name, value.as_object().unwrap())?; } + Ok(()) } - fn build(mut self) -> Option{ - let tokenizer=self.params.get("tokenizer"); - if !tokenizer.is_none() && !tokenizer.unwrap().is_string(){ - return None; + fn build_filter(&mut self,mut builder: TextAnalyzerBuilder, params: &json::Value) -> Result{ + if !params.is_array(){ + return Err("filter params should be array".into()); } - - let tokenizer_name = { - if !tokenizer.is_none(){ - tokenizer.unwrap().as_str().unwrap() - }else{ - "standard" + + let filters = params.as_array().unwrap(); + for filter in filters{ + if filter.is_string(){ + let filter_name = filter.as_str().unwrap(); + let costum = self.filters.remove(filter_name); + if !costum.is_none(){ + builder = costum.unwrap().transform(builder); + continue; + } + + // check if filter was system filter + let system = SystemFilter::from(filter_name); + match system { + SystemFilter::Invalid => { + return Err(format!("build analyzer failed, filter not found :{}", filter_name).into()) + } + other => { + builder = other.transform(builder); + }, + } + }else if filter.is_object(){ + let filter=SystemFilter::try_from(filter.as_object().unwrap())?; + builder = filter.transform(builder); } }; + Ok(builder) + } - match tokenizer_name { - "standard" => { - let mut builder = TextAnalyzer::builder(SimpleTokenizer::default()).dynamic(); - let filters= self.params.get("filter"); - if !filters.is_none() && filters.unwrap().is_array(){ - for filter in filters.unwrap().as_array().unwrap(){ - if filter.is_string(){ - let filter_name = filter.as_str().unwrap(); - let costum = self.filters.remove(filter_name); - if !costum.is_none(){ - builder = costum.unwrap().transform(builder); - continue; - } - // check if filter was system filter - let system = SystemFilter::from(filter_name); - match system { - SystemFilter::Invalid => { - log::warn!("build analyzer failed, filter not found :{}", filter_name); - return None - } - other => { - builder = 
other.transform(builder); - }, - } - } - } + fn build_option(&mut self, mut builder: TextAnalyzerBuilder) -> Result{ + for (key, value) in self.params{ + match key.as_str(){ + "tokenizer" => {}, + "filter" => { + // build with filter if filter param exist + builder=self.build_filter(builder, value)?; + }, + "max_token_length" => { + if !value.is_u64(){ + return Err("max token length should be int type".into()); } - Some(builder.build()) + builder = builder.filter_dynamic(RemoveLongFilter::limit(value.as_u64().unwrap() as usize)); } - // TODO support jieba filter and use same builder with standard. - "jieba" => { - Some(tantivy_jieba::JiebaTokenizer {}.into()) - } - s => { - warn!("unsupported tokenizer: {}", s); - None + other => return Err(format!("unknown key of tokenizer option: {}", other).into()), } } + Ok(builder) + } + + fn build(mut self) -> Result{ + let tokenizer_name = self.get_tokenizer_name()?; + if tokenizer_name == "jieba"{ + return Ok(tantivy_jieba::JiebaTokenizer{}.into()); + } + + let mut builder=get_builder_by_name(&tokenizer_name)?; + + // build with option + builder = self.build_option(builder)?; + Ok(builder.build()) } } -pub(crate) fn create_tokenizer(params: &String) -> Option { - init_log(); +pub(crate) fn create_tokenizer(params: &String) -> Result { + if params.len()==0{ + return Ok(standard_analyzer()); + } match json::from_str::(¶ms){ Ok(value) =>{ if value.is_null(){ - return Some(standard_tokenizer()); + return Ok(standard_analyzer()); } if !value.is_object(){ - return None; + return Err("tokenizer params should be a json map".into()); } let json_params = value.as_object().unwrap(); + // create builder let analyzer_params=json_params.get("analyzer"); if analyzer_params.is_none(){ - return Some(standard_tokenizer()); + return Ok(standard_analyzer()); } if !analyzer_params.unwrap().is_object(){ - return None; + return Err("analyzer params should be a json map".into()); } - let mut builder = TantivyBuilder::new(analyzer_params.unwrap().as_object().unwrap()); - + let mut builder = AnalyzerBuilder::new(analyzer_params.unwrap().as_object().unwrap()); + // build custom filter let filter_params=json_params.get("filter"); if !filter_params.is_none() && filter_params.unwrap().is_object(){ - builder.add_costom_filters(filter_params.unwrap().as_object().unwrap()); + builder.add_custom_filters(filter_params.unwrap().as_object().unwrap())?; } // build analyzer builder.build() }, - Err(_e) => None, + Err(err) => Err(err.into()), } } @@ -138,16 +186,25 @@ mod tests { #[test] fn test_create_tokenizer() { - let params = r#" - { - "analyzer": - { - "tokenizer": "standard", - "filter": [""], - }, - }"#; + let params = r#"{"analyzer": {"tokenizer": "standard"}}"#; let tokenizer = create_tokenizer(¶ms.to_string()); - assert!(tokenizer.is_some()); + assert!(tokenizer.is_ok()); + } + + #[test] + fn test_jieba_tokenizer() { + let params = r#"{"analyzer": {"tokenizer": "jieba"}}"#; + + let tokenizer = create_tokenizer(¶ms.to_string()); + assert!(tokenizer.is_ok()); + let mut bining = tokenizer.unwrap(); + + let mut stream = bining.token_stream("系统安全"); + while stream.advance(){ + let token = stream.token(); + let text = token.text.clone(); + print!("test token :{}\n", text.as_str()) + } } } \ No newline at end of file diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/tokenizer_c.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/tokenizer_c.rs index dedca3064e138..c7a7a79e2b6c3 100644 --- a/internal/core/thirdparty/tantivy/tantivy-binding/src/tokenizer_c.rs +++ 
b/internal/core/thirdparty/tantivy/tantivy-binding/src/tokenizer_c.rs @@ -5,17 +5,22 @@ use crate::{ string_c::c_str_to_str, tokenizer::create_tokenizer, util::{create_binding, free_binding}, + log::init_log, }; #[no_mangle] pub extern "C" fn tantivy_create_tokenizer(tokenizer_params: *const c_char) -> *mut c_void { + init_log(); let analyzer = unsafe { let params = c_str_to_str(tokenizer_params).to_string(); create_tokenizer(¶ms) }; match analyzer { - Some(text_analyzer) => create_binding(text_analyzer), - None => std::ptr::null_mut(), + Ok(text_analyzer) => create_binding(text_analyzer), + Err(err) => { + log::warn!("create tokenizer failed with error: {}", err.to_string()); + std::ptr::null_mut() + }, } } diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/tokenizer_filter.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/tokenizer_filter.rs index 9d4c27aa15ae7..dcfe9e7541c51 100644 --- a/internal/core/thirdparty/tantivy/tantivy-binding/src/tokenizer_filter.rs +++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/tokenizer_filter.rs @@ -1,6 +1,8 @@ use tantivy::tokenizer::*; use serde_json as json; +use crate::error::TantivyError; + pub(crate) enum SystemFilter{ Invalid, LowerCase(LowerCaser), @@ -33,19 +35,19 @@ impl SystemFilter{ // "max": 10, // length // } // TODO support min length -fn get_length_filter(params: &json::Map) -> Result{ +fn get_length_filter(params: &json::Map) -> Result{ let limit_str = params.get("max"); if limit_str.is_none() || !limit_str.unwrap().is_u64(){ - return Err(()) + return Err("lenth max param was none or not uint".into()) } let limit = limit_str.unwrap().as_u64().unwrap() as usize; Ok(SystemFilter::Length(RemoveLongFilter::limit(limit))) } -fn get_stop_filter(params: &json::Map)-> Result{ +fn get_stop_filter(params: &json::Map)-> Result{ let value = params.get("stop_words"); if value.is_none() || !value.unwrap().is_array(){ - return Err(()) + return Err("stop_words should be array".into()) } let stop_words= value.unwrap().as_array().unwrap(); @@ -53,16 +55,16 @@ fn get_stop_filter(params: &json::Map)-> Result str_list.push(word.to_string()), - None => return Err(()) + None => return Err("stop words item should be string".into()) } }; Ok(SystemFilter::Stop(StopWordFilter::remove(str_list))) } -fn get_decompounder_filter(params: &json::Map)-> Result{ +fn get_decompounder_filter(params: &json::Map)-> Result{ let value = params.get("word_list"); if value.is_none() || !value.unwrap().is_array(){ - return Err(()) + return Err("decompounder word list should be array".into()) } let stop_words= value.unwrap().as_array().unwrap(); @@ -70,25 +72,25 @@ fn get_decompounder_filter(params: &json::Map)-> Result str_list.push(word.to_string()), - None => return Err(()) + None => return Err("decompounder word list item should be string".into()) } }; match SplitCompoundWords::from_dictionary(str_list){ Ok(f) => Ok(SystemFilter::Decompounder(f)), - Err(_e) => Err(()) + Err(e) => Err(format!("create decompounder failed: {}", e.to_string()).into()) } } -fn get_stemmer_filter(params: &json::Map)-> Result{ +fn get_stemmer_filter(params: &json::Map)-> Result{ let value = params.get("language"); if value.is_none() || !value.unwrap().is_string(){ - return Err(()) + return Err("stemmer language field should be string".into()) } match value.unwrap().as_str().unwrap().into_language(){ Ok(language) => Ok(SystemFilter::Stemmer(Stemmer::new(language))), - Err(_e) => Err(()), + Err(e) => Err(format!("create stemmer failed : {}", e.to_string()).into()), } } @@ 
-98,9 +100,9 @@ trait LanguageParser { } impl LanguageParser for &str { - type Error = (); + type Error = TantivyError; fn into_language(self) -> Result { - match self { + match self.to_lowercase().as_str() { "arabig" => Ok(Language::Arabic), "danish" => Ok(Language::Danish), "dutch" => Ok(Language::Dutch), @@ -119,7 +121,7 @@ impl LanguageParser for &str { "swedish" => Ok(Language::Swedish), "tamil" => Ok(Language::Tamil), "turkish" => Ok(Language::Turkish), - _ => Err(()), + other => Err(format!("unsupport language: {}", other).into()), } } } @@ -136,13 +138,13 @@ impl From<&str> for SystemFilter{ } impl TryFrom<&json::Map> for SystemFilter { - type Error = (); + type Error = TantivyError; fn try_from(params: &json::Map) -> Result { match params.get(&"type".to_string()){ Some(value) =>{ if !value.is_string(){ - return Err(()); + return Err("filter type should be string".into()); }; match value.as_str().unwrap(){ @@ -150,10 +152,10 @@ impl TryFrom<&json::Map> for SystemFilter { "stop" => get_stop_filter(params), "decompounder" => get_decompounder_filter(params), "stemmer" => get_stemmer_filter(params), - _other=>Err(()), + other=> Err(format!("unsupport filter type: {}", other).into()), } } - None => Err(()), + None => Err("no type field in filter params".into()), } } } diff --git a/internal/core/thirdparty/tantivy/tantivy-wrapper.h b/internal/core/thirdparty/tantivy/tantivy-wrapper.h index 2e576f5fe9162..ee45e9d1958ff 100644 --- a/internal/core/thirdparty/tantivy/tantivy-wrapper.h +++ b/internal/core/thirdparty/tantivy/tantivy-wrapper.h @@ -101,8 +101,7 @@ struct TantivyIndexWrapper { bool in_ram, const char* path, const char* tokenizer_name = DEFAULT_TOKENIZER_NAME, - const char* - tokenizer_params = DEFAULT_TOKENIZER_PARAMS, + const char* tokenizer_params = DEFAULT_TOKENIZER_PARAMS, uintptr_t num_threads = DEFAULT_NUM_THREADS, uintptr_t overall_memory_budget_in_bytes = DEFAULT_OVERALL_MEMORY_BUDGET_IN_BYTES) { @@ -132,9 +131,8 @@ struct TantivyIndexWrapper { } void - register_tokenizer( - const char* tokenizer_name, - const char* tokenizer_params) { + register_tokenizer(const char* tokenizer_name, + const char* tokenizer_params) { if (reader_ != nullptr) { tantivy_register_tokenizer( reader_, tokenizer_name, tokenizer_params); diff --git a/internal/core/unittest/test_c_tokenizer.cpp b/internal/core/unittest/test_c_tokenizer.cpp index 7e5c9e2a40df6..fd836983a662e 100644 --- a/internal/core/unittest/test_c_tokenizer.cpp +++ b/internal/core/unittest/test_c_tokenizer.cpp @@ -47,12 +47,10 @@ set_cmap(CMap m, const std::string& key, const std::string& value) { } TEST(CTokenizer, Default) { - auto m = create_cmap(); - set_cmap(m, "tokenizer", "default"); - + auto tokenizer_params = R"({"analyzer":{"tokenizer": "standard"}})"; CTokenizer tokenizer; { - auto status = create_tokenizer(m, &tokenizer); + auto status = create_tokenizer(tokenizer_params, &tokenizer); ASSERT_EQ(milvus::ErrorCode::Success, status.error_code); } @@ -71,5 +69,4 @@ TEST(CTokenizer, Default) { free_token_stream(token_stream); free_tokenizer(tokenizer); - free_cmap(m); } diff --git a/internal/core/unittest/test_text_match.cpp b/internal/core/unittest/test_text_match.cpp index 359c35ce367de..dfdc9eaa9d175 100644 --- a/internal/core/unittest/test_text_match.cpp +++ b/internal/core/unittest/test_text_match.cpp @@ -10,9 +10,9 @@ // or implied. 
See the License for the specific language governing permissions and limitations under the License #include +#include #include "common/Schema.h" -#include "segcore/segment_c.h" #include "segcore/SegmentGrowing.h" #include "segcore/SegmentGrowingImpl.h" #include "test_utils/DataGen.h" @@ -80,20 +80,20 @@ TEST(ParseJson, Naive) { TEST(ParseTokenizerParams, NoTokenizerParams) { TypeParams params{{"k", "v"}}; auto p = ParseTokenizerParams(params); - ASSERT_EQ("{}", p); + ASSERT_EQ("{}", std::string(p)); } TEST(ParseTokenizerParams, Default) { - TypeParams params{{"tokenizer_params", R"({"tokenizer": "default"})"}}; + TypeParams params{ + {"tokenizer_params", R"({"analyzer":{"tokenizer": "standard"}})"}}; auto p = ParseTokenizerParams(params); ASSERT_EQ(params.at("tokenizer_params"), p); } TEST(TextMatch, Index) { using Index = index::TextMatchIndex; - auto index = std::make_unique(std::numeric_limits::max(), - "milvus_tokenizer", - "{}"); + auto index = std::make_unique( + std::numeric_limits::max(), "milvus_tokenizer", "{}"); index->CreateReader(); index->AddText("football, basketball, pingpang", 0); index->AddText("swimming, football", 1); diff --git a/internal/proxy/task_query.go b/internal/proxy/task_query.go index 29f3606f84532..3bbdaffd9ae29 100644 --- a/internal/proxy/task_query.go +++ b/internal/proxy/task_query.go @@ -297,7 +297,6 @@ func (t *queryTask) CanSkipAllocTimestamp() bool { } consistencyLevel = collectionInfo.consistencyLevel } - return consistencyLevel != commonpb.ConsistencyLevel_Strong } diff --git a/internal/proxy/task_search.go b/internal/proxy/task_search.go index 279387f5e6af4..511bbf74855e2 100644 --- a/internal/proxy/task_search.go +++ b/internal/proxy/task_search.go @@ -111,7 +111,6 @@ func (t *searchTask) CanSkipAllocTimestamp() bool { } consistencyLevel = collectionInfo.consistencyLevel } - return consistencyLevel != commonpb.ConsistencyLevel_Strong } diff --git a/internal/util/ctokenizer/c_tokenizer_test.go b/internal/util/ctokenizer/c_tokenizer_test.go index 9b9517020d69e..9f8361d53a735 100644 --- a/internal/util/ctokenizer/c_tokenizer_test.go +++ b/internal/util/ctokenizer/c_tokenizer_test.go @@ -10,7 +10,7 @@ import ( func TestTokenizer(t *testing.T) { // default tokenizer. { - m := make(map[string]string) + m := "{\"analyzer\":{\"tokenizer\": \"standard\"}}" tokenizer, err := NewTokenizer(m) assert.NoError(t, err) defer tokenizer.Destroy() @@ -24,8 +24,7 @@ func TestTokenizer(t *testing.T) { // jieba tokenizer. { - m := make(map[string]string) - m["tokenizer"] = "jieba" + m := "{\"analyzer\":{\"tokenizer\": \"jieba\"}}" tokenizer, err := NewTokenizer(m) assert.NoError(t, err) defer tokenizer.Destroy() diff --git a/tests/python_client/common/common_func.py b/tests/python_client/common/common_func.py index b47d6301085f2..0dc8c47e124eb 100644 --- a/tests/python_client/common/common_func.py +++ b/tests/python_client/common/common_func.py @@ -778,7 +778,7 @@ def gen_default_collection_schema(description=ct.default_desc, primary_field=ct. 
def gen_all_datatype_collection_schema(description=ct.default_desc, primary_field=ct.default_int64_field_name, auto_id=False, dim=ct.default_dim, enable_dynamic_field=True, **kwargs): tokenizer_params = { - "tokenizer": "default", + "analyzer":{"tokenizer": "standard"}, } fields = [ gen_int64_field(), diff --git a/tests/python_client/testcases/test_full_text_search.py b/tests/python_client/testcases/test_full_text_search.py index 76df79b8edd68..3716616fc9104 100644 --- a/tests/python_client/testcases/test_full_text_search.py +++ b/tests/python_client/testcases/test_full_text_search.py @@ -33,7 +33,7 @@ class TestCreateCollectionWIthFullTextSearch(TestcaseBase): """ @pytest.mark.tags(CaseLabel.L0) - @pytest.mark.parametrize("tokenizer", ["default"]) + @pytest.mark.parametrize("tokenizer", ["standard"]) def test_create_collection_for_full_text_search(self, tokenizer): """ target: test create collection with full text search @@ -41,7 +41,7 @@ def test_create_collection_for_full_text_search(self, tokenizer): expected: create collection successfully """ tokenizer_params = { - "tokenizer": tokenizer, + "analyzer":{"tokenizer": tokenizer}, } dim = 128 fields = [ @@ -97,7 +97,7 @@ def test_create_collection_for_full_text_search(self, tokenizer): assert len(res["functions"]) == len(text_fields) @pytest.mark.tags(CaseLabel.L0) - @pytest.mark.parametrize("tokenizer", ["default"]) + @pytest.mark.parametrize("tokenizer", ["standard"]) def test_create_collection_for_full_text_search_twice_with_same_schema(self, tokenizer): """ target: test create collection with full text search twice with same schema @@ -105,7 +105,7 @@ def test_create_collection_for_full_text_search_twice_with_same_schema(self, tok expected: create collection successfully and create again successfully """ tokenizer_params = { - "tokenizer": tokenizer, + "analyzer":{"tokenizer": tokenizer}, } dim = 128 fields = [ @@ -175,7 +175,7 @@ class TestCreateCollectionWithFullTextSearchNegative(TestcaseBase): @pytest.mark.tags(CaseLabel.L1) @pytest.mark.parametrize("tokenizer", ["unsupported"]) - @pytest.mark.xfail(reason="") + @pytest.mark.skip(reason="check not implement may cause panic") def test_create_collection_for_full_text_search_with_unsupported_tokenizer(self, tokenizer): """ target: test create collection with full text search with unsupported tokenizer @@ -183,7 +183,7 @@ def test_create_collection_for_full_text_search_with_unsupported_tokenizer(self, expected: create collection failed """ tokenizer_params = { - "tokenizer": tokenizer, + "analyzer":{"tokenizer": tokenizer}, } dim = 128 fields = [ @@ -249,7 +249,7 @@ def test_create_collection_for_full_text_search_with_invalid_input_output(self, expected: create collection failed """ tokenizer_params = { - "tokenizer": "default", + "analyzer":{"tokenizer": "standard"}, } dim = 128 fields = [ @@ -327,7 +327,7 @@ def test_create_collection_for_full_text_search_with_field_not_tokenized(self): expected: create collection failed """ tokenizer_params = { - "tokenizer": "default", + "analyzer":{"tokenizer": "standard"}, } dim = 128 fields = [ @@ -397,7 +397,7 @@ class TestInsertWithFullTextSearch(TestcaseBase): @pytest.mark.tags(CaseLabel.L0) @pytest.mark.parametrize("nullable", [False, True]) @pytest.mark.parametrize("text_lang", ["en", "zh", "hybrid"]) - @pytest.mark.parametrize("tokenizer", ["default"]) + @pytest.mark.parametrize("tokenizer", ["standard"]) def test_insert_for_full_text_search_default(self, tokenizer, text_lang, nullable): """ target: test insert data with full text search 
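For context, a minimal illustrative sketch (not part of the patch itself) of the tokenizer_params shapes these test updates pass, assembled from the test changes in this series; variable names below are illustrative, the accepted keys are defined by the Rust AnalyzerBuilder/SystemFilter code above, and PATCH 4/4 later also accepts the un-wrapped form ({"tokenizer": "standard"}), adding the outer "analyzer" key internally:

    import json

    # Plain built-in tokenizers, wrapped under "analyzer" (PATCH 1 form).
    standard_params = {"analyzer": {"tokenizer": "standard"}}
    jieba_params = {"analyzer": {"tokenizer": "jieba"}}

    # Custom analyzer: the filter chain mixes system filters with named
    # custom filters declared under the sibling top-level "filter" map
    # (shapes taken from the test_query.py custom-analyzer test in this series).
    custom_params = {
        "analyzer": {
            "tokenizer": "standard",
            "filter": ["lowercase", "asciifolding", "alphanumonly",
                       "my_stop_filter", "my_stemmer"],
        },
        "filter": {
            "my_stop_filter": {"type": "stop", "stop_words": ["in", "of"]},
            "my_stemmer": {"type": "stemmer", "language": "english"},
        },
    }

    # The params reach the tantivy binding as a JSON string
    # (see create_tokenizer above), so they must be JSON-serializable.
    print(json.dumps(custom_params))
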
@@ -406,7 +406,7 @@ def test_insert_for_full_text_search_default(self, tokenizer, text_lang, nullabl expected: insert successfully and count is correct """ tokenizer_params = { - "tokenizer": tokenizer, + "analyzer":{"tokenizer": tokenizer}, } dim = 128 fields = [ @@ -542,7 +542,7 @@ def test_insert_for_full_text_search_default(self, tokenizer, text_lang, nullabl @pytest.mark.parametrize("enable_dynamic_field", [True]) @pytest.mark.parametrize("nullable", [False]) @pytest.mark.parametrize("text_lang", ["en"]) - @pytest.mark.parametrize("tokenizer", ["default"]) + @pytest.mark.parametrize("tokenizer", ["standard"]) def test_insert_for_full_text_search_enable_dynamic_field(self, tokenizer, text_lang, nullable, enable_dynamic_field): """ target: test insert data with full text search and enable dynamic field @@ -552,7 +552,7 @@ def test_insert_for_full_text_search_enable_dynamic_field(self, tokenizer, text_ expected: insert successfully and count is correct """ tokenizer_params = { - "tokenizer": tokenizer, + "analyzer":{"tokenizer": tokenizer}, } dim = 128 fields = [ @@ -692,7 +692,7 @@ def test_insert_for_full_text_search_enable_dynamic_field(self, tokenizer, text_ @pytest.mark.tags(CaseLabel.L0) @pytest.mark.parametrize("nullable", [True]) @pytest.mark.parametrize("text_lang", ["en"]) - @pytest.mark.parametrize("tokenizer", ["default"]) + @pytest.mark.parametrize("tokenizer", ["standard"]) def test_insert_for_full_text_search_with_dataframe(self, tokenizer, text_lang, nullable): """ target: test insert data for full text search with dataframe @@ -700,7 +700,7 @@ def test_insert_for_full_text_search_with_dataframe(self, tokenizer, text_lang, 2. query count and verify the result """ tokenizer_params = { - "tokenizer": tokenizer, + "analyzer":{"tokenizer": tokenizer}, } dim = 128 fields = [ @@ -831,7 +831,7 @@ def test_insert_for_full_text_search_with_dataframe(self, tokenizer, text_lang, assert len(data) == count @pytest.mark.tags(CaseLabel.L2) - @pytest.mark.parametrize("tokenizer", ["default"]) + @pytest.mark.parametrize("tokenizer", ["standard"]) def test_insert_for_full_text_search_with_part_of_empty_string(self, tokenizer): """ target: test insert data with full text search with part of empty string @@ -842,7 +842,7 @@ def test_insert_for_full_text_search_with_part_of_empty_string(self, tokenizer): """ tokenizer_params = { - "tokenizer": tokenizer, + "analyzer":{"tokenizer": tokenizer}, } dim = 128 fields = [ @@ -990,7 +990,7 @@ class TestInsertWithFullTextSearchNegative(TestcaseBase): @pytest.mark.tags(CaseLabel.L1) @pytest.mark.parametrize("nullable", [True]) - @pytest.mark.parametrize("tokenizer", ["default"]) + @pytest.mark.parametrize("tokenizer", ["standard"]) def test_insert_with_full_text_search_with_non_varchar_data(self, tokenizer, nullable): """ target: test insert data with full text search with non varchar data @@ -999,7 +999,7 @@ def test_insert_with_full_text_search_with_non_varchar_data(self, tokenizer, nul """ tokenizer_params = { - "tokenizer": tokenizer, + "analyzer":{"tokenizer": tokenizer}, } dim = 128 fields = [ @@ -1089,7 +1089,7 @@ class TestUpsertWithFullTextSearch(TestcaseBase): @pytest.mark.tags(CaseLabel.L0) @pytest.mark.parametrize("nullable", [False, True]) - @pytest.mark.parametrize("tokenizer", ["default"]) + @pytest.mark.parametrize("tokenizer", ["standard"]) @pytest.mark.xfail(reason="issue: https://github.com/milvus-io/milvus/issues/37021") def test_upsert_for_full_text_search(self, tokenizer, nullable): """ @@ -1100,7 +1100,7 @@ def 
test_upsert_for_full_text_search(self, tokenizer, nullable): expected: upsert successfully and data is updated """ tokenizer_params = { - "tokenizer": tokenizer, + "analyzer":{"tokenizer": tokenizer}, } dim = 128 fields = [ @@ -1260,7 +1260,7 @@ class TestUpsertWithFullTextSearchNegative(TestcaseBase): @pytest.mark.tags(CaseLabel.L1) @pytest.mark.parametrize("nullable", [False]) - @pytest.mark.parametrize("tokenizer", ["default"]) + @pytest.mark.parametrize("tokenizer", ["standard"]) @pytest.mark.xfail(reason="issue: https://github.com/milvus-io/milvus/issues/37021") def test_upsert_for_full_text_search_with_no_varchar_data(self, tokenizer, nullable): """ @@ -1270,7 +1270,7 @@ def test_upsert_for_full_text_search_with_no_varchar_data(self, tokenizer, nulla expected: upsert failed """ tokenizer_params = { - "tokenizer": tokenizer, + "analyzer":{"tokenizer": tokenizer}, } dim = 128 fields = [ @@ -1402,7 +1402,7 @@ class TestDeleteWithFullTextSearch(TestcaseBase): """ @pytest.mark.tags(CaseLabel.L1) - @pytest.mark.parametrize("tokenizer", ["default"]) + @pytest.mark.parametrize("tokenizer", ["standard"]) def test_delete_for_full_text_search(self, tokenizer): """ target: test delete data for full text search @@ -1412,7 +1412,7 @@ def test_delete_for_full_text_search(self, tokenizer): expected: delete successfully and data is deleted """ tokenizer_params = { - "tokenizer": tokenizer, + "analyzer":{"tokenizer": tokenizer}, } dim = 128 fields = [ @@ -1564,7 +1564,7 @@ class TestCreateIndexWithFullTextSearch(TestcaseBase): @pytest.mark.parametrize("b", [0.1]) @pytest.mark.parametrize("k", [1.2]) @pytest.mark.parametrize("index_type", ["SPARSE_INVERTED_INDEX", "SPARSE_WAND"]) - @pytest.mark.parametrize("tokenizer", ["default"]) + @pytest.mark.parametrize("tokenizer", ["standard"]) def test_create_index_for_full_text_search_default( self, tokenizer, index_type, k, b ): @@ -1576,7 +1576,7 @@ def test_create_index_for_full_text_search_default( expected: create index successfully and index info is correct """ tokenizer_params = { - "tokenizer": tokenizer, + "analyzer":{"tokenizer": tokenizer}, } empty_percent = 0.0 dim = 128 @@ -1688,7 +1688,7 @@ class TestCreateIndexWithFullTextSearchNegative(TestcaseBase): @pytest.mark.parametrize("b", [0.5]) @pytest.mark.parametrize("k", [1.5]) @pytest.mark.parametrize("index_type", ["HNSW", "INVALID_INDEX_TYPE"]) - @pytest.mark.parametrize("tokenizer", ["default"]) + @pytest.mark.parametrize("tokenizer", ["standard"]) def test_create_full_text_search_with_invalid_index_type( self, tokenizer, index_type, k, b ): @@ -1699,7 +1699,7 @@ def test_create_full_text_search_with_invalid_index_type( expected: create index failed """ tokenizer_params = { - "tokenizer": tokenizer, + "analyzer":{"tokenizer": tokenizer}, } empty_percent = 0.0 dim = 128 @@ -1796,7 +1796,7 @@ def test_create_full_text_search_with_invalid_index_type( @pytest.mark.parametrize("k", [1.5]) @pytest.mark.parametrize("index_type", ["SPARSE_INVERTED_INDEX"]) @pytest.mark.parametrize("metric_type", ["COSINE", "L2", "IP"]) - @pytest.mark.parametrize("tokenizer", ["default"]) + @pytest.mark.parametrize("tokenizer", ["standard"]) def test_create_full_text_search_index_with_invalid_metric_type( self, tokenizer, index_type, metric_type, k, b ): @@ -1807,7 +1807,7 @@ def test_create_full_text_search_index_with_invalid_metric_type( expected: create index failed """ tokenizer_params = { - "tokenizer": tokenizer, + "analyzer":{"tokenizer": tokenizer}, } empty_percent = 0.0 dim = 128 @@ -1903,7 +1903,7 @@ def 
test_create_full_text_search_index_with_invalid_metric_type( @pytest.mark.parametrize("b", [0.5]) @pytest.mark.parametrize("k", [1.5]) @pytest.mark.parametrize("index_type", ["SPARSE_INVERTED_INDEX"]) - @pytest.mark.parametrize("tokenizer", ["default"]) + @pytest.mark.parametrize("tokenizer", ["standard"]) def test_create_index_using_bm25_metric_type_for_non_bm25_output_field( self, tokenizer, index_type, k, b ): @@ -1915,7 +1915,7 @@ def test_create_index_using_bm25_metric_type_for_non_bm25_output_field( expected: create index failed """ tokenizer_params = { - "tokenizer": tokenizer, + "analyzer":{"tokenizer": tokenizer}, } empty_percent = 0.0 dim = 128 @@ -2000,7 +2000,7 @@ def test_create_index_using_bm25_metric_type_for_non_bm25_output_field( @pytest.mark.parametrize("b", [-1]) @pytest.mark.parametrize("k", [-1]) @pytest.mark.parametrize("index_type", ["SPARSE_INVERTED_INDEX"]) - @pytest.mark.parametrize("tokenizer", ["default"]) + @pytest.mark.parametrize("tokenizer", ["standard"]) def test_create_full_text_search_with_invalid_bm25_params( self, tokenizer, index_type, k, b ): @@ -2011,7 +2011,7 @@ def test_create_full_text_search_with_invalid_bm25_params( expected: create index failed """ tokenizer_params = { - "tokenizer": tokenizer, + "analyzer":{"tokenizer": tokenizer}, } empty_percent = 0.0 dim = 128 @@ -2121,7 +2121,7 @@ class TestSearchWithFullTextSearch(TestcaseBase): @pytest.mark.parametrize("enable_inverted_index", [True]) @pytest.mark.parametrize("index_type", ["SPARSE_INVERTED_INDEX", "SPARSE_WAND"]) @pytest.mark.parametrize("expr", ["text_match", "id_range"]) - @pytest.mark.parametrize("tokenizer", ["default"]) + @pytest.mark.parametrize("tokenizer", ["standard"]) @pytest.mark.parametrize("offset", [10, 0]) def test_full_text_search_default( self, offset, tokenizer, expr, enable_inverted_index, enable_partition_key, empty_percent, index_type, nq @@ -2134,7 +2134,7 @@ def test_full_text_search_default( expected: full text search successfully and result is correct """ tokenizer_params = { - "tokenizer": tokenizer, + "analyzer":{"tokenizer": tokenizer}, } dim = 128 fields = [ @@ -2512,7 +2512,7 @@ def test_full_text_search_with_jieba_tokenizer( @pytest.mark.parametrize("enable_inverted_index", [True]) @pytest.mark.parametrize("index_type", ["SPARSE_INVERTED_INDEX"]) @pytest.mark.parametrize("expr", [None]) - @pytest.mark.parametrize("tokenizer", ["default"]) + @pytest.mark.parametrize("tokenizer", ["standard"]) def test_full_text_search_with_range_search( self, tokenizer, expr, enable_inverted_index, enable_partition_key, empty_percent, index_type, nq ): @@ -2524,7 +2524,7 @@ def test_full_text_search_with_range_search( expected: full text search successfully and result is correct """ tokenizer_params = { - "tokenizer": tokenizer, + "analyzer":{"tokenizer": tokenizer}, } dim = 128 fields = [ @@ -2677,7 +2677,7 @@ def test_full_text_search_with_range_search( @pytest.mark.parametrize("enable_inverted_index", [True]) @pytest.mark.parametrize("index_type", ["SPARSE_INVERTED_INDEX"]) @pytest.mark.parametrize("expr", [None]) - @pytest.mark.parametrize("tokenizer", ["default"]) + @pytest.mark.parametrize("tokenizer", ["standard"]) def test_full_text_search_with_search_iterator( self, tokenizer, expr, enable_inverted_index, enable_partition_key, empty_percent, index_type, nq ): @@ -2689,7 +2689,7 @@ def test_full_text_search_with_search_iterator( expected: full text search successfully and result is correct """ tokenizer_params = { - "tokenizer": tokenizer, + "analyzer":{"tokenizer": 
tokenizer}, } dim = 128 fields = [ @@ -2830,7 +2830,7 @@ class TestSearchWithFullTextSearchNegative(TestcaseBase): @pytest.mark.parametrize("enable_inverted_index", [True]) @pytest.mark.parametrize("index_type", ["SPARSE_INVERTED_INDEX"]) @pytest.mark.parametrize("invalid_search_data", ["empty_text"]) - @pytest.mark.parametrize("tokenizer", ["default"]) + @pytest.mark.parametrize("tokenizer", ["standard"]) @pytest.mark.xfail(reason="issue: https://github.com/milvus-io/milvus/issues/37022") def test_search_for_full_text_search_with_empty_string_search_data( self, tokenizer, enable_inverted_index, enable_partition_key, empty_percent, index_type, invalid_search_data @@ -2843,7 +2843,7 @@ def test_search_for_full_text_search_with_empty_string_search_data( expected: full text search successfully but result is empty """ tokenizer_params = { - "tokenizer": tokenizer, + "analyzer":{"tokenizer": tokenizer}, } dim = 128 fields = [ @@ -2960,7 +2960,7 @@ def test_search_for_full_text_search_with_empty_string_search_data( @pytest.mark.parametrize("enable_inverted_index", [True]) @pytest.mark.parametrize("index_type", ["SPARSE_INVERTED_INDEX", "SPARSE_WAND"]) @pytest.mark.parametrize("invalid_search_data", ["sparse_vector", "dense_vector"]) - @pytest.mark.parametrize("tokenizer", ["default"]) + @pytest.mark.parametrize("tokenizer", ["standard"]) def test_search_for_full_text_search_with_invalid_search_data( self, tokenizer, enable_inverted_index, enable_partition_key, empty_percent, index_type, invalid_search_data ): @@ -2972,7 +2972,7 @@ def test_search_for_full_text_search_with_invalid_search_data( expected: full text search failed and return error """ tokenizer_params = { - "tokenizer": tokenizer, + "analyzer":{"tokenizer": tokenizer}, } dim = 128 fields = [ @@ -3107,7 +3107,7 @@ class TestHybridSearchWithFullTextSearch(TestcaseBase): @pytest.mark.parametrize("enable_partition_key", [True]) @pytest.mark.parametrize("enable_inverted_index", [True]) @pytest.mark.parametrize("index_type", ["SPARSE_INVERTED_INDEX"]) - @pytest.mark.parametrize("tokenizer", ["default"]) + @pytest.mark.parametrize("tokenizer", ["standard"]) def test_hybrid_search_with_full_text_search( self, tokenizer, enable_inverted_index, enable_partition_key, empty_percent, index_type ): @@ -3119,7 +3119,7 @@ def test_hybrid_search_with_full_text_search( expected: hybrid search successfully and result is correct """ tokenizer_params = { - "tokenizer": tokenizer, + "analyzer":{"tokenizer": tokenizer}, } dim = 128 fields = [ diff --git a/tests/python_client/testcases/test_query.py b/tests/python_client/testcases/test_query.py index dc1e6622c2e62..5acddf0fb4d9e 100644 --- a/tests/python_client/testcases/test_query.py +++ b/tests/python_client/testcases/test_query.py @@ -4441,7 +4441,7 @@ class TestQueryTextMatch(TestcaseBase): @pytest.mark.tags(CaseLabel.L0) @pytest.mark.parametrize("enable_partition_key", [True, False]) @pytest.mark.parametrize("enable_inverted_index", [True, False]) - @pytest.mark.parametrize("tokenizer", ["default"]) + @pytest.mark.parametrize("tokenizer", ["standard"]) def test_query_text_match_en_normal( self, tokenizer, enable_inverted_index, enable_partition_key ): @@ -4453,7 +4453,7 @@ def test_query_text_match_en_normal( expected: text match successfully and result is correct """ tokenizer_params = { - "tokenizer": tokenizer, + "analyzer":{"tokenizer": tokenizer}, } dim = 128 fields = [ @@ -4724,24 +4724,21 @@ def test_query_text_match_custom_analyzer(self): expected: get the correct token, text match successfully 
and result is correct """ tokenizer_params = { - "tokenizer": "standard", - "alpha_num_only": True, - "ascii_folding": True, - "lower_case": True, - "max_token_length": 40, - "split_compound_words": [ - "dampf", - "schiff", - "fahrt", - "brot", - "backen", - "automat", - ], - "stemmer": "English", - "stop": { - "language": "English", - "words": ["an", "the"], + "analyzer": { + "tokenizer": "standard", + # "lowercase", "asciifolding", "alphanumonly" was system filter + "filter":["lowercase", "asciifolding", "alphanumonly", "my_stop_filter", "my_stemmer"], }, + "filter": { + "my_stop_filter":{ + "type": "stop", + "stop_words": ["in", "of"], + }, + "my_stemmer":{ + "type": "stemmer", + "language": "english", + } + } } dim = 128 fields = [ @@ -4852,7 +4849,7 @@ def test_query_text_match_with_combined_expression_for_single_field(self): expected: query successfully and result is correct """ tokenizer_params = { - "tokenizer": "default", + "analyzer":{"tokenizer": "standard"}, } # 1. initialize with data dim = 128 @@ -4966,7 +4963,7 @@ def test_query_text_match_with_combined_expression_for_multi_field(self): expected: query successfully and result is correct """ tokenizer_params = { - "tokenizer": "default", + "analyzer":{"tokenizer": "standard"}, } # 1. initialize with data dim = 128 @@ -5109,7 +5106,7 @@ def test_query_text_match_with_multi_lang(self): # 1. initialize with data tokenizer_params = { - "tokenizer": "default", + "analyzer":{"tokenizer": "standard"}, } # 1. initialize with data dim = 128 @@ -5254,7 +5251,7 @@ def test_query_text_match_with_addition_inverted_index(self): # 1. initialize with data fake_en = Faker("en_US") tokenizer_params = { - "tokenizer": "default", + "analyzer":{"tokenizer": "standard"}, } dim = 128 default_fields = [ @@ -5481,7 +5478,7 @@ def test_query_text_match_with_some_empty_string(self): """ # 1. initialize with data tokenizer_params = { - "tokenizer": "default", + "analyzer":{"tokenizer": "standard"}, } # 1. 
initialize with data dim = 128 @@ -5740,7 +5737,7 @@ def test_query_text_match_with_unsupported_tokenizer(self): expected: create collection failed and return error """ tokenizer_params = { - "tokenizer": "Unsupported", + "analyzer":{"tokenizer": "Unsupported"}, } dim = 128 default_fields = [ diff --git a/tests/python_client/testcases/test_search.py b/tests/python_client/testcases/test_search.py index ee21195a10a8c..025d8180b18fa 100644 --- a/tests/python_client/testcases/test_search.py +++ b/tests/python_client/testcases/test_search.py @@ -13290,7 +13290,7 @@ class TestSearchWithTextMatchFilter(TestcaseBase): @pytest.mark.tags(CaseLabel.L0) @pytest.mark.parametrize("enable_partition_key", [True, False]) @pytest.mark.parametrize("enable_inverted_index", [True, False]) - @pytest.mark.parametrize("tokenizer", ["default"]) + @pytest.mark.parametrize("tokenizer", ["standard"]) def test_search_with_text_match_filter_normal_en( self, tokenizer, enable_inverted_index, enable_partition_key ): @@ -13302,7 +13302,7 @@ def test_search_with_text_match_filter_normal_en( expected: text match successfully and result is correct """ tokenizer_params = { - "tokenizer": tokenizer, + "analyzer":{"tokenizer": tokenizer}, } dim = 128 fields = [ diff --git a/tests/restful_client_v2/testcases/test_vector_operations.py b/tests/restful_client_v2/testcases/test_vector_operations.py index 0f6f6c640e770..030474e833f47 100644 --- a/tests/restful_client_v2/testcases/test_vector_operations.py +++ b/tests/restful_client_v2/testcases/test_vector_operations.py @@ -1881,7 +1881,7 @@ def test_search_vector_with_ignore_growing(self, ignore_growing): assert len(res) == limit - @pytest.mark.parametrize("tokenizer", ["jieba", "default"]) + @pytest.mark.parametrize("tokenizer", ["jieba", "standard"]) def test_search_vector_with_text_match_filter(self, tokenizer): """ Query a vector with a simple payload @@ -1894,7 +1894,7 @@ def test_search_vector_with_text_match_filter(self, tokenizer): # create a collection dim = 128 tokenizer_params = { - "tokenizer": tokenizer, + "analyzer":{"tokenizer": tokenizer}, } name = gen_collection_name() fields = [ @@ -2718,7 +2718,7 @@ def test_query_vector_with_large_sum_of_limit_offset(self, sum_of_limit_offset): if "like" in filter_expr: assert name.startswith(prefix) - @pytest.mark.parametrize("tokenizer", ["jieba", "default"]) + @pytest.mark.parametrize("tokenizer", ["jieba", "standard"]) def test_query_vector_with_text_match_filter(self, tokenizer): """ Query a vector with a simple payload @@ -2731,7 +2731,7 @@ def test_query_vector_with_text_match_filter(self, tokenizer): # create a collection dim = 128 tokenizer_params = { - "tokenizer": tokenizer, + "analyzer":{"tokenizer": tokenizer}, } name = gen_collection_name() fields = [ From 7c47ec191ba895b149a07f47ea50ad26ae2072f3 Mon Sep 17 00:00:00 2001 From: aoiasd Date: Sun, 3 Nov 2024 18:02:32 +0800 Subject: [PATCH 4/4] support analyzer type Signed-off-by: aoiasd --- .../tantivy-binding/src/index_reader_text.rs | 2 +- .../src/index_reader_text_c.rs | 8 +-- .../src/index_writer_text_c.rs | 8 +-- .../tantivy/tantivy-binding/src/tokenizer.rs | 70 +++++++++++++++---- .../tantivy-binding/src/tokenizer_c.rs | 8 +-- .../tantivy-binding/src/tokenizer_filter.rs | 21 ++---- .../tantivy/tantivy-binding/src/util.rs | 18 +++++ internal/core/unittest/test_c_tokenizer.cpp | 2 +- internal/core/unittest/test_text_match.cpp | 7 +- internal/util/ctokenizer/c_tokenizer_test.go | 4 +- .../ctokenizer/text_schema_validator_test.go | 4 +- 
tests/python_client/common/common_func.py | 2 +- .../testcases/test_full_text_search.py | 50 +++++++------ tests/python_client/testcases/test_query.py | 27 +++---- tests/python_client/testcases/test_search.py | 2 +- .../testcases/test_vector_operations.py | 4 +- 16 files changed, 139 insertions(+), 98 deletions(-) diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/index_reader_text.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/index_reader_text.rs index 960902410482c..ef6e2d6cb6552 100644 --- a/internal/core/thirdparty/tantivy/tantivy-binding/src/index_reader_text.rs +++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/index_reader_text.rs @@ -14,7 +14,7 @@ impl IndexReaderWrapper { let mut tokenizer = self .index .tokenizer_for_field(self.field) - .unwrap_or(standard_analyzer()) + .unwrap_or(standard_analyzer(vec![])) .clone(); let mut token_stream = tokenizer.token_stream(q); let mut terms: Vec = Vec::new(); diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/index_reader_text_c.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/index_reader_text_c.rs index e96939f236472..fc7e00e7672e7 100644 --- a/internal/core/thirdparty/tantivy/tantivy-binding/src/index_reader_text_c.rs +++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/index_reader_text_c.rs @@ -29,10 +29,8 @@ pub extern "C" fn tantivy_register_tokenizer( init_log(); let real = ptr as *mut IndexReaderWrapper; let tokenizer_name_str = unsafe { CStr::from_ptr(tokenizer_name) }; - let analyzer = unsafe { - let params = c_str_to_str(tokenizer_params).to_string(); - create_tokenizer(¶ms) - }; + let params = unsafe{c_str_to_str(tokenizer_params).to_string()}; + let analyzer = create_tokenizer(¶ms); match analyzer { Ok(text_analyzer) => unsafe { (*real).register_tokenizer( @@ -41,7 +39,7 @@ pub extern "C" fn tantivy_register_tokenizer( ); }, Err(err) => { - panic!("create tokenizer failed with error: {}", err.to_string()); + panic!("create tokenizer failed with error: {} param: {}", err.to_string(), params); }, } } diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_text_c.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_text_c.rs index d73bb8d733082..5443fe14afb88 100644 --- a/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_text_c.rs +++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_text_c.rs @@ -22,10 +22,8 @@ pub extern "C" fn tantivy_create_text_writer( let field_name_str = unsafe { CStr::from_ptr(field_name).to_str().unwrap() }; let path_str = unsafe { CStr::from_ptr(path).to_str().unwrap() }; let tokenizer_name_str = unsafe { CStr::from_ptr(tokenizer_name).to_str().unwrap() }; - let analyzer = unsafe { - let params = c_str_to_str(tokenizer_params).to_string(); - create_tokenizer(¶ms) - }; + let params = unsafe{c_str_to_str(tokenizer_params).to_string()}; + let analyzer = create_tokenizer(¶ms); match analyzer { Ok(text_analyzer) => { let wrapper = IndexWriterWrapper::create_text_writer( @@ -40,7 +38,7 @@ pub extern "C" fn tantivy_create_text_writer( create_binding(wrapper) } Err(err) => { - log::warn!("create tokenizer failed with error: {} param: {}", err.to_string(), unsafe{c_str_to_str(tokenizer_params).to_string()}); + log::warn!("create tokenizer failed with error: {} param: {}", err.to_string(), params); std::ptr::null_mut() }, } diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/tokenizer.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/tokenizer.rs index 
e4b8314366c6d..d831c9d918c6f 100644 --- a/internal/core/thirdparty/tantivy/tantivy-binding/src/tokenizer.rs +++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/tokenizer.rs @@ -5,9 +5,20 @@ use serde_json as json; use crate::tokenizer_filter::*; use crate::error::TantivyError; +use crate::util::*; -pub(crate) fn standard_analyzer() -> TextAnalyzer { - standard_builder().build() + +// default build-in analyzer +pub(crate) fn standard_analyzer(stop_words: Vec) -> TextAnalyzer { + let builder = standard_builder() + .filter(LowerCaser) + .filter(RemoveLongFilter::limit(40)); + + if stop_words.len() > 0{ + return builder.filter(StopWordFilter::remove(stop_words)).build(); + } + + builder.build() } fn standard_builder() -> TextAnalyzerBuilder{ @@ -122,14 +133,44 @@ impl AnalyzerBuilder<'_>{ } builder = builder.filter_dynamic(RemoveLongFilter::limit(value.as_u64().unwrap() as usize)); } - other => return Err(format!("unknown key of tokenizer option: {}", other).into()), + other => return Err(format!("unknown analyzer option key: {}", other).into()), } } Ok(builder) } + fn build_template(self, type_: &str)-> Result{ + match type_{ + "standard" => { + let value = self.params.get("stop_words"); + match value{ + Some(value)=>{ + let str_list = get_string_list(value, "filter stop_words")?; + Ok(standard_analyzer(str_list)) + } + None => Ok(standard_analyzer(vec![])) + } + }, + other_ => Err(format!("unknown build-in analyzer type: {}", other_).into()) + } + } + fn build(mut self) -> Result{ - let tokenizer_name = self.get_tokenizer_name()?; + // build base build-in analyzer + match self.params.get("type"){ + Some(type_) =>{ + if !type_.is_string(){ + return Err(format!("analyzer type shoud be string").into()) + } + return self.build_template(type_.as_str().unwrap()); + }, + None => {} + }; + + //build custom analyzer + let tokenizer_name = self.get_tokenizer_name()?; + + // jieba analyzer can't add filter. 
if tokenizer_name == "jieba"{ return Ok(tantivy_jieba::JiebaTokenizer{}.into()); } @@ -142,15 +183,11 @@ impl AnalyzerBuilder<'_>{ } } -pub(crate) fn create_tokenizer(params: &String) -> Result { - if params.len()==0{ - return Ok(standard_analyzer()); - } - +pub(crate) fn create_tokenizer_with_filter(params: &String) -> Result { match json::from_str::(¶ms){ Ok(value) =>{ if value.is_null(){ - return Ok(standard_analyzer()); + return Ok(standard_analyzer(vec![])); } if !value.is_object(){ return Err("tokenizer params should be a json map".into()); @@ -160,7 +197,7 @@ pub(crate) fn create_tokenizer(params: &String) -> Result Result Result { + if params.len()==0{ + return Ok(standard_analyzer(vec![])); + } + create_tokenizer_with_filter(&format!("{{\"analyzer\":{}}}", params)) +} + #[cfg(test)] mod tests { use crate::tokenizer::create_tokenizer; #[test] fn test_create_tokenizer() { - let params = r#"{"analyzer": {"tokenizer": "standard"}}"#; + let params = r#"{"tokenizer": "standard"}"#; let tokenizer = create_tokenizer(¶ms.to_string()); assert!(tokenizer.is_ok()); @@ -194,7 +238,7 @@ mod tests { #[test] fn test_jieba_tokenizer() { - let params = r#"{"analyzer": {"tokenizer": "jieba"}}"#; + let params = r#"{"tokenizer": "jieba"}"#; let tokenizer = create_tokenizer(¶ms.to_string()); assert!(tokenizer.is_ok()); diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/tokenizer_c.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/tokenizer_c.rs index c7a7a79e2b6c3..86449699d5ab1 100644 --- a/internal/core/thirdparty/tantivy/tantivy-binding/src/tokenizer_c.rs +++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/tokenizer_c.rs @@ -11,14 +11,12 @@ use crate::{ #[no_mangle] pub extern "C" fn tantivy_create_tokenizer(tokenizer_params: *const c_char) -> *mut c_void { init_log(); - let analyzer = unsafe { - let params = c_str_to_str(tokenizer_params).to_string(); - create_tokenizer(¶ms) - }; + let params = unsafe{c_str_to_str(tokenizer_params).to_string()}; + let analyzer = create_tokenizer(¶ms); match analyzer { Ok(text_analyzer) => create_binding(text_analyzer), Err(err) => { - log::warn!("create tokenizer failed with error: {}", err.to_string()); + log::warn!("create tokenizer failed with error: {} param: {}", err.to_string(), params); std::ptr::null_mut() }, } diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/tokenizer_filter.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/tokenizer_filter.rs index dcfe9e7541c51..41dcbda8f210e 100644 --- a/internal/core/thirdparty/tantivy/tantivy-binding/src/tokenizer_filter.rs +++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/tokenizer_filter.rs @@ -2,6 +2,7 @@ use tantivy::tokenizer::*; use serde_json as json; use crate::error::TantivyError; +use crate::util::*; pub(crate) enum SystemFilter{ Invalid, @@ -44,20 +45,12 @@ fn get_length_filter(params: &json::Map) -> Result)-> Result{ +fn get_stop_words_filter(params: &json::Map)-> Result{ let value = params.get("stop_words"); - if value.is_none() || !value.unwrap().is_array(){ - return Err("stop_words should be array".into()) + if value.is_none(){ + return Err("stop filter stop_words can't be empty".into()); } - - let stop_words= value.unwrap().as_array().unwrap(); - let mut str_list = Vec::::new(); - for element in stop_words{ - match element.as_str(){ - Some(word) => str_list.push(word.to_string()), - None => return Err("stop words item should be string".into()) - } - }; + let str_list = get_string_list(value.unwrap(), "stop_words filter")?; 
Ok(SystemFilter::Stop(StopWordFilter::remove(str_list))) } @@ -67,7 +60,7 @@ fn get_decompounder_filter(params: &json::Map)-> Result::new(); for element in stop_words{ match element.as_str(){ @@ -149,7 +142,7 @@ impl TryFrom<&json::Map> for SystemFilter { match value.as_str().unwrap(){ "length" => get_length_filter(params), - "stop" => get_stop_filter(params), + "stop" => get_stop_words_filter(params), "decompounder" => get_decompounder_filter(params), "stemmer" => get_stemmer_filter(params), other=> Err(format!("unsupport filter type: {}", other).into()), diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/util.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/util.rs index 1f1c1655c1032..e705b5df072b1 100644 --- a/internal/core/thirdparty/tantivy/tantivy-binding/src/util.rs +++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/util.rs @@ -1,5 +1,7 @@ use std::ffi::c_void; use std::ops::Bound; +use serde_json as json; +use crate::error::TantivyError; use tantivy::{directory::MmapDirectory, Index}; @@ -28,3 +30,19 @@ pub fn free_binding(ptr: *mut c_void) { drop(Box::from_raw(real)); } } + +pub(crate) fn get_string_list(value: &json::Value, label: &str) -> Result, TantivyError>{ + if !value.is_array(){ + return Err(format!("{} should be array", label).into()) + } + + let stop_words = value.as_array().unwrap(); + let mut str_list = Vec::::new(); + for element in stop_words{ + match element.as_str(){ + Some(word) => str_list.push(word.to_string()), + None => return Err(format!("{} list item should be string", label).into()) + } + }; + Ok(str_list) +} \ No newline at end of file diff --git a/internal/core/unittest/test_c_tokenizer.cpp b/internal/core/unittest/test_c_tokenizer.cpp index fd836983a662e..ac92370eaaaff 100644 --- a/internal/core/unittest/test_c_tokenizer.cpp +++ b/internal/core/unittest/test_c_tokenizer.cpp @@ -47,7 +47,7 @@ set_cmap(CMap m, const std::string& key, const std::string& value) { } TEST(CTokenizer, Default) { - auto tokenizer_params = R"({"analyzer":{"tokenizer": "standard"}})"; + auto tokenizer_params = R"({"tokenizer": "standard"})"; CTokenizer tokenizer; { auto status = create_tokenizer(tokenizer_params, &tokenizer); diff --git a/internal/core/unittest/test_text_match.cpp b/internal/core/unittest/test_text_match.cpp index dfdc9eaa9d175..574ebadb354ce 100644 --- a/internal/core/unittest/test_text_match.cpp +++ b/internal/core/unittest/test_text_match.cpp @@ -84,8 +84,7 @@ TEST(ParseTokenizerParams, NoTokenizerParams) { } TEST(ParseTokenizerParams, Default) { - TypeParams params{ - {"tokenizer_params", R"({"analyzer":{"tokenizer": "standard"}})"}}; + TypeParams params{{"tokenizer_params", R"({"tokenizer": "standard"})"}}; auto p = ParseTokenizerParams(params); ASSERT_EQ(params.at("tokenizer_params"), p); } @@ -250,7 +249,7 @@ TEST(TextMatch, GrowingJieBa) { auto schema = GenTestSchema({ {"enable_match", "true"}, {"enable_tokenizer", "true"}, - {"tokenizer_params", R"({"analyzer":{"tokenizer": "jieba"}})"}, + {"tokenizer_params", R"({"tokenizer": "jieba"})"}, }); auto seg = CreateGrowingSegment(schema, empty_index_meta); std::vector raw_str = {"青铜时代", "黄金时代"}; @@ -327,7 +326,7 @@ TEST(TextMatch, SealedJieBa) { auto schema = GenTestSchema({ {"enable_match", "true"}, {"enable_tokenizer", "true"}, - {"tokenizer_params", R"({"analyzer":{"tokenizer": "jieba"}})"}, + {"tokenizer_params", R"({"tokenizer": "jieba"})"}, }); auto seg = CreateSealedSegment(schema, empty_index_meta); std::vector raw_str = {"青铜时代", "黄金时代"}; diff --git 
a/internal/util/ctokenizer/c_tokenizer_test.go b/internal/util/ctokenizer/c_tokenizer_test.go
index 9f8361d53a735..f15b032a22e27 100644
--- a/internal/util/ctokenizer/c_tokenizer_test.go
+++ b/internal/util/ctokenizer/c_tokenizer_test.go
@@ -10,7 +10,7 @@ import (
 func TestTokenizer(t *testing.T) {
 	// default tokenizer.
 	{
-		m := "{\"analyzer\":{\"tokenizer\": \"standard\"}}"
+		m := "{\"tokenizer\": \"standard\"}"
 		tokenizer, err := NewTokenizer(m)
 		assert.NoError(t, err)
 		defer tokenizer.Destroy()
@@ -24,7 +24,7 @@ func TestTokenizer(t *testing.T) {
 
 	// jieba tokenizer.
 	{
-		m := "{\"analyzer\":{\"tokenizer\": \"jieba\"}}"
+		m := "{\"tokenizer\": \"jieba\"}"
 		tokenizer, err := NewTokenizer(m)
 		assert.NoError(t, err)
 		defer tokenizer.Destroy()
diff --git a/internal/util/ctokenizer/text_schema_validator_test.go b/internal/util/ctokenizer/text_schema_validator_test.go
index dd5bc78ce9800..9c202a849a7fa 100644
--- a/internal/util/ctokenizer/text_schema_validator_test.go
+++ b/internal/util/ctokenizer/text_schema_validator_test.go
@@ -33,7 +33,7 @@ func TestValidateTextSchema(t *testing.T) {
 			DataType: schemapb.DataType_VarChar,
 			TypeParams: []*commonpb.KeyValuePair{
 				{Key: "enable_match", Value: "true"},
-				{Key: "tokenizer_params", Value: `{"analyzer":{"tokenizer": "standard"}}`},
+				{Key: "tokenizer_params", Value: `{"tokenizer": "standard"}`},
 			},
 		},
 		{
@@ -41,7 +41,7 @@ func TestValidateTextSchema(t *testing.T) {
 			DataType: schemapb.DataType_VarChar,
 			TypeParams: []*commonpb.KeyValuePair{
 				{Key: "enable_match", Value: "true"},
-				{Key: "tokenizer_params", Value: `{"analyzer":{"tokenizer": "standard"}}`},
+				{Key: "tokenizer_params", Value: `{"tokenizer": "standard"}`},
 			},
 		},
 	}
diff --git a/tests/python_client/common/common_func.py b/tests/python_client/common/common_func.py
index 0dc8c47e124eb..365bfe5f3599d 100644
--- a/tests/python_client/common/common_func.py
+++ b/tests/python_client/common/common_func.py
@@ -778,7 +778,7 @@ def gen_default_collection_schema(description=ct.default_desc, primary_field=ct.
 def gen_all_datatype_collection_schema(description=ct.default_desc, primary_field=ct.default_int64_field_name,
                                        auto_id=False, dim=ct.default_dim, enable_dynamic_field=True, **kwargs):
     tokenizer_params = {
-        "analyzer":{"tokenizer": "standard"},
+        "tokenizer": "standard",
     }
     fields = [
         gen_int64_field(),
diff --git a/tests/python_client/testcases/test_full_text_search.py b/tests/python_client/testcases/test_full_text_search.py
index 3716616fc9104..e7d674bb37591 100644
--- a/tests/python_client/testcases/test_full_text_search.py
+++ b/tests/python_client/testcases/test_full_text_search.py
@@ -41,7 +41,7 @@ def test_create_collection_for_full_text_search(self, tokenizer):
         expected: create collection successfully
         """
         tokenizer_params = {
-            "analyzer":{"tokenizer": tokenizer},
+            "tokenizer": tokenizer,
         }
         dim = 128
         fields = [
@@ -105,7 +105,7 @@ def test_create_collection_for_full_text_search_twice_with_same_schema(self, tok
         expected: create collection successfully and create again successfully
         """
         tokenizer_params = {
-            "analyzer":{"tokenizer": tokenizer},
+            "tokenizer": tokenizer,
         }
         dim = 128
         fields = [
@@ -183,7 +183,7 @@ def test_create_collection_for_full_text_search_with_unsupported_tokenizer(self,
         expected: create collection failed
         """
         tokenizer_params = {
-            "analyzer":{"tokenizer": tokenizer},
+            "tokenizer": tokenizer,
         }
         dim = 128
         fields = [
@@ -249,7 +249,7 @@ def test_create_collection_for_full_text_search_with_invalid_input_output(self,
         expected: create collection failed
         """
         tokenizer_params = {
-            "analyzer":{"tokenizer": "standard"},
+            "tokenizer": "standard",
         }
         dim = 128
         fields = [
@@ -327,7 +327,7 @@ def test_create_collection_for_full_text_search_with_field_not_tokenized(self):
         expected: create collection failed
         """
         tokenizer_params = {
-            "analyzer":{"tokenizer": "standard"},
+            "tokenizer": "standard",
         }
         dim = 128
         fields = [
@@ -406,7 +406,7 @@ def test_insert_for_full_text_search_default(self, tokenizer, text_lang, nullabl
         expected: insert successfully and count is correct
         """
         tokenizer_params = {
-            "analyzer":{"tokenizer": tokenizer},
+            "tokenizer": tokenizer,
         }
         dim = 128
         fields = [
@@ -552,7 +552,7 @@ def test_insert_for_full_text_search_enable_dynamic_field(self, tokenizer, text_
         expected: insert successfully and count is correct
         """
         tokenizer_params = {
-            "analyzer":{"tokenizer": tokenizer},
+            "tokenizer": tokenizer,
         }
         dim = 128
         fields = [
@@ -700,7 +700,7 @@ def test_insert_for_full_text_search_with_dataframe(self, tokenizer, text_lang,
         2. query count and verify the result
         """
         tokenizer_params = {
-            "analyzer":{"tokenizer": tokenizer},
+            "tokenizer": tokenizer,
         }
         dim = 128
         fields = [
@@ -842,7 +842,7 @@ def test_insert_for_full_text_search_with_part_of_empty_string(self, tokenizer):
 
         """
         tokenizer_params = {
-            "analyzer":{"tokenizer": tokenizer},
+            "tokenizer": tokenizer,
         }
         dim = 128
         fields = [
@@ -999,7 +999,7 @@ def test_insert_with_full_text_search_with_non_varchar_data(self, tokenizer, nul
 
         """
         tokenizer_params = {
-            "analyzer":{"tokenizer": tokenizer},
+            "tokenizer": tokenizer,
         }
         dim = 128
         fields = [
@@ -1100,7 +1100,7 @@ def test_upsert_for_full_text_search(self, tokenizer, nullable):
         expected: upsert successfully and data is updated
         """
         tokenizer_params = {
-            "analyzer":{"tokenizer": tokenizer},
+            "tokenizer": tokenizer,
         }
         dim = 128
         fields = [
@@ -1270,7 +1270,7 @@ def test_upsert_for_full_text_search_with_no_varchar_data(self, tokenizer, nulla
         expected: upsert failed
         """
         tokenizer_params = {
-            "analyzer":{"tokenizer": tokenizer},
+            "tokenizer": tokenizer,
         }
         dim = 128
         fields = [
@@ -1412,7 +1412,7 @@ def test_delete_for_full_text_search(self, tokenizer):
         expected: delete successfully and data is deleted
         """
         tokenizer_params = {
-            "analyzer":{"tokenizer": tokenizer},
+            "tokenizer": tokenizer,
         }
         dim = 128
         fields = [
@@ -1576,7 +1576,7 @@ def test_create_index_for_full_text_search_default(
         expected: create index successfully and index info is correct
         """
         tokenizer_params = {
-            "analyzer":{"tokenizer": tokenizer},
+            "tokenizer": tokenizer,
         }
         empty_percent = 0.0
         dim = 128
@@ -1699,7 +1699,7 @@ def test_create_full_text_search_with_invalid_index_type(
         expected: create index failed
         """
         tokenizer_params = {
-            "analyzer":{"tokenizer": tokenizer},
+            "tokenizer": tokenizer,
         }
         empty_percent = 0.0
         dim = 128
@@ -1807,7 +1807,7 @@ def test_create_full_text_search_index_with_invalid_metric_type(
         expected: create index failed
         """
         tokenizer_params = {
-            "analyzer":{"tokenizer": tokenizer},
+            "tokenizer": tokenizer,
         }
         empty_percent = 0.0
         dim = 128
@@ -1915,7 +1915,7 @@ def test_create_index_using_bm25_metric_type_for_non_bm25_output_field(
         expected: create index failed
         """
         tokenizer_params = {
-            "analyzer":{"tokenizer": tokenizer},
+            "tokenizer": tokenizer,
         }
         empty_percent = 0.0
         dim = 128
@@ -2011,7 +2011,7 @@ def test_create_full_text_search_with_invalid_bm25_params(
         expected: create index failed
         """
         tokenizer_params = {
-            "analyzer":{"tokenizer": tokenizer},
+            "tokenizer": tokenizer,
         }
         empty_percent = 0.0
         dim = 128
@@ -2134,7 +2134,7 @@ def test_full_text_search_default(
         expected: full text search successfully and result is correct
         """
         tokenizer_params = {
-            "analyzer":{"tokenizer": tokenizer},
+            "tokenizer": tokenizer,
         }
         dim = 128
         fields = [
@@ -2328,9 +2328,7 @@ def test_full_text_search_with_jieba_tokenizer(
         expected: full text search successfully and result is correct
         """
         tokenizer_params = {
-            "analyzer":{
             "tokenizer": tokenizer,
-            }
         }
         dim = 128
         fields = [
@@ -2524,7 +2522,7 @@ def test_full_text_search_with_range_search(
         expected: full text search successfully and result is correct
         """
         tokenizer_params = {
-            "analyzer":{"tokenizer": tokenizer},
+            "tokenizer": tokenizer,
         }
         dim = 128
         fields = [
@@ -2689,7 +2687,7 @@ def test_full_text_search_with_search_iterator(
         expected: full text search successfully and result is correct
         """
         tokenizer_params = {
-            "analyzer":{"tokenizer": tokenizer},
+            "tokenizer": tokenizer,
         }
         dim = 128
         fields = [
@@ -2843,7 +2841,7 @@ def test_search_for_full_text_search_with_empty_string_search_data(
         expected: full text search successfully but result is empty
         """
         tokenizer_params = {
-            "analyzer":{"tokenizer": tokenizer},
+            "tokenizer": tokenizer,
         }
         dim = 128
         fields = [
@@ -2972,7 +2970,7 @@ def test_search_for_full_text_search_with_invalid_search_data(
         expected: full text search failed and return error
         """
         tokenizer_params = {
-            "analyzer":{"tokenizer": tokenizer},
+            "tokenizer": tokenizer,
         }
         dim = 128
         fields = [
@@ -3119,7 +3117,7 @@ def test_hybrid_search_with_full_text_search(
         expected: hybrid search successfully and result is correct
         """
         tokenizer_params = {
-            "analyzer":{"tokenizer": tokenizer},
+            "tokenizer": tokenizer,
         }
         dim = 128
         fields = [
diff --git a/tests/python_client/testcases/test_query.py b/tests/python_client/testcases/test_query.py
index 5acddf0fb4d9e..03ff8a7578b7b 100644
--- a/tests/python_client/testcases/test_query.py
+++ b/tests/python_client/testcases/test_query.py
@@ -4453,7 +4453,7 @@ def test_query_text_match_en_normal(
         expected: text match successfully and result is correct
         """
         tokenizer_params = {
-            "analyzer":{"tokenizer": tokenizer},
+            "tokenizer": tokenizer,
         }
         dim = 128
         fields = [
@@ -4724,21 +4724,16 @@ def test_query_text_match_custom_analyzer(self):
         expected: get the correct token, text match successfully and result is correct
         """
         tokenizer_params = {
-            "analyzer": {
             "tokenizer": "standard",
             # "lowercase", "asciifolding", "alphanumonly" was system filter
-            "filter":["lowercase", "asciifolding", "alphanumonly", "my_stop_filter", "my_stemmer"],
-            },
-            "filter": {
-            "my_stop_filter":{
+            "filter":["lowercase", "asciifolding", "alphanumonly",
+            {
                 "type": "stop",
                 "stop_words": ["in", "of"],
-            },
-            "my_stemmer":{
+            }, {
                 "type": "stemmer",
                 "language": "english",
-            }
-            }
+            }],
         }
         dim = 128
         fields = [
@@ -4849,7 +4844,7 @@ def test_query_text_match_with_combined_expression_for_single_field(self):
         expected: query successfully and result is correct
         """
         tokenizer_params = {
-            "analyzer":{"tokenizer": "standard"},
+            "tokenizer": "standard",
         }
         # 1. initialize with data
         dim = 128
@@ -4963,7 +4958,7 @@ def test_query_text_match_with_combined_expression_for_multi_field(self):
         expected: query successfully and result is correct
         """
         tokenizer_params = {
-            "analyzer":{"tokenizer": "standard"},
+            "tokenizer": "standard",
         }
         # 1. initialize with data
         dim = 128
@@ -5106,7 +5101,7 @@ def test_query_text_match_with_multi_lang(self):
 
         # 1. initialize with data
         tokenizer_params = {
-            "analyzer":{"tokenizer": "standard"},
+            "tokenizer": "standard",
         }
         # 1. initialize with data
         dim = 128
@@ -5251,7 +5246,7 @@ def test_query_text_match_with_addition_inverted_index(self):
         # 1. initialize with data
         fake_en = Faker("en_US")
         tokenizer_params = {
-            "analyzer":{"tokenizer": "standard"},
+            "tokenizer": "standard",
         }
         dim = 128
         default_fields = [
@@ -5478,7 +5473,7 @@ def test_query_text_match_with_some_empty_string(self):
         """
         # 1. initialize with data
         tokenizer_params = {
-            "analyzer":{"tokenizer": "standard"},
+            "tokenizer": "standard",
         }
         # 1. initialize with data
         dim = 128
@@ -5737,7 +5732,7 @@ def test_query_text_match_with_unsupported_tokenizer(self):
         expected: create collection failed and return error
         """
         tokenizer_params = {
-            "analyzer":{"tokenizer": "Unsupported"},
+            "tokenizer": "Unsupported",
         }
         dim = 128
         default_fields = [
diff --git a/tests/python_client/testcases/test_search.py b/tests/python_client/testcases/test_search.py
index 025d8180b18fa..a42d5c44958b8 100644
--- a/tests/python_client/testcases/test_search.py
+++ b/tests/python_client/testcases/test_search.py
@@ -13302,7 +13302,7 @@ def test_search_with_text_match_filter_normal_en(
         expected: text match successfully and result is correct
         """
         tokenizer_params = {
-            "analyzer":{"tokenizer": tokenizer},
+            "tokenizer": tokenizer,
         }
         dim = 128
         fields = [
diff --git a/tests/restful_client_v2/testcases/test_vector_operations.py b/tests/restful_client_v2/testcases/test_vector_operations.py
index 030474e833f47..b6fc35f6855b9 100644
--- a/tests/restful_client_v2/testcases/test_vector_operations.py
+++ b/tests/restful_client_v2/testcases/test_vector_operations.py
@@ -1894,7 +1894,7 @@ def test_search_vector_with_text_match_filter(self, tokenizer):
         # create a collection
         dim = 128
         tokenizer_params = {
-            "analyzer":{"tokenizer": tokenizer},
+            "tokenizer": tokenizer,
         }
         name = gen_collection_name()
         fields = [
@@ -2731,7 +2731,7 @@ def test_query_vector_with_text_match_filter(self, tokenizer):
         # create a collection
         dim = 128
         tokenizer_params = {
-            "analyzer":{"tokenizer": tokenizer},
+            "tokenizer": tokenizer,
         }
         name = gen_collection_name()
         fields = [
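Taken together, the test updates above exercise a single schema change: tokenizer_params is no longer nested under an "analyzer" key with a separate map of named filters, but is passed as one flat dict whose "filter" list mixes built-in filter names with inline filter definitions. The sketch below is distilled from the test_query_text_match_custom_analyzer hunk above; it is an illustration only (the variable names are not part of the patch, and the filter list is shortened for brevity).

# Old shape (removed by this patch): analyzer settings nested under "analyzer",
# with custom filters declared by name in a separate top-level "filter" map.
old_tokenizer_params = {
    "analyzer": {
        "tokenizer": "standard",
        "filter": ["lowercase", "my_stop_filter"],
    },
    "filter": {
        "my_stop_filter": {"type": "stop", "stop_words": ["in", "of"]},
    },
}

# New shape (used throughout the updated tests): one flat dict; custom filters
# are embedded directly in the "filter" list as anonymous objects.
new_tokenizer_params = {
    "tokenizer": "standard",
    "filter": [
        "lowercase",
        {"type": "stop", "stop_words": ["in", "of"]},
    ],
}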