From 25e8098eff1eda45d2bd4df996b4feaca0a14fdf Mon Sep 17 00:00:00 2001
From: aoiasd
Date: Sun, 3 Nov 2024 18:02:32 +0800
Subject: [PATCH] support analyzer type

Signed-off-by: aoiasd
---
 .../tantivy-binding/src/index_reader_text.rs  |  2 +-
 .../src/index_reader_text_c.rs                |  8 +--
 .../src/index_writer_text_c.rs                |  8 +--
 .../tantivy/tantivy-binding/src/tokenizer.rs  | 70 +++++++++++++++----
 .../tantivy-binding/src/tokenizer_c.rs        |  8 +--
 .../tantivy-binding/src/tokenizer_filter.rs   | 15 ++--
 .../tantivy/tantivy-binding/src/util.rs       | 18 +++++
 tests/python_client/common/common_func.py     |  2 +-
 .../testcases/test_full_text_search.py        | 50 +++++++------
 tests/python_client/testcases/test_query.py   | 27 +++----
 tests/python_client/testcases/test_search.py  |  2 +-
 .../testcases/test_vector_operations.py       |  4 +-
 12 files changed, 128 insertions(+), 86 deletions(-)

diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/index_reader_text.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/index_reader_text.rs
index 960902410482c..ef6e2d6cb6552 100644
--- a/internal/core/thirdparty/tantivy/tantivy-binding/src/index_reader_text.rs
+++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/index_reader_text.rs
@@ -14,7 +14,7 @@ impl IndexReaderWrapper {
         let mut tokenizer = self
             .index
             .tokenizer_for_field(self.field)
-            .unwrap_or(standard_analyzer())
+            .unwrap_or(standard_analyzer(vec![]))
             .clone();
         let mut token_stream = tokenizer.token_stream(q);
         let mut terms: Vec<String> = Vec::new();
diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/index_reader_text_c.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/index_reader_text_c.rs
index e96939f236472..fc7e00e7672e7 100644
--- a/internal/core/thirdparty/tantivy/tantivy-binding/src/index_reader_text_c.rs
+++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/index_reader_text_c.rs
@@ -29,10 +29,8 @@ pub extern "C" fn tantivy_register_tokenizer(
     init_log();
     let real = ptr as *mut IndexReaderWrapper;
     let tokenizer_name_str = unsafe { CStr::from_ptr(tokenizer_name) };
-    let analyzer = unsafe {
-        let params = c_str_to_str(tokenizer_params).to_string();
-        create_tokenizer(&params)
-    };
+    let params = unsafe{c_str_to_str(tokenizer_params).to_string()};
+    let analyzer = create_tokenizer(&params);
     match analyzer {
         Ok(text_analyzer) => unsafe {
             (*real).register_tokenizer(
@@ -41,7 +39,7 @@ pub extern "C" fn tantivy_register_tokenizer(
             );
         },
         Err(err) => {
-            panic!("create tokenizer failed with error: {}", err.to_string());
+            panic!("create tokenizer failed with error: {} param: {}", err.to_string(), params);
         },
     }
 }
diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_text_c.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_text_c.rs
index d73bb8d733082..5443fe14afb88 100644
--- a/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_text_c.rs
+++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_text_c.rs
@@ -22,10 +22,8 @@ pub extern "C" fn tantivy_create_text_writer(
     let field_name_str = unsafe { CStr::from_ptr(field_name).to_str().unwrap() };
     let path_str = unsafe { CStr::from_ptr(path).to_str().unwrap() };
     let tokenizer_name_str = unsafe { CStr::from_ptr(tokenizer_name).to_str().unwrap() };
-    let analyzer = unsafe {
-        let params = c_str_to_str(tokenizer_params).to_string();
-        create_tokenizer(&params)
-    };
+    let params = unsafe{c_str_to_str(tokenizer_params).to_string()};
+    let analyzer = create_tokenizer(&params);
     match analyzer {
         Ok(text_analyzer) => {
             let wrapper = IndexWriterWrapper::create_text_writer(
@@ -40,7 +38,7 @@ pub extern "C" fn tantivy_create_text_writer(
             create_binding(wrapper)
         }
         Err(err) => {
-            log::warn!("create tokenizer failed with error: {} param: {}", err.to_string(), unsafe{c_str_to_str(tokenizer_params).to_string()});
+            log::warn!("create tokenizer failed with error: {} param: {}", err.to_string(), params);
             std::ptr::null_mut()
         },
     }
diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/tokenizer.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/tokenizer.rs
index e4b8314366c6d..c55381d27d118 100644
--- a/internal/core/thirdparty/tantivy/tantivy-binding/src/tokenizer.rs
+++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/tokenizer.rs
@@ -5,9 +5,20 @@
 use serde_json as json;
 use crate::tokenizer_filter::*;
 use crate::error::TantivyError;
+use crate::util::get_stop_words_list;
 
-pub(crate) fn standard_analyzer() -> TextAnalyzer {
-    standard_builder().build()
+
+// default built-in analyzer
+pub(crate) fn standard_analyzer(stop_words: Vec<String>) -> TextAnalyzer {
+    let builder = standard_builder()
+        .filter(LowerCaser)
+        .filter(RemoveLongFilter::limit(40));
+
+    if stop_words.len() > 0{
+        return builder.filter(StopWordFilter::remove(stop_words)).build();
+    }
+
+    builder.build()
 }
 
 fn standard_builder() -> TextAnalyzerBuilder{
@@ -122,14 +133,44 @@ impl AnalyzerBuilder<'_>{
                 }
                 builder = builder.filter_dynamic(RemoveLongFilter::limit(value.as_u64().unwrap() as usize));
             }
-            other => return Err(format!("unknown key of tokenizer option: {}", other).into()),
+            other => return Err(format!("unknown analyzer option key: {}", other).into()),
         }
         }
         Ok(builder)
     }
 
+    fn build_template(self, type_: &str)-> Result<TextAnalyzer, TantivyError>{
+        match type_{
+            "standard" => {
+                let value = self.params.get("stop_words");
+                match value{
+                    Some(value)=>{
+                        let str_list = get_stop_words_list(value)?;
+                        Ok(standard_analyzer(str_list))
+                    }
+                    None => Ok(standard_analyzer(vec![]))
+                }
+            },
+            other_ => Err(format!("unknown built-in analyzer type: {}", other_).into())
+        }
+    }
+
     fn build(mut self) -> Result<TextAnalyzer, TantivyError>{
-        let tokenizer_name = self.get_tokenizer_name()?;
+        // build built-in analyzer when "type" is specified
+        match self.params.get("type"){
+            Some(type_) =>{
+                if !type_.is_string(){
+                    return Err(format!("analyzer type should be string").into())
+                }
+                return self.build_template(type_.as_str().unwrap());
+            },
+            None => {}
+        };
+
+        // build custom analyzer
+        let tokenizer_name = self.get_tokenizer_name()?;
+
+        // jieba analyzer can't add filter.
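A minimal sketch of how the new built-in template path could be exercised, written in the style of the existing unit tests in tokenizer.rs. It assumes the `create_tokenizer` wrapper shown further down accepts the bare analyzer object, that the `standard` template keeps the LowerCaser and stop-word filters added above, and it uses tantivy's `TokenStream` API; the test name and sample text are illustrative only.

```rust
use crate::tokenizer::create_tokenizer;

// Hypothetical test: built-in analyzer selected via "type", with optional "stop_words".
#[test]
fn test_standard_analyzer_type_with_stop_words() {
    let params = r#"{"type": "standard", "stop_words": ["of", "the"]}"#;
    let tokenizer = create_tokenizer(&params.to_string());
    assert!(tokenizer.is_ok());

    // The standard template lower-cases tokens and drops the configured stop words.
    let mut analyzer = tokenizer.unwrap();
    let mut stream = analyzer.token_stream("The History of Information Retrieval");
    let mut tokens: Vec<String> = Vec::new();
    while stream.advance() {
        tokens.push(stream.token().text.clone());
    }
    assert!(tokens.contains(&"history".to_string()));
    assert!(!tokens.contains(&"of".to_string()));
}
```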
if tokenizer_name == "jieba"{ return Ok(tantivy_jieba::JiebaTokenizer{}.into()); } @@ -142,15 +183,11 @@ impl AnalyzerBuilder<'_>{ } } -pub(crate) fn create_tokenizer(params: &String) -> Result { - if params.len()==0{ - return Ok(standard_analyzer()); - } - +pub(crate) fn create_tokenizer_with_filter(params: &String) -> Result { match json::from_str::(¶ms){ Ok(value) =>{ if value.is_null(){ - return Ok(standard_analyzer()); + return Ok(standard_analyzer(vec![])); } if !value.is_object(){ return Err("tokenizer params should be a json map".into()); @@ -160,7 +197,7 @@ pub(crate) fn create_tokenizer(params: &String) -> Result Result Result { + if params.len()==0{ + return Ok(standard_analyzer(vec![])); + } + create_tokenizer_with_filter(&format!("{{\"analyzer\":{}}}", params)) +} + #[cfg(test)] mod tests { use crate::tokenizer::create_tokenizer; #[test] fn test_create_tokenizer() { - let params = r#"{"analyzer": {"tokenizer": "standard"}}"#; + let params = r#"{"tokenizer": "standard"}"#; let tokenizer = create_tokenizer(¶ms.to_string()); assert!(tokenizer.is_ok()); @@ -194,7 +238,7 @@ mod tests { #[test] fn test_jieba_tokenizer() { - let params = r#"{"analyzer": {"tokenizer": "jieba"}}"#; + let params = r#"{"tokenizer": "jieba"}"#; let tokenizer = create_tokenizer(¶ms.to_string()); assert!(tokenizer.is_ok()); diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/tokenizer_c.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/tokenizer_c.rs index c7a7a79e2b6c3..86449699d5ab1 100644 --- a/internal/core/thirdparty/tantivy/tantivy-binding/src/tokenizer_c.rs +++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/tokenizer_c.rs @@ -11,14 +11,12 @@ use crate::{ #[no_mangle] pub extern "C" fn tantivy_create_tokenizer(tokenizer_params: *const c_char) -> *mut c_void { init_log(); - let analyzer = unsafe { - let params = c_str_to_str(tokenizer_params).to_string(); - create_tokenizer(¶ms) - }; + let params = unsafe{c_str_to_str(tokenizer_params).to_string()}; + let analyzer = create_tokenizer(¶ms); match analyzer { Ok(text_analyzer) => create_binding(text_analyzer), Err(err) => { - log::warn!("create tokenizer failed with error: {}", err.to_string()); + log::warn!("create tokenizer failed with error: {} param: {}", err.to_string(), params); std::ptr::null_mut() }, } diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/tokenizer_filter.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/tokenizer_filter.rs index dcfe9e7541c51..6118c8504273f 100644 --- a/internal/core/thirdparty/tantivy/tantivy-binding/src/tokenizer_filter.rs +++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/tokenizer_filter.rs @@ -2,6 +2,7 @@ use tantivy::tokenizer::*; use serde_json as json; use crate::error::TantivyError; +use crate::util::*; pub(crate) enum SystemFilter{ Invalid, @@ -46,18 +47,10 @@ fn get_length_filter(params: &json::Map) -> Result)-> Result{ let value = params.get("stop_words"); - if value.is_none() || !value.unwrap().is_array(){ - return Err("stop_words should be array".into()) + if value.is_none(){ + return Err("stop filter stop_words can't be empty".into()); } - - let stop_words= value.unwrap().as_array().unwrap(); - let mut str_list = Vec::::new(); - for element in stop_words{ - match element.as_str(){ - Some(word) => str_list.push(word.to_string()), - None => return Err("stop words item should be string".into()) - } - }; + let str_list = get_stop_words_list(value.unwrap())?; Ok(SystemFilter::Stop(StopWordFilter::remove(str_list))) } diff --git 
a/internal/core/thirdparty/tantivy/tantivy-binding/src/util.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/util.rs index 1f1c1655c1032..5ffc16b82543a 100644 --- a/internal/core/thirdparty/tantivy/tantivy-binding/src/util.rs +++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/util.rs @@ -1,5 +1,7 @@ use std::ffi::c_void; use std::ops::Bound; +use serde_json as json; +use crate::error::TantivyError; use tantivy::{directory::MmapDirectory, Index}; @@ -28,3 +30,19 @@ pub fn free_binding(ptr: *mut c_void) { drop(Box::from_raw(real)); } } + +pub(crate) fn get_stop_words_list(value: &json::Value) -> Result, TantivyError>{ + if !value.is_array(){ + return Err("stop_words should be array".into()) + } + + let stop_words= value.as_array().unwrap(); + let mut str_list = Vec::::new(); + for element in stop_words{ + match element.as_str(){ + Some(word) => str_list.push(word.to_string()), + None => return Err("stop words item should be string".into()) + } + }; + Ok(str_list) +} \ No newline at end of file diff --git a/tests/python_client/common/common_func.py b/tests/python_client/common/common_func.py index 0be694809c81d..7f81fed6bfd29 100644 --- a/tests/python_client/common/common_func.py +++ b/tests/python_client/common/common_func.py @@ -704,7 +704,7 @@ def gen_default_collection_schema(description=ct.default_desc, primary_field=ct. def gen_all_datatype_collection_schema(description=ct.default_desc, primary_field=ct.default_int64_field_name, auto_id=False, dim=ct.default_dim, enable_dynamic_field=True, **kwargs): tokenizer_params = { - "analyzer":{"tokenizer": "standard"}, + "tokenizer": "standard", } fields = [ gen_int64_field(), diff --git a/tests/python_client/testcases/test_full_text_search.py b/tests/python_client/testcases/test_full_text_search.py index c5acb26c10236..11fd86765ede3 100644 --- a/tests/python_client/testcases/test_full_text_search.py +++ b/tests/python_client/testcases/test_full_text_search.py @@ -36,7 +36,7 @@ def test_create_collection_for_full_text_search(self, tokenizer): expected: create collection successfully """ tokenizer_params = { - "analyzer":{"tokenizer": tokenizer}, + "tokenizer": tokenizer, } dim = 128 fields = [ @@ -100,7 +100,7 @@ def test_create_collection_for_full_text_search_twice_with_same_schema(self, tok expected: create collection successfully and create again successfully """ tokenizer_params = { - "analyzer":{"tokenizer": tokenizer}, + "tokenizer": tokenizer, } dim = 128 fields = [ @@ -178,7 +178,7 @@ def test_create_collection_for_full_text_search_with_unsupported_tokenizer(self, expected: create collection failed """ tokenizer_params = { - "analyzer":{"tokenizer": tokenizer}, + "tokenizer": tokenizer, } dim = 128 fields = [ @@ -244,7 +244,7 @@ def test_create_collection_for_full_text_search_with_invalid_input_output(self, expected: create collection failed """ tokenizer_params = { - "analyzer":{"tokenizer": "standard"}, + "tokenizer": "standard", } dim = 128 fields = [ @@ -322,7 +322,7 @@ def test_create_collection_for_full_text_search_with_field_not_tokenized(self): expected: create collection failed """ tokenizer_params = { - "analyzer":{"tokenizer": "standard"}, + "tokenizer": "standard", } dim = 128 fields = [ @@ -401,7 +401,7 @@ def test_insert_for_full_text_search_default(self, tokenizer, text_lang, nullabl expected: insert successfully and count is correct """ tokenizer_params = { - "analyzer":{"tokenizer": tokenizer}, + "tokenizer": tokenizer, } dim = 128 fields = [ @@ -547,7 +547,7 @@ def 
test_insert_for_full_text_search_enable_dynamic_field(self, tokenizer, text_ expected: insert successfully and count is correct """ tokenizer_params = { - "analyzer":{"tokenizer": tokenizer}, + "tokenizer": tokenizer, } dim = 128 fields = [ @@ -695,7 +695,7 @@ def test_insert_for_full_text_search_with_dataframe(self, tokenizer, text_lang, 2. query count and verify the result """ tokenizer_params = { - "analyzer":{"tokenizer": tokenizer}, + "tokenizer": tokenizer, } dim = 128 fields = [ @@ -837,7 +837,7 @@ def test_insert_for_full_text_search_with_part_of_empty_string(self, tokenizer): """ tokenizer_params = { - "analyzer":{"tokenizer": tokenizer}, + "tokenizer": tokenizer, } dim = 128 fields = [ @@ -994,7 +994,7 @@ def test_insert_with_full_text_search_with_non_varchar_data(self, tokenizer, nul """ tokenizer_params = { - "analyzer":{"tokenizer": tokenizer}, + "tokenizer": tokenizer, } dim = 128 fields = [ @@ -1095,7 +1095,7 @@ def test_upsert_for_full_text_search(self, tokenizer, nullable): expected: upsert successfully and data is updated """ tokenizer_params = { - "analyzer":{"tokenizer": tokenizer}, + "tokenizer": tokenizer, } dim = 128 fields = [ @@ -1265,7 +1265,7 @@ def test_upsert_for_full_text_search_with_no_varchar_data(self, tokenizer, nulla expected: upsert failed """ tokenizer_params = { - "analyzer":{"tokenizer": tokenizer}, + "tokenizer": tokenizer, } dim = 128 fields = [ @@ -1407,7 +1407,7 @@ def test_delete_for_full_text_search(self, tokenizer): expected: delete successfully and data is deleted """ tokenizer_params = { - "analyzer":{"tokenizer": tokenizer}, + "tokenizer": tokenizer, } dim = 128 fields = [ @@ -1571,7 +1571,7 @@ def test_create_index_for_full_text_search_default( expected: create index successfully and index info is correct """ tokenizer_params = { - "analyzer":{"tokenizer": tokenizer}, + "tokenizer": tokenizer, } empty_percent = 0.0 dim = 128 @@ -1694,7 +1694,7 @@ def test_create_full_text_search_with_invalid_index_type( expected: create index failed """ tokenizer_params = { - "analyzer":{"tokenizer": tokenizer}, + "tokenizer": tokenizer, } empty_percent = 0.0 dim = 128 @@ -1802,7 +1802,7 @@ def test_create_full_text_search_index_with_invalid_metric_type( expected: create index failed """ tokenizer_params = { - "analyzer":{"tokenizer": tokenizer}, + "tokenizer": tokenizer, } empty_percent = 0.0 dim = 128 @@ -1910,7 +1910,7 @@ def test_create_index_using_bm25_metric_type_for_non_bm25_output_field( expected: create index failed """ tokenizer_params = { - "analyzer":{"tokenizer": tokenizer}, + "tokenizer": tokenizer, } empty_percent = 0.0 dim = 128 @@ -2006,7 +2006,7 @@ def test_create_full_text_search_with_invalid_bm25_params( expected: create index failed """ tokenizer_params = { - "analyzer":{"tokenizer": tokenizer}, + "tokenizer": tokenizer, } empty_percent = 0.0 dim = 128 @@ -2129,7 +2129,7 @@ def test_full_text_search_default( expected: full text search successfully and result is correct """ tokenizer_params = { - "analyzer":{"tokenizer": tokenizer}, + "tokenizer": tokenizer, } dim = 128 fields = [ @@ -2322,9 +2322,7 @@ def test_full_text_search_with_jieba_tokenizer( expected: full text search successfully and result is correct """ tokenizer_params = { - "analyzer":{ "tokenizer": tokenizer, - } } dim = 128 fields = [ @@ -2518,7 +2516,7 @@ def test_full_text_search_with_range_search( expected: full text search successfully and result is correct """ tokenizer_params = { - "analyzer":{"tokenizer": tokenizer}, + "tokenizer": tokenizer, } dim = 128 fields = [ 
@@ -2683,7 +2681,7 @@ def test_full_text_search_with_search_iterator( expected: full text search successfully and result is correct """ tokenizer_params = { - "analyzer":{"tokenizer": tokenizer}, + "tokenizer": tokenizer, } dim = 128 fields = [ @@ -2837,7 +2835,7 @@ def test_search_for_full_text_search_with_empty_string_search_data( expected: full text search successfully but result is empty """ tokenizer_params = { - "analyzer":{"tokenizer": tokenizer}, + "tokenizer": tokenizer, } dim = 128 fields = [ @@ -2966,7 +2964,7 @@ def test_search_for_full_text_search_with_invalid_search_data( expected: full text search failed and return error """ tokenizer_params = { - "analyzer":{"tokenizer": tokenizer}, + "tokenizer": tokenizer, } dim = 128 fields = [ @@ -3113,7 +3111,7 @@ def test_hybrid_search_with_full_text_search( expected: hybrid search successfully and result is correct """ tokenizer_params = { - "analyzer":{"tokenizer": tokenizer}, + "tokenizer": tokenizer, } dim = 128 fields = [ diff --git a/tests/python_client/testcases/test_query.py b/tests/python_client/testcases/test_query.py index 3696a1138c08d..c7d2015e9801f 100644 --- a/tests/python_client/testcases/test_query.py +++ b/tests/python_client/testcases/test_query.py @@ -4448,7 +4448,7 @@ def test_query_text_match_normal( expected: text match successfully and result is correct """ tokenizer_params = { - "analyzer":{"tokenizer": tokenizer}, + "tokenizer": tokenizer, } dim = 128 fields = [ @@ -4580,21 +4580,16 @@ def test_query_text_match_custom_analyzer(self): expected: get the correct token, text match successfully and result is correct """ tokenizer_params = { - "analyzer": { "tokenizer": "standard", # "lowercase", "asciifolding", "alphanumonly" was system filter - "filter":["lowercase", "asciifolding", "alphanumonly", "my_stop_filter", "my_stemmer"], - }, - "filter": { - "my_stop_filter":{ + "filter":["lowercase", "asciifolding", "alphanumonly", + { "type": "stop", "stop_words": ["in", "of"], - }, - "my_stemmer":{ + }, { "type": "stemmer", "language": "english", - } - } + }], } dim = 128 fields = [ @@ -4705,7 +4700,7 @@ def test_query_text_match_with_combined_expression_for_single_field(self): expected: query successfully and result is correct """ tokenizer_params = { - "analyzer":{"tokenizer": "standard"}, + "tokenizer": "standard", } # 1. initialize with data dim = 128 @@ -4819,7 +4814,7 @@ def test_query_text_match_with_combined_expression_for_multi_field(self): expected: query successfully and result is correct """ tokenizer_params = { - "analyzer":{"tokenizer": "standard"}, + "tokenizer": "standard", } # 1. initialize with data dim = 128 @@ -4962,7 +4957,7 @@ def test_query_text_match_with_multi_lang(self): # 1. initialize with data tokenizer_params = { - "analyzer":{"tokenizer": "standard"}, + "tokenizer": "standard", } # 1. initialize with data dim = 128 @@ -5107,7 +5102,7 @@ def test_query_text_match_with_addition_inverted_index(self): # 1. initialize with data fake_en = Faker("en_US") tokenizer_params = { - "analyzer":{"tokenizer": "standard"}, + "tokenizer": "standard", } dim = 128 default_fields = [ @@ -5334,7 +5329,7 @@ def test_query_text_match_with_some_empty_string(self): """ # 1. initialize with data tokenizer_params = { - "analyzer":{"tokenizer": "standard"}, + "tokenizer": "standard", } # 1. 
initialize with data dim = 128 @@ -5593,7 +5588,7 @@ def test_query_text_match_with_unsupported_tokenizer(self): expected: create collection failed and return error """ tokenizer_params = { - "analyzer":{"tokenizer": "Unsupported"}, + "tokenizer": "Unsupported", } dim = 128 default_fields = [ diff --git a/tests/python_client/testcases/test_search.py b/tests/python_client/testcases/test_search.py index 617b4fcec96cf..b7d7fb92fb649 100644 --- a/tests/python_client/testcases/test_search.py +++ b/tests/python_client/testcases/test_search.py @@ -13297,7 +13297,7 @@ def test_search_with_text_match_filter_normal( expected: text match successfully and result is correct """ tokenizer_params = { - "analyzer":{"tokenizer": tokenizer}, + "tokenizer": tokenizer, } dim = 128 fields = [ diff --git a/tests/restful_client_v2/testcases/test_vector_operations.py b/tests/restful_client_v2/testcases/test_vector_operations.py index 030474e833f47..b6fc35f6855b9 100644 --- a/tests/restful_client_v2/testcases/test_vector_operations.py +++ b/tests/restful_client_v2/testcases/test_vector_operations.py @@ -1894,7 +1894,7 @@ def test_search_vector_with_text_match_filter(self, tokenizer): # create a collection dim = 128 tokenizer_params = { - "analyzer":{"tokenizer": tokenizer}, + "tokenizer": tokenizer, } name = gen_collection_name() fields = [ @@ -2731,7 +2731,7 @@ def test_query_vector_with_text_match_filter(self, tokenizer): # create a collection dim = 128 tokenizer_params = { - "analyzer":{"tokenizer": tokenizer}, + "tokenizer": tokenizer, } name = gen_collection_name() fields = [
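The Python test changes above all follow from one contract change in the binding: `create_tokenizer` now receives the bare analyzer object and wraps it into `{"analyzer": ...}` itself, while `create_tokenizer_with_filter` keeps accepting the previously wrapped form. A hedged sketch of the two call shapes, using only functions defined in this patch (the `"filter": ["lowercase"]` entry is an illustrative system filter, not something this patch adds):

```rust
use crate::tokenizer::{create_tokenizer, create_tokenizer_with_filter};

// Hypothetical check of the two accepted parameter shapes after the rename.
#[test]
fn test_params_shapes_after_rename() {
    // New client-facing shape: the bare analyzer object, as the updated tests now pass it.
    let bare = r#"{"tokenizer": "standard", "filter": ["lowercase"]}"#.to_string();
    assert!(create_tokenizer(&bare).is_ok());

    // Previously wrapped shape: still handled by create_tokenizer_with_filter,
    // which is what create_tokenizer delegates to internally.
    let wrapped = r#"{"analyzer": {"tokenizer": "standard", "filter": ["lowercase"]}}"#.to_string();
    assert!(create_tokenizer_with_filter(&wrapped).is_ok());
}
```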