From 25e8098eff1eda45d2bd4df996b4feaca0a14fdf Mon Sep 17 00:00:00 2001
From: aoiasd
Date: Sun, 3 Nov 2024 18:02:32 +0800
Subject: [PATCH] support analyzer type

Signed-off-by: aoiasd
---
 .../tantivy-binding/src/index_reader_text.rs  |  2 +-
 .../src/index_reader_text_c.rs                |  8 +--
 .../src/index_writer_text_c.rs                |  8 +--
 .../tantivy/tantivy-binding/src/tokenizer.rs  | 70 +++++++++++++++----
 .../tantivy-binding/src/tokenizer_c.rs        |  8 +--
 .../tantivy-binding/src/tokenizer_filter.rs   | 15 ++--
 .../tantivy/tantivy-binding/src/util.rs       | 18 +++++
 tests/python_client/common/common_func.py     |  2 +-
 .../testcases/test_full_text_search.py        | 50 +++++++------
 tests/python_client/testcases/test_query.py   | 27 +++----
 tests/python_client/testcases/test_search.py  |  2 +-
 .../testcases/test_vector_operations.py       |  4 +-
 12 files changed, 128 insertions(+), 86 deletions(-)

diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/index_reader_text.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/index_reader_text.rs
index 960902410482c..ef6e2d6cb6552 100644
--- a/internal/core/thirdparty/tantivy/tantivy-binding/src/index_reader_text.rs
+++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/index_reader_text.rs
@@ -14,7 +14,7 @@ impl IndexReaderWrapper {
         let mut tokenizer = self
             .index
             .tokenizer_for_field(self.field)
-            .unwrap_or(standard_analyzer())
+            .unwrap_or(standard_analyzer(vec![]))
             .clone();
         let mut token_stream = tokenizer.token_stream(q);
         let mut terms: Vec<String> = Vec::new();
diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/index_reader_text_c.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/index_reader_text_c.rs
index e96939f236472..fc7e00e7672e7 100644
--- a/internal/core/thirdparty/tantivy/tantivy-binding/src/index_reader_text_c.rs
+++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/index_reader_text_c.rs
@@ -29,10 +29,8 @@ pub extern "C" fn tantivy_register_tokenizer(
     init_log();
     let real = ptr as *mut IndexReaderWrapper;
     let tokenizer_name_str = unsafe { CStr::from_ptr(tokenizer_name) };
-    let analyzer = unsafe {
-        let params = c_str_to_str(tokenizer_params).to_string();
-        create_tokenizer(&params)
-    };
+    let params = unsafe{c_str_to_str(tokenizer_params).to_string()};
+    let analyzer = create_tokenizer(&params);
     match analyzer {
         Ok(text_analyzer) => unsafe {
             (*real).register_tokenizer(
@@ -41,7 +39,7 @@ pub extern "C" fn tantivy_register_tokenizer(
             );
         },
         Err(err) => {
-            panic!("create tokenizer failed with error: {}", err.to_string());
+            panic!("create tokenizer failed with error: {} param: {}", err.to_string(), params);
         },
     }
 }
diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_text_c.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_text_c.rs
index d73bb8d733082..5443fe14afb88 100644
--- a/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_text_c.rs
+++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_text_c.rs
@@ -22,10 +22,8 @@ pub extern "C" fn tantivy_create_text_writer(
     let field_name_str = unsafe { CStr::from_ptr(field_name).to_str().unwrap() };
     let path_str = unsafe { CStr::from_ptr(path).to_str().unwrap() };
     let tokenizer_name_str = unsafe { CStr::from_ptr(tokenizer_name).to_str().unwrap() };
-    let analyzer = unsafe {
-        let params = c_str_to_str(tokenizer_params).to_string();
-        create_tokenizer(&params)
-    };
+    let params = unsafe{c_str_to_str(tokenizer_params).to_string()};
+    let analyzer = create_tokenizer(&params);
     match analyzer {
         Ok(text_analyzer) => {
             let wrapper = IndexWriterWrapper::create_text_writer(
@@ -40,7 +38,7 @@ pub extern "C" fn tantivy_create_text_writer(
             create_binding(wrapper)
         }
         Err(err) => {
-            log::warn!("create tokenizer failed with error: {} param: {}", err.to_string(), unsafe{c_str_to_str(tokenizer_params).to_string()});
+            log::warn!("create tokenizer failed with error: {} param: {}", err.to_string(), params);
             std::ptr::null_mut()
         },
     }
diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/tokenizer.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/tokenizer.rs
index e4b8314366c6d..c55381d27d118 100644
--- a/internal/core/thirdparty/tantivy/tantivy-binding/src/tokenizer.rs
+++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/tokenizer.rs
@@ -5,9 +5,20 @@
 use serde_json as json;
 use crate::tokenizer_filter::*;
 use crate::error::TantivyError;
+use crate::util::get_stop_words_list;
 
-pub(crate) fn standard_analyzer() -> TextAnalyzer {
-    standard_builder().build()
+
+// default built-in analyzer
+pub(crate) fn standard_analyzer(stop_words: Vec<String>) -> TextAnalyzer {
+    let builder = standard_builder()
+        .filter(LowerCaser)
+        .filter(RemoveLongFilter::limit(40));
+
+    if stop_words.len() > 0{
+        return builder.filter(StopWordFilter::remove(stop_words)).build();
+    }
+
+    builder.build()
 }
 
 fn standard_builder() -> TextAnalyzerBuilder{
@@ -122,14 +133,44 @@ impl AnalyzerBuilder<'_>{
                 }
                 builder = builder.filter_dynamic(RemoveLongFilter::limit(value.as_u64().unwrap() as usize));
             }
-            other => return Err(format!("unknown key of tokenizer option: {}", other).into()),
+            other => return Err(format!("unknown analyzer option key: {}", other).into()),
         }
         }
         Ok(builder)
     }
 
+    fn build_template(self, type_: &str)-> Result<TextAnalyzer, TantivyError>{
+        match type_{
+            "standard" => {
+                let value = self.params.get("stop_words");
+                match value{
+                    Some(value)=>{
+                        let str_list = get_stop_words_list(value)?;
+                        Ok(standard_analyzer(str_list))
+                    }
+                    None => Ok(standard_analyzer(vec![]))
+                }
+            },
+            other_ => Err(format!("unknown built-in analyzer type: {}", other_).into())
+        }
+    }
+
     fn build(mut self) -> Result<TextAnalyzer, TantivyError>{
-        let tokenizer_name = self.get_tokenizer_name()?;
+        // build built-in analyzer when "type" is specified
+        match self.params.get("type"){
+            Some(type_) =>{
+                if !type_.is_string(){
+                    return Err(format!("analyzer type should be string").into())
+                }
+                return self.build_template(type_.as_str().unwrap());
+            },
+            None => {}
+        };
+
+        // build custom analyzer
+        let tokenizer_name = self.get_tokenizer_name()?;
+
+        // jieba analyzer can't add filter.
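A minimal sketch of how the new built-in template path could be exercised, written in the style of the existing unit tests in tokenizer.rs. It assumes the `create_tokenizer` wrapper shown further down accepts the bare analyzer object, that the `standard` template keeps the LowerCaser and stop-word filters added above, and it uses tantivy's `TokenStream` API; the test name and sample text are illustrative only.

```rust
use crate::tokenizer::create_tokenizer;

// Hypothetical test: built-in analyzer selected via "type", with optional "stop_words".
#[test]
fn test_standard_analyzer_type_with_stop_words() {
    let params = r#"{"type": "standard", "stop_words": ["of", "the"]}"#;
    let tokenizer = create_tokenizer(&params.to_string());
    assert!(tokenizer.is_ok());

    // The standard template lower-cases tokens and drops the configured stop words.
    let mut analyzer = tokenizer.unwrap();
    let mut stream = analyzer.token_stream("The History of Information Retrieval");
    let mut tokens: Vec<String> = Vec::new();
    while stream.advance() {
        tokens.push(stream.token().text.clone());
    }
    assert!(tokens.contains(&"history".to_string()));
    assert!(!tokens.contains(&"of".to_string()));
}
```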
if tokenizer_name == "jieba"{ return Ok(tantivy_jieba::JiebaTokenizer{}.into()); } @@ -142,15 +183,11 @@ impl AnalyzerBuilder<'_>{ } } -pub(crate) fn create_tokenizer(params: &String) -> Result { - if params.len()==0{ - return Ok(standard_analyzer()); - } - +pub(crate) fn create_tokenizer_with_filter(params: &String) -> Result { match json::from_str::(¶ms){ Ok(value) =>{ if value.is_null(){ - return Ok(standard_analyzer()); + return Ok(standard_analyzer(vec![])); } if !value.is_object(){ return Err("tokenizer params should be a json map".into()); @@ -160,7 +197,7 @@ pub(crate) fn create_tokenizer(params: &String) -> Result Result Result { + if params.len()==0{ + return Ok(standard_analyzer(vec![])); + } + create_tokenizer_with_filter(&format!("{{\"analyzer\":{}}}", params)) +} + #[cfg(test)] mod tests { use crate::tokenizer::create_tokenizer; #[test] fn test_create_tokenizer() { - let params = r#"{"analyzer": {"tokenizer": "standard"}}"#; + let params = r#"{"tokenizer": "standard"}"#; let tokenizer = create_tokenizer(¶ms.to_string()); assert!(tokenizer.is_ok()); @@ -194,7 +238,7 @@ mod tests { #[test] fn test_jieba_tokenizer() { - let params = r#"{"analyzer": {"tokenizer": "jieba"}}"#; + let params = r#"{"tokenizer": "jieba"}"#; let tokenizer = create_tokenizer(¶ms.to_string()); assert!(tokenizer.is_ok()); diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/tokenizer_c.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/tokenizer_c.rs index c7a7a79e2b6c3..86449699d5ab1 100644 --- a/internal/core/thirdparty/tantivy/tantivy-binding/src/tokenizer_c.rs +++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/tokenizer_c.rs @@ -11,14 +11,12 @@ use crate::{ #[no_mangle] pub extern "C" fn tantivy_create_tokenizer(tokenizer_params: *const c_char) -> *mut c_void { init_log(); - let analyzer = unsafe { - let params = c_str_to_str(tokenizer_params).to_string(); - create_tokenizer(¶ms) - }; + let params = unsafe{c_str_to_str(tokenizer_params).to_string()}; + let analyzer = create_tokenizer(¶ms); match analyzer { Ok(text_analyzer) => create_binding(text_analyzer), Err(err) => { - log::warn!("create tokenizer failed with error: {}", err.to_string()); + log::warn!("create tokenizer failed with error: {} param: {}", err.to_string(), params); std::ptr::null_mut() }, } diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/tokenizer_filter.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/tokenizer_filter.rs index dcfe9e7541c51..6118c8504273f 100644 --- a/internal/core/thirdparty/tantivy/tantivy-binding/src/tokenizer_filter.rs +++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/tokenizer_filter.rs @@ -2,6 +2,7 @@ use tantivy::tokenizer::*; use serde_json as json; use crate::error::TantivyError; +use crate::util::*; pub(crate) enum SystemFilter{ Invalid, @@ -46,18 +47,10 @@ fn get_length_filter(params: &json::Map) -> Result)-> Result{ let value = params.get("stop_words"); - if value.is_none() || !value.unwrap().is_array(){ - return Err("stop_words should be array".into()) + if value.is_none(){ + return Err("stop filter stop_words can't be empty".into()); } - - let stop_words= value.unwrap().as_array().unwrap(); - let mut str_list = Vec::::new(); - for element in stop_words{ - match element.as_str(){ - Some(word) => str_list.push(word.to_string()), - None => return Err("stop words item should be string".into()) - } - }; + let str_list = get_stop_words_list(value.unwrap())?; Ok(SystemFilter::Stop(StopWordFilter::remove(str_list))) } diff --git 
a/internal/core/thirdparty/tantivy/tantivy-binding/src/util.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/util.rs index 1f1c1655c1032..5ffc16b82543a 100644 --- a/internal/core/thirdparty/tantivy/tantivy-binding/src/util.rs +++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/util.rs @@ -1,5 +1,7 @@ use std::ffi::c_void; use std::ops::Bound; +use serde_json as json; +use crate::error::TantivyError; use tantivy::{directory::MmapDirectory, Index}; @@ -28,3 +30,19 @@ pub fn free_binding(ptr: *mut c_void) { drop(Box::from_raw(real)); } } + +pub(crate) fn get_stop_words_list(value: &json::Value) -> Result, TantivyError>{ + if !value.is_array(){ + return Err("stop_words should be array".into()) + } + + let stop_words= value.as_array().unwrap(); + let mut str_list = Vec::::new(); + for element in stop_words{ + match element.as_str(){ + Some(word) => str_list.push(word.to_string()), + None => return Err("stop words item should be string".into()) + } + }; + Ok(str_list) +} \ No newline at end of file diff --git a/tests/python_client/common/common_func.py b/tests/python_client/common/common_func.py index 0be694809c81d..7f81fed6bfd29 100644 --- a/tests/python_client/common/common_func.py +++ b/tests/python_client/common/common_func.py @@ -704,7 +704,7 @@ def gen_default_collection_schema(description=ct.default_desc, primary_field=ct. def gen_all_datatype_collection_schema(description=ct.default_desc, primary_field=ct.default_int64_field_name, auto_id=False, dim=ct.default_dim, enable_dynamic_field=True, **kwargs): tokenizer_params = { - "analyzer":{"tokenizer": "standard"}, + "tokenizer": "standard", } fields = [ gen_int64_field(), diff --git a/tests/python_client/testcases/test_full_text_search.py b/tests/python_client/testcases/test_full_text_search.py index c5acb26c10236..11fd86765ede3 100644 --- a/tests/python_client/testcases/test_full_text_search.py +++ b/tests/python_client/testcases/test_full_text_search.py @@ -36,7 +36,7 @@ def test_create_collection_for_full_text_search(self, tokenizer): expected: create collection successfully """ tokenizer_params = { - "analyzer":{"tokenizer": tokenizer}, + "tokenizer": tokenizer, } dim = 128 fields = [ @@ -100,7 +100,7 @@ def test_create_collection_for_full_text_search_twice_with_same_schema(self, tok expected: create collection successfully and create again successfully """ tokenizer_params = { - "analyzer":{"tokenizer": tokenizer}, + "tokenizer": tokenizer, } dim = 128 fields = [ @@ -178,7 +178,7 @@ def test_create_collection_for_full_text_search_with_unsupported_tokenizer(self, expected: create collection failed """ tokenizer_params = { - "analyzer":{"tokenizer": tokenizer}, + "tokenizer": tokenizer, } dim = 128 fields = [ @@ -244,7 +244,7 @@ def test_create_collection_for_full_text_search_with_invalid_input_output(self, expected: create collection failed """ tokenizer_params = { - "analyzer":{"tokenizer": "standard"}, + "tokenizer": "standard", } dim = 128 fields = [ @@ -322,7 +322,7 @@ def test_create_collection_for_full_text_search_with_field_not_tokenized(self): expected: create collection failed """ tokenizer_params = { - "analyzer":{"tokenizer": "standard"}, + "tokenizer": "standard", } dim = 128 fields = [ @@ -401,7 +401,7 @@ def test_insert_for_full_text_search_default(self, tokenizer, text_lang, nullabl expected: insert successfully and count is correct """ tokenizer_params = { - "analyzer":{"tokenizer": tokenizer}, + "tokenizer": tokenizer, } dim = 128 fields = [ @@ -547,7 +547,7 @@ def 
test_insert_for_full_text_search_enable_dynamic_field(self, tokenizer, text_ expected: insert successfully and count is correct """ tokenizer_params = { - "analyzer":{"tokenizer": tokenizer}, + "tokenizer": tokenizer, } dim = 128 fields = [ @@ -695,7 +695,7 @@ def test_insert_for_full_text_search_with_dataframe(self, tokenizer, text_lang, 2. query count and verify the result """ tokenizer_params = { - "analyzer":{"tokenizer": tokenizer}, + "tokenizer": tokenizer, } dim = 128 fields = [ @@ -837,7 +837,7 @@ def test_insert_for_full_text_search_with_part_of_empty_string(self, tokenizer): """ tokenizer_params = { - "analyzer":{"tokenizer": tokenizer}, + "tokenizer": tokenizer, } dim = 128 fields = [ @@ -994,7 +994,7 @@ def test_insert_with_full_text_search_with_non_varchar_data(self, tokenizer, nul """ tokenizer_params = { - "analyzer":{"tokenizer": tokenizer}, + "tokenizer": tokenizer, } dim = 128 fields = [ @@ -1095,7 +1095,7 @@ def test_upsert_for_full_text_search(self, tokenizer, nullable): expected: upsert successfully and data is updated """ tokenizer_params = { - "analyzer":{"tokenizer": tokenizer}, + "tokenizer": tokenizer, } dim = 128 fields = [ @@ -1265,7 +1265,7 @@ def test_upsert_for_full_text_search_with_no_varchar_data(self, tokenizer, nulla expected: upsert failed """ tokenizer_params = { - "analyzer":{"tokenizer": tokenizer}, + "tokenizer": tokenizer, } dim = 128 fields = [ @@ -1407,7 +1407,7 @@ def test_delete_for_full_text_search(self, tokenizer): expected: delete successfully and data is deleted """ tokenizer_params = { - "analyzer":{"tokenizer": tokenizer}, + "tokenizer": tokenizer, } dim = 128 fields = [ @@ -1571,7 +1571,7 @@ def test_create_index_for_full_text_search_default( expected: create index successfully and index info is correct """ tokenizer_params = { - "analyzer":{"tokenizer": tokenizer}, + "tokenizer": tokenizer, } empty_percent = 0.0 dim = 128 @@ -1694,7 +1694,7 @@ def test_create_full_text_search_with_invalid_index_type( expected: create index failed """ tokenizer_params = { - "analyzer":{"tokenizer": tokenizer}, + "tokenizer": tokenizer, } empty_percent = 0.0 dim = 128 @@ -1802,7 +1802,7 @@ def test_create_full_text_search_index_with_invalid_metric_type( expected: create index failed """ tokenizer_params = { - "analyzer":{"tokenizer": tokenizer}, + "tokenizer": tokenizer, } empty_percent = 0.0 dim = 128 @@ -1910,7 +1910,7 @@ def test_create_index_using_bm25_metric_type_for_non_bm25_output_field( expected: create index failed """ tokenizer_params = { - "analyzer":{"tokenizer": tokenizer}, + "tokenizer": tokenizer, } empty_percent = 0.0 dim = 128 @@ -2006,7 +2006,7 @@ def test_create_full_text_search_with_invalid_bm25_params( expected: create index failed """ tokenizer_params = { - "analyzer":{"tokenizer": tokenizer}, + "tokenizer": tokenizer, } empty_percent = 0.0 dim = 128 @@ -2129,7 +2129,7 @@ def test_full_text_search_default( expected: full text search successfully and result is correct """ tokenizer_params = { - "analyzer":{"tokenizer": tokenizer}, + "tokenizer": tokenizer, } dim = 128 fields = [ @@ -2322,9 +2322,7 @@ def test_full_text_search_with_jieba_tokenizer( expected: full text search successfully and result is correct """ tokenizer_params = { - "analyzer":{ "tokenizer": tokenizer, - } } dim = 128 fields = [ @@ -2518,7 +2516,7 @@ def test_full_text_search_with_range_search( expected: full text search successfully and result is correct """ tokenizer_params = { - "analyzer":{"tokenizer": tokenizer}, + "tokenizer": tokenizer, } dim = 128 fields = [ 
@@ -2683,7 +2681,7 @@ def test_full_text_search_with_search_iterator( expected: full text search successfully and result is correct """ tokenizer_params = { - "analyzer":{"tokenizer": tokenizer}, + "tokenizer": tokenizer, } dim = 128 fields = [ @@ -2837,7 +2835,7 @@ def test_search_for_full_text_search_with_empty_string_search_data( expected: full text search successfully but result is empty """ tokenizer_params = { - "analyzer":{"tokenizer": tokenizer}, + "tokenizer": tokenizer, } dim = 128 fields = [ @@ -2966,7 +2964,7 @@ def test_search_for_full_text_search_with_invalid_search_data( expected: full text search failed and return error """ tokenizer_params = { - "analyzer":{"tokenizer": tokenizer}, + "tokenizer": tokenizer, } dim = 128 fields = [ @@ -3113,7 +3111,7 @@ def test_hybrid_search_with_full_text_search( expected: hybrid search successfully and result is correct """ tokenizer_params = { - "analyzer":{"tokenizer": tokenizer}, + "tokenizer": tokenizer, } dim = 128 fields = [ diff --git a/tests/python_client/testcases/test_query.py b/tests/python_client/testcases/test_query.py index 3696a1138c08d..c7d2015e9801f 100644 --- a/tests/python_client/testcases/test_query.py +++ b/tests/python_client/testcases/test_query.py @@ -4448,7 +4448,7 @@ def test_query_text_match_normal( expected: text match successfully and result is correct """ tokenizer_params = { - "analyzer":{"tokenizer": tokenizer}, + "tokenizer": tokenizer, } dim = 128 fields = [ @@ -4580,21 +4580,16 @@ def test_query_text_match_custom_analyzer(self): expected: get the correct token, text match successfully and result is correct """ tokenizer_params = { - "analyzer": { "tokenizer": "standard", # "lowercase", "asciifolding", "alphanumonly" was system filter - "filter":["lowercase", "asciifolding", "alphanumonly", "my_stop_filter", "my_stemmer"], - }, - "filter": { - "my_stop_filter":{ + "filter":["lowercase", "asciifolding", "alphanumonly", + { "type": "stop", "stop_words": ["in", "of"], - }, - "my_stemmer":{ + }, { "type": "stemmer", "language": "english", - } - } + }], } dim = 128 fields = [ @@ -4705,7 +4700,7 @@ def test_query_text_match_with_combined_expression_for_single_field(self): expected: query successfully and result is correct """ tokenizer_params = { - "analyzer":{"tokenizer": "standard"}, + "tokenizer": "standard", } # 1. initialize with data dim = 128 @@ -4819,7 +4814,7 @@ def test_query_text_match_with_combined_expression_for_multi_field(self): expected: query successfully and result is correct """ tokenizer_params = { - "analyzer":{"tokenizer": "standard"}, + "tokenizer": "standard", } # 1. initialize with data dim = 128 @@ -4962,7 +4957,7 @@ def test_query_text_match_with_multi_lang(self): # 1. initialize with data tokenizer_params = { - "analyzer":{"tokenizer": "standard"}, + "tokenizer": "standard", } # 1. initialize with data dim = 128 @@ -5107,7 +5102,7 @@ def test_query_text_match_with_addition_inverted_index(self): # 1. initialize with data fake_en = Faker("en_US") tokenizer_params = { - "analyzer":{"tokenizer": "standard"}, + "tokenizer": "standard", } dim = 128 default_fields = [ @@ -5334,7 +5329,7 @@ def test_query_text_match_with_some_empty_string(self): """ # 1. initialize with data tokenizer_params = { - "analyzer":{"tokenizer": "standard"}, + "tokenizer": "standard", } # 1. 
initialize with data dim = 128 @@ -5593,7 +5588,7 @@ def test_query_text_match_with_unsupported_tokenizer(self): expected: create collection failed and return error """ tokenizer_params = { - "analyzer":{"tokenizer": "Unsupported"}, + "tokenizer": "Unsupported", } dim = 128 default_fields = [ diff --git a/tests/python_client/testcases/test_search.py b/tests/python_client/testcases/test_search.py index 617b4fcec96cf..b7d7fb92fb649 100644 --- a/tests/python_client/testcases/test_search.py +++ b/tests/python_client/testcases/test_search.py @@ -13297,7 +13297,7 @@ def test_search_with_text_match_filter_normal( expected: text match successfully and result is correct """ tokenizer_params = { - "analyzer":{"tokenizer": tokenizer}, + "tokenizer": tokenizer, } dim = 128 fields = [ diff --git a/tests/restful_client_v2/testcases/test_vector_operations.py b/tests/restful_client_v2/testcases/test_vector_operations.py index 030474e833f47..b6fc35f6855b9 100644 --- a/tests/restful_client_v2/testcases/test_vector_operations.py +++ b/tests/restful_client_v2/testcases/test_vector_operations.py @@ -1894,7 +1894,7 @@ def test_search_vector_with_text_match_filter(self, tokenizer): # create a collection dim = 128 tokenizer_params = { - "analyzer":{"tokenizer": tokenizer}, + "tokenizer": tokenizer, } name = gen_collection_name() fields = [ @@ -2731,7 +2731,7 @@ def test_query_vector_with_text_match_filter(self, tokenizer): # create a collection dim = 128 tokenizer_params = { - "analyzer":{"tokenizer": tokenizer}, + "tokenizer": tokenizer, } name = gen_collection_name() fields = [
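The Python test changes above all follow from one contract change in the binding: `create_tokenizer` now receives the bare analyzer object and wraps it into `{"analyzer": ...}` itself, while `create_tokenizer_with_filter` keeps accepting the previously wrapped form. A hedged sketch of the two call shapes, using only functions defined in this patch (the `"filter": ["lowercase"]` entry is an illustrative system filter, not something this patch adds):

```rust
use crate::tokenizer::{create_tokenizer, create_tokenizer_with_filter};

// Hypothetical check of the two accepted parameter shapes after the rename.
#[test]
fn test_params_shapes_after_rename() {
    // New client-facing shape: the bare analyzer object, as the updated tests now pass it.
    let bare = r#"{"tokenizer": "standard", "filter": ["lowercase"]}"#.to_string();
    assert!(create_tokenizer(&bare).is_ok());

    // Previously wrapped shape: still handled by create_tokenizer_with_filter,
    // which is what create_tokenizer delegates to internally.
    let wrapped = r#"{"analyzer": {"tokenizer": "standard", "filter": ["lowercase"]}}"#.to_string();
    assert!(create_tokenizer_with_filter(&wrapped).is_ok());
}
```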