diff --git a/internal/core/src/common/FieldMeta.cpp b/internal/core/src/common/FieldMeta.cpp
index a90f768781fe2..2e9cd8c5052b2 100644
--- a/internal/core/src/common/FieldMeta.cpp
+++ b/internal/core/src/common/FieldMeta.cpp
@@ -16,25 +16,15 @@
 #include <string>
 
 #include "Consts.h"
-#include "log/Log.h"
 
 namespace milvus {
 TokenizerParams
 ParseTokenizerParams(const TypeParams& params) {
     auto iter = params.find("tokenizer_params");
     if (iter == params.end()) {
-        return {};
+        return "{}";
     }
-    nlohmann::json j = nlohmann::json::parse(iter->second);
-    std::map<std::string, std::string> ret;
-    for (const auto& [k, v] : j.items()) {
-        try {
-            ret[k] = v.get<std::string>();
-        } catch (std::exception& e) {
-            ret[k] = v.dump();
-        }
-    }
-    return ret;
+    return iter->second.c_str();
 }
 
 bool
diff --git a/internal/core/src/common/FieldMeta.h b/internal/core/src/common/FieldMeta.h
index ed040902a54d6..0581cf9167551 100644
--- a/internal/core/src/common/FieldMeta.h
+++ b/internal/core/src/common/FieldMeta.h
@@ -25,7 +25,7 @@
 namespace milvus {
 
 using TypeParams = std::map<std::string, std::string>;
-using TokenizerParams = std::map<std::string, std::string>;
+using TokenizerParams = const char*;
 
 TokenizerParams
 ParseTokenizerParams(const TypeParams& params);
diff --git a/internal/core/src/index/TextMatchIndex.cpp b/internal/core/src/index/TextMatchIndex.cpp
index f21e5b319e006..216b93007f4c9 100644
--- a/internal/core/src/index/TextMatchIndex.cpp
+++ b/internal/core/src/index/TextMatchIndex.cpp
@@ -22,7 +22,7 @@ constexpr const char* TMP_TEXT_LOG_PREFIX = "/tmp/milvus/text-log/";
 TextMatchIndex::TextMatchIndex(
     int64_t commit_interval_in_ms,
     const char* tokenizer_name,
-    const std::map<std::string, std::string>& tokenizer_params)
+    const char* tokenizer_params)
     : commit_interval_in_ms_(commit_interval_in_ms),
       last_commit_time_(stdclock::now()) {
     d_type_ = TantivyDataType::Text;
@@ -34,7 +34,7 @@ TextMatchIndex::TextMatchIndex(
 TextMatchIndex::TextMatchIndex(
     const std::string& path,
     const char* tokenizer_name,
-    const std::map<std::string, std::string>& tokenizer_params)
+    const char* tokenizer_params)
     : commit_interval_in_ms_(std::numeric_limits<int64_t>::max()),
       last_commit_time_(stdclock::now()) {
     path_ = path;
@@ -50,7 +50,7 @@ TextMatchIndex::TextMatchIndex(
 TextMatchIndex::TextMatchIndex(
     const storage::FileManagerContext& ctx,
     const char* tokenizer_name,
-    const std::map<std::string, std::string>& tokenizer_params)
+    const char* tokenizer_params)
     : commit_interval_in_ms_(std::numeric_limits<int64_t>::max()),
       last_commit_time_(stdclock::now()) {
     schema_ = ctx.fieldDataMeta.field_schema;
@@ -176,7 +176,7 @@ TextMatchIndex::CreateReader() {
 
 void
 TextMatchIndex::RegisterTokenizer(
     const char* tokenizer_name,
-    const std::map<std::string, std::string>& tokenizer_params) {
+    const char* tokenizer_params) {
     wrapper_->register_tokenizer(tokenizer_name, tokenizer_params);
 }
diff --git a/internal/core/src/index/TextMatchIndex.h b/internal/core/src/index/TextMatchIndex.h
index 570668a0304e0..2a52d2cf0e9d5 100644
--- a/internal/core/src/index/TextMatchIndex.h
+++ b/internal/core/src/index/TextMatchIndex.h
@@ -25,17 +25,17 @@ class TextMatchIndex : public InvertedIndexTantivy<std::string> {
     explicit TextMatchIndex(
         int64_t commit_interval_in_ms,
         const char* tokenizer_name,
-        const std::map<std::string, std::string>& tokenizer_params);
+        const char* tokenizer_params);
     // for sealed segment.
     explicit TextMatchIndex(
         const std::string& path,
         const char* tokenizer_name,
-        const std::map<std::string, std::string>& tokenizer_params);
+        const char* tokenizer_params);
     // for building index.
     explicit TextMatchIndex(
         const storage::FileManagerContext& ctx,
         const char* tokenizer_name,
-        const std::map<std::string, std::string>& tokenizer_params);
+        const char* tokenizer_params);
     // for loading index
     explicit TextMatchIndex(const storage::FileManagerContext& ctx);
@@ -69,7 +69,7 @@ class TextMatchIndex : public InvertedIndexTantivy<std::string> {
     void
     RegisterTokenizer(
         const char* tokenizer_name,
-        const std::map<std::string, std::string>& tokenizer_params);
+        const char* tokenizer_params);
 
     TargetBitmap
     MatchQuery(const std::string& query);
diff --git a/internal/core/src/segcore/tokenizer_c.cpp b/internal/core/src/segcore/tokenizer_c.cpp
index a33a6bd9bfd85..781192f269c63 100644
--- a/internal/core/src/segcore/tokenizer_c.cpp
+++ b/internal/core/src/segcore/tokenizer_c.cpp
@@ -20,10 +20,9 @@
 using Map = std::map<std::string, std::string>;
 
 CStatus
-create_tokenizer(CMap m, CTokenizer* tokenizer) {
+create_tokenizer(const char* params, CTokenizer* tokenizer) {
     try {
-        auto mm = reinterpret_cast<const Map*>(m);
-        auto impl = std::make_unique<milvus::tantivy::Tokenizer>(*mm);
+        auto impl = std::make_unique<milvus::tantivy::Tokenizer>(params);
         *tokenizer = impl.release();
         return milvus::SuccessCStatus();
     } catch (std::exception& e) {
diff --git a/internal/core/src/segcore/tokenizer_c.h b/internal/core/src/segcore/tokenizer_c.h
index 3f84da729efaa..422449c0cab50 100644
--- a/internal/core/src/segcore/tokenizer_c.h
+++ b/internal/core/src/segcore/tokenizer_c.h
@@ -24,7 +24,7 @@ extern "C" {
 typedef void* CTokenizer;
 
 CStatus
-create_tokenizer(CMap m, CTokenizer* tokenizer);
+create_tokenizer(const char* params, CTokenizer* tokenizer);
 
 CStatus
 clone_tokenizer(CTokenizer* tokenizer, CTokenizer* rst);
diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/include/tantivy-binding.h b/internal/core/thirdparty/tantivy/tantivy-binding/include/tantivy-binding.h
index 391cece60bccd..5c494f215ece9 100644
--- a/internal/core/thirdparty/tantivy/tantivy-binding/include/tantivy-binding.h
+++ b/internal/core/thirdparty/tantivy/tantivy-binding/include/tantivy-binding.h
@@ -88,7 +88,9 @@ RustArray tantivy_regex_query(void *ptr, const char *pattern);
 
 RustArray tantivy_match_query(void *ptr, const char *query);
 
-void tantivy_register_tokenizer(void *ptr, const char *tokenizer_name, void *tokenizer_params);
+void tantivy_register_tokenizer(void *ptr,
+                                const char *tokenizer_name,
+                                const char *tokenizer_params);
 
 void *tantivy_create_index(const char *field_name,
                            TantivyDataType data_type,
@@ -142,7 +144,7 @@ void tantivy_index_add_multi_keywords(void *ptr,
 void *tantivy_create_text_writer(const char *field_name,
                                  const char *path,
                                  const char *tokenizer_name,
-                                 void *tokenizer_params,
+                                 const char *tokenizer_params,
                                  uintptr_t num_threads,
                                  uintptr_t overall_memory_budget_in_bytes,
                                  bool in_ram);
@@ -157,7 +159,7 @@ bool tantivy_token_stream_advance(void *token_stream);
 
 const char *tantivy_token_stream_get_token(void *token_stream);
 
-void *tantivy_create_tokenizer(void *tokenizer_params);
+void *tantivy_create_tokenizer(const char *tokenizer_params);
 
 void *tantivy_clone_tokenizer(void *ptr);
 
diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/index_reader_text_c.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/index_reader_text_c.rs
index eb0653c90357b..cc461c87deabd 100644
--- a/internal/core/thirdparty/tantivy/tantivy-binding/src/index_reader_text_c.rs
+++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/index_reader_text_c.rs
@@ -1,8 +1,13 @@
-use std::{collections::HashMap, ffi::CStr};
+use std::{ffi::CStr};
 
 use libc::{c_char, c_void};
 
-use crate::{array::RustArray, index_reader::IndexReaderWrapper, tokenizer::create_tokenizer};
+use crate::{
+    array::RustArray,
+    string_c::c_str_to_str,
+    index_reader::IndexReaderWrapper,
+    tokenizer::create_tokenizer,
+};
 
 #[no_mangle]
 pub extern "C" fn tantivy_match_query(ptr: *mut c_void, query: *const c_char) -> RustArray {
@@ -18,13 +23,13 @@ pub extern "C" fn tantivy_match_query(ptr: *mut c_void, query: *const c_char) -
 pub extern "C" fn tantivy_register_tokenizer(
     ptr: *mut c_void,
     tokenizer_name: *const c_char,
-    tokenizer_params: *mut c_void,
+    tokenizer_params: *const c_char,
 ) {
     let real = ptr as *mut IndexReaderWrapper;
     let tokenizer_name_str = unsafe { CStr::from_ptr(tokenizer_name) };
     let analyzer = unsafe {
-        let m = tokenizer_params as *const HashMap<String, String>;
-        create_tokenizer(&(*m))
+        let params = c_str_to_str(tokenizer_params).to_string();
+        create_tokenizer(&params)
     };
     match analyzer {
         Some(text_analyzer) => unsafe {
diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_text_c.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_text_c.rs
index 1ca70ac232c9b..83c8dc34ad954 100644
--- a/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_text_c.rs
+++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_text_c.rs
@@ -6,13 +6,14 @@ use std::ffi::CStr;
 use crate::index_writer::IndexWriterWrapper;
 use crate::tokenizer::create_tokenizer;
 use crate::util::create_binding;
+use crate::string_c::c_str_to_str;
 
 #[no_mangle]
 pub extern "C" fn tantivy_create_text_writer(
     field_name: *const c_char,
     path: *const c_char,
     tokenizer_name: *const c_char,
-    tokenizer_params: *mut c_void,
+    tokenizer_params: *const c_char,
     num_threads: usize,
     overall_memory_budget_in_bytes: usize,
     in_ram: bool,
@@ -21,8 +22,8 @@ pub extern "C" fn tantivy_create_text_writer(
     let path_str = unsafe { CStr::from_ptr(path).to_str().unwrap() };
     let tokenizer_name_str = unsafe { CStr::from_ptr(tokenizer_name).to_str().unwrap() };
     let analyzer = unsafe {
-        let m = tokenizer_params as *const HashMap<String, String>;
-        create_tokenizer(&(*m))
+        let params = c_str_to_str(tokenizer_params).to_string();
+        create_tokenizer(&params)
     };
     match analyzer {
         Some(text_analyzer) => {
diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/tokenizer.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/tokenizer.rs
index 128ebe266f2cb..9ab3e7c6f9998 100644
--- a/internal/core/thirdparty/tantivy/tantivy-binding/src/tokenizer.rs
+++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/tokenizer.rs
@@ -100,58 +100,51 @@ impl TantivyBuilder<'_>{
     }
 }
 
-pub(crate) fn create_tokenizer(params: &HashMap<String, String>) -> Option<TextAnalyzer> {
+pub(crate) fn create_tokenizer(params: &String) -> Option<TextAnalyzer> {
     init_log();
 
-    let analyzer_json_value = match params.get("analyzer"){
-        Some(value) => {
-            let json_analyzer = json::from_str::<json::Value>(value);
-            if json_analyzer.is_err() {
+    match json::from_str::<json::Value>(&params){
+        Ok(value) =>{
+            if !value.is_object(){
                 return None;
             }
-            let json_value = json_analyzer.unwrap();
-            if !json_value.is_object(){
-                return None
+            let json_params = value.as_object().unwrap();
+
+            // create builder
+            let analyzer_params=json_params.get("analyzer");
+            if analyzer_params.is_none() || !analyzer_params.unwrap().is_object(){
+                return None;
             }
-            json_value
-        }
-        None => json::Value::Object(json::Map::<String, json::Value>::new()),
-    };
-
-    let analyzer_params= analyzer_json_value.as_object().unwrap();
-    let mut builder = TantivyBuilder::new(analyzer_params);
-    let str_filter=params.get("filter");
-    if !str_filter.is_none(){
-        let json_filter = json::from_str::<json::Value>(str_filter.unwrap());
-        if json_filter.is_err(){
-            return None
-        }
+            let mut builder = TantivyBuilder::new(analyzer_params.unwrap().as_object().unwrap());
 
-        let filter_params = json_filter.unwrap();
-        if !filter_params.is_object(){
-            return None
-        }
+            // build custom filter
+            let filter_params=json_params.get("filter");
+            if !filter_params.is_none() && filter_params.unwrap().is_object(){
+                builder.add_costom_filters(filter_params.unwrap().as_object().unwrap());
+            }
 
-        builder.add_costom_filters(filter_params.as_object().unwrap());
+            // build analyzer
+            builder.build()
+        },
+        Err(_e) => None,
     }
-    builder.build()
 }
 
 #[cfg(test)]
 mod tests {
-    use std::collections::HashMap;
     use crate::tokenizer::create_tokenizer;
 
     #[test]
     fn test_create_tokenizer() {
-        let mut params : HashMap<String, String> = HashMap::new();
-        let analyzer_params = r#"
+        let params = r#"
         {
-            "tokenizer": "jieba"
+            "analyzer":
+            {
+                "tokenizer": "standard",
+                "filter": [""],
+            },
         }"#;
-        params.insert("analyzer".to_string(), analyzer_params.to_string());
-        let tokenizer = create_tokenizer(&params);
+        let tokenizer = create_tokenizer(&params.to_string());
 
         assert!(tokenizer.is_some());
     }
-}
+}
\ No newline at end of file
diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/tokenizer_c.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/tokenizer_c.rs
index ef572fcc4f2a6..dedca3064e138 100644
--- a/internal/core/thirdparty/tantivy/tantivy-binding/src/tokenizer_c.rs
+++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/tokenizer_c.rs
@@ -1,18 +1,17 @@
-use std::collections::HashMap;
-
-use libc::c_void;
+use libc::{c_void,c_char};
 use tantivy::tokenizer::TextAnalyzer;
 
 use crate::{
+    string_c::c_str_to_str,
     tokenizer::create_tokenizer,
     util::{create_binding, free_binding},
 };
 
 #[no_mangle]
-pub extern "C" fn tantivy_create_tokenizer(tokenizer_params: *mut c_void) -> *mut c_void {
+pub extern "C" fn tantivy_create_tokenizer(tokenizer_params: *const c_char) -> *mut c_void {
    let analyzer = unsafe {
-        let m = tokenizer_params as *const HashMap<String, String>;
-        create_tokenizer(&(*m))
+        let params = c_str_to_str(tokenizer_params).to_string();
+        create_tokenizer(&params)
     };
     match analyzer {
         Some(text_analyzer) => create_binding(text_analyzer),
diff --git a/internal/core/thirdparty/tantivy/tantivy-wrapper.h b/internal/core/thirdparty/tantivy/tantivy-wrapper.h
index 17822d1bbdfb3..2e576f5fe9162 100644
--- a/internal/core/thirdparty/tantivy/tantivy-wrapper.h
+++ b/internal/core/thirdparty/tantivy/tantivy-wrapper.h
@@ -14,7 +14,7 @@ namespace milvus::tantivy {
 using Map = std::map<std::string, std::string>;
 
 static constexpr const char* DEFAULT_TOKENIZER_NAME = "milvus_tokenizer";
-static Map DEFAULT_TOKENIZER_PARAMS = {};
+static const char* DEFAULT_TOKENIZER_PARAMS = "{}";
 static constexpr uintptr_t DEFAULT_NUM_THREADS = 4;
 static constexpr uintptr_t DEFAULT_OVERALL_MEMORY_BUDGET_IN_BYTES =
     DEFAULT_NUM_THREADS * 15 * 1024 * 1024;
@@ -101,17 +101,15 @@ struct TantivyIndexWrapper {
                        bool in_ram,
                        const char* path,
                        const char* tokenizer_name = DEFAULT_TOKENIZER_NAME,
-                       const std::map<std::string, std::string>&
+                       const char*
                            tokenizer_params = DEFAULT_TOKENIZER_PARAMS,
                        uintptr_t num_threads = DEFAULT_NUM_THREADS,
                        uintptr_t overall_memory_budget_in_bytes =
                            DEFAULT_OVERALL_MEMORY_BUDGET_IN_BYTES) {
-        RustHashMap m;
-        m.from(tokenizer_params);
         writer_ = tantivy_create_text_writer(field_name,
                                              path,
                                              tokenizer_name,
-                                             m.get_pointer(),
+                                             tokenizer_params,
                                              num_threads,
                                              overall_memory_budget_in_bytes,
                                              in_ram);
@@ -136,12 +134,10 @@ struct TantivyIndexWrapper {
 
     void
     register_tokenizer(
         const char* tokenizer_name,
-        const std::map<std::string, std::string>& tokenizer_params) {
-        RustHashMap m;
-        m.from(tokenizer_params);
+        const char* tokenizer_params) {
         if (reader_ != nullptr) {
             tantivy_register_tokenizer(
-                reader_, tokenizer_name, m.get_pointer());
+                reader_, tokenizer_name, tokenizer_params);
         }
     }
diff --git a/internal/core/thirdparty/tantivy/tokenizer.h b/internal/core/thirdparty/tantivy/tokenizer.h
index 6f42eecbfcbe2..eeeec4db6de3e 100644
--- a/internal/core/thirdparty/tantivy/tokenizer.h
+++ b/internal/core/thirdparty/tantivy/tokenizer.h
@@ -11,10 +11,9 @@ struct Tokenizer {
  public:
     NO_COPY_OR_ASSIGN(Tokenizer);
 
-    explicit Tokenizer(const std::map<std::string, std::string>& params) {
-        RustHashMap m;
-        m.from(params);
-        ptr_ = tantivy_create_tokenizer(m.get_pointer());
+    explicit Tokenizer(std::string&& params) {
+        auto shared_params = std::make_shared<std::string>(std::move(params));
+        ptr_ = tantivy_create_tokenizer(shared_params->c_str());
         if (ptr_ == nullptr) {
             throw std::invalid_argument("invalid tokenizer parameters");
         }
diff --git a/internal/core/unittest/test_text_match.cpp b/internal/core/unittest/test_text_match.cpp
index 247a23c9bcbdc..359c35ce367de 100644
--- a/internal/core/unittest/test_text_match.cpp
+++ b/internal/core/unittest/test_text_match.cpp
@@ -80,23 +80,20 @@ TEST(ParseJson, Naive) {
 TEST(ParseTokenizerParams, NoTokenizerParams) {
     TypeParams params{{"k", "v"}};
     auto p = ParseTokenizerParams(params);
-    ASSERT_EQ(0, p.size());
+    ASSERT_EQ("{}", p);
 }
 
 TEST(ParseTokenizerParams, Default) {
     TypeParams params{{"tokenizer_params", R"({"tokenizer": "default"})"}};
     auto p = ParseTokenizerParams(params);
-    ASSERT_EQ(1, p.size());
-    auto iter = p.find("tokenizer");
-    ASSERT_NE(p.end(), iter);
-    ASSERT_EQ("default", iter->second);
+    ASSERT_EQ(params.at("tokenizer_params"), p);
 }
 
 TEST(TextMatch, Index) {
     using Index = index::TextMatchIndex;
     auto index = std::make_unique<Index>(std::numeric_limits<int64_t>::max(),
                                          "milvus_tokenizer",
-                                         std::map<std::string, std::string>{});
+                                         "{}");
     index->CreateReader();
     index->AddText("football, basketball, pingpang", 0);
     index->AddText("swimming, football", 1);
diff --git a/internal/util/ctokenizer/c_tokenizer_factory.go b/internal/util/ctokenizer/c_tokenizer_factory.go
index c5690d8861600..1bd3177917741 100644
--- a/internal/util/ctokenizer/c_tokenizer_factory.go
+++ b/internal/util/ctokenizer/c_tokenizer_factory.go
@@ -9,16 +9,17 @@ package ctokenizer
 import "C"
 
 import (
+	"unsafe"
+
 	"github.com/milvus-io/milvus/internal/util/tokenizerapi"
 )
 
-func NewTokenizer(m map[string]string) (tokenizerapi.Tokenizer, error) {
-	mm := NewCMap()
-	defer mm.Destroy()
-	mm.From(m)
+func NewTokenizer(param string) (tokenizerapi.Tokenizer, error) {
+	paramPtr := C.CString(param)
+	defer C.free(unsafe.Pointer(paramPtr))
 
 	var ptr C.CTokenizer
-	status := C.create_tokenizer(mm.GetPointer(), &ptr)
+	status := C.create_tokenizer(paramPtr, &ptr)
 	if err := HandleCStatus(&status, "failed to create tokenizer"); err != nil {
 		return nil, err
 	}
diff --git a/internal/util/function/bm25_function.go b/internal/util/function/bm25_function.go
index 225a3fa30893f..ff8e046b1f504 100644
--- a/internal/util/function/bm25_function.go
+++ b/internal/util/function/bm25_function.go
@@ -19,7 +19,6 @@
 package function
 
 import (
-	"encoding/json"
 	"fmt"
 	"sync"
 
@@ -41,26 +40,13 @@ type BM25FunctionRunner struct {
 	concurrency int
 }
 
-// TODO Use json string instead map[string]string as tokenizer params
-func getTokenizerParams(field *schemapb.FieldSchema) (map[string]string, error) {
-	result := map[string]string{}
+func getTokenizerParams(field *schemapb.FieldSchema) string {
 	for _, param := range field.GetTypeParams() {
 		if param.Key == "tokenizer_params" {
-			params := map[string]interface{}{}
-			err := json.Unmarshal([]byte(param.GetValue()), &params)
-			if err != nil {
-				return nil, err
-			}
-			for key, param := range params {
-				bytes, err := json.Marshal(param)
-				if err != nil {
-					return nil, err
-				}
-				result[key] = string(bytes)
-			}
+			return param.Value
 		}
 	}
-	return result, nil
+	return "{}"
 }
 
 func NewBM25FunctionRunner(coll *schemapb.CollectionSchema, schema *schemapb.FunctionSchema) (*BM25FunctionRunner, error) {
@@ -72,7 +58,7 @@ func NewBM25FunctionRunner(coll *schemapb.CollectionSchema, schema *schemapb.Fun
 		schema:      schema,
 		concurrency: 8,
 	}
-	var params map[string]string
+	var params string
 	for _, field := range coll.GetFields() {
 		if field.GetFieldID() == schema.GetOutputFieldIds()[0] {
 			runner.outputField = field
@@ -80,11 +66,7 @@ func NewBM25FunctionRunner(coll *schemapb.CollectionSchema, schema *schemapb.Fun
 		}
 
 		if field.GetFieldID() == schema.GetInputFieldIds()[0] {
-			var err error
-			params, err = getTokenizerParams(field)
-			if err != nil {
-				return nil, err
-			}
+			params = getTokenizerParams(field)
 		}
 	}