create tokenizer by string
Signed-off-by: aoiasd <[email protected]>
aoiasd committed Oct 23, 2024
1 parent c21cbfd commit 37e9641
Showing 16 changed files with 87 additions and 123 deletions.
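In short: tokenizer parameters now travel as a single JSON-encoded string (a const char*) rather than a std::map<std::string, std::string>, from FieldMeta through the C API down into the Rust tantivy binding, which parses the JSON itself.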
14 changes: 2 additions & 12 deletions internal/core/src/common/FieldMeta.cpp
@@ -16,25 +16,15 @@
 #include <boost/lexical_cast.hpp>
 
 #include "Consts.h"
-#include "log/Log.h"
 
 namespace milvus {
 TokenizerParams
 ParseTokenizerParams(const TypeParams& params) {
     auto iter = params.find("tokenizer_params");
     if (iter == params.end()) {
-        return {};
+        return "{}";
     }
-    nlohmann::json j = nlohmann::json::parse(iter->second);
-    std::map<std::string, std::string> ret;
-    for (const auto& [k, v] : j.items()) {
-        try {
-            ret[k] = v.get<std::string>();
-        } catch (std::exception& e) {
-            ret[k] = v.dump();
-        }
-    }
-    return ret;
+    return iter->second.c_str();
 }
 
 bool
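The parsing that used to happen here now lives on the Rust side; this function just forwards the raw JSON, or "{}" when the key is absent. A minimal self-contained sketch of the new contract (the sample params and the main body are illustrative, not from the repo); note the returned pointer borrows from the TypeParams map, so it must not outlive it:

#include <cstdio>
#include <map>
#include <string>

using TypeParams = std::map<std::string, std::string>;
using TokenizerParams = const char*;  // was std::map<std::string, std::string>

TokenizerParams
ParseTokenizerParams(const TypeParams& params) {
    auto iter = params.find("tokenizer_params");
    if (iter == params.end()) {
        return "{}";  // empty JSON object replaces the old empty map
    }
    // Borrows from the map entry; valid only while `params` is alive.
    return iter->second.c_str();
}

int main() {
    TypeParams type_params = {
        {"tokenizer_params", R"({"analyzer": {"tokenizer": "standard"}})"}};
    std::printf("%s\n", ParseTokenizerParams(type_params));
    return 0;
}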
2 changes: 1 addition & 1 deletion internal/core/src/common/FieldMeta.h
@@ -25,7 +25,7 @@
 
 namespace milvus {
 using TypeParams = std::map<std::string, std::string>;
-using TokenizerParams = std::map<std::string, std::string>;
+using TokenizerParams = const char*;
 
 TokenizerParams
 ParseTokenizerParams(const TypeParams& params);
8 changes: 4 additions & 4 deletions internal/core/src/index/TextMatchIndex.cpp
@@ -22,7 +22,7 @@ constexpr const char* TMP_TEXT_LOG_PREFIX = "/tmp/milvus/text-log/";
 TextMatchIndex::TextMatchIndex(
     int64_t commit_interval_in_ms,
     const char* tokenizer_name,
-    const std::map<std::string, std::string>& tokenizer_params)
+    const char* tokenizer_params)
     : commit_interval_in_ms_(commit_interval_in_ms),
       last_commit_time_(stdclock::now()) {
     d_type_ = TantivyDataType::Text;
@@ -34,7 +34,7 @@ TextMatchIndex::TextMatchIndex(
 TextMatchIndex::TextMatchIndex(
     const std::string& path,
     const char* tokenizer_name,
-    const std::map<std::string, std::string>& tokenizer_params)
+    const char* tokenizer_params)
     : commit_interval_in_ms_(std::numeric_limits<int64_t>::max()),
       last_commit_time_(stdclock::now()) {
     path_ = path;
@@ -50,7 +50,7 @@ TextMatchIndex::TextMatchIndex(
 TextMatchIndex::TextMatchIndex(
     const storage::FileManagerContext& ctx,
     const char* tokenizer_name,
-    const std::map<std::string, std::string>& tokenizer_params)
+    const char* tokenizer_params)
     : commit_interval_in_ms_(std::numeric_limits<int64_t>::max()),
       last_commit_time_(stdclock::now()) {
     schema_ = ctx.fieldDataMeta.field_schema;
@@ -176,7 +176,7 @@ TextMatchIndex::CreateReader() {
 void
 TextMatchIndex::RegisterTokenizer(
     const char* tokenizer_name,
-    const std::map<std::string, std::string>& tokenizer_params) {
+    const char* tokenizer_params) {
     wrapper_->register_tokenizer(tokenizer_name, tokenizer_params);
 }
 
8 changes: 4 additions & 4 deletions internal/core/src/index/TextMatchIndex.h
@@ -25,17 +25,17 @@ class TextMatchIndex : public InvertedIndexTantivy<std::string> {
     explicit TextMatchIndex(
         int64_t commit_interval_in_ms,
         const char* tokenizer_name,
-        const std::map<std::string, std::string>& tokenizer_params);
+        const char* tokenizer_params);
     // for sealed segment.
     explicit TextMatchIndex(
         const std::string& path,
         const char* tokenizer_name,
-        const std::map<std::string, std::string>& tokenizer_params);
+        const char* tokenizer_params);
     // for building index.
     explicit TextMatchIndex(
         const storage::FileManagerContext& ctx,
         const char* tokenizer_name,
-        const std::map<std::string, std::string>& tokenizer_params);
+        const char* tokenizer_params);
     // for loading index
     explicit TextMatchIndex(const storage::FileManagerContext& ctx);
 
@@ -69,7 +69,7 @@ class TextMatchIndex : public InvertedIndexTantivy<std::string> {
     void
     RegisterTokenizer(
         const char* tokenizer_name,
-        const std::map<std::string, std::string>& tokenizer_params);
+        const char* tokenizer_params);
 
     TargetBitmap
     MatchQuery(const std::string& query);
5 changes: 2 additions & 3 deletions internal/core/src/segcore/tokenizer_c.cpp
@@ -20,10 +20,9 @@
 using Map = std::map<std::string, std::string>;
 
 CStatus
-create_tokenizer(CMap m, CTokenizer* tokenizer) {
+create_tokenizer(const char* params, CTokenizer* tokenizer) {
     try {
-        auto mm = reinterpret_cast<Map*>(m);
-        auto impl = std::make_unique<milvus::tantivy::Tokenizer>(*mm);
+        auto impl = std::make_unique<milvus::tantivy::Tokenizer>(params);
         *tokenizer = impl.release();
         return milvus::SuccessCStatus();
     } catch (std::exception& e) {
2 changes: 1 addition & 1 deletion internal/core/src/segcore/tokenizer_c.h
@@ -24,7 +24,7 @@ extern "C" {
 typedef void* CTokenizer;
 
 CStatus
-create_tokenizer(CMap m, CTokenizer* tokenizer);
+create_tokenizer(const char* params, CTokenizer* tokenizer);
 
 CStatus
 clone_tokenizer(CTokenizer* tokenizer, CTokenizer* rst);
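Callers of the C API now pass the JSON string directly instead of building a CMap first. A hedged usage sketch (it assumes CStatus reports success with error_code == 0, as elsewhere in the milvus C headers; the params literal is illustrative):

CTokenizer tokenizer = nullptr;
const char* params = R"({"analyzer": {"tokenizer": "standard"}})";
CStatus status = create_tokenizer(params, &tokenizer);
if (status.error_code != 0) {
    // creation failed: the JSON was rejected or malformed
}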
@@ -88,7 +88,9 @@ RustArray tantivy_regex_query(void *ptr, const char *pattern);
 
 RustArray tantivy_match_query(void *ptr, const char *query);
 
-void tantivy_register_tokenizer(void *ptr, const char *tokenizer_name, void *tokenizer_params);
+void tantivy_register_tokenizer(void *ptr,
+                                const char *tokenizer_name,
+                                const char *tokenizer_params);
 
 void *tantivy_create_index(const char *field_name,
                            TantivyDataType data_type,
@@ -142,7 +144,7 @@ void tantivy_index_add_multi_keywords(void *ptr,
 void *tantivy_create_text_writer(const char *field_name,
                                  const char *path,
                                  const char *tokenizer_name,
-                                 void *tokenizer_params,
+                                 const char *tokenizer_params,
                                  uintptr_t num_threads,
                                  uintptr_t overall_memory_budget_in_bytes,
                                  bool in_ram);
@@ -157,7 +159,7 @@ bool tantivy_token_stream_advance(void *token_stream);
 
 const char *tantivy_token_stream_get_token(void *token_stream);
 
-void *tantivy_create_tokenizer(void *tokenizer_params);
+void *tantivy_create_tokenizer(const char *tokenizer_params);
 
 void *tantivy_clone_tokenizer(void *ptr);
 
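At the lowest level the params now cross the FFI boundary as a plain C string. A sketch against the declarations above, using only functions visible in this excerpt (the null-pointer failure mode is taken from tokenizer.h further down):

const char* params = R"({"analyzer": {"tokenizer": "standard"}})";
void* tokenizer = tantivy_create_tokenizer(params);
if (tokenizer == nullptr) {
    // the Rust side rejected the JSON (not an object, or no "analyzer" key)
}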
@@ -1,8 +1,13 @@
-use std::{collections::HashMap, ffi::CStr};
+use std::{ffi::CStr};
 
 use libc::{c_char, c_void};
 
-use crate::{array::RustArray, index_reader::IndexReaderWrapper, tokenizer::create_tokenizer};
+use crate::{
+    array::RustArray,
+    string_c::c_str_to_str,
+    index_reader::IndexReaderWrapper,
+    tokenizer::create_tokenizer,
+};
 
 #[no_mangle]
 pub extern "C" fn tantivy_match_query(ptr: *mut c_void, query: *const c_char) -> RustArray {
@@ -18,13 +23,13 @@ pub extern "C" fn tantivy_match_query(ptr: *mut c_void, query: *const c_char) -> RustArray {
 pub extern "C" fn tantivy_register_tokenizer(
     ptr: *mut c_void,
     tokenizer_name: *const c_char,
-    tokenizer_params: *mut c_void,
+    tokenizer_params: *const c_char,
 ) {
     let real = ptr as *mut IndexReaderWrapper;
     let tokenizer_name_str = unsafe { CStr::from_ptr(tokenizer_name) };
     let analyzer = unsafe {
-        let m = tokenizer_params as *const HashMap<String, String>;
-        create_tokenizer(&(*m))
+        let params = c_str_to_str(tokenizer_params).to_string();
+        create_tokenizer(&params)
     };
     match analyzer {
         Some(text_analyzer) => unsafe {
@@ -6,13 +6,14 @@ use std::ffi::CStr;
 use crate::index_writer::IndexWriterWrapper;
 use crate::tokenizer::create_tokenizer;
 use crate::util::create_binding;
+use crate::string_c::c_str_to_str;
 
 #[no_mangle]
 pub extern "C" fn tantivy_create_text_writer(
     field_name: *const c_char,
     path: *const c_char,
     tokenizer_name: *const c_char,
-    tokenizer_params: *mut c_void,
+    tokenizer_params: *const c_char,
     num_threads: usize,
     overall_memory_budget_in_bytes: usize,
     in_ram: bool,
@@ -21,8 +22,8 @@ pub extern "C" fn tantivy_create_text_writer(
     let path_str = unsafe { CStr::from_ptr(path).to_str().unwrap() };
     let tokenizer_name_str = unsafe { CStr::from_ptr(tokenizer_name).to_str().unwrap() };
     let analyzer = unsafe {
-        let m = tokenizer_params as *const HashMap<String, String>;
-        create_tokenizer(&(*m))
+        let params = c_str_to_str(tokenizer_params).to_string();
+        create_tokenizer(&params)
     };
     match analyzer {
         Some(text_analyzer) => {
61 changes: 27 additions & 34 deletions internal/core/thirdparty/tantivy/tantivy-binding/src/tokenizer.rs
@@ -100,58 +100,51 @@ impl TantivyBuilder<'_>{
     }
 }
 
-pub(crate) fn create_tokenizer(params: &HashMap<String, String>) -> Option<TextAnalyzer> {
+pub(crate) fn create_tokenizer(params: &String) -> Option<TextAnalyzer> {
     init_log();
 
-    let analyzer_json_value = match params.get("analyzer"){
-        Some(value) => {
-            let json_analyzer = json::from_str::<json::Value>(value);
-            if json_analyzer.is_err() {
+    match json::from_str::<json::Value>(&params){
+        Ok(value) =>{
+            if !value.is_object(){
                 return None;
             }
-            let json_value = json_analyzer.unwrap();
-            if !json_value.is_object(){
-                return None
+            let json_params = value.as_object().unwrap();
+            // create builder
+            let analyzer_params=json_params.get("analyzer");
+            if analyzer_params.is_none() || !analyzer_params.unwrap().is_object(){
+                return None;
             }
-            json_value
-        }
-        None => json::Value::Object(json::Map::<String, json::Value>::new()),
-    };
-
-    let analyzer_params= analyzer_json_value.as_object().unwrap();
-    let mut builder = TantivyBuilder::new(analyzer_params);
-    let str_filter=params.get("filter");
-    if !str_filter.is_none(){
-        let json_filter = json::from_str::<json::Value>(str_filter.unwrap());
-        if json_filter.is_err(){
-            return None
-        }
+            let mut builder = TantivyBuilder::new(analyzer_params.unwrap().as_object().unwrap());
 
-        let filter_params = json_filter.unwrap();
-        if !filter_params.is_object(){
-            return None
-        }
+            // build custom filter
+            let filter_params=json_params.get("filter");
+            if !filter_params.is_none() && filter_params.unwrap().is_object(){
+                builder.add_costom_filters(filter_params.unwrap().as_object().unwrap());
+            }
 
-        builder.add_costom_filters(filter_params.as_object().unwrap());
-    }
-    builder.build()
+            // build analyzer
+            builder.build()
+        },
+        Err(_e) => None,
+    }
 }
 
 #[cfg(test)]
 mod tests {
-    use std::collections::HashMap;
     use crate::tokenizer::create_tokenizer;
 
     #[test]
     fn test_create_tokenizer() {
-        let mut params : HashMap<String, String> = HashMap::new();
-        let analyzer_params = r#"
+        let params = r#"
             {
-                "tokenizer": "jieba"
+                "analyzer":
+                {
                    "tokenizer": "standard",
                    "filter": [""],
+                },
             }"#;
 
-        params.insert("analyzer".to_string(), analyzer_params.to_string());
-        let tokenizer = create_tokenizer(&params);
+        let tokenizer = create_tokenizer(&params.to_string());
        assert!(tokenizer.is_some());
    }
 }
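As the rewritten create_tokenizer reads it, the params document is one JSON object: a required "analyzer" object (tokenizer name plus an optional list of filter names) and an optional top-level "filter" object defining custom filters referenced from the analyzer. A sketch of that fuller shape as a C++ literal ("my_filter" and its body are hypothetical; only the top-level layout comes from the code above):

// Only the top-level shape ("analyzer" required, "filter" optional) is
// taken from create_tokenizer; the custom-filter body is an assumption.
const char* params = R"({
    "analyzer": {"tokenizer": "standard", "filter": ["my_filter"]},
    "filter": {"my_filter": {"type": "stop"}}
})";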
@@ -1,18 +1,17 @@
-use std::collections::HashMap;
-
-use libc::c_void;
+use libc::{c_void,c_char};
 use tantivy::tokenizer::TextAnalyzer;
 
 use crate::{
+    string_c::c_str_to_str,
     tokenizer::create_tokenizer,
     util::{create_binding, free_binding},
 };
 
 #[no_mangle]
-pub extern "C" fn tantivy_create_tokenizer(tokenizer_params: *mut c_void) -> *mut c_void {
+pub extern "C" fn tantivy_create_tokenizer(tokenizer_params: *const c_char) -> *mut c_void {
     let analyzer = unsafe {
-        let m = tokenizer_params as *const HashMap<String, String>;
-        create_tokenizer(&(*m))
+        let params = c_str_to_str(tokenizer_params).to_string();
+        create_tokenizer(&params)
     };
     match analyzer {
         Some(text_analyzer) => create_binding(text_analyzer),
14 changes: 5 additions & 9 deletions internal/core/thirdparty/tantivy/tantivy-wrapper.h
@@ -14,7 +14,7 @@ namespace milvus::tantivy {
 using Map = std::map<std::string, std::string>;
 
 static constexpr const char* DEFAULT_TOKENIZER_NAME = "milvus_tokenizer";
-static Map DEFAULT_TOKENIZER_PARAMS = {};
+static const char* DEFAULT_TOKENIZER_PARAMS = "{}";
 static constexpr uintptr_t DEFAULT_NUM_THREADS = 4;
 static constexpr uintptr_t DEFAULT_OVERALL_MEMORY_BUDGET_IN_BYTES =
     DEFAULT_NUM_THREADS * 15 * 1024 * 1024;
@@ -101,17 +101,15 @@ struct TantivyIndexWrapper {
                        bool in_ram,
                        const char* path,
                        const char* tokenizer_name = DEFAULT_TOKENIZER_NAME,
-                       const std::map<std::string, std::string>&
+                       const char*
                            tokenizer_params = DEFAULT_TOKENIZER_PARAMS,
                        uintptr_t num_threads = DEFAULT_NUM_THREADS,
                        uintptr_t overall_memory_budget_in_bytes =
                            DEFAULT_OVERALL_MEMORY_BUDGET_IN_BYTES) {
-        RustHashMap m;
-        m.from(tokenizer_params);
         writer_ = tantivy_create_text_writer(field_name,
                                              path,
                                              tokenizer_name,
-                                             m.get_pointer(),
+                                             tokenizer_params,
                                              num_threads,
                                              overall_memory_budget_in_bytes,
                                              in_ram);
@@ -136,12 +134,10 @@ struct TantivyIndexWrapper {
     void
     register_tokenizer(
         const char* tokenizer_name,
-        const std::map<std::string, std::string>& tokenizer_params) {
-        RustHashMap m;
-        m.from(tokenizer_params);
+        const char* tokenizer_params) {
         if (reader_ != nullptr) {
             tantivy_register_tokenizer(
-                reader_, tokenizer_name, m.get_pointer());
+                reader_, tokenizer_name, tokenizer_params);
         }
     }
 
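With the RustHashMap marshalling gone, the wrapper hands the JSON string straight to the binding. A hedged sketch of the new call shape (the leading field_name parameter and all literal values are assumptions based on the signature above):

const char* params = R"({"analyzer": {"tokenizer": "standard"}})";
milvus::tantivy::TantivyIndexWrapper wrapper(
    "body", false, "/tmp/milvus/text-log/demo", "milvus_tokenizer", params);
// register_tokenizer only applies when a reader exists; it is a no-op otherwise.
wrapper.register_tokenizer("milvus_tokenizer", params);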
7 changes: 3 additions & 4 deletions internal/core/thirdparty/tantivy/tokenizer.h
@@ -11,10 +11,9 @@ struct Tokenizer {
  public:
    NO_COPY_OR_ASSIGN(Tokenizer);
 
-    explicit Tokenizer(const std::map<std::string, std::string>& params) {
-        RustHashMap m;
-        m.from(params);
-        ptr_ = tantivy_create_tokenizer(m.get_pointer());
+    explicit Tokenizer(std::string&& params) {
+        auto shared_params = std::make_shared<std::string>(std::move(params));
+        ptr_ = tantivy_create_tokenizer(shared_params->c_str());
         if (ptr_ == nullptr) {
             throw std::invalid_argument("invalid tokenizer parameters");
         }
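A usage sketch of the new constructor (the params literal is illustrative). The shared_ptr keeps the buffer alive across the FFI call; since the Rust side copies the string immediately (c_str_to_str(...).to_string() above), nothing dangles after the constructor returns:

// Throws std::invalid_argument when the Rust side rejects the JSON.
milvus::tantivy::Tokenizer tokenizer(
    R"({"analyzer": {"tokenizer": "standard"}})");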