Commit e9f469a
create tokenizer by string
Signed-off-by: aoiasd <[email protected]>
aoiasd committed Oct 23, 2024
1 parent c21cbfd commit e9f469a
Showing 17 changed files with 98 additions and 131 deletions.
14 changes: 2 additions & 12 deletions internal/core/src/common/FieldMeta.cpp
@@ -16,25 +16,15 @@
 #include <boost/lexical_cast.hpp>
 
 #include "Consts.h"
-#include "log/Log.h"
 
 namespace milvus {
 TokenizerParams
 ParseTokenizerParams(const TypeParams& params) {
     auto iter = params.find("tokenizer_params");
     if (iter == params.end()) {
-        return {};
+        return "{}";
     }
-    nlohmann::json j = nlohmann::json::parse(iter->second);
-    std::map<std::string, std::string> ret;
-    for (const auto& [k, v] : j.items()) {
-        try {
-            ret[k] = v.get<std::string>();
-        } catch (std::exception& e) {
-            ret[k] = v.dump();
-        }
-    }
-    return ret;
+    return iter->second.c_str();
 }
 
 bool
2 changes: 1 addition & 1 deletion internal/core/src/common/FieldMeta.h
@@ -25,7 +25,7 @@
 
 namespace milvus {
 using TypeParams = std::map<std::string, std::string>;
-using TokenizerParams = std::map<std::string, std::string>;
+using TokenizerParams = const char*;
 
 TokenizerParams
 ParseTokenizerParams(const TypeParams& params);
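
The net effect: ParseTokenizerParams stops flattening the JSON into a string map and forwards the raw text, with "{}" standing in for the old empty map. A minimal sketch of the new contract (the hypothetical name ParseTokenizerParamsSketch avoids clashing with the real function; note the returned pointer borrows from the map entry, so the TypeParams must outlive any use of it):

    #include <cassert>
    #include <map>
    #include <string>

    using TypeParams = std::map<std::string, std::string>;

    // Mirrors the new ParseTokenizerParams: pass the JSON through untouched.
    const char*
    ParseTokenizerParamsSketch(const TypeParams& params) {
        auto iter = params.find("tokenizer_params");
        if (iter == params.end()) {
            return "{}";  // empty JSON object instead of an empty map
        }
        return iter->second.c_str();  // raw JSON, parsed later in tantivy-binding
    }

    int main() {
        TypeParams type_params{
            {"tokenizer_params", R"({"analyzer": {"tokenizer": "jieba"}})"}};
        assert(std::string(ParseTokenizerParamsSketch(type_params)) ==
               R"({"analyzer": {"tokenizer": "jieba"}})");
        TypeParams empty;
        assert(std::string(ParseTokenizerParamsSketch(empty)) == "{}");
        return 0;
    }
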
8 changes: 4 additions & 4 deletions internal/core/src/index/TextMatchIndex.cpp
@@ -22,7 +22,7 @@ constexpr const char* TMP_TEXT_LOG_PREFIX = "/tmp/milvus/text-log/";
 TextMatchIndex::TextMatchIndex(
     int64_t commit_interval_in_ms,
     const char* tokenizer_name,
-    const std::map<std::string, std::string>& tokenizer_params)
+    const char* tokenizer_params)
     : commit_interval_in_ms_(commit_interval_in_ms),
       last_commit_time_(stdclock::now()) {
     d_type_ = TantivyDataType::Text;
@@ -34,7 +34,7 @@ TextMatchIndex::TextMatchIndex(
 TextMatchIndex::TextMatchIndex(
     const std::string& path,
     const char* tokenizer_name,
-    const std::map<std::string, std::string>& tokenizer_params)
+    const char* tokenizer_params)
     : commit_interval_in_ms_(std::numeric_limits<int64_t>::max()),
       last_commit_time_(stdclock::now()) {
     path_ = path;
@@ -50,7 +50,7 @@ TextMatchIndex::TextMatchIndex(
 TextMatchIndex::TextMatchIndex(
     const storage::FileManagerContext& ctx,
     const char* tokenizer_name,
-    const std::map<std::string, std::string>& tokenizer_params)
+    const char* tokenizer_params)
     : commit_interval_in_ms_(std::numeric_limits<int64_t>::max()),
       last_commit_time_(stdclock::now()) {
     schema_ = ctx.fieldDataMeta.field_schema;
@@ -176,7 +176,7 @@ TextMatchIndex::CreateReader() {
 void
 TextMatchIndex::RegisterTokenizer(
     const char* tokenizer_name,
-    const std::map<std::string, std::string>& tokenizer_params) {
+    const char* tokenizer_params) {
     wrapper_->register_tokenizer(tokenizer_name, tokenizer_params);
 }
8 changes: 4 additions & 4 deletions internal/core/src/index/TextMatchIndex.h
@@ -25,17 +25,17 @@ class TextMatchIndex : public InvertedIndexTantivy<std::string> {
     explicit TextMatchIndex(
         int64_t commit_interval_in_ms,
         const char* tokenizer_name,
-        const std::map<std::string, std::string>& tokenizer_params);
+        const char* tokenizer_params);
     // for sealed segment.
     explicit TextMatchIndex(
         const std::string& path,
         const char* tokenizer_name,
-        const std::map<std::string, std::string>& tokenizer_params);
+        const char* tokenizer_params);
     // for building index.
     explicit TextMatchIndex(
         const storage::FileManagerContext& ctx,
         const char* tokenizer_name,
-        const std::map<std::string, std::string>& tokenizer_params);
+        const char* tokenizer_params);
     // for loading index
     explicit TextMatchIndex(const storage::FileManagerContext& ctx);
 
@@ -69,7 +69,7 @@ class TextMatchIndex : public InvertedIndexTantivy<std::string> {
     void
     RegisterTokenizer(
         const char* tokenizer_name,
-        const std::map<std::string, std::string>& tokenizer_params);
+        const char* tokenizer_params);
 
     TargetBitmap
     MatchQuery(const std::string& query);
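
All three constructors and RegisterTokenizer now take the tokenizer parameters as one JSON C string, so a caller builds the string once and reuses it on the writer and reader sides. A hedged usage sketch of the growing-segment path (the milvus::index namespace, the interval, and the parameter values are assumptions for illustration, not taken from this diff):

    #include "index/TextMatchIndex.h"

    // Sketch only: a growing-segment index built with JSON tokenizer params.
    void BuildGrowingTextIndex() {
        const char* tokenizer_params =
            R"({"analyzer": {"tokenizer": "standard"}})";  // illustrative
        milvus::index::TextMatchIndex index(/*commit_interval_in_ms=*/1000,
                                            "standard",
                                            tokenizer_params);
        // The same JSON string is handed to the reader-side tokenizer registry.
        index.RegisterTokenizer("standard", tokenizer_params);
    }
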
5 changes: 2 additions & 3 deletions internal/core/src/segcore/tokenizer_c.cpp
@@ -20,10 +20,9 @@
 using Map = std::map<std::string, std::string>;
 
 CStatus
-create_tokenizer(CMap m, CTokenizer* tokenizer) {
+create_tokenizer(const char* params, CTokenizer* tokenizer) {
     try {
-        auto mm = reinterpret_cast<Map*>(m);
-        auto impl = std::make_unique<milvus::tantivy::Tokenizer>(*mm);
+        auto impl = std::make_unique<milvus::tantivy::Tokenizer>(params);
         *tokenizer = impl.release();
         return milvus::SuccessCStatus();
     } catch (std::exception& e) {
2 changes: 1 addition & 1 deletion internal/core/src/segcore/tokenizer_c.h
@@ -24,7 +24,7 @@ extern "C" {
 typedef void* CTokenizer;
 
 CStatus
-create_tokenizer(CMap m, CTokenizer* tokenizer);
+create_tokenizer(const char* params, CTokenizer* tokenizer);
 
 CStatus
 clone_tokenizer(CTokenizer* tokenizer, CTokenizer* rst);
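
The C wrapper loses its CMap argument in the same way. A minimal caller sketch (the error_code check assumes milvus' usual CStatus convention; freeing the tokenizer is omitted because the free function sits outside this diff):

    #include "segcore/tokenizer_c.h"

    // Sketch: create a tokenizer from a JSON params string via the C API.
    bool TryCreateTokenizer(CTokenizer* out) {
        const char* params = R"({"analyzer": {"tokenizer": "jieba"}})";
        CStatus status = create_tokenizer(params, out);
        return status.error_code == 0;  // 0 == success, assumed convention
    }
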
@@ -88,7 +88,9 @@ RustArray tantivy_regex_query(void *ptr, const char *pattern);
 
 RustArray tantivy_match_query(void *ptr, const char *query);
 
-void tantivy_register_tokenizer(void *ptr, const char *tokenizer_name, void *tokenizer_params);
+void tantivy_register_tokenizer(void *ptr,
+                                const char *tokenizer_name,
+                                const char *tokenizer_params);
 
 void *tantivy_create_index(const char *field_name,
                            TantivyDataType data_type,
@@ -142,7 +144,7 @@ void tantivy_index_add_multi_keywords(void *ptr,
 void *tantivy_create_text_writer(const char *field_name,
                                  const char *path,
                                  const char *tokenizer_name,
-                                 void *tokenizer_params,
+                                 const char *tokenizer_params,
                                  uintptr_t num_threads,
                                  uintptr_t overall_memory_budget_in_bytes,
                                  bool in_ram);
@@ -157,7 +159,7 @@ bool tantivy_token_stream_advance(void *token_stream);
 
 const char *tantivy_token_stream_get_token(void *token_stream);
 
-void *tantivy_create_tokenizer(void *tokenizer_params);
+void *tantivy_create_tokenizer(const char *tokenizer_params);
 
 void *tantivy_clone_tokenizer(void *ptr);
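
At the lowest FFI layer the parameters now cross as a plain C string rather than a void* to a std::map. A sketch using only the declarations above (the header name is assumed from the tantivy-binding crate path shown later in this commit; cleanup calls are omitted because they are outside this hunk; a null return means the Rust side could not build an analyzer from the JSON):

    #include "tantivy-binding.h"  // assumed header name

    // Sketch: build a tokenizer and register it on an index reader.
    bool RegisterStandardTokenizer(void* index_reader) {
        const char* params = R"({"analyzer": {"tokenizer": "standard"}})";
        void* tokenizer = tantivy_create_tokenizer(params);
        if (tokenizer == nullptr) {
            return false;  // Rust side returned None for this JSON
        }
        tantivy_register_tokenizer(index_reader, "standard", params);
        return true;
    }
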
@@ -4,7 +4,7 @@ use tantivy::{
     Term,
 };
 
-use crate::{index_reader::IndexReaderWrapper, tokenizer::default_tokenizer};
+use crate::{index_reader::IndexReaderWrapper, tokenizer::standard_tokenizer};
 
 impl IndexReaderWrapper {
     // split the query string into multiple tokens using index's default tokenizer,
@@ -14,7 +14,7 @@ impl IndexReaderWrapper {
         let mut tokenizer = self
             .index
             .tokenizer_for_field(self.field)
-            .unwrap_or(default_tokenizer())
+            .unwrap_or(standard_tokenizer())
             .clone();
         let mut token_stream = tokenizer.token_stream(q);
         let mut terms: Vec<Term> = Vec::new();
@@ -1,8 +1,13 @@
-use std::{collections::HashMap, ffi::CStr};
+use std::{ffi::CStr};
 
 use libc::{c_char, c_void};
 
-use crate::{array::RustArray, index_reader::IndexReaderWrapper, tokenizer::create_tokenizer};
+use crate::{
+    array::RustArray,
+    string_c::c_str_to_str,
+    index_reader::IndexReaderWrapper,
+    tokenizer::create_tokenizer,
+};
 
 #[no_mangle]
 pub extern "C" fn tantivy_match_query(ptr: *mut c_void, query: *const c_char) -> RustArray {
@@ -18,13 +23,13 @@ pub extern "C" fn tantivy_match_query(ptr: *mut c_void, query: *const c_char) ->
 pub extern "C" fn tantivy_register_tokenizer(
     ptr: *mut c_void,
     tokenizer_name: *const c_char,
-    tokenizer_params: *mut c_void,
+    tokenizer_params: *const c_char,
 ) {
     let real = ptr as *mut IndexReaderWrapper;
     let tokenizer_name_str = unsafe { CStr::from_ptr(tokenizer_name) };
     let analyzer = unsafe {
-        let m = tokenizer_params as *const HashMap<String, String>;
-        create_tokenizer(&(*m))
+        let params = c_str_to_str(tokenizer_params).to_string();
+        create_tokenizer(&params)
     };
     match analyzer {
         Some(text_analyzer) => unsafe {
@@ -6,13 +6,14 @@ use std::ffi::CStr;
 use crate::index_writer::IndexWriterWrapper;
 use crate::tokenizer::create_tokenizer;
 use crate::util::create_binding;
+use crate::string_c::c_str_to_str;
 
 #[no_mangle]
 pub extern "C" fn tantivy_create_text_writer(
     field_name: *const c_char,
     path: *const c_char,
     tokenizer_name: *const c_char,
-    tokenizer_params: *mut c_void,
+    tokenizer_params: *const c_char,
     num_threads: usize,
     overall_memory_budget_in_bytes: usize,
     in_ram: bool,
@@ -21,8 +22,8 @@ pub extern "C" fn tantivy_create_text_writer(
     let path_str = unsafe { CStr::from_ptr(path).to_str().unwrap() };
     let tokenizer_name_str = unsafe { CStr::from_ptr(tokenizer_name).to_str().unwrap() };
     let analyzer = unsafe {
-        let m = tokenizer_params as *const HashMap<String, String>;
-        create_tokenizer(&(*m))
+        let params = c_str_to_str(tokenizer_params).to_string();
+        create_tokenizer(&params)
     };
     match analyzer {
         Some(text_analyzer) => {
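
Seen from C++, writer creation passes the JSON through unchanged. A hedged sketch with illustrative values (field name, path, and memory budget are made up; the header name is assumed as above):

    #include "tantivy-binding.h"  // assumed header name

    // Sketch: create a tantivy text writer with JSON tokenizer params.
    void* CreateDemoTextWriter() {
        return tantivy_create_text_writer(
            /*field_name=*/"text",
            /*path=*/"/tmp/milvus/text-log/demo",        // illustrative path
            /*tokenizer_name=*/"standard",
            /*tokenizer_params=*/R"({"analyzer": {"tokenizer": "standard"}})",
            /*num_threads=*/1,
            /*overall_memory_budget_in_bytes=*/64 << 20,  // illustrative budget
            /*in_ram=*/false);
    }
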
76 changes: 36 additions & 40 deletions internal/core/thirdparty/tantivy/tantivy-binding/src/tokenizer.rs
@@ -7,12 +7,8 @@ use serde_json as json;
 use crate::tokenizer_filter::*;
 use crate::log::init_log;
 
-lazy_static! {
-    static ref DEFAULT_TOKENIZER_MANAGER: TokenizerManager = TokenizerManager::default();
-}
-
-pub(crate) fn default_tokenizer() -> TextAnalyzer {
-    DEFAULT_TOKENIZER_MANAGER.get("default").unwrap()
+pub(crate) fn standard_tokenizer() -> TextAnalyzer {
+    TextAnalyzer::builder(SimpleTokenizer::default()).build()
 }
 
 struct TantivyBuilder<'a>{
@@ -89,6 +85,7 @@ impl TantivyBuilder<'_>{
             }
             Some(builder.build())
         }
+        // TODO support jieba filter and use same builder with standard.
         "jieba" => {
             Some(tantivy_jieba::JiebaTokenizer {}.into())
         }
@@ -100,58 +97,57 @@ impl TantivyBuilder<'_>{
     }
 }
 
-pub(crate) fn create_tokenizer(params: &HashMap<String, String>) -> Option<TextAnalyzer> {
+pub(crate) fn create_tokenizer(params: &String) -> Option<TextAnalyzer> {
     init_log();
 
-    let analyzer_json_value = match params.get("analyzer"){
-        Some(value) => {
-            let json_analyzer = json::from_str::<json::Value>(value);
-            if json_analyzer.is_err() {
+    match json::from_str::<json::Value>(&params){
+        Ok(value) =>{
+            if value.is_null(){
+                return Some(standard_tokenizer());
+            }
+            if !value.is_object(){
                 return None;
             }
-            let json_value = json_analyzer.unwrap();
-            if !json_value.is_object(){
-                return None
+            let json_params = value.as_object().unwrap();
+            // create builder
+            let analyzer_params = json_params.get("analyzer");
+            if analyzer_params.is_none(){
+                return Some(standard_tokenizer());
            }
-            json_value
-        }
-        None => json::Value::Object(json::Map::<String, json::Value>::new()),
-    };
-
-    let analyzer_params= analyzer_json_value.as_object().unwrap();
-    let mut builder = TantivyBuilder::new(analyzer_params);
-    let str_filter=params.get("filter");
-    if !str_filter.is_none(){
-        let json_filter = json::from_str::<json::Value>(str_filter.unwrap());
-        if json_filter.is_err(){
-            return None
-        }
+            if !analyzer_params.unwrap().is_object(){
+                return None;
+            }
+            let mut builder = TantivyBuilder::new(analyzer_params.unwrap().as_object().unwrap());
 
-        let filter_params = json_filter.unwrap();
-        if !filter_params.is_object(){
-            return None
-        }
+            // build custom filter
+            let filter_params = json_params.get("filter");
+            if !filter_params.is_none() && filter_params.unwrap().is_object(){
+                builder.add_costom_filters(filter_params.unwrap().as_object().unwrap());
+            }
 
-        builder.add_costom_filters(filter_params.as_object().unwrap());
-    }
-    builder.build()
+            // build analyzer
+            builder.build()
+        },
+        Err(_e) => None,
+    }
 }
 
 #[cfg(test)]
 mod tests {
-    use std::collections::HashMap;
     use crate::tokenizer::create_tokenizer;
 
     #[test]
    fn test_create_tokenizer() {
-        let mut params : HashMap<String, String> = HashMap::new();
-        let analyzer_params = r#"
+        let params = r#"
             {
-                "tokenizer": "jieba"
+                "analyzer":
+                {
+                    "tokenizer": "standard",
+                    "filter": [""]
+                }
             }"#;
 
-        params.insert("analyzer".to_string(), analyzer_params.to_string());
-        let tokenizer = create_tokenizer(&params);
+        let tokenizer = create_tokenizer(&params.to_string());
         assert!(tokenizer.is_some());
     }
 }
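
Tracing the new control flow above: JSON null or a missing "analyzer" key falls back to the standard tokenizer, a document or "analyzer" value that is not an object yields None, and an optional top-level "filter" object feeds add_costom_filters before the analyzer is built. The accepted shapes, written out as the C strings a caller would pass (outcomes derived from the code above; a sketch, not exhaustive):

    const char* kNullParams = "null";  // -> Some(standard tokenizer)
    const char* kNoAnalyzer = "{}";    // -> Some(standard tokenizer); FieldMeta's default
    const char* kJieba =
        R"({"analyzer": {"tokenizer": "jieba"}})";  // -> Some(jieba analyzer)
    const char* kNotAnObject = "[1, 2, 3]";         // -> None
    const char* kBadJson = "not json at all";       // -> None (serde parse error)
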
@@ -1,18 +1,17 @@
-use std::collections::HashMap;
-
-use libc::c_void;
+use libc::{c_void,c_char};
 use tantivy::tokenizer::TextAnalyzer;
 
 use crate::{
+    string_c::c_str_to_str,
     tokenizer::create_tokenizer,
     util::{create_binding, free_binding},
 };
 
 #[no_mangle]
-pub extern "C" fn tantivy_create_tokenizer(tokenizer_params: *mut c_void) -> *mut c_void {
+pub extern "C" fn tantivy_create_tokenizer(tokenizer_params: *const c_char) -> *mut c_void {
     let analyzer = unsafe {
-        let m = tokenizer_params as *const HashMap<String, String>;
-        create_tokenizer(&(*m))
+        let params = c_str_to_str(tokenizer_params).to_string();
+        create_tokenizer(&params)
     };
     match analyzer {
         Some(text_analyzer) => create_binding(text_analyzer),