Commit 692a3cc
Tokenizer support build with params and clone for concurrency
Signed-off-by: aoiasd <[email protected]>
aoiasd committed Oct 22, 2024
1 parent 50607a5 commit 692a3cc
Showing 13 changed files with 360 additions and 20 deletions.
12 changes: 12 additions & 0 deletions internal/core/src/segcore/tokenizer_c.cpp
@@ -10,6 +10,7 @@
 // or implied. See the License for the specific language governing permissions and limitations under the License

 #include "segcore/tokenizer_c.h"
+#include <memory>
 #include "common/FieldMeta.h"
 #include "common/protobuf_utils.h"
 #include "pb/schema.pb.h"
@@ -30,6 +31,17 @@ create_tokenizer(CMap m, CTokenizer* tokenizer) {
     }
 }

+CStatus
+clone_tokenizer(CTokenizer* tokenizer, CTokenizer* rst) {
+    try {
+        auto impl = reinterpret_cast<milvus::tantivy::Tokenizer*>(*tokenizer);
+        *rst = impl->Clone().release();
+        return milvus::SuccessCStatus();
+    } catch (std::exception& e) {
+        return milvus::FailureCStatus(&e);
+    }
+}
+
 void
 free_tokenizer(CTokenizer tokenizer) {
     auto impl = reinterpret_cast<milvus::tantivy::Tokenizer*>(tokenizer);
3 changes: 3 additions & 0 deletions internal/core/src/segcore/tokenizer_c.h
@@ -26,6 +26,9 @@ typedef void* CTokenizer;
 CStatus
 create_tokenizer(CMap m, CTokenizer* tokenizer);

+CStatus
+clone_tokenizer(CTokenizer* tokenizer, CTokenizer* rst);
+
 void
 free_tokenizer(CTokenizer tokenizer);
6 changes: 4 additions & 2 deletions internal/core/thirdparty/tantivy/tantivy-binding/Cargo.lock

Some generated files are not rendered by default.

1 change: 1 addition & 0 deletions internal/core/thirdparty/tantivy/tantivy-binding/Cargo.toml

@@ -15,6 +15,7 @@ env_logger = "0.11.3"
 log = "0.4.21"
 tantivy-jieba = "0.10.0"
 lazy_static = "1.4.0"
+serde_json = "1.0.128"

 [build-dependencies]
 cbindgen = "0.26.0"
2 changes: 2 additions & 0 deletions internal/core/thirdparty/tantivy/tantivy-binding/include/tantivy-binding.h

@@ -159,6 +159,8 @@ const char *tantivy_token_stream_get_token(void *token_stream);

 void *tantivy_create_tokenizer(void *tokenizer_params);

+void *tantivy_clone_tokenizer(void *ptr);
+
 void tantivy_free_tokenizer(void *tokenizer);

 bool tantivy_index_exist(const char *path);
1 change: 1 addition & 0 deletions internal/core/thirdparty/tantivy/tantivy-binding/src/lib.rs

@@ -15,6 +15,7 @@ mod log;
 mod string_c;
 mod token_stream_c;
 mod tokenizer;
+mod tokenizer_filter;
 mod tokenizer_c;
 mod util;
 mod util_c;
133 changes: 118 additions & 15 deletions internal/core/thirdparty/tantivy/tantivy-binding/src/tokenizer.rs
@@ -1,7 +1,10 @@
 use lazy_static::lazy_static;
-use log::{info, warn};
+use log::warn;
 use std::collections::HashMap;
-use tantivy::tokenizer::{TextAnalyzer, TokenizerManager};
+use tantivy::tokenizer::*;
+use serde_json::{self as json, value};

+use crate::tokenizer_filter::*;
 use crate::log::init_log;

 lazy_static! {
@@ -12,32 +15,128 @@
 pub(crate) fn default_tokenizer() -> TextAnalyzer {
     DEFAULT_TOKENIZER_MANAGER.get("default").unwrap()
 }

-fn jieba_tokenizer() -> TextAnalyzer {
-    tantivy_jieba::JiebaTokenizer {}.into()
-}
-
-pub(crate) fn create_tokenizer(params: &HashMap<String, String>) -> Option<TextAnalyzer> {
-    init_log();
-
-    match params.get("tokenizer") {
-        Some(tokenizer_name) => match tokenizer_name.as_str() {
-            "default" => {
-                Some(default_tokenizer())
-            }
-            "jieba" => {
-                Some(jieba_tokenizer())
-            }
-            s => {
-                warn!("unsupported tokenizer: {}", s);
-                None
-            }
-        },
-        None => {
-            Some(default_tokenizer())
-        }
-    }
-}
+struct TantivyBuilder<'a>{
+    // builder: TextAnalyzerBuilder
+    filters:HashMap<String, SystemFilter>,
+    params:&'a json::Map<String, json::Value>
+}
+
+impl TantivyBuilder<'_>{
+    fn new(params: &json::Map<String, json::Value>) -> TantivyBuilder{
+        TantivyBuilder{
+            filters: HashMap::new(),
+            params:params,
+        }
+    }
+
+    fn add_costom_filter(&mut self, name: &String, params: &json::Map<String, json::Value>){
+        match SystemFilter::try_from(params){
+            Ok(filter) => {self.filters.insert(name.to_string(), filter);},
+            Err(_e) => {},
+        };
+    }
+
+    fn add_costom_filters(&mut self, params:&json::Map<String, json::Value>){
+        for (name, value) in params{
+            if !value.is_object(){
+                continue;
+            }
+
+            self.add_costom_filter(name, value.as_object().unwrap());
+        }
+    }
+
+    fn build(mut self) -> Option<TextAnalyzer>{
+        let tokenizer=self.params.get("tokenizer");
+        if !tokenizer.is_none() && !tokenizer.unwrap().is_string(){
+            return None;
+        }
+
+        let tokenizer_name = {
+            if !tokenizer.is_none(){
+                tokenizer.unwrap().as_str().unwrap()
+            }else{
+                "standard"
+            }
+        };
+
+        match tokenizer_name {
+            "standard" => {
+                let mut builder = TextAnalyzer::builder(SimpleTokenizer::default()).dynamic();
+                let filters= self.params.get("filter");
+                if !filters.is_none() && filters.unwrap().is_array(){
+                    for filter in filters.unwrap().as_array().unwrap(){
+                        if filter.is_string(){
+                            let filter_name = filter.as_str().unwrap();
+                            let costum = self.filters.remove(filter_name);
+                            if !costum.is_none(){
+                                builder = costum.unwrap().transform(builder);
+                                continue;
+                            }
+                            // check if filter was system filter
+                            let system = SystemFilter::from(filter_name);
+                            match system {
+                                SystemFilter::Invalid => {
+                                    log::warn!("build analyzer failed, filter not found :{}", filter_name);
+                                    return None
+                                }
+                                other => {
+                                    builder = other.transform(builder);
+                                },
+                            }
+                        }
+                    }
+                }
+                Some(builder.build())
+            }
+            "jieba" => {
+                Some(tantivy_jieba::JiebaTokenizer {}.into())
+            }
+            s => {
+                warn!("unsupported tokenizer: {}", s);
+                None
+            }
+        }
+    }
+}
+
+pub(crate) fn create_tokenizer(params: &HashMap<String, String>) -> Option<TextAnalyzer> {
+    init_log();
+
+    let analyzer_json_value = match params.get("analyzer"){
+        Some(value) => {
+            let json_analyzer = json::from_str::<json::Value>(value);
+            if json_analyzer.is_err() {
+                return None;
+            }
+            let json_value = json_analyzer.unwrap();
+            if !json_value.is_object(){
+                return None
+            }
+            json_value
+        }
+        None => json::Value::Object(json::Map::<String, json::Value>::new()),
+    };
+
+    let analyzer_params= analyzer_json_value.as_object().unwrap();
+    let mut builder = TantivyBuilder::new(analyzer_params);
+    let str_filter=params.get("filter");
+    if !str_filter.is_none(){
+        let json_filter = json::from_str::<json::Value>(str_filter.unwrap());
+        if json_filter.is_err(){
+            return None
+        }
+
+        let filter_params = json_filter.unwrap();
+        if !filter_params.is_object(){
+            return None
+        }
+
+        builder.add_costom_filters(filter_params.as_object().unwrap());
+    }
+    builder.build()
+}

 #[cfg(test)]
 mod tests {
     use std::collections::HashMap;
@@ -46,8 +145,12 @@ mod tests {
     #[test]
     fn test_create_tokenizer() {
         let mut params : HashMap<String, String> = HashMap::new();
-        params.insert("tokenizer".parse().unwrap(), "jieba".parse().unwrap());
+        let analyzer_params = r#"
+            {
+                "tokenizer": "jieba"
+            }"#;
+
+        params.insert("analyzer".to_string(), analyzer_params.to_string());
         let tokenizer = create_tokenizer(&params);
         assert!(tokenizer.is_some());
     }
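
For orientation (not part of the diff): a minimal sketch of how a caller inside this crate might exercise the new params format, combining a built-in filter with a custom one. When no "tokenizer" key is given, build() falls back to "standard". The built-in name "lowercase" and the "type"/"stop_words" shape of the custom filter definition are assumptions about what SystemFilter::from / try_from accept, since src/tokenizer_filter.rs is not rendered on this page.

    use std::collections::HashMap;

    fn example() {
        let mut params: HashMap<String, String> = HashMap::new();
        // "analyzer": tokenizer choice plus an ordered filter chain,
        // referencing filters by name (consumed by TantivyBuilder::build).
        params.insert(
            "analyzer".to_string(),
            r#"{"tokenizer": "standard", "filter": ["lowercase", "my_stop"]}"#.to_string(),
        );
        // "filter": custom filter definitions parsed by add_costom_filters;
        // each value must be a JSON object. The exact fields are assumed here.
        params.insert(
            "filter".to_string(),
            r#"{"my_stop": {"type": "stop", "stop_words": ["a", "an", "the"]}}"#.to_string(),
        );
        let analyzer = create_tokenizer(&params);
        assert!(analyzer.is_some());
    }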
7 changes: 7 additions & 0 deletions internal/core/thirdparty/tantivy/tantivy-binding/src/tokenizer_c.rs

@@ -20,6 +20,13 @@ pub extern "C" fn tantivy_create_tokenizer(tokenizer_params: *mut c_void) -> *mut c_void {
     }
 }

+#[no_mangle]
+pub extern "C" fn tantivy_clone_tokenizer(ptr: *mut c_void) -> *mut c_void {
+    let analyzer=ptr as *mut TextAnalyzer;
+    let clone = unsafe {(*analyzer).clone()};
+    create_binding(clone)
+}
+
 #[no_mangle]
 pub extern "C" fn tantivy_free_tokenizer(tokenizer: *mut c_void) {
     free_binding::<TextAnalyzer>(tokenizer);
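
Why the clone hook matters (a sketch, not code from this commit): tantivy's TextAnalyzer::token_stream takes &mut self, so concurrent callers cannot share one analyzer; the intended pattern is one clone per worker, which is what clone_tokenizer exposes across the FFI boundary. The parallel caller below is an assumed usage example.

    use std::thread;
    use tantivy::tokenizer::TextAnalyzer;

    // Assumed caller: tokenize several documents in parallel,
    // giving each thread its own clone of the analyzer.
    fn tokenize_in_parallel(analyzer: &TextAnalyzer, docs: Vec<String>) {
        let handles: Vec<_> = docs
            .into_iter()
            .map(|doc| {
                let mut local = analyzer.clone(); // one analyzer per thread
                thread::spawn(move || {
                    let mut stream = local.token_stream(&doc);
                    while let Some(token) = stream.next() {
                        println!("{}", token.text);
                    }
                })
            })
            .collect();
        for handle in handles {
            handle.join().unwrap();
        }
    }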
(The diff for the remaining changed files, including the new src/tokenizer_filter.rs, is not rendered here.)
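
Since src/tokenizer_filter.rs is among the files not rendered, the following is only a sketch of the interface implied by the calls in tokenizer.rs above: the SystemFilter type, its Invalid variant, the From<&str> and TryFrom<&json::Map<...>> conversions, and transform are visible in the diff; everything else, including the lowercase example, is a guess.

    use serde_json as json;
    use tantivy::tokenizer::{LowerCaser, TextAnalyzerBuilder};

    pub(crate) enum SystemFilter {
        Invalid,
        LowerCase(LowerCaser),
        // ... further built-in / parameterized filters
    }

    impl SystemFilter {
        // Wraps the dynamic builder with this filter; used as
        // `builder = filter.transform(builder)` in TantivyBuilder::build.
        pub(crate) fn transform(self, builder: TextAnalyzerBuilder) -> TextAnalyzerBuilder {
            match self {
                Self::LowerCase(filter) => builder.filter_dynamic(filter),
                _ => builder,
            }
        }
    }

    // Built-in filters are resolved from a bare name...
    impl From<&str> for SystemFilter {
        fn from(name: &str) -> Self {
            match name {
                "lowercase" => Self::LowerCase(LowerCaser),
                _ => Self::Invalid,
            }
        }
    }

    // ...while custom filters are parsed from a JSON object definition.
    impl TryFrom<&json::Map<String, json::Value>> for SystemFilter {
        type Error = String;

        fn try_from(params: &json::Map<String, json::Value>) -> Result<Self, Self::Error> {
            match params.get("type").and_then(|v| v.as_str()) {
                Some("lowercase") => Ok(Self::LowerCase(LowerCaser)),
                _ => Err("unsupported filter type".to_string()),
            }
        }
    }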
