support analyzer type
Signed-off-by: aoiasd <[email protected]>
aoiasd committed Nov 3, 2024
1 parent adad5f3 commit 25e8098
Showing 12 changed files with 128 additions and 86 deletions.
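
Read from the diff below (a hedged summary, not the author's wording): analyzer params can now name a built-in analyzer via a "type" key — currently "standard", with optional "stop_words" — and the outer "analyzer" wrapper is added internally by create_tokenizer. A minimal sketch of the resulting parameter shapes; not part of the commit, and the values are illustrative:

#[cfg(test)]
mod analyzer_type_sketch {
    use crate::tokenizer::create_tokenizer;

    // Sketch only (not part of this commit); parameter values are illustrative.
    #[test]
    fn built_in_and_custom_params() {
        // A "type" key selects a built-in analyzer; "stop_words" is optional.
        let built_in = r#"{"type": "standard", "stop_words": ["a", "an", "the"]}"#;
        assert!(create_tokenizer(&built_in.to_string()).is_ok());

        // Custom analyzers keep using a "tokenizer" key, as in the updated tests below.
        let custom = r#"{"tokenizer": "jieba"}"#;
        assert!(create_tokenizer(&custom.to_string()).is_ok());
    }
}
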
@@ -14,7 +14,7 @@ impl IndexReaderWrapper {
let mut tokenizer = self
.index
.tokenizer_for_field(self.field)
.unwrap_or(standard_analyzer())
.unwrap_or(standard_analyzer(vec![]))
.clone();
let mut token_stream = tokenizer.token_stream(q);
let mut terms: Vec<Term> = Vec::new();
@@ -29,10 +29,8 @@ pub extern "C" fn tantivy_register_tokenizer(
init_log();
let real = ptr as *mut IndexReaderWrapper;
let tokenizer_name_str = unsafe { CStr::from_ptr(tokenizer_name) };
let analyzer = unsafe {
let params = c_str_to_str(tokenizer_params).to_string();
create_tokenizer(&params)
};
let params = unsafe{c_str_to_str(tokenizer_params).to_string()};
let analyzer = create_tokenizer(&params);
match analyzer {
Ok(text_analyzer) => unsafe {
(*real).register_tokenizer(
@@ -41,7 +39,7 @@
);
},
Err(err) => {
panic!("create tokenizer failed with error: {}", err.to_string());
panic!("create tokenizer failed with error: {} param: {}", err.to_string(), params);
},
}
}
@@ -22,10 +22,8 @@ pub extern "C" fn tantivy_create_text_writer(
let field_name_str = unsafe { CStr::from_ptr(field_name).to_str().unwrap() };
let path_str = unsafe { CStr::from_ptr(path).to_str().unwrap() };
let tokenizer_name_str = unsafe { CStr::from_ptr(tokenizer_name).to_str().unwrap() };
let analyzer = unsafe {
let params = c_str_to_str(tokenizer_params).to_string();
create_tokenizer(&params)
};
let params = unsafe{c_str_to_str(tokenizer_params).to_string()};
let analyzer = create_tokenizer(&params);
match analyzer {
Ok(text_analyzer) => {
let wrapper = IndexWriterWrapper::create_text_writer(
@@ -40,7 +38,7 @@
create_binding(wrapper)
}
Err(err) => {
log::warn!("create tokenizer failed with error: {} param: {}", err.to_string(), unsafe{c_str_to_str(tokenizer_params).to_string()});
log::warn!("create tokenizer failed with error: {} param: {}", err.to_string(), params);
std::ptr::null_mut()
},
}
70 changes: 57 additions & 13 deletions internal/core/thirdparty/tantivy/tantivy-binding/src/tokenizer.rs
@@ -5,9 +5,20 @@ use serde_json as json;

use crate::tokenizer_filter::*;
use crate::error::TantivyError;
use crate::util::get_stop_words_list;

pub(crate) fn standard_analyzer() -> TextAnalyzer {
standard_builder().build()

// default build-in analyzer
pub(crate) fn standard_analyzer(stop_words: Vec<String>) -> TextAnalyzer {
let builder = standard_builder()
.filter(LowerCaser)
.filter(RemoveLongFilter::limit(40));

if stop_words.len() > 0{
return builder.filter(StopWordFilter::remove(stop_words)).build();
}

builder.build()
}

fn standard_builder() -> TextAnalyzerBuilder{
@@ -122,14 +133,44 @@ impl AnalyzerBuilder<'_>{
}
builder = builder.filter_dynamic(RemoveLongFilter::limit(value.as_u64().unwrap() as usize));
}
other => return Err(format!("unknown key of tokenizer option: {}", other).into()),
other => return Err(format!("unknown analyzer option key: {}", other).into()),
}
}
Ok(builder)
}

fn build_template(self, type_: &str)-> Result<TextAnalyzer, TantivyError>{
match type_{
"standard" => {
let value = self.params.get("stop_words");
match value{
Some(value)=>{
let str_list = get_stop_words_list(value)?;
Ok(standard_analyzer(str_list))
}
None => Ok(standard_analyzer(vec![]))
}
},
other_ => Err(format!("unknown build-in analyzer type: {}", other_).into())
}
}

fn build(mut self) -> Result<TextAnalyzer, TantivyError>{
let tokenizer_name = self.get_tokenizer_name()?;
// build base build-in analyzer
match self.params.get("type"){
Some(type_) =>{
if !type_.is_string(){
return Err(format!("analyzer type shoud be string").into())
}
return self.build_template(type_.as_str().unwrap());
},
None => {}
};

//build custom analyzer
let tokenizer_name = self.get_tokenizer_name()?;

// jieba analyzer can't add filter.
if tokenizer_name == "jieba"{
return Ok(tantivy_jieba::JiebaTokenizer{}.into());
}
@@ -142,15 +183,11 @@ impl AnalyzerBuilder<'_>{
}
}

pub(crate) fn create_tokenizer(params: &String) -> Result<TextAnalyzer, TantivyError> {
if params.len()==0{
return Ok(standard_analyzer());
}

pub(crate) fn create_tokenizer_with_filter(params: &String) -> Result<TextAnalyzer, TantivyError> {
match json::from_str::<json::Value>(&params){
Ok(value) =>{
if value.is_null(){
return Ok(standard_analyzer());
return Ok(standard_analyzer(vec![]));
}
if !value.is_object(){
return Err("tokenizer params should be a json map".into());
@@ -160,7 +197,7 @@ pub(crate) fn create_tokenizer(params: &String) -> Result<TextAnalyzer, TantivyE
// create builder
let analyzer_params=json_params.get("analyzer");
if analyzer_params.is_none(){
return Ok(standard_analyzer());
return Ok(standard_analyzer(vec![]));
}
if !analyzer_params.unwrap().is_object(){
return Err("analyzer params should be a json map".into());
Expand All @@ -180,21 +217,28 @@ pub(crate) fn create_tokenizer(params: &String) -> Result<TextAnalyzer, TantivyE
}
}

pub(crate) fn create_tokenizer(params: &String) -> Result<TextAnalyzer, TantivyError> {
if params.len()==0{
return Ok(standard_analyzer(vec![]));
}
create_tokenizer_with_filter(&format!("{{\"analyzer\":{}}}", params))
}

#[cfg(test)]
mod tests {
use crate::tokenizer::create_tokenizer;

#[test]
fn test_create_tokenizer() {
let params = r#"{"analyzer": {"tokenizer": "standard"}}"#;
let params = r#"{"tokenizer": "standard"}"#;

let tokenizer = create_tokenizer(&params.to_string());
assert!(tokenizer.is_ok());
}

#[test]
fn test_jieba_tokenizer() {
let params = r#"{"analyzer": {"tokenizer": "jieba"}}"#;
let params = r#"{"tokenizer": "jieba"}"#;

let tokenizer = create_tokenizer(&params.to_string());
assert!(tokenizer.is_ok());
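
For illustration (inferred from the hunks above, not stated in the commit): create_tokenizer now wraps bare params in an "analyzer" envelope before delegating to create_tokenizer_with_filter, so the two call shapes below are expected to behave the same, and empty params still fall back to the built-in standard analyzer (lowercasing plus removal of tokens longer than 40 characters, no stop words).

// Sketch only; equivalence inferred from create_tokenizer above, values illustrative.
let bare = r#"{"tokenizer": "standard"}"#.to_string();
let wrapped = r#"{"analyzer": {"tokenizer": "standard"}}"#.to_string();
assert!(create_tokenizer(&bare).is_ok());
assert!(create_tokenizer_with_filter(&wrapped).is_ok());

// Empty params: create_tokenizer short-circuits to standard_analyzer(vec![]).
assert!(create_tokenizer(&"".to_string()).is_ok());
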
@@ -11,14 +11,12 @@ use crate::{
#[no_mangle]
pub extern "C" fn tantivy_create_tokenizer(tokenizer_params: *const c_char) -> *mut c_void {
init_log();
let analyzer = unsafe {
let params = c_str_to_str(tokenizer_params).to_string();
create_tokenizer(&params)
};
let params = unsafe{c_str_to_str(tokenizer_params).to_string()};
let analyzer = create_tokenizer(&params);
match analyzer {
Ok(text_analyzer) => create_binding(text_analyzer),
Err(err) => {
log::warn!("create tokenizer failed with error: {}", err.to_string());
log::warn!("create tokenizer failed with error: {} param: {}", err.to_string(), params);
std::ptr::null_mut()
},
}
@@ -2,6 +2,7 @@ use tantivy::tokenizer::*;
use serde_json as json;

use crate::error::TantivyError;
use crate::util::*;

pub(crate) enum SystemFilter{
Invalid,
@@ -46,18 +47,10 @@ fn get_length_filter(params: &json::Map<String, json::Value>) -> Result<SystemFi

fn get_stop_filter(params: &json::Map<String, json::Value>)-> Result<SystemFilter, TantivyError>{
let value = params.get("stop_words");
if value.is_none() || !value.unwrap().is_array(){
return Err("stop_words should be array".into())
if value.is_none(){
return Err("stop filter stop_words can't be empty".into());
}

let stop_words= value.unwrap().as_array().unwrap();
let mut str_list = Vec::<String>::new();
for element in stop_words{
match element.as_str(){
Some(word) => str_list.push(word.to_string()),
None => return Err("stop words item should be string".into())
}
};
let str_list = get_stop_words_list(value.unwrap())?;
Ok(SystemFilter::Stop(StopWordFilter::remove(str_list)))
}

18 changes: 18 additions & 0 deletions internal/core/thirdparty/tantivy/tantivy-binding/src/util.rs
@@ -1,5 +1,7 @@
use std::ffi::c_void;
use std::ops::Bound;
use serde_json as json;
use crate::error::TantivyError;

use tantivy::{directory::MmapDirectory, Index};

@@ -28,3 +30,19 @@ pub fn free_binding<T>(ptr: *mut c_void) {
drop(Box::from_raw(real));
}
}

pub(crate) fn get_stop_words_list(value: &json::Value) -> Result<Vec<String>, TantivyError>{
if !value.is_array(){
return Err("stop_words should be array".into())
}

let stop_words= value.as_array().unwrap();
let mut str_list = Vec::<String>::new();
for element in stop_words{
match element.as_str(){
Some(word) => str_list.push(word.to_string()),
None => return Err("stop words item should be string".into())
}
};
Ok(str_list)
}
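
A short usage sketch of the new helper (not part of the commit; inputs are made up): get_stop_words_list accepts only a JSON array whose items are all strings.

// Sketch only: a JSON array of strings is accepted...
let ok = json::json!(["a", "an", "the"]);
assert!(get_stop_words_list(&ok).is_ok());

// ...while non-string items (or a non-array value) are rejected with an error.
let bad = json::json!(["a", 1]);
assert!(get_stop_words_list(&bad).is_err());
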
2 changes: 1 addition & 1 deletion tests/python_client/common/common_func.py
@@ -704,7 +704,7 @@ def gen_default_collection_schema(description=ct.default_desc, primary_field=ct.
def gen_all_datatype_collection_schema(description=ct.default_desc, primary_field=ct.default_int64_field_name,
auto_id=False, dim=ct.default_dim, enable_dynamic_field=True, **kwargs):
tokenizer_params = {
"analyzer":{"tokenizer": "standard"},
"tokenizer": "standard",
}
fields = [
gen_int64_field(),