Add Chinese and English analyzers with refactored Jieba tokenizer
Signed-off-by: aoiasd <[email protected]>
aoiasd committed Nov 12, 2024
1 parent 2630717 commit fce7377
Showing 11 changed files with 249 additions and 57 deletions.
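For orientation, the sketch below (not part of the commit) shows the analyzer parameters this change enables, mirroring the tests at the bottom of the tokenizer.rs diff. It assumes the crate-internal create_tokenizer(&String) entry point shown there; the exact set of accepted keys is defined by the code that follows.

// Sketch only, crate-internal (e.g. inside a #[cfg(test)] module of tantivy-binding).
use crate::tokenizer::create_tokenizer;

fn analyzer_params_overview() {
    // Built-in analyzers are selected via "type"; "chinese" and "english" are new.
    let standard = r#"{"type": "standard", "stop_words": ["_english_"]}"#;
    let chinese = r#"{"type": "chinese"}"#;
    let english = r#"{"type": "english"}"#;
    // Custom analyzers are selected via "tokenizer"; "jieba" now uses the
    // refactored JiebaTokenizer and is built like any other tokenizer name.
    let jieba = r#"{"tokenizer": "jieba"}"#;

    for params in [standard, chinese, english, jieba] {
        // Each call is expected to yield a tantivy TextAnalyzer.
        assert!(create_tokenizer(&params.to_string()).is_ok());
    }
}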
32 changes: 11 additions & 21 deletions internal/core/thirdparty/tantivy/tantivy-binding/Cargo.lock

Some generated files are not rendered by default.

3 changes: 2 additions & 1 deletion internal/core/thirdparty/tantivy/tantivy-binding/Cargo.toml
@@ -13,9 +13,10 @@ scopeguard = "1.2"
zstd-sys = "=2.0.9"
env_logger = "0.11.3"
log = "0.4.21"
tantivy-jieba = "0.10.0"
lazy_static = "1.4.0"
serde_json = "1.0.128"
jieba-rs = "0.6.8"
regex = "1.11.1"

[build-dependencies]
cbindgen = "0.26.0"
83 changes: 83 additions & 0 deletions internal/core/thirdparty/tantivy/tantivy-binding/src/jieba_tokenizer.rs
@@ -0,0 +1,83 @@
use jieba_rs;
use tantivy::tokenizer::{Token, TokenStream, Tokenizer};
use lazy_static::lazy_static;

lazy_static! {
    static ref JIEBA: jieba_rs::Jieba = jieba_rs::Jieba::new();
}

#[derive(Clone)]
pub enum JiebaMode {
    Exact,
    Search,
}

#[derive(Clone)]
pub struct JiebaTokenizer{
    mode: JiebaMode,
    hmm: bool,
}

pub struct JiebaTokenStream {
    tokens: Vec<Token>,
    index: usize,
}

impl TokenStream for JiebaTokenStream {
    fn advance(&mut self) -> bool {
        if self.index < self.tokens.len() {
            self.index += 1;
            true
        } else {
            false
        }
    }

    fn token(&self) -> &Token {
        &self.tokens[self.index - 1]
    }

    fn token_mut(&mut self) -> &mut Token {
        &mut self.tokens[self.index - 1]
    }
}

impl JiebaTokenizer {
    pub fn new() -> JiebaTokenizer{
        JiebaTokenizer{mode: JiebaMode::Search, hmm: true}
    }

    fn tokenize(&self, text: &str) -> Vec<Token>{
        let mut indices = text.char_indices().collect::<Vec<_>>();
        indices.push((text.len(), '\0'));
        let ori_tokens = match self.mode{
            JiebaMode::Exact => {
                JIEBA.tokenize(text, jieba_rs::TokenizeMode::Default, self.hmm)
            },
            JiebaMode::Search => {
                JIEBA.tokenize(text, jieba_rs::TokenizeMode::Search, self.hmm)
            },
        };

        let mut tokens = Vec::new();
        for token in ori_tokens {
            tokens.push(Token {
                offset_from: indices[token.start].0,
                offset_to: indices[token.end].0,
                position: token.start,
                text: String::from(&text[(indices[token.start].0)..(indices[token.end].0)]),
                position_length: token.end - token.start,
            });
        }
        tokens
    }
}

impl Tokenizer for JiebaTokenizer {
    type TokenStream<'a> = JiebaTokenStream;

    fn token_stream(&mut self, text: &str) -> JiebaTokenStream {
        let tokens = self.tokenize(text);
        JiebaTokenStream { tokens, index: 0 }
    }
}
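As a quick illustration (not part of the diff), the new tokenizer can be exercised directly through tantivy's Tokenizer and TokenStream traits. This is a minimal crate-internal sketch that assumes only the items defined in the file above and the dependencies from Cargo.toml:

// Minimal sketch, crate-internal; tantivy and jieba-rs come from the Cargo.toml change above.
use tantivy::tokenizer::{TokenStream, Tokenizer};
use crate::jieba_tokenizer::JiebaTokenizer;

fn jieba_demo() {
    // Defaults to JiebaMode::Search with HMM enabled, per JiebaTokenizer::new().
    let mut tokenizer = JiebaTokenizer::new();
    let mut stream = tokenizer.token_stream("系统安全");
    while stream.advance() {
        let token = stream.token();
        // offset_from/offset_to are byte offsets into the original &str,
        // computed from the char_indices table built in tokenize().
        println!("{} [{}..{}]", token.text, token.offset_from, token.offset_to);
    }
}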
2 changes: 2 additions & 0 deletions internal/core/thirdparty/tantivy/tantivy-binding/src/lib.rs
@@ -21,6 +21,8 @@ mod util;
mod error;
mod util_c;
mod vec_collector;
mod stop_words;
mod jieba_tokenizer;

pub fn add(left: usize, right: usize) -> usize {
left + right
5 changes: 5 additions & 0 deletions internal/core/thirdparty/tantivy/tantivy-binding/src/stop_words.rs
@@ -0,0 +1,5 @@
pub const ENGLISH: &[&str] = &[
"a", "an", "and", "are", "as", "at", "be", "but", "by", "for", "if", "in",
"into", "is", "it", "no", "not", "of", "on", "or", "such", "that", "the",
"their", "then", "there", "these", "they", "this", "to", "was", "will", "with",
];
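This appears to be the usual short English stop-word list. As a hedged sketch (not part of the commit), it is consumed by english_analyzer() in the tokenizer.rs diff below, where the &'static str items are cloned into owned Strings for tantivy's StopWordFilter:

// Sketch of the conversion used in english_analyzer() below.
use tantivy::tokenizer::StopWordFilter;
use crate::stop_words;

fn built_in_english_stop_words() -> StopWordFilter {
    // StopWordFilter::remove expects owned Strings, so the &str slice is mapped.
    StopWordFilter::remove(stop_words::ENGLISH.iter().map(|&word| word.to_owned()))
}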
95 changes: 66 additions & 29 deletions internal/core/thirdparty/tantivy/tantivy-binding/src/tokenizer.rs
@@ -1,18 +1,42 @@
use log::warn;
use std::collections::HashMap;
use tantivy::tokenizer::*;
use tantivy::tokenizer::StopWordFilter;
use serde_json as json;

use crate::stop_words;
use crate::tokenizer_filter::*;
use crate::jieba_tokenizer::JiebaTokenizer;
use crate::error::TantivyError;
use crate::util::*;


// default built-in analyzer
pub(crate) fn standard_analyzer(stop_words: Vec<String>) -> TextAnalyzer {
let builder = standard_builder()
.filter(LowerCaser);

if stop_words.len() > 0{
return builder.filter(StopWordFilter::remove(stop_words)).build();
}

builder.build()
}

fn chinese_analyzer(stop_words: Vec<String>) -> TextAnalyzer{
let builder = jieba_builder().filter(CnCharOnlyFilter);
if stop_words.len() > 0{
return builder.filter(StopWordFilter::remove(stop_words)).build();
}

builder.build()
}

fn english_analyzer(stop_words: Vec<String>) -> TextAnalyzer{
let builder = standard_builder()
.filter(LowerCaser)
.filter(RemoveLongFilter::limit(40))
.filter(Stemmer::new(Language::English))
.filter(StopWordFilter::remove(stop_words::ENGLISH.iter().map(|&word| word.to_owned())));

if stop_words.len() > 0{
return builder.filter(StopWordFilter::remove(stop_words)).build();
@@ -29,10 +53,15 @@ fn whitespace_builder()-> TextAnalyzerBuilder{
TextAnalyzer::builder(WhitespaceTokenizer::default()).dynamic()
}

fn jieba_builder() -> TextAnalyzerBuilder{
TextAnalyzer::builder(JiebaTokenizer::new()).dynamic()
}

fn get_builder_by_name(name:&String) -> Result<TextAnalyzerBuilder, TantivyError>{
match name.as_str() {
"standard" => Ok(standard_builder()),
"whitespace" => Ok(whitespace_builder()),
"jieba" => Ok(jieba_builder()),
other => {
warn!("unsupported tokenizer: {}", other);
Err(format!("unsupported tokenizer: {}", other).into())
@@ -92,6 +121,7 @@ impl AnalyzerBuilder<'_>{
}

let filters = params.as_array().unwrap();

for filter in filters{
if filter.is_string(){
let filter_name = filter.as_str().unwrap();
@@ -127,30 +157,34 @@ impl AnalyzerBuilder<'_>{
// build with filter if filter param exists
builder=self.build_filter(builder, value)?;
},
"max_token_length" => {
if !value.is_u64(){
return Err("max token length should be int type".into());
}
builder = builder.filter_dynamic(RemoveLongFilter::limit(value.as_u64().unwrap() as usize));
}
other => return Err(format!("unknown analyzer option key: {}", other).into()),
}
}
Ok(builder)
}

fn get_stop_words_option(&self) -> Result<Vec<String>, TantivyError>{
let value = self.params.get("stop_words");
match value{
Some(value)=>{
let str_list = get_string_list(value, "filter stop_words")?;
Ok(get_stop_words_list(str_list))
}
None => Ok(vec![])
}
}

fn build_template(self, type_: &str)-> Result<TextAnalyzer, TantivyError>{
match type_{
"standard" => {
let value = self.params.get("stop_words");
match value{
Some(value)=>{
let str_list = get_string_list(value, "filter stop_words")?;
Ok(standard_analyzer(str_list))
}
None => Ok(standard_analyzer(vec![]))
}
Ok(standard_analyzer(self.get_stop_words_option()?))
},
"chinese" => {
Ok(chinese_analyzer(self.get_stop_words_option()?))
},
"english" => {
Ok(english_analyzer(self.get_stop_words_option()?))
}
other_ => Err(format!("unknown built-in analyzer type: {}", other_).into())
}
}
@@ -168,13 +202,7 @@ impl AnalyzerBuilder<'_>{
};

//build custom analyzer
let tokenizer_name = self.get_tokenizer_name()?;

// jieba analyzer can't add filter.
if tokenizer_name == "jieba"{
return Ok(tantivy_jieba::JiebaTokenizer{}.into());
}

let tokenizer_name = self.get_tokenizer_name()?;
let mut builder=get_builder_by_name(&tokenizer_name)?;

// build with option
@@ -227,28 +255,37 @@ pub(crate) fn create_tokenizer(params: &String) -> Result<TextAnalyzer, TantivyE
#[cfg(test)]
mod tests {
use crate::tokenizer::create_tokenizer;
use regex;

#[test]
fn test_create_tokenizer() {
let params = r#"{"tokenizer": "standard"}"#;
fn test_standard_analyzer() {
let params = r#"{
"type": "standard",
"stop_words": ["_english_"]
}"#;

let tokenizer = create_tokenizer(&params.to_string());
assert!(tokenizer.is_ok());
assert!(tokenizer.is_ok(), "error: {}", tokenizer.err().unwrap().reason());
}

#[test]
fn test_jieba_tokenizer() {
let params = r#"{"tokenizer": "jieba"}"#;
fn test_chinese_analyzer() {
let params = r#"{
"type": "chinese"
}"#;

let tokenizer = create_tokenizer(&params.to_string());
assert!(tokenizer.is_ok());
let mut bining = tokenizer.unwrap();

let mut stream = bining.token_stream("系统安全");
let regex = regex::Regex::new("\\p{Han}+").unwrap();

let mut stream = bining.token_stream("系统安全;,'';lxyz密码");
while stream.advance(){
let token = stream.token();
let text = token.text.clone();
print!("test token :{}\n", text.as_str())
print!("test token :{} symbol: {}\n", text.as_str(), regex.is_match(text.as_str()))
}
}

}