forked from milvus-io/milvus
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add chinese and english analyzer with refactor jieba tokenizer
Signed-off-by: aoiasd <[email protected]>
- Loading branch information
Showing
11 changed files
with
249 additions
and
57 deletions.
There are no files selected for viewing
32 changes: 11 additions & 21 deletions
32
internal/core/thirdparty/tantivy/tantivy-binding/Cargo.lock
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
83 changes: 83 additions & 0 deletions
83
internal/core/thirdparty/tantivy/tantivy-binding/src/jieba_tokenizer.rs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,83 @@ | ||
use jieba_rs; | ||
use tantivy::tokenizer::{Token, TokenStream, Tokenizer}; | ||
use lazy_static::lazy_static; | ||
|
||
lazy_static! {
    // Process-wide shared segmenter. `jieba_rs::Jieba::new()` presumably
    // loads the default dictionary (TODO confirm) — sharing one instance
    // means it is built once and reused by every tokenizer clone.
    static ref JIEBA: jieba_rs::Jieba = jieba_rs::Jieba::new();
}
|
||
/// Segmentation strategy; selects which `jieba_rs::TokenizeMode` is used
/// in `JiebaTokenizer::tokenize`.
#[derive(Clone)]
pub enum JiebaMode {
    /// Maps to `jieba_rs::TokenizeMode::Default`.
    Exact,
    /// Maps to `jieba_rs::TokenizeMode::Search`.
    Search,
}
|
||
/// Tantivy tokenizer backed by the shared `JIEBA` segmenter.
#[derive(Clone)]
pub struct JiebaTokenizer{
    // Segmentation strategy forwarded to jieba (see `tokenize`).
    mode: JiebaMode,
    // Passed verbatim as the `hmm` flag of `jieba_rs::Jieba::tokenize`;
    // presumably enables HMM-based detection of out-of-vocabulary words
    // — TODO confirm against jieba-rs docs.
    hmm: bool,
}
|
||
/// Eagerly materialized token stream: all tokens are produced up front by
/// `JiebaTokenizer::tokenize` and then replayed one at a time.
pub struct JiebaTokenStream {
    // Tokens in emission order.
    tokens: Vec<Token>,
    // Number of tokens already consumed; the current token lives at `index - 1`.
    index: usize,
}
|
||
impl TokenStream for JiebaTokenStream { | ||
fn advance(&mut self) -> bool { | ||
if self.index < self.tokens.len() { | ||
self.index += 1; | ||
true | ||
} else { | ||
false | ||
} | ||
} | ||
|
||
fn token(&self) -> &Token { | ||
&self.tokens[self.index - 1] | ||
} | ||
|
||
fn token_mut(&mut self) -> &mut Token { | ||
&mut self.tokens[self.index - 1] | ||
} | ||
} | ||
|
||
impl JiebaTokenizer { | ||
pub fn new() -> JiebaTokenizer{ | ||
JiebaTokenizer{mode: JiebaMode::Search, hmm: true} | ||
} | ||
|
||
fn tokenize(&self, text: &str) -> Vec<Token>{ | ||
let mut indices = text.char_indices().collect::<Vec<_>>(); | ||
indices.push((text.len(), '\0')); | ||
let ori_tokens = match self.mode{ | ||
JiebaMode::Exact => { | ||
JIEBA.tokenize(text, jieba_rs::TokenizeMode::Default, self.hmm) | ||
}, | ||
JiebaMode::Search => { | ||
JIEBA.tokenize(text, jieba_rs::TokenizeMode::Search, self.hmm) | ||
}, | ||
}; | ||
|
||
let mut tokens = Vec::new(); | ||
for token in ori_tokens { | ||
tokens.push(Token { | ||
offset_from: indices[token.start].0, | ||
offset_to: indices[token.end].0, | ||
position: token.start, | ||
text: String::from(&text[(indices[token.start].0)..(indices[token.end].0)]), | ||
position_length: token.end - token.start, | ||
}); | ||
} | ||
tokens | ||
} | ||
} | ||
|
||
impl Tokenizer for JiebaTokenizer { | ||
type TokenStream<'a> = JiebaTokenStream; | ||
|
||
fn token_stream(&mut self, text: &str) -> JiebaTokenStream { | ||
let tokens = self.tokenize(text); | ||
JiebaTokenStream { tokens, index: 0 } | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
5 changes: 5 additions & 0 deletions
5
internal/core/thirdparty/tantivy/tantivy-binding/src/stop_words.rs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
/// English stop words (appears to be the classic 33-word Lucene/tantivy
/// English stop-word list), in alphabetical order.
pub const ENGLISH: &[&str] = &[
    "a", "an", "and", "are", "as", "at", "be", "but",
    "by", "for", "if", "in", "into", "is", "it", "no",
    "not", "of", "on", "or", "such", "that", "the", "their",
    "then", "there", "these", "they", "this", "to", "was", "will",
    "with",
];
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.