add error struct for tantivy binding
Signed-off-by: aoiasd <[email protected]>
aoiasd committed Oct 24, 2024
1 parent e9f469a commit 198a7e1
Showing 14 changed files with 214 additions and 142 deletions.
2 changes: 1 addition & 1 deletion internal/core/src/common/FieldMeta.cpp
@@ -24,7 +24,7 @@ ParseTokenizerParams(const TypeParams& params) {
     if (iter == params.end()) {
         return "{}";
     }
-    return iter ->second.c_str();
+    return iter->second.c_str();
 }
 
 bool
26 changes: 11 additions & 15 deletions internal/core/src/index/TextMatchIndex.cpp
@@ -19,10 +19,9 @@
 namespace milvus::index {
 constexpr const char* TMP_TEXT_LOG_PREFIX = "/tmp/milvus/text-log/";
 
-TextMatchIndex::TextMatchIndex(
-    int64_t commit_interval_in_ms,
-    const char* tokenizer_name,
-    const char* tokenizer_params)
+TextMatchIndex::TextMatchIndex(int64_t commit_interval_in_ms,
+                               const char* tokenizer_name,
+                               const char* tokenizer_params)
     : commit_interval_in_ms_(commit_interval_in_ms),
       last_commit_time_(stdclock::now()) {
     d_type_ = TantivyDataType::Text;
@@ -31,10 +30,9 @@ TextMatchIndex::TextMatchIndex(
         field_name.c_str(), true, "", tokenizer_name, tokenizer_params);
 }
 
-TextMatchIndex::TextMatchIndex(
-    const std::string& path,
-    const char* tokenizer_name,
-    const char* tokenizer_params)
+TextMatchIndex::TextMatchIndex(const std::string& path,
+                               const char* tokenizer_name,
+                               const char* tokenizer_params)
     : commit_interval_in_ms_(std::numeric_limits<int64_t>::max()),
       last_commit_time_(stdclock::now()) {
     path_ = path;
@@ -47,10 +45,9 @@ TextMatchIndex::TextMatchIndex(
         tokenizer_params);
 }
 
-TextMatchIndex::TextMatchIndex(
-    const storage::FileManagerContext& ctx,
-    const char* tokenizer_name,
-    const char* tokenizer_params)
+TextMatchIndex::TextMatchIndex(const storage::FileManagerContext& ctx,
+                               const char* tokenizer_name,
+                               const char* tokenizer_params)
     : commit_interval_in_ms_(std::numeric_limits<int64_t>::max()),
       last_commit_time_(stdclock::now()) {
     schema_ = ctx.fieldDataMeta.field_schema;
@@ -174,9 +171,8 @@ TextMatchIndex::CreateReader() {
 }
 
 void
-TextMatchIndex::RegisterTokenizer(
-    const char* tokenizer_name,
-    const char* tokenizer_params) {
+TextMatchIndex::RegisterTokenizer(const char* tokenizer_name,
+                                  const char* tokenizer_params) {
     wrapper_->register_tokenizer(tokenizer_name, tokenizer_params);
 }
 
25 changes: 10 additions & 15 deletions internal/core/src/index/TextMatchIndex.h
@@ -22,20 +22,17 @@ using stdclock = std::chrono::high_resolution_clock;
 class TextMatchIndex : public InvertedIndexTantivy<std::string> {
  public:
     // for growing segment.
-    explicit TextMatchIndex(
-        int64_t commit_interval_in_ms,
-        const char* tokenizer_name,
-        const char* tokenizer_params);
+    explicit TextMatchIndex(int64_t commit_interval_in_ms,
+                            const char* tokenizer_name,
+                            const char* tokenizer_params);
     // for sealed segment.
-    explicit TextMatchIndex(
-        const std::string& path,
-        const char* tokenizer_name,
-        const char* tokenizer_params);
+    explicit TextMatchIndex(const std::string& path,
+                            const char* tokenizer_name,
+                            const char* tokenizer_params);
     // for building index.
-    explicit TextMatchIndex(
-        const storage::FileManagerContext& ctx,
-        const char* tokenizer_name,
-        const char* tokenizer_params);
+    explicit TextMatchIndex(const storage::FileManagerContext& ctx,
+                            const char* tokenizer_name,
+                            const char* tokenizer_params);
     // for loading index
     explicit TextMatchIndex(const storage::FileManagerContext& ctx);
 
@@ -67,9 +64,7 @@ class TextMatchIndex : public InvertedIndexTantivy<std::string> {
     CreateReader();
 
     void
-    RegisterTokenizer(
-        const char* tokenizer_name,
-        const char* tokenizer_params);
+    RegisterTokenizer(const char* tokenizer_name, const char* tokenizer_params);
 
     TargetBitmap
     MatchQuery(const std::string& query);
34 changes: 34 additions & 0 deletions internal/core/thirdparty/tantivy/tantivy-binding/src/error.rs
@@ -0,0 +1,34 @@
+use serde_json as json;
+pub struct TantivyError{
+    reason: String,
+}
+
+impl TantivyError{
+    fn new(reason:String) -> Self{
+        TantivyError{reason:reason}
+    }
+}
+
+impl From<&str> for TantivyError{
+    fn from(value: &str) -> Self {
+        Self::new(value.to_string())
+    }
+}
+
+impl From<String> for TantivyError{
+    fn from(value: String) -> Self {
+        Self::new(value)
+    }
+}
+
+impl From<json::Error> for TantivyError{
+    fn from(value: json::Error) -> Self {
+        Self::new(value.to_string())
+    }
+}
+
+impl ToString for TantivyError{
+    fn to_string(&self) -> String {
+        return self.reason.clone()
+    }
+}
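
The `From` impls above are what the rest of the binding leans on: a `serde_json::Error`, a `String`, or a `&str` message all convert into `TantivyError`, so fallible helpers can use `?` and `.into()` directly. A minimal usage sketch, assuming `TantivyError` is in scope; the `parse_params` helper is hypothetical, not part of this commit:

use serde_json as json;

// Hypothetical helper: `?` converts the json::Error via From<json::Error>,
// and `.into()` converts the &str message via From<&str>.
fn parse_params(params: &str) -> Result<json::Value, TantivyError> {
    let value: json::Value = json::from_str(params)?;
    if !value.is_object() {
        return Err("tokenizer params must be a JSON object".into());
    }
    Ok(value)
}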
@@ -4,7 +4,7 @@ use tantivy::{
     Term,
 };
 
-use crate::{index_reader::IndexReaderWrapper, tokenizer::standard_tokenizer};
+use crate::{index_reader::IndexReaderWrapper, tokenizer::standard_analyzer};
 
 impl IndexReaderWrapper {
     // split the query string into multiple tokens using index's default tokenizer,
@@ -14,7 +14,7 @@ impl IndexReaderWrapper {
         let mut tokenizer = self
             .index
             .tokenizer_for_field(self.field)
-            .unwrap_or(standard_tokenizer())
+            .unwrap_or(standard_analyzer())
             .clone();
         let mut token_stream = tokenizer.token_stream(q);
         let mut terms: Vec<Term> = Vec::new();
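
The rename from `standard_tokenizer` to `standard_analyzer` matches tantivy's vocabulary: a `TextAnalyzer` is a tokenizer plus its filter chain, not a bare tokenizer. The helper itself lives in tokenizer.rs, which is not shown in this diff; a plausible sketch of such a fallback, assuming tantivy's `TextAnalyzer` builder API (the actual implementation may differ):

use tantivy::tokenizer::{LowerCaser, SimpleTokenizer, TextAnalyzer};

// Sketch only: word-splitting followed by lowercasing, approximating
// tantivy's stock default analyzer (which also strips overly long tokens).
fn standard_analyzer() -> TextAnalyzer {
    TextAnalyzer::builder(SimpleTokenizer::default())
        .filter(LowerCaser)
        .build()
}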
@@ -7,6 +7,7 @@ use crate::{
     string_c::c_str_to_str,
     index_reader::IndexReaderWrapper,
     tokenizer::create_tokenizer,
+    log::init_log,
 };
 
 #[no_mangle]
@@ -25,21 +26,22 @@ pub extern "C" fn tantivy_register_tokenizer(
     tokenizer_name: *const c_char,
     tokenizer_params: *const c_char,
 ) {
+    init_log();
     let real = ptr as *mut IndexReaderWrapper;
     let tokenizer_name_str = unsafe { CStr::from_ptr(tokenizer_name) };
     let analyzer = unsafe {
         let params = c_str_to_str(tokenizer_params).to_string();
         create_tokenizer(&params)
     };
     match analyzer {
-        Some(text_analyzer) => unsafe {
+        Ok(text_analyzer) => unsafe {
             (*real).register_tokenizer(
                 String::from(tokenizer_name_str.to_str().unwrap()),
                 text_analyzer,
             );
         },
-        None => {
-            panic!("unsupported tokenizer");
-        }
+        Err(err) => {
+            panic!("create tokenizer failed with error: {}", err.to_string());
+        },
     }
 }
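
The `Ok`/`Err` arms imply that `create_tokenizer` now returns `Result<TextAnalyzer, TantivyError>` rather than the old `Option<TextAnalyzer>`, which is what lets this binding surface a reason in the panic message instead of a bare "unsupported tokenizer". A minimal sketch of the implied shape, reusing the hypothetical `standard_analyzer` above; the real implementation in tokenizer.rs parses a richer params schema:

use tantivy::tokenizer::TextAnalyzer;

// Sketch of the implied signature. The "tokenizer" key and the fallback to
// "standard" are assumptions, not taken from this diff.
fn create_tokenizer(params: &str) -> Result<TextAnalyzer, TantivyError> {
    let value: serde_json::Value = serde_json::from_str(params)?;
    match value.get("tokenizer").and_then(|v| v.as_str()).unwrap_or("standard") {
        "standard" => Ok(standard_analyzer()),
        other => Err(format!("unsupported tokenizer: {}", other).into()),
    }
}

Note the design contrast with the writer binding below: panicking across an extern "C" boundary cannot unwind into C++, so the writer path logs a warning and returns a null pointer instead.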
@@ -1,4 +1,3 @@
-use std::collections::HashMap;
 use std::ffi::c_char;
 use std::ffi::c_void;
 use std::ffi::CStr;
@@ -7,6 +6,7 @@ use crate::index_writer::IndexWriterWrapper;
 use crate::tokenizer::create_tokenizer;
 use crate::util::create_binding;
 use crate::string_c::c_str_to_str;
+use crate::log::init_log;
 
 #[no_mangle]
 pub extern "C" fn tantivy_create_text_writer(
@@ -18,6 +18,7 @@ pub extern "C" fn tantivy_create_text_writer(
     overall_memory_budget_in_bytes: usize,
     in_ram: bool,
 ) -> *mut c_void {
+    init_log();
     let field_name_str = unsafe { CStr::from_ptr(field_name).to_str().unwrap() };
     let path_str = unsafe { CStr::from_ptr(path).to_str().unwrap() };
     let tokenizer_name_str = unsafe { CStr::from_ptr(tokenizer_name).to_str().unwrap() };
@@ -26,7 +27,7 @@ pub extern "C" fn tantivy_create_text_writer(
         create_tokenizer(&params)
     };
     match analyzer {
-        Some(text_analyzer) => {
+        Ok(text_analyzer) => {
             let wrapper = IndexWriterWrapper::create_text_writer(
                 String::from(field_name_str),
                 String::from(path_str),
@@ -38,8 +39,9 @@ pub extern "C" fn tantivy_create_text_writer(
             );
             create_binding(wrapper)
         }
-        None => {
+        Err(err) => {
+            log::warn!("create tokenizer failed with error: {}", err.to_string());
             std::ptr::null_mut()
-        }
+        },
     }
 }
@@ -18,6 +18,7 @@ mod tokenizer;
 mod tokenizer_filter;
 mod tokenizer_c;
 mod util;
+mod error;
 mod util_c;
 mod vec_collector;
 