-
Notifications
You must be signed in to change notification settings - Fork 3k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat: tantivy tokenizer binding (#35801)
fix: #35800 --------- Signed-off-by: longjiquan <[email protected]>
- Loading branch information
1 parent
1413ffe
commit 5ea2454
Showing
32 changed files
with
1,225 additions
and
19 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,39 @@ | ||
// Copyright (C) 2019-2020 Zilliz. All rights reserved. | ||
// | ||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance | ||
// with the License. You may obtain a copy of the License at | ||
// | ||
// http://www.apache.org/licenses/LICENSE-2.0 | ||
// | ||
// Unless required by applicable law or agreed to in writing, software distributed under the License | ||
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express | ||
// or implied. See the License for the specific language governing permissions and limitations under the License | ||
|
||
#include "segcore/map_c.h" | ||
|
||
#include <memory> | ||
#include <map> | ||
#include <string> | ||
|
||
using Map = std::map<std::string, std::string>; | ||
|
||
// Allocates an empty string-to-string map on the heap and returns it as an
// opaque CMap handle. Ownership leaves the smart pointer here, so the handle
// must later be passed to `free_cmap`, which deletes it.
CMap
create_cmap() {
    return std::make_unique<Map>().release();
}
|
||
void | ||
free_cmap(CMap m) { | ||
delete static_cast<Map*>(m); | ||
} | ||
|
||
void | ||
cmap_set(CMap m, | ||
const char* key, | ||
uint32_t key_len, | ||
const char* value, | ||
uint32_t value_len) { | ||
auto mm = static_cast<Map*>(m); | ||
(*mm)[std::string(key, key_len)] = std::string(value, value_len); | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
// Copyright (C) 2019-2020 Zilliz. All rights reserved. | ||
// | ||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance | ||
// with the License. You may obtain a copy of the License at | ||
// | ||
// http://www.apache.org/licenses/LICENSE-2.0 | ||
// | ||
// Unless required by applicable law or agreed to in writing, software distributed under the License | ||
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express | ||
// or implied. See the License for the specific language governing permissions and limitations under the License | ||
|
||
#pragma once | ||
|
||
#include <stdint.h> | ||
|
||
#ifdef __cplusplus | ||
extern "C" { | ||
#endif | ||
|
||
// Opaque handle to a string-to-string map owned by the C++ side.
typedef void* CMap;

// Creates an empty map and returns its handle. The caller owns the handle and
// releases it with `free_cmap`.
// The explicit `void` makes this a real prototype when the header is compiled
// as C (an empty parameter list means "unspecified arguments" before C23);
// it is equivalent to `()` in C++.
CMap
create_cmap(void);

// Destroys a map previously returned by `create_cmap`.
void
free_cmap(CMap m);

// Sets `key` -> `value` in the map, overwriting any existing value for `key`.
// `key_len` and `value_len` are the byte lengths of the respective buffers.
void
cmap_set(CMap m,
         const char* key,
         uint32_t key_len,
         const char* value,
         uint32_t value_len);
|
||
#ifdef __cplusplus | ||
} | ||
#endif |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,38 @@ | ||
// Copyright (C) 2019-2020 Zilliz. All rights reserved. | ||
// | ||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance | ||
// with the License. You may obtain a copy of the License at | ||
// | ||
// http://www.apache.org/licenses/LICENSE-2.0 | ||
// | ||
// Unless required by applicable law or agreed to in writing, software distributed under the License | ||
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express | ||
// or implied. See the License for the specific language governing permissions and limitations under the License | ||
|
||
#include <stdlib.h> | ||
#include <string.h> | ||
|
||
#include "segcore/token_stream_c.h" | ||
#include "token-stream.h" | ||
|
||
void | ||
free_token_stream(CTokenStream token_stream) { | ||
delete static_cast<milvus::tantivy::TokenStream*>(token_stream); | ||
} | ||
|
||
bool | ||
token_stream_advance(CTokenStream token_stream) { | ||
return static_cast<milvus::tantivy::TokenStream*>(token_stream)->advance(); | ||
} | ||
|
||
// Note: returned token must be freed by the caller using `free_token`. | ||
const char* | ||
token_stream_get_token(CTokenStream token_stream) { | ||
return static_cast<milvus::tantivy::TokenStream*>(token_stream) | ||
->get_token_no_copy(); | ||
} | ||
|
||
void | ||
free_token(void* token) { | ||
free_rust_string(static_cast<const char*>(token)); | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
// Copyright (C) 2019-2020 Zilliz. All rights reserved. | ||
// | ||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance | ||
// with the License. You may obtain a copy of the License at | ||
// | ||
// http://www.apache.org/licenses/LICENSE-2.0 | ||
// | ||
// Unless required by applicable law or agreed to in writing, software distributed under the License | ||
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express | ||
// or implied. See the License for the specific language governing permissions and limitations under the License | ||
|
||
#pragma once | ||
|
||
#include <stdint.h> | ||
|
||
#include "map_c.h" | ||
#include "common/type_c.h" | ||
|
||
#ifdef __cplusplus | ||
extern "C" { | ||
#endif | ||
|
||
// Opaque handle to a token stream owned by the C++ side.
typedef void* CTokenStream;

// Destroys the token stream behind the handle.
void
free_token_stream(CTokenStream token_stream);

// Advances the stream and reports the result of the underlying advance call.
bool
token_stream_advance(CTokenStream token_stream);

// Note: the returned string is owned by the caller and must be freed by the
// caller (the release function declared in this header is `free_token`).
const char*
token_stream_get_token(CTokenStream token_stream);

// Frees a token string.
void
free_token(void* token);
|
||
#ifdef __cplusplus | ||
} | ||
#endif |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,41 @@ | ||
// Copyright (C) 2019-2020 Zilliz. All rights reserved. | ||
// | ||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance | ||
// with the License. You may obtain a copy of the License at | ||
// | ||
// http://www.apache.org/licenses/LICENSE-2.0 | ||
// | ||
// Unless required by applicable law or agreed to in writing, software distributed under the License | ||
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express | ||
// or implied. See the License for the specific language governing permissions and limitations under the License | ||
|
||
#include "segcore/tokenizer_c.h" | ||
#include "common/EasyAssert.h" | ||
|
||
#include "tokenizer.h" | ||
|
||
using Map = std::map<std::string, std::string>; | ||
|
||
// Builds a milvus::tantivy::Tokenizer from the string map behind `m` and
// stores the new object's handle in `*tokenizer`.
//
// Returns a success status when construction succeeds. Any std::exception
// thrown while building the tokenizer is converted into a failure status and
// `*tokenizer` is left untouched.
// The stored handle is released from its smart pointer, so it must later be
// passed to `free_tokenizer`.
CStatus
create_tokenizer(CMap m, CTokenizer* tokenizer) {
    try {
        auto& params = *static_cast<Map*>(m);
        *tokenizer =
            std::make_unique<milvus::tantivy::Tokenizer>(params).release();
        return milvus::SuccessCStatus();
    } catch (std::exception& e) {
        return milvus::FailureCStatus(&e);
    }
}
|
||
void | ||
free_tokenizer(CTokenizer tokenizer) { | ||
auto impl = reinterpret_cast<milvus::tantivy::Tokenizer*>(tokenizer); | ||
delete impl; | ||
} | ||
|
||
// Creates a token stream over `text` (read as `text_len` bytes, copied into a
// std::string) using the tokenizer behind the handle, and returns the stream
// as an opaque handle released from its owning pointer.
// NOTE(review): nothing is caught here, so an exception from
// CreateTokenStream would propagate out of this function.
CTokenStream
create_token_stream(CTokenizer tokenizer, const char* text, uint32_t text_len) {
    auto* tokenizer_impl =
        static_cast<milvus::tantivy::Tokenizer*>(tokenizer);
    auto stream =
        tokenizer_impl->CreateTokenStream(std::string(text, text_len));
    return stream.release();
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
// Copyright (C) 2019-2020 Zilliz. All rights reserved. | ||
// | ||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance | ||
// with the License. You may obtain a copy of the License at | ||
// | ||
// http://www.apache.org/licenses/LICENSE-2.0 | ||
// | ||
// Unless required by applicable law or agreed to in writing, software distributed under the License | ||
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express | ||
// or implied. See the License for the specific language governing permissions and limitations under the License | ||
|
||
#pragma once | ||
|
||
#include <stdint.h> | ||
|
||
#include "segcore/map_c.h" | ||
#include "segcore/token_stream_c.h" | ||
#include "common/type_c.h" | ||
|
||
#ifdef __cplusplus | ||
extern "C" { | ||
#endif | ||
|
||
typedef void* CTokenizer; | ||
|
||
CStatus | ||
create_tokenizer(CMap m, CTokenizer* tokenizer); | ||
|
||
void | ||
free_tokenizer(CTokenizer tokenizer); | ||
|
||
CTokenStream | ||
create_token_stream(CTokenizer tokenizer, const char* text, uint32_t text_len); | ||
|
||
#ifdef __cplusplus | ||
} | ||
#endif |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,44 @@ | ||
#pragma once | ||
|
||
#include <map> | ||
|
||
#include "tantivy-binding.h" | ||
#include "rust-binding.h" | ||
|
||
namespace milvus::tantivy { | ||
|
||
struct RustHashMap { | ||
public: | ||
NO_COPY_OR_ASSIGN(RustHashMap); | ||
|
||
RustHashMap() { | ||
ptr_ = create_hashmap(); | ||
} | ||
|
||
~RustHashMap() { | ||
if (ptr_ != nullptr) { | ||
free_hashmap(ptr_); | ||
} | ||
} | ||
|
||
void | ||
from(const std::map<std::string, std::string>& m) { | ||
for (const auto& [k, v] : m) { | ||
set(k, v); | ||
} | ||
} | ||
|
||
void* | ||
get_pointer() { | ||
return ptr_; | ||
} | ||
|
||
void | ||
set(const std::string& k, const std::string& v) { | ||
hashmap_set_value(ptr_, k.c_str(), v.c_str()); | ||
} | ||
|
||
private: | ||
void* ptr_ = nullptr; | ||
}; | ||
} // namespace milvus::tantivy |
Oops, something went wrong.