Skip to content

Commit

Permalink
feat: tantivy tokenizer binding (#35801)
Browse files Browse the repository at this point in the history
fix: #35800

---------

Signed-off-by: longjiquan <[email protected]>
  • Loading branch information
longjiquan authored Sep 1, 2024
1 parent 1413ffe commit 5ea2454
Show file tree
Hide file tree
Showing 32 changed files with 1,225 additions and 19 deletions.
2 changes: 1 addition & 1 deletion internal/core/src/segcore/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -11,4 +11,4 @@


add_source_at_current_directory_recursively()
add_library(milvus_segcore OBJECT ${SOURCE_FILES})
add_library(milvus_segcore OBJECT ${SOURCE_FILES})
39 changes: 39 additions & 0 deletions internal/core/src/segcore/map_c.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
// Copyright (C) 2019-2020 Zilliz. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software distributed under the License
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
// or implied. See the License for the specific language governing permissions and limitations under the License

#include "segcore/map_c.h"

#include <memory>
#include <map>
#include <string>

// Concrete container hidden behind the opaque `CMap` handle.
using Map = std::map<std::string, std::string>;

// Allocates an empty string-to-string map on the heap and returns it as an
// opaque handle. Ownership passes to the caller, who must release the handle
// with `free_cmap`.
CMap
create_cmap() {
    return static_cast<CMap>(std::make_unique<Map>().release());
}

// Destroys a map previously obtained from `create_cmap`. A null handle is a
// no-op, exactly as with a plain `delete`.
void
free_cmap(CMap m) {
    // Re-adopt the raw pointer; the unique_ptr destructor deletes it on exit.
    std::unique_ptr<Map> reclaimed(static_cast<Map*>(m));
}

// Inserts the entry `key -> value` into the map behind `m`, replacing the
// value of an already-present key.
//
// Both buffers are copied using their explicit lengths, so neither has to be
// NUL-terminated.
void
cmap_set(CMap m,
         const char* key,
         uint32_t key_len,
         const char* value,
         uint32_t value_len) {
    Map& entries = *static_cast<Map*>(m);
    entries.insert_or_assign(std::string(key, key_len),
                             std::string(value, value_len));
}
37 changes: 37 additions & 0 deletions internal/core/src/segcore/map_c.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
// Copyright (C) 2019-2020 Zilliz. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software distributed under the License
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
// or implied. See the License for the specific language governing permissions and limitations under the License

#pragma once

#include <stdint.h>

#ifdef __cplusplus
extern "C" {
#endif

// Opaque handle to a string-to-string map (a std::map<std::string, std::string>
// in map_c.cpp).
typedef void* CMap;

// Allocates a new, empty map and returns its handle. The caller owns the
// handle and must release it with free_cmap.
CMap
create_cmap();

// Destroys a map previously returned by create_cmap.
void
free_cmap(CMap m);

// Copies key[0, key_len) and value[0, value_len) into the map, overwriting the
// value of an existing key. Lengths are explicit, so the buffers do not need
// to be NUL-terminated.
void
cmap_set(CMap m,
const char* key,
uint32_t key_len,
const char* value,
uint32_t value_len);

#ifdef __cplusplus
}
#endif
38 changes: 38 additions & 0 deletions internal/core/src/segcore/token_stream_c.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
// Copyright (C) 2019-2020 Zilliz. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software distributed under the License
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
// or implied. See the License for the specific language governing permissions and limitations under the License

#include <stdlib.h>
#include <string.h>

#include "segcore/token_stream_c.h"
#include "token-stream.h"

// Destroys the TokenStream object behind the opaque handle.
void
free_token_stream(CTokenStream token_stream) {
    auto* stream = static_cast<milvus::tantivy::TokenStream*>(token_stream);
    delete stream;
}

// Forwards to TokenStream::advance() on the object behind the handle and
// returns its result unchanged.
bool
token_stream_advance(CTokenStream token_stream) {
    auto* stream = static_cast<milvus::tantivy::TokenStream*>(token_stream);
    return stream->advance();
}

// Returns the result of TokenStream::get_token_no_copy() for the stream behind
// the handle.
// Note: the returned token must be freed by the caller using `free_token`.
const char*
token_stream_get_token(CTokenStream token_stream) {
    auto* stream = static_cast<milvus::tantivy::TokenStream*>(token_stream);
    const char* token = stream->get_token_no_copy();
    return token;
}

// Releases a token string by handing it to `free_rust_string`.
void
free_token(void* token) {
    const char* rust_owned = static_cast<const char*>(token);
    free_rust_string(rust_owned);
}
37 changes: 37 additions & 0 deletions internal/core/src/segcore/token_stream_c.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
// Copyright (C) 2019-2020 Zilliz. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software distributed under the License
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
// or implied. See the License for the specific language governing permissions and limitations under the License

#pragma once

#include <stdint.h>

#include "map_c.h"
#include "common/type_c.h"

#ifdef __cplusplus
extern "C" {
#endif

// Opaque handle to a token stream (a milvus::tantivy::TokenStream in
// token_stream_c.cpp).
typedef void* CTokenStream;

// Destroys the token stream behind the handle.
void free_token_stream(CTokenStream);

// Calls advance() on the underlying stream and returns its result
// (presumably true while another token is available; confirm against
// TokenStream::advance).
bool token_stream_advance(CTokenStream);

// Note: the returned string must be freed by the caller using `free_token`.
const char* token_stream_get_token(CTokenStream);

// Releases a token string returned by token_stream_get_token.
void
free_token(void* token);

#ifdef __cplusplus
}
#endif
41 changes: 41 additions & 0 deletions internal/core/src/segcore/tokenizer_c.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
// Copyright (C) 2019-2020 Zilliz. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software distributed under the License
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
// or implied. See the License for the specific language governing permissions and limitations under the License

#include "segcore/tokenizer_c.h"
#include "common/EasyAssert.h"

#include "tokenizer.h"

// Concrete container hidden behind the opaque `CMap` handle.
using Map = std::map<std::string, std::string>;

// Builds a milvus::tantivy::Tokenizer from the parameters stored in `m` and
// writes the new handle to `*tokenizer`.
//
// Returns a success status when construction succeeds. Any std::exception
// raised on the way is converted into a failure status, and `*tokenizer` is
// left untouched in that case. The caller releases the handle with
// `free_tokenizer`.
CStatus
create_tokenizer(CMap m, CTokenizer* tokenizer) {
    try {
        const Map* params = static_cast<Map*>(m);
        *tokenizer =
            std::make_unique<milvus::tantivy::Tokenizer>(*params).release();
    } catch (std::exception& e) {
        return milvus::FailureCStatus(&e);
    }
    return milvus::SuccessCStatus();
}

// Destroys the Tokenizer object behind the opaque handle.
void
free_tokenizer(CTokenizer tokenizer) {
    delete static_cast<milvus::tantivy::Tokenizer*>(tokenizer);
}

// Creates a token stream over text[0, text_len) using the tokenizer behind the
// handle. The text is copied with its explicit length, so it does not need to
// be NUL-terminated. Ownership of the returned stream passes to the caller.
CTokenStream
create_token_stream(CTokenizer tokenizer, const char* text, uint32_t text_len) {
    auto* engine = static_cast<milvus::tantivy::Tokenizer*>(tokenizer);
    auto stream = engine->CreateTokenStream(std::string(text, text_len));
    return stream.release();
}
37 changes: 37 additions & 0 deletions internal/core/src/segcore/tokenizer_c.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
// Copyright (C) 2019-2020 Zilliz. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software distributed under the License
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
// or implied. See the License for the specific language governing permissions and limitations under the License

#pragma once

#include <stdint.h>

#include "segcore/map_c.h"
#include "segcore/token_stream_c.h"
#include "common/type_c.h"

#ifdef __cplusplus
extern "C" {
#endif

// Opaque handle to a tokenizer (a milvus::tantivy::Tokenizer in
// tokenizer_c.cpp).
typedef void* CTokenizer;

// Builds a tokenizer from the parameters in `m` and stores its handle in
// `*tokenizer`. Returns a failure status (and leaves `*tokenizer` unwritten)
// if construction throws a std::exception.
CStatus
create_tokenizer(CMap m, CTokenizer* tokenizer);

// Destroys a tokenizer created by create_tokenizer.
void
free_tokenizer(CTokenizer tokenizer);

// Creates a token stream over text[0, text_len). The caller owns the returned
// stream and releases it with free_token_stream.
CTokenStream
create_token_stream(CTokenizer tokenizer, const char* text, uint32_t text_len);

#ifdef __cplusplus
}
#endif
49 changes: 32 additions & 17 deletions internal/core/thirdparty/tantivy/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -58,24 +58,39 @@ if (CMAKE_BUILD_TYPE STREQUAL "Debug")
add_link_options(-fno-stack-protector -fno-omit-frame-pointer -fno-var-tracking -fsanitize=address)
endif()

add_executable(test_tantivy test.cpp)
target_link_libraries(test_tantivy
tantivy_binding
boost_filesystem
dl
)
# TODO: move these below tests to ut.

add_executable(bench_tantivy bench.cpp)
target_link_libraries(bench_tantivy
tantivy_binding
boost_filesystem
dl
)
option(BUILD_TANTIVY_WITH_UT "compile tantivy with ut" OFF)

add_executable(ffi_demo ffi_demo.cpp)
target_link_libraries(ffi_demo
tantivy_binding
dl
)
# The tantivy test, benchmark and demo executables are built only when
# BUILD_TANTIVY_WITH_UT is ON, so a default build skips them.
if (BUILD_TANTIVY_WITH_UT)
    message(STATUS "compile tantivy with ut")

    add_executable(test_tantivy test.cpp)
    target_link_libraries(test_tantivy
            tantivy_binding
            boost_filesystem
            dl
            )

    add_executable(bench_tantivy bench.cpp)
    target_link_libraries(bench_tantivy
            tantivy_binding
            boost_filesystem
            dl
            )

    add_executable(ffi_demo ffi_demo.cpp)
    target_link_libraries(ffi_demo
            tantivy_binding
            dl
            )

    add_executable(tokenizer_demo tokenizer_demo.cpp)
    target_link_libraries(tokenizer_demo
            tantivy_binding
            dl
            )
endif ()

set( TANTIVY_INCLUDE_DIR ${LIB_HEADER_FOLDER};${CMAKE_CURRENT_SOURCE_DIR} CACHE INTERNAL "Path to tantivy include directory" )
44 changes: 44 additions & 0 deletions internal/core/thirdparty/tantivy/rust-hashmap.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
#pragma once

#include <map>
#include <string>

#include "tantivy-binding.h"
#include "rust-binding.h"

namespace milvus::tantivy {

// Owning C++ wrapper around a hash map created by `create_hashmap()`.
//
// The constructor creates the underlying map and the destructor releases it
// with `free_hashmap()`, so the handle lives exactly as long as this object.
// The type is non-copyable (NO_COPY_OR_ASSIGN) so the handle cannot be freed
// twice.
struct RustHashMap {
 public:
    NO_COPY_OR_ASSIGN(RustHashMap);

    // Creates a new, empty underlying map.
    RustHashMap() {
        ptr_ = create_hashmap();
    }

    // Frees the underlying map if one was created.
    ~RustHashMap() {
        if (ptr_ != nullptr) {
            free_hashmap(ptr_);
        }
    }

    // Copies every entry of `m` into the underlying map via `set`.
    void
    from(const std::map<std::string, std::string>& m) {
        for (const auto& [k, v] : m) {
            set(k, v);
        }
    }

    // Returns the raw handle. Ownership stays with this object; callers must
    // not free it.
    void*
    get_pointer() {
        return ptr_;
    }

    // Stores `k -> v` in the underlying map.
    //
    // Key and value are passed as NUL-terminated C strings without an explicit
    // length, so anything after an embedded '\0' is not visible to the callee.
    void
    set(const std::string& k, const std::string& v) {
        hashmap_set_value(ptr_, k.c_str(), v.c_str());
    }

 private:
    void* ptr_ = nullptr;
};
}  // namespace milvus::tantivy
Loading

0 comments on commit 5ea2454

Please sign in to comment.