Skip to content

Commit

Permalink
feat: support inverted index (milvus-io#28783)
Browse files Browse the repository at this point in the history
issue: milvus-io#27704

Add an inverted index for some data types in Milvus. This index type can
save a lot of memory compared to loading all data into RAM, and it speeds
up term queries and range queries.

Supported: `INT8`, `INT16`, `INT32`, `INT64`, `FLOAT`, `DOUBLE`, `BOOL`
and `VARCHAR`.

Not supported: `ARRAY` and `JSON`.

Note:
- The inverted index for `VARCHAR` is not yet designed to serve full-text
search. Every row is treated as a single whole keyword instead of being
tokenized into multiple terms.
- The inverted index doesn't support retrieval well, so if you create an
inverted index for a field, operations that depend on the raw data
will fall back to chunk storage, which incurs some performance
loss. Examples include comparisons between two columns and retrieval of
output fields.

The inverted index is very easy to use.

Take the collection below as an example:

```python
fields = [
		FieldSchema(name="pk", dtype=DataType.VARCHAR, is_primary=True, auto_id=False, max_length=100),
		FieldSchema(name="int8", dtype=DataType.INT8),
		FieldSchema(name="int16", dtype=DataType.INT16),
		FieldSchema(name="int32", dtype=DataType.INT32),
		FieldSchema(name="int64", dtype=DataType.INT64),
		FieldSchema(name="float", dtype=DataType.FLOAT),
		FieldSchema(name="double", dtype=DataType.DOUBLE),
		FieldSchema(name="bool", dtype=DataType.BOOL),
		FieldSchema(name="varchar", dtype=DataType.VARCHAR, max_length=1000),
		FieldSchema(name="random", dtype=DataType.DOUBLE),
		FieldSchema(name="embeddings", dtype=DataType.FLOAT_VECTOR, dim=dim),
]
schema = CollectionSchema(fields)
collection = Collection("demo", schema)
```

Then we can simply create an inverted index on each field via:

```python
index_type = "INVERTED"
collection.create_index("int8", {"index_type": index_type})
collection.create_index("int16", {"index_type": index_type})
collection.create_index("int32", {"index_type": index_type})
collection.create_index("int64", {"index_type": index_type})
collection.create_index("float", {"index_type": index_type})
collection.create_index("double", {"index_type": index_type})
collection.create_index("bool", {"index_type": index_type})
collection.create_index("varchar", {"index_type": index_type})
```

Then, term queries and range queries on these fields are sped up
automatically by the inverted index:

```python
result = collection.query(expr='int64 in [1, 2, 3]', output_fields=["pk"])
result = collection.query(expr='int64 < 5', output_fields=["pk"])
result = collection.query(expr='int64 > 2997', output_fields=["pk"])
result = collection.query(expr='1 < int64 < 5', output_fields=["pk"])
```

---------

Signed-off-by: longjiquan <[email protected]>
  • Loading branch information
longjiquan authored Dec 31, 2023
1 parent 984e7bb commit 3f46c6d
Show file tree
Hide file tree
Showing 50 changed files with 3,023 additions and 180 deletions.
5 changes: 3 additions & 2 deletions internal/core/run_clang_format.sh
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,13 @@ fi
CorePath=$1

formatThis() {
find "$1" | grep -E "(*\.cpp|*\.h|*\.cc)$" | grep -v "gen_tools/templates" | grep -v "/thirdparty" | grep -v "\.pb\." | xargs clang-format-10 -i
find "$1" | grep -E "(*\.cpp|*\.h|*\.cc)$" | grep -v "gen_tools/templates" | grep -v "\.pb\." | xargs clang-format-10 -i
}

formatThis "${CorePath}/src"
formatThis "${CorePath}/unittest"
formatThis "${CorePath}/unittest/bench"
formatThis "${CorePath}/thirdparty/tantivy"

${CorePath}/build-support/add_cpp_license.sh ${CorePath}/build-support/cpp_license.txt ${CorePath}
${CorePath}/build-support/add_cmake_license.sh ${CorePath}/build-support/cmake_license.txt ${CorePath}
${CorePath}/build-support/add_cmake_license.sh ${CorePath}/build-support/cmake_license.txt ${CorePath}
11 changes: 11 additions & 0 deletions internal/core/src/common/Channel.h
Original file line number Diff line number Diff line change
@@ -1,3 +1,14 @@
// Copyright (C) 2019-2020 Zilliz. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software distributed under the License
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
// or implied. See the License for the specific language governing permissions and limitations under the License

#include <oneapi/tbb/concurrent_queue.h>

#include <atomic>
Expand Down
3 changes: 2 additions & 1 deletion internal/core/src/index/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -18,11 +18,12 @@ set(INDEX_FILES
ScalarIndex.cpp
ScalarIndexSort.cpp
SkipIndex.cpp
InvertedIndexTantivy.cpp
)

milvus_add_pkg_config("milvus_index")
add_library(milvus_index SHARED ${INDEX_FILES})

target_link_libraries(milvus_index milvus_storage milvus-storage)
target_link_libraries(milvus_index milvus_storage milvus-storage tantivy_binding)

install(TARGETS milvus_index DESTINATION "${CMAKE_INSTALL_LIBDIR}")
89 changes: 66 additions & 23 deletions internal/core/src/index/IndexFactory.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,14 +25,22 @@
#include "index/ScalarIndexSort.h"
#include "index/StringIndexMarisa.h"
#include "index/BoolIndex.h"
#include "index/InvertedIndexTantivy.h"

namespace milvus::index {

template <typename T>
ScalarIndexPtr<T>
IndexFactory::CreateScalarIndex(
const IndexType& index_type,
const storage::FileManagerContext& file_manager_context) {
const storage::FileManagerContext& file_manager_context,
DataType d_type) {
if (index_type == INVERTED_INDEX_TYPE) {
TantivyConfig cfg;
cfg.data_type_ = d_type;
return std::make_unique<InvertedIndexTantivy<T>>(cfg,
file_manager_context);
}
return CreateScalarIndexSort<T>(file_manager_context);
}

Expand All @@ -44,11 +52,18 @@ IndexFactory::CreateScalarIndex(
//

template <>
ScalarIndexPtr<std::string>
IndexFactory::CreateScalarIndex<std::string>(
inline ScalarIndexPtr<std::string>
IndexFactory::CreateScalarIndex(
const IndexType& index_type,
const storage::FileManagerContext& file_manager_context) {
const storage::FileManagerContext& file_manager_context,
DataType d_type) {
#if defined(__linux__) || defined(__APPLE__)
if (index_type == INVERTED_INDEX_TYPE) {
TantivyConfig cfg;
cfg.data_type_ = d_type;
return std::make_unique<InvertedIndexTantivy<std::string>>(
cfg, file_manager_context);
}
return CreateStringIndexMarisa(file_manager_context);
#else
throw SegcoreError(Unsupported, "unsupported platform");
Expand All @@ -60,7 +75,14 @@ ScalarIndexPtr<T>
IndexFactory::CreateScalarIndex(
const IndexType& index_type,
const storage::FileManagerContext& file_manager_context,
std::shared_ptr<milvus_storage::Space> space) {
std::shared_ptr<milvus_storage::Space> space,
DataType d_type) {
if (index_type == INVERTED_INDEX_TYPE) {
TantivyConfig cfg;
cfg.data_type_ = d_type;
return std::make_unique<InvertedIndexTantivy<T>>(
cfg, file_manager_context, space);
}
return CreateScalarIndexSort<T>(file_manager_context, space);
}

Expand All @@ -69,8 +91,15 @@ ScalarIndexPtr<std::string>
IndexFactory::CreateScalarIndex(
const IndexType& index_type,
const storage::FileManagerContext& file_manager_context,
std::shared_ptr<milvus_storage::Space> space) {
std::shared_ptr<milvus_storage::Space> space,
DataType d_type) {
#if defined(__linux__) || defined(__APPLE__)
if (index_type == INVERTED_INDEX_TYPE) {
TantivyConfig cfg;
cfg.data_type_ = d_type;
return std::make_unique<InvertedIndexTantivy<std::string>>(
cfg, file_manager_context, space);
}
return CreateStringIndexMarisa(file_manager_context, space);
#else
throw SegcoreError(Unsupported, "unsupported platform");
Expand Down Expand Up @@ -111,25 +140,32 @@ IndexFactory::CreateScalarIndex(
switch (data_type) {
// create scalar index
case DataType::BOOL:
return CreateScalarIndex<bool>(index_type, file_manager_context);
return CreateScalarIndex<bool>(
index_type, file_manager_context, data_type);
case DataType::INT8:
return CreateScalarIndex<int8_t>(index_type, file_manager_context);
return CreateScalarIndex<int8_t>(
index_type, file_manager_context, data_type);
case DataType::INT16:
return CreateScalarIndex<int16_t>(index_type, file_manager_context);
return CreateScalarIndex<int16_t>(
index_type, file_manager_context, data_type);
case DataType::INT32:
return CreateScalarIndex<int32_t>(index_type, file_manager_context);
return CreateScalarIndex<int32_t>(
index_type, file_manager_context, data_type);
case DataType::INT64:
return CreateScalarIndex<int64_t>(index_type, file_manager_context);
return CreateScalarIndex<int64_t>(
index_type, file_manager_context, data_type);
case DataType::FLOAT:
return CreateScalarIndex<float>(index_type, file_manager_context);
return CreateScalarIndex<float>(
index_type, file_manager_context, data_type);
case DataType::DOUBLE:
return CreateScalarIndex<double>(index_type, file_manager_context);
return CreateScalarIndex<double>(
index_type, file_manager_context, data_type);

// create string index
case DataType::STRING:
case DataType::VARCHAR:
return CreateScalarIndex<std::string>(index_type,
file_manager_context);
return CreateScalarIndex<std::string>(
index_type, file_manager_context, data_type);
default:
throw SegcoreError(
DataTypeInvalid,
Expand Down Expand Up @@ -187,25 +223,32 @@ IndexFactory::CreateScalarIndex(const CreateIndexInfo& create_index_info,
switch (data_type) {
// create scalar index
case DataType::BOOL:
return CreateScalarIndex<bool>(index_type, file_manager, space);
return CreateScalarIndex<bool>(
index_type, file_manager, space, data_type);
case DataType::INT8:
return CreateScalarIndex<int8_t>(index_type, file_manager, space);
return CreateScalarIndex<int8_t>(
index_type, file_manager, space, data_type);
case DataType::INT16:
return CreateScalarIndex<int16_t>(index_type, file_manager, space);
return CreateScalarIndex<int16_t>(
index_type, file_manager, space, data_type);
case DataType::INT32:
return CreateScalarIndex<int32_t>(index_type, file_manager, space);
return CreateScalarIndex<int32_t>(
index_type, file_manager, space, data_type);
case DataType::INT64:
return CreateScalarIndex<int64_t>(index_type, file_manager, space);
return CreateScalarIndex<int64_t>(
index_type, file_manager, space, data_type);
case DataType::FLOAT:
return CreateScalarIndex<float>(index_type, file_manager, space);
return CreateScalarIndex<float>(
index_type, file_manager, space, data_type);
case DataType::DOUBLE:
return CreateScalarIndex<double>(index_type, file_manager, space);
return CreateScalarIndex<double>(
index_type, file_manager, space, data_type);

// create string index
case DataType::STRING:
case DataType::VARCHAR:
return CreateScalarIndex<std::string>(
index_type, file_manager, space);
index_type, file_manager, space, data_type);
default:
throw SegcoreError(
DataTypeInvalid,
Expand Down
18 changes: 15 additions & 3 deletions internal/core/src/index/IndexFactory.h
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ class IndexFactory {
CreateIndex(const CreateIndexInfo& create_index_info,
const storage::FileManagerContext& file_manager_context,
std::shared_ptr<milvus_storage::Space> space);

IndexBasePtr
CreateVectorIndex(const CreateIndexInfo& create_index_info,
const storage::FileManagerContext& file_manager_context);
Expand All @@ -86,18 +87,29 @@ class IndexFactory {
ScalarIndexPtr<T>
CreateScalarIndex(const IndexType& index_type,
const storage::FileManagerContext& file_manager =
storage::FileManagerContext());
storage::FileManagerContext(),
DataType d_type = DataType::NONE);

template <typename T>
ScalarIndexPtr<T>
CreateScalarIndex(const IndexType& index_type,
const storage::FileManagerContext& file_manager,
std::shared_ptr<milvus_storage::Space> space);
std::shared_ptr<milvus_storage::Space> space,
DataType d_type = DataType::NONE);
};

template <>
ScalarIndexPtr<std::string>
IndexFactory::CreateScalarIndex<std::string>(
const IndexType& index_type,
const storage::FileManagerContext& file_manager_context);
const storage::FileManagerContext& file_manager_context,
DataType d_type);

template <>
ScalarIndexPtr<std::string>
IndexFactory::CreateScalarIndex<std::string>(
const IndexType& index_type,
const storage::FileManagerContext& file_manager_context,
std::shared_ptr<milvus_storage::Space> space,
DataType d_type);
} // namespace milvus::index
Loading

0 comments on commit 3f46c6d

Please sign in to comment.