Skip to content

Commit

Permalink
enhance: refactor bitmap index and internal hybrid index
Browse files Browse the repository at this point in the history
Signed-off-by: luzhang <[email protected]>
  • Loading branch information
luzhang committed Jul 16, 2024
1 parent eae4dfc commit a25d269
Show file tree
Hide file tree
Showing 19 changed files with 258 additions and 58 deletions.
10 changes: 5 additions & 5 deletions internal/core/src/index/HybridScalarIndex.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -244,7 +244,7 @@ void
HybridScalarIndex<T>::BuildInternal(
const std::vector<FieldDataPtr>& field_datas) {
auto index = GetInternalIndex();
LOG_INFO("build bitmap index with internal index:{}",
LOG_INFO("build hybrid index with internal index:{}",
ToString(internal_index_type_));
index->BuildWithFieldData(field_datas);
}
Expand Down Expand Up @@ -406,7 +406,7 @@ HybridScalarIndex<T>::Load(const BinarySet& binary_set, const Config& config) {
DeserializeIndexType(binary_set);

auto index = GetInternalIndex();
LOG_INFO("load bitmap index with internal index:{}",
LOG_INFO("load hybrid index with internal index:{}",
ToString(internal_index_type_));
index->Load(binary_set, config);

Expand All @@ -420,7 +420,7 @@ HybridScalarIndex<T>::Load(milvus::tracer::TraceContext ctx,
auto index_files =
GetValueFromConfig<std::vector<std::string>>(config, "index_files");
AssertInfo(index_files.has_value(),
"index file paths is empty when load bitmap index");
"index file paths is empty when load hybrid index");

auto index_type_file = GetRemoteIndexTypeFile(index_files.value());

Expand All @@ -439,7 +439,7 @@ HybridScalarIndex<T>::Load(milvus::tracer::TraceContext ctx,
DeserializeIndexType(binary_set);

auto index = GetInternalIndex();
LOG_INFO("load bitmap index with internal index:{}",
LOG_INFO("load hybrid index with internal index:{}",
ToString(internal_index_type_));
index->Load(ctx, config);

Expand All @@ -456,4 +456,4 @@ template class HybridScalarIndex<double>;
template class HybridScalarIndex<std::string>;

} // namespace index
} // namespace milvus
} // namespace milvus
26 changes: 20 additions & 6 deletions internal/core/src/index/IndexFactory.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,9 @@ IndexFactory::CreatePrimitiveScalarIndex(
return std::make_unique<InvertedIndexTantivy<T>>(file_manager_context);
}
if (index_type == BITMAP_INDEX_TYPE) {
return std::make_unique<BitmapIndex<T>>(file_manager_context);
}
if (index_type == HYBRID_INDEX_TYPE) {
return std::make_unique<HybridScalarIndex<T>>(file_manager_context);
}
return CreateScalarIndexSort<T>(file_manager_context);
Expand All @@ -63,6 +66,9 @@ IndexFactory::CreatePrimitiveScalarIndex<std::string>(
file_manager_context);
}
if (index_type == BITMAP_INDEX_TYPE) {
return std::make_unique<BitmapIndex<std::string>>(file_manager_context);
}
if (index_type == HYBRID_INDEX_TYPE) {
return std::make_unique<HybridScalarIndex<std::string>>(
file_manager_context);
}
Expand All @@ -83,6 +89,9 @@ IndexFactory::CreatePrimitiveScalarIndex(
space);
}
if (index_type == BITMAP_INDEX_TYPE) {
return std::make_unique<BitmapIndex<T>>(file_manager_context, space);
}
if (index_type == HYBRID_INDEX_TYPE) {
return std::make_unique<HybridScalarIndex<T>>(file_manager_context,
space);
}
Expand All @@ -101,6 +110,10 @@ IndexFactory::CreatePrimitiveScalarIndex<std::string>(
file_manager_context, space);
}
if (index_type == BITMAP_INDEX_TYPE) {
return std::make_unique<BitmapIndex<std::string>>(file_manager_context,
space);
}
if (index_type == HYBRID_INDEX_TYPE) {
return std::make_unique<HybridScalarIndex<std::string>>(
file_manager_context, space);
}
Expand Down Expand Up @@ -179,16 +192,17 @@ IndexBasePtr
IndexFactory::CreateCompositeScalarIndex(
IndexType index_type,
const storage::FileManagerContext& file_manager_context) {
if (index_type == BITMAP_INDEX_TYPE) {
auto element_type = static_cast<DataType>(
file_manager_context.fieldDataMeta.field_schema.element_type());
return CreatePrimitiveScalarIndex(
element_type, index_type, file_manager_context);
} else if (index_type == INVERTED_INDEX_TYPE) {
if (index_type == HYBRID_INDEX_TYPE || index_type == BITMAP_INDEX_TYPE ||
index_type == INVERTED_INDEX_TYPE) {
auto element_type = static_cast<DataType>(
file_manager_context.fieldDataMeta.field_schema.element_type());
return CreatePrimitiveScalarIndex(
element_type, index_type, file_manager_context);
} else {
PanicInfo(
Unsupported,
fmt::format("index type: {} for composite scalar not supported now",
index_type));
}
}

Expand Down
24 changes: 18 additions & 6 deletions internal/core/src/index/InvertedIndexTantivy.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ get_tantivy_data_type(proto::schema::DataType data_type) {
return TantivyDataType::F64;
}

case proto::schema::DataType::String:
case proto::schema::DataType::VarChar: {
return TantivyDataType::Keyword;
}
Expand Down Expand Up @@ -152,7 +153,7 @@ InvertedIndexTantivy<T>::Build(const Config& config) {
AssertInfo(insert_files.has_value(), "insert_files were empty");
auto field_datas =
mem_file_manager_->CacheRawDataToMemory(insert_files.value());
build_index(field_datas);
BuildWithFieldData(field_datas);
}

template <typename T>
Expand All @@ -173,7 +174,7 @@ InvertedIndexTantivy<T>::BuildV2(const Config& config) {
field_data->FillFieldData(col_data);
field_datas.push_back(field_data);
}
build_index(field_datas);
BuildWithFieldData(field_datas);
}

template <typename T>
Expand All @@ -185,7 +186,17 @@ InvertedIndexTantivy<T>::Load(milvus::tracer::TraceContext ctx,
AssertInfo(index_files.has_value(),
"index file paths is empty when load disk ann index data");
auto prefix = disk_file_manager_->GetLocalIndexObjectPrefix();
disk_file_manager_->CacheIndexToDisk(index_files.value());
auto files_value = index_files.value();
// need to erase the index type file, which has already been read
auto index_type_file =
disk_file_manager_->GetRemoteIndexPrefix() + std::string("/index_type");
files_value.erase(std::remove_if(files_value.begin(),
files_value.end(),
[&](const std::string& file) {
return file == index_type_file;
}),
files_value.end());
disk_file_manager_->CacheIndexToDisk(files_value);
wrapper_ = std::make_shared<TantivyIndexWrapper>(prefix.c_str());
}

Expand Down Expand Up @@ -398,7 +409,7 @@ InvertedIndexTantivy<T>::BuildWithRawData(size_t n,

template <typename T>
void
InvertedIndexTantivy<T>::build_index(
InvertedIndexTantivy<T>::BuildWithFieldData(
const std::vector<std::shared_ptr<FieldDataBase>>& field_datas) {
switch (schema_.data_type()) {
case proto::schema::DataType::Bool:
Expand Down Expand Up @@ -454,8 +465,9 @@ InvertedIndexTantivy<std::string>::build_index_for_array(
auto n = data->get_num_rows();
auto array_column = static_cast<const Array*>(data->Data());
for (int64_t i = 0; i < n; i++) {
assert(array_column[i].get_element_type() ==
static_cast<DataType>(schema_.element_type()));
Assert(IsStringDataType(array_column[i].get_element_type()));
Assert(IsStringDataType(
static_cast<DataType>(schema_.element_type())));
std::vector<std::string> output;
for (int64_t j = 0; j < array_column[i].length(); j++) {
output.push_back(
Expand Down
8 changes: 4 additions & 4 deletions internal/core/src/index/InvertedIndexTantivy.h
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ class InvertedIndexTantivy : public ScalarIndex<T> {
GetIndexType() const override {
return ScalarIndexType::INVERTED;
}

void
Build(const Config& config = {}) override;

Expand Down Expand Up @@ -170,12 +170,12 @@ class InvertedIndexTantivy : public ScalarIndex<T> {
const TargetBitmap
RegexQuery(const std::string& pattern) override;

private:
void
finish();
BuildWithFieldData(const std::vector<FieldDataPtr>& datas) override;

private:
void
build_index(const std::vector<std::shared_ptr<FieldDataBase>>& field_datas);
finish();

void
build_index_for_array(
Expand Down
1 change: 1 addition & 0 deletions internal/core/src/index/Meta.h
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ constexpr const char* ASCENDING_SORT = "STL_SORT";
constexpr const char* MARISA_TRIE = "Trie";
constexpr const char* INVERTED_INDEX_TYPE = "INVERTED";
// "BITMAP" makes IndexFactory::CreatePrimitiveScalarIndex build a BitmapIndex.
constexpr const char* BITMAP_INDEX_TYPE = "BITMAP";
// "HYBRID" makes IndexFactory::CreatePrimitiveScalarIndex build a
// HybridScalarIndex.
constexpr const char* HYBRID_INDEX_TYPE = "HYBRID";

// index meta
constexpr const char* COLLECTION_ID = "collection_id";
Expand Down
6 changes: 6 additions & 0 deletions internal/core/src/storage/DiskFileManagerImpl.h
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,12 @@ class DiskFileManagerImpl : public FileManagerImpl {
const std::vector<std::string>& remote_files,
const std::vector<int64_t>& remote_file_sizes);

// Returns the remote index object prefix: the V2 variant when space_ is
// set (non-null), otherwise the regular one.
std::string
GetRemoteIndexPrefix() const {
    if (space_ == nullptr) {
        return GetRemoteIndexObjectPrefix();
    }
    return GetRemoteIndexObjectPrefixV2();
}

private:
int64_t
GetIndexBuildId() {
Expand Down
42 changes: 37 additions & 5 deletions internal/core/unittest/test_array_bitmap_index.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -208,9 +208,9 @@ class ArrayBitmapIndexTest : public testing::Test {
std::vector<std::string> index_files;

Config config;
config["index_type"] = milvus::index::BITMAP_INDEX_TYPE;
config["index_type"] = milvus::index::HYBRID_INDEX_TYPE;
config["insert_files"] = std::vector<std::string>{log_path};
config["bitmap_cardinality_limit"] = "1000";
config["bitmap_cardinality_limit"] = "100";

auto build_index =
indexbuilder::IndexFactory::GetInstance().CreateIndex(
Expand All @@ -223,7 +223,7 @@ class ArrayBitmapIndexTest : public testing::Test {
}

index::CreateIndexInfo index_info{};
index_info.index_type = milvus::index::BITMAP_INDEX_TYPE;
index_info.index_type = milvus::index::HYBRID_INDEX_TYPE;
index_info.field_type = DataType::ARRAY;

config["index_files"] = index_files;
Expand All @@ -233,11 +233,15 @@ class ArrayBitmapIndexTest : public testing::Test {
index_->Load(milvus::tracer::TraceContext{}, config);
}

void
SetUp() override {
// Hook that sets the fixture's nb_ and cardinality_; SetUp() calls it first,
// and derived fixtures may override it to use different values.
virtual void
SetParam() {
    cardinality_ = 30;
    nb_ = 10000;
}

void
SetUp() override {
SetParam();
// if constexpr (std::is_same_v<T, int8_t>) {
// type_ = DataType::INT8;
// } else if constexpr (std::is_same_v<T, int16_t>) {
Expand Down Expand Up @@ -338,3 +342,31 @@ REGISTER_TYPED_TEST_SUITE_P(ArrayBitmapIndexTest,
INSTANTIATE_TYPED_TEST_SUITE_P(ArrayBitmapE2ECheck,
ArrayBitmapIndexTest,
BitmapType);

// Variant of ArrayBitmapIndexTest that overrides SetParam() to use a
// cardinality of 200 (the base fixture uses 30) with the same nb_.
// NOTE(review): presumably chosen to exceed the "bitmap_cardinality_limit"
// of 100 set in the fixture's config - confirm.
template <typename T>
class ArrayBitmapIndexTestV1 : public ArrayBitmapIndexTest<T> {
 public:
    ~ArrayBitmapIndexTestV1() override = default;

    void
    SetParam() override {
        this->cardinality_ = 200;
        this->nb_ = 10000;
    }
};

TYPED_TEST_SUITE_P(ArrayBitmapIndexTestV1);

// The index held by the fixture must report exactly nb_ entries.
TYPED_TEST_P(ArrayBitmapIndexTestV1, CountFuncTest) {
    EXPECT_EQ(this->index_->Count(), this->nb_);
}

// Element types the V1 fixture is instantiated with.
using BitmapTypeV1 = testing::Types<int32_t, int64_t, std::string>;

// Register CountFuncTest as the test case of the V1 suite.
REGISTER_TYPED_TEST_SUITE_P(ArrayBitmapIndexTestV1, CountFuncTest);

// Instantiate ArrayBitmapIndexTestV1 for every type listed in BitmapTypeV1.
INSTANTIATE_TYPED_TEST_SUITE_P(ArrayBitmapE2ECheckV1,
ArrayBitmapIndexTestV1,
BitmapTypeV1);
4 changes: 2 additions & 2 deletions internal/core/unittest/test_hybrid_index.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -120,7 +120,7 @@ class HybridIndexTestV1 : public testing::Test {
std::vector<std::string> index_files;

Config config;
config["index_type"] = milvus::index::BITMAP_INDEX_TYPE;
config["index_type"] = milvus::index::HYBRID_INDEX_TYPE;
config["insert_files"] = std::vector<std::string>{log_path};
config["bitmap_cardinality_limit"] = "1000";

Expand All @@ -135,7 +135,7 @@ class HybridIndexTestV1 : public testing::Test {
}

index::CreateIndexInfo index_info{};
index_info.index_type = milvus::index::BITMAP_INDEX_TYPE;
index_info.index_type = milvus::index::HYBRID_INDEX_TYPE;
index_info.field_type = type_;

config["index_files"] = index_files;
Expand Down
48 changes: 35 additions & 13 deletions internal/proxy/task_index.go
Original file line number Diff line number Diff line change
Expand Up @@ -149,26 +149,48 @@ func (cit *createIndexTask) parseIndexParams() error {

specifyIndexType, exist := indexParamsMap[common.IndexTypeKey]
if exist && specifyIndexType != "" {
_, err := indexparamcheck.GetIndexCheckerMgrInstance().GetChecker(specifyIndexType)
if err != nil {
checker, err := indexparamcheck.GetIndexCheckerMgrInstance().GetChecker(specifyIndexType)
// the hybrid index is not exposed to users; it is only used internally by milvus
if err != nil || indexparamcheck.IsHYBRIDChecker(checker) {
log.Ctx(cit.ctx).Warn("Failed to get index checker", zap.String(common.IndexTypeKey, specifyIndexType))
return merr.WrapErrParameterInvalid("valid index", fmt.Sprintf("invalid index type: %s", specifyIndexType))
}
}

if !isVecIndex {
specifyIndexType, exist := indexParamsMap[common.IndexTypeKey]
if Params.AutoIndexConfig.ScalarAutoIndexEnable.GetAsBool() || specifyIndexType == AutoIndexName || !exist {
if typeutil.IsArithmetic(cit.fieldSchema.DataType) {
indexParamsMap[common.IndexTypeKey] = Params.AutoIndexConfig.ScalarNumericIndexType.GetValue()
} else if typeutil.IsStringType(cit.fieldSchema.DataType) {
indexParamsMap[common.IndexTypeKey] = Params.AutoIndexConfig.ScalarVarcharIndexType.GetValue()
} else if typeutil.IsBoolType(cit.fieldSchema.DataType) {
indexParamsMap[common.IndexTypeKey] = Params.AutoIndexConfig.ScalarBoolIndexType.GetValue()
} else {
return merr.WrapErrParameterInvalid("supported field",
fmt.Sprintf("create auto index on %s field is not supported", cit.fieldSchema.DataType.String()))
autoIndexEnable := Params.AutoIndexConfig.ScalarAutoIndexEnable.GetAsBool()

if autoIndexEnable || !exist || specifyIndexType == AutoIndexName {
// getPrimitiveIndexType picks the configured auto-index type for a scalar
// data type: bool, integer and floating types each have their own setting;
// every other type falls back to the varchar setting.
getPrimitiveIndexType := func(dataType schemapb.DataType) string {
	switch {
	case typeutil.IsBoolType(dataType):
		return Params.AutoIndexConfig.ScalarBoolIndexType.GetValue()
	case typeutil.IsIntegerType(dataType):
		return Params.AutoIndexConfig.ScalarIntIndexType.GetValue()
	case typeutil.IsFloatingType(dataType):
		return Params.AutoIndexConfig.ScalarFloatIndexType.GetValue()
	default:
		return Params.AutoIndexConfig.ScalarVarcharIndexType.GetValue()
	}
}

// Resolve the auto-index type from the field: primitive fields use their own
// data type, array fields use their element type, anything else is rejected.
indexType, err := func() (string, error) {
	dataType := cit.fieldSchema.DataType
	switch {
	case typeutil.IsPrimitiveType(dataType):
		return getPrimitiveIndexType(dataType), nil
	case typeutil.IsArrayType(dataType):
		return getPrimitiveIndexType(cit.fieldSchema.ElementType), nil
	default:
		return "", fmt.Errorf("create auto index on type:%s is not supported", dataType.String())
	}
}()

if err != nil {
return merr.WrapErrParameterInvalid("supported field", err.Error())
}

indexParamsMap[common.IndexTypeKey] = indexType

}
} else {
specifyIndexType, exist := indexParamsMap[common.IndexTypeKey]
Expand Down Expand Up @@ -368,7 +390,7 @@ func fillDimension(field *schemapb.FieldSchema, indexParams map[string]string) e
func checkTrain(field *schemapb.FieldSchema, indexParams map[string]string) error {
indexType := indexParams[common.IndexTypeKey]

if indexType == indexparamcheck.IndexBitmap {
if indexType == indexparamcheck.IndexHybrid {
_, exist := indexParams[common.BitmapCardinalityLimitKey]
if !exist {
indexParams[common.BitmapCardinalityLimitKey] = paramtable.Get().CommonCfg.BitmapIndexCardinalityBound.GetValue()
Expand Down
Loading

0 comments on commit a25d269

Please sign in to comment.