From 42f7800b5bc7d92c55e55f5177bc9365165783b2 Mon Sep 17 00:00:00 2001 From: zhagnlu <1542303831@qq.com> Date: Sat, 24 Aug 2024 01:40:58 +0800 Subject: [PATCH] enhance: add bitmap offset cache to speed up retrieve raw data (#35498) #35458 Signed-off-by: luzhang Co-authored-by: luzhang --- internal/core/src/index/BitmapIndex.cpp | 49 ++++++++++++++++++- internal/core/src/index/BitmapIndex.h | 11 +++++ internal/core/src/index/Index.h | 3 -- internal/core/src/index/Meta.h | 6 +++ internal/core/src/index/StringIndexMarisa.cpp | 2 +- internal/core/src/index/Utils.h | 6 +++ internal/core/src/index/VectorDiskIndex.cpp | 6 +-- internal/core/src/index/VectorMemIndex.cpp | 9 ++-- internal/core/src/segcore/load_index_c.cpp | 6 +-- internal/datacoord/index_service.go | 8 ++- pkg/common/common.go | 9 ++-- .../indexparamcheck/bitmap_checker_test.go | 1 + .../indexparamcheck/bitmap_index_checker.go | 3 ++ pkg/util/indexparamcheck/index_type.go | 19 +++++++ pkg/util/indexparams/index_params.go | 1 + 15 files changed, 118 insertions(+), 21 deletions(-) diff --git a/internal/core/src/index/BitmapIndex.cpp b/internal/core/src/index/BitmapIndex.cpp index 0dfc2506cbc28..c0e0290bd6b3b 100644 --- a/internal/core/src/index/BitmapIndex.cpp +++ b/internal/core/src/index/BitmapIndex.cpp @@ -255,7 +255,7 @@ BitmapIndex::Serialize(const Config& config) { ret_set.Append(BITMAP_INDEX_META, index_meta.first, index_meta.second); LOG_INFO("build bitmap index with cardinality = {}, num_rows = {}", - Cardinality(), + data_.size(), total_num_rows_); Disassemble(ret_set); @@ -345,6 +345,31 @@ BitmapIndex::DeserializeIndexData(const uint8_t* data_ptr, } } +template +void +BitmapIndex::BuildOffsetCache() { + if (build_mode_ == BitmapIndexBuildMode::ROARING) { + data_offsets_cache_.resize(total_num_rows_); + for (auto it = data_.begin(); it != data_.end(); it++) { + for (const auto& v : it->second) { + data_offsets_cache_[v] = it; + } + } + } else { + bitsets_offsets_cache_.resize(total_num_rows_); // size once up front + for (auto it = bitsets_.begin(); it != bitsets_.end(); 
it++) { + const auto& bits = it->second; + for (size_t i = 0; i < bits.size(); i++) { + if (bits[i]) { + bitsets_offsets_cache_[i] = it; + } + } + } + } + use_offset_cache_ = true; + LOG_INFO("build offset cache for bitmap index"); +} + template <> void BitmapIndex::DeserializeIndexData(const uint8_t* data_ptr, @@ -377,6 +402,9 @@ template void BitmapIndex::LoadWithoutAssemble(const BinarySet& binary_set, const Config& config) { + auto enable_offset_cache = + GetValueFromConfig(config, ENABLE_OFFSET_CACHE); + auto index_meta_buffer = binary_set.GetByName(BITMAP_INDEX_META); auto index_meta = DeserializeIndexMeta(index_meta_buffer->data.get(), index_meta_buffer->size); @@ -387,6 +415,10 @@ BitmapIndex::LoadWithoutAssemble(const BinarySet& binary_set, auto index_data_buffer = binary_set.GetByName(BITMAP_INDEX_DATA); DeserializeIndexData(index_data_buffer->data.get(), index_length); + if (enable_offset_cache.has_value() && enable_offset_cache.value()) { + BuildOffsetCache(); + } + LOG_INFO("load bitmap index with cardinality = {}, num_rows = {}", Cardinality(), total_num_rows_); @@ -575,7 +607,6 @@ BitmapIndex::RangeForRoaring(const T value, const OpType op) { } auto lb = data_.begin(); auto ub = data_.end(); - switch (op) { case OpType::LessThan: { ub = std::lower_bound(data_.begin(), @@ -758,12 +789,26 @@ BitmapIndex::RangeForRoaring(const T lower_value, return res; } +template +T +BitmapIndex::Reverse_Lookup_InCache(size_t idx) const { + if (build_mode_ == BitmapIndexBuildMode::ROARING) { + return data_offsets_cache_[idx]->first; + } else { + return bitsets_offsets_cache_[idx]->first; + } +} + template T BitmapIndex::Reverse_Lookup(size_t idx) const { AssertInfo(is_built_, "index has not been built"); AssertInfo(idx < total_num_rows_, "out of range of total coun"); + if (use_offset_cache_) { + return Reverse_Lookup_InCache(idx); + } + if (build_mode_ == BitmapIndexBuildMode::ROARING) { for (auto it = data_.begin(); it 
!= data_.end(); it++) { for (const auto& v : it->second) { diff --git a/internal/core/src/index/BitmapIndex.h b/internal/core/src/index/BitmapIndex.h index 3bf279cf8b75b..5c97ccec81bd6 100644 --- a/internal/core/src/index/BitmapIndex.h +++ b/internal/core/src/index/BitmapIndex.h @@ -175,6 +175,12 @@ class BitmapIndex : public ScalarIndex { void DeserializeIndexData(const uint8_t* data_ptr, size_t index_length); + void + BuildOffsetCache(); + + T + Reverse_Lookup_InCache(size_t idx) const; + void ChooseIndexLoadMode(int64_t index_length); @@ -210,6 +216,11 @@ class BitmapIndex : public ScalarIndex { std::map bitsets_; size_t total_num_rows_{0}; proto::schema::FieldSchema schema_; + bool use_offset_cache_{false}; + std::vector::iterator> + data_offsets_cache_; + std::vector::iterator> + bitsets_offsets_cache_; std::shared_ptr file_manager_; // generate valid_bitset to speed up NotIn and IsNull and IsNotNull operate diff --git a/internal/core/src/index/Index.h b/internal/core/src/index/Index.h index 0061253de620f..f5072a653ef6b 100644 --- a/internal/core/src/index/Index.h +++ b/internal/core/src/index/Index.h @@ -25,9 +25,6 @@ #include "common/Tracer.h" #include "common/Types.h" -const std::string kMmapFilepath = "mmap_filepath"; -const std::string kEnableMmap = "enable_mmap"; - namespace milvus::index { class IndexBase { diff --git a/internal/core/src/index/Meta.h b/internal/core/src/index/Meta.h index 1d427eb1debe6..c0c9ea6cd81b5 100644 --- a/internal/core/src/index/Meta.h +++ b/internal/core/src/index/Meta.h @@ -58,6 +58,12 @@ constexpr const char* INDEX_ENGINE_VERSION = "index_engine_version"; constexpr const char* BITMAP_INDEX_CARDINALITY_LIMIT = "bitmap_cardinality_limit"; +// index config key +constexpr const char* MMAP_FILE_PATH = "mmap_filepath"; +constexpr const char* ENABLE_MMAP = "enable_mmap"; +constexpr const char* INDEX_FILES = "index_files"; +constexpr const char* ENABLE_OFFSET_CACHE = "indexoffsetcache.enabled"; + // VecIndex file metas constexpr 
const char* DISK_ANN_PREFIX_PATH = "index_prefix"; constexpr const char* DISK_ANN_RAW_DATA_PATH = "data_path"; diff --git a/internal/core/src/index/StringIndexMarisa.cpp b/internal/core/src/index/StringIndexMarisa.cpp index 6052532fc0a87..26e6bd8c0c6b1 100644 --- a/internal/core/src/index/StringIndexMarisa.cpp +++ b/internal/core/src/index/StringIndexMarisa.cpp @@ -201,7 +201,7 @@ StringIndexMarisa::LoadWithoutAssemble(const BinarySet& set, } file.Seek(0, SEEK_SET); - if (config.contains(kEnableMmap)) { + if (config.contains(ENABLE_MMAP)) { trie_.mmap(file_name.c_str()); } else { trie_.read(file.Descriptor()); diff --git a/internal/core/src/index/Utils.h b/internal/core/src/index/Utils.h index 1444eeeac638d..1c5f175e26cb5 100644 --- a/internal/core/src/index/Utils.h +++ b/internal/core/src/index/Utils.h @@ -26,6 +26,7 @@ #include #include #include +#include #include "common/Types.h" #include "common/FieldData.h" @@ -79,7 +80,12 @@ void inline CheckParameter(Config& conf, template inline std::optional GetValueFromConfig(const Config& cfg, const std::string& key) { + // cfg value are all string type if (cfg.contains(key)) { + if constexpr (std::is_same_v) { + return boost::algorithm::to_lower_copy( + cfg.at(key).get()) == "true"; + } return cfg.at(key).get(); } return std::nullopt; diff --git a/internal/core/src/index/VectorDiskIndex.cpp b/internal/core/src/index/VectorDiskIndex.cpp index 5bc7400ebe73c..e33336fb2e460 100644 --- a/internal/core/src/index/VectorDiskIndex.cpp +++ b/internal/core/src/index/VectorDiskIndex.cpp @@ -406,9 +406,9 @@ VectorDiskAnnIndex::update_load_json(const Config& config) { } } - if (config.contains(kMmapFilepath)) { - load_config.erase(kMmapFilepath); - load_config[kEnableMmap] = true; + if (config.contains(MMAP_FILE_PATH)) { + load_config.erase(MMAP_FILE_PATH); + load_config[ENABLE_MMAP] = true; } return load_config; diff --git a/internal/core/src/index/VectorMemIndex.cpp b/internal/core/src/index/VectorMemIndex.cpp index 
97e5752626daf..6d7767fcf4e3d 100644 --- a/internal/core/src/index/VectorMemIndex.cpp +++ b/internal/core/src/index/VectorMemIndex.cpp @@ -32,6 +32,7 @@ #include "index/Index.h" #include "index/IndexInfo.h" +#include "index/Meta.h" #include "index/Utils.h" #include "common/EasyAssert.h" #include "config/ConfigKnowhere.h" @@ -142,7 +143,7 @@ template void VectorMemIndex::Load(milvus::tracer::TraceContext ctx, const Config& config) { - if (config.contains(kMmapFilepath)) { + if (config.contains(MMAP_FILE_PATH)) { return LoadFromFile(config); } @@ -483,7 +484,7 @@ VectorMemIndex::GetSparseVector(const DatasetPtr dataset) const { template void VectorMemIndex::LoadFromFile(const Config& config) { - auto filepath = GetValueFromConfig(config, kMmapFilepath); + auto filepath = GetValueFromConfig(config, MMAP_FILE_PATH); AssertInfo(filepath.has_value(), "mmap filepath is empty when load index"); std::filesystem::create_directories( @@ -598,8 +599,8 @@ void VectorMemIndex::LoadFromFile(const Config& config) { LOG_INFO("load index into Knowhere..."); auto conf = config; - conf.erase(kMmapFilepath); - conf[kEnableMmap] = true; + conf.erase(MMAP_FILE_PATH); + conf[ENABLE_MMAP] = true; auto start_deserialize = std::chrono::system_clock::now(); auto stat = index_.DeserializeFromFile(filepath.value(), conf); auto deserialize_duration = diff --git a/internal/core/src/segcore/load_index_c.cpp b/internal/core/src/segcore/load_index_c.cpp index 0db7f7e3ecdf5..cf36f64e3d510 100644 --- a/internal/core/src/segcore/load_index_c.cpp +++ b/internal/core/src/segcore/load_index_c.cpp @@ -221,7 +221,6 @@ AppendIndexV2(CTraceContext c_trace, CLoadIndexInfo c_load_index_info) { static_cast(c_load_index_info); auto& index_params = load_index_info->index_params; auto field_type = load_index_info->field_type; - auto engine_version = load_index_info->index_engine_version; milvus::index::CreateIndexInfo index_info; @@ -271,7 +270,7 @@ AppendIndexV2(CTraceContext c_trace, CLoadIndexInfo 
c_load_index_info) { auto config = milvus::index::ParseConfigFromIndexParams( load_index_info->index_params); - config["index_files"] = load_index_info->index_files; + config[milvus::index::INDEX_FILES] = load_index_info->index_files; milvus::storage::FileManagerContext fileManagerContext( field_meta, index_meta, remote_chunk_manager); @@ -289,9 +288,10 @@ AppendIndexV2(CTraceContext c_trace, CLoadIndexInfo c_load_index_info) { std::to_string(load_index_info->field_id) / std::to_string(load_index_info->index_id); - config[kMmapFilepath] = filepath.string(); + config[milvus::index::MMAP_FILE_PATH] = filepath.string(); } + LOG_DEBUG("load index with configs: {}", config.dump()); load_index_info->index->Load(ctx, config); span->End(); diff --git a/internal/datacoord/index_service.go b/internal/datacoord/index_service.go index 3b981df605fa0..3c519341cb732 100644 --- a/internal/datacoord/index_service.go +++ b/internal/datacoord/index_service.go @@ -272,13 +272,19 @@ func (s *Server) CreateIndex(ctx context.Context, req *indexpb.CreateIndexReques func ValidateIndexParams(index *model.Index) error { indexType := GetIndexType(index.IndexParams) indexParams := funcutil.KeyValuePair2Map(index.IndexParams) + userIndexParams := funcutil.KeyValuePair2Map(index.UserIndexParams) if err := indexparamcheck.ValidateMmapIndexParams(indexType, indexParams); err != nil { return merr.WrapErrParameterInvalidMsg("invalid mmap index params", err.Error()) } - userIndexParams := funcutil.KeyValuePair2Map(index.UserIndexParams) if err := indexparamcheck.ValidateMmapIndexParams(indexType, userIndexParams); err != nil { return merr.WrapErrParameterInvalidMsg("invalid mmap user index params", err.Error()) } + if err := indexparamcheck.ValidateOffsetCacheIndexParams(indexType, indexParams); err != nil { + return merr.WrapErrParameterInvalidMsg("invalid offset cache index params", err.Error()) + } + if err := indexparamcheck.ValidateOffsetCacheIndexParams(indexType, userIndexParams); err != nil { 
+ return merr.WrapErrParameterInvalidMsg("invalid offset cache user index params", err.Error()) + } return nil } diff --git a/pkg/common/common.go b/pkg/common/common.go index aa77b5f5dafcf..bfbc640e45617 100644 --- a/pkg/common/common.go +++ b/pkg/common/common.go @@ -167,10 +167,11 @@ const ( // common properties const ( - MmapEnabledKey = "mmap.enabled" - LazyLoadEnableKey = "lazyload.enabled" - PartitionKeyIsolationKey = "partitionkey.isolation" - FieldSkipLoadKey = "field.skipLoad" + MmapEnabledKey = "mmap.enabled" + LazyLoadEnableKey = "lazyload.enabled" + PartitionKeyIsolationKey = "partitionkey.isolation" + FieldSkipLoadKey = "field.skipLoad" + IndexOffsetCacheEnabledKey = "indexoffsetcache.enabled" ) const ( diff --git a/pkg/util/indexparamcheck/bitmap_checker_test.go b/pkg/util/indexparamcheck/bitmap_checker_test.go index 6bf134854a491..95d74f85bc2dd 100644 --- a/pkg/util/indexparamcheck/bitmap_checker_test.go +++ b/pkg/util/indexparamcheck/bitmap_checker_test.go @@ -29,4 +29,5 @@ func Test_BitmapIndexChecker(t *testing.T) { assert.Error(t, c.CheckValidDataType(&schemapb.FieldSchema{DataType: schemapb.DataType_Double})) assert.Error(t, c.CheckValidDataType(&schemapb.FieldSchema{DataType: schemapb.DataType_Array, ElementType: schemapb.DataType_Float})) assert.Error(t, c.CheckValidDataType(&schemapb.FieldSchema{DataType: schemapb.DataType_Array, ElementType: schemapb.DataType_Double})) + assert.Error(t, c.CheckValidDataType(&schemapb.FieldSchema{DataType: schemapb.DataType_Double, IsPrimaryKey: true})) } diff --git a/pkg/util/indexparamcheck/bitmap_index_checker.go b/pkg/util/indexparamcheck/bitmap_index_checker.go index f19b472baa7e9..f19943a50ea93 100644 --- a/pkg/util/indexparamcheck/bitmap_index_checker.go +++ b/pkg/util/indexparamcheck/bitmap_index_checker.go @@ -16,6 +16,9 @@ func (c *BITMAPChecker) CheckTrain(params map[string]string) error { } func (c *BITMAPChecker) CheckValidDataType(field *schemapb.FieldSchema) error { + if field.IsPrimaryKey { + 
return fmt.Errorf("create bitmap index on primary key not supported") + } mainType := field.GetDataType() elemType := field.GetElementType() if !typeutil.IsBoolType(mainType) && !typeutil.IsIntegerType(mainType) && diff --git a/pkg/util/indexparamcheck/index_type.go b/pkg/util/indexparamcheck/index_type.go index efe4bbbc7375a..31f3ca3f6683a 100644 --- a/pkg/util/indexparamcheck/index_type.go +++ b/pkg/util/indexparamcheck/index_type.go @@ -73,6 +73,10 @@ func IsVectorMmapIndex(indexType IndexType) bool { indexType == IndexSparseWand } +func IsOffsetCacheSupported(indexType IndexType) bool { + return indexType == IndexBitmap +} + func IsDiskIndex(indexType IndexType) bool { return indexType == IndexDISKANN } @@ -96,3 +100,18 @@ func ValidateMmapIndexParams(indexType IndexType, indexParams map[string]string) } return nil } + +func ValidateOffsetCacheIndexParams(indexType IndexType, indexParams map[string]string) error { + offsetCacheEnable, ok := indexParams[common.IndexOffsetCacheEnabledKey] + if !ok { + return nil + } + enable, err := strconv.ParseBool(offsetCacheEnable) + if err != nil { + return fmt.Errorf("invalid %s value: %s, expected: true, false", common.IndexOffsetCacheEnabledKey, offsetCacheEnable) + } + if enable && !IsOffsetCacheSupported(indexType) { // reject only index types without offset-cache support + return fmt.Errorf("only bitmap index support %s now", common.IndexOffsetCacheEnabledKey) + } + return nil +} diff --git a/pkg/util/indexparams/index_params.go b/pkg/util/indexparams/index_params.go index d3d2433591c71..b4876c9c7c24f 100644 --- a/pkg/util/indexparams/index_params.go +++ b/pkg/util/indexparams/index_params.go @@ -54,6 +54,7 @@ var configableIndexParams = typeutil.NewSet[string]() func init() { configableIndexParams.Insert(common.MmapEnabledKey) + configableIndexParams.Insert(common.IndexOffsetCacheEnabledKey) } func IsConfigableIndexParam(key string) bool {