From 998a8b8a9fabe1df78ef9dc33cbcd0d1ee08a87b Mon Sep 17 00:00:00 2001
From: liliu-z <105927039+liliu-z@users.noreply.github.com>
Date: Sat, 10 Jun 2023 00:26:33 -0700
Subject: [PATCH] Sync perform of HNSW/DiskANN (#924)
Signed-off-by: liliu-z
---
knowhere/index/vector_index/ConfAdapter.cpp | 10 +--
.../index/vector_index/IndexDiskANNConfig.cpp | 64 ++++++++++---------
.../index/vector_index/IndexDiskANNConfig.h | 8 ++-
knowhere/index/vector_index/IndexHNSW.cpp | 23 ++-----
.../vector_index/helpers/IndexParameter.h | 8 ++-
unittest/test_diskann.cpp | 2 +-
6 files changed, 55 insertions(+), 60 deletions(-)
diff --git a/knowhere/index/vector_index/ConfAdapter.cpp b/knowhere/index/vector_index/ConfAdapter.cpp
index 5a9591313..232faed14 100644
--- a/knowhere/index/vector_index/ConfAdapter.cpp
+++ b/knowhere/index/vector_index/ConfAdapter.cpp
@@ -39,7 +39,6 @@ static const int64_t HNSW_MIN_EFCONSTRUCTION = 8;
static const int64_t HNSW_MAX_EFCONSTRUCTION = 512;
static const int64_t HNSW_MIN_M = 4;
static const int64_t HNSW_MAX_M = 64;
-static const int64_t HNSW_MAX_EF = 32768;
static const std::vector default_metric_array{metric::L2, metric::IP};
static const std::vector default_binary_metric_array{metric::HAMMING, metric::JACCARD, metric::TANIMOTO,
@@ -237,14 +236,7 @@ HNSWConfAdapter::CheckTrain(Config& cfg, const IndexMode mode) {
bool
HNSWConfAdapter::CheckSearch(Config& cfg, const IndexType type, const IndexMode mode) {
- auto topk = GetMetaTopk(cfg);
- if (topk < HNSW_MAX_EF) {
- // normal case if topk is not large
- CheckIntegerRange(cfg, indexparam::EF, GetMetaTopk(cfg), HNSW_MAX_EF);
- } else {
- // if topk is large
- CheckIntegerRange(cfg, indexparam::EF, topk, topk * 2);
- }
+ CheckIntegerRange(cfg, indexparam::EF, GetMetaTopk(cfg), std::numeric_limits<int64_t>::max());
return ConfAdapter::CheckSearch(cfg, type, mode);
}
diff --git a/knowhere/index/vector_index/IndexDiskANNConfig.cpp b/knowhere/index/vector_index/IndexDiskANNConfig.cpp
index 07eb16847..a909eb7fa 100644
--- a/knowhere/index/vector_index/IndexDiskANNConfig.cpp
+++ b/knowhere/index/vector_index/IndexDiskANNConfig.cpp
@@ -12,6 +12,7 @@
#include "knowhere/index/vector_index/IndexDiskANNConfig.h"
#include
+#include
#include
#include
#include
@@ -63,8 +64,9 @@ static constexpr uint32_t kBuildNumThreadsMinValue = 1;
static constexpr uint32_t kBuildNumThreadsMaxValue = 128;
static constexpr uint32_t kDiskPqBytesMinValue = 0;
static constexpr std::optional<uint32_t> kDiskPqBytesMaxValue = std::nullopt;
-static constexpr uint32_t kSearchListSizeMaxValue = 200;
-static constexpr uint32_t kKThreshold = 16;
+static constexpr std::optional<uint32_t> kSearchListSizeMaxValue = std::nullopt;
+static constexpr uint32_t kDefaultSearchListSizeDivider = 16;
+static constexpr uint32_t kInvalideSearchListSize = 0;
static constexpr uint32_t kBeamwidthMinValue = 1;
static constexpr uint32_t kBeamwidthMaxValue = 128;
static constexpr float kFilterThresholdMinValue = -1;
@@ -142,6 +144,17 @@ CheckNumericParamAndSet(const Config& config, const std::string& key, std::optio
config.at(key).get_to(to_be_set);
}
+template <typename T>
+void
+CheckNumericParamAndSetWithDefault(const Config& config, const std::string& key, std::optional<T> min_o,
+ std::optional<T> max_o, T default_value, T& to_be_set) {
+ if (!config.contains(key)) {
+ to_be_set = default_value;
+ return;
+ }
+ CheckNumericParamAndSet(config, key, min_o, max_o, to_be_set);
+}
+
/**
* @brief Check the non-numeric param's existence and type, and allocate it to the config.
*/
@@ -196,11 +209,8 @@ to_json(Config& config, const DiskANNPrepareConfig& prep_conf) {
void
from_json(const Config& config, DiskANNPrepareConfig& prep_conf) {
- if (config.contains(kAioMaxnr)) {
- CheckNumericParamAndSet(config, kAioMaxnr, kAioMaxnrMinValue, kAioMaxnrMaxValue, prep_conf.aio_maxnr);
- } else {
- prep_conf.aio_maxnr = kAioMaxnrDefaultValue;
- }
+ CheckNumericParamAndSetWithDefault(config, kAioMaxnr, kAioMaxnrMinValue, kAioMaxnrMaxValue,
+ kAioMaxnrDefaultValue, prep_conf.aio_maxnr);
CheckNumericParamAndSet(config, kCacheDramBudgetGb, kCacheDramBudgetGbMinValue, kCacheDramBudgetGbMaxValue,
prep_conf.search_cache_budget_gb);
@@ -219,23 +229,21 @@ to_json(Config& config, const DiskANNQueryConfig& query_conf) {
void
from_json(const Config& config, DiskANNQueryConfig& query_conf) {
CheckNumericParamAndSet(config, kK, kKMinValue, kKMaxValue, query_conf.k);
- auto search_list_threshold = query_conf.k < kKThreshold ? kKThreshold : query_conf.k;
- if (config.contains(kSearchListSize)) {
- // The search_list_size should be no less than the k.
- CheckNumericParamAndSet<uint32_t>(config, kSearchListSize, query_conf.k,
- std::max(kSearchListSizeMaxValue, static_cast<uint32_t>(10 * query_conf.k)),
- query_conf.search_list_size);
+ uint32_t default_search_list_size =
+ query_conf.k <= kDefaultSearchListSizeDivider ? kDefaultSearchListSizeDivider : query_conf.k;
+
+ if (config.contains(kSearchListSize) &&
+ kInvalideSearchListSize == GetValueFromConfig<uint32_t>(config, kSearchListSize)) { // Exist but invalid
+ query_conf.search_list_size = default_search_list_size;
} else {
- // if search_list_size not set (==0), not in json string, modify the value.
- query_conf.search_list_size = search_list_threshold;
+ CheckNumericParamAndSetWithDefault<uint32_t>(config, kSearchListSize, query_conf.k, kSearchListSizeMaxValue,
+ default_search_list_size, query_conf.search_list_size);
}
+
CheckNumericParamAndSet(config, kBeamwidth, kBeamwidthMinValue, kBeamwidthMaxValue, query_conf.beamwidth);
- if (config.contains(kFilterThreshold)) {
- CheckNumericParamAndSet(config, kFilterThreshold, kFilterThresholdMinValue, kFilterThresholdMaxValue,
- query_conf.filter_threshold);
- } else {
- query_conf.filter_threshold = kFilterThresholdMinValue;
- }
+ CheckNumericParamAndSetWithDefault(config, kFilterThreshold, kFilterThresholdMinValue,
+ kFilterThresholdMaxValue, kFilterThresholdMinValue,
+ query_conf.filter_threshold);
}
void
@@ -261,12 +269,9 @@ from_json(const Config& config, DiskANNQueryByRangeConfig& query_conf) {
CheckNumericParamAndSet(config, kMinK, kMinKMinValue, kMinKMaxValue, query_conf.min_k);
CheckNumericParamAndSet(config, kMaxK, query_conf.min_k, kMaxKMaxValue, query_conf.max_k);
CheckNumericParamAndSet(config, kBeamwidth, kBeamwidthMinValue, kBeamwidthMaxValue, query_conf.beamwidth);
- if (config.contains(kSearchListAndKRatio)) {
- CheckNumericParamAndSet(config, kSearchListAndKRatio, kSearchListAndKRatioMinValue,
- kSearchListAndKRatioMaxValue, query_conf.search_list_and_k_ratio);
- } else {
- query_conf.search_list_and_k_ratio = kSearchListAndKRatioDefaultValue;
- }
+ CheckNumericParamAndSetWithDefault(config, kSearchListAndKRatio, kSearchListAndKRatioMinValue,
+ kSearchListAndKRatioMaxValue, kSearchListAndKRatioDefaultValue,
+ query_conf.search_list_and_k_ratio);
}
DiskANNBuildConfig
@@ -309,10 +314,11 @@ DiskANNQueryByRangeConfig::Set(Config& config, const DiskANNQueryByRangeConfig&
config[kDiskANNQueryByRangeConfig] = query_conf;
}
-const DiskANNPrepareConfig kSanityCheckDiskANNPrepareConfig; // use default
+const DiskANNPrepareConfig kSanityCheckDiskANNPrepareConfig; // use default
const DiskANNQueryConfig kSanityCheckDiskANNQueryConfig{kSanityCheckMinTopK, kSanityCheckMinTopK};
-Config GenSanityCheckDiskANNConfig(const Config& build_config) {
+Config
+GenSanityCheckDiskANNConfig(const Config& build_config) {
Config config = build_config;
DiskANNPrepareConfig::Set(config, kSanityCheckDiskANNPrepareConfig);
DiskANNQueryConfig::Set(config, kSanityCheckDiskANNQueryConfig);
diff --git a/knowhere/index/vector_index/IndexDiskANNConfig.h b/knowhere/index/vector_index/IndexDiskANNConfig.h
index a6d3a9459..16329cebf 100644
--- a/knowhere/index/vector_index/IndexDiskANNConfig.h
+++ b/knowhere/index/vector_index/IndexDiskANNConfig.h
@@ -85,8 +85,9 @@ struct DiskANNPrepareConfig {
struct DiskANNQueryConfig {
uint64_t k;
// A list of search_list sizes to perform searches with. Larger parameters will result in slower latencies, but
- // higher accuracies. Must be at least the value of k.
- uint32_t search_list_size = 128;
+ // higher accuracies. Must be at least the value of k. Defaults to 0, which means Knowhere chooses the
+ // default value.
+ uint32_t search_list_size = 0;
// The beamwidth to be used for search. This is the maximum number of IO requests each query will issue per
// iteration of search code. Larger beamwidth will result in fewer IO round-trips per query but might result in
// slightly higher total number of IO requests to SSD per query. For the highest query throughput with a fixed SSD
@@ -125,5 +126,6 @@ struct DiskANNQueryByRangeConfig {
Set(Config& config, const DiskANNQueryByRangeConfig& query_conf);
};
-Config GenSanityCheckDiskANNConfig(const Config& build_config);
+Config
+GenSanityCheckDiskANNConfig(const Config& build_config);
} // namespace knowhere
diff --git a/knowhere/index/vector_index/IndexHNSW.cpp b/knowhere/index/vector_index/IndexHNSW.cpp
index 98740da45..760fde183 100644
--- a/knowhere/index/vector_index/IndexHNSW.cpp
+++ b/knowhere/index/vector_index/IndexHNSW.cpp
@@ -29,21 +29,11 @@
#include "index/vector_index/helpers/RangeUtil.h"
namespace knowhere {
+
namespace {
- inline int64_t
- CheckAndGetEfValue(const Config& config) {
- auto topk_val = GetMetaTopk(config);
- if (CheckKeyInConfig(config, indexparam::EF)) {
- auto ef_val = GetIndexParamEf(config);
- if (ef_val < topk_val) {
- KNOWHERE_THROW_MSG("ef is smaller than topk in hnsw.");
- }
- return ef_val;
- } else {
- return std::max(knowhere::DEFAULT_HNSW_EF, topk_val);
- }
- }
-} // namespace
+static constexpr int64_t kDefaultEfDivider = 16;
+static constexpr int64_t kDefaultRangeSearchEf = 16;
+}
BinarySet
IndexHNSW::Serialize(const Config& config) {
@@ -267,7 +257,7 @@ IndexHNSW::QueryImpl(int64_t n, const float* xq, int64_t k, float* distances, in
feder = std::make_unique();
}
- size_t ef = CheckAndGetEfValue(config);
+ size_t ef = GetIndexParamEf(config, k <= kDefaultEfDivider ? kDefaultEfDivider : k);
hnswlib::SearchParam param{ef};
bool transform = (index_->metric_type_ == 1); // InnerProduct: 1
@@ -309,7 +299,8 @@ IndexHNSW::QueryByRangeImpl(int64_t n, const float* xq, float*& distances, int64
feder = std::make_unique();
}
- size_t ef = CheckAndGetEfValue(config);
+ size_t ef = GetIndexParamEf(config, kDefaultRangeSearchEf);
+
hnswlib::SearchParam param{ef};
float radius = GetMetaRadius(config);
diff --git a/knowhere/index/vector_index/helpers/IndexParameter.h b/knowhere/index/vector_index/helpers/IndexParameter.h
index 3bfe3071d..4ac6cdff8 100644
--- a/knowhere/index/vector_index/helpers/IndexParameter.h
+++ b/knowhere/index/vector_index/helpers/IndexParameter.h
@@ -108,6 +108,11 @@ SetValueToConfig(Config& cfg, const std::string& key, const T value) {
return GetValueFromConfigWithDefaultValue(cfg, key, value); \
}
+#define DEFINE_CONFIG_GETTER_WITH_CUSTOMIZED_DEFAULT_VALUE(func_name, key, T) \
+ inline T func_name(const Config& cfg, T value) { \
+ return GetValueFromConfigWithDefaultValue(cfg, key, value); \
+ }
+
#define DEFINE_CONFIG_SETTER(func_name, key, T) \
inline void func_name(Config& cfg, T value) { \
SetValueToConfig(cfg, key, (T)(value)); \
@@ -153,7 +158,6 @@ static const int64_t DEFAULT_PQ_M = 4;
static const int64_t DEFAULT_PQ_NBITS = 8;
static const int64_t DEFAULT_HNSW_EFCONSTRUCTION = 360;
static const int64_t DEFAULT_HNSW_M = 30;
-static const int64_t DEFAULT_HNSW_EF = 16;
DEFINE_CONFIG_GETTER_WITH_DEFAULT_VALUE(GetIndexParamNprobe, indexparam::NPROBE, DEFAULT_NPROBE, int64_t)
DEFINE_CONFIG_SETTER(SetIndexParamNprobe, indexparam::NPROBE, int64_t)
@@ -176,7 +180,7 @@ DEFINE_CONFIG_SETTER(SetIndexParamEfConstruction, indexparam::EFCONSTRUCTION, in
DEFINE_CONFIG_GETTER_WITH_DEFAULT_VALUE(GetIndexParamHNSWM, indexparam::HNSW_M, DEFAULT_HNSW_M, int64_t)
DEFINE_CONFIG_SETTER(SetIndexParamHNSWM, indexparam::HNSW_M, int64_t)
-DEFINE_CONFIG_GETTER(GetIndexParamEf, indexparam::EF, int64_t)
+DEFINE_CONFIG_GETTER_WITH_CUSTOMIZED_DEFAULT_VALUE(GetIndexParamEf, indexparam::EF, int64_t)
DEFINE_CONFIG_SETTER(SetIndexParamEf, indexparam::EF, int64_t)
DEFINE_CONFIG_GETTER(GetIndexParamOverviewLevels, indexparam::OVERVIEW_LEVELS, int64_t)
diff --git a/unittest/test_diskann.cpp b/unittest/test_diskann.cpp
index cd8f98381..f5e941c83 100644
--- a/unittest/test_diskann.cpp
+++ b/unittest/test_diskann.cpp
@@ -477,7 +477,7 @@ TEST_P(DiskANNTest, search_without_search_list_size) {
cfg.clear();
knowhere::DiskANNQueryConfig::Set(cfg, tmp_config);
search_list_size = knowhere::DiskANNQueryConfig::Get(cfg).search_list_size;
- EXPECT_EQ(search_list_size, 128);
+ EXPECT_EQ(search_list_size, 16);
}
TEST_P(DiskANNTest, knn_search_test) {