From 869c93d4347ea46a0a296cc5306d5a1c3be1ce81 Mon Sep 17 00:00:00 2001
From: Jiquan Long
Date: Tue, 11 Jun 2024 14:13:54 +0800
Subject: [PATCH] enhance: speed up array-equal operator via inverted index
 (#33633)

Signed-off-by: longjiquan
---
 internal/core/src/common/Schema.h             |   9 +
 internal/core/src/exec/expression/Expr.h      |  16 +
 .../core/src/exec/expression/UnaryExpr.cpp    | 166 +++++++++-
 internal/core/src/exec/expression/UnaryExpr.h |   8 +
 .../core/src/index/InvertedIndexTantivy.cpp   |  54 +++-
 .../core/src/index/InvertedIndexTantivy.h     |  12 +
 internal/core/src/index/ScalarIndex.h         |  14 +
 .../core/thirdparty/tantivy/tantivy-wrapper.h |   9 -
 internal/core/thirdparty/tantivy/test.cpp     |  10 +-
 internal/core/unittest/CMakeLists.txt         |   1 +
 .../unittest/test_array_inverted_index.cpp    | 297 ++++++++++++++++++
 .../core/unittest/test_inverted_index.cpp     |   2 -
 internal/core/unittest/test_utils/DataGen.h   |  26 +-
 .../core/unittest/test_utils/GenExprProto.h   |  11 +-
 14 files changed, 614 insertions(+), 21 deletions(-)
 create mode 100644 internal/core/unittest/test_array_inverted_index.cpp

diff --git a/internal/core/src/common/Schema.h b/internal/core/src/common/Schema.h
index b1068dd650392..754766f54388b 100644
--- a/internal/core/src/common/Schema.h
+++ b/internal/core/src/common/Schema.h
@@ -51,6 +51,15 @@ class Schema {
         return field_id;
     }
 
+    FieldId
+    AddDebugArrayField(const std::string& name, DataType element_type) {
+        auto field_id = FieldId(debug_id);
+        debug_id++;
+        this->AddField(
+            FieldName(name), field_id, DataType::ARRAY, element_type);
+        return field_id;
+    }
+
     // auto gen field_id for convenience
     FieldId
     AddDebugField(const std::string& name,
diff --git a/internal/core/src/exec/expression/Expr.h b/internal/core/src/exec/expression/Expr.h
index ea9eeac92cef9..a300515560b2d 100644
--- a/internal/core/src/exec/expression/Expr.h
+++ b/internal/core/src/exec/expression/Expr.h
@@ -280,6 +280,22 @@ class SegmentExpr : public Expr {
         return result;
     }
 
+    template <typename T, typename FUNC, typename... ValTypes>
+    void
+    ProcessIndexChunksV2(FUNC func, ValTypes... values) {
+        typedef std::
+            conditional_t<std::is_same_v<T, std::string_view>, std::string, T>
+                IndexInnerType;
+        using Index = index::ScalarIndex<IndexInnerType>;
+
+        for (size_t i = current_index_chunk_; i < num_index_chunk_; i++) {
+            const Index& index =
+                segment_->chunk_scalar_index<IndexInnerType>(field_id_, i);
+            auto* index_ptr = const_cast<Index*>(&index);
+            func(index_ptr, values...);
+        }
+    }
+
     template <typename T>
     bool
     CanUseIndex(OpType op) const {
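
Unlike ProcessIndexChunks, which accumulates each chunk's hits into a TargetBitmap, ProcessIndexChunksV2 hands the caller a mutable ScalarIndex pointer for every index chunk and forwards the trailing arguments verbatim. A minimal usage sketch, assuming it runs inside a SegmentExpr subclass (illustrative only, not part of the patch):

    // Count rows matching a probe value across all index chunks.
    int64_t hits = 0;
    auto per_chunk = [&hits](index::ScalarIndex<int64_t>* index, int64_t v) {
        hits += index->In(1, &v).count();  // count() tallies set bits
    };
    ProcessIndexChunksV2<int64_t>(per_chunk, int64_t(42));
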
diff --git a/internal/core/src/exec/expression/UnaryExpr.cpp b/internal/core/src/exec/expression/UnaryExpr.cpp
index f780ec487ba47..b9567133de801 100644
--- a/internal/core/src/exec/expression/UnaryExpr.cpp
+++ b/internal/core/src/exec/expression/UnaryExpr.cpp
@@ -20,6 +20,66 @@
 namespace milvus {
 namespace exec {
 
+template <typename T>
+VectorPtr
+PhyUnaryRangeFilterExpr::ExecRangeVisitorImplArrayForIndex() {
+    return ExecRangeVisitorImplArray<T>();
+}
+
+template <>
+VectorPtr
+PhyUnaryRangeFilterExpr::ExecRangeVisitorImplArrayForIndex<
+    proto::plan::Array>() {
+    switch (expr_->op_type_) {
+        case proto::plan::Equal:
+        case proto::plan::NotEqual: {
+            switch (expr_->column_.element_type_) {
+                case DataType::BOOL: {
+                    return ExecArrayEqualForIndex<bool>(expr_->op_type_ ==
+                                                        proto::plan::NotEqual);
+                }
+                case DataType::INT8: {
+                    return ExecArrayEqualForIndex<int8_t>(
+                        expr_->op_type_ == proto::plan::NotEqual);
+                }
+                case DataType::INT16: {
+                    return ExecArrayEqualForIndex<int16_t>(
+                        expr_->op_type_ == proto::plan::NotEqual);
+                }
+                case DataType::INT32: {
+                    return ExecArrayEqualForIndex<int32_t>(
+                        expr_->op_type_ == proto::plan::NotEqual);
+                }
+                case DataType::INT64: {
+                    return ExecArrayEqualForIndex<int64_t>(
+                        expr_->op_type_ == proto::plan::NotEqual);
+                }
+                case DataType::FLOAT:
+                case DataType::DOUBLE: {
+                    // not accurate on floating point number, rollback to bruteforce.
+                    return ExecRangeVisitorImplArray<proto::plan::Array>();
+                }
+                case DataType::VARCHAR: {
+                    if (segment_->type() == SegmentType::Growing) {
+                        return ExecArrayEqualForIndex<std::string>(
+                            expr_->op_type_ == proto::plan::NotEqual);
+                    } else {
+                        return ExecArrayEqualForIndex<std::string_view>(
+                            expr_->op_type_ == proto::plan::NotEqual);
+                    }
+                }
+                default:
+                    PanicInfo(DataTypeInvalid,
+                              "unsupported element type when execute array "
+                              "equal for index: {}",
+                              expr_->column_.element_type_);
+            }
+        }
+        default:
+            return ExecRangeVisitorImplArray<proto::plan::Array>();
+    }
+}
+
 void
 PhyUnaryRangeFilterExpr::Eval(EvalCtx& context, VectorPtr& result) {
     switch (expr_->column_.data_type_) {
@@ -99,7 +159,13 @@ PhyUnaryRangeFilterExpr::Eval(EvalCtx& context, VectorPtr& result) {
                     result = ExecRangeVisitorImplArray<std::string>();
                     break;
                 case proto::plan::GenericValue::ValCase::kArrayVal:
-                    result = ExecRangeVisitorImplArray<proto::plan::Array>();
+                    if (is_index_mode_) {
+                        result = ExecRangeVisitorImplArrayForIndex<
+                            proto::plan::Array>();
+                    } else {
+                        result =
+                            ExecRangeVisitorImplArray<proto::plan::Array>();
+                    }
                     break;
                 default:
                     PanicInfo(
@@ -196,6 +262,104 @@ PhyUnaryRangeFilterExpr::ExecRangeVisitorImplArray() {
     return res_vec;
 }
 
+template <typename T>
+VectorPtr
+PhyUnaryRangeFilterExpr::ExecArrayEqualForIndex(bool reverse) {
+    typedef std::
+        conditional_t<std::is_same_v<T, std::string_view>, std::string, T>
+            IndexInnerType;
+    using Index = index::ScalarIndex<IndexInnerType>;
+    auto real_batch_size = GetNextBatchSize();
+    if (real_batch_size == 0) {
+        return nullptr;
+    }
+
+    // get all elements.
+    auto val = GetValueFromProto<proto::plan::Array>(expr_->val_);
+    if (val.array_size() == 0) {
+        // rollback to bruteforce. no candidates will be filtered out via index.
+        return ExecRangeVisitorImplArray<proto::plan::Array>();
+    }
+
+    // cache the result to suit the framework.
+    auto batch_res =
+        ProcessIndexChunks<IndexInnerType>([this, &val, reverse](Index* _) {
+            boost::container::vector<IndexInnerType> elems;
+            for (auto const& element : val.array()) {
+                auto e = GetValueFromProto<IndexInnerType>(element);
+                if (std::find(elems.begin(), elems.end(), e) == elems.end()) {
+                    elems.push_back(e);
+                }
+            }
+
+            // filtering by index, get candidates.
+            auto size_per_chunk = segment_->size_per_chunk();
+            auto retrieve = [ size_per_chunk, this ](int64_t offset) -> auto {
+                auto chunk_idx = offset / size_per_chunk;
+                auto chunk_offset = offset % size_per_chunk;
+                const auto& chunk =
+                    segment_->template chunk_data<milvus::ArrayView>(
+                        field_id_, chunk_idx);
+                return chunk.data() + chunk_offset;
+            };
+
+            // compare the array via the raw data.
+            auto filter = [&retrieve, &val, reverse](size_t offset) -> bool {
+                auto data_ptr = retrieve(offset);
+                return data_ptr->is_same_array(val) ^ reverse;
+            };
+
+            // collect all candidates.
+            std::unordered_set<size_t> candidates;
+            std::unordered_set<size_t> tmp_candidates;
+            auto first_callback = [&candidates](size_t offset) -> void {
+                candidates.insert(offset);
+            };
+            auto callback = [&candidates,
+                             &tmp_candidates](size_t offset) -> void {
+                if (candidates.find(offset) != candidates.end()) {
+                    tmp_candidates.insert(offset);
+                }
+            };
+            auto execute_sub_batch =
+                [](Index* index_ptr,
+                   const IndexInnerType& val,
+                   const std::function<void(size_t)>& callback) {
+                    index_ptr->InApplyCallback(1, &val, callback);
+                };
+
+            // run in-filter.
+            for (size_t idx = 0; idx < elems.size(); idx++) {
+                if (idx == 0) {
+                    ProcessIndexChunksV2<IndexInnerType>(
+                        execute_sub_batch, elems[idx], first_callback);
+                } else {
+                    ProcessIndexChunksV2<IndexInnerType>(
+                        execute_sub_batch, elems[idx], callback);
+                    candidates = std::move(tmp_candidates);
+                }
+                // the size of candidates is small enough.
+                if (candidates.size() * 100 < active_count_) {
+                    break;
+                }
+            }
+            TargetBitmap res(active_count_);
+            // run post-filter. The filter will only be executed once in the framework.
+            for (const auto& candidate : candidates) {
+                res[candidate] = filter(candidate);
+            }
+            return res;
+        });
+    AssertInfo(batch_res.size() == real_batch_size,
+               "internal error: expr processed rows {} not equal "
+               "expect batch size {}",
+               batch_res.size(),
+               real_batch_size);
+
+    // return the result.
+    return std::make_shared<ColumnVector>(std::move(batch_res));
+}
+
 template <typename T>
 VectorPtr
 PhyUnaryRangeFilterExpr::ExecRangeVisitorImplJson() {
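
ExecArrayEqualForIndex narrows the search in two phases: an in-filter that intersects, element by element, the posting set of rows whose arrays contain that element (with an early exit once candidates drop below 1% of active rows), and a post-filter that re-checks each surviving row against the raw array data. The intersection logic as a standalone sketch, under the assumption that each element's posting set is already materialized (names are hypothetical, not part of the patch):

    #include <cstddef>
    #include <unordered_set>
    #include <utility>
    #include <vector>

    std::unordered_set<size_t>
    IntersectPostings(const std::vector<std::unordered_set<size_t>>& postings,
                      size_t active_count) {
        std::unordered_set<size_t> candidates;
        for (size_t i = 0; i < postings.size(); i++) {
            if (i == 0) {
                candidates = postings[0];  // mirrors first_callback
            } else {
                std::unordered_set<size_t> tmp;  // mirrors tmp_candidates
                for (auto offset : postings[i]) {
                    if (candidates.count(offset) > 0) {
                        tmp.insert(offset);
                    }
                }
                candidates = std::move(tmp);
            }
            // same early exit as the patch: stop below 1% of active rows
            if (candidates.size() * 100 < active_count) {
                break;
            }
        }
        return candidates;
    }
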
diff --git a/internal/core/src/exec/expression/UnaryExpr.h b/internal/core/src/exec/expression/UnaryExpr.h
index e6342eda86434..40371e0e51f38 100644
--- a/internal/core/src/exec/expression/UnaryExpr.h
+++ b/internal/core/src/exec/expression/UnaryExpr.h
@@ -310,6 +310,14 @@ class PhyUnaryRangeFilterExpr : public SegmentExpr {
     VectorPtr
     ExecRangeVisitorImplArray();
 
+    template <typename T>
+    VectorPtr
+    ExecRangeVisitorImplArrayForIndex();
+
+    template <typename T>
+    VectorPtr
+    ExecArrayEqualForIndex(bool reverse);
+
     // Check overflow and cache result for performance
     template <typename T>
     ColumnVectorPtr
diff --git a/internal/core/src/index/InvertedIndexTantivy.cpp b/internal/core/src/index/InvertedIndexTantivy.cpp
index f09297dd33269..3b9a54fae940b 100644
--- a/internal/core/src/index/InvertedIndexTantivy.cpp
+++ b/internal/core/src/index/InvertedIndexTantivy.cpp
@@ -204,6 +204,25 @@ apply_hits(TargetBitmap& bitset, const RustArrayWrapper& w, bool v)
     }
 }
 
+inline void
+apply_hits_with_filter(TargetBitmap& bitset,
+                       const RustArrayWrapper& w,
+                       const std::function<bool(size_t)>& filter) {
+    for (size_t j = 0; j < w.array_.len; j++) {
+        auto the_offset = w.array_.array[j];
+        bitset[the_offset] = filter(the_offset);
+    }
+}
+
+inline void
+apply_hits_with_callback(
+    const RustArrayWrapper& w,
+    const std::function<void(size_t)>& callback) {
+    for (size_t j = 0; j < w.array_.len; j++) {
+        callback(w.array_.array[j]);
+    }
+}
+
 template <typename T>
 const TargetBitmap
 InvertedIndexTantivy<T>::In(size_t n, const T* values) {
@@ -215,6 +234,28 @@ InvertedIndexTantivy<T>::In(size_t n, const T* values)
     return bitset;
 }
 
+template <typename T>
+const TargetBitmap
+InvertedIndexTantivy<T>::InApplyFilter(
+    size_t n, const T* values, const std::function<bool(size_t)>& filter) {
+    TargetBitmap bitset(Count());
+    for (size_t i = 0; i < n; ++i) {
+        auto array = wrapper_->term_query(values[i]);
+        apply_hits_with_filter(bitset, array, filter);
+    }
+    return bitset;
+}
+
+template <typename T>
+void
+InvertedIndexTantivy<T>::InApplyCallback(
+    size_t n, const T* values, const std::function<void(size_t)>& callback) {
+    for (size_t i = 0; i < n; ++i) {
+        auto array = wrapper_->term_query(values[i]);
+        apply_hits_with_callback(array, callback);
+    }
+}
+
 template <typename T>
 const TargetBitmap
 InvertedIndexTantivy<T>::NotIn(size_t n, const T* values) {
@@ -311,6 +352,9 @@ void
 InvertedIndexTantivy<T>::BuildWithRawData(size_t n,
                                           const void* values,
                                           const Config& config) {
+    if constexpr (std::is_same_v<bool, T>) {
+        schema_.set_data_type(proto::schema::DataType::Bool);
+    }
     if constexpr (std::is_same_v<int8_t, T>) {
         schema_.set_data_type(proto::schema::DataType::Int8);
     }
@@ -341,7 +385,15 @@ InvertedIndexTantivy<T>::BuildWithRawData(size_t n,
     std::string field = "test_inverted_index";
     wrapper_ = std::make_shared<TantivyIndexWrapper>(
         field.c_str(), d_type_, path_.c_str());
-    wrapper_->add_data(static_cast<const T*>(values), n);
+    if (config.find("is_array") != config.end()) {
+        // only used in ut.
+        auto arr = static_cast<const boost::container::vector<T>*>(values);
+        for (size_t i = 0; i < n; i++) {
+            wrapper_->template add_multi_data<T>(arr[i].data(), arr[i].size());
+        }
+    } else {
+        wrapper_->add_data(static_cast<const T*>(values), n);
+    }
     finish();
 }
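
InApplyFilter and InApplyCallback run the same tantivy term queries as In, but instead of only materializing a bitmap they push each hit offset through caller-supplied logic, which is what lets the expression layer intersect posting lists term by term. A usage sketch (illustrative; idx is assumed to be an already-built InvertedIndexTantivy<int64_t>):

    // Collect the posting list of a single term into a vector.
    std::vector<size_t> offsets;
    int64_t term = 7;
    idx.InApplyCallback(1, &term, [&offsets](size_t offset) {
        offsets.push_back(offset);
    });
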
diff --git a/internal/core/src/index/InvertedIndexTantivy.h b/internal/core/src/index/InvertedIndexTantivy.h
index cc0178804c343..53fb9c2d687ac 100644
--- a/internal/core/src/index/InvertedIndexTantivy.h
+++ b/internal/core/src/index/InvertedIndexTantivy.h
@@ -111,6 +111,18 @@ class InvertedIndexTantivy : public ScalarIndex<T> {
     const TargetBitmap
     In(size_t n, const T* values) override;
 
+    const TargetBitmap
+    InApplyFilter(
+        size_t n,
+        const T* values,
+        const std::function<bool(size_t)>& filter) override;
+
+    void
+    InApplyCallback(
+        size_t n,
+        const T* values,
+        const std::function<void(size_t)>& callback) override;
+
     const TargetBitmap
     NotIn(size_t n, const T* values) override;
diff --git a/internal/core/src/index/ScalarIndex.h b/internal/core/src/index/ScalarIndex.h
index aacef521f5db3..37d22a288d80b 100644
--- a/internal/core/src/index/ScalarIndex.h
+++ b/internal/core/src/index/ScalarIndex.h
@@ -50,6 +50,20 @@ class ScalarIndex : public IndexBase {
     virtual const TargetBitmap
     In(size_t n, const T* values) = 0;
 
+    virtual const TargetBitmap
+    InApplyFilter(size_t n,
+                  const T* values,
+                  const std::function<bool(size_t)>& filter) {
+        PanicInfo(ErrorCode::Unsupported, "InApplyFilter is not implemented");
+    }
+
+    virtual void
+    InApplyCallback(size_t n,
+                    const T* values,
+                    const std::function<void(size_t)>& callback) {
+        PanicInfo(ErrorCode::Unsupported, "InApplyCallback is not implemented");
+    }
+
     virtual const TargetBitmap
     NotIn(size_t n, const T* values) = 0;
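
Defaulting both hooks to an Unsupported panic keeps every existing ScalarIndex implementation source-compatible; only indexes that can enumerate hits opt in. A sketch of how a hypothetical index type would do so (illustrative only; the class name and loop body are placeholders):

    #include <cstddef>
    #include <functional>
    #include "index/ScalarIndex.h"

    template <typename T>
    class MyEnumerableIndex : public milvus::index::ScalarIndex<T> {
     public:
        void
        InApplyCallback(size_t n,
                        const T* values,
                        const std::function<void(size_t)>& callback) override {
            for (size_t i = 0; i < n; ++i) {
                // Enumerate the offsets of rows containing values[i] and
                // invoke callback(offset) for each hit; the base-class
                // default would panic with ErrorCode::Unsupported instead.
            }
        }
    };
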
diff --git a/internal/core/thirdparty/tantivy/tantivy-wrapper.h b/internal/core/thirdparty/tantivy/tantivy-wrapper.h
index 3076f502aee21..7574d3875ca24 100644
--- a/internal/core/thirdparty/tantivy/tantivy-wrapper.h
+++ b/internal/core/thirdparty/tantivy/tantivy-wrapper.h
@@ -51,15 +51,6 @@ struct RustArrayWrapper {
         std::cout << ss.str() << std::endl;
     }
 
-    std::set<uint32_t>
-    to_set() {
-        std::set<uint32_t> s;
-        for (int i = 0; i < array_.len; i++) {
-            s.insert(array_.array[i]);
-        }
-        return s;
-    }
-
     RustArray array_;
 
  private:
diff --git a/internal/core/thirdparty/tantivy/test.cpp b/internal/core/thirdparty/tantivy/test.cpp
index 602ea3449f0a2..a380481042487 100644
--- a/internal/core/thirdparty/tantivy/test.cpp
+++ b/internal/core/thirdparty/tantivy/test.cpp
@@ -200,6 +200,12 @@ test_32717() {
     }
 }
 
+std::set<uint32_t>
+to_set(const RustArrayWrapper& w) {
+    std::set<uint32_t> s(w.array_.array, w.array_.array + w.array_.len);
+    return s;
+}
+
 template <typename T>
 std::map<T, std::set<uint32_t>>
 build_inverted_index(const std::vector<std::vector<T>>& vec_of_array) {
@@ -236,7 +242,7 @@ test_array_int() {
 
     auto inverted_index = build_inverted_index(vec_of_array);
     for (const auto& [term, posting_list] : inverted_index) {
-        auto hits = w.term_query(term).to_set();
+        auto hits = to_set(w.term_query(term));
         assert(posting_list == hits);
     }
 }
@@ -266,7 +272,7 @@ test_array_string() {
 
     auto inverted_index = build_inverted_index(vec_of_array);
     for (const auto& [term, posting_list] : inverted_index) {
-        auto hits = w.term_query(term).to_set();
+        auto hits = to_set(w.term_query(term));
         assert(posting_list == hits);
     }
 }
diff --git a/internal/core/unittest/CMakeLists.txt b/internal/core/unittest/CMakeLists.txt
index 657198c9b88c2..e742e25a5a2bb 100644
--- a/internal/core/unittest/CMakeLists.txt
+++ b/internal/core/unittest/CMakeLists.txt
@@ -66,6 +66,7 @@ set(MILVUS_TEST_FILES
     test_group_by.cpp
     test_regex_query_util.cpp
    test_regex_query.cpp
+    test_array_inverted_index.cpp
    )

if ( BUILD_DISK_ANN STREQUAL "ON" )
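
The new test file registered above drives BuildWithRawData through the is_array escape hatch added earlier in this patch: when the config key is present, values is reinterpreted as rows of boost::container::vector<T> and fed to add_multi_data row by row. In miniature (illustrative, test-only usage):

    std::vector<boost::container::vector<int64_t>> rows = {{1, 2}, {2, 3}};
    auto index = std::make_unique<index::InvertedIndexTantivy<int64_t>>();
    Config cfg;
    cfg["is_array"] = true;  // test-only switch checked by BuildWithRawData
    index->BuildWithRawData(rows.size(), rows.data(), cfg);
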
diff --git a/internal/core/unittest/test_array_inverted_index.cpp b/internal/core/unittest/test_array_inverted_index.cpp
new file mode 100644
index 0000000000000..cd4833b52bf38
--- /dev/null
+++ b/internal/core/unittest/test_array_inverted_index.cpp
@@ -0,0 +1,297 @@
+// Copyright (C) 2019-2020 Zilliz. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software distributed under the License
+// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+// or implied. See the License for the specific language governing permissions and limitations under the License
+
+#include <gtest/gtest.h>
+#include <boost/container/vector.hpp>
+
+#include "pb/plan.pb.h"
+#include "index/InvertedIndexTantivy.h"
+#include "common/Schema.h"
+#include "segcore/SegmentSealedImpl.h"
+#include "test_utils/DataGen.h"
+#include "test_utils/GenExprProto.h"
+#include "query/PlanProto.h"
+#include "query/generated/ExecPlanNodeVisitor.h"
+
+using namespace milvus;
+using namespace milvus::query;
+using namespace milvus::segcore;
+
+template <typename T>
+SchemaPtr
+GenTestSchema() {
+    auto schema_ = std::make_shared<Schema>();
+    schema_->AddDebugField(
+        "fvec", DataType::VECTOR_FLOAT, 16, knowhere::metric::L2);
+    auto pk = schema_->AddDebugField("pk", DataType::INT64);
+    schema_->set_primary_field_id(pk);
+
+    if constexpr (std::is_same_v<T, bool>) {
+        schema_->AddDebugArrayField("array", DataType::BOOL);
+    } else if constexpr (std::is_same_v<T, int8_t>) {
+        schema_->AddDebugArrayField("array", DataType::INT8);
+    } else if constexpr (std::is_same_v<T, int16_t>) {
+        schema_->AddDebugArrayField("array", DataType::INT16);
+    } else if constexpr (std::is_same_v<T, int32_t>) {
+        schema_->AddDebugArrayField("array", DataType::INT32);
+    } else if constexpr (std::is_same_v<T, int64_t>) {
+        schema_->AddDebugArrayField("array", DataType::INT64);
+    } else if constexpr (std::is_same_v<T, float>) {
+        schema_->AddDebugArrayField("array", DataType::FLOAT);
+    } else if constexpr (std::is_same_v<T, double>) {
+        schema_->AddDebugArrayField("array", DataType::DOUBLE);
+    } else if constexpr (std::is_same_v<T, std::string>) {
+        schema_->AddDebugArrayField("array", DataType::VARCHAR);
+    }
+
+    return schema_;
+}
+
+template <typename T>
+class ArrayInvertedIndexTest : public ::testing::Test {
+ public:
+    void
+    SetUp() override {
+        schema_ = GenTestSchema<T>();
+        seg_ = CreateSealedSegment(schema_);
+        N_ = 3000;
+        uint64_t seed = 19190504;
+        auto raw_data = DataGen(schema_, N_, seed);
+        auto array_col =
+            raw_data.get_col(schema_->get_field_id(FieldName("array")))
+                ->scalars()
+                .array_data()
+                .data();
+        for (size_t i = 0; i < N_; i++) {
+            boost::container::vector<T> array;
+            if constexpr (std::is_same_v<T, bool>) {
+                for (size_t j = 0; j < array_col[i].bool_data().data_size();
+                     j++) {
+                    array.push_back(array_col[i].bool_data().data(j));
+                }
+            } else if constexpr (std::is_same_v<T, int64_t>) {
+                for (size_t j = 0; j < array_col[i].long_data().data_size();
+                     j++) {
+                    array.push_back(array_col[i].long_data().data(j));
+                }
+            } else if constexpr (std::is_integral_v<T>) {
+                for (size_t j = 0; j < array_col[i].int_data().data_size();
+                     j++) {
+                    array.push_back(array_col[i].int_data().data(j));
+                }
+            } else if constexpr (std::is_floating_point_v<T>) {
+                for (size_t j = 0; j < array_col[i].float_data().data_size();
+                     j++) {
+                    array.push_back(array_col[i].float_data().data(j));
+                }
+            } else if constexpr (std::is_same_v<T, std::string>) {
+                for (size_t j = 0; j < array_col[i].string_data().data_size();
+                     j++) {
+                    array.push_back(array_col[i].string_data().data(j));
+                }
+            }
+            vec_of_array_.push_back(array);
+        }
+        SealedLoadFieldData(raw_data, *seg_);
+        LoadInvertedIndex();
+    }
+
+    void
+    TearDown() override {
+    }
+
+    void
+    LoadInvertedIndex() {
+        auto index = std::make_unique<index::InvertedIndexTantivy<T>>();
+        Config cfg;
+        cfg["is_array"] = true;
+        index->BuildWithRawData(N_, vec_of_array_.data(), cfg);
+        LoadIndexInfo info{
+            .field_id = schema_->get_field_id(FieldName("array")).get(),
+            .index = std::move(index),
+        };
+        seg_->LoadIndex(info);
+    }
+
+ public:
+    SchemaPtr schema_;
+    SegmentSealedUPtr seg_;
+    int64_t N_;
+    std::vector<boost::container::vector<T>> vec_of_array_;
+};
+
+TYPED_TEST_SUITE_P(ArrayInvertedIndexTest);
+
+TYPED_TEST_P(ArrayInvertedIndexTest, ArrayContainsAny) {
+    const auto& meta = this->schema_->operator[](FieldName("array"));
+    auto column_info = test::GenColumnInfo(
+        meta.get_id().get(),
+        static_cast<proto::schema::DataType>(meta.get_data_type()),
+        false,
+        false,
+        static_cast<proto::schema::DataType>(meta.get_element_type()));
+    auto contains_expr = std::make_unique<proto::plan::JSONContainsExpr>();
+    contains_expr->set_allocated_column_info(column_info);
+    contains_expr->set_op(proto::plan::JSONContainsExpr_JSONOp::
+                              JSONContainsExpr_JSONOp_ContainsAny);
+    contains_expr->set_elements_same_type(true);
+    for (const auto& elem : this->vec_of_array_[0]) {
+        auto t = test::GenGenericValue(elem);
+        contains_expr->mutable_elements()->AddAllocated(t);
+    }
+    auto expr = test::GenExpr();
+    expr->set_allocated_json_contains_expr(contains_expr.release());
+
+    auto parser = ProtoParser(*this->schema_);
+    auto typed_expr = parser.ParseExprs(*expr);
+    auto parsed = std::make_shared<plan::FilterBitsNode>(
+        DEFAULT_PLANNODE_ID, typed_expr);
+
+    auto segpromote = dynamic_cast<SegmentSealedImpl*>(this->seg_.get());
+    query::ExecPlanNodeVisitor visitor(*segpromote, MAX_TIMESTAMP);
+    BitsetType final;
+    visitor.ExecuteExprNode(parsed, segpromote, this->N_, final);
+
+    std::unordered_set<TypeParam> elems(this->vec_of_array_[0].begin(),
+                                        this->vec_of_array_[0].end());
+    auto ref = [this, &elems](size_t offset) -> bool {
+        std::unordered_set<TypeParam> row(this->vec_of_array_[offset].begin(),
+                                          this->vec_of_array_[offset].end());
+        for (const auto& elem : elems) {
+            if (row.find(elem) != row.end()) {
+                return true;
+            }
+        }
+        return false;
+    };
+    ASSERT_EQ(final.size(), this->N_);
+    for (size_t i = 0; i < this->N_; i++) {
+        ASSERT_EQ(final[i], ref(i)) << "i: " << i << ", final[i]: " << final[i]
+                                    << ", ref(i): " << ref(i);
+    }
+}
+
+TYPED_TEST_P(ArrayInvertedIndexTest, ArrayContainsAll) {
+    const auto& meta = this->schema_->operator[](FieldName("array"));
+    auto column_info = test::GenColumnInfo(
+        meta.get_id().get(),
+        static_cast<proto::schema::DataType>(meta.get_data_type()),
+        false,
+        false,
+        static_cast<proto::schema::DataType>(meta.get_element_type()));
+    auto contains_expr = std::make_unique<proto::plan::JSONContainsExpr>();
+    contains_expr->set_allocated_column_info(column_info);
+    contains_expr->set_op(proto::plan::JSONContainsExpr_JSONOp::
+                              JSONContainsExpr_JSONOp_ContainsAll);
+    contains_expr->set_elements_same_type(true);
+    for (const auto& elem : this->vec_of_array_[0]) {
+        auto t = test::GenGenericValue(elem);
+        contains_expr->mutable_elements()->AddAllocated(t);
+    }
+    auto expr = test::GenExpr();
+    expr->set_allocated_json_contains_expr(contains_expr.release());
+
+    auto parser = ProtoParser(*this->schema_);
+    auto typed_expr = parser.ParseExprs(*expr);
+    auto parsed = std::make_shared<plan::FilterBitsNode>(
+        DEFAULT_PLANNODE_ID, typed_expr);
+
+    auto segpromote = dynamic_cast<SegmentSealedImpl*>(this->seg_.get());
+    query::ExecPlanNodeVisitor visitor(*segpromote, MAX_TIMESTAMP);
+    BitsetType final;
+    visitor.ExecuteExprNode(parsed, segpromote, this->N_, final);
+
+    std::unordered_set<TypeParam> elems(this->vec_of_array_[0].begin(),
+                                        this->vec_of_array_[0].end());
+    auto ref = [this, &elems](size_t offset) -> bool {
+        std::unordered_set<TypeParam> row(this->vec_of_array_[offset].begin(),
+                                          this->vec_of_array_[offset].end());
+        for (const auto& elem : elems) {
+            if (row.find(elem) == row.end()) {
+                return false;
+            }
+        }
+        return true;
+    };
+    ASSERT_EQ(final.size(), this->N_);
+    for (size_t i = 0; i < this->N_; i++) {
+        ASSERT_EQ(final[i], ref(i)) << "i: " << i << ", final[i]: " << final[i]
+                                    << ", ref(i): " << ref(i);
+    }
+}
+
+TYPED_TEST_P(ArrayInvertedIndexTest, ArrayEqual) {
+    if (std::is_floating_point_v<TypeParam>) {
+        GTEST_SKIP() << "not accurate to perform equal comparison on floating "
+                        "point number";
+    }
+
+    const auto& meta = this->schema_->operator[](FieldName("array"));
+    auto column_info = test::GenColumnInfo(
+        meta.get_id().get(),
+        static_cast<proto::schema::DataType>(meta.get_data_type()),
+        false,
+        false,
+        static_cast<proto::schema::DataType>(meta.get_element_type()));
+    auto unary_range_expr = std::make_unique<proto::plan::UnaryRangeExpr>();
+    unary_range_expr->set_allocated_column_info(column_info);
+    unary_range_expr->set_op(proto::plan::OpType::Equal);
+    auto arr = new proto::plan::GenericValue;
+    arr->mutable_array_val()->set_element_type(
+        static_cast<proto::schema::DataType>(meta.get_element_type()));
+    arr->mutable_array_val()->set_same_type(true);
+    for (const auto& elem : this->vec_of_array_[0]) {
+        auto e = test::GenGenericValue(elem);
+        arr->mutable_array_val()->mutable_array()->AddAllocated(e);
+    }
+    unary_range_expr->set_allocated_value(arr);
+    auto expr = test::GenExpr();
+    expr->set_allocated_unary_range_expr(unary_range_expr.release());
+
+    auto parser = ProtoParser(*this->schema_);
+    auto typed_expr = parser.ParseExprs(*expr);
+    auto parsed = std::make_shared<plan::FilterBitsNode>(
+        DEFAULT_PLANNODE_ID, typed_expr);
+
+    auto segpromote = dynamic_cast<SegmentSealedImpl*>(this->seg_.get());
+    query::ExecPlanNodeVisitor visitor(*segpromote, MAX_TIMESTAMP);
+    BitsetType final;
+    visitor.ExecuteExprNode(parsed, segpromote, this->N_, final);
+
+    auto ref = [this](size_t offset) -> bool {
+        if (this->vec_of_array_[0].size() !=
+            this->vec_of_array_[offset].size()) {
+            return false;
+        }
+        auto size = this->vec_of_array_[0].size();
+        for (size_t i = 0; i < size; i++) {
+            if (this->vec_of_array_[0][i] != this->vec_of_array_[offset][i]) {
+                return false;
+            }
+        }
+        return true;
+    };
+    ASSERT_EQ(final.size(), this->N_);
+    for (size_t i = 0; i < this->N_; i++) {
+        ASSERT_EQ(final[i], ref(i)) << "i: " << i << ", final[i]: " << final[i]
+                                    << ", ref(i): " << ref(i);
+    }
+}
+
+using ElementType = testing::Types<bool,
+                                   int8_t,
+                                   int16_t,
+                                   int32_t,
+                                   int64_t,
+                                   float,
+                                   double,
+                                   std::string>;
+
+REGISTER_TYPED_TEST_CASE_P(ArrayInvertedIndexTest,
+                           ArrayContainsAny,
+                           ArrayContainsAll,
+                           ArrayEqual);
+
+INSTANTIATE_TYPED_TEST_SUITE_P(Naive, ArrayInvertedIndexTest, ElementType);
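
The ArrayEqual reference lambda above mirrors the semantics of the is_same_array check used by the index path's post-filter: equal length and pairwise-equal elements, in order. As a free-standing sketch (illustrative; the helper name is hypothetical):

    #include <algorithm>
    #include <vector>

    template <typename T>
    bool
    IsSameArray(const std::vector<T>& a, const std::vector<T>& b) {
        return a.size() == b.size() &&
               std::equal(a.begin(), a.end(), b.begin());
    }
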
diff --git a/internal/core/unittest/test_inverted_index.cpp b/internal/core/unittest/test_inverted_index.cpp
index d01813ab94e6a..c8b9bf3663235 100644
--- a/internal/core/unittest/test_inverted_index.cpp
+++ b/internal/core/unittest/test_inverted_index.cpp
@@ -25,8 +25,6 @@
 
 using namespace milvus;
 
-// TODO: I would suggest that our all indexes use this test to simulate the real production environment.
-
 namespace milvus::test {
 
 auto
 gen_field_meta(int64_t collection_id = 1,
diff --git a/internal/core/unittest/test_utils/DataGen.h b/internal/core/unittest/test_utils/DataGen.h
index 37c3d6f27676d..3b69ed98e8ec0 100644
--- a/internal/core/unittest/test_utils/DataGen.h
+++ b/internal/core/unittest/test_utils/DataGen.h
@@ -480,8 +480,30 @@ inline GeneratedData DataGen(SchemaPtr schema,
                     }
                     break;
                 }
-                case DataType::INT8:
-                case DataType::INT16:
+                case DataType::INT8: {
+                    for (int i = 0; i < N / repeat_count; i++) {
+                        milvus::proto::schema::ScalarField field_data;
+
+                        for (int j = 0; j < array_len; j++) {
+                            field_data.mutable_int_data()->add_data(
+                                static_cast<int8_t>(random()));
+                        }
+                        data[i] = field_data;
+                    }
+                    break;
+                }
+                case DataType::INT16: {
+                    for (int i = 0; i < N / repeat_count; i++) {
+                        milvus::proto::schema::ScalarField field_data;
+
+                        for (int j = 0; j < array_len; j++) {
+                            field_data.mutable_int_data()->add_data(
+                                static_cast<int16_t>(random()));
+                        }
+                        data[i] = field_data;
+                    }
+                    break;
+                }
                 case DataType::INT32: {
                     for (int i = 0; i < N / repeat_count; i++) {
                         milvus::proto::schema::ScalarField field_data;
diff --git a/internal/core/unittest/test_utils/GenExprProto.h b/internal/core/unittest/test_utils/GenExprProto.h
index 171273b1fc7fd..77f0a4964e4bb 100644
--- a/internal/core/unittest/test_utils/GenExprProto.h
+++ b/internal/core/unittest/test_utils/GenExprProto.h
@@ -15,15 +15,18 @@
 namespace milvus::test {
 inline auto
-GenColumnInfo(int64_t field_id,
-              proto::schema::DataType field_type,
-              bool auto_id,
-              bool is_pk) {
+GenColumnInfo(
+    int64_t field_id,
+    proto::schema::DataType field_type,
+    bool auto_id,
+    bool is_pk,
+    proto::schema::DataType element_type = proto::schema::DataType::None) {
     auto column_info = new proto::plan::ColumnInfo();
     column_info->set_field_id(field_id);
     column_info->set_data_type(field_type);
     column_info->set_is_autoid(auto_id);
     column_info->set_is_primary_key(is_pk);
+    column_info->set_element_type(element_type);
     return column_info;
 }
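
With the extra defaulted parameter, existing scalar callers of GenColumnInfo compile unchanged, while array tests can state the element type explicitly. A usage sketch (the field id is arbitrary):

    auto* info = milvus::test::GenColumnInfo(
        /*field_id=*/101,
        proto::schema::DataType::Array,
        /*auto_id=*/false,
        /*is_pk=*/false,
        proto::schema::DataType::Int64);  // element type of the ARRAY column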