From f0095bc010ef88d1a6200508e3166891d939ca83 Mon Sep 17 00:00:00 2001 From: "Xianhui.Lin" Date: Fri, 20 Dec 2024 15:49:21 +0800 Subject: [PATCH] improve jsoninvert unitest Signed-off-by: Xianhui.Lin --- .../core/src/index/JsonKeyInvertedIndex.cpp | 2 - .../core/unittest/test_json_key_index.cpp | 400 +++++++++++------- 2 files changed, 239 insertions(+), 163 deletions(-) diff --git a/internal/core/src/index/JsonKeyInvertedIndex.cpp b/internal/core/src/index/JsonKeyInvertedIndex.cpp index f4b140fb366ff..cbf3d1af7cfb3 100644 --- a/internal/core/src/index/JsonKeyInvertedIndex.cpp +++ b/internal/core/src/index/JsonKeyInvertedIndex.cpp @@ -26,8 +26,6 @@ JsonKeyInvertedIndex::AddInvertedRecord(const std::vector& paths, uint16_t offset, uint16_t length) { auto key = std::string("/") + Join(paths, "."); - std::cout << "xxx insert inverted key" << key << "rowid" << row_id - << "offset" << offset << "length" << length << std::endl; LOG_DEBUG( "insert inverted key: {}, row_id: {}, offset: " "{}, length:{}", diff --git a/internal/core/unittest/test_json_key_index.cpp b/internal/core/unittest/test_json_key_index.cpp index c128c78909535..dc8089be1e042 100644 --- a/internal/core/unittest/test_json_key_index.cpp +++ b/internal/core/unittest/test_json_key_index.cpp @@ -24,82 +24,22 @@ #include "test_utils/indexbuilder_test_utils.h" #include "index/Meta.h" #include "index/JsonKeyInvertedIndex.h" - +#include "common/Json.h" +#include "common/Types.h" using namespace milvus::index; using namespace milvus::indexbuilder; using namespace milvus; using namespace milvus::index; -std::string -join(const std::vector& vec, const std::string& delimiter) { - std::ostringstream oss; - for (size_t i = 0; i < vec.size(); ++i) { - oss << vec[i]; - if (i != vec.size() - 1) { - oss << delimiter; - } - } - return oss.str(); -} - -// 1000 keys -static std::string -GenerateJson(int N) { - std::vector data(N); - std::default_random_engine er(42); - std::normal_distribution<> distr(0, 1); - std::vector keys; - for (int i = 0; i < N; i++) { - keys.push_back("keys" + std::to_string(i)); - } - std::string json_string; - std::vector values(N); - for (int i = 0; i < N; i++) { - if (i % 7 == 0) { - values[i] = std::to_string(er()); - } else if (i % 7 == 1) { - values[i] = std::to_string(static_cast(er())); - } else if (i % 7 == 2) { - values[i] = er() / 2 == 0 ? "true" : "false"; - } else if (i % 7 == 3) { - values[i] = "\"xxxx" + std::to_string(i) + "\""; - } else if (i % 7 == 4) { - std::vector intvec(10); - for (int j = 0; j < 10; j++) { - intvec[j] = std::to_string(i + j); - } - values[i] = "[" + join(intvec, ",") + "]"; - } else if (i % 7 == 5) { - std::vector doublevec(10); - for (int j = 0; j < 10; j++) { - doublevec[j] = - std::to_string(static_cast(i + j + er())); - } - values[i] = "[" + join(doublevec, ",") + "]"; - } else if (i % 7 == 6) { - std::vector stringvec(10); - for (int j = 0; j < 10; j++) { - stringvec[j] = "\"xxx" + std::to_string(j) + "\""; - } - values[i] = "[" + join(stringvec, ",") + "]"; - } - } - json_string += "{"; - for (int i = 0; i < N - 1; i++) { - json_string += R"(")" + keys[i] + R"(":)" + values[i] + R"(,)"; - } - json_string += R"(")" + keys[N - 1] + R"(":)" + values[N - 1]; - json_string += "}"; - return json_string; -} - -static std::vector -GenerateJsons(int size, int dim) { +static std::vector +GenerateJsons(int size) { std::vector jsons; - for (int i = 0; i < size; ++i) { - std::cout << GenerateJson(dim) << std::endl; - jsons.push_back( - milvus::Json(simdjson::padded_string(GenerateJson(dim)))); + for (int i = 0; i < size; i++) { + auto str = R"({"int":)" + std::to_string(random()) + R"(,"double":)" + + std::to_string(static_cast(random())) + + R"(,"string":")" + std::to_string(random()) + + R"(","bool": true)" + R"(, "array": [1,2,3])" + "}"; + jsons.push_back(milvus::Json(simdjson::padded_string(str))); } return jsons; } @@ -113,8 +53,7 @@ class JsonKeyIndexTest : public testing::Test { int64_t field_id, int64_t index_build_id, int64_t index_version, - int64_t size, - int64_t dim) { + int64_t size) { proto::schema::FieldSchema field_schema; field_schema.set_data_type(proto::schema::DataType::JSON); @@ -123,7 +62,7 @@ class JsonKeyIndexTest : public testing::Test { auto index_meta = storage::IndexMeta{ segment_id, field_id, index_build_id, index_version}; - data_ = std::move(GenerateJsons(size, dim)); + data_ = std::move(GenerateJsons(size)); auto field_data = storage::CreateFieldData(DataType::JSON); field_data->FillFieldData(data_.data(), data_.size()); storage::InsertData insert_data(field_data); @@ -177,8 +116,7 @@ class JsonKeyIndexTest : public testing::Test { int64_t field_id = 101; int64_t index_build_id = 1000; int64_t index_version = 10000; - size_ = 10; - dim_ = 10; + size_ = 10000; std::string root_path = "/tmp/test-jsonkey-index/"; storage::StorageConfig storage_config; @@ -192,8 +130,7 @@ class JsonKeyIndexTest : public testing::Test { field_id, index_build_id, index_version, - size_, - dim_); + size_); } virtual ~JsonKeyIndexTest() override { @@ -204,59 +141,42 @@ class JsonKeyIndexTest : public testing::Test { void TestTermInFunc() { { - std::vector> testcases{{"705894"}}; + struct Testcase { + std::vector term; + std::vector nested_path; + }; + std::vector testcases{ + {{1, 2, 3, 4}, {"int"}}, + {{10, 100, 1000, 10000}, {"int"}}, + {{100, 10000, 9999, 444}, {"int"}}, + {{23, 42, 66, 17, 25}, {"int"}}, + }; for (auto testcase : testcases) { - auto check = [&](std::string value) { - std::unordered_set term_set(testcase.begin(), - testcase.end()); + auto check = [&](int64_t value) { + std::unordered_set term_set(testcase.term.begin(), + testcase.term.end()); return term_set.find(value) != term_set.end(); }; - std::unordered_set term_set(testcase.begin(), - testcase.end()); + std::unordered_set term_set(testcase.term.begin(), + testcase.term.end()); auto filter_func = [&term_set, this](uint32_t row_id, uint16_t offset, uint16_t size) { - auto val = - this->data_[row_id].template at_pos( - offset, size); + auto val = this->data_[row_id].template at_pos( + offset, size); if (val.second != "") { return false; } - return term_set.find((std::string(val.first))) != + return term_set.find((int64_t(val.first))) != term_set.end(); }; - auto bitset = - index_->FilterByPath("/keys0", size_, filter_func); - + auto pointer = milvus::Json::pointer(testcase.nested_path); + auto bitset = index_->FilterByPath(pointer, size_, filter_func); ASSERT_EQ(bitset.size(), size_); for (int i = 0; i < bitset.size(); ++i) { + auto val = data_[i].template at(pointer).value(); auto ans = bitset[i]; - auto ref = check("705894"); - ASSERT_EQ(ans, ref); - } - } - } - { - std::vector testcases{"true"}; - for (auto& value : testcases) { - auto filter_func = [this, &value](uint32_t row_id, - uint16_t offset, - uint16_t size) { - auto val = - this->data_[row_id].template at_pos( - offset, size); - if (val.second != "") { - return false; - } - return std::string(val.first) == value; - }; - - auto bitset = - index_->FilterByPath("/keys2", size_, filter_func); - ASSERT_EQ(bitset.size(), size_); - for (int i = 0; i < bitset.size(); ++i) { - auto ans = bitset[i]; - auto ref = (value == "false"); + auto ref = check(val); ASSERT_EQ(ans, ref); } } @@ -264,7 +184,16 @@ class JsonKeyIndexTest : public testing::Test { } void TestUnaryRangeInFunc() { - std::vector testcases{"10", "705894", "805894"}; + struct Testcase { + int64_t val; + std::vector nested_path; + }; + std::vector testcases{ + {10, {"int"}}, + {20, {"int"}}, + {30, {"int"}}, + {40, {"int"}}, + }; std::vector ops{ OpType::Equal, OpType::NotEqual, @@ -274,40 +203,40 @@ class JsonKeyIndexTest : public testing::Test { OpType::LessEqual, }; for (const auto& testcase : testcases) { - auto check = [&](std::string value) { return value == testcase; }; - std::function f = check; + auto check = [&](int64_t value) { return value == testcase.val; }; + std::function f = check; for (auto& op : ops) { switch (op) { case OpType::Equal: { - f = [&](std::string value) { - return value == testcase; + f = [&](int64_t value) { + return value == testcase.val; }; break; } case OpType::NotEqual: { - f = [&](std::string value) { - return value != testcase; + f = [&](int64_t value) { + return value != testcase.val; }; break; } case OpType::GreaterEqual: { - f = [&](std::string value) { - return value >= testcase; + f = [&](int64_t value) { + return value >= testcase.val; }; break; } case OpType::GreaterThan: { - f = [&](std::string value) { return value > testcase; }; + f = [&](int64_t value) { return value > testcase.val; }; break; } case OpType::LessEqual: { - f = [&](std::string value) { - return value <= testcase; + f = [&](int64_t value) { + return value <= testcase.val; }; break; } case OpType::LessThan: { - f = [&](std::string value) { return value < testcase; }; + f = [&](int64_t value) { return value < testcase.val; }; break; } default: { @@ -318,35 +247,35 @@ class JsonKeyIndexTest : public testing::Test { auto filter_func = [&op, &testcase, this](uint32_t row_id, uint16_t offset, uint16_t size) { - auto val = - this->data_[row_id].template at_pos( - offset, size); + auto val = this->data_[row_id].template at_pos( + offset, size); if (val.second != "") { return false; } switch (op) { case OpType::GreaterThan: - return std::string(val.first) > testcase; + return int64_t(val.first) > testcase.val; case OpType::GreaterEqual: - return std::string(val.first) >= testcase; + return int64_t(val.first) >= testcase.val; case OpType::LessThan: - return std::string(val.first) < testcase; + return int64_t(val.first) < testcase.val; case OpType::LessEqual: - return std::string(val.first) <= testcase; + return int64_t(val.first) <= testcase.val; case OpType::Equal: - return std::string(val.first) == testcase; + return int64_t(val.first) == testcase.val; case OpType::NotEqual: - return std::string(val.first) != testcase; + return int64_t(val.first) != testcase.val; default: return false; } }; - auto bitset = - index_->FilterByPath("/keys0", size_, filter_func); + auto pointer = milvus::Json::pointer(testcase.nested_path); + auto bitset = index_->FilterByPath(pointer, size_, filter_func); ASSERT_EQ(bitset.size(), size_); for (int i = 0; i < bitset.size(); ++i) { auto ans = bitset[i]; - auto ref = f("705894"); + auto val = data_[i].template at(pointer).value(); + auto ref = f(val); ASSERT_EQ(ans, ref); } } @@ -358,17 +287,18 @@ class JsonKeyIndexTest : public testing::Test { struct Testcase { bool lower_inclusive; bool upper_inclusive; - std::string lower; - std::string upper; + int64_t lower; + int64_t upper; + std::vector nested_path; }; std::vector testcases{ - {true, false, "10", "20"}, - {true, true, "20", "30"}, - {false, true, "30", "40"}, - {false, false, "40", "50"}, + {true, false, 10, 20, {"int"}}, + {true, true, 20, 30, {"int"}}, + {false, true, 30, 40, {"int"}}, + {false, false, 40, 50, {"int"}}, }; for (const auto& testcase : testcases) { - auto check = [&](std::string value) { + auto check = [&](int64_t value) { if (testcase.lower_inclusive && testcase.upper_inclusive) { return testcase.lower <= value && value <= testcase.upper; } else if (testcase.lower_inclusive && @@ -386,43 +316,190 @@ class JsonKeyIndexTest : public testing::Test { uint16_t offset, uint16_t size) { auto val = - this->data_[row_id].template at_pos( - offset, size); + this->data_[row_id].template at_pos(offset, size); if (val.second != "") { return false; } if (testcase.lower_inclusive && testcase.upper_inclusive) { - return testcase.lower <= std::string(val.first) && - std::string(val.first) <= testcase.upper; + return testcase.lower <= int64_t(val.first) && + int64_t(val.first) <= testcase.upper; } else if (testcase.lower_inclusive && !testcase.upper_inclusive) { - return testcase.lower <= std::string(val.first) && - std::string(val.first) < testcase.upper; + return testcase.lower <= int64_t(val.first) && + int64_t(val.first) < testcase.upper; } else if (!testcase.lower_inclusive && testcase.upper_inclusive) { - return testcase.lower < std::string(val.first) && - std::string(val.first) <= testcase.upper; + return testcase.lower < int64_t(val.first) && + int64_t(val.first) <= testcase.upper; } else { - return testcase.lower < std::string(val.first) && - std::string(val.first) < testcase.upper; + return testcase.lower < int64_t(val.first) && + int64_t(val.first) < testcase.upper; } }; - auto bitset = index_->FilterByPath("/keys7", size_, filter_func); + auto pointer = milvus::Json::pointer(testcase.nested_path); + auto bitset = index_->FilterByPath(pointer, size_, filter_func); ASSERT_EQ(bitset.size(), size_); for (int i = 0; i < bitset.size(); ++i) { auto ans = bitset[i]; - auto ref = check("970724117"); + auto val = data_[i].template at(pointer).value(); + auto ref = check(val); ASSERT_EQ(ans, ref); } } } + void + TestExistInFunc() { + struct Testcase { + std::vector nested_path; + }; + std::vector testcases{ + {{"A"}}, + {{"int"}}, + {{"double"}}, + {{"B"}}, + }; + for (const auto& testcase : testcases) { + auto pointer = milvus::Json::pointer(testcase.nested_path); + auto filter_func = [&pointer, this](uint32_t row_id, + uint16_t offset, + uint16_t size) { + return this->data_[row_id].exist(pointer); + }; + + auto bitset = index_->FilterByPath(pointer, size_, filter_func); + ASSERT_EQ(bitset.size(), size_); + for (int i = 0; i < bitset.size(); ++i) { + auto ans = bitset[i]; + auto val = data_[i].exist(pointer); + ASSERT_EQ(ans, val); + } + } + } + + void + TestJsonContainsAllFunc() { + { + std::vector> testcases{ + {{1, 10}, {"int"}}, + {{10, 100}, {"int"}}, + {{100, 1000}, {"int"}}, + {{1000, 10}, {"int"}}, + {{2, 4, 6, 8, 10}, {"int"}}, + {{1, 2, 3, 4, 5}, {"int"}}, + }; + for (const auto& testcase : testcases) { + auto check = [&](const std::vector& values) { + for (auto const& e : testcase.term) { + if (std::find(values.begin(), values.end(), e) == + values.end()) { + return false; + } + } + return true; + }; + auto pointer = milvus::Json::pointer(testcase.nested_path); + std::unordered_set elements; + for (auto const& element : testcase.term) { + elements.insert(element); + } + auto filter_func = [&elements, this](uint32_t row_id, + uint16_t offset, + uint16_t size) { + auto array = this->data_[row_id].array_at(offset, size); + std::unordered_set tmp_elements(elements); + for (auto&& it : array) { + auto val = it.template get(); + if (val.error()) { + continue; + } + tmp_elements.erase(val.value()); + if (tmp_elements.size() == 0) { + return true; + } + } + return tmp_elements.empty(); + }; + + auto bitset = index_->FilterByPath(pointer, size_, filter_func); + ASSERT_EQ(bitset.size(), size_); + for (int i = 0; i < bitset.size(); ++i) { + auto ans = bitset[i]; + auto array = data_[i].array_at(pointer); + std::vector res; + for (const auto& element : array) { + res.push_back(element.template get()); + } + ASSERT_EQ(ans, check(res)); + } + } + } + + { + std::vector> bool_testcases{ + {{true, true}, {"bool"}}, {{false, false}, {"bool"}}}; + for (const auto& testcase : bool_testcases) { + auto check = [&](const std::vector& values) { + for (auto const& e : testcase.term) { + if (std::find(values.begin(), values.end(), e) == + values.end()) { + return false; + } + } + return true; + }; + auto pointer = milvus::Json::pointer(testcase.nested_path); + std::unordered_set elements; + for (auto const& element : testcase.term) { + elements.insert(element); + } + auto filter_func = [&elements, this](uint32_t row_id, + uint16_t offset, + uint16_t size) { + auto array = this->data_[row_id].array_at(offset, size); + std::unordered_set tmp_elements(elements); + for (auto&& it : array) { + auto val = it.template get(); + if (val.error()) { + continue; + } + tmp_elements.erase(val.value()); + if (tmp_elements.size() == 0) { + return true; + } + } + + return tmp_elements.empty(); + }; + + auto bitset = index_->FilterByPath(pointer, size_, filter_func); + ASSERT_EQ(bitset.size(), size_); + for (int i = 0; i < bitset.size(); ++i) { + auto ans = bitset[i]; + auto array = data_[i].array_at(pointer); + std::vector res; + for (const auto& element : array) { + res.push_back(element.template get()); + } + ASSERT_EQ(ans, check(res)); + } + } + } + } + + template + struct Testcase { + std::vector term; + std::vector nested_path; + bool res; + }; + public: std::shared_ptr index_; DataType type_; size_t size_; - size_t dim_; std::vector data_; + std::vector json_col; std::shared_ptr chunk_manager_; }; @@ -430,4 +507,5 @@ TEST_F(JsonKeyIndexTest, CountFuncTest) { TestTermInFunc(); TestUnaryRangeInFunc(); TestBinaryRangeInFunc(); + TestExistInFunc(); } \ No newline at end of file