From b9cb9ab380e7b702283e162e7567b7c6582b34b2 Mon Sep 17 00:00:00 2001 From: oh2024 Date: Wed, 5 Jun 2024 05:27:11 +0000 Subject: [PATCH 1/3] fix: handle repeated sort keys and sort as int --- hybridse/src/codegen/udf_ir_builder_test.cc | 21 +++++++++++ .../src/udf/default_defs/feature_zero_def.cc | 36 ++++++++++--------- 2 files changed, 41 insertions(+), 16 deletions(-) diff --git a/hybridse/src/codegen/udf_ir_builder_test.cc b/hybridse/src/codegen/udf_ir_builder_test.cc index 20d531fee2f..0e7a5d57e73 100644 --- a/hybridse/src/codegen/udf_ir_builder_test.cc +++ b/hybridse/src/codegen/udf_ir_builder_test.cc @@ -1199,6 +1199,27 @@ TEST_F(UdfIRBuilderTest, CustUdfs) { false); } +TEST_F(UdfIRBuilderTest, JsonArraySortAsInt) { + openmldb::base::StringRef json = R"([{"a": "6", "b": "2"}, {"a": "11", "b": "9"}])"; + CheckUdf("json_array_sort", "9,2", json, "a", "b", 10, + true); + CheckUdf("json_array_sort", "2,9", json, "a", "b", 10, + false); +} + +TEST_F(UdfIRBuilderTest, JsonArraySortRepeatedKey) { + openmldb::base::StringRef json = R"([{"a": "6", "b": "a"}, {"a": "11", "b": "aaa"}, {"a": "6", "b": "aa"}])"; + CheckUdf("json_array_sort", "aaa,aa,a", json, "a", "b", + 10, true); + CheckUdf("json_array_sort", "a,aa,aaa", json, "a", "b", + 10, false); +} + +TEST_F(UdfIRBuilderTest, JsonArraySortInvalidKey) { + openmldb::base::StringRef json = R"([{"a": "a", "b": "2"}, {"a": "11", "b": "9"}])"; + CheckUdf("json_array_sort", "", json, "a", "b", 10, + true); +} } } // namespace codegen diff --git a/hybridse/src/udf/default_defs/feature_zero_def.cc b/hybridse/src/udf/default_defs/feature_zero_def.cc index 743e4911f4a..81a97cecdc6 100644 --- a/hybridse/src/udf/default_defs/feature_zero_def.cc +++ b/hybridse/src/udf/default_defs/feature_zero_def.cc @@ -15,6 +15,8 @@ */ #include +#include +#include #include #include #include @@ -25,15 +27,14 @@ #include "boost/algorithm/string.hpp" #include "boost/algorithm/string/join.hpp" #include "boost/algorithm/string/regex.hpp" - #include "codec/list_iterator_codec.h" #include "codec/type_codec.h" +#include "simdjson.h" #include "udf/containers.h" #include "udf/default_udf_library.h" #include "udf/udf.h" #include "udf/udf_registry.h" #include "vm/jit_runtime.h" -#include "simdjson.h" using openmldb::base::Date; using hybridse::codec::ListRef; @@ -615,7 +616,7 @@ void json_array_sort(::openmldb::base::StringRef *json_array, std::string_view order_ref(order->data_, order->size_); std::string_view column_ref(column->data_, column->size_); - std::map container; + std::vector> container; for (auto ele : arr) { simdjson::ondemand::object obj; @@ -635,27 +636,30 @@ void json_array_sort(::openmldb::base::StringRef *json_array, continue; } - container.emplace(order_val, column_val); + int order_int; + auto [ptr_order, ec_order] = std::from_chars(order_val.data(), order_val.data() + order_val.size(), order_int); + if (ec_order != std::errc()) { + return; + } + container.emplace_back(order_int, column_val); } + std::sort(container.begin(), container.end(), [desc](const auto& a, const auto& b) { + if (a.first == b.first) { + return desc ? a.second > b.second : a.second < b.second; + } + return desc ? a.first > b.first : a.first < b.first; + }); + std::stringstream ss; uint32_t topn = static_cast(n); auto sz = container.size(); for (uint32_t i = 0; i < topn && i < sz; ++i) { - if (desc) { - auto it = std::next(container.crbegin(), i); - ss << it->second; - if (std::next(it, 1) != container.crend() && i + 1 < topn) { - ss << ","; - } - } else { - auto it = std::next(container.cbegin(), i); - ss << it->second; - if (std::next(it, 1) != container.cend() && i + 1 < topn) { - ss << ","; + ss << container[i].second; + if (i + 1 < topn && i + 1 < sz) { + ss << ","; } - } } auto str = ss.str(); From ec6f3e3358290e902785aeb838b8d34a3d1dded8 Mon Sep 17 00:00:00 2001 From: oh2024 Date: Wed, 5 Jun 2024 07:50:15 +0000 Subject: [PATCH 2/3] fix: case where sort column is not int --- hybridse/src/codegen/udf_ir_builder_test.cc | 6 ++++-- hybridse/src/udf/default_defs/feature_zero_def.cc | 2 +- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/hybridse/src/codegen/udf_ir_builder_test.cc b/hybridse/src/codegen/udf_ir_builder_test.cc index 0e7a5d57e73..2127ebea5be 100644 --- a/hybridse/src/codegen/udf_ir_builder_test.cc +++ b/hybridse/src/codegen/udf_ir_builder_test.cc @@ -1216,9 +1216,11 @@ TEST_F(UdfIRBuilderTest, JsonArraySortRepeatedKey) { } TEST_F(UdfIRBuilderTest, JsonArraySortInvalidKey) { - openmldb::base::StringRef json = R"([{"a": "a", "b": "2"}, {"a": "11", "b": "9"}])"; - CheckUdf("json_array_sort", "", json, "a", "b", 10, + openmldb::base::StringRef json = R"([{"a": "-1", "b": "1"}, {"a": "a", "b": "2"}, {"a": "11", "b": "9"}])"; + CheckUdf("json_array_sort", "9,2,1", json, "a", "b", 10, true); + CheckUdf("json_array_sort", "1,2,9", json, "a", "b", 10, + false); } } } // namespace codegen diff --git a/hybridse/src/udf/default_defs/feature_zero_def.cc b/hybridse/src/udf/default_defs/feature_zero_def.cc index 81a97cecdc6..fea9147206e 100644 --- a/hybridse/src/udf/default_defs/feature_zero_def.cc +++ b/hybridse/src/udf/default_defs/feature_zero_def.cc @@ -639,7 +639,7 @@ void json_array_sort(::openmldb::base::StringRef *json_array, int order_int; auto [ptr_order, ec_order] = std::from_chars(order_val.data(), order_val.data() + order_val.size(), order_int); if (ec_order != std::errc()) { - return; + order_int = 0; } container.emplace_back(order_int, column_val); } From 2f3f4be227b82b27dd3248b300efa07fad7ebf8f Mon Sep 17 00:00:00 2001 From: oh2024 Date: Wed, 5 Jun 2024 07:58:34 +0000 Subject: [PATCH 3/3] fix: use the largest possible integer type available on any given platform --- hybridse/src/udf/default_defs/feature_zero_def.cc | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/hybridse/src/udf/default_defs/feature_zero_def.cc b/hybridse/src/udf/default_defs/feature_zero_def.cc index fea9147206e..79f76af05bc 100644 --- a/hybridse/src/udf/default_defs/feature_zero_def.cc +++ b/hybridse/src/udf/default_defs/feature_zero_def.cc @@ -16,6 +16,7 @@ #include #include +#include #include #include #include @@ -616,7 +617,7 @@ void json_array_sort(::openmldb::base::StringRef *json_array, std::string_view order_ref(order->data_, order->size_); std::string_view column_ref(column->data_, column->size_); - std::vector> container; + std::vector> container; for (auto ele : arr) { simdjson::ondemand::object obj; @@ -636,7 +637,7 @@ void json_array_sort(::openmldb::base::StringRef *json_array, continue; } - int order_int; + std::intmax_t order_int; auto [ptr_order, ec_order] = std::from_chars(order_val.data(), order_val.data() + order_val.size(), order_int); if (ec_order != std::errc()) { order_int = 0;