diff --git a/.github/workflows/auto-cherry-pick.yml b/.github/workflows/auto-cherry-pick.yml new file mode 100644 index 00000000000000..1d59b52b6c9c56 --- /dev/null +++ b/.github/workflows/auto-cherry-pick.yml @@ -0,0 +1,50 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# +name: Auto Cherry-Pick to Branch + +on: + pull_request: + types: + - closed + branches: + - master + +jobs: + auto_cherry_pick: + runs-on: ubuntu-latest + if: ${{ contains(github.event.pull_request.labels.*.name, 'dev/3.0.x') && github.event.pull_request.merged == true }} + steps: + - name: Checkout repository + uses: actions/checkout@v3 + + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: '3.x' + + - name: Install dependencies + run: | + pip install PyGithub + + - name: Auto cherry-pick + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + REPO_NAME: ${{ github.repository }} + CONFLICT_LABEL: cherry-pick-conflict-in-3.0 + run: | + python tools/auto-pick-script.py ${{ github.event.pull_request.number }} branch-3.0 diff --git a/be/src/cloud/cloud_meta_mgr.cpp b/be/src/cloud/cloud_meta_mgr.cpp index 02497f6a044b91..e198017f17a0d3 100644 --- a/be/src/cloud/cloud_meta_mgr.cpp +++ b/be/src/cloud/cloud_meta_mgr.cpp @@ -39,6 +39,7 @@ #include "cloud/cloud_tablet.h" #include "cloud/config.h" #include "cloud/pb_convert.h" +#include "common/config.h" #include "common/logging.h" #include "common/status.h" #include "cpp/sync_point.h" @@ -410,6 +411,10 @@ Status CloudMetaMgr::sync_tablet_rowsets(CloudTablet* tablet, bool warmup_delta_ req.set_cumulative_point(tablet->cumulative_layer_point()); } req.set_end_version(-1); + // backend side use schema dict + if (config::variant_use_cloud_schema_dict) { + req.set_schema_op(GetRowsetRequest::RETURN_DICT); + } VLOG_DEBUG << "send GetRowsetRequest: " << req.ShortDebugString(); stub->get_rowset(&cntl, &req, &resp, nullptr); @@ -524,7 +529,8 @@ Status CloudMetaMgr::sync_tablet_rowsets(CloudTablet* tablet, bool warmup_delta_ existed_rowset->rowset_id().to_string() == cloud_rs_meta_pb.rowset_id_v2()) { continue; // Same rowset, skip it } - RowsetMetaPB meta_pb = cloud_rowset_meta_to_doris(cloud_rs_meta_pb); + RowsetMetaPB meta_pb = cloud_rowset_meta_to_doris( + cloud_rs_meta_pb, resp.has_schema_dict() ? 
&resp.schema_dict() : nullptr); auto rs_meta = std::make_shared(); rs_meta->init_from_pb(meta_pb); RowsetSharedPtr rowset; diff --git a/be/src/cloud/cloud_storage_engine.cpp b/be/src/cloud/cloud_storage_engine.cpp index 4f452656a6236b..5d7b445917aa20 100644 --- a/be/src/cloud/cloud_storage_engine.cpp +++ b/be/src/cloud/cloud_storage_engine.cpp @@ -558,14 +558,16 @@ std::vector CloudStorageEngine::_generate_cloud_compaction_task } else if (config::enable_parallel_cumu_compaction) { filter_out = [&tablet_preparing_cumu_compaction](CloudTablet* t) { return tablet_preparing_cumu_compaction.contains(t->tablet_id()) || - (t->tablet_state() != TABLET_RUNNING && t->alter_version() == -1); + (t->tablet_state() != TABLET_RUNNING && + (!config::enable_new_tablet_do_compaction || t->alter_version() == -1)); }; } else { filter_out = [&tablet_preparing_cumu_compaction, &submitted_cumu_compactions](CloudTablet* t) { return tablet_preparing_cumu_compaction.contains(t->tablet_id()) || submitted_cumu_compactions.contains(t->tablet_id()) || - (t->tablet_state() != TABLET_RUNNING && t->alter_version() == -1); + (t->tablet_state() != TABLET_RUNNING && + (!config::enable_new_tablet_do_compaction || t->alter_version() == -1)); }; } diff --git a/be/src/cloud/cloud_tablet.cpp b/be/src/cloud/cloud_tablet.cpp index 576f1da72622e4..86893dc38a240c 100644 --- a/be/src/cloud/cloud_tablet.cpp +++ b/be/src/cloud/cloud_tablet.cpp @@ -108,6 +108,36 @@ Status CloudTablet::capture_rs_readers(const Version& spec_version, return capture_rs_readers_unlocked(version_path, rs_splits); } +Status CloudTablet::merge_rowsets_schema() { + // Find the rowset with the max version + auto max_version_rowset = + std::max_element( + _rs_version_map.begin(), _rs_version_map.end(), + [](const auto& a, const auto& b) { + return !a.second->tablet_schema() + ? true + : (!b.second->tablet_schema() + ? 
false + : a.second->tablet_schema()->schema_version() < + b.second->tablet_schema() + ->schema_version()); + }) + ->second; + TabletSchemaSPtr max_version_schema = max_version_rowset->tablet_schema(); + // If the schema has variant columns, perform a merge to create a wide tablet schema + if (max_version_schema->num_variant_columns() > 0) { + std::vector schemas; + std::transform(_rs_version_map.begin(), _rs_version_map.end(), std::back_inserter(schemas), + [](const auto& rs_meta) { return rs_meta.second->tablet_schema(); }); + // Merge the collected schemas to obtain the least common schema + RETURN_IF_ERROR(vectorized::schema_util::get_least_common_schema(schemas, nullptr, + max_version_schema)); + VLOG_DEBUG << "dump schema: " << max_version_schema->dump_full_schema(); + _merged_tablet_schema = max_version_schema; + } + return Status::OK(); +} + // There are only two tablet_states RUNNING and NOT_READY in cloud mode // This function will erase the tablet from `CloudTabletMgr` when it can't find this tablet in MS. 
Status CloudTablet::sync_rowsets(int64_t query_version, bool warmup_delta_data) { @@ -133,6 +163,10 @@ Status CloudTablet::sync_rowsets(int64_t query_version, bool warmup_delta_data) if (st.is()) { clear_cache(); } + + // Merge all rowset schemas within a CloudTablet + RETURN_IF_ERROR(merge_rowsets_schema()); + return st; } @@ -188,16 +222,7 @@ Status CloudTablet::sync_if_not_running() { } TabletSchemaSPtr CloudTablet::merged_tablet_schema() const { - std::shared_lock rdlock(_meta_lock); - TabletSchemaSPtr target_schema; - std::vector schemas; - for (const auto& [_, rowset] : _rs_version_map) { - schemas.push_back(rowset->tablet_schema()); - } - // get the max version schema and merge all schema - static_cast( - vectorized::schema_util::get_least_common_schema(schemas, nullptr, target_schema)); - return target_schema; + return _merged_tablet_schema; } void CloudTablet::add_rowsets(std::vector to_add, bool version_overlap, diff --git a/be/src/cloud/cloud_tablet.h b/be/src/cloud/cloud_tablet.h index 2bd1ce475028ab..5f10211ef53388 100644 --- a/be/src/cloud/cloud_tablet.h +++ b/be/src/cloud/cloud_tablet.h @@ -208,6 +208,9 @@ class CloudTablet final : public BaseTablet { Status sync_if_not_running(); + // Merge all rowset schemas within a CloudTablet + Status merge_rowsets_schema(); + CloudStorageEngine& _engine; // this mutex MUST ONLY be used when sync meta @@ -246,6 +249,9 @@ class CloudTablet final : public BaseTablet { std::mutex _base_compaction_lock; std::mutex _cumulative_compaction_lock; mutable std::mutex _rowset_update_lock; + + // Schema will be merged from all rowsets when sync_rowsets + TabletSchemaSPtr _merged_tablet_schema; }; using CloudTabletSPtr = std::shared_ptr; diff --git a/be/src/cloud/injection_point_action.cpp b/be/src/cloud/injection_point_action.cpp index e0f88debf52f6f..bc6676313c1717 100644 --- a/be/src/cloud/injection_point_action.cpp +++ b/be/src/cloud/injection_point_action.cpp @@ -108,6 +108,15 @@ void register_suites() { 
sp->set_call_back("VOlapTableSink::close", [](auto&&) { std::this_thread::sleep_for(std::chrono::seconds(5)); }); }); + // curl be_ip:http_port/api/injection_point/apply_suite?name=test_ttl_lru_evict' + suite_map.emplace("test_ttl_lru_evict", [] { + auto* sp = SyncPoint::get_instance(); + sp->set_call_back("BlockFileCache::change_limit1", [](auto&& args) { + LOG(INFO) << "BlockFileCache::change_limit1"; + auto* limit = try_any_cast(args[0]); + *limit = 1; + }); + }); suite_map.emplace("test_file_segment_cache_corruption", [] { auto* sp = SyncPoint::get_instance(); sp->set_call_back("Segment::open:corruption", [](auto&& args) { diff --git a/be/src/cloud/pb_convert.cpp b/be/src/cloud/pb_convert.cpp index 550c08c5481d3a..c65d3208be4871 100644 --- a/be/src/cloud/pb_convert.cpp +++ b/be/src/cloud/pb_convert.cpp @@ -17,6 +17,7 @@ #include "cloud/pb_convert.h" +#include #include #include @@ -138,19 +139,54 @@ void doris_rowset_meta_to_cloud(RowsetMetaCloudPB* out, RowsetMetaPB&& in) { out->mutable_inverted_index_file_info()->Swap(in.mutable_inverted_index_file_info()); } -RowsetMetaPB cloud_rowset_meta_to_doris(const RowsetMetaCloudPB& in) { +static void fill_schema_with_dict(const RowsetMetaCloudPB& in, RowsetMetaPB* out, + const SchemaCloudDictionary& dict) { + std::unordered_map unique_id_map; + //init map + for (ColumnPB& column : *out->mutable_tablet_schema()->mutable_column()) { + unique_id_map[column.unique_id()] = &column; + } + // column info + for (size_t i = 0; i < in.schema_dict_key_list().column_dict_key_list_size(); ++i) { + int dict_key = in.schema_dict_key_list().column_dict_key_list(i); + const ColumnPB& dict_val = dict.column_dict().at(dict_key); + ColumnPB& to_add = *out->mutable_tablet_schema()->add_column(); + to_add = dict_val; + VLOG_DEBUG << "fill dict column " << dict_val.ShortDebugString(); + } + + // index info + for (size_t i = 0; i < in.schema_dict_key_list().index_info_dict_key_list_size(); ++i) { + int dict_key = 
in.schema_dict_key_list().index_info_dict_key_list(i); + const TabletIndexPB& dict_val = dict.index_dict().at(dict_key); + *out->mutable_tablet_schema()->add_index() = dict_val; + VLOG_DEBUG << "fill dict index " << dict_val.ShortDebugString(); + } + + // sparse column info + for (size_t i = 0; i < in.schema_dict_key_list().sparse_column_dict_key_list_size(); ++i) { + int dict_key = in.schema_dict_key_list().sparse_column_dict_key_list(i); + const ColumnPB& dict_val = dict.column_dict().at(dict_key); + *unique_id_map.at(dict_val.parent_unique_id())->add_sparse_columns() = dict_val; + VLOG_DEBUG << "fill dict sparse column" << dict_val.ShortDebugString(); + } +} + +RowsetMetaPB cloud_rowset_meta_to_doris(const RowsetMetaCloudPB& in, + const SchemaCloudDictionary* dict) { RowsetMetaPB out; - cloud_rowset_meta_to_doris(&out, in); + cloud_rowset_meta_to_doris(&out, in, dict); return out; } -RowsetMetaPB cloud_rowset_meta_to_doris(RowsetMetaCloudPB&& in) { +RowsetMetaPB cloud_rowset_meta_to_doris(RowsetMetaCloudPB&& in, const SchemaCloudDictionary* dict) { RowsetMetaPB out; - cloud_rowset_meta_to_doris(&out, std::move(in)); + cloud_rowset_meta_to_doris(&out, std::move(in), dict); return out; } -void cloud_rowset_meta_to_doris(RowsetMetaPB* out, const RowsetMetaCloudPB& in) { +void cloud_rowset_meta_to_doris(RowsetMetaPB* out, const RowsetMetaCloudPB& in, + const SchemaCloudDictionary* dict) { // ATTN: please keep the set order aligned with the definition of proto `TabletSchemaCloudPB`. 
out->set_rowset_id(in.rowset_id()); out->set_partition_id(in.partition_id()); @@ -185,6 +221,9 @@ void cloud_rowset_meta_to_doris(RowsetMetaPB* out, const RowsetMetaCloudPB& in) if (in.has_tablet_schema()) { cloud_tablet_schema_to_doris(out->mutable_tablet_schema(), in.tablet_schema()); } + if (dict != nullptr) { + fill_schema_with_dict(in, out, *dict); + } out->set_txn_expiration(in.txn_expiration()); out->set_segments_overlap_pb(in.segments_overlap_pb()); out->mutable_segments_file_size()->CopyFrom(in.segments_file_size()); @@ -198,7 +237,8 @@ void cloud_rowset_meta_to_doris(RowsetMetaPB* out, const RowsetMetaCloudPB& in) out->mutable_inverted_index_file_info()->CopyFrom(in.inverted_index_file_info()); } -void cloud_rowset_meta_to_doris(RowsetMetaPB* out, RowsetMetaCloudPB&& in) { +void cloud_rowset_meta_to_doris(RowsetMetaPB* out, RowsetMetaCloudPB&& in, + const SchemaCloudDictionary* dict) { // ATTN: please keep the set order aligned with the definition of proto `TabletSchemaCloudPB`. 
out->set_rowset_id(in.rowset_id()); out->set_partition_id(in.partition_id()); @@ -234,6 +274,9 @@ void cloud_rowset_meta_to_doris(RowsetMetaPB* out, RowsetMetaCloudPB&& in) { cloud_tablet_schema_to_doris(out->mutable_tablet_schema(), std::move(*in.mutable_tablet_schema())); } + if (dict != nullptr) { + fill_schema_with_dict(in, out, *dict); + } out->set_txn_expiration(in.txn_expiration()); out->set_segments_overlap_pb(in.segments_overlap_pb()); out->mutable_segments_file_size()->Swap(in.mutable_segments_file_size()); diff --git a/be/src/cloud/pb_convert.h b/be/src/cloud/pb_convert.h index 0cfa033f2930a0..31fe43adb11a6d 100644 --- a/be/src/cloud/pb_convert.h +++ b/be/src/cloud/pb_convert.h @@ -24,10 +24,14 @@ RowsetMetaCloudPB doris_rowset_meta_to_cloud(const RowsetMetaPB&); RowsetMetaCloudPB doris_rowset_meta_to_cloud(RowsetMetaPB&&); void doris_rowset_meta_to_cloud(RowsetMetaCloudPB* out, const RowsetMetaPB& in); void doris_rowset_meta_to_cloud(RowsetMetaCloudPB* out, RowsetMetaPB&& in); -RowsetMetaPB cloud_rowset_meta_to_doris(const RowsetMetaCloudPB&); -RowsetMetaPB cloud_rowset_meta_to_doris(RowsetMetaCloudPB&&); -void cloud_rowset_meta_to_doris(RowsetMetaPB* out, const RowsetMetaCloudPB& in); -void cloud_rowset_meta_to_doris(RowsetMetaPB* out, RowsetMetaCloudPB&& in); +RowsetMetaPB cloud_rowset_meta_to_doris(const RowsetMetaCloudPB&, + const SchemaCloudDictionary* dict = nullptr); +RowsetMetaPB cloud_rowset_meta_to_doris(RowsetMetaCloudPB&&, + const SchemaCloudDictionary* dict = nullptr); +void cloud_rowset_meta_to_doris(RowsetMetaPB* out, const RowsetMetaCloudPB& in, + const SchemaCloudDictionary* dict = nullptr); +void cloud_rowset_meta_to_doris(RowsetMetaPB* out, RowsetMetaCloudPB&& in, + const SchemaCloudDictionary* dict = nullptr); // TabletSchemaPB <=> TabletSchemaCloudPB TabletSchemaCloudPB doris_tablet_schema_to_cloud(const TabletSchemaPB&); diff --git a/be/src/clucene b/be/src/clucene index 5e9566ab364d71..7cf6cf410d41d9 160000 --- a/be/src/clucene 
+++ b/be/src/clucene @@ -1 +1 @@ -Subproject commit 5e9566ab364d71b64c436ee46e5c848eed0ab7f7 +Subproject commit 7cf6cf410d41d95456edba263cc55b7b6f5ab027 diff --git a/be/src/common/config.cpp b/be/src/common/config.cpp index b077deac04f700..48d4565c1d3407 100644 --- a/be/src/common/config.cpp +++ b/be/src/common/config.cpp @@ -1003,6 +1003,8 @@ DEFINE_Int32(pipeline_executor_size, "0"); DEFINE_Bool(enable_workload_group_for_scan, "false"); DEFINE_mInt64(workload_group_scan_task_wait_timeout_ms, "10000"); +// Whether use schema dict in backend side instead of MetaService side(cloud mode) +DEFINE_mBool(variant_use_cloud_schema_dict, "true"); DEFINE_mDouble(variant_ratio_of_defaults_as_sparse_column, "1"); DEFINE_mInt64(variant_threshold_rows_to_estimate_sparse_column, "2048"); DEFINE_mBool(variant_throw_exeception_on_invalid_json, "false"); @@ -1030,7 +1032,7 @@ DEFINE_Bool(enable_file_cache_query_limit, "false"); DEFINE_mInt32(file_cache_enter_disk_resource_limit_mode_percent, "90"); DEFINE_mInt32(file_cache_exit_disk_resource_limit_mode_percent, "80"); DEFINE_mBool(enable_read_cache_file_directly, "false"); -DEFINE_mBool(file_cache_enable_evict_from_other_queue_by_size, "false"); +DEFINE_mBool(file_cache_enable_evict_from_other_queue_by_size, "true"); DEFINE_mInt64(file_cache_ttl_valid_check_interval_second, "0"); // zero for not checking // If true, evict the ttl cache using LRU when full. // Otherwise, only expiration can evict ttl and new data won't add to cache when full. 
@@ -1048,7 +1050,7 @@ DEFINE_String(inverted_index_searcher_cache_limit, "10%"); // set `true` to enable insert searcher into cache when write inverted index data DEFINE_Bool(enable_write_index_searcher_cache, "true"); DEFINE_Bool(enable_inverted_index_cache_check_timestamp, "true"); -DEFINE_Int32(inverted_index_fd_number_limit_percent, "40"); // 40% +DEFINE_Int32(inverted_index_fd_number_limit_percent, "20"); // 20% DEFINE_Int32(inverted_index_query_cache_shards, "256"); // inverted index match bitmap cache size @@ -1097,9 +1099,9 @@ DEFINE_mInt32(schema_cache_sweep_time_sec, "100"); // max number of segment cache, default -1 for backward compatibility fd_number*2/5 DEFINE_Int32(segment_cache_capacity, "-1"); -DEFINE_Int32(segment_cache_fd_percentage, "40"); -DEFINE_mInt32(estimated_mem_per_column_reader, "1024"); -DEFINE_Int32(segment_cache_memory_percentage, "2"); +DEFINE_Int32(segment_cache_fd_percentage, "20"); +DEFINE_mInt32(estimated_mem_per_column_reader, "512"); +DEFINE_Int32(segment_cache_memory_percentage, "5"); // enable feature binlog, default false DEFINE_Bool(enable_feature_binlog, "false"); @@ -1313,7 +1315,7 @@ DEFINE_Int64(num_s3_file_upload_thread_pool_min_thread, "16"); // The max thread num for S3FileUploadThreadPool DEFINE_Int64(num_s3_file_upload_thread_pool_max_thread, "64"); // The max ratio for ttl cache's size -DEFINE_mInt64(max_ttl_cache_ratio, "90"); +DEFINE_mInt64(max_ttl_cache_ratio, "50"); // The maximum jvm heap usage ratio for hdfs write workload DEFINE_mDouble(max_hdfs_wirter_jni_heap_usage_ratio, "0.5"); // The sleep milliseconds duration when hdfs write exceeds the maximum usage @@ -1374,6 +1376,8 @@ DEFINE_mInt32(lz4_compression_block_size, "262144"); DEFINE_mBool(enable_pipeline_task_leakage_detect, "false"); +DEFINE_Int32(query_cache_size, "512"); + // clang-format off #ifdef BE_TEST // test s3 diff --git a/be/src/common/config.h b/be/src/common/config.h index 734d73f46d8bb1..27e697b0c800f1 100644 --- a/be/src/common/config.h 
+++ b/be/src/common/config.h @@ -1199,6 +1199,7 @@ DECLARE_mInt64(LZ4_HC_compression_level); // Threshold of a column as sparse column // Notice: TEST ONLY DECLARE_mDouble(variant_ratio_of_defaults_as_sparse_column); +DECLARE_mBool(variant_use_cloud_schema_dict); // Threshold to estimate a column is sparsed // Notice: TEST ONLY DECLARE_mInt64(variant_threshold_rows_to_estimate_sparse_column); @@ -1454,6 +1455,9 @@ DECLARE_mInt32(lz4_compression_block_size); DECLARE_mBool(enable_pipeline_task_leakage_detect); +// MB +DECLARE_Int32(query_cache_size); + #ifdef BE_TEST // test s3 DECLARE_String(test_s3_resource); diff --git a/be/src/exec/tablet_info.cpp b/be/src/exec/tablet_info.cpp index 44846ded868e8f..a544e59c5b971f 100644 --- a/be/src/exec/tablet_info.cpp +++ b/be/src/exec/tablet_info.cpp @@ -29,6 +29,7 @@ #include #include #include +#include #include #include "common/exception.h" @@ -137,7 +138,8 @@ Status OlapTableSchemaParam::init(const POlapTableSchemaParam& pschema) { for (const auto& col : pschema.partial_update_input_columns()) { _partial_update_input_columns.insert(col); } - std::unordered_map, SlotDescriptor*> slots_map; + std::unordered_map slots_map; + _tuple_desc = _obj_pool.add(new TupleDescriptor(pschema.tuple_desc())); for (const auto& p_slot_desc : pschema.slot_descs()) { @@ -145,8 +147,10 @@ Status OlapTableSchemaParam::init(const POlapTableSchemaParam& pschema) { _tuple_desc->add_slot(slot_desc); string data_type; EnumToString(TPrimitiveType, to_thrift(slot_desc->col_type()), data_type); - slots_map.emplace(std::make_pair(to_lower(slot_desc->col_name()), - TabletColumn::get_field_type_by_string(data_type)), + std::string is_null_str = slot_desc->is_nullable() ? 
"true" : "false"; + std::string data_type_str = + std::to_string(int64_t(TabletColumn::get_field_type_by_string(data_type))); + slots_map.emplace(to_lower(slot_desc->col_name()) + "+" + data_type_str + is_null_str, slot_desc); } @@ -157,9 +161,11 @@ Status OlapTableSchemaParam::init(const POlapTableSchemaParam& pschema) { for (const auto& pcolumn_desc : p_index.columns_desc()) { if (!_is_partial_update || _partial_update_input_columns.contains(pcolumn_desc.name())) { - auto it = slots_map.find(std::make_pair( - to_lower(pcolumn_desc.name()), - TabletColumn::get_field_type_by_string(pcolumn_desc.type()))); + std::string is_null_str = pcolumn_desc.is_nullable() ? "true" : "false"; + std::string data_type_str = std::to_string( + int64_t(TabletColumn::get_field_type_by_string(pcolumn_desc.type()))); + auto it = slots_map.find(to_lower(pcolumn_desc.name()) + "+" + data_type_str + + is_null_str); if (it == std::end(slots_map)) { return Status::InternalError("unknown index column, column={}, type={}", pcolumn_desc.name(), pcolumn_desc.type()); @@ -206,12 +212,14 @@ Status OlapTableSchemaParam::init(const TOlapTableSchemaParam& tschema) { for (const auto& tcolumn : tschema.partial_update_input_columns) { _partial_update_input_columns.insert(tcolumn); } - std::unordered_map, SlotDescriptor*> slots_map; + std::unordered_map slots_map; _tuple_desc = _obj_pool.add(new TupleDescriptor(tschema.tuple_desc)); for (const auto& t_slot_desc : tschema.slot_descs) { auto* slot_desc = _obj_pool.add(new SlotDescriptor(t_slot_desc)); _tuple_desc->add_slot(slot_desc); - slots_map.emplace(std::make_pair(to_lower(slot_desc->col_name()), slot_desc->col_type()), + std::string is_null_str = slot_desc->is_nullable() ? 
"true" : "false"; + std::string data_type_str = std::to_string(int64_t(slot_desc->col_type())); + slots_map.emplace(to_lower(slot_desc->col_name()) + "+" + data_type_str + is_null_str, slot_desc); } @@ -223,9 +231,11 @@ Status OlapTableSchemaParam::init(const TOlapTableSchemaParam& tschema) { for (const auto& tcolumn_desc : t_index.columns_desc) { if (!_is_partial_update || _partial_update_input_columns.contains(tcolumn_desc.column_name)) { - auto it = slots_map.find( - std::make_pair(to_lower(tcolumn_desc.column_name), - thrift_to_type(tcolumn_desc.column_type.type))); + std::string is_null_str = tcolumn_desc.is_allow_null ? "true" : "false"; + std::string data_type_str = + std::to_string(int64_t(thrift_to_type(tcolumn_desc.column_type.type))); + auto it = slots_map.find(to_lower(tcolumn_desc.column_name) + "+" + data_type_str + + is_null_str); if (it == slots_map.end()) { return Status::InternalError("unknown index column, column={}, type={}", tcolumn_desc.column_name, diff --git a/be/src/exprs/runtime_filter_slots.h b/be/src/exprs/runtime_filter_slots.h index d330d327149fd2..ca012dc55b3bf2 100644 --- a/be/src/exprs/runtime_filter_slots.h +++ b/be/src/exprs/runtime_filter_slots.h @@ -77,6 +77,10 @@ class VRuntimeFilterSlots { if (filter->get_real_type() != RuntimeFilterType::IN_FILTER) { continue; } + if (!filter->need_sync_filter_size() && + filter->type() == RuntimeFilterType::IN_OR_BLOOM_FILTER) { + continue; + } if (has_in_filter.contains(filter->expr_order())) { filter->set_ignored(); continue; @@ -84,7 +88,7 @@ class VRuntimeFilterSlots { has_in_filter.insert(filter->expr_order()); } - // process ignore filter when it has IN_FILTER on same expr, and init bloom filter size + // process ignore filter when it has IN_FILTER on same expr for (auto filter : _runtime_filters) { if (filter->get_ignored()) { continue; diff --git a/be/src/io/cache/block_file_cache.cpp b/be/src/io/cache/block_file_cache.cpp index 321d9d73d89ed9..1a840e2dc6f9a3 100644 --- 
a/be/src/io/cache/block_file_cache.cpp +++ b/be/src/io/cache/block_file_cache.cpp @@ -21,6 +21,7 @@ #include "io/cache/block_file_cache.h" #include "common/status.h" +#include "cpp/sync_point.h" #if defined(__APPLE__) #include @@ -83,6 +84,94 @@ BlockFileCache::BlockFileCache(const std::string& cache_base_path, _total_evict_size_metrics = std::make_shared>( _cache_base_path.c_str(), "file_cache_total_evict_size"); + _evict_by_heat_metrics_matrix[FileCacheType::DISPOSABLE][FileCacheType::NORMAL] = + std::make_shared>(_cache_base_path.c_str(), + "file_cache_evict_by_heat_disposable_to_normal"); + _evict_by_heat_metrics_matrix[FileCacheType::DISPOSABLE][FileCacheType::INDEX] = + std::make_shared>(_cache_base_path.c_str(), + "file_cache_evict_by_heat_disposable_to_index"); + _evict_by_heat_metrics_matrix[FileCacheType::DISPOSABLE][FileCacheType::TTL] = + std::make_shared>(_cache_base_path.c_str(), + "file_cache_evict_by_heat_disposable_to_ttl"); + _evict_by_heat_metrics_matrix[FileCacheType::NORMAL][FileCacheType::DISPOSABLE] = + std::make_shared>(_cache_base_path.c_str(), + "file_cache_evict_by_heat_normal_to_disposable"); + _evict_by_heat_metrics_matrix[FileCacheType::NORMAL][FileCacheType::INDEX] = + std::make_shared>(_cache_base_path.c_str(), + "file_cache_evict_by_heat_normal_to_index"); + _evict_by_heat_metrics_matrix[FileCacheType::NORMAL][FileCacheType::TTL] = + std::make_shared>(_cache_base_path.c_str(), + "file_cache_evict_by_heat_normal_to_ttl"); + _evict_by_heat_metrics_matrix[FileCacheType::INDEX][FileCacheType::DISPOSABLE] = + std::make_shared>(_cache_base_path.c_str(), + "file_cache_evict_by_heat_index_to_disposable"); + _evict_by_heat_metrics_matrix[FileCacheType::INDEX][FileCacheType::NORMAL] = + std::make_shared>(_cache_base_path.c_str(), + "file_cache_evict_by_heat_index_to_normal"); + _evict_by_heat_metrics_matrix[FileCacheType::INDEX][FileCacheType::TTL] = + std::make_shared>(_cache_base_path.c_str(), + "file_cache_evict_by_heat_index_to_ttl"); + 
_evict_by_heat_metrics_matrix[FileCacheType::TTL][FileCacheType::DISPOSABLE] = + std::make_shared>(_cache_base_path.c_str(), + "file_cache_evict_by_heat_ttl_to_disposable"); + _evict_by_heat_metrics_matrix[FileCacheType::TTL][FileCacheType::NORMAL] = + std::make_shared>(_cache_base_path.c_str(), + "file_cache_evict_by_heat_ttl_to_normal"); + _evict_by_heat_metrics_matrix[FileCacheType::TTL][FileCacheType::INDEX] = + std::make_shared>(_cache_base_path.c_str(), + "file_cache_evict_by_heat_ttl_to_index"); + + _evict_by_self_lru_metrics_matrix[FileCacheType::DISPOSABLE] = + std::make_shared>(_cache_base_path.c_str(), + "file_cache_evict_by_self_lru_disposable"); + _evict_by_self_lru_metrics_matrix[FileCacheType::NORMAL] = + std::make_shared>(_cache_base_path.c_str(), + "file_cache_evict_by_self_lru_normal"); + _evict_by_self_lru_metrics_matrix[FileCacheType::INDEX] = std::make_shared>( + _cache_base_path.c_str(), "file_cache_evict_by_self_lru_index"); + _evict_by_self_lru_metrics_matrix[FileCacheType::TTL] = std::make_shared>( + _cache_base_path.c_str(), "file_cache_evict_by_self_lru_ttl"); + + _evict_by_size_metrics_matrix[FileCacheType::DISPOSABLE][FileCacheType::NORMAL] = + std::make_shared>(_cache_base_path.c_str(), + "file_cache_evict_by_size_disposable_to_normal"); + _evict_by_size_metrics_matrix[FileCacheType::DISPOSABLE][FileCacheType::INDEX] = + std::make_shared>(_cache_base_path.c_str(), + "file_cache_evict_by_size_disposable_to_index"); + _evict_by_size_metrics_matrix[FileCacheType::DISPOSABLE][FileCacheType::TTL] = + std::make_shared>(_cache_base_path.c_str(), + "file_cache_evict_by_size_disposable_to_ttl"); + _evict_by_size_metrics_matrix[FileCacheType::NORMAL][FileCacheType::DISPOSABLE] = + std::make_shared>(_cache_base_path.c_str(), + "file_cache_evict_by_size_normal_to_disposable"); + _evict_by_size_metrics_matrix[FileCacheType::NORMAL][FileCacheType::INDEX] = + std::make_shared>(_cache_base_path.c_str(), + "file_cache_evict_by_size_normal_to_index"); + 
_evict_by_size_metrics_matrix[FileCacheType::NORMAL][FileCacheType::TTL] = + std::make_shared>(_cache_base_path.c_str(), + "file_cache_evict_by_size_normal_to_ttl"); + _evict_by_size_metrics_matrix[FileCacheType::INDEX][FileCacheType::DISPOSABLE] = + std::make_shared>(_cache_base_path.c_str(), + "file_cache_evict_by_size_index_to_disposable"); + _evict_by_size_metrics_matrix[FileCacheType::INDEX][FileCacheType::NORMAL] = + std::make_shared>(_cache_base_path.c_str(), + "file_cache_evict_by_size_index_to_normal"); + _evict_by_size_metrics_matrix[FileCacheType::INDEX][FileCacheType::TTL] = + std::make_shared>(_cache_base_path.c_str(), + "file_cache_evict_by_size_index_to_ttl"); + _evict_by_size_metrics_matrix[FileCacheType::TTL][FileCacheType::DISPOSABLE] = + std::make_shared>(_cache_base_path.c_str(), + "file_cache_evict_by_size_ttl_to_disposable"); + _evict_by_size_metrics_matrix[FileCacheType::TTL][FileCacheType::NORMAL] = + std::make_shared>(_cache_base_path.c_str(), + "file_cache_evict_by_size_ttl_to_normal"); + _evict_by_size_metrics_matrix[FileCacheType::TTL][FileCacheType::INDEX] = + std::make_shared>(_cache_base_path.c_str(), + "file_cache_evict_by_size_ttl_to_index"); + + _evict_by_try_release = std::make_shared>( + _cache_base_path.c_str(), "file_cache_evict_by_try_release"); + _num_read_blocks = std::make_shared>(_cache_base_path.c_str(), "file_cache_num_read_blocks"); _num_hit_blocks = std::make_shared>(_cache_base_path.c_str(), @@ -106,6 +195,8 @@ BlockFileCache::BlockFileCache(const std::string& cache_base_path, "file_cache_hit_ratio_5m", 0.0); _hit_ratio_1h = std::make_shared>(_cache_base_path.c_str(), "file_cache_hit_ratio_1h", 0.0); + _disk_limit_mode_metrics = + std::make_shared>(_cache_base_path.c_str(), "disk_limit_mode", 0); _disposable_queue = LRUQueue(cache_settings.disposable_queue_size, cache_settings.disposable_queue_elements, 60 * 60); @@ -113,7 +204,7 @@ BlockFileCache::BlockFileCache(const std::string& cache_base_path, 7 * 24 * 60 * 60); 
_normal_queue = LRUQueue(cache_settings.query_queue_size, cache_settings.query_queue_elements, 24 * 60 * 60); - _ttl_queue = LRUQueue(std::numeric_limits::max(), std::numeric_limits::max(), + _ttl_queue = LRUQueue(cache_settings.ttl_queue_size, cache_settings.ttl_queue_elements, std::numeric_limits::max()); if (cache_settings.storage == "memory") { @@ -243,15 +334,12 @@ void BlockFileCache::use_cell(const FileBlockCell& cell, FileBlocks* result, boo result->push_back(cell.file_block); } - if (cell.file_block->cache_type() != FileCacheType::TTL || - config::enable_ttl_cache_evict_using_lru) { - auto& queue = get_queue(cell.file_block->cache_type()); - DCHECK(cell.queue_iterator) << "impossible"; - /// Move to the end of the queue. The iterator remains valid. - if (move_iter_flag) { - queue.move_to_end(*cell.queue_iterator, cache_lock); - } + auto& queue = get_queue(cell.file_block->cache_type()); + /// Move to the end of the queue. The iterator remains valid. + if (cell.queue_iterator && move_iter_flag) { + queue.move_to_end(*cell.queue_iterator, cache_lock); } + cell.update_atime(); cell.is_deleted = false; } @@ -316,14 +404,10 @@ FileBlocks BlockFileCache::get_impl(const UInt128Wrapper& hash, const CacheConte if (st.ok()) { auto& queue = get_queue(origin_type); queue.remove(cell.queue_iterator.value(), cache_lock); - if (config::enable_ttl_cache_evict_using_lru) { - auto& ttl_queue = get_queue(FileCacheType::TTL); - cell.queue_iterator = ttl_queue.add( - cell.file_block->get_hash_value(), cell.file_block->offset(), - cell.file_block->range().size(), cache_lock); - } else { - cell.queue_iterator.reset(); - } + auto& ttl_queue = get_queue(FileCacheType::TTL); + cell.queue_iterator = + ttl_queue.add(cell.file_block->get_hash_value(), cell.file_block->offset(), + cell.file_block->range().size(), cache_lock); } else { LOG_WARNING("Failed to change key meta").error(st); } @@ -357,7 +441,7 @@ FileBlocks BlockFileCache::get_impl(const UInt128Wrapper& hash, const 
CacheConte auto st = cell.file_block->change_cache_type_between_ttl_and_others( FileCacheType::NORMAL); if (st.ok()) { - if (config::enable_ttl_cache_evict_using_lru) { + if (cell.queue_iterator) { auto& ttl_queue = get_queue(FileCacheType::TTL); ttl_queue.remove(cell.queue_iterator.value(), cache_lock); } @@ -733,11 +817,10 @@ BlockFileCache::FileBlockCell* BlockFileCache::add_cell(const UInt128Wrapper& ha << " cache_type=" << cache_type_to_string(context.cache_type) << " error=" << st.msg(); } - if (cell.file_block->cache_type() != FileCacheType::TTL || - config::enable_ttl_cache_evict_using_lru) { - auto& queue = get_queue(cell.file_block->cache_type()); - cell.queue_iterator = queue.add(hash, offset, size, cache_lock); - } + + auto& queue = get_queue(cell.file_block->cache_type()); + cell.queue_iterator = queue.add(hash, offset, size, cache_lock); + if (cell.file_block->cache_type() == FileCacheType::TTL) { if (_key_to_time.find(hash) == _key_to_time.end()) { _key_to_time[hash] = context.expiration_time; @@ -751,7 +834,7 @@ BlockFileCache::FileBlockCell* BlockFileCache::add_cell(const UInt128Wrapper& ha } size_t BlockFileCache::try_release() { - std::lock_guard l(_mutex); + std::lock_guard cache_lock(_mutex); std::vector trash; for (auto& [hash, blocks] : _files) { for (auto& [offset, cell] : blocks) { @@ -760,11 +843,14 @@ size_t BlockFileCache::try_release() { } } } + size_t remove_size = 0; for (auto& cell : trash) { FileBlockSPtr file_block = cell->file_block; std::lock_guard lc(cell->file_block->_mutex); - remove(file_block, l, lc); + remove_size += file_block->range().size(); + remove(file_block, cache_lock, lc); } + *_evict_by_try_release << remove_size; LOG(INFO) << "Released " << trash.size() << " blocks in file cache " << _cache_base_path; return trash.size(); } @@ -843,9 +929,10 @@ void BlockFileCache::remove_file_blocks_and_clean_time_maps( void BlockFileCache::find_evict_candidates(LRUQueue& queue, size_t size, size_t cur_cache_size, size_t& 
removed_size, std::vector& to_evict, - std::lock_guard& cache_lock, bool is_ttl) { + std::lock_guard& cache_lock, + size_t& cur_removed_size) { for (const auto& [entry_key, entry_offset, entry_size] : queue) { - if (!is_overflow(removed_size, size, cur_cache_size, is_ttl)) { + if (!is_overflow(removed_size, size, cur_cache_size)) { break; } auto* cell = get_cell(entry_key, entry_offset, cache_lock); @@ -863,6 +950,7 @@ void BlockFileCache::find_evict_candidates(LRUQueue& queue, size_t size, size_t DCHECK(file_block->_download_state == FileBlock::State::DOWNLOADED); to_evict.push_back(cell); removed_size += cell_size; + cur_removed_size += cell_size; } } } @@ -872,6 +960,9 @@ bool BlockFileCache::try_reserve_for_ttl_without_lru(size_t size, size_t removed_size = 0; size_t cur_cache_size = _cur_cache_size; auto limit = config::max_ttl_cache_ratio * _capacity; + + TEST_INJECTION_POINT_CALLBACK("BlockFileCache::change_limit1", &limit); + if ((_cur_ttl_size + size) * 100 > limit) { return false; } @@ -885,8 +976,9 @@ bool BlockFileCache::try_reserve_for_ttl_without_lru(size_t size, } std::vector to_evict; auto collect_eliminate_fragments = [&](LRUQueue& queue) { + size_t cur_removed_size = 0; find_evict_candidates(queue, size, cur_cache_size, removed_size, to_evict, cache_lock, - false); + cur_removed_size); }; if (disposable_queue_size != 0) { collect_eliminate_fragments(get_queue(FileCacheType::DISPOSABLE)); @@ -913,8 +1005,9 @@ bool BlockFileCache::try_reserve_for_ttl(size_t size, std::lock_guard to_evict; + size_t cur_removed_size = 0; find_evict_candidates(queue, size, cur_cache_size, removed_size, to_evict, cache_lock, - true); + cur_removed_size); remove_file_blocks_and_clean_time_maps(to_evict, cache_lock); return !is_overflow(removed_size, size, cur_cache_size); @@ -947,10 +1040,6 @@ bool BlockFileCache::try_reserve(const UInt128Wrapper& hash, const CacheContext& size = 5 * size; } - if (context.cache_type == FileCacheType::TTL) { - return 
try_reserve_for_ttl(size, cache_lock); - } - auto query_context = config::enable_file_cache_query_limit && (context.query_id.hi != 0 || context.query_id.lo != 0) ? get_query_context(context.query_id, cache_lock) @@ -1052,7 +1141,7 @@ bool BlockFileCache::remove_if_ttl_file_unlock(const UInt128Wrapper& file_key, b auto st = cell.file_block->change_cache_type_between_ttl_and_others( FileCacheType::NORMAL); if (st.ok()) { - if (config::enable_ttl_cache_evict_using_lru) { + if (cell.queue_iterator) { ttl_queue.remove(cell.queue_iterator.value(), cache_lock); } auto& queue = get_queue(FileCacheType::NORMAL); @@ -1111,12 +1200,33 @@ void BlockFileCache::remove_if_cached(const UInt128Wrapper& file_key) { } } -std::vector BlockFileCache::get_other_cache_type(FileCacheType cur_cache_type) { +std::vector BlockFileCache::get_other_cache_type_without_ttl( + FileCacheType cur_cache_type) { switch (cur_cache_type) { + case FileCacheType::TTL: + return {FileCacheType::DISPOSABLE, FileCacheType::NORMAL, FileCacheType::INDEX}; case FileCacheType::INDEX: return {FileCacheType::DISPOSABLE, FileCacheType::NORMAL}; case FileCacheType::NORMAL: return {FileCacheType::DISPOSABLE, FileCacheType::INDEX}; + case FileCacheType::DISPOSABLE: + return {FileCacheType::NORMAL, FileCacheType::INDEX}; + default: + return {}; + } + return {}; +} + +std::vector BlockFileCache::get_other_cache_type(FileCacheType cur_cache_type) { + switch (cur_cache_type) { + case FileCacheType::TTL: + return {FileCacheType::DISPOSABLE, FileCacheType::NORMAL, FileCacheType::INDEX}; + case FileCacheType::INDEX: + return {FileCacheType::DISPOSABLE, FileCacheType::NORMAL, FileCacheType::TTL}; + case FileCacheType::NORMAL: + return {FileCacheType::DISPOSABLE, FileCacheType::INDEX, FileCacheType::TTL}; + case FileCacheType::DISPOSABLE: + return {FileCacheType::NORMAL, FileCacheType::INDEX, FileCacheType::TTL}; default: return {}; } @@ -1129,7 +1239,7 @@ void BlockFileCache::reset_range(const UInt128Wrapper& hash, size_t 
offset, size _files.find(hash)->second.find(offset) != _files.find(hash)->second.end()); FileBlockCell* cell = get_cell(hash, offset, cache_lock); DCHECK(cell != nullptr); - if (cell->file_block->cache_type() != FileCacheType::TTL) { + if (cell->queue_iterator) { auto& queue = get_queue(cell->file_block->cache_type()); DCHECK(queue.contains(hash, offset, cache_lock)); auto iter = queue.get(hash, offset, cache_lock); @@ -1142,13 +1252,14 @@ void BlockFileCache::reset_range(const UInt128Wrapper& hash, size_t offset, size } bool BlockFileCache::try_reserve_from_other_queue_by_hot_interval( - std::vector other_cache_types, size_t size, int64_t cur_time, - std::lock_guard& cache_lock) { + FileCacheType cur_type, std::vector other_cache_types, size_t size, + int64_t cur_time, std::lock_guard& cache_lock) { size_t removed_size = 0; size_t cur_cache_size = _cur_cache_size; std::vector to_evict; for (FileCacheType cache_type : other_cache_types) { auto& queue = get_queue(cache_type); + size_t remove_size_per_type = 0; for (const auto& [entry_key, entry_offset, entry_size] : queue) { if (!is_overflow(removed_size, size, cur_cache_size)) { break; @@ -1170,39 +1281,48 @@ bool BlockFileCache::try_reserve_from_other_queue_by_hot_interval( DCHECK(file_block->_download_state == FileBlock::State::DOWNLOADED); to_evict.push_back(cell); removed_size += cell_size; + remove_size_per_type += cell_size; } } + *(_evict_by_heat_metrics_matrix[cache_type][cur_type]) << remove_size_per_type; } remove_file_blocks(to_evict, cache_lock); return !is_overflow(removed_size, size, cur_cache_size); } -bool BlockFileCache::is_overflow(size_t removed_size, size_t need_size, size_t cur_cache_size, - bool is_ttl) const { +bool BlockFileCache::is_overflow(size_t removed_size, size_t need_size, + size_t cur_cache_size) const { bool ret = false; if (_disk_resource_limit_mode) { ret = (removed_size < need_size); } else { ret = (cur_cache_size + need_size - removed_size > _capacity); } - if (is_ttl) { - 
size_t ttl_threshold = config::max_ttl_cache_ratio * _capacity / 100; - return (ret || ((cur_cache_size + need_size - removed_size) > ttl_threshold)); - } return ret; } bool BlockFileCache::try_reserve_from_other_queue_by_size( - std::vector other_cache_types, size_t size, + FileCacheType cur_type, std::vector other_cache_types, size_t size, std::lock_guard& cache_lock) { size_t removed_size = 0; size_t cur_cache_size = _cur_cache_size; std::vector to_evict; + // we follow the privilege defined in get_other_cache_types to evict for (FileCacheType cache_type : other_cache_types) { auto& queue = get_queue(cache_type); + + // we will not drain each of them to the bottom -- i.e., we only + // evict what they have stolen. + size_t cur_queue_size = queue.get_capacity(cache_lock); + size_t cur_queue_max_size = queue.get_max_size(); + if (cur_queue_size <= cur_queue_max_size) { + continue; + } + size_t cur_removed_size = 0; find_evict_candidates(queue, size, cur_cache_size, removed_size, to_evict, cache_lock, - false); + cur_removed_size); + *(_evict_by_size_metrics_matrix[cache_type][cur_type]) << cur_removed_size; } remove_file_blocks(to_evict, cache_lock); return !is_overflow(removed_size, size, cur_cache_size); @@ -1211,16 +1331,15 @@ bool BlockFileCache::try_reserve_from_other_queue_by_size( bool BlockFileCache::try_reserve_from_other_queue(FileCacheType cur_cache_type, size_t size, int64_t cur_time, std::lock_guard& cache_lock) { - // disposable queue cannot reserve other queues - if (cur_cache_type == FileCacheType::DISPOSABLE) { - return false; - } - auto other_cache_types = get_other_cache_type(cur_cache_type); - bool reserve_success = try_reserve_from_other_queue_by_hot_interval(other_cache_types, size, - cur_time, cache_lock); + // currently, TTL cache is not considered as a candidate + auto other_cache_types = get_other_cache_type_without_ttl(cur_cache_type); + bool reserve_success = try_reserve_from_other_queue_by_hot_interval( + cur_cache_type, 
other_cache_types, size, cur_time, cache_lock); if (reserve_success || !config::file_cache_enable_evict_from_other_queue_by_size) { return reserve_success; } + + other_cache_types = get_other_cache_type(cur_cache_type); auto& cur_queue = get_queue(cur_cache_type); size_t cur_queue_size = cur_queue.get_capacity(cache_lock); size_t cur_queue_max_size = cur_queue.get_max_size(); @@ -1228,7 +1347,8 @@ bool BlockFileCache::try_reserve_from_other_queue(FileCacheType cur_cache_type, if (_cur_cache_size + size > _capacity && cur_queue_size + size > cur_queue_max_size) { return false; } - return try_reserve_from_other_queue_by_size(other_cache_types, size, cache_lock); + return try_reserve_from_other_queue_by_size(cur_cache_type, other_cache_types, size, + cache_lock); } bool BlockFileCache::try_reserve_for_lru(const UInt128Wrapper& hash, @@ -1244,9 +1364,11 @@ bool BlockFileCache::try_reserve_for_lru(const UInt128Wrapper& hash, size_t cur_cache_size = _cur_cache_size; std::vector to_evict; + size_t cur_removed_size = 0; find_evict_candidates(queue, size, cur_cache_size, removed_size, to_evict, cache_lock, - false); + cur_removed_size); remove_file_blocks(to_evict, cache_lock); + *(_evict_by_self_lru_metrics_matrix[context.cache_type]) << cur_removed_size; if (is_overflow(removed_size, size, cur_cache_size)) { return false; @@ -1521,6 +1643,7 @@ std::string BlockFileCache::reset_capacity(size_t new_capacity) { ss << " ttl_queue released " << queue_released; } _disk_resource_limit_mode = true; + _disk_limit_mode_metrics->set_value(1); _async_clear_file_cache = true; ss << " total_space_released=" << space_released; } @@ -1541,6 +1664,7 @@ void BlockFileCache::check_disk_resource_limit() { } if (_capacity > _cur_cache_size) { _disk_resource_limit_mode = false; + _disk_limit_mode_metrics->set_value(0); } std::pair percent; int ret = disk_used_percentage(_cache_base_path, &percent); @@ -1566,10 +1690,12 @@ void BlockFileCache::check_disk_resource_limit() { if 
(capacity_percentage >= config::file_cache_enter_disk_resource_limit_mode_percent || inode_is_insufficient(inode_percentage)) { _disk_resource_limit_mode = true; + _disk_limit_mode_metrics->set_value(1); } else if (_disk_resource_limit_mode && (capacity_percentage < config::file_cache_exit_disk_resource_limit_mode_percent) && (inode_percentage < config::file_cache_exit_disk_resource_limit_mode_percent)) { _disk_resource_limit_mode = false; + _disk_limit_mode_metrics->set_value(0); } if (_disk_resource_limit_mode) { // log per mins @@ -1684,14 +1810,9 @@ void BlockFileCache::modify_expiration_time(const UInt128Wrapper& hash, if (st.ok()) { auto& queue = get_queue(origin_type); queue.remove(cell.queue_iterator.value(), cache_lock); - if (config::enable_ttl_cache_evict_using_lru) { - auto& ttl_queue = get_queue(FileCacheType::TTL); - cell.queue_iterator = - ttl_queue.add(hash, cell.file_block->offset(), - cell.file_block->range().size(), cache_lock); - } else { - cell.queue_iterator.reset(); - } + auto& ttl_queue = get_queue(FileCacheType::TTL); + cell.queue_iterator = ttl_queue.add(hash, cell.file_block->offset(), + cell.file_block->range().size(), cache_lock); } if (!st.ok()) { LOG_WARNING("").error(st); @@ -1805,10 +1926,8 @@ std::string BlockFileCache::clear_file_cache_directly() { << " time_elapsed=" << duration_cast(steady_clock::now() - start).count() << " num_files=" << num_files << " cache_size=" << cache_size << " index_queue_size=" << index_queue_size << " normal_queue_size=" << normal_queue_size - << " disposible_queue_size=" << disposible_queue_size; - if (config::enable_ttl_cache_evict_using_lru) { - ss << "ttl_queue_size=" << ttl_queue_size; - } + << " disposible_queue_size=" << disposible_queue_size << "ttl_queue_size=" << ttl_queue_size; + auto msg = ss.str(); LOG(INFO) << msg; return msg; @@ -1851,6 +1970,12 @@ std::map BlockFileCache::get_stats() { stats["index_queue_curr_elements"] = (double)_cur_index_queue_element_count_metrics->get_value(); + 
stats["ttl_queue_max_size"] = (double)_ttl_queue.get_max_size(); + stats["ttl_queue_curr_size"] = (double)_cur_ttl_cache_lru_queue_cache_size_metrics->get_value(); + stats["ttl_queue_max_elements"] = (double)_ttl_queue.get_max_element_size(); + stats["ttl_queue_curr_elements"] = + (double)_cur_ttl_cache_lru_queue_element_count_metrics->get_value(); + stats["normal_queue_max_size"] = (double)_normal_queue.get_max_size(); stats["normal_queue_curr_size"] = (double)_cur_normal_queue_element_count_metrics->get_value(); stats["normal_queue_max_elements"] = (double)_normal_queue.get_max_element_size(); @@ -1867,6 +1992,36 @@ std::map BlockFileCache::get_stats() { return stats; } +// for be UTs +std::map BlockFileCache::get_stats_unsafe() { + std::map stats; + stats["hits_ratio"] = (double)_hit_ratio->get_value(); + stats["hits_ratio_5m"] = (double)_hit_ratio_5m->get_value(); + stats["hits_ratio_1h"] = (double)_hit_ratio_1h->get_value(); + + stats["index_queue_max_size"] = (double)_index_queue.get_max_size(); + stats["index_queue_curr_size"] = (double)_index_queue.get_capacity_unsafe(); + stats["index_queue_max_elements"] = (double)_index_queue.get_max_element_size(); + stats["index_queue_curr_elements"] = (double)_index_queue.get_elements_num_unsafe(); + + stats["ttl_queue_max_size"] = (double)_ttl_queue.get_max_size(); + stats["ttl_queue_curr_size"] = (double)_ttl_queue.get_capacity_unsafe(); + stats["ttl_queue_max_elements"] = (double)_ttl_queue.get_max_element_size(); + stats["ttl_queue_curr_elements"] = (double)_ttl_queue.get_elements_num_unsafe(); + + stats["normal_queue_max_size"] = (double)_normal_queue.get_max_size(); + stats["normal_queue_curr_size"] = (double)_normal_queue.get_capacity_unsafe(); + stats["normal_queue_max_elements"] = (double)_normal_queue.get_max_element_size(); + stats["normal_queue_curr_elements"] = (double)_normal_queue.get_elements_num_unsafe(); + + stats["disposable_queue_max_size"] = (double)_disposable_queue.get_max_size(); + 
stats["disposable_queue_curr_size"] = (double)_disposable_queue.get_capacity_unsafe(); + stats["disposable_queue_max_elements"] = (double)_disposable_queue.get_max_element_size(); + stats["disposable_queue_curr_elements"] = (double)_disposable_queue.get_elements_num_unsafe(); + + return stats; +} + template void BlockFileCache::remove(FileBlockSPtr file_block, std::lock_guard& cache_lock, std::lock_guard& block_lock); diff --git a/be/src/io/cache/block_file_cache.h b/be/src/io/cache/block_file_cache.h index ac30e2411fa81b..c0c66334a2b32c 100644 --- a/be/src/io/cache/block_file_cache.h +++ b/be/src/io/cache/block_file_cache.h @@ -145,6 +145,9 @@ class BlockFileCache { std::map get_stats(); + // for be UTs + std::map get_stats_unsafe(); + class LRUQueue { public: LRUQueue() = default; @@ -179,6 +182,10 @@ class BlockFileCache { return cache_size; } + size_t get_capacity_unsafe() const { return cache_size; } + + size_t get_elements_num_unsafe() const { return queue.size(); } + size_t get_elements_num(std::lock_guard& /* cache_lock */) const { return queue.size(); } @@ -345,6 +352,7 @@ class BlockFileCache { bool try_reserve_during_async_load(size_t size, std::lock_guard& cache_lock); std::vector get_other_cache_type(FileCacheType cur_cache_type); + std::vector get_other_cache_type_without_ttl(FileCacheType cur_cache_type); bool try_reserve_from_other_queue(FileCacheType cur_cache_type, size_t offset, int64_t cur_time, std::lock_guard& cache_lock); @@ -390,15 +398,16 @@ class BlockFileCache { void recycle_deleted_blocks(); - bool try_reserve_from_other_queue_by_hot_interval(std::vector other_cache_types, + bool try_reserve_from_other_queue_by_hot_interval(FileCacheType cur_type, + std::vector other_cache_types, size_t size, int64_t cur_time, std::lock_guard& cache_lock); - bool try_reserve_from_other_queue_by_size(std::vector other_cache_types, + bool try_reserve_from_other_queue_by_size(FileCacheType cur_type, + std::vector other_cache_types, size_t size, 
std::lock_guard& cache_lock); - bool is_overflow(size_t removed_size, size_t need_size, size_t cur_cache_size, - bool is_ttl = false) const; + bool is_overflow(size_t removed_size, size_t need_size, size_t cur_cache_size) const; void remove_file_blocks(std::vector&, std::lock_guard&); @@ -407,7 +416,10 @@ class BlockFileCache { void find_evict_candidates(LRUQueue& queue, size_t size, size_t cur_cache_size, size_t& removed_size, std::vector& to_evict, - std::lock_guard& cache_lock, bool is_ttl); + std::lock_guard& cache_lock, size_t& cur_removed_size); + + void recycle_stale_rowset_async_bottom_half(); + // info std::string _cache_base_path; size_t _capacity = 0; @@ -459,6 +471,10 @@ class BlockFileCache { std::shared_ptr> _cur_disposable_queue_cache_size_metrics; std::array>, 4> _queue_evict_size_metrics; std::shared_ptr> _total_evict_size_metrics; + std::shared_ptr> _evict_by_heat_metrics_matrix[4][4]; + std::shared_ptr> _evict_by_size_metrics_matrix[4][4]; + std::shared_ptr> _evict_by_self_lru_metrics_matrix[4]; + std::shared_ptr> _evict_by_try_release; std::shared_ptr>> _num_hit_blocks_5m; std::shared_ptr>> _num_read_blocks_5m; @@ -472,6 +488,7 @@ class BlockFileCache { std::shared_ptr> _hit_ratio; std::shared_ptr> _hit_ratio_5m; std::shared_ptr> _hit_ratio_1h; + std::shared_ptr> _disk_limit_mode_metrics; }; } // namespace doris::io diff --git a/be/src/io/cache/file_cache_common.cpp b/be/src/io/cache/file_cache_common.cpp index c569ace0011866..674879300452df 100644 --- a/be/src/io/cache/file_cache_common.cpp +++ b/be/src/io/cache/file_cache_common.cpp @@ -34,6 +34,7 @@ std::string FileCacheSettings::to_string() const { << ", disposable_queue_elements: " << disposable_queue_elements << ", index_queue_size: " << index_queue_size << ", index_queue_elements: " << index_queue_elements + << ", ttl_queue_size: " << ttl_queue_size << ", ttl_queue_elements: " << ttl_queue_elements << ", query_queue_size: " << query_queue_size << ", query_queue_elements: " << 
query_queue_elements << ", storage: " << storage; return ss.str(); @@ -58,6 +59,10 @@ FileCacheSettings get_file_cache_settings(size_t capacity, size_t max_query_cach std::max(settings.index_queue_size / settings.max_file_block_size, REMOTE_FS_OBJECTS_CACHE_DEFAULT_ELEMENTS); + settings.ttl_queue_size = per_size * config::max_ttl_cache_ratio; + settings.ttl_queue_elements = std::max(settings.ttl_queue_size / settings.max_file_block_size, + REMOTE_FS_OBJECTS_CACHE_DEFAULT_ELEMENTS); + settings.query_queue_size = settings.capacity - settings.disposable_queue_size - settings.index_queue_size; settings.query_queue_elements = diff --git a/be/src/io/cache/file_cache_common.h b/be/src/io/cache/file_cache_common.h index 21309831a8284c..30579ba7851b28 100644 --- a/be/src/io/cache/file_cache_common.h +++ b/be/src/io/cache/file_cache_common.h @@ -26,17 +26,17 @@ namespace doris::io { inline static constexpr size_t REMOTE_FS_OBJECTS_CACHE_DEFAULT_ELEMENTS = 100 * 1024; inline static constexpr size_t FILE_CACHE_MAX_FILE_BLOCK_SIZE = 1 * 1024 * 1024; -inline static constexpr size_t DEFAULT_NORMAL_PERCENT = 85; -inline static constexpr size_t DEFAULT_DISPOSABLE_PERCENT = 10; +inline static constexpr size_t DEFAULT_NORMAL_PERCENT = 40; +inline static constexpr size_t DEFAULT_DISPOSABLE_PERCENT = 5; inline static constexpr size_t DEFAULT_INDEX_PERCENT = 5; using uint128_t = vectorized::UInt128; -enum class FileCacheType { - INDEX, - NORMAL, - DISPOSABLE, - TTL, +enum FileCacheType { + INDEX = 2, + NORMAL = 1, + DISPOSABLE = 0, + TTL = 3, }; struct UInt128Wrapper { @@ -93,6 +93,8 @@ struct FileCacheSettings { size_t index_queue_elements {0}; size_t query_queue_size {0}; size_t query_queue_elements {0}; + size_t ttl_queue_size {0}; + size_t ttl_queue_elements {0}; size_t max_file_block_size {0}; size_t max_query_cache_size {0}; std::string storage; diff --git a/be/src/olap/base_tablet.cpp b/be/src/olap/base_tablet.cpp index 1fd3b785b9072f..0cb918448c4de8 100644 --- 
a/be/src/olap/base_tablet.cpp +++ b/be/src/olap/base_tablet.cpp @@ -79,7 +79,8 @@ Status _get_segment_column_iterator(const BetaRowsetSharedPtr& rowset, uint32_t .use_page_cache = !config::disable_storage_page_cache, .file_reader = segment->file_reader().get(), .stats = stats, - .io_ctx = io::IOContext {.reader_type = ReaderType::READER_QUERY}, + .io_ctx = io::IOContext {.reader_type = ReaderType::READER_QUERY, + .file_cache_stats = &stats->file_cache_stats}, }; RETURN_IF_ERROR((*column_iterator)->init(opt)); return Status::OK(); @@ -441,7 +442,8 @@ Status BaseTablet::lookup_row_key(const Slice& encoded_key, TabletSchema* latest const std::vector& specified_rowsets, RowLocation* row_location, uint32_t version, std::vector>& segment_caches, - RowsetSharedPtr* rowset, bool with_rowid) { + RowsetSharedPtr* rowset, bool with_rowid, + OlapReaderStatistics* stats) { SCOPED_BVAR_LATENCY(g_tablet_lookup_rowkey_latency); size_t seq_col_length = 0; // use the latest tablet schema to decide if the tablet has sequence column currently @@ -489,7 +491,7 @@ Status BaseTablet::lookup_row_key(const Slice& encoded_key, TabletSchema* latest for (auto id : picked_segments) { Status s = segments[id]->lookup_row_key(encoded_key, schema, with_seq_col, with_rowid, - &loc); + &loc, stats); if (s.is()) { continue; } diff --git a/be/src/olap/base_tablet.h b/be/src/olap/base_tablet.h index 943f815581809a..f862aa06cc0e60 100644 --- a/be/src/olap/base_tablet.h +++ b/be/src/olap/base_tablet.h @@ -153,7 +153,8 @@ class BaseTablet { const std::vector& specified_rowsets, RowLocation* row_location, uint32_t version, std::vector>& segment_caches, - RowsetSharedPtr* rowset = nullptr, bool with_rowid = true); + RowsetSharedPtr* rowset = nullptr, bool with_rowid = true, + OlapReaderStatistics* stats = nullptr); // calc delete bitmap when flush memtable, use a fake version to calc // For example, cur max version is 5, and we use version 6 to calc but diff --git a/be/src/olap/cold_data_compaction.cpp 
b/be/src/olap/cold_data_compaction.cpp index 3c61819903460b..54e21d7d7bcf6a 100644 --- a/be/src/olap/cold_data_compaction.cpp +++ b/be/src/olap/cold_data_compaction.cpp @@ -28,6 +28,7 @@ #include "common/compiler_util.h" // IWYU pragma: keep #include "common/config.h" +#include "common/status.h" #include "io/fs/remote_file_system.h" #include "olap/compaction.h" #include "olap/olap_common.h" @@ -97,7 +98,7 @@ Status ColdDataCompaction::modify_rowsets() { std::lock_guard wlock(_tablet->get_header_lock()); SCOPED_SIMPLE_TRACE_IF_TIMEOUT(TRACE_TABLET_LOCK_THRESHOLD); // Merged cooldowned rowsets MUST NOT be managed by version graph, they will be reclaimed by `remove_unused_remote_files`. - tablet()->delete_rowsets(_input_rowsets, false); + RETURN_IF_ERROR(tablet()->delete_rowsets(_input_rowsets, false)); tablet()->add_rowsets({_output_rowset}); // TODO(plat1ko): process primary key _tablet->tablet_meta()->set_cooldown_meta_id(cooldown_meta_id); diff --git a/be/src/olap/compaction.cpp b/be/src/olap/compaction.cpp index 236e5d4ac7bf21..0fd9b57faf8b93 100644 --- a/be/src/olap/compaction.cpp +++ b/be/src/olap/compaction.cpp @@ -100,14 +100,14 @@ bool is_rowset_tidy(std::string& pre_max_key, const RowsetSharedPtr& rhs) { } } std::string min_key; - auto ret = rhs->min_key(&min_key); + auto ret = rhs->first_key(&min_key); if (!ret) { return false; } if (min_key <= pre_max_key) { return false; } - CHECK(rhs->max_key(&pre_max_key)); + CHECK(rhs->last_key(&pre_max_key)); return true; } diff --git a/be/src/olap/merger.cpp b/be/src/olap/merger.cpp index ab034123ac883c..a79434551b5cc1 100644 --- a/be/src/olap/merger.cpp +++ b/be/src/olap/merger.cpp @@ -20,6 +20,7 @@ #include #include #include +#include #include #include @@ -91,6 +92,8 @@ Status Merger::vmerge_rowsets(BaseTabletSPtr tablet, ReaderType reader_type, if (stats_output && stats_output->rowid_conversion) { reader_params.record_rowids = true; + reader_params.rowid_conversion = stats_output->rowid_conversion; + 
stats_output->rowid_conversion->set_dst_rowset_id(dst_rowset_writer->rowset_id()); } reader_params.return_columns.resize(cur_tablet_schema.num_columns()); @@ -98,17 +101,6 @@ Status Merger::vmerge_rowsets(BaseTabletSPtr tablet, ReaderType reader_type, reader_params.origin_return_columns = &reader_params.return_columns; RETURN_IF_ERROR(reader.init(reader_params)); - if (reader_params.record_rowids) { - stats_output->rowid_conversion->set_dst_rowset_id(dst_rowset_writer->rowset_id()); - // init segment rowid map for rowid conversion - std::vector segment_num_rows; - for (auto& rs_split : reader_params.rs_splits) { - RETURN_IF_ERROR(rs_split.rs_reader->get_segment_num_rows(&segment_num_rows)); - stats_output->rowid_conversion->init_segment_map( - rs_split.rs_reader->rowset()->rowset_id(), segment_num_rows); - } - } - vectorized::Block block = cur_tablet_schema.create_block(reader_params.return_columns); size_t output_rows = 0; bool eof = false; @@ -274,6 +266,8 @@ Status Merger::vertical_compact_one_group( if (is_key && stats_output && stats_output->rowid_conversion) { reader_params.record_rowids = true; + reader_params.rowid_conversion = stats_output->rowid_conversion; + stats_output->rowid_conversion->set_dst_rowset_id(dst_rowset_writer->rowset_id()); } reader_params.return_columns = column_group; @@ -281,17 +275,6 @@ Status Merger::vertical_compact_one_group( reader_params.batch_size = batch_size; RETURN_IF_ERROR(reader.init(reader_params, sample_info)); - if (reader_params.record_rowids) { - stats_output->rowid_conversion->set_dst_rowset_id(dst_rowset_writer->rowset_id()); - // init segment rowid map for rowid conversion - std::vector segment_num_rows; - for (auto& rs_split : reader_params.rs_splits) { - RETURN_IF_ERROR(rs_split.rs_reader->get_segment_num_rows(&segment_num_rows)); - stats_output->rowid_conversion->init_segment_map( - rs_split.rs_reader->rowset()->rowset_id(), segment_num_rows); - } - } - vectorized::Block block = 
tablet_schema.create_block(reader_params.return_columns); size_t output_rows = 0; bool eof = false; diff --git a/be/src/olap/olap_common.h b/be/src/olap/olap_common.h index c1d3038050fbd4..d3bd0f0a3a2436 100644 --- a/be/src/olap/olap_common.h +++ b/be/src/olap/olap_common.h @@ -305,24 +305,22 @@ struct OlapReaderStatistics { // block_load_ns // block_init_ns // block_init_seek_ns - // block_conditions_filtered_ns - // first_read_ns - // block_first_read_seek_ns + // generate_row_ranges_ns + // predicate_column_read_ns + // predicate_column_read_seek_ns // lazy_read_ns // block_lazy_read_seek_ns int64_t block_init_ns = 0; int64_t block_init_seek_num = 0; int64_t block_init_seek_ns = 0; - int64_t first_read_ns = 0; - int64_t second_read_ns = 0; - int64_t block_first_read_seek_num = 0; - int64_t block_first_read_seek_ns = 0; + int64_t predicate_column_read_ns = 0; + int64_t non_predicate_read_ns = 0; + int64_t predicate_column_read_seek_num = 0; + int64_t predicate_column_read_seek_ns = 0; int64_t lazy_read_ns = 0; int64_t block_lazy_read_seek_num = 0; int64_t block_lazy_read_seek_ns = 0; - int64_t block_convert_ns = 0; - int64_t raw_rows_read = 0; int64_t rows_vec_cond_filtered = 0; @@ -351,11 +349,10 @@ struct OlapReaderStatistics { int64_t rows_del_by_bitmap = 0; // the number of rows filtered by various column indexes. 
int64_t rows_conditions_filtered = 0; - int64_t block_conditions_filtered_ns = 0; - int64_t block_conditions_filtered_bf_ns = 0; - int64_t block_conditions_filtered_zonemap_ns = 0; - int64_t block_conditions_filtered_zonemap_rp_ns = 0; - int64_t block_conditions_filtered_dict_ns = 0; + int64_t generate_row_ranges_ns = 0; + int64_t generate_row_ranges_by_bf_ns = 0; + int64_t generate_row_ranges_by_zonemap_ns = 0; + int64_t generate_row_ranges_by_dict_ns = 0; int64_t index_load_ns = 0; @@ -372,7 +369,6 @@ struct OlapReaderStatistics { int64_t inverted_index_query_cache_miss = 0; int64_t inverted_index_query_null_bitmap_timer = 0; int64_t inverted_index_query_bitmap_copy_timer = 0; - int64_t inverted_index_query_bitmap_op_timer = 0; int64_t inverted_index_searcher_open_timer = 0; int64_t inverted_index_searcher_search_timer = 0; int64_t inverted_index_searcher_cache_hit = 0; diff --git a/be/src/olap/primary_key_index.cpp b/be/src/olap/primary_key_index.cpp index d3554cae15d66a..9d40ff5a8fad51 100644 --- a/be/src/olap/primary_key_index.cpp +++ b/be/src/olap/primary_key_index.cpp @@ -17,6 +17,7 @@ #include "olap/primary_key_index.h" +#include #include #include @@ -95,7 +96,8 @@ Status PrimaryKeyIndexReader::parse_index(io::FileReaderSPtr file_reader, // parse primary key index _index_reader.reset(new segment_v2::IndexedColumnReader(file_reader, meta.primary_key_index())); _index_reader->set_is_pk_index(true); - RETURN_IF_ERROR(_index_reader->load(!config::disable_pk_storage_page_cache, false)); + RETURN_IF_ERROR(_index_reader->load(!config::disable_pk_storage_page_cache, false, + _pk_index_load_stats)); _index_parsed = true; return Status::OK(); @@ -107,7 +109,8 @@ Status PrimaryKeyIndexReader::parse_bf(io::FileReaderSPtr file_reader, segment_v2::ColumnIndexMetaPB column_index_meta = meta.bloom_filter_index(); segment_v2::BloomFilterIndexReader bf_index_reader(std::move(file_reader), column_index_meta.bloom_filter_index()); - 
RETURN_IF_ERROR(bf_index_reader.load(!config::disable_pk_storage_page_cache, false)); + RETURN_IF_ERROR(bf_index_reader.load(!config::disable_pk_storage_page_cache, false, + _pk_index_load_stats)); std::unique_ptr bf_iter; RETURN_IF_ERROR(bf_index_reader.new_iterator(&bf_iter)); RETURN_IF_ERROR(bf_iter->read_bloom_filter(0, &_bf)); diff --git a/be/src/olap/primary_key_index.h b/be/src/olap/primary_key_index.h index b5eb13131b73a0..dcbbc5f30625f4 100644 --- a/be/src/olap/primary_key_index.h +++ b/be/src/olap/primary_key_index.h @@ -25,6 +25,7 @@ #include "common/status.h" #include "io/fs/file_reader_writer_fwd.h" +#include "olap/olap_common.h" #include "olap/rowset/segment_v2/bloom_filter.h" #include "olap/rowset/segment_v2/bloom_filter_index_writer.h" #include "olap/rowset/segment_v2/indexed_column_reader.h" @@ -97,7 +98,8 @@ class PrimaryKeyIndexBuilder { class PrimaryKeyIndexReader { public: - PrimaryKeyIndexReader() : _index_parsed(false), _bf_parsed(false) {} + PrimaryKeyIndexReader(OlapReaderStatistics* pk_index_load_stats = nullptr) + : _index_parsed(false), _bf_parsed(false), _pk_index_load_stats(pk_index_load_stats) {} ~PrimaryKeyIndexReader() { segment_v2::g_pk_total_bloom_filter_num << -static_cast(_bf_num); @@ -111,9 +113,10 @@ class PrimaryKeyIndexReader { Status parse_bf(io::FileReaderSPtr file_reader, const segment_v2::PrimaryKeyIndexMetaPB& meta); - Status new_iterator(std::unique_ptr* index_iterator) const { + Status new_iterator(std::unique_ptr* index_iterator, + OlapReaderStatistics* stats = nullptr) const { DCHECK(_index_parsed); - index_iterator->reset(new segment_v2::IndexedColumnIterator(_index_reader.get())); + index_iterator->reset(new segment_v2::IndexedColumnIterator(_index_reader.get(), stats)); return Status::OK(); } @@ -152,6 +155,7 @@ class PrimaryKeyIndexReader { std::unique_ptr _bf; size_t _bf_num = 0; uint64 _bf_bytes = 0; + OlapReaderStatistics* _pk_index_load_stats = nullptr; }; } // namespace doris diff --git 
a/be/src/olap/rowset/beta_rowset_reader.cpp b/be/src/olap/rowset/beta_rowset_reader.cpp index d2c7023f659c20..042893f1374374 100644 --- a/be/src/olap/rowset/beta_rowset_reader.cpp +++ b/be/src/olap/rowset/beta_rowset_reader.cpp @@ -235,6 +235,12 @@ Status BetaRowsetReader::get_segment_iterators(RowsetReaderContext* read_context for (size_t i = 0; i < segments.size(); i++) { _segments_rows[i] = segments[i]->num_rows(); } + if (_read_context->record_rowids) { + // init segment rowid map for rowid conversion + std::vector segment_num_rows; + RETURN_IF_ERROR(get_segment_num_rows(&segment_num_rows)); + _read_context->rowid_conversion->init_segment_map(rowset()->rowset_id(), segment_num_rows); + } auto [seg_start, seg_end] = _segment_offsets; if (seg_start == seg_end) { diff --git a/be/src/olap/rowset/rowset.h b/be/src/olap/rowset/rowset.h index 6050a33bfc2f5d..24e660cd2f7210 100644 --- a/be/src/olap/rowset/rowset.h +++ b/be/src/olap/rowset/rowset.h @@ -269,7 +269,9 @@ class Rowset : public std::enable_shared_from_this { _rowset_meta->get_segments_key_bounds(segments_key_bounds); return Status::OK(); } - bool min_key(std::string* min_key) { + + // min key of the first segment + bool first_key(std::string* min_key) { KeyBoundsPB key_bounds; bool ret = _rowset_meta->get_first_segment_key_bound(&key_bounds); if (!ret) { @@ -278,7 +280,9 @@ class Rowset : public std::enable_shared_from_this { *min_key = key_bounds.min_key(); return true; } - bool max_key(std::string* max_key) { + + // max key of the last segment + bool last_key(std::string* max_key) { KeyBoundsPB key_bounds; bool ret = _rowset_meta->get_last_segment_key_bound(&key_bounds); if (!ret) { diff --git a/be/src/olap/rowset/rowset_reader_context.h b/be/src/olap/rowset/rowset_reader_context.h index 0d4f5897772ad5..fd3b4fed56f967 100644 --- a/be/src/olap/rowset/rowset_reader_context.h +++ b/be/src/olap/rowset/rowset_reader_context.h @@ -21,6 +21,7 @@ #include "io/io_common.h" #include "olap/column_predicate.h" 
#include "olap/olap_common.h" +#include "olap/rowid_conversion.h" #include "runtime/runtime_state.h" #include "vec/exprs/vexpr.h" #include "vec/exprs/vexpr_context.h" @@ -75,6 +76,7 @@ struct RowsetReaderContext { bool enable_unique_key_merge_on_write = false; const DeleteBitmap* delete_bitmap = nullptr; bool record_rowids = false; + RowIdConversion* rowid_conversion; bool is_vertical_compaction = false; bool is_key_column_group = false; const std::set* output_columns = nullptr; @@ -82,7 +84,6 @@ struct RowsetReaderContext { // slots that cast may be eliminated in storage layer std::map target_cast_type_for_variants; int64_t ttl_seconds = 0; - size_t topn_limit = 0; }; } // namespace doris diff --git a/be/src/olap/rowset/segment_v2/bloom_filter_index_reader.cpp b/be/src/olap/rowset/segment_v2/bloom_filter_index_reader.cpp index 0857c1890c47ce..917356f486be6e 100644 --- a/be/src/olap/rowset/segment_v2/bloom_filter_index_reader.cpp +++ b/be/src/olap/rowset/segment_v2/bloom_filter_index_reader.cpp @@ -31,8 +31,10 @@ namespace doris { namespace segment_v2 { -Status BloomFilterIndexReader::load(bool use_page_cache, bool kept_in_memory) { +Status BloomFilterIndexReader::load(bool use_page_cache, bool kept_in_memory, + OlapReaderStatistics* index_load_stats) { // TODO yyq: implement a new once flag to avoid status construct. 
+ _index_load_stats = index_load_stats; return _load_once.call([this, use_page_cache, kept_in_memory] { return _load(use_page_cache, kept_in_memory); }); @@ -42,7 +44,7 @@ Status BloomFilterIndexReader::_load(bool use_page_cache, bool kept_in_memory) { const IndexedColumnMetaPB& bf_index_meta = _bloom_filter_index_meta->bloom_filter(); _bloom_filter_reader.reset(new IndexedColumnReader(_file_reader, bf_index_meta)); - RETURN_IF_ERROR(_bloom_filter_reader->load(use_page_cache, kept_in_memory)); + RETURN_IF_ERROR(_bloom_filter_reader->load(use_page_cache, kept_in_memory, _index_load_stats)); return Status::OK(); } diff --git a/be/src/olap/rowset/segment_v2/bloom_filter_index_reader.h b/be/src/olap/rowset/segment_v2/bloom_filter_index_reader.h index c2617ef4e4e980..effaa876e1c0fd 100644 --- a/be/src/olap/rowset/segment_v2/bloom_filter_index_reader.h +++ b/be/src/olap/rowset/segment_v2/bloom_filter_index_reader.h @@ -47,7 +47,8 @@ class BloomFilterIndexReader { _bloom_filter_index_meta.reset(new BloomFilterIndexPB(bloom_filter_index_meta)); } - Status load(bool use_page_cache, bool kept_in_memory); + Status load(bool use_page_cache, bool kept_in_memory, + OlapReaderStatistics* _bf_index_load_stats = nullptr); BloomFilterAlgorithmPB algorithm() { return _bloom_filter_index_meta->algorithm(); } @@ -67,6 +68,7 @@ class BloomFilterIndexReader { const TypeInfo* _type_info = nullptr; std::unique_ptr _bloom_filter_index_meta = nullptr; std::unique_ptr _bloom_filter_reader; + OlapReaderStatistics* _index_load_stats = nullptr; }; class BloomFilterIndexIterator { diff --git a/be/src/olap/rowset/segment_v2/indexed_column_reader.cpp b/be/src/olap/rowset/segment_v2/indexed_column_reader.cpp index 59251b5595dd07..4ed98fd9a6b968 100644 --- a/be/src/olap/rowset/segment_v2/indexed_column_reader.cpp +++ b/be/src/olap/rowset/segment_v2/indexed_column_reader.cpp @@ -60,9 +60,11 @@ static bvar::Adder g_index_reader_memory_bytes("doris_index_reader_mem using strings::Substitute; -Status 
IndexedColumnReader::load(bool use_page_cache, bool kept_in_memory) { +Status IndexedColumnReader::load(bool use_page_cache, bool kept_in_memory, + OlapReaderStatistics* index_load_stats) { _use_page_cache = use_page_cache; _kept_in_memory = kept_in_memory; + _index_load_stats = index_load_stats; _type_info = get_scalar_type_info((FieldType)_meta.data_type()); if (_type_info == nullptr) { @@ -105,7 +107,7 @@ Status IndexedColumnReader::load_index_page(const PagePointerPB& pp, PageHandle* BlockCompressionCodec* local_compress_codec; RETURN_IF_ERROR(get_block_compression_codec(_meta.compression(), &local_compress_codec)); RETURN_IF_ERROR(read_page(PagePointer(pp), handle, &body, &footer, INDEX_PAGE, - local_compress_codec, false)); + local_compress_codec, false, _index_load_stats)); RETURN_IF_ERROR(reader->parse(body, footer.index_page_footer())); _mem_size += body.get_size(); return Status::OK(); @@ -113,8 +115,10 @@ Status IndexedColumnReader::load_index_page(const PagePointerPB& pp, PageHandle* Status IndexedColumnReader::read_page(const PagePointer& pp, PageHandle* handle, Slice* body, PageFooterPB* footer, PageTypePB type, - BlockCompressionCodec* codec, bool pre_decode) const { + BlockCompressionCodec* codec, bool pre_decode, + OlapReaderStatistics* stats) const { OlapReaderStatistics tmp_stats; + OlapReaderStatistics* stats_ptr = stats != nullptr ? 
stats : &tmp_stats; PageReadOptions opts { .use_page_cache = _use_page_cache, .kept_in_memory = _kept_in_memory, @@ -123,9 +127,10 @@ Status IndexedColumnReader::read_page(const PagePointer& pp, PageHandle* handle, .file_reader = _file_reader.get(), .page_pointer = pp, .codec = codec, - .stats = &tmp_stats, + .stats = stats_ptr, .encoding_info = _encoding_info, - .io_ctx = io::IOContext {.is_index_data = true}, + .io_ctx = io::IOContext {.is_index_data = true, + .file_cache_stats = &stats_ptr->file_cache_stats}, }; if (_is_pk_index) { opts.type = PRIMARY_KEY_INDEX_PAGE; @@ -154,8 +159,8 @@ Status IndexedColumnIterator::_read_data_page(const PagePointer& pp) { PageHandle handle; Slice body; PageFooterPB footer; - RETURN_IF_ERROR( - _reader->read_page(pp, &handle, &body, &footer, DATA_PAGE, _compress_codec, true)); + RETURN_IF_ERROR(_reader->read_page(pp, &handle, &body, &footer, DATA_PAGE, _compress_codec, + true, _stats)); // parse data page // note that page_index is not used in IndexedColumnIterator, so we pass 0 PageDecoderOptions opts; diff --git a/be/src/olap/rowset/segment_v2/indexed_column_reader.h b/be/src/olap/rowset/segment_v2/indexed_column_reader.h index d156643a21c11d..6168fba2ed51c2 100644 --- a/be/src/olap/rowset/segment_v2/indexed_column_reader.h +++ b/be/src/olap/rowset/segment_v2/indexed_column_reader.h @@ -27,6 +27,7 @@ #include "common/status.h" #include "io/fs/file_reader_writer_fwd.h" +#include "olap/olap_common.h" #include "olap/rowset/segment_v2/common.h" #include "olap/rowset/segment_v2/index_page.h" #include "olap/rowset/segment_v2/page_handle.h" @@ -53,11 +54,13 @@ class IndexedColumnReader { ~IndexedColumnReader(); - Status load(bool use_page_cache, bool kept_in_memory); + Status load(bool use_page_cache, bool kept_in_memory, + OlapReaderStatistics* index_load_stats = nullptr); // read a page specified by `pp' from `file' into `handle' Status read_page(const PagePointer& pp, PageHandle* handle, Slice* body, PageFooterPB* footer, - 
PageTypePB type, BlockCompressionCodec* codec, bool pre_decode) const; + PageTypePB type, BlockCompressionCodec* codec, bool pre_decode, + OlapReaderStatistics* stats = nullptr) const; int64_t num_values() const { return _num_values; } const EncodingInfo* encoding_info() const { return _encoding_info; } @@ -95,14 +98,17 @@ class IndexedColumnReader { const KeyCoder* _value_key_coder = nullptr; uint64_t _mem_size = 0; bool _is_pk_index = false; + OlapReaderStatistics* _index_load_stats = nullptr; }; class IndexedColumnIterator { public: - explicit IndexedColumnIterator(const IndexedColumnReader* reader) + explicit IndexedColumnIterator(const IndexedColumnReader* reader, + OlapReaderStatistics* stats = nullptr) : _reader(reader), _ordinal_iter(&reader->_ordinal_index_reader), - _value_iter(&reader->_value_index_reader) {} + _value_iter(&reader->_value_index_reader), + _stats(stats) {} // Seek to the given ordinal entry. Entry 0 is the first entry. // Return Status::Error if provided seek point is past the end. 
@@ -151,6 +157,7 @@ class IndexedColumnIterator { ordinal_t _current_ordinal = 0; // iterator owned compress codec, should NOT be shared by threads, initialized before used BlockCompressionCodec* _compress_codec = nullptr; + OlapReaderStatistics* _stats = nullptr; }; } // namespace segment_v2 diff --git a/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp b/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp index 8729bd0c590276..50874d0db5c72b 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp +++ b/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp @@ -75,6 +75,23 @@ const int32_t MAX_LEAF_COUNT = 1024; const float MAXMBSortInHeap = 512.0 * 8; const int DIMS = 1; +bool InvertedIndexColumnWriter::check_support_inverted_index(const TabletColumn& column) { + // below types are not supported in inverted index for extracted columns + static std::set invalid_types = { + FieldType::OLAP_FIELD_TYPE_DOUBLE, + FieldType::OLAP_FIELD_TYPE_JSONB, + FieldType::OLAP_FIELD_TYPE_ARRAY, + FieldType::OLAP_FIELD_TYPE_FLOAT, + }; + if (column.is_extracted_column() && (invalid_types.contains(column.type()))) { + return false; + } + if (column.is_variant_type()) { + return false; + } + return true; +} + template class InvertedIndexColumnWriterImpl : public InvertedIndexColumnWriter { public: diff --git a/be/src/olap/rowset/segment_v2/inverted_index_writer.h b/be/src/olap/rowset/segment_v2/inverted_index_writer.h index 63c1e219e649e8..da90752db09168 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index_writer.h +++ b/be/src/olap/rowset/segment_v2/inverted_index_writer.h @@ -33,7 +33,6 @@ #include "io/fs/local_file_system.h" #include "olap/olap_common.h" #include "olap/options.h" -#include "olap/tablet_schema.h" namespace doris { class CollectionValue; @@ -41,6 +40,7 @@ class CollectionValue; class Field; class TabletIndex; +class TabletColumn; namespace segment_v2 { class InvertedIndexFileWriter; @@ -74,22 +74,7 @@ class InvertedIndexColumnWriter {
// check if the column is valid for inverted index, some columns // are generated from variant, but not all of them are supported - static bool check_support_inverted_index(const TabletColumn& column) { - // bellow types are not supported in inverted index for extracted columns - static std::set invalid_types = { - FieldType::OLAP_FIELD_TYPE_DOUBLE, - FieldType::OLAP_FIELD_TYPE_JSONB, - FieldType::OLAP_FIELD_TYPE_ARRAY, - FieldType::OLAP_FIELD_TYPE_FLOAT, - }; - if (column.is_extracted_column() && (invalid_types.contains(column.type()))) { - return false; - } - if (column.is_variant_type()) { - return false; - } - return true; - } + static bool check_support_inverted_index(const TabletColumn& column); private: DISALLOW_COPY_AND_ASSIGN(InvertedIndexColumnWriter); diff --git a/be/src/olap/rowset/segment_v2/segment.cpp b/be/src/olap/rowset/segment_v2/segment.cpp index 11457a7a332769..06a458a75ca4d9 100644 --- a/be/src/olap/rowset/segment_v2/segment.cpp +++ b/be/src/olap/rowset/segment_v2/segment.cpp @@ -458,7 +458,8 @@ Status Segment::_load_pk_bloom_filter() { }); } -Status Segment::load_pk_index_and_bf() { +Status Segment::load_pk_index_and_bf(OlapReaderStatistics* index_load_stats) { + _pk_index_load_stats = index_load_stats; RETURN_IF_ERROR(load_index()); RETURN_IF_ERROR(_load_pk_bloom_filter()); return Status::OK(); @@ -467,7 +468,7 @@ Status Segment::load_pk_index_and_bf() { Status Segment::load_index() { return _load_index_once.call([this] { if (_tablet_schema->keys_type() == UNIQUE_KEYS && _pk_index_meta != nullptr) { - _pk_index_reader = std::make_unique(); + _pk_index_reader = std::make_unique(_pk_index_load_stats); RETURN_IF_ERROR(_pk_index_reader->parse_index(_file_reader, *_pk_index_meta)); // _meta_mem_usage += _pk_index_reader->get_memory_size(); return Status::OK(); @@ -926,7 +927,8 @@ Status Segment::new_inverted_index_iterator(const TabletColumn& tablet_column, } Status Segment::lookup_row_key(const Slice& key, const TabletSchema* latest_schema, - 
bool with_seq_col, bool with_rowid, RowLocation* row_location) { + bool with_seq_col, bool with_rowid, RowLocation* row_location, + OlapReaderStatistics* stats) { RETURN_IF_ERROR(load_pk_index_and_bf()); bool has_seq_col = latest_schema->has_sequence_col(); bool has_rowid = !latest_schema->cluster_key_idxes().empty(); @@ -946,7 +948,7 @@ Status Segment::lookup_row_key(const Slice& key, const TabletSchema* latest_sche } bool exact_match = false; std::unique_ptr index_iterator; - RETURN_IF_ERROR(_pk_index_reader->new_iterator(&index_iterator)); + RETURN_IF_ERROR(_pk_index_reader->new_iterator(&index_iterator, stats)); auto st = index_iterator->seek_at_or_after(&key_without_seq, &exact_match); if (!st.ok() && !st.is()) { return st; diff --git a/be/src/olap/rowset/segment_v2/segment.h b/be/src/olap/rowset/segment_v2/segment.h index 035860b9bc9038..322b5d8b238df7 100644 --- a/be/src/olap/rowset/segment_v2/segment.h +++ b/be/src/olap/rowset/segment_v2/segment.h @@ -130,7 +130,8 @@ class Segment : public std::enable_shared_from_this { } Status lookup_row_key(const Slice& key, const TabletSchema* latest_schema, bool with_seq_col, - bool with_rowid, RowLocation* row_location); + bool with_rowid, RowLocation* row_location, + OlapReaderStatistics* stats = nullptr); Status read_key_by_rowid(uint32_t row_id, std::string* key); @@ -140,7 +141,7 @@ class Segment : public std::enable_shared_from_this { Status load_index(); - Status load_pk_index_and_bf(); + Status load_pk_index_and_bf(OlapReaderStatistics* index_load_stats = nullptr); void update_healthy_status(Status new_status) { _healthy_status.update(new_status); } // The segment is loaded into SegmentCache and then will load indices, if there are something wrong @@ -294,6 +295,7 @@ class Segment : public std::enable_shared_from_this { InvertedIndexFileInfo _idx_file_info; int _be_exec_version = BeExecVersionManager::get_newest_version(); + OlapReaderStatistics* _pk_index_load_stats = nullptr; }; } // namespace segment_v2 diff 
--git a/be/src/olap/rowset/segment_v2/segment_iterator.cpp b/be/src/olap/rowset/segment_v2/segment_iterator.cpp index 04ec5830d2885f..f43ccf37e78280 100644 --- a/be/src/olap/rowset/segment_v2/segment_iterator.cpp +++ b/be/src/olap/rowset/segment_v2/segment_iterator.cpp @@ -497,7 +497,7 @@ Status SegmentIterator::_prepare_seek(const StorageReadOptions::KeyRange& key_ra } Status SegmentIterator::_get_row_ranges_by_column_conditions() { - SCOPED_RAW_TIMER(&_opts.stats->block_conditions_filtered_ns); + SCOPED_RAW_TIMER(&_opts.stats->generate_row_ranges_ns); if (_row_bitmap.isEmpty()) { return Status::OK(); } @@ -565,7 +565,7 @@ Status SegmentIterator::_get_row_ranges_from_conditions(RowRanges* condition_row size_t pre_size = 0; { - SCOPED_RAW_TIMER(&_opts.stats->block_conditions_filtered_bf_ns); + SCOPED_RAW_TIMER(&_opts.stats->generate_row_ranges_by_bf_ns); // first filter data by bloom filter index // bloom filter index only use CondColumn RowRanges bf_row_ranges = RowRanges::create_single(num_rows()); @@ -588,7 +588,7 @@ Status SegmentIterator::_get_row_ranges_from_conditions(RowRanges* condition_row } { - SCOPED_RAW_TIMER(&_opts.stats->block_conditions_filtered_zonemap_ns); + SCOPED_RAW_TIMER(&_opts.stats->generate_row_ranges_by_zonemap_ns); RowRanges zone_map_row_ranges = RowRanges::create_single(num_rows()); // second filter data by zone map for (const auto& cid : cids) { @@ -652,7 +652,7 @@ Status SegmentIterator::_get_row_ranges_from_conditions(RowRanges* condition_row } { - SCOPED_RAW_TIMER(&_opts.stats->block_conditions_filtered_dict_ns); + SCOPED_RAW_TIMER(&_opts.stats->generate_row_ranges_by_dict_ns); /// Low cardinality optimization is currently not very stable, so to prevent data corruption, /// we are temporarily disabling its use in data compaction. 
if (_opts.io_ctx.reader_type == ReaderType::READER_QUERY) { @@ -1400,7 +1400,7 @@ Status SegmentIterator::_vec_init_lazy_materialization() { if (!_is_common_expr_column[cid]) { _non_predicate_columns.push_back(cid); } else { - _second_read_column_ids.push_back(cid); + _non_predicate_column_ids.push_back(cid); } } } @@ -1410,13 +1410,13 @@ Status SegmentIterator::_vec_init_lazy_materialization() { if (_lazy_materialization_read) { // insert pred cid to first_read_columns for (auto cid : pred_column_ids) { - _first_read_column_ids.push_back(cid); + _predicate_column_ids.push_back(cid); } } else if (!_is_need_vec_eval && !_is_need_short_eval && !_is_need_expr_eval) { // no pred exists, just read and output column for (int i = 0; i < _schema->num_column_ids(); i++) { auto cid = _schema->column_id(i); - _first_read_column_ids.push_back(cid); + _predicate_column_ids.push_back(cid); } } else { if (_is_need_vec_eval || _is_need_short_eval) { @@ -1428,18 +1428,18 @@ Status SegmentIterator::_vec_init_lazy_materialization() { _short_cir_pred_column_ids.end()); pred_id_set.insert(_vec_pred_column_ids.begin(), _vec_pred_column_ids.end()); - DCHECK(_second_read_column_ids.empty()); - // _second_read_column_ids must be empty. Otherwise _lazy_materialization_read must not false. + DCHECK(_non_predicate_column_ids.empty()); + // _non_predicate_column_ids must be empty. Otherwise _lazy_materialization_read must not false. for (int i = 0; i < _schema->num_column_ids(); i++) { auto cid = _schema->column_id(i); if (pred_id_set.find(cid) != pred_id_set.end()) { - _first_read_column_ids.push_back(cid); + _predicate_column_ids.push_back(cid); } // In the past, if schema columns > pred columns, the _lazy_materialization_read maybe == false, but // we make sure using _lazy_materialization_read= true now, so these logic may never happens. I comment // these lines and we could delete them in the future to make the code more clear. 
// else if (non_pred_set.find(cid) != non_pred_set.end()) { - // _first_read_column_ids.push_back(cid); + // _predicate_column_ids.push_back(cid); // // when _lazy_materialization_read = false, non-predicate column should also be filtered by sel idx, so we regard it as pred columns // _is_pred_column[cid] = true; // } @@ -1447,7 +1447,7 @@ Status SegmentIterator::_vec_init_lazy_materialization() { } else if (_is_need_expr_eval) { DCHECK(!_is_need_vec_eval && !_is_need_short_eval); for (auto cid : _common_expr_columns) { - _first_read_column_ids.push_back(cid); + _predicate_column_ids.push_back(cid); } } } @@ -1633,7 +1633,7 @@ void SegmentIterator::_output_non_pred_columns(vectorized::Block* block) { * 1. Reads a batch of rowids (up to the specified limit), and checks if they are continuous. * Continuous here means that the rowids form an unbroken sequence (e.g., 1, 2, 3, 4...). * - * 2. For each column that needs to be read (identified by _first_read_column_ids): + * 2. For each column that needs to be read (identified by _predicate_column_ids): * - If the rowids are continuous, the function uses seek_to_ordinal and next_batch * for efficient reading. 
* - If the rowids are not continuous, the function processes them in smaller batches @@ -1646,13 +1646,13 @@ void SegmentIterator::_output_non_pred_columns(vectorized::Block* block) { */ Status SegmentIterator::_read_columns_by_index(uint32_t nrows_read_limit, uint32_t& nrows_read, bool set_block_rowid) { - SCOPED_RAW_TIMER(&_opts.stats->first_read_ns); + SCOPED_RAW_TIMER(&_opts.stats->predicate_column_read_ns); nrows_read = _range_iter->read_batch_rowids(_block_rowids.data(), nrows_read_limit); bool is_continuous = (nrows_read > 1) && (_block_rowids[nrows_read - 1] - _block_rowids[0] == nrows_read - 1); - for (auto cid : _first_read_column_ids) { + for (auto cid : _predicate_column_ids) { auto& column = _current_return_columns[cid]; if (_no_need_read_key_data(cid, column, nrows_read)) { continue; @@ -1677,9 +1677,9 @@ Status SegmentIterator::_read_columns_by_index(uint32_t nrows_read_limit, uint32 if (is_continuous) { size_t rows_read = nrows_read; - _opts.stats->block_first_read_seek_num += 1; + _opts.stats->predicate_column_read_seek_num += 1; if (_opts.runtime_state && _opts.runtime_state->enable_profile()) { - SCOPED_RAW_TIMER(&_opts.stats->block_first_read_seek_ns); + SCOPED_RAW_TIMER(&_opts.stats->predicate_column_read_seek_ns); RETURN_IF_ERROR(_column_iterators[cid]->seek_to_ordinal(_block_rowids[0])); } else { RETURN_IF_ERROR(_column_iterators[cid]->seek_to_ordinal(_block_rowids[0])); @@ -1701,9 +1701,9 @@ Status SegmentIterator::_read_columns_by_index(uint32_t nrows_read_limit, uint32 if (batch_continuous) { size_t rows_read = current_batch_size; - _opts.stats->block_first_read_seek_num += 1; + _opts.stats->predicate_column_read_seek_num += 1; if (_opts.runtime_state && _opts.runtime_state->enable_profile()) { - SCOPED_RAW_TIMER(&_opts.stats->block_first_read_seek_ns); + SCOPED_RAW_TIMER(&_opts.stats->predicate_column_read_seek_ns); RETURN_IF_ERROR( _column_iterators[cid]->seek_to_ordinal(_block_rowids[processed])); } else { @@ -1987,6 +1987,9 @@ Status 
SegmentIterator::_next_batch_internal(vectorized::Block* block) { if (UNLIKELY(!_lazy_inited)) { RETURN_IF_ERROR(_lazy_init()); _lazy_inited = true; + // If the row bitmap size is smaller than block_row_max, there's no need to reserve that many column rows. + auto nrows_reserve_limit = + std::min(_row_bitmap.cardinality(), uint64_t(_opts.block_row_max)); if (_lazy_materialization_read || _opts.record_rowids || _is_need_expr_eval) { _block_rowids.resize(_opts.block_row_max); } @@ -2011,7 +2014,7 @@ Status SegmentIterator::_next_batch_internal(vectorized::Block* block) { storage_column_type->is_nullable(), _opts.io_ctx.reader_type)); _current_return_columns[cid]->set_rowset_segment_id( {_segment->rowset_id(), _segment->id()}); - _current_return_columns[cid]->reserve(_opts.block_row_max); + _current_return_columns[cid]->reserve(nrows_reserve_limit); } else if (i >= block->columns()) { // if i >= block->columns means the column and not the pred_column means `column i` is // a delete condition column. but the column is not effective in the segment. so we just @@ -2022,7 +2025,7 @@ Status SegmentIterator::_next_batch_internal(vectorized::Block* block) { // TODO: skip read the not effective delete column to speed up segment read. _current_return_columns[cid] = Schema::get_data_type_ptr(*column_desc)->create_column(); - _current_return_columns[cid]->reserve(_opts.block_row_max); + _current_return_columns[cid]->reserve(nrows_reserve_limit); } } @@ -2047,7 +2050,8 @@ Status SegmentIterator::_next_batch_internal(vectorized::Block* block) { if (_can_opt_topn_reads()) { nrows_read_limit = std::min(static_cast(_opts.topn_limit), nrows_read_limit); } - + // If the row bitmap size is smaller than nrows_read_limit, there's no need to reserve that many column rows. 
+ nrows_read_limit = std::min(_row_bitmap.cardinality(), uint64_t(nrows_read_limit)); DBUG_EXECUTE_IF("segment_iterator.topn_opt_1", { if (nrows_read_limit != 1) { return Status::Error("topn opt 1 execute failed: {}", @@ -2062,8 +2066,8 @@ Status SegmentIterator::_next_batch_internal(vectorized::Block* block) { RETURN_IF_ERROR(_read_columns_by_index( nrows_read_limit, _current_batch_rows_read, _lazy_materialization_read || _opts.record_rowids || _is_need_expr_eval)); - if (std::find(_first_read_column_ids.begin(), _first_read_column_ids.end(), - _schema->version_col_idx()) != _first_read_column_ids.end()) { + if (std::find(_predicate_column_ids.begin(), _predicate_column_ids.end(), + _schema->version_col_idx()) != _predicate_column_ids.end()) { _replace_version_col(_current_batch_rows_read); } @@ -2088,7 +2092,7 @@ Status SegmentIterator::_next_batch_internal(vectorized::Block* block) { if (_non_predicate_columns.empty()) { return Status::InternalError("_non_predicate_columns is empty"); } - RETURN_IF_ERROR(_convert_to_expected_type(_first_read_column_ids)); + RETURN_IF_ERROR(_convert_to_expected_type(_predicate_column_ids)); RETURN_IF_ERROR(_convert_to_expected_type(_non_predicate_columns)); _output_non_pred_columns(block); } else { @@ -2109,27 +2113,28 @@ Status SegmentIterator::_next_batch_internal(vectorized::Block* block) { if (selected_size > 0) { // step 3.1: output short circuit and predicate column - // when lazy materialization enables, _first_read_column_ids = distinct(_short_cir_pred_column_ids + _vec_pred_column_ids) + // when lazy materialization enables, _predicate_column_ids = distinct(_short_cir_pred_column_ids + _vec_pred_column_ids) // see _vec_init_lazy_materialization // todo(wb) need to tell input columnids from output columnids - RETURN_IF_ERROR(_output_column_by_sel_idx(block, _first_read_column_ids, + RETURN_IF_ERROR(_output_column_by_sel_idx(block, _predicate_column_ids, _sel_rowid_idx.data(), selected_size)); // step 3.2: read remaining 
expr column and evaluate it. if (_is_need_expr_eval) { // The predicate column contains the remaining expr column, no need second read. - if (!_second_read_column_ids.empty()) { - SCOPED_RAW_TIMER(&_opts.stats->second_read_ns); + if (!_non_predicate_column_ids.empty()) { + SCOPED_RAW_TIMER(&_opts.stats->non_predicate_read_ns); RETURN_IF_ERROR(_read_columns_by_rowids( - _second_read_column_ids, _block_rowids, _sel_rowid_idx.data(), + _non_predicate_column_ids, _block_rowids, _sel_rowid_idx.data(), selected_size, &_current_return_columns)); - if (std::find(_second_read_column_ids.begin(), - _second_read_column_ids.end(), _schema->version_col_idx()) != - _second_read_column_ids.end()) { + if (std::find(_non_predicate_column_ids.begin(), + _non_predicate_column_ids.end(), + _schema->version_col_idx()) != + _non_predicate_column_ids.end()) { _replace_version_col(selected_size); } - RETURN_IF_ERROR(_convert_to_expected_type(_second_read_column_ids)); - for (auto cid : _second_read_column_ids) { + RETURN_IF_ERROR(_convert_to_expected_type(_non_predicate_column_ids)); + for (auto cid : _non_predicate_column_ids) { auto loc = _schema_block_id_map[cid]; block->replace_by_position(loc, std::move(_current_return_columns[cid])); @@ -2162,17 +2167,17 @@ Status SegmentIterator::_next_batch_internal(vectorized::Block* block) { } } } else if (_is_need_expr_eval) { - RETURN_IF_ERROR(_convert_to_expected_type(_second_read_column_ids)); - for (auto cid : _second_read_column_ids) { + RETURN_IF_ERROR(_convert_to_expected_type(_non_predicate_column_ids)); + for (auto cid : _non_predicate_column_ids) { auto loc = _schema_block_id_map[cid]; block->replace_by_position(loc, std::move(_current_return_columns[cid])); } } } else if (_is_need_expr_eval) { - DCHECK(!_first_read_column_ids.empty()); - RETURN_IF_ERROR(_convert_to_expected_type(_first_read_column_ids)); + DCHECK(!_predicate_column_ids.empty()); + RETURN_IF_ERROR(_convert_to_expected_type(_predicate_column_ids)); // first read all 
rows are insert block, initialize sel_rowid_idx to all rows. - for (auto cid : _first_read_column_ids) { + for (auto cid : _predicate_column_ids) { auto loc = _schema_block_id_map[cid]; block->replace_by_position(loc, std::move(_current_return_columns[cid])); } diff --git a/be/src/olap/rowset/segment_v2/segment_iterator.h b/be/src/olap/rowset/segment_v2/segment_iterator.h index c2e2139e8ad411..5626d15180c295 100644 --- a/be/src/olap/rowset/segment_v2/segment_iterator.h +++ b/be/src/olap/rowset/segment_v2/segment_iterator.h @@ -431,8 +431,8 @@ class SegmentIterator : public RowwiseIterator { // first, read predicate columns by various index // second, read non-predicate columns // so we need a field to stand for columns first time to read - std::vector _first_read_column_ids; - std::vector _second_read_column_ids; + std::vector _predicate_column_ids; + std::vector _non_predicate_column_ids; std::vector _columns_to_filter; std::vector _converted_column_ids; std::vector _schema_block_id_map; // map from schema column id to column idx in Block diff --git a/be/src/olap/rowset/segment_v2/vertical_segment_writer.cpp b/be/src/olap/rowset/segment_v2/vertical_segment_writer.cpp index 0769dbe86d2a63..4f94189a6212eb 100644 --- a/be/src/olap/rowset/segment_v2/vertical_segment_writer.cpp +++ b/be/src/olap/rowset/segment_v2/vertical_segment_writer.cpp @@ -666,8 +666,10 @@ Status VerticalSegmentWriter::_append_block_with_variant_subcolumns(RowsInBlock& _opts.rowset_ctx->merged_tablet_schema = _opts.rowset_ctx->tablet_schema; } TabletSchemaSPtr update_schema; + bool check_schema_size = true; RETURN_IF_ERROR(vectorized::schema_util::get_least_common_schema( - {_opts.rowset_ctx->merged_tablet_schema, _flush_schema}, nullptr, update_schema)); + {_opts.rowset_ctx->merged_tablet_schema, _flush_schema}, nullptr, update_schema, + check_schema_size)); CHECK_GE(update_schema->num_columns(), _flush_schema->num_columns()) << "Rowset merge schema columns count is " << 
update_schema->num_columns() << ", but flush_schema is larger " << _flush_schema->num_columns() diff --git a/be/src/olap/schema_change.cpp b/be/src/olap/schema_change.cpp index 5ffd9ac2d657c3..2125b508d24f75 100644 --- a/be/src/olap/schema_change.cpp +++ b/be/src/olap/schema_change.cpp @@ -899,7 +899,7 @@ Status SchemaChangeJob::_do_process_alter_tablet(const TAlterTabletReqV2& reques } } std::vector empty_vec; - _new_tablet->delete_rowsets(rowsets_to_delete, false); + RETURN_IF_ERROR(_new_tablet->delete_rowsets(rowsets_to_delete, false)); // inherit cumulative_layer_point from base_tablet // check if new_tablet.ce_point > base_tablet.ce_point? _new_tablet->set_cumulative_layer_point(-1); diff --git a/be/src/olap/segment_loader.cpp b/be/src/olap/segment_loader.cpp index abc82c6f3ee98d..26ac54c699b81a 100644 --- a/be/src/olap/segment_loader.cpp +++ b/be/src/olap/segment_loader.cpp @@ -17,6 +17,8 @@ #include "olap/segment_loader.h" +#include + #include "common/config.h" #include "common/status.h" #include "olap/olap_define.h" @@ -52,7 +54,8 @@ void SegmentCache::erase(const SegmentCache::CacheKey& key) { Status SegmentLoader::load_segments(const BetaRowsetSharedPtr& rowset, SegmentCacheHandle* cache_handle, bool use_cache, - bool need_load_pk_index_and_bf) { + bool need_load_pk_index_and_bf, + OlapReaderStatistics* index_load_stats) { if (cache_handle->is_inited()) { return Status::OK(); } @@ -70,7 +73,7 @@ Status SegmentLoader::load_segments(const BetaRowsetSharedPtr& rowset, segment_v2::SegmentSharedPtr segment; RETURN_IF_ERROR(rowset->load_segment(i, &segment)); if (need_load_pk_index_and_bf) { - RETURN_IF_ERROR(segment->load_pk_index_and_bf()); + RETURN_IF_ERROR(segment->load_pk_index_and_bf(index_load_stats)); } if (use_cache && !config::disable_segment_cache) { // memory of SegmentCache::CacheValue will be handled by SegmentCache diff --git a/be/src/olap/segment_loader.h b/be/src/olap/segment_loader.h index b3b88fa7700409..834906da93bf74 100644 --- 
a/be/src/olap/segment_loader.h +++ b/be/src/olap/segment_loader.h @@ -117,7 +117,8 @@ class SegmentLoader { // Load segments of "rowset", return the "cache_handle" which contains segments. // If use_cache is true, it will be loaded from _cache. Status load_segments(const BetaRowsetSharedPtr& rowset, SegmentCacheHandle* cache_handle, - bool use_cache = false, bool need_load_pk_index_and_bf = false); + bool use_cache = false, bool need_load_pk_index_and_bf = false, + OlapReaderStatistics* index_load_stats = nullptr); void erase_segment(const SegmentCache::CacheKey& key); diff --git a/be/src/olap/storage_engine.cpp b/be/src/olap/storage_engine.cpp index ebf40c90bea35b..e00b5b595e20dc 100644 --- a/be/src/olap/storage_engine.cpp +++ b/be/src/olap/storage_engine.cpp @@ -209,7 +209,7 @@ StorageEngine::StorageEngine(const EngineOptions& options) _txn_manager(new TxnManager(*this, config::txn_map_shard_size, config::txn_shard_size)), _default_rowset_type(BETA_ROWSET), _create_tablet_idx_lru_cache( - new CreateTabletIdxCache(config::partition_disk_index_lru_size)), + new CreateTabletRRIdxCache(config::partition_disk_index_lru_size)), _snapshot_mgr(std::make_unique(*this)) { REGISTER_HOOK_METRIC(unused_rowsets_count, [this]() { // std::lock_guard lock(_gc_mutex); @@ -515,7 +515,7 @@ Status StorageEngine::set_cluster_id(int32_t cluster_id) { int StorageEngine::_get_and_set_next_disk_index(int64 partition_id, TStorageMedium::type storage_medium) { - auto key = CreateTabletIdxCache::get_key(partition_id, storage_medium); + auto key = CreateTabletRRIdxCache::get_key(partition_id, storage_medium); int curr_index = _create_tablet_idx_lru_cache->get_index(key); // -1, lru can't find key if (curr_index == -1) { @@ -1511,7 +1511,7 @@ Status StorageEngine::_persist_broken_paths() { return Status::OK(); } -int CreateTabletIdxCache::get_index(const std::string& key) { +int CreateTabletRRIdxCache::get_index(const std::string& key) { auto* lru_handle = lookup(key); if (lru_handle) { Defer 
release([cache = this, lru_handle] { cache->release(lru_handle); }); @@ -1522,7 +1522,7 @@ int CreateTabletIdxCache::get_index(const std::string& key) { return -1; } -void CreateTabletIdxCache::set_index(const std::string& key, int next_idx) { +void CreateTabletRRIdxCache::set_index(const std::string& key, int next_idx) { assert(next_idx >= 0); auto* value = new CacheValue; value->idx = next_idx; diff --git a/be/src/olap/storage_engine.h b/be/src/olap/storage_engine.h index b2a313adcdbb7e..421c0eb352d712 100644 --- a/be/src/olap/storage_engine.h +++ b/be/src/olap/storage_engine.h @@ -69,7 +69,7 @@ class Thread; class ThreadPool; class TxnManager; class ReportWorker; -class CreateTabletIdxCache; +class CreateTabletRRIdxCache; struct DirInfo; class SnapshotManager; @@ -532,7 +532,7 @@ class StorageEngine final : public BaseStorageEngine { // next index for create tablet std::map _last_use_index; - std::unique_ptr _create_tablet_idx_lru_cache; + std::unique_ptr _create_tablet_idx_lru_cache; std::unique_ptr _snapshot_mgr; }; @@ -540,7 +540,7 @@ class StorageEngine final : public BaseStorageEngine { // lru cache for create tabelt round robin in disks // key: partitionId_medium // value: index -class CreateTabletIdxCache : public LRUCachePolicy { +class CreateTabletRRIdxCache : public LRUCachePolicy { public: // get key, delimiter with DELIMITER '-' static std::string get_key(int64_t partition_id, TStorageMedium::type medium) { @@ -557,10 +557,10 @@ class CreateTabletIdxCache : public LRUCachePolicy { int idx = 0; }; - CreateTabletIdxCache(size_t capacity) + CreateTabletRRIdxCache(size_t capacity) : LRUCachePolicy(CachePolicy::CacheType::CREATE_TABLET_RR_IDX_CACHE, capacity, LRUCacheType::NUMBER, - /*stale_sweep_time_s*/ 30 * 60) {} + /*stale_sweep_time_s*/ 30 * 60, 1) {} }; struct DirInfo { diff --git a/be/src/olap/tablet.cpp b/be/src/olap/tablet.cpp index 51eabe5495ef89..8c874e0cce724a 100644 --- a/be/src/olap/tablet.cpp +++ b/be/src/olap/tablet.cpp @@ -425,7 +425,7 @@ 
Status Tablet::revise_tablet_meta(const std::vector& to_add, // error handling if (!calc_bm_status.ok()) { if (is_incremental_clone) { - delete_rowsets(to_add, false); + RETURN_IF_ERROR(delete_rowsets(to_add, false)); LOG(WARNING) << "incremental clone on tablet: " << tablet_id() << " failed due to " << calc_bm_status.msg() << ", revert " << to_add.size() << " rowsets added before."; @@ -438,7 +438,7 @@ Status Tablet::revise_tablet_meta(const std::vector& to_add, // full clone, calculate delete bitmap succeeded, update rowset if (!is_incremental_clone) { - delete_rowsets(to_delete, false); + RETURN_IF_ERROR(delete_rowsets(to_delete, false)); add_rowsets(to_add); // reconstruct from tablet meta _timestamped_version_tracker.construct_versioned_tracker(_tablet_meta->all_rs_metas()); @@ -611,30 +611,33 @@ void Tablet::add_rowsets(const std::vector& to_add) { _tablet_meta->modify_rs_metas(rs_metas, {}); } -void Tablet::delete_rowsets(const std::vector& to_delete, bool move_to_stale) { +Status Tablet::delete_rowsets(const std::vector& to_delete, bool move_to_stale) { if (to_delete.empty()) { - return; + return Status::OK(); } std::vector rs_metas; rs_metas.reserve(to_delete.size()); - for (auto& rs : to_delete) { + for (const auto& rs : to_delete) { rs_metas.push_back(rs->rowset_meta()); _rs_version_map.erase(rs->version()); } _tablet_meta->modify_rs_metas({}, rs_metas, !move_to_stale); if (move_to_stale) { - for (auto& rs : to_delete) { + for (const auto& rs : to_delete) { _stale_rs_version_map[rs->version()] = rs; } _timestamped_version_tracker.add_stale_path_version(rs_metas); } else { - for (auto& rs : to_delete) { + for (const auto& rs : to_delete) { _timestamped_version_tracker.delete_version(rs->version()); if (rs->is_local()) { _engine.add_unused_rowset(rs); + RETURN_IF_ERROR(RowsetMetaManager::remove(_data_dir->get_meta(), tablet_uid(), + rs->rowset_meta()->rowset_id())); } } } + return Status::OK(); } RowsetSharedPtr Tablet::_rowset_with_largest_size() { @@ 
-2011,7 +2014,7 @@ Status Tablet::_cooldown_data(RowsetSharedPtr rowset) { std::unique_lock meta_wlock(_meta_lock); SCOPED_SIMPLE_TRACE_IF_TIMEOUT(TRACE_TABLET_LOCK_THRESHOLD); if (tablet_state() == TABLET_RUNNING) { - delete_rowsets({std::move(old_rowset)}, false); + RETURN_IF_ERROR(delete_rowsets({std::move(old_rowset)}, false)); add_rowsets({std::move(new_rowset)}); // TODO(plat1ko): process primary key _tablet_meta->set_cooldown_meta_id(cooldown_meta_id); @@ -2230,7 +2233,7 @@ Status Tablet::_follow_cooldowned_data() { to_add.push_back(std::move(rs)); } // Note: We CANNOT call `modify_rowsets` here because `modify_rowsets` cannot process version graph correctly. - delete_rowsets(to_delete, false); + RETURN_IF_ERROR(delete_rowsets(to_delete, false)); add_rowsets(to_add); // TODO(plat1ko): process primary key _tablet_meta->set_cooldown_meta_id(cooldown_meta_pb.cooldown_meta_id()); diff --git a/be/src/olap/tablet.h b/be/src/olap/tablet.h index 33253e82ced2b5..ed927670c27f59 100644 --- a/be/src/olap/tablet.h +++ b/be/src/olap/tablet.h @@ -360,7 +360,7 @@ class Tablet final : public BaseTablet { // MUST hold EXCLUSIVE `_meta_lock` void add_rowsets(const std::vector& to_add); // MUST hold EXCLUSIVE `_meta_lock` - void delete_rowsets(const std::vector& to_delete, bool move_to_stale); + Status delete_rowsets(const std::vector& to_delete, bool move_to_stale); // MUST hold SHARED `_meta_lock` const auto& rowset_map() const { return _rs_version_map; } diff --git a/be/src/olap/tablet_column_object_pool.cpp b/be/src/olap/tablet_column_object_pool.cpp new file mode 100644 index 00000000000000..6e07fb4e831e60 --- /dev/null +++ b/be/src/olap/tablet_column_object_pool.cpp @@ -0,0 +1,57 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "olap/tablet_column_object_pool.h" + +#include +#include + +#include "olap/tablet_schema.h" + +namespace doris { + +bvar::Adder g_tablet_column_cache_count("tablet_column_cache_count"); +bvar::Adder g_tablet_column_cache_hit_count("tablet_column_cache_hit_count"); + +std::pair TabletColumnObjectPool::insert(const std::string& key) { + auto* lru_handle = lookup(key); + TabletColumnPtr tablet_column_ptr; + if (lru_handle) { + auto* value = (CacheValue*)LRUCachePolicy::value(lru_handle); + tablet_column_ptr = value->tablet_column; + VLOG_DEBUG << "reuse column "; + g_tablet_column_cache_hit_count << 1; + } else { + auto* value = new CacheValue; + tablet_column_ptr = std::make_shared(); + ColumnPB pb; + pb.ParseFromString(key); + tablet_column_ptr->init_from_pb(pb); + VLOG_DEBUG << "create column "; + value->tablet_column = tablet_column_ptr; + lru_handle = LRUCachePolicy::insert(key, value, 1, 0, CachePriority::NORMAL); + g_tablet_column_cache_count << 1; + } + DCHECK(lru_handle != nullptr); + return {lru_handle, tablet_column_ptr}; +} + +TabletColumnObjectPool::CacheValue::~CacheValue() { + g_tablet_column_cache_count << -1; +} + +} // namespace doris diff --git a/be/src/olap/tablet_column_object_pool.h b/be/src/olap/tablet_column_object_pool.h new file mode 100644 index 00000000000000..1eead6a25c9609 --- /dev/null +++ b/be/src/olap/tablet_column_object_pool.h @@ -0,0 +1,58 
@@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include "olap/tablet_fwd.h" +#include "olap/tablet_schema.h" +#include "runtime/exec_env.h" +#include "runtime/memory/lru_cache_policy.h" + +namespace doris { + +// TabletColumnObjectPool is a cache for TabletColumn objects. It is used to reduce memory consumption +// when there are a large number of identical TabletColumns in the cluster, which usually occurs +// when VARIANT type columns are modified and added, each Rowset has an individual TabletSchema. +// Excessive TabletSchemas can lead to significant memory overhead. Reusing memory for identical +// TabletColumns would greatly reduce this memory consumption. 
+ +class TabletColumnObjectPool : public LRUCachePolicy { +public: + TabletColumnObjectPool(size_t capacity) + : LRUCachePolicy(CachePolicy::CacheType::TABLET_COLUMN_OBJECT_POOL, capacity, + LRUCacheType::NUMBER, config::tablet_schema_cache_recycle_interval) {} + + static TabletColumnObjectPool* create_global_column_cache(size_t capacity) { + auto* res = new TabletColumnObjectPool(capacity); + return res; + } + + static TabletColumnObjectPool* instance() { + return ExecEnv::GetInstance()->get_tablet_column_object_pool(); + } + + std::pair insert(const std::string& key); + +private: + class CacheValue : public LRUCacheValueBase { + public: + ~CacheValue() override; + TabletColumnPtr tablet_column; + }; +}; + +} // namespace doris diff --git a/be/src/olap/tablet_meta.cpp b/be/src/olap/tablet_meta.cpp index 6123dc6123184a..b5e3accab86b36 100644 --- a/be/src/olap/tablet_meta.cpp +++ b/be/src/olap/tablet_meta.cpp @@ -42,6 +42,7 @@ #include "olap/olap_common.h" #include "olap/olap_define.h" #include "olap/rowset/rowset.h" +#include "olap/rowset/rowset_meta_manager.h" #include "olap/tablet_meta_manager.h" #include "olap/utils.h" #include "util/debug_points.h" diff --git a/be/src/olap/tablet_reader.cpp b/be/src/olap/tablet_reader.cpp index 9ab9e4b1b365f5..7410b70f4aa471 100644 --- a/be/src/olap/tablet_reader.cpp +++ b/be/src/olap/tablet_reader.cpp @@ -254,6 +254,7 @@ Status TabletReader::_capture_rs_readers(const ReaderParams& read_params) { _reader_context.delete_bitmap = read_params.delete_bitmap; _reader_context.enable_unique_key_merge_on_write = tablet()->enable_unique_key_merge_on_write(); _reader_context.record_rowids = read_params.record_rowids; + _reader_context.rowid_conversion = read_params.rowid_conversion; _reader_context.is_key_column_group = read_params.is_key_column_group; _reader_context.remaining_conjunct_roots = read_params.remaining_conjunct_roots; _reader_context.common_expr_ctxs_push_down = read_params.common_expr_ctxs_push_down; diff --git 
a/be/src/olap/tablet_reader.h b/be/src/olap/tablet_reader.h index 50517e047ba556..87af3bb08eb36e 100644 --- a/be/src/olap/tablet_reader.h +++ b/be/src/olap/tablet_reader.h @@ -39,6 +39,7 @@ #include "olap/olap_common.h" #include "olap/olap_tuple.h" #include "olap/row_cursor.h" +#include "olap/rowid_conversion.h" #include "olap/rowset/rowset.h" #include "olap/rowset/rowset_meta.h" #include "olap/rowset/rowset_reader.h" @@ -166,6 +167,7 @@ class TabletReader { // used for compaction to record row ids bool record_rowids = false; + RowIdConversion* rowid_conversion; std::vector topn_filter_source_node_ids; int topn_filter_target_node_id = -1; // used for special optimization for query : ORDER BY key LIMIT n diff --git a/be/src/olap/tablet_schema.cpp b/be/src/olap/tablet_schema.cpp index a9fcad7690cea1..e264c40202924a 100644 --- a/be/src/olap/tablet_schema.cpp +++ b/be/src/olap/tablet_schema.cpp @@ -38,8 +38,10 @@ #include "exec/tablet_info.h" #include "olap/inverted_index_parser.h" #include "olap/olap_define.h" +#include "olap/tablet_column_object_pool.h" #include "olap/types.h" #include "olap/utils.h" +#include "runtime/memory/lru_cache_policy.h" #include "runtime/thread_context.h" #include "tablet_meta.h" #include "vec/aggregate_functions/aggregate_function_simple_factory.h" @@ -749,7 +751,15 @@ void TabletIndex::init_from_thrift(const TOlapTableIndex& index, if (column_idx >= 0) { col_unique_ids[i] = tablet_schema.column(column_idx).unique_id(); } else { - col_unique_ids[i] = -1; + // if column unique id not found by column name, find by column unique id + // column unique id can not bigger than tablet schema column size, if bigger than column size means + // this column is a new column added by light schema change + if (index.__isset.column_unique_ids && + index.column_unique_ids[i] < tablet_schema.num_columns()) { + col_unique_ids[i] = index.column_unique_ids[i]; + } else { + col_unique_ids[i] = -1; + } } } _col_unique_ids = std::move(col_unique_ids); @@ -851,6 
+861,7 @@ TabletSchema::TabletSchema() { TabletSchema::~TabletSchema() { g_total_tablet_schema_num << -1; + clear_column_cache_handlers(); } void TabletSchema::append_column(TabletColumn column, ColumnType col_type) { @@ -940,9 +951,18 @@ void TabletSchema::clear_columns() { _num_null_columns = 0; _num_key_columns = 0; _cols.clear(); + clear_column_cache_handlers(); +} + +void TabletSchema::clear_column_cache_handlers() { + for (auto* cache_handle : _column_cache_handlers) { + TabletColumnObjectPool::instance()->release(cache_handle); + } + _column_cache_handlers.clear(); } -void TabletSchema::init_from_pb(const TabletSchemaPB& schema, bool ignore_extracted_columns) { +void TabletSchema::init_from_pb(const TabletSchemaPB& schema, bool ignore_extracted_columns, + bool reuse_cache_column) { _keys_type = schema.keys_type(); _num_columns = 0; _num_variant_columns = 0; @@ -953,25 +973,34 @@ void TabletSchema::init_from_pb(const TabletSchemaPB& schema, bool ignore_extrac _field_name_to_index.clear(); _field_id_to_index.clear(); _cluster_key_idxes.clear(); + clear_column_cache_handlers(); for (const auto& i : schema.cluster_key_idxes()) { _cluster_key_idxes.push_back(i); } for (auto& column_pb : schema.column()) { - TabletColumn column; - column.init_from_pb(column_pb); - if (ignore_extracted_columns && column.is_extracted_column()) { + TabletColumnPtr column; + if (reuse_cache_column) { + auto pair = TabletColumnObjectPool::instance()->insert( + deterministic_string_serialize(column_pb)); + column = pair.second; + _column_cache_handlers.push_back(pair.first); + } else { + column = std::make_shared(); + column->init_from_pb(column_pb); + } + if (ignore_extracted_columns && column->is_extracted_column()) { continue; } - if (column.is_key()) { + if (column->is_key()) { _num_key_columns++; } - if (column.is_nullable()) { + if (column->is_nullable()) { _num_null_columns++; } - if (column.is_variant_type()) { + if (column->is_variant_type()) { ++_num_variant_columns; } - 
_cols.emplace_back(std::make_shared(std::move(column))); + _cols.emplace_back(std::move(column)); _field_name_to_index.emplace(StringRef(_cols.back()->name()), _num_columns); _field_id_to_index[_cols.back()->unique_id()] = _num_columns; _num_columns++; @@ -1083,6 +1112,7 @@ void TabletSchema::build_current_tablet_schema(int64_t index_id, int32_t version _sequence_col_idx = -1; _version_col_idx = -1; _cluster_key_idxes.clear(); + clear_column_cache_handlers(); for (const auto& i : ori_tablet_schema._cluster_key_idxes) { _cluster_key_idxes.push_back(i); } @@ -1555,13 +1585,4 @@ bool operator!=(const TabletSchema& a, const TabletSchema& b) { return !(a == b); } -std::string TabletSchema::deterministic_string_serialize(const TabletSchemaPB& schema_pb) { - std::string output; - google::protobuf::io::StringOutputStream string_output_stream(&output); - google::protobuf::io::CodedOutputStream output_stream(&string_output_stream); - output_stream.SetSerializationDeterministic(true); - schema_pb.SerializeToCodedStream(&output_stream); - return output; -} - } // namespace doris diff --git a/be/src/olap/tablet_schema.h b/be/src/olap/tablet_schema.h index 1d1d6c9de79d24..e2f90e2716fff9 100644 --- a/be/src/olap/tablet_schema.h +++ b/be/src/olap/tablet_schema.h @@ -39,6 +39,7 @@ #include "olap/rowset/segment_v2/options.h" #include "runtime/define_primitive_type.h" #include "runtime/descriptors.h" +#include "runtime/memory/lru_cache_policy.h" #include "util/string_util.h" #include "vec/aggregate_functions/aggregate_function.h" #include "vec/common/string_ref.h" @@ -296,10 +297,22 @@ class TabletSchema { TabletSchema(); virtual ~TabletSchema(); - void init_from_pb(const TabletSchemaPB& schema, bool ignore_extracted_columns = false); + // Init from pb + // ignore_extracted_columns: ignore the extracted columns from variant column + // reuse_cached_column: reuse the cached column in the schema if they are the same, to reduce memory usage + void init_from_pb(const TabletSchemaPB& 
schema, bool ignore_extracted_columns = false, + bool reuse_cached_column = false); // Notice: Use deterministic way to serialize protobuf, // since serialize Map in protobuf may could lead to un-deterministic by default - static std::string deterministic_string_serialize(const TabletSchemaPB& schema_pb); + template + static std::string deterministic_string_serialize(const PbType& pb) { + std::string output; + google::protobuf::io::StringOutputStream string_output_stream(&output); + google::protobuf::io::CodedOutputStream output_stream(&string_output_stream); + output_stream.SetSerializationDeterministic(true); + pb.SerializeToCodedStream(&output_stream); + return output; + } void to_schema_pb(TabletSchemaPB* tablet_meta_pb) const; void append_column(TabletColumn column, ColumnType col_type = ColumnType::NORMAL); void append_index(TabletIndex index); @@ -501,10 +514,13 @@ class TabletSchema { friend bool operator==(const TabletSchema& a, const TabletSchema& b); friend bool operator!=(const TabletSchema& a, const TabletSchema& b); + void clear_column_cache_handlers(); + KeysType _keys_type = DUP_KEYS; SortType _sort_type = SortType::LEXICAL; size_t _sort_col_num = 0; std::vector _cols; + std::vector _column_cache_handlers; std::vector _indexes; std::unordered_map _field_name_to_index; diff --git a/be/src/olap/tablet_schema_cache.cpp b/be/src/olap/tablet_schema_cache.cpp index e339c947bb97a4..fd238fa5affb3f 100644 --- a/be/src/olap/tablet_schema_cache.cpp +++ b/be/src/olap/tablet_schema_cache.cpp @@ -18,30 +18,45 @@ #include "olap/tablet_schema_cache.h" #include +#include +#include #include "bvar/bvar.h" #include "olap/tablet_schema.h" +#include "util/sha.h" bvar::Adder g_tablet_schema_cache_count("tablet_schema_cache_count"); bvar::Adder g_tablet_schema_cache_columns_count("tablet_schema_cache_columns_count"); +bvar::Adder g_tablet_schema_cache_hit_count("tablet_schema_cache_hit_count"); namespace doris { +// to reduce the memory consumption of the serialized 
TabletSchema as key. +// use sha256 to prevent from hash collision +static std::string get_key_signature(const std::string& origin) { + SHA256Digest digest; + digest.reset(origin.data(), origin.length()); + return std::string {digest.digest().data(), digest.digest().length()}; +} + std::pair TabletSchemaCache::insert(const std::string& key) { - auto* lru_handle = lookup(key); + std::string key_signature = get_key_signature(key); + auto* lru_handle = lookup(key_signature); TabletSchemaSPtr tablet_schema_ptr; if (lru_handle) { auto* value = (CacheValue*)LRUCachePolicy::value(lru_handle); tablet_schema_ptr = value->tablet_schema; + g_tablet_schema_cache_hit_count << 1; } else { auto* value = new CacheValue; tablet_schema_ptr = std::make_shared(); TabletSchemaPB pb; pb.ParseFromString(key); - tablet_schema_ptr->init_from_pb(pb); + // We should reuse the memory of the same TabletColumn object, set reuse_cached_column to true + tablet_schema_ptr->init_from_pb(pb, false, true); value->tablet_schema = tablet_schema_ptr; - lru_handle = LRUCachePolicy::insert(key, value, tablet_schema_ptr->num_columns(), 0, - CachePriority::NORMAL); + lru_handle = LRUCachePolicy::insert(key_signature, value, tablet_schema_ptr->num_columns(), + 0, CachePriority::NORMAL); g_tablet_schema_cache_count << 1; g_tablet_schema_cache_columns_count << tablet_schema_ptr->num_columns(); } diff --git a/be/src/olap/task/index_builder.cpp b/be/src/olap/task/index_builder.cpp index 38a52d1d2118aa..ea7ca76551b9be 100644 --- a/be/src/olap/task/index_builder.cpp +++ b/be/src/olap/task/index_builder.cpp @@ -363,10 +363,16 @@ Status IndexBuilder::handle_single_rowset(RowsetMetaSharedPtr output_rowset_meta auto column_name = inverted_index.columns[0]; auto column_idx = output_rowset_schema->field_index(column_name); if (column_idx < 0) { - LOG(WARNING) << "referenced column was missing. 
" - << "[column=" << column_name << " referenced_column=" << column_idx - << "]"; - continue; + if (!inverted_index.column_unique_ids.empty()) { + column_idx = output_rowset_schema->field_index( + inverted_index.column_unique_ids[0]); + } + if (column_idx < 0) { + LOG(WARNING) << "referenced column was missing. " + << "[column=" << column_name + << " referenced_column=" << column_idx << "]"; + continue; + } } auto column = output_rowset_schema->column(column_idx); if (!InvertedIndexColumnWriter::check_support_inverted_index(column)) { diff --git a/be/src/olap/txn_manager.cpp b/be/src/olap/txn_manager.cpp index 1dd2d52f33b8ac..54436668c85c03 100644 --- a/be/src/olap/txn_manager.cpp +++ b/be/src/olap/txn_manager.cpp @@ -89,6 +89,15 @@ TxnManager::TxnManager(StorageEngine& engine, int32_t txn_map_shard_size, int32_ Status TxnManager::prepare_txn(TPartitionId partition_id, const Tablet& tablet, TTransactionId transaction_id, const PUniqueId& load_id, bool ingest) { + // check if the tablet has already been shutdown. If it has, it indicates that + // it is an old tablet, and data should not be imported into the old tablet. + // Otherwise, it may lead to data loss during migration. + if (tablet.tablet_state() == TABLET_SHUTDOWN) { + return Status::InternalError( + "The tablet's state is shutdown, tablet_id: {}. The tablet may have been dropped " + "or migrationed. 
Please check if the table has been dropped or try again.", + tablet.tablet_id()); + } return prepare_txn(partition_id, transaction_id, tablet.tablet_id(), tablet.tablet_uid(), load_id, ingest); } diff --git a/be/src/pipeline/dependency.h b/be/src/pipeline/dependency.h index d0c16a3ff5a192..619dd2d2aa3c4d 100644 --- a/be/src/pipeline/dependency.h +++ b/be/src/pipeline/dependency.h @@ -35,6 +35,7 @@ #include "pipeline/exec/join/process_hash_table_probe.h" #include "vec/common/sort/partition_sorter.h" #include "vec/common/sort/sorter.h" +#include "vec/core/block.h" #include "vec/core/types.h" #include "vec/spill/spill_stream.h" @@ -109,19 +110,19 @@ class Dependency : public std::enable_shared_from_this { // Notify downstream pipeline tasks this dependency is ready. void set_ready(); void set_ready_to_read() { - DCHECK(_shared_state->source_deps.size() == 1) << debug_string(); + DCHECK_EQ(_shared_state->source_deps.size(), 1) << debug_string(); _shared_state->source_deps.front()->set_ready(); } void set_block_to_read() { - DCHECK(_shared_state->source_deps.size() == 1) << debug_string(); + DCHECK_EQ(_shared_state->source_deps.size(), 1) << debug_string(); _shared_state->source_deps.front()->block(); } void set_ready_to_write() { - DCHECK(_shared_state->sink_deps.size() == 1) << debug_string(); + DCHECK_EQ(_shared_state->sink_deps.size(), 1) << debug_string(); _shared_state->sink_deps.front()->set_ready(); } void set_block_to_write() { - DCHECK(_shared_state->sink_deps.size() == 1) << debug_string(); + DCHECK_EQ(_shared_state->sink_deps.size(), 1) << debug_string(); _shared_state->sink_deps.front()->block(); } @@ -541,6 +542,12 @@ struct UnionSharedState : public BasicSharedState { const int _child_count; }; +struct CacheSharedState : public BasicSharedState { + ENABLE_FACTORY_CREATOR(CacheSharedState) +public: + DataQueue data_queue; +}; + class MultiCastDataStreamer; struct MultiCastSharedState : public BasicSharedState { diff --git 
a/be/src/pipeline/exec/aggregation_sink_operator.cpp b/be/src/pipeline/exec/aggregation_sink_operator.cpp index 260a599a947a0d..83d566fac9a28c 100644 --- a/be/src/pipeline/exec/aggregation_sink_operator.cpp +++ b/be/src/pipeline/exec/aggregation_sink_operator.cpp @@ -63,17 +63,13 @@ Status AggSinkLocalState::init(RuntimeState* state, LocalSinkStateInfo& info) { "SerializeKeyArena", TUnit::BYTES, "MemoryUsage", 1); _build_timer = ADD_TIMER(Base::profile(), "BuildTime"); - _serialize_key_timer = ADD_TIMER(Base::profile(), "SerializeKeyTime"); - _exec_timer = ADD_TIMER(Base::profile(), "ExecTime"); _merge_timer = ADD_TIMER(Base::profile(), "MergeTime"); _expr_timer = ADD_TIMER(Base::profile(), "ExprTime"); - _serialize_data_timer = ADD_TIMER(Base::profile(), "SerializeDataTime"); _deserialize_data_timer = ADD_TIMER(Base::profile(), "DeserializeAndMergeTime"); _hash_table_compute_timer = ADD_TIMER(Base::profile(), "HashTableComputeTime"); _hash_table_limit_compute_timer = ADD_TIMER(Base::profile(), "DoLimitComputeTime"); _hash_table_emplace_timer = ADD_TIMER(Base::profile(), "HashTableEmplaceTime"); _hash_table_input_counter = ADD_COUNTER(Base::profile(), "HashTableInputCount", TUnit::UNIT); - _max_row_size_counter = ADD_COUNTER(Base::profile(), "MaxRowSizeInBytes", TUnit::UNIT); return Status::OK(); } @@ -725,7 +721,10 @@ AggSinkOperatorX::AggSinkOperatorX(ObjectPool* pool, int operator_id, const TPla : tnode.agg_node.grouping_exprs), _is_colocate(tnode.agg_node.__isset.is_colocate && tnode.agg_node.is_colocate), _require_bucket_distribution(require_bucket_distribution), - _agg_fn_output_row_descriptor(descs, tnode.row_tuples, tnode.nullable_tuples) {} + _agg_fn_output_row_descriptor(descs, tnode.row_tuples, tnode.nullable_tuples), + _without_key(tnode.agg_node.grouping_exprs.empty()) { + _is_serial_operator = tnode.__isset.is_serial_operator && tnode.is_serial_operator; +} Status AggSinkOperatorX::init(const TPlanNode& tnode, RuntimeState* state) { 
RETURN_IF_ERROR(DataSinkOperatorX::init(tnode, state)); diff --git a/be/src/pipeline/exec/aggregation_sink_operator.h b/be/src/pipeline/exec/aggregation_sink_operator.h index 7c146c38a2b135..975b04477f203f 100644 --- a/be/src/pipeline/exec/aggregation_sink_operator.h +++ b/be/src/pipeline/exec/aggregation_sink_operator.h @@ -102,11 +102,8 @@ class AggSinkLocalState : public PipelineXSinkLocalState { RuntimeProfile::Counter* _hash_table_input_counter = nullptr; RuntimeProfile::Counter* _build_timer = nullptr; RuntimeProfile::Counter* _expr_timer = nullptr; - RuntimeProfile::Counter* _serialize_key_timer = nullptr; RuntimeProfile::Counter* _merge_timer = nullptr; - RuntimeProfile::Counter* _serialize_data_timer = nullptr; RuntimeProfile::Counter* _deserialize_data_timer = nullptr; - RuntimeProfile::Counter* _max_row_size_counter = nullptr; RuntimeProfile::Counter* _hash_table_memory_usage = nullptr; RuntimeProfile::Counter* _hash_table_size_counter = nullptr; RuntimeProfile::HighWaterMarkCounter* _serialize_key_arena_memory_usage = nullptr; @@ -143,9 +140,8 @@ class AggSinkOperatorX final : public DataSinkOperatorX { DataDistribution required_data_distribution() const override { if (_probe_expr_ctxs.empty()) { - return _needs_finalize || DataSinkOperatorX::_child - ->ignore_data_distribution() - ? DataDistribution(ExchangeType::PASSTHROUGH) + return _needs_finalize + ? 
DataDistribution(ExchangeType::NOOP) : DataSinkOperatorX::required_data_distribution(); } return _is_colocate && _require_bucket_distribution && !_followed_by_shuffled_operator @@ -153,7 +149,6 @@ class AggSinkOperatorX final : public DataSinkOperatorX { : DataDistribution(ExchangeType::HASH_SHUFFLE, _partition_exprs); } bool require_data_distribution() const override { return _is_colocate; } - bool require_shuffled_data_distribution() const override { return !_probe_expr_ctxs.empty(); } size_t get_revocable_mem_size(RuntimeState* state) const; AggregatedDataVariants* get_agg_data(RuntimeState* state) { @@ -204,8 +199,8 @@ class AggSinkOperatorX final : public DataSinkOperatorX { const std::vector _partition_exprs; const bool _is_colocate; const bool _require_bucket_distribution; - RowDescriptor _agg_fn_output_row_descriptor; + const bool _without_key; }; } // namespace doris::pipeline diff --git a/be/src/pipeline/exec/aggregation_source_operator.cpp b/be/src/pipeline/exec/aggregation_source_operator.cpp index fe03eba4102955..a406bdc329ef50 100644 --- a/be/src/pipeline/exec/aggregation_source_operator.cpp +++ b/be/src/pipeline/exec/aggregation_source_operator.cpp @@ -29,20 +29,18 @@ namespace doris::pipeline { AggLocalState::AggLocalState(RuntimeState* state, OperatorXBase* parent) : Base(state, parent), _get_results_timer(nullptr), - _serialize_result_timer(nullptr), _hash_table_iterate_timer(nullptr), _insert_keys_to_column_timer(nullptr), - _serialize_data_timer(nullptr) {} + _insert_values_to_column_timer(nullptr) {} Status AggLocalState::init(RuntimeState* state, LocalStateInfo& info) { RETURN_IF_ERROR(Base::init(state, info)); SCOPED_TIMER(exec_time_counter()); SCOPED_TIMER(_init_timer); _get_results_timer = ADD_TIMER(profile(), "GetResultsTime"); - _serialize_result_timer = ADD_TIMER(profile(), "SerializeResultTime"); _hash_table_iterate_timer = ADD_TIMER(profile(), "HashTableIterateTime"); _insert_keys_to_column_timer = ADD_TIMER(profile(), 
"InsertKeysToColumnTime"); - _serialize_data_timer = ADD_TIMER(profile(), "SerializeDataTime"); + _insert_values_to_column_timer = ADD_TIMER(profile(), "InsertValuesToColumnTime"); _merge_timer = ADD_TIMER(Base::profile(), "MergeTime"); _deserialize_data_timer = ADD_TIMER(Base::profile(), "DeserializeAndMergeTime"); @@ -57,7 +55,7 @@ Status AggLocalState::init(RuntimeState* state, LocalStateInfo& info) { std::placeholders::_1, std::placeholders::_2, std::placeholders::_3); } else { - _executor.get_result = std::bind(&AggLocalState::_serialize_without_key, this, + _executor.get_result = std::bind(&AggLocalState::_get_results_without_key, this, std::placeholders::_1, std::placeholders::_2, std::placeholders::_3); } @@ -68,8 +66,8 @@ Status AggLocalState::init(RuntimeState* state, LocalStateInfo& info) { std::placeholders::_2, std::placeholders::_3); } else { _executor.get_result = std::bind( - &AggLocalState::_serialize_with_serialized_key_result, this, - std::placeholders::_1, std::placeholders::_2, std::placeholders::_3); + &AggLocalState::_get_results_with_serialized_key, this, std::placeholders::_1, + std::placeholders::_2, std::placeholders::_3); } } @@ -93,18 +91,9 @@ Status AggLocalState::_create_agg_status(vectorized::AggregateDataPtr data) { return Status::OK(); } -Status AggLocalState::_destroy_agg_status(vectorized::AggregateDataPtr data) { - auto& shared_state = *Base::_shared_state; - for (int i = 0; i < shared_state.aggregate_evaluators.size(); ++i) { - shared_state.aggregate_evaluators[i]->function()->destroy( - data + shared_state.offsets_of_aggregate_states[i]); - } - return Status::OK(); -} - -Status AggLocalState::_serialize_with_serialized_key_result(RuntimeState* state, - vectorized::Block* block, bool* eos) { - SCOPED_TIMER(_serialize_result_timer); +Status AggLocalState::_get_results_with_serialized_key(RuntimeState* state, + vectorized::Block* block, bool* eos) { + SCOPED_TIMER(_get_results_timer); auto& shared_state = *_shared_state; int 
key_size = _shared_state->probe_expr_ctxs.size(); int agg_size = _shared_state->aggregate_evaluators.size(); @@ -124,7 +113,6 @@ Status AggLocalState::_serialize_with_serialized_key_result(RuntimeState* state, } } - SCOPED_TIMER(_get_results_timer); std::visit( vectorized::Overload { [&](std::monostate& arg) -> void { @@ -180,7 +168,7 @@ Status AggLocalState::_serialize_with_serialized_key_result(RuntimeState* state, } { - SCOPED_TIMER(_serialize_data_timer); + SCOPED_TIMER(_insert_values_to_column_timer); for (size_t i = 0; i < shared_state.aggregate_evaluators.size(); ++i) { value_data_types[i] = shared_state.aggregate_evaluators[i] ->function() @@ -332,13 +320,13 @@ Status AggLocalState::_get_with_serialized_key_result(RuntimeState* state, vecto return Status::OK(); } -Status AggLocalState::_serialize_without_key(RuntimeState* state, vectorized::Block* block, - bool* eos) { +Status AggLocalState::_get_results_without_key(RuntimeState* state, vectorized::Block* block, + bool* eos) { + SCOPED_TIMER(_get_results_timer); auto& shared_state = *_shared_state; // 1. 
`child(0)->rows_returned() == 0` mean not data from child // in level two aggregation node should return NULL result // level one aggregation node set `eos = true` return directly - SCOPED_TIMER(_serialize_result_timer); if (UNLIKELY(_shared_state->input_num_rows == 0)) { *eos = true; return Status::OK(); @@ -440,7 +428,9 @@ AggSourceOperatorX::AggSourceOperatorX(ObjectPool* pool, const TPlanNode& tnode, const DescriptorTbl& descs) : Base(pool, tnode, operator_id, descs), _needs_finalize(tnode.agg_node.need_finalize), - _without_key(tnode.agg_node.grouping_exprs.empty()) {} + _without_key(tnode.agg_node.grouping_exprs.empty()) { + _is_serial_operator = tnode.__isset.is_serial_operator && tnode.is_serial_operator; +} Status AggSourceOperatorX::get_block(RuntimeState* state, vectorized::Block* block, bool* eos) { auto& local_state = get_local_state(state); @@ -575,17 +565,6 @@ template Status AggSourceOperatorX::merge_with_serialized_key_helper( template Status AggSourceOperatorX::merge_with_serialized_key_helper( RuntimeState* state, vectorized::Block* block); -size_t AggLocalState::_get_hash_table_size() { - return std::visit( - vectorized::Overload {[&](std::monostate& arg) -> size_t { - throw doris::Exception(ErrorCode::INTERNAL_ERROR, - "uninited hash table"); - return 0; - }, - [&](auto& agg_method) { return agg_method.hash_table->size(); }}, - _shared_state->agg_data->method_variant); -} - void AggLocalState::_emplace_into_hash_table(vectorized::AggregateDataPtr* places, vectorized::ColumnRawPtrs& key_columns, size_t num_rows) { diff --git a/be/src/pipeline/exec/aggregation_source_operator.h b/be/src/pipeline/exec/aggregation_source_operator.h index a3824a381eb49c..4bb03670bc8c59 100644 --- a/be/src/pipeline/exec/aggregation_source_operator.h +++ b/be/src/pipeline/exec/aggregation_source_operator.h @@ -47,13 +47,12 @@ class AggLocalState final : public PipelineXLocalState { friend class AggSourceOperatorX; Status _get_without_key_result(RuntimeState* state, 
vectorized::Block* block, bool* eos); - Status _serialize_without_key(RuntimeState* state, vectorized::Block* block, bool* eos); + Status _get_results_without_key(RuntimeState* state, vectorized::Block* block, bool* eos); Status _get_with_serialized_key_result(RuntimeState* state, vectorized::Block* block, bool* eos); - Status _serialize_with_serialized_key_result(RuntimeState* state, vectorized::Block* block, - bool* eos); + Status _get_results_with_serialized_key(RuntimeState* state, vectorized::Block* block, + bool* eos); Status _create_agg_status(vectorized::AggregateDataPtr data); - Status _destroy_agg_status(vectorized::AggregateDataPtr data); void _make_nullable_output_key(vectorized::Block* block) { if (block->rows() != 0) { auto& shared_state = *Base ::_shared_state; @@ -68,16 +67,14 @@ class AggLocalState final : public PipelineXLocalState { vectorized::ColumnRawPtrs& key_columns, size_t num_rows); void _emplace_into_hash_table(vectorized::AggregateDataPtr* places, vectorized::ColumnRawPtrs& key_columns, size_t num_rows); - size_t _get_hash_table_size(); vectorized::PODArray _places; std::vector _deserialize_buffer; RuntimeProfile::Counter* _get_results_timer = nullptr; - RuntimeProfile::Counter* _serialize_result_timer = nullptr; RuntimeProfile::Counter* _hash_table_iterate_timer = nullptr; RuntimeProfile::Counter* _insert_keys_to_column_timer = nullptr; - RuntimeProfile::Counter* _serialize_data_timer = nullptr; + RuntimeProfile::Counter* _insert_values_to_column_timer = nullptr; RuntimeProfile::Counter* _hash_table_compute_timer = nullptr; RuntimeProfile::Counter* _hash_table_emplace_timer = nullptr; diff --git a/be/src/pipeline/exec/analytic_sink_operator.cpp b/be/src/pipeline/exec/analytic_sink_operator.cpp index 85d7773bdbd025..839a485f2d98c7 100644 --- a/be/src/pipeline/exec/analytic_sink_operator.cpp +++ b/be/src/pipeline/exec/analytic_sink_operator.cpp @@ -29,9 +29,10 @@ Status AnalyticSinkLocalState::init(RuntimeState* state, LocalSinkStateInfo& 
inf RETURN_IF_ERROR(PipelineXSinkLocalState::init(state, info)); SCOPED_TIMER(exec_time_counter()); SCOPED_TIMER(_init_timer); - _blocks_memory_usage = - _profile->AddHighWaterMarkCounter("Blocks", TUnit::BYTES, "MemoryUsage", 1); - _evaluation_timer = ADD_TIMER(profile(), "EvaluationTime"); + _evaluation_timer = ADD_TIMER(profile(), "GetPartitionBoundTime"); + _compute_agg_data_timer = ADD_TIMER(profile(), "ComputeAggDataTime"); + _compute_partition_by_timer = ADD_TIMER(profile(), "ComputePartitionByTime"); + _compute_order_by_timer = ADD_TIMER(profile(), "ComputeOrderByTime"); return Status::OK(); } @@ -201,7 +202,9 @@ AnalyticSinkOperatorX::AnalyticSinkOperatorX(ObjectPool* pool, int operator_id, _require_bucket_distribution(require_bucket_distribution), _partition_exprs(tnode.__isset.distribute_expr_lists && require_bucket_distribution ? tnode.distribute_expr_lists[0] - : tnode.analytic_node.partition_exprs) {} + : tnode.analytic_node.partition_exprs) { + _is_serial_operator = tnode.__isset.is_serial_operator && tnode.is_serial_operator; +} Status AnalyticSinkOperatorX::init(const TPlanNode& tnode, RuntimeState* state) { RETURN_IF_ERROR(DataSinkOperatorX::init(tnode, state)); @@ -286,33 +289,41 @@ Status AnalyticSinkOperatorX::sink(doris::RuntimeState* state, vectorized::Block } } - for (size_t i = 0; i < _agg_functions_size; - ++i) { //insert _agg_input_columns, execute calculate for its - for (size_t j = 0; j < local_state._agg_expr_ctxs[i].size(); ++j) { - RETURN_IF_ERROR(_insert_range_column( - input_block, local_state._agg_expr_ctxs[i][j], - local_state._shared_state->agg_input_columns[i][j].get(), block_rows)); + { + SCOPED_TIMER(local_state._compute_agg_data_timer); + for (size_t i = 0; i < _agg_functions_size; + ++i) { //insert _agg_input_columns, execute calculate for its + for (size_t j = 0; j < local_state._agg_expr_ctxs[i].size(); ++j) { + RETURN_IF_ERROR(_insert_range_column( + input_block, local_state._agg_expr_ctxs[i][j], + 
local_state._shared_state->agg_input_columns[i][j].get(), block_rows)); + } } } - //record column idx in block - for (size_t i = 0; i < local_state._shared_state->partition_by_eq_expr_ctxs.size(); ++i) { - int result_col_id = -1; - RETURN_IF_ERROR(local_state._shared_state->partition_by_eq_expr_ctxs[i]->execute( - input_block, &result_col_id)); - DCHECK_GE(result_col_id, 0); - local_state._shared_state->partition_by_column_idxs[i] = result_col_id; + { + SCOPED_TIMER(local_state._compute_partition_by_timer); + for (size_t i = 0; i < local_state._shared_state->partition_by_eq_expr_ctxs.size(); ++i) { + int result_col_id = -1; + RETURN_IF_ERROR(local_state._shared_state->partition_by_eq_expr_ctxs[i]->execute( + input_block, &result_col_id)); + DCHECK_GE(result_col_id, 0); + local_state._shared_state->partition_by_column_idxs[i] = result_col_id; + } } - for (size_t i = 0; i < local_state._shared_state->order_by_eq_expr_ctxs.size(); ++i) { - int result_col_id = -1; - RETURN_IF_ERROR(local_state._shared_state->order_by_eq_expr_ctxs[i]->execute( - input_block, &result_col_id)); - DCHECK_GE(result_col_id, 0); - local_state._shared_state->ordey_by_column_idxs[i] = result_col_id; + { + SCOPED_TIMER(local_state._compute_order_by_timer); + for (size_t i = 0; i < local_state._shared_state->order_by_eq_expr_ctxs.size(); ++i) { + int result_col_id = -1; + RETURN_IF_ERROR(local_state._shared_state->order_by_eq_expr_ctxs[i]->execute( + input_block, &result_col_id)); + DCHECK_GE(result_col_id, 0); + local_state._shared_state->ordey_by_column_idxs[i] = result_col_id; + } } - local_state.mem_tracker()->consume(input_block->allocated_bytes()); - local_state._blocks_memory_usage->add(input_block->allocated_bytes()); + COUNTER_UPDATE(local_state._memory_used_counter, input_block->allocated_bytes()); + COUNTER_SET(local_state._peak_memory_usage_counter, local_state._memory_used_counter->value()); //TODO: if need improvement, the is a tips to maintain a free queue, //so the memory could 
reuse, no need to new/delete again; diff --git a/be/src/pipeline/exec/analytic_sink_operator.h b/be/src/pipeline/exec/analytic_sink_operator.h index ee305a877f55e1..084998d2c36cdc 100644 --- a/be/src/pipeline/exec/analytic_sink_operator.h +++ b/be/src/pipeline/exec/analytic_sink_operator.h @@ -58,7 +58,9 @@ class AnalyticSinkLocalState : public PipelineXSinkLocalState _agg_expr_ctxs; }; @@ -88,9 +90,6 @@ class AnalyticSinkOperatorX final : public DataSinkOperatorXAddHighWaterMarkCounter("Blocks", TUnit::BYTES, "MemoryUsage", 1); - _evaluation_timer = ADD_TIMER(profile(), "EvaluationTime"); + profile()->AddHighWaterMarkCounter("MemoryUsageBlocks", TUnit::BYTES, "", 1); + _evaluation_timer = ADD_TIMER(profile(), "GetPartitionBoundTime"); + _execute_timer = ADD_TIMER(profile(), "ExecuteTime"); + _get_next_timer = ADD_TIMER(profile(), "GetNextTime"); + _get_result_timer = ADD_TIMER(profile(), "GetResultsTime"); return Status::OK(); } @@ -232,12 +235,6 @@ Status AnalyticLocalState::open(RuntimeState* state) { std::placeholders::_1); } } - _executor.insert_result = - std::bind(&AnalyticLocalState::_insert_result_info, this, std::placeholders::_1); - _executor.execute = - std::bind(&AnalyticLocalState::_execute_for_win_func, this, std::placeholders::_1, - std::placeholders::_2, std::placeholders::_3, std::placeholders::_4); - _create_agg_status(); return Status::OK(); } @@ -281,6 +278,7 @@ void AnalyticLocalState::_destroy_agg_status() { void AnalyticLocalState::_execute_for_win_func(int64_t partition_start, int64_t partition_end, int64_t frame_start, int64_t frame_end) { + SCOPED_TIMER(_execute_timer); for (size_t i = 0; i < _agg_functions_size; ++i) { std::vector agg_columns; for (int j = 0; j < _shared_state->agg_input_columns[i].size(); ++j) { @@ -299,6 +297,7 @@ void AnalyticLocalState::_execute_for_win_func(int64_t partition_start, int64_t } void AnalyticLocalState::_insert_result_info(int64_t current_block_rows) { + SCOPED_TIMER(_get_result_timer); int64_t 
current_block_row_pos = _shared_state->input_block_first_row_positions[_output_block_index]; int64_t get_result_start = _shared_state->current_row_position - current_block_row_pos; @@ -343,6 +342,7 @@ void AnalyticLocalState::_insert_result_info(int64_t current_block_rows) { } Status AnalyticLocalState::_get_next_for_rows(size_t current_block_rows) { + SCOPED_TIMER(_get_next_timer); while (_shared_state->current_row_position < _shared_state->partition_by_end.pos && _window_end_position < current_block_rows) { int64_t range_start, range_end; @@ -366,31 +366,33 @@ Status AnalyticLocalState::_get_next_for_rows(size_t current_block_rows) { // Make sure range_start <= range_end range_start = std::min(range_start, range_end); } - _executor.execute(_partition_by_start.pos, _shared_state->partition_by_end.pos, range_start, - range_end); - _executor.insert_result(current_block_rows); + _execute_for_win_func(_partition_by_start.pos, _shared_state->partition_by_end.pos, + range_start, range_end); + _insert_result_info(current_block_rows); } return Status::OK(); } Status AnalyticLocalState::_get_next_for_partition(size_t current_block_rows) { + SCOPED_TIMER(_get_next_timer); if (_next_partition) { - _executor.execute(_partition_by_start.pos, _shared_state->partition_by_end.pos, - _partition_by_start.pos, _shared_state->partition_by_end.pos); + _execute_for_win_func(_partition_by_start.pos, _shared_state->partition_by_end.pos, + _partition_by_start.pos, _shared_state->partition_by_end.pos); } - _executor.insert_result(current_block_rows); + _insert_result_info(current_block_rows); return Status::OK(); } Status AnalyticLocalState::_get_next_for_range(size_t current_block_rows) { + SCOPED_TIMER(_get_next_timer); while (_shared_state->current_row_position < _shared_state->partition_by_end.pos && _window_end_position < current_block_rows) { if (_shared_state->current_row_position >= _order_by_end.pos) { _update_order_by_range(); - _executor.execute(_partition_by_start.pos, 
_shared_state->partition_by_end.pos, - _order_by_start.pos, _order_by_end.pos); + _execute_for_win_func(_partition_by_start.pos, _shared_state->partition_by_end.pos, + _order_by_start.pos, _order_by_end.pos); } - _executor.insert_result(current_block_rows); + _insert_result_info(current_block_rows); } return Status::OK(); } @@ -475,6 +477,7 @@ AnalyticSourceOperatorX::AnalyticSourceOperatorX(ObjectPool* pool, const TPlanNo _has_range_window(tnode.analytic_node.window.type == TAnalyticWindowType::RANGE), _has_window_start(tnode.analytic_node.window.__isset.window_start), _has_window_end(tnode.analytic_node.window.__isset.window_end) { + _is_serial_operator = tnode.__isset.is_serial_operator && tnode.is_serial_operator; _fn_scope = AnalyticFnScope::PARTITION; if (tnode.analytic_node.__isset.window && tnode.analytic_node.window.type == TAnalyticWindowType::RANGE) { @@ -535,7 +538,7 @@ Status AnalyticSourceOperatorX::get_block(RuntimeState* state, vectorized::Block local_state.init_result_columns(); size_t current_block_rows = local_state._shared_state->input_blocks[local_state._output_block_index].rows(); - static_cast(local_state._executor.get_next(current_block_rows)); + RETURN_IF_ERROR(local_state._executor.get_next(current_block_rows)); if (local_state._window_end_position == current_block_rows) { break; } diff --git a/be/src/pipeline/exec/analytic_source_operator.h b/be/src/pipeline/exec/analytic_source_operator.h index 38323f1b86bce2..fa7a676f9c43e9 100644 --- a/be/src/pipeline/exec/analytic_source_operator.h +++ b/be/src/pipeline/exec/analytic_source_operator.h @@ -96,17 +96,15 @@ class AnalyticLocalState final : public PipelineXLocalState std::vector _agg_functions; RuntimeProfile::Counter* _evaluation_timer = nullptr; + RuntimeProfile::Counter* _execute_timer = nullptr; + RuntimeProfile::Counter* _get_next_timer = nullptr; + RuntimeProfile::Counter* _get_result_timer = nullptr; RuntimeProfile::HighWaterMarkCounter* _blocks_memory_usage = nullptr; - using 
vectorized_execute = std::function; using vectorized_get_next = std::function; - using vectorized_get_result = std::function; struct executor { - vectorized_execute execute; vectorized_get_next get_next; - vectorized_get_result insert_result; }; executor _executor; diff --git a/be/src/pipeline/exec/assert_num_rows_operator.cpp b/be/src/pipeline/exec/assert_num_rows_operator.cpp index 5aa27b51c45095..563c4bf49ca41c 100644 --- a/be/src/pipeline/exec/assert_num_rows_operator.cpp +++ b/be/src/pipeline/exec/assert_num_rows_operator.cpp @@ -27,6 +27,7 @@ AssertNumRowsOperatorX::AssertNumRowsOperatorX(ObjectPool* pool, const TPlanNode : StreamingOperatorX(pool, tnode, operator_id, descs), _desired_num_rows(tnode.assert_num_rows_node.desired_num_rows), _subquery_string(tnode.assert_num_rows_node.subquery_string) { + _is_serial_operator = true; if (tnode.assert_num_rows_node.__isset.assertion) { _assertion = tnode.assert_num_rows_node.assertion; } else { diff --git a/be/src/pipeline/exec/cache_sink_operator.cpp b/be/src/pipeline/exec/cache_sink_operator.cpp new file mode 100644 index 00000000000000..b8b5b5346591c8 --- /dev/null +++ b/be/src/pipeline/exec/cache_sink_operator.cpp @@ -0,0 +1,73 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "cache_sink_operator.h" + +#include + +#include "common/compiler_util.h" // IWYU pragma: keep +#include "common/status.h" +#include "pipeline/exec/data_queue.h" +#include "pipeline/exec/operator.h" +#include "runtime/runtime_state.h" +#include "util/runtime_profile.h" + +namespace doris::pipeline { + +Status CacheSinkLocalState::init(RuntimeState* state, LocalSinkStateInfo& info) { + RETURN_IF_ERROR(Base::init(state, info)); + SCOPED_TIMER(exec_time_counter()); + SCOPED_TIMER(_init_timer); + _shared_state->data_queue.set_sink_dependency(_dependency, 0); + return Status::OK(); +} + +Status CacheSinkLocalState::open(RuntimeState* state) { + SCOPED_TIMER(exec_time_counter()); + SCOPED_TIMER(_open_timer); + RETURN_IF_ERROR(Base::open(state)); + // auto& p = _parent->cast(); + + _shared_state->data_queue.set_max_blocks_in_sub_queue(state->data_queue_max_blocks()); + return Status::OK(); +} + +CacheSinkOperatorX::CacheSinkOperatorX(int sink_id, int child_id) + : Base(sink_id, child_id, child_id) { + _name = "CACHE_SINK_OPERATOR"; +} + +Status CacheSinkOperatorX::open(RuntimeState* state) { + return Status::OK(); +} + +Status CacheSinkOperatorX::sink(RuntimeState* state, vectorized::Block* in_block, bool eos) { + auto& local_state = get_local_state(state); + SCOPED_TIMER(local_state.exec_time_counter()); + COUNTER_UPDATE(local_state.rows_input_counter(), (int64_t)in_block->rows()); + + if (in_block->rows() > 0) { + local_state._shared_state->data_queue.push_block( + vectorized::Block::create_unique(std::move(*in_block)), 0); + } + if (UNLIKELY(eos)) { + local_state._shared_state->data_queue.set_finish(0); + } + return Status::OK(); +} + +} // namespace doris::pipeline diff --git a/be/src/pipeline/exec/cache_sink_operator.h b/be/src/pipeline/exec/cache_sink_operator.h new file mode 100644 index 00000000000000..9c4beb48df2e4c --- /dev/null +++ 
b/be/src/pipeline/exec/cache_sink_operator.h @@ -0,0 +1,73 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include + +#include + +#include "common/status.h" +#include "operator.h" +#include "vec/core/block.h" + +namespace doris { +class RuntimeState; + +namespace pipeline { +class DataQueue; + +class CacheSinkOperatorX; +class CacheSinkLocalState final : public PipelineXSinkLocalState { +public: + ENABLE_FACTORY_CREATOR(CacheSinkLocalState); + CacheSinkLocalState(DataSinkOperatorXBase* parent, RuntimeState* state) : Base(parent, state) {} + Status init(RuntimeState* state, LocalSinkStateInfo& info) override; + Status open(RuntimeState* state) override; + friend class CacheSinkOperatorX; + using Base = PipelineXSinkLocalState; + using Parent = CacheSinkOperatorX; +}; + +class CacheSinkOperatorX final : public DataSinkOperatorX { +public: + using Base = DataSinkOperatorX; + + friend class CacheSinkLocalState; + CacheSinkOperatorX(int sink_id, int child_id); + ~CacheSinkOperatorX() override = default; + Status init(const TDataSink& tsink) override { + return Status::InternalError("{} should not init with TDataSink", + DataSinkOperatorX::_name); + } + + Status open(RuntimeState* state) override; + 
+ Status sink(RuntimeState* state, vectorized::Block* in_block, bool eos) override; + + std::shared_ptr create_shared_state() const override { + std::shared_ptr ss = std::make_shared(); + ss->id = operator_id(); + for (auto& dest : dests_id()) { + ss->related_op_ids.insert(dest); + } + return ss; + } +}; + +} // namespace pipeline +} // namespace doris \ No newline at end of file diff --git a/be/src/pipeline/exec/cache_source_operator.cpp b/be/src/pipeline/exec/cache_source_operator.cpp new file mode 100644 index 00000000000000..e98a18b76a3a98 --- /dev/null +++ b/be/src/pipeline/exec/cache_source_operator.cpp @@ -0,0 +1,201 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "pipeline/exec/cache_source_operator.h" + +#include +#include + +#include "common/status.h" +#include "pipeline/dependency.h" +#include "pipeline/exec/operator.h" +#include "vec/core/block.h" + +namespace doris { +class RuntimeState; + +namespace pipeline { + +Status CacheSourceLocalState::init(RuntimeState* state, LocalStateInfo& info) { + RETURN_IF_ERROR(Base::init(state, info)); + SCOPED_TIMER(exec_time_counter()); + SCOPED_TIMER(_init_timer); + ((CacheSharedState*)_dependency->shared_state()) + ->data_queue.set_source_dependency(_shared_state->source_deps.front()); + const auto& scan_ranges = info.scan_ranges; + bool hit_cache = false; + if (scan_ranges.size() > 1) { + return Status::InternalError("CacheSourceOperator only support one scan range, plan error"); + } + + const auto& cache_param = _parent->cast()._cache_param; + // 1. init the slot orders + const auto& tuple_descs = _parent->cast().row_desc().tuple_descriptors(); + for (auto tuple_desc : tuple_descs) { + for (auto slot_desc : tuple_desc->slots()) { + if (cache_param.output_slot_mapping.find(slot_desc->id()) != + cache_param.output_slot_mapping.end()) { + _slot_orders.emplace_back(cache_param.output_slot_mapping.at(slot_desc->id())); + } else { + return Status::InternalError( + fmt::format("Cache can find the mapping slot id {}, node id {}", + slot_desc->id(), cache_param.node_id)); + } + } + } + + // 2. build cache key by digest_tablet_id + RETURN_IF_ERROR(QueryCache::build_cache_key(scan_ranges, cache_param, &_cache_key, &_version)); + _runtime_profile->add_info_string( + "CacheTabletId", std::to_string(scan_ranges[0].scan_range.palo_scan_range.tablet_id)); + + // 3. 
lookup the cache and find proper slot order + hit_cache = QueryCache::instance()->lookup(_cache_key, _version, &_query_cache_handle); + _runtime_profile->add_info_string("HitCache", std::to_string(hit_cache)); + if (hit_cache && !cache_param.force_refresh_query_cache) { + _hit_cache_results = _query_cache_handle.get_cache_result(); + auto hit_cache_slot_orders = _query_cache_handle.get_cache_slot_orders(); + + bool need_reorder = _slot_orders.size() != hit_cache_slot_orders->size(); + if (!need_reorder) { + for (int i = 0; i < _slot_orders.size(); ++i) { + need_reorder = _slot_orders[i] != (*hit_cache_slot_orders)[i]; + } + } + + if (need_reorder) { + for (auto slot_id : _slot_orders) { + auto find_res = std::find(hit_cache_slot_orders->begin(), + hit_cache_slot_orders->end(), slot_id); + if (find_res != hit_cache_slot_orders->end()) { + _hit_cache_column_orders.emplace_back(find_res - + hit_cache_slot_orders->begin()); + } else { + return Status::InternalError(fmt::format( + "Cache can find the mapping slot id {}, node id {}, " + "hit_cache_column_orders [{}]", + slot_id, cache_param.node_id, fmt::join(*hit_cache_slot_orders, ","))); + } + } + } + } + + return Status::OK(); +} + +Status CacheSourceLocalState::open(RuntimeState* state) { + SCOPED_TIMER(exec_time_counter()); + SCOPED_TIMER(_open_timer); + RETURN_IF_ERROR(Base::open(state)); + + return Status::OK(); +} + +std::string CacheSourceLocalState::debug_string(int indentation_level) const { + fmt::memory_buffer debug_string_buffer; + fmt::format_to(debug_string_buffer, "{}", Base::debug_string(indentation_level)); + if (_shared_state) { + fmt::format_to(debug_string_buffer, ", data_queue: (is_all_finish = {}, has_data = {})", + _shared_state->data_queue.is_all_finish(), + _shared_state->data_queue.remaining_has_data()); + } + return fmt::to_string(debug_string_buffer); +} + +Status CacheSourceOperatorX::get_block(RuntimeState* state, vectorized::Block* block, bool* eos) { + auto& local_state = 
get_local_state(state); + SCOPED_TIMER(local_state.exec_time_counter()); + + block->clear_column_data(_row_descriptor.num_materialized_slots()); + bool need_clone_empty = block->columns() == 0; + + if (local_state._hit_cache_results == nullptr) { + Defer insert_cache([&] { + if (*eos) { + local_state._runtime_profile->add_info_string( + "InsertCache", std::to_string(local_state._need_insert_cache)); + if (local_state._need_insert_cache) { + local_state._global_cache->insert(local_state._cache_key, local_state._version, + local_state._local_cache_blocks, + local_state._slot_orders, + local_state._current_query_cache_bytes); + local_state._local_cache_blocks.clear(); + } + } + }); + + std::unique_ptr output_block; + int child_idx = 0; + RETURN_IF_ERROR(local_state._shared_state->data_queue.get_block_from_queue(&output_block, + &child_idx)); + // Here, check the value of `_has_data(state)` again after `data_queue.is_all_finish()` is TRUE + // as there may be one or more blocks when `data_queue.is_all_finish()` is TRUE. 
+ *eos = !_has_data(state) && local_state._shared_state->data_queue.is_all_finish(); + + if (!output_block) { + return Status::OK(); + } + + if (local_state._need_insert_cache) { + if (need_clone_empty) { + *block = output_block->clone_empty(); + } + RETURN_IF_ERROR( + vectorized::MutableBlock::build_mutable_block(block).merge(*output_block)); + local_state._current_query_cache_rows += output_block->rows(); + auto mem_consume = output_block->allocated_bytes(); + local_state._current_query_cache_bytes += mem_consume; + local_state._mem_tracker->consume(mem_consume); + + if (_cache_param.entry_max_bytes < local_state._current_query_cache_bytes || + _cache_param.entry_max_rows < local_state._current_query_cache_rows) { + // over the max bytes, pass through the data, no need to do cache + local_state._local_cache_blocks.clear(); + local_state._need_insert_cache = false; + } else { + local_state._local_cache_blocks.emplace_back(std::move(output_block)); + } + } else { + *block = std::move(*output_block); + } + } else { + if (local_state._hit_cache_pos < local_state._hit_cache_results->size()) { + const auto& hit_cache_block = + local_state._hit_cache_results->at(local_state._hit_cache_pos++); + if (need_clone_empty) { + *block = hit_cache_block->clone_empty(); + } + RETURN_IF_ERROR( + vectorized::MutableBlock::build_mutable_block(block).merge(*hit_cache_block)); + if (!local_state._hit_cache_column_orders.empty()) { + auto datas = block->get_columns_with_type_and_name(); + block->clear(); + for (auto loc : local_state._hit_cache_column_orders) { + block->insert(datas[loc]); + } + } + } else { + *eos = true; + } + } + + local_state.reached_limit(block, eos); + return Status::OK(); +} + +} // namespace pipeline +} // namespace doris diff --git a/be/src/pipeline/exec/cache_source_operator.h b/be/src/pipeline/exec/cache_source_operator.h new file mode 100644 index 00000000000000..e764323846b153 --- /dev/null +++ b/be/src/pipeline/exec/cache_source_operator.h @@ -0,0 +1,104 
@@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#pragma once + +#include + +#include + +#include "common/status.h" +#include "operator.h" +#include "pipeline/query_cache/query_cache.h" + +namespace doris { +class RuntimeState; + +namespace vectorized { +class Block; +} // namespace vectorized + +namespace pipeline { +class DataQueue; + +class CacheSourceOperatorX; +class CacheSourceLocalState final : public PipelineXLocalState { +public: + ENABLE_FACTORY_CREATOR(CacheSourceLocalState); + using Base = PipelineXLocalState; + using Parent = CacheSourceOperatorX; + CacheSourceLocalState(RuntimeState* state, OperatorXBase* parent) : Base(state, parent) {}; + + Status init(RuntimeState* state, LocalStateInfo& info) override; + Status open(RuntimeState* state) override; + + [[nodiscard]] std::string debug_string(int indentation_level = 0) const override; + +private: + friend class CacheSourceOperatorX; + friend class OperatorX; + + QueryCache* _global_cache = QueryCache::instance(); + + std::string _cache_key {}; + int64_t _version = 0; + std::vector _local_cache_blocks; + std::vector _slot_orders; + size_t _current_query_cache_bytes = 0; + size_t _current_query_cache_rows = 0; + bool _need_insert_cache = true; + + 
QueryCacheHandle _query_cache_handle; + std::vector* _hit_cache_results = nullptr; + std::vector _hit_cache_column_orders; + int _hit_cache_pos = 0; +}; + +class CacheSourceOperatorX final : public OperatorX { +public: + using Base = OperatorX; + CacheSourceOperatorX(ObjectPool* pool, int plan_node_id, int operator_id, + const TQueryCacheParam& cache_param) + : Base(pool, plan_node_id, operator_id), _cache_param(cache_param) { + _op_name = "CACHE_SOURCE_OPERATOR"; + }; + ~CacheSourceOperatorX() override = default; + Status get_block(RuntimeState* state, vectorized::Block* block, bool* eos) override; + + bool is_source() const override { return true; } + + Status open(RuntimeState* state) override { + static_cast(Base::open(state)); + return Status::OK(); + } + + const RowDescriptor& intermediate_row_desc() const override { + return _child->intermediate_row_desc(); + } + RowDescriptor& row_descriptor() override { return _child->row_descriptor(); } + const RowDescriptor& row_desc() const override { return _child->row_desc(); } + +private: + TQueryCacheParam _cache_param; + bool _has_data(RuntimeState* state) const { + auto& local_state = get_local_state(state); + return local_state._shared_state->data_queue.remaining_has_data(); + } + friend class CacheSourceLocalState; +}; + +} // namespace pipeline +} // namespace doris diff --git a/be/src/pipeline/exec/datagen_operator.cpp b/be/src/pipeline/exec/datagen_operator.cpp index 93b3d058154e62..f0a76992643179 100644 --- a/be/src/pipeline/exec/datagen_operator.cpp +++ b/be/src/pipeline/exec/datagen_operator.cpp @@ -36,7 +36,9 @@ DataGenSourceOperatorX::DataGenSourceOperatorX(ObjectPool* pool, const TPlanNode : OperatorX(pool, tnode, operator_id, descs), _tuple_id(tnode.data_gen_scan_node.tuple_id), _tuple_desc(nullptr), - _runtime_filter_descs(tnode.runtime_filters) {} + _runtime_filter_descs(tnode.runtime_filters) { + _is_serial_operator = tnode.__isset.is_serial_operator && tnode.is_serial_operator; +} Status 
DataGenSourceOperatorX::init(const TPlanNode& tnode, RuntimeState* state) { RETURN_IF_ERROR(OperatorX::init(tnode, state)); @@ -68,17 +70,25 @@ Status DataGenSourceOperatorX::get_block(RuntimeState* state, vectorized::Block* RETURN_IF_CANCELLED(state); auto& local_state = get_local_state(state); SCOPED_TIMER(local_state.exec_time_counter()); - Status res = local_state._table_func->get_next(state, block, eos); - RETURN_IF_ERROR(vectorized::VExprContext::filter_block(local_state._conjuncts, block, - block->columns())); + { + SCOPED_TIMER(local_state._table_function_execution_timer); + RETURN_IF_ERROR(local_state._table_func->get_next(state, block, eos)); + } + { + SCOPED_TIMER(local_state._filter_timer); + RETURN_IF_ERROR(vectorized::VExprContext::filter_block(local_state._conjuncts, block, + block->columns())); + } local_state.reached_limit(block, eos); - return res; + return Status::OK(); } Status DataGenLocalState::init(RuntimeState* state, LocalStateInfo& info) { SCOPED_TIMER(exec_time_counter()); SCOPED_TIMER(_init_timer); RETURN_IF_ERROR(PipelineXLocalState<>::init(state, info)); + _table_function_execution_timer = ADD_TIMER(profile(), "TableFunctionExecutionTime"); + _filter_timer = ADD_TIMER(profile(), "FilterTime"); auto& p = _parent->cast(); _table_func = std::make_shared(p._tuple_id, p._tuple_desc); _table_func->set_tuple_desc(p._tuple_desc); @@ -87,8 +97,8 @@ Status DataGenLocalState::init(RuntimeState* state, LocalStateInfo& info) { // TODO: use runtime filter to filte result block, maybe this node need derive from vscan_node. 
for (const auto& filter_desc : p._runtime_filter_descs) { std::shared_ptr runtime_filter; - RETURN_IF_ERROR(state->register_consumer_runtime_filter( - filter_desc, p.ignore_data_distribution(), p.node_id(), &runtime_filter)); + RETURN_IF_ERROR(state->register_consumer_runtime_filter(filter_desc, p.is_serial_operator(), + p.node_id(), &runtime_filter)); runtime_filter->init_profile(_runtime_profile.get()); } return Status::OK(); diff --git a/be/src/pipeline/exec/datagen_operator.h b/be/src/pipeline/exec/datagen_operator.h index c63ef97bb7a40f..bada5ec4080d08 100644 --- a/be/src/pipeline/exec/datagen_operator.h +++ b/be/src/pipeline/exec/datagen_operator.h @@ -44,6 +44,8 @@ class DataGenLocalState final : public PipelineXLocalState<> { private: friend class DataGenSourceOperatorX; std::shared_ptr _table_func; + RuntimeProfile::Counter* _table_function_execution_timer = nullptr; + RuntimeProfile::Counter* _filter_timer = nullptr; }; class DataGenSourceOperatorX final : public OperatorX { diff --git a/be/src/pipeline/exec/distinct_streaming_aggregation_operator.cpp b/be/src/pipeline/exec/distinct_streaming_aggregation_operator.cpp index 70b73225f060e8..7cea16ad633c3c 100644 --- a/be/src/pipeline/exec/distinct_streaming_aggregation_operator.cpp +++ b/be/src/pipeline/exec/distinct_streaming_aggregation_operator.cpp @@ -72,7 +72,6 @@ Status DistinctStreamingAggLocalState::init(RuntimeState* state, LocalStateInfo& SCOPED_TIMER(Base::exec_time_counter()); SCOPED_TIMER(Base::_init_timer); _build_timer = ADD_TIMER(Base::profile(), "BuildTime"); - _exec_timer = ADD_TIMER(Base::profile(), "ExecTime"); _hash_table_compute_timer = ADD_TIMER(Base::profile(), "HashTableComputeTime"); _hash_table_emplace_timer = ADD_TIMER(Base::profile(), "HashTableEmplaceTime"); _hash_table_input_counter = ADD_COUNTER(Base::profile(), "HashTableInputCount", TUnit::UNIT); @@ -334,7 +333,9 @@ DistinctStreamingAggOperatorX::DistinctStreamingAggOperatorX(ObjectPool* pool, i ? 
tnode.distribute_expr_lists[0] : tnode.agg_node.grouping_exprs), _is_colocate(tnode.agg_node.__isset.is_colocate && tnode.agg_node.is_colocate), - _require_bucket_distribution(require_bucket_distribution) { + _require_bucket_distribution(require_bucket_distribution), + _without_key(tnode.agg_node.grouping_exprs.empty()) { + _is_serial_operator = tnode.__isset.is_serial_operator && tnode.is_serial_operator; if (tnode.agg_node.__isset.use_streaming_preaggregation) { _is_streaming_preagg = tnode.agg_node.use_streaming_preaggregation; if (_is_streaming_preagg) { diff --git a/be/src/pipeline/exec/distinct_streaming_aggregation_operator.h b/be/src/pipeline/exec/distinct_streaming_aggregation_operator.h index edeb432176379d..97df1a6fcbee88 100644 --- a/be/src/pipeline/exec/distinct_streaming_aggregation_operator.h +++ b/be/src/pipeline/exec/distinct_streaming_aggregation_operator.h @@ -104,6 +104,9 @@ class DistinctStreamingAggOperatorX final bool need_more_input_data(RuntimeState* state) const override; DataDistribution required_data_distribution() const override { + if (_needs_finalize && _probe_expr_ctxs.empty()) { + return {ExchangeType::NOOP}; + } if (_needs_finalize || (!_probe_expr_ctxs.empty() && !_is_streaming_preagg)) { return _is_colocate && _require_bucket_distribution && !_followed_by_shuffled_operator ? DataDistribution(ExchangeType::BUCKET_HASH_SHUFFLE, _partition_exprs) @@ -113,9 +116,6 @@ class DistinctStreamingAggOperatorX final } bool require_data_distribution() const override { return _is_colocate; } - bool require_shuffled_data_distribution() const override { - return _needs_finalize || (!_probe_expr_ctxs.empty() && !_is_streaming_preagg); - } private: friend class DistinctStreamingAggLocalState; @@ -136,6 +136,7 @@ class DistinctStreamingAggOperatorX final /// The total size of the row from the aggregate functions. 
size_t _total_size_of_aggregate_states = 0; bool _is_streaming_preagg = false; + const bool _without_key; }; } // namespace pipeline diff --git a/be/src/pipeline/exec/es_scan_operator.cpp b/be/src/pipeline/exec/es_scan_operator.cpp index c7e953a7fa3201..3759931f9b153a 100644 --- a/be/src/pipeline/exec/es_scan_operator.cpp +++ b/be/src/pipeline/exec/es_scan_operator.cpp @@ -44,12 +44,10 @@ static std::string get_host_and_port(const std::vector& Status EsScanLocalState::_init_profile() { RETURN_IF_ERROR(Base::_init_profile()); - _es_profile.reset(new RuntimeProfile("EsIterator")); - Base::_scanner_profile->add_child(_es_profile.get(), true, nullptr); - _rows_read_counter = ADD_COUNTER(_es_profile, "RowsRead", TUnit::UNIT); - _read_timer = ADD_TIMER(_es_profile, "TotalRawReadTime(*)"); - _materialize_timer = ADD_TIMER(_es_profile, "MaterializeTupleTime(*)"); + _blocks_read_counter = ADD_COUNTER(_runtime_profile, "BlocksRead", TUnit::UNIT); + _read_timer = ADD_TIMER(_runtime_profile, "TotalRawReadTime(*)"); + _materialize_timer = ADD_TIMER(_runtime_profile, "MaterializeTupleTime(*)"); return Status::OK(); } diff --git a/be/src/pipeline/exec/es_scan_operator.h b/be/src/pipeline/exec/es_scan_operator.h index 4e80150d0ba8c6..2ae562e4fc7f32 100644 --- a/be/src/pipeline/exec/es_scan_operator.h +++ b/be/src/pipeline/exec/es_scan_operator.h @@ -52,13 +52,12 @@ class EsScanLocalState final : public ScanLocalState { Status _init_scanners(std::list* scanners) override; std::vector> _scan_ranges; - std::unique_ptr _es_profile; // FIXME: non-static data member '_rows_read_counter' of 'EsScanLocalState' shadows member inherited from type 'ScanLocalStateBase' #ifdef __clang__ #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wshadow-field" #endif - RuntimeProfile::Counter* _rows_read_counter = nullptr; + RuntimeProfile::Counter* _blocks_read_counter = nullptr; #ifdef __clang__ #pragma clang diagnostic pop #endif diff --git 
a/be/src/pipeline/exec/exchange_sink_operator.cpp b/be/src/pipeline/exec/exchange_sink_operator.cpp index db5c4c78a3129a..98162fc1caed10 100644 --- a/be/src/pipeline/exec/exchange_sink_operator.cpp +++ b/be/src/pipeline/exec/exchange_sink_operator.cpp @@ -35,12 +35,6 @@ #include "vec/exprs/vexpr.h" namespace doris::pipeline { - -Status ExchangeSinkLocalState::serialize_block(vectorized::Block* src, PBlock* dest, - int num_receivers) { - return _parent->cast().serialize_block(*this, src, dest, num_receivers); -} - bool ExchangeSinkLocalState::transfer_large_data_by_brpc() const { return _parent->cast()._transfer_large_data_by_brpc; } @@ -58,14 +52,10 @@ Status ExchangeSinkLocalState::init(RuntimeState* state, LocalSinkStateInfo& inf _local_sent_rows = ADD_COUNTER(_profile, "LocalSentRows", TUnit::UNIT); _serialize_batch_timer = ADD_TIMER(_profile, "SerializeBatchTime"); _compress_timer = ADD_TIMER(_profile, "CompressTime"); - _brpc_send_timer = ADD_TIMER(_profile, "BrpcSendTime"); - _brpc_wait_timer = ADD_TIMER(_profile, "BrpcSendTime.Wait"); _local_send_timer = ADD_TIMER(_profile, "LocalSendTime"); _split_block_hash_compute_timer = ADD_TIMER(_profile, "SplitBlockHashComputeTime"); - _split_block_distribute_by_channel_timer = - ADD_TIMER(_profile, "SplitBlockDistributeByChannelTime"); + _distribute_rows_into_channels_timer = ADD_TIMER(_profile, "DistributeRowsIntoChannelsTime"); _blocks_sent_counter = ADD_COUNTER_WITH_LEVEL(_profile, "BlocksProduced", TUnit::UNIT, 1); - _rows_sent_counter = ADD_COUNTER_WITH_LEVEL(_profile, "RowsProduced", TUnit::UNIT, 1); _overall_throughput = _profile->add_derived_counter( "OverallThroughput", TUnit::BYTES_PER_SECOND, std::bind(&RuntimeProfile::units_per_second, _bytes_sent_counter, @@ -120,7 +110,7 @@ Status ExchangeSinkLocalState::open(RuntimeState* state) { std::mt19937 g(rd()); shuffle(channels.begin(), channels.end(), g); } - int local_size = 0; + size_t local_size = 0; for (int i = 0; i < channels.size(); ++i) { 
RETURN_IF_ERROR(channels[i]->open(state)); if (channels[i]->is_local()) { @@ -130,6 +120,8 @@ Status ExchangeSinkLocalState::open(RuntimeState* state) { } only_local_exchange = local_size == channels.size(); + _rpc_channels_num = channels.size() - local_size; + PUniqueId id; id.set_hi(_state->query_id().hi); id.set_lo(_state->query_id().lo); @@ -367,7 +359,6 @@ void ExchangeSinkOperatorX::_handle_eof_channel(RuntimeState* state, ChannelPtrT Status ExchangeSinkOperatorX::sink(RuntimeState* state, vectorized::Block* block, bool eos) { auto& local_state = get_local_state(state); COUNTER_UPDATE(local_state.rows_input_counter(), (int64_t)block->rows()); - COUNTER_UPDATE(local_state.rows_sent_counter(), (int64_t)block->rows()); SCOPED_TIMER(local_state.exec_time_counter()); local_state._peak_memory_usage_counter->set(local_state._mem_tracker->peak_consumption()); bool all_receiver_eof = true; @@ -407,14 +398,15 @@ Status ExchangeSinkOperatorX::sink(RuntimeState* state, vectorized::Block* block SCOPED_CONSUME_MEM_TRACKER(_mem_tracker.get()); bool serialized = false; RETURN_IF_ERROR(local_state._serializer.next_serialized_block( - block, block_holder->get_block(), local_state.channels.size(), &serialized, - eos)); + block, block_holder->get_block(), local_state._rpc_channels_num, + &serialized, eos)); if (serialized) { auto cur_block = local_state._serializer.get_block()->to_block(); if (!cur_block.empty()) { + DCHECK(eos || local_state._serializer.is_local()) << debug_string(state, 0); RETURN_IF_ERROR(local_state._serializer.serialize_block( &cur_block, block_holder->get_block(), - local_state.channels.size())); + local_state._rpc_channels_num)); } else { block_holder->reset_block(); } @@ -481,15 +473,21 @@ Status ExchangeSinkOperatorX::sink(RuntimeState* state, vectorized::Block* block local_state._partitioner->do_partitioning(state, block, _mem_tracker.get())); } if (_part_type == TPartitionType::HASH_PARTITIONED) { + 
SCOPED_TIMER(local_state._distribute_rows_into_channels_timer); RETURN_IF_ERROR(channel_add_rows( state, local_state.channels, local_state._partition_count, local_state._partitioner->get_channel_ids().get(), rows, block, eos)); } else { + SCOPED_TIMER(local_state._distribute_rows_into_channels_timer); RETURN_IF_ERROR(channel_add_rows( state, local_state.channel_shared_ptrs, local_state._partition_count, local_state._partitioner->get_channel_ids().get(), rows, block, eos)); } } else if (_part_type == TPartitionType::TABLET_SINK_SHUFFLE_PARTITIONED) { + int64_t old_channel_mem_usage = 0; + for (const auto& channel : local_state.channels) { + old_channel_mem_usage += channel->mem_usage(); + } // check out of limit RETURN_IF_ERROR(local_state._send_new_partition_batch()); std::shared_ptr convert_block = std::make_shared(); @@ -521,10 +519,21 @@ Status ExchangeSinkOperatorX::sink(RuntimeState* state, vectorized::Block* block local_state._row_distribution._deal_batched = true; RETURN_IF_ERROR(local_state._send_new_partition_batch()); } - // the convert_block maybe different with block after execute exprs - // when send data we still use block - RETURN_IF_ERROR(channel_add_rows_with_idx(state, local_state.channels, num_channels, - channel2rows, block, eos)); + { + SCOPED_TIMER(local_state._distribute_rows_into_channels_timer); + // the convert_block maybe different with block after execute exprs + // when send data we still use block + RETURN_IF_ERROR(channel_add_rows_with_idx(state, local_state.channels, num_channels, + channel2rows, block, eos)); + } + int64_t new_channel_mem_usage = 0; + for (const auto& channel : local_state.channels) { + new_channel_mem_usage += channel->mem_usage(); + } + COUNTER_UPDATE(local_state.memory_used_counter(), + new_channel_mem_usage - old_channel_mem_usage); + COUNTER_SET(local_state.peak_memory_usage_counter(), + local_state.memory_used_counter()->value()); } else if (_part_type == TPartitionType::TABLE_SINK_HASH_PARTITIONED) { { 
SCOPED_TIMER(local_state._split_block_hash_compute_timer); @@ -533,8 +542,12 @@ Status ExchangeSinkOperatorX::sink(RuntimeState* state, vectorized::Block* block } std::vector> assignments = local_state.scale_writer_partitioning_exchanger->accept(block); - RETURN_IF_ERROR(channel_add_rows_with_idx( - state, local_state.channels, local_state.channels.size(), assignments, block, eos)); + { + SCOPED_TIMER(local_state._distribute_rows_into_channels_timer); + RETURN_IF_ERROR(channel_add_rows_with_idx(state, local_state.channels, + local_state.channels.size(), assignments, + block, eos)); + } } else if (_part_type == TPartitionType::TABLE_SINK_RANDOM_PARTITIONED) { // Control the number of channels according to the flow, thereby controlling the number of table sink writers. @@ -588,24 +601,6 @@ Status ExchangeSinkOperatorX::sink(RuntimeState* state, vectorized::Block* block return final_st; } -Status ExchangeSinkOperatorX::serialize_block(ExchangeSinkLocalState& state, vectorized::Block* src, - PBlock* dest, int num_receivers) { - { - SCOPED_TIMER(state.serialize_batch_timer()); - dest->Clear(); - size_t uncompressed_bytes = 0; - size_t compressed_bytes = 0; - RETURN_IF_ERROR(src->serialize(_state->be_exec_version(), dest, &uncompressed_bytes, - &compressed_bytes, _compression_type, - _transfer_large_data_by_brpc)); - COUNTER_UPDATE(state.bytes_sent_counter(), compressed_bytes * num_receivers); - COUNTER_UPDATE(state.uncompressed_bytes_counter(), uncompressed_bytes * num_receivers); - COUNTER_UPDATE(state.compress_timer(), src->get_compress_time()); - } - - return Status::OK(); -} - void ExchangeSinkLocalState::register_channels(pipeline::ExchangeSinkBuffer* buffer) { for (auto channel : channels) { ((vectorized::PipChannel*)channel)->register_exchange_buffer(buffer); diff --git a/be/src/pipeline/exec/exchange_sink_operator.h b/be/src/pipeline/exec/exchange_sink_operator.h index c60cefabfa8380..a4f78bdf61c69e 100644 --- a/be/src/pipeline/exec/exchange_sink_operator.h +++ 
b/be/src/pipeline/exec/exchange_sink_operator.h @@ -78,27 +78,13 @@ class ExchangeSinkLocalState final : public PipelineXSinkLocalState<> { Status open(RuntimeState* state) override; Status close(RuntimeState* state, Status exec_status) override; Dependency* finishdependency() override { return _finish_dependency.get(); } - Status serialize_block(vectorized::Block* src, PBlock* dest, int num_receivers = 1); void register_channels(pipeline::ExchangeSinkBuffer* buffer); - RuntimeProfile::Counter* brpc_wait_timer() { return _brpc_wait_timer; } RuntimeProfile::Counter* blocks_sent_counter() { return _blocks_sent_counter; } - RuntimeProfile::Counter* rows_sent_counter() { return _rows_sent_counter; } RuntimeProfile::Counter* local_send_timer() { return _local_send_timer; } RuntimeProfile::Counter* local_bytes_send_counter() { return _local_bytes_send_counter; } RuntimeProfile::Counter* local_sent_rows() { return _local_sent_rows; } - RuntimeProfile::Counter* brpc_send_timer() { return _brpc_send_timer; } - RuntimeProfile::Counter* serialize_batch_timer() { return _serialize_batch_timer; } - RuntimeProfile::Counter* split_block_distribute_by_channel_timer() { - return _split_block_distribute_by_channel_timer; - } - RuntimeProfile::Counter* bytes_sent_counter() { return _bytes_sent_counter; } - RuntimeProfile::Counter* split_block_hash_compute_timer() { - return _split_block_hash_compute_timer; - } RuntimeProfile::Counter* merge_block_timer() { return _merge_block_timer; } - RuntimeProfile::Counter* compress_timer() { return _compress_timer; } - RuntimeProfile::Counter* uncompressed_bytes_counter() { return _uncompressed_bytes_counter; } [[nodiscard]] bool transfer_large_data_by_brpc() const; bool is_finished() const override { return _reach_limit.load(); } void set_reach_limit() { _reach_limit = true; }; @@ -130,16 +116,13 @@ class ExchangeSinkLocalState final : public PipelineXSinkLocalState<> { std::unique_ptr _sink_buffer = nullptr; RuntimeProfile::Counter* 
_serialize_batch_timer = nullptr; RuntimeProfile::Counter* _compress_timer = nullptr; - RuntimeProfile::Counter* _brpc_send_timer = nullptr; - RuntimeProfile::Counter* _brpc_wait_timer = nullptr; RuntimeProfile::Counter* _bytes_sent_counter = nullptr; RuntimeProfile::Counter* _uncompressed_bytes_counter = nullptr; RuntimeProfile::Counter* _local_sent_rows = nullptr; RuntimeProfile::Counter* _local_send_timer = nullptr; RuntimeProfile::Counter* _split_block_hash_compute_timer = nullptr; - RuntimeProfile::Counter* _split_block_distribute_by_channel_timer = nullptr; + RuntimeProfile::Counter* _distribute_rows_into_channels_timer = nullptr; RuntimeProfile::Counter* _blocks_sent_counter = nullptr; - RuntimeProfile::Counter* _rows_sent_counter = nullptr; // Throughput per total time spent in sender RuntimeProfile::Counter* _overall_throughput = nullptr; // Used to counter send bytes under local data exchange @@ -154,6 +137,7 @@ class ExchangeSinkLocalState final : public PipelineXSinkLocalState<> { int _sender_id; std::shared_ptr _broadcast_pb_mem_limiter; + size_t _rpc_channels_num = 0; vectorized::BlockSerializer _serializer; std::shared_ptr _queue_dependency = nullptr; @@ -218,9 +202,8 @@ class ExchangeSinkOperatorX final : public DataSinkOperatorX(tnode.nullable_tuples.begin(), tnode.nullable_tuples.begin() + tnode.exchange_node.input_row_tuples.size())), - _offset(tnode.exchange_node.__isset.offset ? tnode.exchange_node.offset : 0) {} + _offset(tnode.exchange_node.__isset.offset ? 
tnode.exchange_node.offset : 0) { + _is_serial_operator = tnode.__isset.is_serial_operator && tnode.is_serial_operator; +} Status ExchangeSourceOperatorX::init(const TPlanNode& tnode, RuntimeState* state) { RETURN_IF_ERROR(OperatorX::init(tnode, state)); @@ -141,15 +147,22 @@ Status ExchangeSourceOperatorX::get_block(RuntimeState* state, vectorized::Block }); SCOPED_TIMER(local_state.exec_time_counter()); if (_is_merging && !local_state.is_ready) { + SCOPED_TIMER(local_state.create_merger_timer); RETURN_IF_ERROR(local_state.stream_recvr->create_merger( local_state.vsort_exec_exprs.lhs_ordering_expr_ctxs(), _is_asc_order, _nulls_first, state->batch_size(), _limit, _offset)); local_state.is_ready = true; return Status::OK(); } - auto status = local_state.stream_recvr->get_next(block, eos); - RETURN_IF_ERROR(doris::vectorized::VExprContext::filter_block(local_state.conjuncts(), block, - block->columns())); + { + SCOPED_TIMER(local_state.get_data_from_recvr_timer); + RETURN_IF_ERROR(local_state.stream_recvr->get_next(block, eos)); + } + { + SCOPED_TIMER(local_state.filter_timer); + RETURN_IF_ERROR(doris::vectorized::VExprContext::filter_block(local_state.conjuncts(), + block, block->columns())); + } // In vsortrunmerger, it will set eos=true, and block not empty // so that eos==true, could not make sure that block not have valid data if (!*eos || block->rows() > 0) { @@ -174,7 +187,7 @@ Status ExchangeSourceOperatorX::get_block(RuntimeState* state, vectorized::Block COUNTER_SET(local_state.rows_returned_counter(), local_state.num_rows_returned()); COUNTER_UPDATE(local_state.blocks_returned_counter(), 1); } - return status; + return Status::OK(); } Status ExchangeLocalState::close(RuntimeState* state) { diff --git a/be/src/pipeline/exec/exchange_source_operator.h b/be/src/pipeline/exec/exchange_source_operator.h index 0fe3dcbb590b7d..f938f5007d1643 100644 --- a/be/src/pipeline/exec/exchange_source_operator.h +++ b/be/src/pipeline/exec/exchange_source_operator.h @@ -59,6 
+59,9 @@ class ExchangeLocalState final : public PipelineXLocalState<> { std::vector> deps; std::vector metrics; + RuntimeProfile::Counter* get_data_from_recvr_timer = nullptr; + RuntimeProfile::Counter* filter_timer = nullptr; + RuntimeProfile::Counter* create_merger_timer = nullptr; }; class ExchangeSourceOperatorX final : public OperatorX { @@ -81,7 +84,7 @@ class ExchangeSourceOperatorX final : public OperatorX { [[nodiscard]] bool is_merging() const { return _is_merging; } DataDistribution required_data_distribution() const override { - if (OperatorX::ignore_data_distribution()) { + if (OperatorX::is_serial_operator()) { return {ExchangeType::NOOP}; } return _partition_type == TPartitionType::HASH_PARTITIONED diff --git a/be/src/pipeline/exec/group_commit_block_sink_operator.cpp b/be/src/pipeline/exec/group_commit_block_sink_operator.cpp index 6db49bb7ab1089..8da335f4fa2c0e 100644 --- a/be/src/pipeline/exec/group_commit_block_sink_operator.cpp +++ b/be/src/pipeline/exec/group_commit_block_sink_operator.cpp @@ -66,6 +66,7 @@ Status GroupCommitBlockSinkLocalState::open(RuntimeState* state) { } Status GroupCommitBlockSinkLocalState::_initialize_load_queue() { + SCOPED_TIMER(_init_load_queue_timer); auto& p = _parent->cast(); if (_state->exec_env()->wal_mgr()->is_running()) { RETURN_IF_ERROR(_state->exec_env()->group_commit_mgr()->get_first_block_load_queue( @@ -240,6 +241,17 @@ Status GroupCommitBlockSinkLocalState::_add_blocks(RuntimeState* state, return Status::OK(); } +Status GroupCommitBlockSinkLocalState::init(RuntimeState* state, LocalSinkStateInfo& info) { + RETURN_IF_ERROR(Base::init(state, info)); + SCOPED_TIMER(exec_time_counter()); + SCOPED_TIMER(_init_timer); + _init_load_queue_timer = ADD_TIMER(_profile, "InitLoadQueueTime"); + _valid_and_convert_block_timer = ADD_TIMER(_profile, "ValidAndConvertBlockTime"); + _find_partition_timer = ADD_TIMER(_profile, "FindPartitionTime"); + _append_blocks_timer = ADD_TIMER(_profile, "AppendBlocksTime"); + return 
Status::OK(); +} + Status GroupCommitBlockSinkOperatorX::init(const TDataSink& t_sink) { RETURN_IF_ERROR(Base::init(t_sink)); DCHECK(t_sink.__isset.olap_table_sink); @@ -321,10 +333,15 @@ Status GroupCommitBlockSinkOperatorX::sink(RuntimeState* state, vectorized::Bloc std::shared_ptr block; bool has_filtered_rows = false; - RETURN_IF_ERROR(local_state._block_convertor->validate_and_convert_block( - state, input_block, block, local_state._output_vexpr_ctxs, rows, has_filtered_rows)); + { + SCOPED_TIMER(local_state._valid_and_convert_block_timer); + RETURN_IF_ERROR(local_state._block_convertor->validate_and_convert_block( + state, input_block, block, local_state._output_vexpr_ctxs, rows, + has_filtered_rows)); + } local_state._has_filtered_rows = false; if (!local_state._vpartition->is_auto_partition()) { + SCOPED_TIMER(local_state._find_partition_timer); //reuse vars for find_partition local_state._partitions.assign(rows, nullptr); local_state._filter_bitmap.Reset(rows); @@ -354,23 +371,26 @@ Status GroupCommitBlockSinkOperatorX::sink(RuntimeState* state, vectorized::Bloc } } } - - if (local_state._block_convertor->num_filtered_rows() > 0 || local_state._has_filtered_rows) { - auto cloneBlock = block->clone_without_columns(); - auto res_block = vectorized::MutableBlock::build_mutable_block(&cloneBlock); - for (int i = 0; i < rows; ++i) { - if (local_state._block_convertor->filter_map()[i]) { - continue; - } - if (local_state._filter_bitmap.Get(i)) { - continue; + { + SCOPED_TIMER(local_state._append_blocks_timer); + if (local_state._block_convertor->num_filtered_rows() > 0 || + local_state._has_filtered_rows) { + auto cloneBlock = block->clone_without_columns(); + auto res_block = vectorized::MutableBlock::build_mutable_block(&cloneBlock); + for (int i = 0; i < rows; ++i) { + if (local_state._block_convertor->filter_map()[i]) { + continue; + } + if (local_state._filter_bitmap.Get(i)) { + continue; + } + res_block.add_row(block.get(), i); } - 
res_block.add_row(block.get(), i); + block->swap(res_block.to_block()); } - block->swap(res_block.to_block()); + // add block into block queue + RETURN_IF_ERROR(local_state._add_block(state, block)); } - // add block into block queue - RETURN_IF_ERROR(local_state._add_block(state, block)); return wind_up(); } diff --git a/be/src/pipeline/exec/group_commit_block_sink_operator.h b/be/src/pipeline/exec/group_commit_block_sink_operator.h index 32ca0613652ae4..e469aee8df595c 100644 --- a/be/src/pipeline/exec/group_commit_block_sink_operator.h +++ b/be/src/pipeline/exec/group_commit_block_sink_operator.h @@ -42,8 +42,8 @@ class GroupCommitBlockSinkLocalState final : public PipelineXSinkLocalState dependencies() const override { @@ -79,6 +79,11 @@ class GroupCommitBlockSinkLocalState final : public PipelineXSinkLocalState _finish_dependency; std::shared_ptr _create_plan_dependency = nullptr; std::shared_ptr _put_block_dependency = nullptr; + + RuntimeProfile::Counter* _init_load_queue_timer = nullptr; + RuntimeProfile::Counter* _valid_and_convert_block_timer = nullptr; + RuntimeProfile::Counter* _find_partition_timer = nullptr; + RuntimeProfile::Counter* _append_blocks_timer = nullptr; }; class GroupCommitBlockSinkOperatorX final diff --git a/be/src/pipeline/exec/group_commit_scan_operator.cpp b/be/src/pipeline/exec/group_commit_scan_operator.cpp index 3e6ad62c5dcb7c..fbe7f3c6f22836 100644 --- a/be/src/pipeline/exec/group_commit_scan_operator.cpp +++ b/be/src/pipeline/exec/group_commit_scan_operator.cpp @@ -31,6 +31,7 @@ GroupCommitOperatorX::GroupCommitOperatorX(ObjectPool* pool, const TPlanNode& tn Status GroupCommitOperatorX::get_block(RuntimeState* state, vectorized::Block* block, bool* eos) { auto& local_state = get_local_state(state); + SCOPED_TIMER(local_state.exec_time_counter()); bool find_node = false; while (!find_node && !*eos) { RETURN_IF_ERROR(local_state.load_block_queue->get_block(state, block, &find_node, eos, diff --git 
a/be/src/pipeline/exec/hashjoin_build_sink.cpp b/be/src/pipeline/exec/hashjoin_build_sink.cpp index 5be3fcad112db5..7efeb7692d4b71 100644 --- a/be/src/pipeline/exec/hashjoin_build_sink.cpp +++ b/be/src/pipeline/exec/hashjoin_build_sink.cpp @@ -51,19 +51,19 @@ Status HashJoinBuildSinkLocalState::init(RuntimeState* state, LocalSinkStateInfo _shared_state->build_exprs_size = _build_expr_ctxs.size(); _should_build_hash_table = true; + profile()->add_info_string("BroadcastJoin", std::to_string(p._is_broadcast_join)); if (p._is_broadcast_join) { - profile()->add_info_string("BroadcastJoin", "true"); if (state->enable_share_hash_table_for_broadcast_join()) { _should_build_hash_table = info.task_idx == 0; if (_should_build_hash_table) { - profile()->add_info_string("ShareHashTableEnabled", "true"); p._shared_hashtable_controller->set_builder_and_consumers( state->fragment_instance_id(), p.node_id()); } - } else { - profile()->add_info_string("ShareHashTableEnabled", "false"); } } + profile()->add_info_string("BuildShareHashTable", std::to_string(_should_build_hash_table)); + profile()->add_info_string("ShareHashTableEnabled", + std::to_string(state->enable_share_hash_table_for_broadcast_join())); if (!_should_build_hash_table) { _dependency->block(); _finish_dependency->block(); @@ -72,6 +72,7 @@ Status HashJoinBuildSinkLocalState::init(RuntimeState* state, LocalSinkStateInfo _finish_dependency->shared_from_this()); } + _runtime_filter_init_timer = ADD_TIMER(profile(), "RuntimeFilterInitTime"); _build_blocks_memory_usage = ADD_CHILD_COUNTER_WITH_LEVEL(profile(), "BuildBlocks", TUnit::BYTES, "MemoryUsage", 1); _hash_table_memory_usage = @@ -81,13 +82,10 @@ Status HashJoinBuildSinkLocalState::init(RuntimeState* state, LocalSinkStateInfo // Build phase auto* record_profile = _should_build_hash_table ? 
profile() : faker_runtime_profile(); - _build_table_timer = ADD_TIMER(profile(), "BuildTableTime"); - _build_side_merge_block_timer = ADD_TIMER(profile(), "BuildSideMergeBlockTime"); + _build_table_timer = ADD_TIMER(profile(), "BuildHashTableTime"); + _build_side_merge_block_timer = ADD_TIMER(profile(), "MergeBuildBlockTime"); _build_table_insert_timer = ADD_TIMER(record_profile, "BuildTableInsertTime"); _build_expr_call_timer = ADD_TIMER(record_profile, "BuildExprCallTime"); - _build_side_compute_hash_timer = ADD_TIMER(record_profile, "BuildSideHashComputingTime"); - - _allocate_resource_timer = ADD_TIMER(profile(), "AllocateResourceTime"); // Hash Table Init _hash_table_init(state); @@ -253,7 +251,6 @@ Status HashJoinBuildSinkLocalState::process_build_block(RuntimeState* state, if (UNLIKELY(rows == 0)) { return Status::OK(); } - COUNTER_UPDATE(_build_rows_counter, rows); block.replace_if_overflow(); vectorized::ColumnRawPtrs raw_ptrs(_build_expr_ctxs.size()); diff --git a/be/src/pipeline/exec/hashjoin_build_sink.h b/be/src/pipeline/exec/hashjoin_build_sink.h index 1ae9d5ae1a71f8..930d3761791d65 100644 --- a/be/src/pipeline/exec/hashjoin_build_sink.h +++ b/be/src/pipeline/exec/hashjoin_build_sink.h @@ -96,14 +96,12 @@ class HashJoinBuildSinkLocalState final RuntimeProfile::Counter* _build_table_timer = nullptr; RuntimeProfile::Counter* _build_expr_call_timer = nullptr; RuntimeProfile::Counter* _build_table_insert_timer = nullptr; - RuntimeProfile::Counter* _build_side_compute_hash_timer = nullptr; RuntimeProfile::Counter* _build_side_merge_block_timer = nullptr; - RuntimeProfile::Counter* _allocate_resource_timer = nullptr; - RuntimeProfile::Counter* _build_blocks_memory_usage = nullptr; RuntimeProfile::Counter* _hash_table_memory_usage = nullptr; - RuntimeProfile::HighWaterMarkCounter* _build_arena_memory_usage = nullptr; + RuntimeProfile::Counter* _build_arena_memory_usage = nullptr; + RuntimeProfile::Counter* _runtime_filter_init_timer = nullptr; }; class 
HashJoinBuildSinkOperatorX final @@ -132,8 +130,8 @@ class HashJoinBuildSinkOperatorX final if (_join_op == TJoinOp::NULL_AWARE_LEFT_ANTI_JOIN) { return {ExchangeType::NOOP}; } else if (_is_broadcast_join) { - return _child->ignore_data_distribution() ? DataDistribution(ExchangeType::PASS_TO_ONE) - : DataDistribution(ExchangeType::NOOP); + return _child->is_serial_operator() ? DataDistribution(ExchangeType::PASS_TO_ONE) + : DataDistribution(ExchangeType::NOOP); } return _join_distribution == TJoinDistributionType::BUCKET_SHUFFLE || _join_distribution == TJoinDistributionType::COLOCATE @@ -141,9 +139,6 @@ class HashJoinBuildSinkOperatorX final : DataDistribution(ExchangeType::HASH_SHUFFLE, _partition_exprs); } - bool require_shuffled_data_distribution() const override { - return _join_op != TJoinOp::NULL_AWARE_LEFT_ANTI_JOIN && !_is_broadcast_join; - } bool is_shuffled_operator() const override { return _join_distribution == TJoinDistributionType::PARTITIONED; } diff --git a/be/src/pipeline/exec/hashjoin_probe_operator.cpp b/be/src/pipeline/exec/hashjoin_probe_operator.cpp index f91e1eaa2a1b17..756a151394b41e 100644 --- a/be/src/pipeline/exec/hashjoin_probe_operator.cpp +++ b/be/src/pipeline/exec/hashjoin_probe_operator.cpp @@ -56,13 +56,11 @@ Status HashJoinProbeLocalState::init(RuntimeState* state, LocalStateInfo& info) _probe_arena_memory_usage = profile()->AddHighWaterMarkCounter("ProbeKeyArena", TUnit::BYTES, "MemoryUsage", 1); // Probe phase - _probe_next_timer = ADD_TIMER(profile(), "ProbeFindNextTime"); _probe_expr_call_timer = ADD_TIMER(profile(), "ProbeExprCallTime"); _search_hashtable_timer = ADD_TIMER(profile(), "ProbeWhenSearchHashTableTime"); _build_side_output_timer = ADD_TIMER(profile(), "ProbeWhenBuildSideOutputTime"); _probe_side_output_timer = ADD_TIMER(profile(), "ProbeWhenProbeSideOutputTime"); - _probe_process_hashtable_timer = ADD_TIMER(profile(), "ProbeWhenProcessHashTableTime"); - _process_other_join_conjunct_timer = ADD_TIMER(profile(), 
"OtherJoinConjunctTime"); + _non_equal_join_conjuncts_timer = ADD_TIMER(profile(), "NonEqualJoinConjunctEvaluationTime"); _init_probe_side_timer = ADD_TIMER(profile(), "InitProbeSideTime"); return Status::OK(); } @@ -230,7 +228,6 @@ HashJoinProbeOperatorX::HashJoinProbeOperatorX(ObjectPool* pool, const TPlanNode Status HashJoinProbeOperatorX::pull(doris::RuntimeState* state, vectorized::Block* output_block, bool* eos) const { auto& local_state = get_local_state(state); - SCOPED_TIMER(local_state._probe_timer); if (local_state._shared_state->short_circuit_for_probe) { // If we use a short-circuit strategy, should return empty block directly. *eos = true; @@ -325,7 +322,7 @@ Status HashJoinProbeOperatorX::pull(doris::RuntimeState* state, vectorized::Bloc if constexpr (!std::is_same_v) { using HashTableCtxType = std::decay_t; if constexpr (!std::is_same_v) { - st = process_hashtable_ctx.process_data_in_hashtable( + st = process_hashtable_ctx.finish_probing( arg, mutable_join_block, &temp_block, eos, _is_mark_join); } else { st = Status::InternalError("uninited hash table"); diff --git a/be/src/pipeline/exec/hashjoin_probe_operator.h b/be/src/pipeline/exec/hashjoin_probe_operator.h index dde9c00dfe4944..66d709e6541ad8 100644 --- a/be/src/pipeline/exec/hashjoin_probe_operator.h +++ b/be/src/pipeline/exec/hashjoin_probe_operator.h @@ -117,14 +117,12 @@ class HashJoinProbeLocalState final std::make_unique(); RuntimeProfile::Counter* _probe_expr_call_timer = nullptr; - RuntimeProfile::Counter* _probe_next_timer = nullptr; RuntimeProfile::Counter* _probe_side_output_timer = nullptr; - RuntimeProfile::Counter* _probe_process_hashtable_timer = nullptr; RuntimeProfile::HighWaterMarkCounter* _probe_arena_memory_usage = nullptr; RuntimeProfile::Counter* _search_hashtable_timer = nullptr; RuntimeProfile::Counter* _init_probe_side_timer = nullptr; RuntimeProfile::Counter* _build_side_output_timer = nullptr; - RuntimeProfile::Counter* _process_other_join_conjunct_timer = nullptr; + 
RuntimeProfile::Counter* _non_equal_join_conjuncts_timer = nullptr; }; class HashJoinProbeOperatorX final : public JoinProbeOperatorX { @@ -152,9 +150,6 @@ class HashJoinProbeOperatorX final : public JoinProbeOperatorXrows()); RETURN_IF_ERROR(local_state.sink(state, block, eos)); return Status::OK(); } diff --git a/be/src/pipeline/exec/join/process_hash_table_probe.h b/be/src/pipeline/exec/join/process_hash_table_probe.h index 965d62192b2fed..2ccc9aec8c7e01 100644 --- a/be/src/pipeline/exec/join/process_hash_table_probe.h +++ b/be/src/pipeline/exec/join/process_hash_table_probe.h @@ -87,9 +87,8 @@ struct ProcessHashTableProbe { // Process full outer join/ right join / right semi/anti join to output the join result // in hash table template - Status process_data_in_hashtable(HashTableType& hash_table_ctx, - vectorized::MutableBlock& mutable_block, - vectorized::Block* output_block, bool* eos, bool is_mark_join); + Status finish_probing(HashTableType& hash_table_ctx, vectorized::MutableBlock& mutable_block, + vectorized::Block* output_block, bool* eos, bool is_mark_join); /// For null aware join with other conjuncts, if the probe key of one row on left side is null, /// we should make this row match with all rows in build side. 
@@ -137,7 +136,7 @@ struct ProcessHashTableProbe { RuntimeProfile::Counter* _init_probe_side_timer = nullptr; RuntimeProfile::Counter* _build_side_output_timer = nullptr; RuntimeProfile::Counter* _probe_side_output_timer = nullptr; - RuntimeProfile::Counter* _probe_process_hashtable_timer = nullptr; + RuntimeProfile::Counter* _finish_probe_phase_timer = nullptr; int _right_col_idx; int _right_col_len; diff --git a/be/src/pipeline/exec/join/process_hash_table_probe_impl.h b/be/src/pipeline/exec/join/process_hash_table_probe_impl.h index 653cc8ab4473dd..5de033b63e8aad 100644 --- a/be/src/pipeline/exec/join/process_hash_table_probe_impl.h +++ b/be/src/pipeline/exec/join/process_hash_table_probe_impl.h @@ -56,7 +56,7 @@ ProcessHashTableProbe::ProcessHashTableProbe(HashJoinProbeLocalState _init_probe_side_timer(parent->_init_probe_side_timer), _build_side_output_timer(parent->_build_side_output_timer), _probe_side_output_timer(parent->_probe_side_output_timer), - _probe_process_hashtable_timer(parent->_probe_process_hashtable_timer), + _finish_probe_phase_timer(parent->_finish_probe_phase_timer), _right_col_idx((_is_right_semi_anti && !_have_other_join_conjunct) ? 
0 : _parent->left_table_data_types().size()), @@ -501,8 +501,8 @@ Status ProcessHashTableProbe::do_other_join_conjuncts(vectorized::Bl return Status::OK(); } - SCOPED_TIMER(_parent->_process_other_join_conjunct_timer); - int orig_columns = output_block->columns(); + SCOPED_TIMER(_parent->_non_equal_join_conjuncts_timer); + size_t orig_columns = output_block->columns(); vectorized::IColumn::Filter other_conjunct_filter(row_count, 1); { bool can_be_filter_all = false; @@ -616,10 +616,11 @@ Status ProcessHashTableProbe::do_other_join_conjuncts(vectorized::Bl template template -Status ProcessHashTableProbe::process_data_in_hashtable( - HashTableType& hash_table_ctx, vectorized::MutableBlock& mutable_block, - vectorized::Block* output_block, bool* eos, bool is_mark_join) { - SCOPED_TIMER(_probe_process_hashtable_timer); +Status ProcessHashTableProbe::finish_probing(HashTableType& hash_table_ctx, + vectorized::MutableBlock& mutable_block, + vectorized::Block* output_block, bool* eos, + bool is_mark_join) { + SCOPED_TIMER(_finish_probe_phase_timer); auto& mcol = mutable_block.mutable_columns(); if (is_mark_join) { std::unique_ptr mark_column = @@ -717,8 +718,7 @@ struct ExtractType { vectorized::MutableBlock & mutable_block, vectorized::Block * output_block, \ size_t probe_rows, bool is_mark_join, bool have_other_join_conjunct); \ \ - template Status \ - ProcessHashTableProbe::process_data_in_hashtable::Type>( \ + template Status ProcessHashTableProbe::finish_probing::Type>( \ ExtractType::Type & hash_table_ctx, vectorized::MutableBlock & mutable_block, \ vectorized::Block * output_block, bool* eos, bool is_mark_join); diff --git a/be/src/pipeline/exec/join_build_sink_operator.cpp b/be/src/pipeline/exec/join_build_sink_operator.cpp index 2439dbc8fe1c95..1dcd5099283f62 100644 --- a/be/src/pipeline/exec/join_build_sink_operator.cpp +++ b/be/src/pipeline/exec/join_build_sink_operator.cpp @@ -33,15 +33,11 @@ Status JoinBuildSinkLocalState::init(RuntimeState* stat 
PipelineXSinkLocalState::profile()->add_info_string("JoinType", to_string(p._join_op)); - _build_rows_counter = ADD_COUNTER(PipelineXSinkLocalState::profile(), - "BuildRows", TUnit::UNIT); _publish_runtime_filter_timer = ADD_TIMER(PipelineXSinkLocalState::profile(), "PublishRuntimeFilterTime"); - _runtime_filter_compute_timer = ADD_TIMER(PipelineXSinkLocalState::profile(), - "RuntimeFilterComputeTime"); - _runtime_filter_init_timer = - ADD_TIMER(PipelineXSinkLocalState::profile(), "RuntimeFilterInitTime"); + _runtime_filter_compute_timer = + ADD_TIMER(PipelineXSinkLocalState::profile(), "BuildRuntimeFilterTime"); return Status::OK(); } @@ -82,6 +78,8 @@ JoinBuildSinkOperatorX::JoinBuildSinkOperatorX(ObjectPool* pool, _short_circuit_for_null_in_build_side(_join_op == TJoinOp::NULL_AWARE_LEFT_ANTI_JOIN && !_is_mark_join), _runtime_filter_descs(tnode.runtime_filters) { + DataSinkOperatorX::_is_serial_operator = + tnode.__isset.is_serial_operator && tnode.is_serial_operator; _init_join_op(); if (_is_mark_join) { DCHECK(_join_op == TJoinOp::LEFT_ANTI_JOIN || _join_op == TJoinOp::LEFT_SEMI_JOIN || diff --git a/be/src/pipeline/exec/join_build_sink_operator.h b/be/src/pipeline/exec/join_build_sink_operator.h index 714e0c34190678..9d79a97397ff77 100644 --- a/be/src/pipeline/exec/join_build_sink_operator.h +++ b/be/src/pipeline/exec/join_build_sink_operator.h @@ -39,10 +39,8 @@ class JoinBuildSinkLocalState : public PipelineXSinkLocalState template friend class JoinBuildSinkOperatorX; - RuntimeProfile::Counter* _build_rows_counter = nullptr; RuntimeProfile::Counter* _publish_runtime_filter_timer = nullptr; RuntimeProfile::Counter* _runtime_filter_compute_timer = nullptr; - RuntimeProfile::Counter* _runtime_filter_init_timer = nullptr; std::vector> _runtime_filters; }; diff --git a/be/src/pipeline/exec/join_probe_operator.cpp b/be/src/pipeline/exec/join_probe_operator.cpp index 05c62544d2b7ce..cc20fa744313f4 100644 --- a/be/src/pipeline/exec/join_probe_operator.cpp +++ 
b/be/src/pipeline/exec/join_probe_operator.cpp @@ -29,11 +29,10 @@ Status JoinProbeLocalState::init(RuntimeState* state, LocalStateInfo& info) { RETURN_IF_ERROR(Base::init(state, info)); - _probe_timer = ADD_TIMER(Base::profile(), "ProbeTime"); _join_filter_timer = ADD_TIMER(Base::profile(), "JoinFilterTimer"); _build_output_block_timer = ADD_TIMER(Base::profile(), "BuildOutputBlock"); _probe_rows_counter = ADD_COUNTER_WITH_LEVEL(Base::profile(), "ProbeRows", TUnit::UNIT, 1); - + _finish_probe_phase_timer = ADD_TIMER(Base::profile(), "FinishProbePhaseTime"); return Status::OK(); } @@ -220,6 +219,7 @@ JoinProbeOperatorX::JoinProbeOperatorX(ObjectPool* pool, const T : true) ) { + Base::_is_serial_operator = tnode.__isset.is_serial_operator && tnode.is_serial_operator; if (tnode.__isset.hash_join_node) { _intermediate_row_desc.reset(new RowDescriptor( descs, tnode.hash_join_node.vintermediate_tuple_id_list, diff --git a/be/src/pipeline/exec/join_probe_operator.h b/be/src/pipeline/exec/join_probe_operator.h index 3f68c73d04b161..078806cea4fc5a 100644 --- a/be/src/pipeline/exec/join_probe_operator.h +++ b/be/src/pipeline/exec/join_probe_operator.h @@ -49,10 +49,10 @@ class JoinProbeLocalState : public PipelineXLocalState { size_t _mark_column_id = -1; - RuntimeProfile::Counter* _probe_timer = nullptr; RuntimeProfile::Counter* _probe_rows_counter = nullptr; RuntimeProfile::Counter* _join_filter_timer = nullptr; RuntimeProfile::Counter* _build_output_block_timer = nullptr; + RuntimeProfile::Counter* _finish_probe_phase_timer = nullptr; std::unique_ptr _child_block = nullptr; bool _child_eos = false; diff --git a/be/src/pipeline/exec/memory_scratch_sink_operator.cpp b/be/src/pipeline/exec/memory_scratch_sink_operator.cpp index 69e30791c139af..b9f18c43e1e239 100644 --- a/be/src/pipeline/exec/memory_scratch_sink_operator.cpp +++ b/be/src/pipeline/exec/memory_scratch_sink_operator.cpp @@ -33,6 +33,9 @@ Status MemoryScratchSinkLocalState::init(RuntimeState* state, 
LocalSinkStateInfo RETURN_IF_ERROR(Base::init(state, info)); SCOPED_TIMER(exec_time_counter()); SCOPED_TIMER(_init_timer); + _get_arrow_schema_timer = ADD_TIMER(_profile, "GetArrowSchemaTime"); + _convert_block_to_arrow_batch_timer = ADD_TIMER(_profile, "ConvertBlockToArrowBatchTime"); + _evaluation_timer = ADD_TIMER(_profile, "EvaluationTime"); // create queue state->exec_env()->result_queue_mgr()->create_queue(state->fragment_instance_id(), &_queue); @@ -92,15 +95,24 @@ Status MemoryScratchSinkOperatorX::sink(RuntimeState* state, vectorized::Block* // Exec vectorized expr here to speed up, block.rows() == 0 means expr exec // failed, just return the error status vectorized::Block block; - RETURN_IF_ERROR(vectorized::VExprContext::get_output_block_after_execute_exprs( - local_state._output_vexpr_ctxs, *input_block, &block)); + { + SCOPED_TIMER(local_state._evaluation_timer); + RETURN_IF_ERROR(vectorized::VExprContext::get_output_block_after_execute_exprs( + local_state._output_vexpr_ctxs, *input_block, &block)); + } std::shared_ptr block_arrow_schema; - // After expr executed, use recaculated schema as final schema - RETURN_IF_ERROR(convert_block_arrow_schema(block, &block_arrow_schema)); - RETURN_IF_ERROR(convert_to_arrow_batch(block, block_arrow_schema, arrow::default_memory_pool(), - &result, _timezone_obj)); + { + SCOPED_TIMER(local_state._get_arrow_schema_timer); + // After expr executed, use recaculated schema as final schema + RETURN_IF_ERROR(get_arrow_schema(block, &block_arrow_schema)); + } + { + SCOPED_TIMER(local_state._convert_block_to_arrow_batch_timer); + RETURN_IF_ERROR(convert_to_arrow_batch( + block, block_arrow_schema, arrow::default_memory_pool(), &result, _timezone_obj)); + } local_state._queue->blocking_put(result); - if (local_state._queue->size() < 10) { + if (local_state._queue->size() > config::max_memory_sink_batch_count) { local_state._queue_dependency->block(); } return Status::OK(); diff --git 
a/be/src/pipeline/exec/memory_scratch_sink_operator.h b/be/src/pipeline/exec/memory_scratch_sink_operator.h index c2cd78c7cd5aee..75372500d9bffc 100644 --- a/be/src/pipeline/exec/memory_scratch_sink_operator.h +++ b/be/src/pipeline/exec/memory_scratch_sink_operator.h @@ -45,6 +45,9 @@ class MemoryScratchSinkLocalState final : public PipelineXSinkLocalState _queue_dependency = nullptr; + RuntimeProfile::Counter* _get_arrow_schema_timer = nullptr; + RuntimeProfile::Counter* _convert_block_to_arrow_batch_timer = nullptr; + RuntimeProfile::Counter* _evaluation_timer = nullptr; }; class MemoryScratchSinkOperatorX final : public DataSinkOperatorX { diff --git a/be/src/pipeline/exec/multi_cast_data_stream_source.cpp b/be/src/pipeline/exec/multi_cast_data_stream_source.cpp index 1028bca7ce2ca4..304e8e96f0c79c 100644 --- a/be/src/pipeline/exec/multi_cast_data_stream_source.cpp +++ b/be/src/pipeline/exec/multi_cast_data_stream_source.cpp @@ -40,6 +40,9 @@ Status MultiCastDataStreamSourceLocalState::init(RuntimeState* state, LocalState auto& p = _parent->cast(); _shared_state->multi_cast_data_streamer->set_dep_by_sender_idx(p._consumer_id, _dependency); _wait_for_rf_timer = ADD_TIMER(_runtime_profile, "WaitForRuntimeFilter"); + _filter_timer = ADD_TIMER(_runtime_profile, "FilterTime"); + _get_data_timer = ADD_TIMER(_runtime_profile, "GetDataTime"); + _materialize_data_timer = ADD_TIMER(_runtime_profile, "MaterializeDataTime"); // init profile for runtime filter RuntimeFilterConsumer::_init_profile(profile()); init_runtime_filter_dependency(_filter_dependencies, p.operator_id(), p.node_id(), @@ -86,15 +89,19 @@ Status MultiCastDataStreamerSourceOperatorX::get_block(RuntimeState* state, if (!local_state._output_expr_contexts.empty()) { output_block = &tmp_block; } - RETURN_IF_ERROR(local_state._shared_state->multi_cast_data_streamer->pull(_consumer_id, - output_block, eos)); - + { + SCOPED_TIMER(local_state._get_data_timer); + 
RETURN_IF_ERROR(local_state._shared_state->multi_cast_data_streamer->pull( + _consumer_id, output_block, eos)); + } if (!local_state._conjuncts.empty()) { + SCOPED_TIMER(local_state._filter_timer); RETURN_IF_ERROR(vectorized::VExprContext::filter_block(local_state._conjuncts, output_block, output_block->columns())); } if (!local_state._output_expr_contexts.empty() && output_block->rows() > 0) { + SCOPED_TIMER(local_state._materialize_data_timer); RETURN_IF_ERROR(vectorized::VExprContext::get_output_block_after_execute_exprs( local_state._output_expr_contexts, *output_block, block, true)); vectorized::materialize_block_inplace(*block); diff --git a/be/src/pipeline/exec/multi_cast_data_stream_source.h b/be/src/pipeline/exec/multi_cast_data_stream_source.h index 76472f3ce85e83..b37c4f7e3a8e05 100644 --- a/be/src/pipeline/exec/multi_cast_data_stream_source.h +++ b/be/src/pipeline/exec/multi_cast_data_stream_source.h @@ -67,6 +67,9 @@ class MultiCastDataStreamSourceLocalState final : public PipelineXLocalState> _filter_dependencies; RuntimeProfile::Counter* _wait_for_rf_timer = nullptr; + RuntimeProfile::Counter* _filter_timer = nullptr; + RuntimeProfile::Counter* _get_data_timer = nullptr; + RuntimeProfile::Counter* _materialize_data_timer = nullptr; }; class MultiCastDataStreamerSourceOperatorX final diff --git a/be/src/pipeline/exec/nested_loop_join_build_operator.cpp b/be/src/pipeline/exec/nested_loop_join_build_operator.cpp index 793a37c7396a61..6c1644178228c3 100644 --- a/be/src/pipeline/exec/nested_loop_join_build_operator.cpp +++ b/be/src/pipeline/exec/nested_loop_join_build_operator.cpp @@ -139,7 +139,6 @@ Status NestedLoopJoinBuildSinkOperatorX::sink(doris::RuntimeState* state, vector } if (eos) { - COUNTER_UPDATE(local_state._build_rows_counter, local_state._build_rows); RuntimeFilterBuild rf_ctx(&local_state); RETURN_IF_ERROR(rf_ctx(state)); diff --git a/be/src/pipeline/exec/nested_loop_join_build_operator.h 
b/be/src/pipeline/exec/nested_loop_join_build_operator.h index f2ca259754b661..d6e72799f97d92 100644 --- a/be/src/pipeline/exec/nested_loop_join_build_operator.h +++ b/be/src/pipeline/exec/nested_loop_join_build_operator.h @@ -76,8 +76,8 @@ class NestedLoopJoinBuildSinkOperatorX final if (_join_op == TJoinOp::NULL_AWARE_LEFT_ANTI_JOIN) { return {ExchangeType::NOOP}; } - return _child->ignore_data_distribution() ? DataDistribution(ExchangeType::BROADCAST) - : DataDistribution(ExchangeType::NOOP); + return _child->is_serial_operator() ? DataDistribution(ExchangeType::BROADCAST) + : DataDistribution(ExchangeType::NOOP); } private: diff --git a/be/src/pipeline/exec/nested_loop_join_probe_operator.cpp b/be/src/pipeline/exec/nested_loop_join_probe_operator.cpp index 9546ed8df56671..51b3aed079b3b9 100644 --- a/be/src/pipeline/exec/nested_loop_join_probe_operator.cpp +++ b/be/src/pipeline/exec/nested_loop_join_probe_operator.cpp @@ -42,6 +42,10 @@ Status NestedLoopJoinProbeLocalState::init(RuntimeState* state, LocalStateInfo& SCOPED_TIMER(exec_time_counter()); SCOPED_TIMER(_init_timer); _loop_join_timer = ADD_TIMER(profile(), "LoopGenerateJoin"); + _output_temp_blocks_timer = ADD_TIMER(profile(), "OutputTempBlocksTime"); + _update_visited_flags_timer = ADD_TIMER(profile(), "UpdateVisitedFlagsTime"); + _join_conjuncts_evaluation_timer = ADD_TIMER(profile(), "JoinConjunctsEvaluationTime"); + _filtered_by_join_conjuncts_timer = ADD_TIMER(profile(), "FilteredByJoinConjunctsTime"); return Status::OK(); } @@ -164,23 +168,26 @@ Status NestedLoopJoinProbeLocalState::generate_join_block_data(RuntimeState* sta _process_left_child_block(_join_block, now_process_build_block); } - if constexpr (set_probe_side_flag) { - RETURN_IF_ERROR( - (_do_filtering_and_update_visited_flags( - &_join_block, !p._is_left_semi_anti))); - _update_additional_flags(&_join_block); - // If this join operation is left outer join or full outer join, when - // `_left_side_process_count`, means all rows from 
build - // side have been joined with _left_side_process_count, we should output current - // probe row with null from build side. - if (_left_side_process_count) { - _finalize_current_phase( - _join_block, state->batch_size()); + { + SCOPED_TIMER(_finish_probe_phase_timer); + if constexpr (set_probe_side_flag) { + RETURN_IF_ERROR( + (_do_filtering_and_update_visited_flags( + &_join_block, !p._is_left_semi_anti))); + _update_additional_flags(&_join_block); + // If this join operation is left outer join or full outer join, when + // `_left_side_process_count`, means all rows from build + // side have been joined with _left_side_process_count, we should output current + // probe row with null from build side. + if (_left_side_process_count) { + _finalize_current_phase( + _join_block, state->batch_size()); + } + } else if (_left_side_process_count && p._is_mark_join && + _shared_state->build_blocks.empty()) { + _append_left_data_with_null(_join_block); } - } else if (_left_side_process_count && p._is_mark_join && - _shared_state->build_blocks.empty()) { - _append_left_data_with_null(_join_block); } } @@ -373,6 +380,7 @@ void NestedLoopJoinProbeLocalState::_append_left_data_with_null(vectorized::Bloc void NestedLoopJoinProbeLocalState::_process_left_child_block( vectorized::Block& block, const vectorized::Block& now_process_build_block) const { + SCOPED_TIMER(_output_temp_blocks_timer); auto& p = _parent->cast(); auto dst_columns = block.mutate_columns(); const int max_added_rows = now_process_build_block.rows(); @@ -480,6 +488,7 @@ Status NestedLoopJoinProbeOperatorX::push(doris::RuntimeState* state, vectorized set_build_side_flag, set_probe_side_flag>( state, join_op_variants); }; + SCOPED_TIMER(local_state._loop_join_timer); RETURN_IF_ERROR( std::visit(func, local_state._shared_state->join_op_variants, vectorized::make_bool_variant(_match_all_build || _is_right_semi_anti), diff --git a/be/src/pipeline/exec/nested_loop_join_probe_operator.h 
b/be/src/pipeline/exec/nested_loop_join_probe_operator.h index f46a99306a5713..0d1d6510ce61a8 100644 --- a/be/src/pipeline/exec/nested_loop_join_probe_operator.h +++ b/be/src/pipeline/exec/nested_loop_join_probe_operator.h @@ -63,42 +63,47 @@ class NestedLoopJoinProbeLocalState final void _do_filtering_and_update_visited_flags_impl(vectorized::Block* block, int column_to_keep, int build_block_idx, int processed_blocks_num, bool materialize, Filter& filter) { - if constexpr (SetBuildSideFlag) { - for (size_t i = 0; i < processed_blocks_num; i++) { - auto& build_side_flag = - assert_cast( - _shared_state->build_side_visited_flags[build_block_idx].get()) - ->get_data(); - auto* __restrict build_side_flag_data = build_side_flag.data(); - auto cur_sz = build_side_flag.size(); - const size_t offset = _build_offset_stack.top(); - _build_offset_stack.pop(); - for (size_t j = 0; j < cur_sz; j++) { - build_side_flag_data[j] |= filter[offset + j]; + { + SCOPED_TIMER(_update_visited_flags_timer); + if constexpr (SetBuildSideFlag) { + for (size_t i = 0; i < processed_blocks_num; i++) { + auto& build_side_flag = + assert_cast( + _shared_state->build_side_visited_flags[build_block_idx].get()) + ->get_data(); + auto* __restrict build_side_flag_data = build_side_flag.data(); + auto cur_sz = build_side_flag.size(); + const size_t offset = _build_offset_stack.top(); + _build_offset_stack.pop(); + for (size_t j = 0; j < cur_sz; j++) { + build_side_flag_data[j] |= filter[offset + j]; + } + build_block_idx = build_block_idx == 0 ? _shared_state->build_blocks.size() - 1 + : build_block_idx - 1; } - build_block_idx = build_block_idx == 0 ? _shared_state->build_blocks.size() - 1 - : build_block_idx - 1; } - } - if constexpr (SetProbeSideFlag) { - int end = filter.size(); - for (int i = _left_block_pos == _child_block->rows() ? 
_left_block_pos - 1 - : _left_block_pos; - i >= _left_block_start_pos; i--) { - int offset = 0; - if (!_probe_offset_stack.empty()) { - offset = _probe_offset_stack.top(); - _probe_offset_stack.pop(); - } - if (!_cur_probe_row_visited_flags[i]) { - _cur_probe_row_visited_flags[i] = - simd::contain_byte(filter.data() + offset, end - offset, 1) ? 1 - : 0; + if constexpr (SetProbeSideFlag) { + int end = filter.size(); + for (int i = _left_block_pos == _child_block->rows() ? _left_block_pos - 1 + : _left_block_pos; + i >= _left_block_start_pos; i--) { + int offset = 0; + if (!_probe_offset_stack.empty()) { + offset = _probe_offset_stack.top(); + _probe_offset_stack.pop(); + } + if (!_cur_probe_row_visited_flags[i]) { + _cur_probe_row_visited_flags[i] = + simd::contain_byte(filter.data() + offset, end - offset, 1) + ? 1 + : 0; + } + end = offset; } - end = offset; } } if (materialize) { + SCOPED_TIMER(_filtered_by_join_conjuncts_timer); vectorized::Block::filter_block_internal(block, filter, column_to_keep); } else { CLEAR_BLOCK @@ -119,8 +124,11 @@ class NestedLoopJoinProbeLocalState final if (LIKELY(!_join_conjuncts.empty() && block->rows() > 0)) { vectorized::IColumn::Filter filter(block->rows(), 1); bool can_filter_all = false; - RETURN_IF_ERROR(vectorized::VExprContext::execute_conjuncts( - _join_conjuncts, nullptr, IgnoreNull, block, &filter, &can_filter_all)); + { + SCOPED_TIMER(_join_conjuncts_evaluation_timer); + RETURN_IF_ERROR(vectorized::VExprContext::execute_conjuncts( + _join_conjuncts, nullptr, IgnoreNull, block, &filter, &can_filter_all)); + } if (can_filter_all) { CLEAR_BLOCK @@ -179,6 +187,10 @@ class NestedLoopJoinProbeLocalState final vectorized::VExprContextSPtrs _join_conjuncts; RuntimeProfile::Counter* _loop_join_timer = nullptr; + RuntimeProfile::Counter* _output_temp_blocks_timer = nullptr; + RuntimeProfile::Counter* _update_visited_flags_timer = nullptr; + RuntimeProfile::Counter* _join_conjuncts_evaluation_timer = nullptr; + 
RuntimeProfile::Counter* _filtered_by_join_conjuncts_timer = nullptr; }; class NestedLoopJoinProbeOperatorX final @@ -197,7 +209,9 @@ class NestedLoopJoinProbeOperatorX final } DataDistribution required_data_distribution() const override { - if (_join_op == TJoinOp::NULL_AWARE_LEFT_ANTI_JOIN) { + if (_join_op == TJoinOp::NULL_AWARE_LEFT_ANTI_JOIN || + _join_op == TJoinOp::RIGHT_OUTER_JOIN || _join_op == TJoinOp::RIGHT_ANTI_JOIN || + _join_op == TJoinOp::RIGHT_SEMI_JOIN || _join_op == TJoinOp::FULL_OUTER_JOIN) { return {ExchangeType::NOOP}; } return {ExchangeType::ADAPTIVE_PASSTHROUGH}; diff --git a/be/src/pipeline/exec/olap_scan_operator.cpp b/be/src/pipeline/exec/olap_scan_operator.cpp index aa6f0ed49f0478..b27402ac27e7a8 100644 --- a/be/src/pipeline/exec/olap_scan_operator.cpp +++ b/be/src/pipeline/exec/olap_scan_operator.cpp @@ -29,15 +29,13 @@ #include "olap/tablet_manager.h" #include "pipeline/common/runtime_filter_consumer.h" #include "pipeline/exec/scan_operator.h" +#include "pipeline/query_cache/query_cache.h" #include "service/backend_options.h" #include "util/to_string.h" #include "vec/exec/scan/new_olap_scanner.h" -#include "vec/exec/scan/vscan_node.h" -#include "vec/exprs/vcompound_pred.h" #include "vec/exprs/vectorized_fn_call.h" #include "vec/exprs/vexpr.h" #include "vec/exprs/vexpr_context.h" -#include "vec/exprs/vin_predicate.h" #include "vec/exprs/vslot_ref.h" #include "vec/functions/in.h" @@ -45,6 +43,9 @@ namespace doris::pipeline { Status OlapScanLocalState::_init_profile() { RETURN_IF_ERROR(ScanLocalState::_init_profile()); + // Rows read from storage. + // Include the rows read from doris page cache. + _scan_rows = ADD_COUNTER(_runtime_profile, "ScanRows", TUnit::UNIT); // 1. 
init segment profile _segment_profile.reset(new RuntimeProfile("SegmentIterator")); _scanner_profile->add_child(_segment_profile.get(), true, nullptr); @@ -60,23 +61,20 @@ Status OlapScanLocalState::_init_profile() { _block_load_counter = ADD_COUNTER(_segment_profile, "BlocksLoad", TUnit::UNIT); _block_fetch_timer = ADD_TIMER(_scanner_profile, "BlockFetchTime"); _delete_bitmap_get_agg_timer = ADD_TIMER(_scanner_profile, "DeleteBitmapGetAggTime"); - _sync_rowset_timer = ADD_TIMER(_scanner_profile, "SyncRowsetTime"); - _raw_rows_counter = ADD_COUNTER(_segment_profile, "RawRowsRead", TUnit::UNIT); - _block_convert_timer = ADD_TIMER(_scanner_profile, "BlockConvertTime"); + if (config::is_cloud_mode()) { + _sync_rowset_timer = ADD_TIMER(_scanner_profile, "SyncRowsetTime"); + } _block_init_timer = ADD_TIMER(_segment_profile, "BlockInitTime"); _block_init_seek_timer = ADD_TIMER(_segment_profile, "BlockInitSeekTime"); _block_init_seek_counter = ADD_COUNTER(_segment_profile, "BlockInitSeekCount", TUnit::UNIT); - _block_conditions_filtered_timer = ADD_TIMER(_segment_profile, "BlockConditionsFilteredTime"); - _block_conditions_filtered_bf_timer = - ADD_TIMER(_segment_profile, "BlockConditionsFilteredBloomFilterTime"); + _segment_generate_row_range_timer = ADD_TIMER(_segment_profile, "GenerateRowRangeTime"); + _segment_generate_row_range_by_bf_timer = + ADD_TIMER(_segment_profile, "GenerateRowRangeByBloomFilterIndexTime"); _collect_iterator_merge_next_timer = ADD_TIMER(_segment_profile, "CollectIteratorMergeTime"); - _collect_iterator_normal_next_timer = ADD_TIMER(_segment_profile, "CollectIteratorNormalTime"); - _block_conditions_filtered_zonemap_timer = - ADD_TIMER(_segment_profile, "BlockConditionsFilteredZonemapTime"); - _block_conditions_filtered_zonemap_rp_timer = - ADD_TIMER(_segment_profile, "BlockConditionsFilteredZonemapRuntimePredicateTime"); - _block_conditions_filtered_dict_timer = - ADD_TIMER(_segment_profile, "BlockConditionsFilteredDictTime"); + 
_segment_generate_row_range_by_zonemap_timer = + ADD_TIMER(_segment_profile, "GenerateRowRangeByZoneMapIndexTime"); + _segment_generate_row_range_by_dict_timer = + ADD_TIMER(_segment_profile, "GenerateRowRangeByDictTime"); _rows_vec_cond_filtered_counter = ADD_COUNTER(_segment_profile, "RowsVectorPredFiltered", TUnit::UNIT); @@ -89,10 +87,11 @@ Status OlapScanLocalState::_init_profile() { _vec_cond_timer = ADD_TIMER(_segment_profile, "VectorPredEvalTime"); _short_cond_timer = ADD_TIMER(_segment_profile, "ShortPredEvalTime"); _expr_filter_timer = ADD_TIMER(_segment_profile, "ExprFilterEvalTime"); - _first_read_timer = ADD_TIMER(_segment_profile, "FirstReadTime"); - _second_read_timer = ADD_TIMER(_segment_profile, "SecondReadTime"); - _first_read_seek_timer = ADD_TIMER(_segment_profile, "FirstReadSeekTime"); - _first_read_seek_counter = ADD_COUNTER(_segment_profile, "FirstReadSeekCount", TUnit::UNIT); + _predicate_column_read_timer = ADD_TIMER(_segment_profile, "PredicateColumnReadTime"); + _non_predicate_column_read_timer = ADD_TIMER(_segment_profile, "NonPredicateColumnReadTime"); + _predicate_column_read_seek_timer = ADD_TIMER(_segment_profile, "PredicateColumnReadSeekTime"); + _predicate_column_read_seek_counter = + ADD_COUNTER(_segment_profile, "PredicateColumnReadSeekCount", TUnit::UNIT); _lazy_read_timer = ADD_TIMER(_segment_profile, "LazyReadTime"); _lazy_read_seek_timer = ADD_TIMER(_segment_profile, "LazyReadSeekTime"); @@ -102,7 +101,7 @@ Status OlapScanLocalState::_init_profile() { _stats_filtered_counter = ADD_COUNTER(_segment_profile, "RowsStatsFiltered", TUnit::UNIT); _stats_rp_filtered_counter = - ADD_COUNTER(_segment_profile, "RowsZonemapRuntimePredicateFiltered", TUnit::UNIT); + ADD_COUNTER(_segment_profile, "RowsZoneMapRuntimePredicateFiltered", TUnit::UNIT); _bf_filtered_counter = ADD_COUNTER(_segment_profile, "RowsBloomFilterFiltered", TUnit::UNIT); _dict_filtered_counter = ADD_COUNTER(_segment_profile, "RowsDictFiltered", TUnit::UNIT); 
_del_filtered_counter = ADD_COUNTER(_scanner_profile, "RowsDelFiltered", TUnit::UNIT); @@ -133,8 +132,6 @@ Status OlapScanLocalState::_init_profile() { ADD_TIMER(_segment_profile, "InvertedIndexQueryNullBitmapTime"); _inverted_index_query_bitmap_copy_timer = ADD_TIMER(_segment_profile, "InvertedIndexQueryBitmapCopyTime"); - _inverted_index_query_bitmap_op_timer = - ADD_TIMER(_segment_profile, "InvertedIndexQueryBitmapOpTime"); _inverted_index_searcher_open_timer = ADD_TIMER(_segment_profile, "InvertedIndexSearcherOpenTime"); _inverted_index_searcher_search_timer = @@ -146,8 +143,7 @@ Status OlapScanLocalState::_init_profile() { _inverted_index_downgrade_count_counter = ADD_COUNTER(_segment_profile, "InvertedIndexDowngradeCount", TUnit::UNIT); - _output_index_result_column_timer = ADD_TIMER(_segment_profile, "OutputIndexResultColumnTimer"); - + _output_index_result_column_timer = ADD_TIMER(_segment_profile, "OutputIndexResultColumnTime"); _filtered_segment_counter = ADD_COUNTER(_segment_profile, "NumSegmentFiltered", TUnit::UNIT); _total_segment_counter = ADD_COUNTER(_segment_profile, "NumSegmentTotal", TUnit::UNIT); _tablet_counter = ADD_COUNTER(_runtime_profile, "TabletNum", TUnit::UNIT); @@ -285,8 +281,9 @@ Status OlapScanLocalState::_init_scanners(std::list* s scan_range->version.data() + scan_range->version.size(), version); tablets.emplace_back(std::move(tablet), version); } - int64_t duration_ns = 0; + if (config::is_cloud_mode()) { + int64_t duration_ns = 0; SCOPED_RAW_TIMER(&duration_ns); std::vector> tasks; tasks.reserve(_scan_ranges.size()); @@ -296,8 +293,8 @@ Status OlapScanLocalState::_init_scanners(std::list* s }); } RETURN_IF_ERROR(cloud::bthread_fork_join(tasks, 10)); + _sync_rowset_timer->update(duration_ns); } - _sync_rowset_timer->update(duration_ns); if (enable_parallel_scan && !p._should_run_serial && !has_cpu_limit && p._push_down_agg_type == TPushAggOp::NONE && @@ -338,25 +335,6 @@ Status OlapScanLocalState::_init_scanners(std::list* s int 
scanners_per_tablet = std::max(1, 64 / (int)_scan_ranges.size()); - auto build_new_scanner = [&](BaseTabletSPtr tablet, int64_t version, - const std::vector& key_ranges) { - COUNTER_UPDATE(_key_range_counter, key_ranges.size()); - auto scanner = vectorized::NewOlapScanner::create_shared( - this, vectorized::NewOlapScanner::Params { - state(), - _scanner_profile.get(), - key_ranges, - std::move(tablet), - version, - {}, - p._limit, - p._olap_scan_node.is_preaggregation, - }); - RETURN_IF_ERROR(scanner->prepare(state(), _conjuncts)); - scanners->push_back(std::move(scanner)); - return Status::OK(); - }; - for (auto& scan_range : _scan_ranges) { auto tablet = DORIS_TRY(ExecEnv::get_tablet(scan_range->tablet_id)); int64_t version = 0; @@ -382,7 +360,21 @@ Status OlapScanLocalState::_init_scanners(std::list* s ++j, ++i) { scanner_ranges.push_back((*ranges)[i].get()); } - RETURN_IF_ERROR(build_new_scanner(tablet, version, scanner_ranges)); + + COUNTER_UPDATE(_key_range_counter, scanner_ranges.size()); + auto scanner = vectorized::NewOlapScanner::create_shared( + this, vectorized::NewOlapScanner::Params { + state(), + _scanner_profile.get(), + scanner_ranges, + std::move(tablet), + version, + {}, + p._limit, + p._olap_scan_node.is_preaggregation, + }); + RETURN_IF_ERROR(scanner->prepare(state(), _conjuncts)); + scanners->push_back(std::move(scanner)); } } @@ -395,10 +387,25 @@ TOlapScanNode& OlapScanLocalState::olap_scan_node() const { void OlapScanLocalState::set_scan_ranges(RuntimeState* state, const std::vector& scan_ranges) { - for (auto& scan_range : scan_ranges) { - DCHECK(scan_range.scan_range.__isset.palo_scan_range); - _scan_ranges.emplace_back(new TPaloScanRange(scan_range.scan_range.palo_scan_range)); - COUNTER_UPDATE(_tablet_counter, 1); + const auto& cache_param = _parent->cast()._cache_param; + bool hit_cache = false; + if (!cache_param.digest.empty() && !cache_param.force_refresh_query_cache) { + std::string cache_key; + int64_t version = 0; + auto status = 
QueryCache::build_cache_key(scan_ranges, cache_param, &cache_key, &version); + if (!status.ok()) { + throw doris::Exception(doris::ErrorCode::INTERNAL_ERROR, status.msg()); + } + doris::QueryCacheHandle handle; + hit_cache = QueryCache::instance()->lookup(cache_key, version, &handle); + } + + if (!hit_cache) { + for (auto& scan_range : scan_ranges) { + DCHECK(scan_range.scan_range.__isset.palo_scan_range); + _scan_ranges.emplace_back(new TPaloScanRange(scan_range.scan_range.palo_scan_range)); + COUNTER_UPDATE(_tablet_counter, 1); + } } } @@ -572,9 +579,11 @@ void OlapScanLocalState::add_filter_info(int id, const PredicateFilterInfo& upda } OlapScanOperatorX::OlapScanOperatorX(ObjectPool* pool, const TPlanNode& tnode, int operator_id, - const DescriptorTbl& descs, int parallel_tasks) + const DescriptorTbl& descs, int parallel_tasks, + const TQueryCacheParam& param) : ScanOperatorX(pool, tnode, operator_id, descs, parallel_tasks), - _olap_scan_node(tnode.olap_scan_node) { + _olap_scan_node(tnode.olap_scan_node), + _cache_param(param) { _output_tuple_id = tnode.olap_scan_node.tuple_id; if (_olap_scan_node.__isset.sort_info && _olap_scan_node.__isset.sort_limit) { _limit_per_scanner = _olap_scan_node.sort_limit; diff --git a/be/src/pipeline/exec/olap_scan_operator.h b/be/src/pipeline/exec/olap_scan_operator.h index 6a03a46e65ef67..69b4f3701b32d2 100644 --- a/be/src/pipeline/exec/olap_scan_operator.h +++ b/be/src/pipeline/exec/olap_scan_operator.h @@ -97,11 +97,8 @@ class OlapScanLocalState final : public ScanLocalState { std::unique_ptr _segment_profile; - RuntimeProfile::Counter* _num_disks_accessed_counter = nullptr; - RuntimeProfile::Counter* _tablet_counter = nullptr; RuntimeProfile::Counter* _key_range_counter = nullptr; - RuntimeProfile::Counter* _rows_pushed_cond_filtered_counter = nullptr; RuntimeProfile::Counter* _reader_init_timer = nullptr; RuntimeProfile::Counter* _scanner_init_timer = nullptr; RuntimeProfile::Counter* _process_conjunct_timer = nullptr; @@ 
-110,7 +107,6 @@ class OlapScanLocalState final : public ScanLocalState { RuntimeProfile::Counter* _read_compressed_counter = nullptr; RuntimeProfile::Counter* _decompressor_timer = nullptr; RuntimeProfile::Counter* _read_uncompressed_counter = nullptr; - RuntimeProfile::Counter* _raw_rows_counter = nullptr; RuntimeProfile::Counter* _rows_vec_cond_filtered_counter = nullptr; RuntimeProfile::Counter* _rows_short_circuit_cond_filtered_counter = nullptr; @@ -140,23 +136,19 @@ class OlapScanLocalState final : public ScanLocalState { RuntimeProfile::Counter* _block_init_timer = nullptr; RuntimeProfile::Counter* _block_init_seek_timer = nullptr; RuntimeProfile::Counter* _block_init_seek_counter = nullptr; - RuntimeProfile::Counter* _block_conditions_filtered_timer = nullptr; - RuntimeProfile::Counter* _block_conditions_filtered_bf_timer = nullptr; + RuntimeProfile::Counter* _segment_generate_row_range_timer = nullptr; + RuntimeProfile::Counter* _segment_generate_row_range_by_bf_timer = nullptr; RuntimeProfile::Counter* _collect_iterator_merge_next_timer = nullptr; - RuntimeProfile::Counter* _collect_iterator_normal_next_timer = nullptr; - RuntimeProfile::Counter* _block_conditions_filtered_zonemap_timer = nullptr; - RuntimeProfile::Counter* _block_conditions_filtered_zonemap_rp_timer = nullptr; - RuntimeProfile::Counter* _block_conditions_filtered_dict_timer = nullptr; - RuntimeProfile::Counter* _first_read_timer = nullptr; - RuntimeProfile::Counter* _second_read_timer = nullptr; - RuntimeProfile::Counter* _first_read_seek_timer = nullptr; - RuntimeProfile::Counter* _first_read_seek_counter = nullptr; + RuntimeProfile::Counter* _segment_generate_row_range_by_zonemap_timer = nullptr; + RuntimeProfile::Counter* _segment_generate_row_range_by_dict_timer = nullptr; + RuntimeProfile::Counter* _predicate_column_read_timer = nullptr; + RuntimeProfile::Counter* _non_predicate_column_read_timer = nullptr; + RuntimeProfile::Counter* _predicate_column_read_seek_timer = nullptr; + 
RuntimeProfile::Counter* _predicate_column_read_seek_counter = nullptr; RuntimeProfile::Counter* _lazy_read_timer = nullptr; RuntimeProfile::Counter* _lazy_read_seek_timer = nullptr; RuntimeProfile::Counter* _lazy_read_seek_counter = nullptr; - RuntimeProfile::Counter* _block_convert_timer = nullptr; - // total pages read // used by segment v2 RuntimeProfile::Counter* _total_pages_num_counter = nullptr; @@ -176,7 +168,6 @@ class OlapScanLocalState final : public ScanLocalState { RuntimeProfile::Counter* _inverted_index_query_cache_miss_counter = nullptr; RuntimeProfile::Counter* _inverted_index_query_timer = nullptr; RuntimeProfile::Counter* _inverted_index_query_bitmap_copy_timer = nullptr; - RuntimeProfile::Counter* _inverted_index_query_bitmap_op_timer = nullptr; RuntimeProfile::Counter* _inverted_index_searcher_open_timer = nullptr; RuntimeProfile::Counter* _inverted_index_searcher_search_timer = nullptr; RuntimeProfile::Counter* _inverted_index_searcher_cache_hit_counter = nullptr; @@ -198,11 +189,13 @@ class OlapScanLocalState final : public ScanLocalState { class OlapScanOperatorX final : public ScanOperatorX { public: OlapScanOperatorX(ObjectPool* pool, const TPlanNode& tnode, int operator_id, - const DescriptorTbl& descs, int parallel_tasks); + const DescriptorTbl& descs, int parallel_tasks, + const TQueryCacheParam& cache_param); private: friend class OlapScanLocalState; TOlapScanNode _olap_scan_node; + TQueryCacheParam _cache_param; }; } // namespace doris::pipeline diff --git a/be/src/pipeline/exec/operator.cpp b/be/src/pipeline/exec/operator.cpp index d65769254b9dfc..68ddd2500195b3 100644 --- a/be/src/pipeline/exec/operator.cpp +++ b/be/src/pipeline/exec/operator.cpp @@ -17,7 +17,6 @@ #include "operator.h" -#include "common/logging.h" #include "common/status.h" #include "pipeline/dependency.h" #include "pipeline/exec/aggregation_sink_operator.h" @@ -25,6 +24,8 @@ #include "pipeline/exec/analytic_sink_operator.h" #include 
"pipeline/exec/analytic_source_operator.h" #include "pipeline/exec/assert_num_rows_operator.h" +#include "pipeline/exec/cache_sink_operator.h" +#include "pipeline/exec/cache_source_operator.h" #include "pipeline/exec/datagen_operator.h" #include "pipeline/exec/distinct_streaming_aggregation_operator.h" #include "pipeline/exec/empty_set_operator.h" @@ -73,6 +74,7 @@ #include "pipeline/exec/union_source_operator.h" #include "pipeline/local_exchange/local_exchange_sink_operator.h" #include "pipeline/local_exchange/local_exchange_source_operator.h" +#include "pipeline/pipeline.h" #include "util/debug_util.h" #include "util/runtime_profile.h" #include "util/string_util.h" @@ -115,11 +117,16 @@ std::string PipelineXSinkLocalState::name_suffix() { }() + ")"; } -DataDistribution DataSinkOperatorXBase::required_data_distribution() const { - return _child && _child->ignore_data_distribution() +DataDistribution OperatorBase::required_data_distribution() const { + return _child && _child->is_serial_operator() && !is_source() ? 
DataDistribution(ExchangeType::PASSTHROUGH) : DataDistribution(ExchangeType::NOOP); } + +bool OperatorBase::require_shuffled_data_distribution() const { + return Pipeline::is_hash_exchange(required_data_distribution().distribution_type); +} + const RowDescriptor& OperatorBase::row_desc() const { return _child->row_desc(); } @@ -140,8 +147,9 @@ std::string PipelineXSinkLocalState::debug_string(int indentatio std::string OperatorXBase::debug_string(int indentation_level) const { fmt::memory_buffer debug_string_buffer; - fmt::format_to(debug_string_buffer, "{}{}: id={}, parallel_tasks={}", - std::string(indentation_level * 2, ' '), _op_name, node_id(), _parallel_tasks); + fmt::format_to(debug_string_buffer, "{}{}: id={}, parallel_tasks={}, _is_serial_operator={}", + std::string(indentation_level * 2, ' '), _op_name, node_id(), _parallel_tasks, + _is_serial_operator); return fmt::to_string(debug_string_buffer); } @@ -353,8 +361,8 @@ void PipelineXLocalStateBase::reached_limit(vectorized::Block* block, bool* eos) std::string DataSinkOperatorXBase::debug_string(int indentation_level) const { fmt::memory_buffer debug_string_buffer; - fmt::format_to(debug_string_buffer, "{}{}: id={}", std::string(indentation_level * 2, ' '), - _name, node_id()); + fmt::format_to(debug_string_buffer, "{}{}: id={}, _is_serial_operator={}", + std::string(indentation_level * 2, ' '), _name, node_id(), _is_serial_operator); return fmt::to_string(debug_string_buffer); } @@ -694,6 +702,7 @@ DECLARE_OPERATOR(SetSinkLocalState) DECLARE_OPERATOR(SetSinkLocalState) DECLARE_OPERATOR(PartitionedHashJoinSinkLocalState) DECLARE_OPERATOR(GroupCommitBlockSinkLocalState) +DECLARE_OPERATOR(CacheSinkLocalState) #undef DECLARE_OPERATOR @@ -725,6 +734,7 @@ DECLARE_OPERATOR(SchemaScanLocalState) DECLARE_OPERATOR(MetaScanLocalState) DECLARE_OPERATOR(LocalExchangeSourceLocalState) DECLARE_OPERATOR(PartitionedHashJoinProbeLocalState) +DECLARE_OPERATOR(CacheSourceLocalState) #undef DECLARE_OPERATOR @@ -754,6 +764,7 
@@ template class PipelineXSinkLocalState; template class PipelineXSinkLocalState; template class PipelineXSinkLocalState; template class PipelineXSinkLocalState; +template class PipelineXSinkLocalState; template class PipelineXLocalState; template class PipelineXLocalState; @@ -765,6 +776,7 @@ template class PipelineXLocalState; template class PipelineXLocalState; template class PipelineXLocalState; template class PipelineXLocalState; +template class PipelineXLocalState; template class PipelineXLocalState; template class PipelineXLocalState; template class PipelineXLocalState; diff --git a/be/src/pipeline/exec/operator.h b/be/src/pipeline/exec/operator.h index b848aea6e1ecd8..be54b7c4999840 100644 --- a/be/src/pipeline/exec/operator.h +++ b/be/src/pipeline/exec/operator.h @@ -101,6 +101,9 @@ class OperatorBase { return Status::OK(); } + // Operators need to be executed serially. (e.g. finalized agg without key) + [[nodiscard]] virtual bool is_serial_operator() const { return _is_serial_operator; } + [[nodiscard]] bool is_closed() const { return _is_closed; } virtual size_t revocable_mem_size(RuntimeState* state) const { return 0; } @@ -115,13 +118,15 @@ class OperatorBase { _followed_by_shuffled_operator = followed_by_shuffled_operator; } [[nodiscard]] virtual bool is_shuffled_operator() const { return false; } - [[nodiscard]] virtual bool require_shuffled_data_distribution() const { return false; } + [[nodiscard]] virtual DataDistribution required_data_distribution() const; + [[nodiscard]] virtual bool require_shuffled_data_distribution() const; protected: OperatorPtr _child = nullptr; bool _is_closed; bool _followed_by_shuffled_operator = false; + bool _is_serial_operator = false; }; class PipelineXLocalStateBase { @@ -344,6 +349,10 @@ class PipelineXSinkLocalStateBase { RuntimeProfile::Counter* rows_input_counter() { return _rows_input_counter; } RuntimeProfile::Counter* exec_time_counter() { return _exec_timer; } + RuntimeProfile::Counter* 
memory_used_counter() { return _memory_used_counter; } + RuntimeProfile::HighWaterMarkCounter* peak_memory_usage_counter() { + return _peak_memory_usage_counter; + } virtual std::vector dependencies() const { return {nullptr}; } // override in exchange sink , AsyncWriterSink @@ -443,7 +452,7 @@ class DataSinkOperatorXBase : public OperatorBase { Status init(const TDataSink& tsink) override; [[nodiscard]] virtual Status init(ExchangeType type, const int num_buckets, - const bool is_shuffled_hash_join, + const bool use_global_hash_shuffle, const std::map& shuffle_idx_to_instance_idx) { return Status::InternalError("init() is only implemented in local exchange!"); } @@ -478,7 +487,6 @@ class DataSinkOperatorXBase : public OperatorBase { } [[nodiscard]] virtual std::shared_ptr create_shared_state() const = 0; - [[nodiscard]] virtual DataDistribution required_data_distribution() const; Status close(RuntimeState* state) override { return Status::InternalError("Should not reach here!"); @@ -491,8 +499,6 @@ class DataSinkOperatorXBase : public OperatorBase { [[nodiscard]] bool is_sink() const override { return true; } - [[nodiscard]] bool is_source() const override { return false; } - static Status close(RuntimeState* state, Status exec_status) { auto result = state->get_sink_local_state_result(); if (!result) { @@ -647,19 +653,7 @@ class OperatorXBase : public OperatorBase { throw doris::Exception(ErrorCode::NOT_IMPLEMENTED_ERROR, _op_name); } [[nodiscard]] std::string get_name() const override { return _op_name; } - [[nodiscard]] virtual DataDistribution required_data_distribution() const { - return _child && _child->ignore_data_distribution() && !is_source() - ? DataDistribution(ExchangeType::PASSTHROUGH) - : DataDistribution(ExchangeType::NOOP); - } - [[nodiscard]] virtual bool ignore_data_distribution() const { - return _child ? 
_child->ignore_data_distribution() : _ignore_data_distribution; - } - [[nodiscard]] bool ignore_data_hash_distribution() const { - return _child ? _child->ignore_data_hash_distribution() : _ignore_data_distribution; - } [[nodiscard]] virtual bool need_more_input_data(RuntimeState* state) const { return true; } - void set_ignore_data_distribution() { _ignore_data_distribution = true; } Status open(RuntimeState* state) override; @@ -730,8 +724,6 @@ class OperatorXBase : public OperatorBase { bool has_output_row_desc() const { return _output_row_descriptor != nullptr; } - [[nodiscard]] bool is_source() const override { return false; } - [[nodiscard]] virtual Status get_block_after_projects(RuntimeState* state, vectorized::Block* block, bool* eos); @@ -741,6 +733,9 @@ class OperatorXBase : public OperatorBase { void set_parallel_tasks(int parallel_tasks) { _parallel_tasks = parallel_tasks; } int parallel_tasks() const { return _parallel_tasks; } + // To keep compatibility with older FE + void set_serial_operator() { _is_serial_operator = true; } + protected: template friend class PipelineXLocalState; @@ -774,7 +769,6 @@ class OperatorXBase : public OperatorBase { uint32_t _debug_point_count = 0; std::string _op_name; - bool _ignore_data_distribution = false; int _parallel_tasks = 0; //_keep_origin is used to avoid copying during projection, diff --git a/be/src/pipeline/exec/partitioned_aggregation_sink_operator.h b/be/src/pipeline/exec/partitioned_aggregation_sink_operator.h index 6b3a74c83df97c..15f6b22387a8e2 100644 --- a/be/src/pipeline/exec/partitioned_aggregation_sink_operator.h +++ b/be/src/pipeline/exec/partitioned_aggregation_sink_operator.h @@ -309,9 +309,6 @@ class PartitionedAggSinkOperatorX : public DataSinkOperatorXrequire_data_distribution(); } - bool require_shuffled_data_distribution() const override { - return _agg_sink_operator->require_shuffled_data_distribution(); - } Status set_child(OperatorPtr child) override { 
RETURN_IF_ERROR(DataSinkOperatorX::set_child(child)); diff --git a/be/src/pipeline/exec/partitioned_aggregation_source_operator.cpp b/be/src/pipeline/exec/partitioned_aggregation_source_operator.cpp index 48df5587198b08..655a6e19725a9b 100644 --- a/be/src/pipeline/exec/partitioned_aggregation_source_operator.cpp +++ b/be/src/pipeline/exec/partitioned_aggregation_source_operator.cpp @@ -118,6 +118,10 @@ Status PartitionedAggSourceOperatorX::close(RuntimeState* state) { return _agg_source_operator->close(state); } +bool PartitionedAggSourceOperatorX::is_serial_operator() const { + return _agg_source_operator->is_serial_operator(); +} + Status PartitionedAggSourceOperatorX::get_block(RuntimeState* state, vectorized::Block* block, bool* eos) { auto& local_state = get_local_state(state); diff --git a/be/src/pipeline/exec/partitioned_aggregation_source_operator.h b/be/src/pipeline/exec/partitioned_aggregation_source_operator.h index edae99c716a925..7e73241745e029 100644 --- a/be/src/pipeline/exec/partitioned_aggregation_source_operator.h +++ b/be/src/pipeline/exec/partitioned_aggregation_source_operator.h @@ -91,6 +91,8 @@ class PartitionedAggSourceOperatorX : public OperatorX bool is_source() const override { return true; } + bool is_serial_operator() const override; + private: friend class PartitionedAggLocalState; diff --git a/be/src/pipeline/exec/partitioned_hash_join_probe_operator.h b/be/src/pipeline/exec/partitioned_hash_join_probe_operator.h index 3aab11f62d883e..f8fc0780b6fc3f 100644 --- a/be/src/pipeline/exec/partitioned_hash_join_probe_operator.h +++ b/be/src/pipeline/exec/partitioned_hash_join_probe_operator.h @@ -165,9 +165,6 @@ class PartitionedHashJoinProbeOperatorX final _distribution_partition_exprs)); } - bool require_shuffled_data_distribution() const override { - return _join_op != TJoinOp::NULL_AWARE_LEFT_ANTI_JOIN; - } bool is_shuffled_operator() const override { return _join_distribution == TJoinDistributionType::PARTITIONED; } diff --git 
a/be/src/pipeline/exec/partitioned_hash_join_sink_operator.h b/be/src/pipeline/exec/partitioned_hash_join_sink_operator.h index c768d7518b95c9..8e89763b50a9d5 100644 --- a/be/src/pipeline/exec/partitioned_hash_join_sink_operator.h +++ b/be/src/pipeline/exec/partitioned_hash_join_sink_operator.h @@ -115,9 +115,6 @@ class PartitionedHashJoinSinkOperatorX _distribution_partition_exprs); } - bool require_shuffled_data_distribution() const override { - return _join_op != TJoinOp::NULL_AWARE_LEFT_ANTI_JOIN; - } bool is_shuffled_operator() const override { return _join_distribution == TJoinDistributionType::PARTITIONED; } diff --git a/be/src/pipeline/exec/repeat_operator.cpp b/be/src/pipeline/exec/repeat_operator.cpp index d355d99c2e352f..cd707ccc49f8c2 100644 --- a/be/src/pipeline/exec/repeat_operator.cpp +++ b/be/src/pipeline/exec/repeat_operator.cpp @@ -46,6 +46,16 @@ Status RepeatLocalState::open(RuntimeState* state) { return Status::OK(); } +Status RepeatLocalState::init(RuntimeState* state, LocalStateInfo& info) { + RETURN_IF_ERROR(Base::init(state, info)); + SCOPED_TIMER(exec_time_counter()); + SCOPED_TIMER(_init_timer); + _evaluate_input_timer = ADD_TIMER(profile(), "EvaluateInputDataTime"); + _get_repeat_data_timer = ADD_TIMER(profile(), "GetRepeatDataTime"); + _filter_timer = ADD_TIMER(profile(), "FilterTime"); + return Status::OK(); +} + Status RepeatOperatorX::init(const TPlanNode& tnode, RuntimeState* state) { RETURN_IF_ERROR(OperatorXBase::init(tnode, state)); RETURN_IF_ERROR(vectorized::VExpr::create_expr_trees(tnode.repeat_node.exprs, _expr_ctxs)); @@ -166,23 +176,24 @@ Status RepeatLocalState::add_grouping_id_column(std::size_t rows, std::size_t& c Status RepeatOperatorX::push(RuntimeState* state, vectorized::Block* input_block, bool eos) const { auto& local_state = get_local_state(state); + SCOPED_TIMER(local_state._evaluate_input_timer); local_state._child_eos = eos; - auto& _intermediate_block = local_state._intermediate_block; - auto& _expr_ctxs = 
local_state._expr_ctxs; - DCHECK(!_intermediate_block || _intermediate_block->rows() == 0); + auto& intermediate_block = local_state._intermediate_block; + auto& expr_ctxs = local_state._expr_ctxs; + DCHECK(!intermediate_block || intermediate_block->rows() == 0); if (input_block->rows() > 0) { - _intermediate_block = vectorized::Block::create_unique(); + intermediate_block = vectorized::Block::create_unique(); - for (auto& expr : _expr_ctxs) { + for (auto& expr : expr_ctxs) { int result_column_id = -1; RETURN_IF_ERROR(expr->execute(input_block, &result_column_id)); DCHECK(result_column_id != -1); input_block->get_by_position(result_column_id).column = input_block->get_by_position(result_column_id) .column->convert_to_full_column_if_const(); - _intermediate_block->insert(input_block->get_by_position(result_column_id)); + intermediate_block->insert(input_block->get_by_position(result_column_id)); } - DCHECK_EQ(_expr_ctxs.size(), _intermediate_block->columns()); + DCHECK_EQ(expr_ctxs.size(), intermediate_block->columns()); } return Status::OK(); @@ -202,33 +213,39 @@ Status RepeatOperatorX::pull(doris::RuntimeState* state, vectorized::Block* outp } DCHECK(output_block->rows() == 0); - if (_intermediate_block && _intermediate_block->rows() > 0) { - RETURN_IF_ERROR(local_state.get_repeated_block(_intermediate_block.get(), _repeat_id_idx, - output_block)); + { + SCOPED_TIMER(local_state._get_repeat_data_timer); + if (_intermediate_block && _intermediate_block->rows() > 0) { + RETURN_IF_ERROR(local_state.get_repeated_block(_intermediate_block.get(), + _repeat_id_idx, output_block)); - _repeat_id_idx++; + _repeat_id_idx++; - int size = _repeat_id_list.size(); - if (_repeat_id_idx >= size) { - _intermediate_block->clear(); + int size = _repeat_id_list.size(); + if (_repeat_id_idx >= size) { + _intermediate_block->clear(); + _child_block.clear_column_data(_child->row_desc().num_materialized_slots()); + _repeat_id_idx = 0; + } + } else if (local_state._expr_ctxs.empty()) { + 
auto m_block = vectorized::VectorizedUtils::build_mutable_mem_reuse_block( + output_block, _output_slots); + auto rows = _child_block.rows(); + auto& columns = m_block.mutable_columns(); + + for (int repeat_id_idx = 0; repeat_id_idx < _repeat_id_list.size(); repeat_id_idx++) { + std::size_t cur_col = 0; + RETURN_IF_ERROR( + local_state.add_grouping_id_column(rows, cur_col, columns, repeat_id_idx)); + } _child_block.clear_column_data(_child->row_desc().num_materialized_slots()); - _repeat_id_idx = 0; } - } else if (local_state._expr_ctxs.empty()) { - auto m_block = vectorized::VectorizedUtils::build_mutable_mem_reuse_block(output_block, - _output_slots); - auto rows = _child_block.rows(); - auto& columns = m_block.mutable_columns(); - - for (int repeat_id_idx = 0; repeat_id_idx < _repeat_id_list.size(); repeat_id_idx++) { - std::size_t cur_col = 0; - RETURN_IF_ERROR( - local_state.add_grouping_id_column(rows, cur_col, columns, repeat_id_idx)); - } - _child_block.clear_column_data(_child->row_desc().num_materialized_slots()); } - RETURN_IF_ERROR(vectorized::VExprContext::filter_block(local_state._conjuncts, output_block, - output_block->columns())); + { + SCOPED_TIMER(local_state._filter_timer); + RETURN_IF_ERROR(vectorized::VExprContext::filter_block(local_state._conjuncts, output_block, + output_block->columns())); + } *eos = _child_eos && _child_block.rows() == 0; local_state.reached_limit(output_block, eos); COUNTER_SET(local_state._rows_returned_counter, local_state._num_rows_returned); diff --git a/be/src/pipeline/exec/repeat_operator.h b/be/src/pipeline/exec/repeat_operator.h index 22398df372ae65..31f88f37231aaa 100644 --- a/be/src/pipeline/exec/repeat_operator.h +++ b/be/src/pipeline/exec/repeat_operator.h @@ -36,6 +36,7 @@ class RepeatLocalState final : public PipelineXLocalState { using Base = PipelineXLocalState; RepeatLocalState(RuntimeState* state, OperatorXBase* parent); + Status init(RuntimeState* state, LocalStateInfo& info) override; Status 
open(RuntimeState* state) override; Status get_repeated_block(vectorized::Block* child_block, int repeat_id_idx, @@ -53,6 +54,10 @@ class RepeatLocalState final : public PipelineXLocalState { int _repeat_id_idx; std::unique_ptr _intermediate_block; vectorized::VExprContextSPtrs _expr_ctxs; + + RuntimeProfile::Counter* _evaluate_input_timer = nullptr; + RuntimeProfile::Counter* _get_repeat_data_timer = nullptr; + RuntimeProfile::Counter* _filter_timer = nullptr; }; class RepeatOperatorX final : public StatefulOperatorX { diff --git a/be/src/pipeline/exec/result_file_sink_operator.cpp b/be/src/pipeline/exec/result_file_sink_operator.cpp index 4d842db5d2346a..72fc1505573410 100644 --- a/be/src/pipeline/exec/result_file_sink_operator.cpp +++ b/be/src/pipeline/exec/result_file_sink_operator.cpp @@ -84,12 +84,6 @@ Status ResultFileSinkLocalState::init(RuntimeState* state, LocalSinkStateInfo& i SCOPED_TIMER(_init_timer); _sender_id = info.sender_id; - _brpc_wait_timer = ADD_TIMER(_profile, "BrpcSendTime.Wait"); - _local_send_timer = ADD_TIMER(_profile, "LocalSendTime"); - _brpc_send_timer = ADD_TIMER(_profile, "BrpcSendTime"); - _split_block_distribute_by_channel_timer = - ADD_TIMER(_profile, "SplitBlockDistributeByChannelTime"); - _brpc_send_timer = ADD_TIMER(_profile, "BrpcSendTime"); auto& p = _parent->cast(); CHECK(p._file_opts.get() != nullptr); if (p._is_top_sink) { diff --git a/be/src/pipeline/exec/result_file_sink_operator.h b/be/src/pipeline/exec/result_file_sink_operator.h index 86b6035c134ba9..0e6906709f10a8 100644 --- a/be/src/pipeline/exec/result_file_sink_operator.h +++ b/be/src/pipeline/exec/result_file_sink_operator.h @@ -45,14 +45,6 @@ class ResultFileSinkLocalState final [[nodiscard]] int sender_id() const { return _sender_id; } - RuntimeProfile::Counter* brpc_wait_timer() { return _brpc_wait_timer; } - RuntimeProfile::Counter* local_send_timer() { return _local_send_timer; } - RuntimeProfile::Counter* brpc_send_timer() { return _brpc_send_timer; } - 
RuntimeProfile::Counter* merge_block_timer() { return _merge_block_timer; } - RuntimeProfile::Counter* split_block_distribute_by_channel_timer() { - return _split_block_distribute_by_channel_timer; - } - private: friend class ResultFileSinkOperatorX; @@ -66,12 +58,6 @@ class ResultFileSinkLocalState final bool _only_local_exchange = false; std::unique_ptr> _serializer; std::shared_ptr _block_holder; - RuntimeProfile::Counter* _brpc_wait_timer = nullptr; - RuntimeProfile::Counter* _local_send_timer = nullptr; - RuntimeProfile::Counter* _brpc_send_timer = nullptr; - RuntimeProfile::Counter* _merge_block_timer = nullptr; - RuntimeProfile::Counter* _split_block_distribute_by_channel_timer = nullptr; - int _sender_id; }; diff --git a/be/src/pipeline/exec/result_sink_operator.cpp b/be/src/pipeline/exec/result_sink_operator.cpp index 0608beaf522290..ccd74c6d572e5c 100644 --- a/be/src/pipeline/exec/result_sink_operator.cpp +++ b/be/src/pipeline/exec/result_sink_operator.cpp @@ -39,13 +39,12 @@ Status ResultSinkLocalState::init(RuntimeState* state, LocalSinkStateInfo& info) RETURN_IF_ERROR(Base::init(state, info)); SCOPED_TIMER(exec_time_counter()); SCOPED_TIMER(_init_timer); + _fetch_row_id_timer = ADD_TIMER(profile(), "FetchRowIdTime"); + _write_data_timer = ADD_TIMER(profile(), "WriteDataTime"); static const std::string timer_name = "WaitForDependencyTime"; _wait_for_dependency_timer = ADD_TIMER_WITH_LEVEL(_profile, timer_name, 1); auto fragment_instance_id = state->fragment_instance_id(); - _blocks_sent_counter = ADD_COUNTER_WITH_LEVEL(_profile, "BlocksProduced", TUnit::UNIT, 1); - _rows_sent_counter = ADD_COUNTER_WITH_LEVEL(_profile, "RowsProduced", TUnit::UNIT, 1); - if (state->query_options().enable_parallel_result_sink) { _sender = _parent->cast()._sender; } else { @@ -143,12 +142,15 @@ Status ResultSinkOperatorX::open(RuntimeState* state) { Status ResultSinkOperatorX::sink(RuntimeState* state, vectorized::Block* block, bool eos) { auto& local_state = 
get_local_state(state); SCOPED_TIMER(local_state.exec_time_counter()); - COUNTER_UPDATE(local_state.rows_sent_counter(), (int64_t)block->rows()); - COUNTER_UPDATE(local_state.blocks_sent_counter(), 1); + COUNTER_UPDATE(local_state.rows_input_counter(), (int64_t)block->rows()); if (_fetch_option.use_two_phase_fetch && block->rows() > 0) { + SCOPED_TIMER(local_state._fetch_row_id_timer); RETURN_IF_ERROR(_second_phase_fetch_data(state, block)); } - RETURN_IF_ERROR(local_state._writer->write(state, *block)); + { + SCOPED_TIMER(local_state._write_data_timer); + RETURN_IF_ERROR(local_state._writer->write(state, *block)); + } if (_fetch_option.use_two_phase_fetch) { // Block structure may be changed by calling _second_phase_fetch_data(). // So we should clear block in case of unmatched columns diff --git a/be/src/pipeline/exec/result_sink_operator.h b/be/src/pipeline/exec/result_sink_operator.h index 3c503096ecb51e..339c167825643b 100644 --- a/be/src/pipeline/exec/result_sink_operator.h +++ b/be/src/pipeline/exec/result_sink_operator.h @@ -128,8 +128,6 @@ class ResultSinkLocalState final : public PipelineXSinkLocalState _sender = nullptr; std::shared_ptr _writer = nullptr; - RuntimeProfile::Counter* _blocks_sent_counter = nullptr; - RuntimeProfile::Counter* _rows_sent_counter = nullptr; + + RuntimeProfile::Counter* _fetch_row_id_timer = nullptr; + RuntimeProfile::Counter* _write_data_timer = nullptr; }; class ResultSinkOperatorX final : public DataSinkOperatorX { diff --git a/be/src/pipeline/exec/scan_operator.cpp b/be/src/pipeline/exec/scan_operator.cpp index 44442fc53043e0..9a83d9c7838c21 100644 --- a/be/src/pipeline/exec/scan_operator.cpp +++ b/be/src/pipeline/exec/scan_operator.cpp @@ -70,7 +70,7 @@ Status ScanLocalState::init(RuntimeState* state, LocalStateInfo& info) SCOPED_TIMER(exec_time_counter()); SCOPED_TIMER(_init_timer); auto& p = _parent->cast(); - RETURN_IF_ERROR(RuntimeFilterConsumer::init(state, p.ignore_data_distribution())); + 
RETURN_IF_ERROR(RuntimeFilterConsumer::init(state, p.is_serial_operator())); // init profile for runtime filter RuntimeFilterConsumer::_init_profile(profile()); init_runtime_filter_dependency(_filter_dependencies, p.operator_id(), p.node_id(), @@ -994,7 +994,7 @@ Status ScanLocalState::_start_scanners( auto& p = _parent->cast(); _scanner_ctx = vectorized::ScannerContext::create_shared( state(), this, p._output_tuple_desc, p.output_row_descriptor(), scanners, p.limit(), - _scan_dependency, p.ignore_data_distribution()); + _scan_dependency, p.is_serial_operator()); return Status::OK(); } @@ -1052,13 +1052,10 @@ Status ScanLocalState::_init_profile() { ADD_COUNTER(_scanner_profile, "NewlyCreateFreeBlocksNum", TUnit::UNIT); _scale_up_scanners_counter = ADD_COUNTER(_scanner_profile, "NumScaleUpScanners", TUnit::UNIT); // time of transfer thread to wait for block from scan thread - _scanner_wait_batch_timer = ADD_TIMER(_scanner_profile, "ScannerBatchWaitTime"); _scanner_sched_counter = ADD_COUNTER(_scanner_profile, "ScannerSchedCount", TUnit::UNIT); - _scanner_ctx_sched_time = ADD_TIMER(_scanner_profile, "ScannerCtxSchedTime"); _scan_timer = ADD_TIMER(_scanner_profile, "ScannerGetBlockTime"); _scan_cpu_timer = ADD_TIMER(_scanner_profile, "ScannerCpuTime"); - _convert_block_timer = ADD_TIMER(_scanner_profile, "ScannerConvertBlockTime"); _filter_timer = ADD_TIMER(_scanner_profile, "ScannerFilterTime"); // time of scan thread to wait for worker thread of the thread pool @@ -1068,6 +1065,13 @@ Status ScanLocalState::_init_profile() { _peak_running_scanner = _scanner_profile->AddHighWaterMarkCounter("PeakRunningScanner", TUnit::UNIT); + + // Rows read from storage. + // Include the rows read from doris page cache. + _scan_rows = ADD_COUNTER(_runtime_profile, "ScanRows", TUnit::UNIT); + // Size of data that read from storage. + // Does not include rows that are cached by doris page cache. 
+ _scan_bytes = ADD_COUNTER(_runtime_profile, "ScanBytes", TUnit::BYTES); return Status::OK(); } @@ -1148,6 +1152,8 @@ ScanOperatorX::ScanOperatorX(ObjectPool* pool, const TPlanNode& _should_run_serial = true; } } + OperatorX::_is_serial_operator = + tnode.__isset.is_serial_operator && tnode.is_serial_operator; if (tnode.__isset.push_down_count) { _push_down_count = tnode.push_down_count; } diff --git a/be/src/pipeline/exec/scan_operator.h b/be/src/pipeline/exec/scan_operator.h index 28dbd01280f3c8..7c774a5aaa0dbc 100644 --- a/be/src/pipeline/exec/scan_operator.h +++ b/be/src/pipeline/exec/scan_operator.h @@ -102,8 +102,6 @@ class ScanLocalStateBase : public PipelineXLocalState<>, public RuntimeFilterCon std::shared_ptr _scanner_profile; RuntimeProfile::Counter* _scanner_sched_counter = nullptr; - RuntimeProfile::Counter* _scanner_ctx_sched_time = nullptr; - RuntimeProfile::Counter* _scanner_wait_batch_timer = nullptr; RuntimeProfile::Counter* _scanner_wait_worker_timer = nullptr; // Num of newly created free blocks when running query RuntimeProfile::Counter* _newly_create_free_blocks_num = nullptr; @@ -114,8 +112,6 @@ class ScanLocalStateBase : public PipelineXLocalState<>, public RuntimeFilterCon // time of get block from scanner RuntimeProfile::Counter* _scan_timer = nullptr; RuntimeProfile::Counter* _scan_cpu_timer = nullptr; - // time of convert input block to output block from scanner - RuntimeProfile::Counter* _convert_block_timer = nullptr; // time of filter output block from scanner RuntimeProfile::Counter* _filter_timer = nullptr; RuntimeProfile::Counter* _memory_usage_counter = nullptr; @@ -128,6 +124,9 @@ class ScanLocalStateBase : public PipelineXLocalState<>, public RuntimeFilterCon RuntimeProfile::Counter* _num_scanners = nullptr; RuntimeProfile::Counter* _wait_for_rf_timer = nullptr; + + RuntimeProfile::Counter* _scan_rows = nullptr; + RuntimeProfile::Counter* _scan_bytes = nullptr; }; template @@ -372,8 +371,8 @@ class ScanOperatorX : public 
OperatorX { TPushAggOp::type get_push_down_agg_type() { return _push_down_agg_type; } DataDistribution required_data_distribution() const override { - if (OperatorX::ignore_data_distribution()) { - // `ignore_data_distribution()` returns true means we ignore the distribution. + if (OperatorX::is_serial_operator()) { + // `is_serial_operator()` returns true means we ignore the distribution. return {ExchangeType::NOOP}; } return {ExchangeType::BUCKET_HASH_SHUFFLE}; diff --git a/be/src/pipeline/exec/set_probe_sink_operator.cpp b/be/src/pipeline/exec/set_probe_sink_operator.cpp index 3ee79629e1a788..33ba7d73100a04 100644 --- a/be/src/pipeline/exec/set_probe_sink_operator.cpp +++ b/be/src/pipeline/exec/set_probe_sink_operator.cpp @@ -71,12 +71,16 @@ Status SetProbeSinkOperatorX::sink(RuntimeState* state, vectorized auto probe_rows = in_block->rows(); if (probe_rows > 0) { - RETURN_IF_ERROR(_extract_probe_column(local_state, *in_block, local_state._probe_columns, - _cur_child_id)); + { + SCOPED_TIMER(local_state._extract_probe_data_timer); + RETURN_IF_ERROR(_extract_probe_column(local_state, *in_block, + local_state._probe_columns, _cur_child_id)); + } RETURN_IF_ERROR(std::visit( [&](auto&& arg) -> Status { using HashTableCtxType = std::decay_t; if constexpr (!std::is_same_v) { + SCOPED_TIMER(local_state._probe_timer); vectorized::HashTableProbe process_hashtable_ctx(&local_state, probe_rows); return process_hashtable_ctx.mark_data_in_hashtable(arg); @@ -99,6 +103,9 @@ Status SetProbeSinkLocalState::init(RuntimeState* state, LocalSink RETURN_IF_ERROR(Base::init(state, info)); SCOPED_TIMER(exec_time_counter()); SCOPED_TIMER(_init_timer); + + _probe_timer = ADD_TIMER(Base::profile(), "ProbeTime"); + _extract_probe_data_timer = ADD_TIMER(Base::profile(), "ExtractProbeDataTime"); Parent& parent = _parent->cast(); _shared_state->probe_finished_children_dependency[parent._cur_child_id] = _dependency; _dependency->block(); diff --git 
a/be/src/pipeline/exec/set_probe_sink_operator.h b/be/src/pipeline/exec/set_probe_sink_operator.h index ab53f5358c2a91..368ea812cdfe01 100644 --- a/be/src/pipeline/exec/set_probe_sink_operator.h +++ b/be/src/pipeline/exec/set_probe_sink_operator.h @@ -60,6 +60,9 @@ class SetProbeSinkLocalState final : public PipelineXSinkLocalState @@ -96,8 +99,6 @@ class SetProbeSinkOperatorX final : public DataSinkOperatorX create_shared_state() const override { return nullptr; } private: diff --git a/be/src/pipeline/exec/set_sink_operator.cpp b/be/src/pipeline/exec/set_sink_operator.cpp index 01c26e4d005c65..f8287ce5fa7863 100644 --- a/be/src/pipeline/exec/set_sink_operator.cpp +++ b/be/src/pipeline/exec/set_sink_operator.cpp @@ -39,8 +39,10 @@ Status SetSinkOperatorX::sink(RuntimeState* state, vectorized::Blo auto& valid_element_in_hash_tbl = local_state._shared_state->valid_element_in_hash_tbl; if (in_block->rows() != 0) { - RETURN_IF_ERROR(local_state._mutable_block.merge(*in_block)); - + { + SCOPED_TIMER(local_state._merge_block_timer); + RETURN_IF_ERROR(local_state._mutable_block.merge(*in_block)); + } if (local_state._mutable_block.rows() > std::numeric_limits::max()) { return Status::NotSupported("set operator do not support build table rows over:" + std::to_string(std::numeric_limits::max())); @@ -48,6 +50,7 @@ Status SetSinkOperatorX::sink(RuntimeState* state, vectorized::Blo } if (eos || local_state._mutable_block.allocated_bytes() >= BUILD_BLOCK_MAX_SIZE) { + SCOPED_TIMER(local_state._build_timer); build_block = local_state._mutable_block.to_block(); RETURN_IF_ERROR(_process_build_block(local_state, build_block, state)); local_state._mutable_block.clear(); @@ -151,6 +154,7 @@ Status SetSinkLocalState::init(RuntimeState* state, LocalSinkState RETURN_IF_ERROR(PipelineXSinkLocalState::init(state, info)); SCOPED_TIMER(exec_time_counter()); SCOPED_TIMER(_init_timer); + _merge_block_timer = ADD_TIMER(_profile, "MergeBlocksTime"); _build_timer = ADD_TIMER(_profile, 
"BuildTime"); auto& parent = _parent->cast(); _shared_state->probe_finished_children_dependency[parent._cur_child_id] = _dependency; diff --git a/be/src/pipeline/exec/set_sink_operator.h b/be/src/pipeline/exec/set_sink_operator.h index 1c08eddc141f2e..8f917b2dc79e58 100644 --- a/be/src/pipeline/exec/set_sink_operator.h +++ b/be/src/pipeline/exec/set_sink_operator.h @@ -48,14 +48,14 @@ class SetSinkLocalState final : public PipelineXSinkLocalState { private: friend class SetSinkOperatorX; - template - friend struct vectorized::HashTableBuild; - RuntimeProfile::Counter* _build_timer; // time to build hash table vectorized::MutableBlock _mutable_block; // every child has its result expr list vectorized::VExprContextSPtrs _child_exprs; vectorized::Arena _arena; + + RuntimeProfile::Counter* _merge_block_timer = nullptr; + RuntimeProfile::Counter* _build_timer = nullptr; }; template @@ -93,7 +93,6 @@ class SetSinkOperatorX final : public DataSinkOperatorX diff --git a/be/src/pipeline/exec/set_source_operator.cpp b/be/src/pipeline/exec/set_source_operator.cpp index c6a80f8d06c94f..5a4500d34d8cdc 100644 --- a/be/src/pipeline/exec/set_source_operator.cpp +++ b/be/src/pipeline/exec/set_source_operator.cpp @@ -29,6 +29,8 @@ Status SetSourceLocalState::init(RuntimeState* state, LocalStateIn RETURN_IF_ERROR(Base::init(state, info)); SCOPED_TIMER(exec_time_counter()); SCOPED_TIMER(_init_timer); + _get_data_timer = ADD_TIMER(_runtime_profile, "GetDataTime"); + _filter_timer = ADD_TIMER(_runtime_profile, "FilterTime"); _shared_state->probe_finished_children_dependency.resize( _parent->cast>()._child_quantity, nullptr); return Status::OK(); @@ -69,21 +71,26 @@ Status SetSourceOperatorX::get_block(RuntimeState* state, vectoriz auto& local_state = get_local_state(state); SCOPED_TIMER(local_state.exec_time_counter()); _create_mutable_cols(local_state, block); - auto st = std::visit( - [&](auto&& arg) -> Status { - using HashTableCtxType = std::decay_t; - if constexpr (!std::is_same_v) 
{ - return _get_data_in_hashtable(local_state, arg, block, - state->batch_size(), eos); - } else { - LOG(FATAL) << "FATAL: uninited hash table"; - __builtin_unreachable(); - } - }, - *local_state._shared_state->hash_table_variants); - RETURN_IF_ERROR(st); - RETURN_IF_ERROR(vectorized::VExprContext::filter_block(local_state._conjuncts, block, - block->columns())); + { + SCOPED_TIMER(local_state._get_data_timer); + RETURN_IF_ERROR(std::visit( + [&](auto&& arg) -> Status { + using HashTableCtxType = std::decay_t; + if constexpr (!std::is_same_v) { + return _get_data_in_hashtable(local_state, arg, block, + state->batch_size(), eos); + } else { + LOG(FATAL) << "FATAL: uninited hash table"; + __builtin_unreachable(); + } + }, + *local_state._shared_state->hash_table_variants)); + } + { + SCOPED_TIMER(local_state._filter_timer); + RETURN_IF_ERROR(vectorized::VExprContext::filter_block(local_state._conjuncts, block, + block->columns())); + } local_state.reached_limit(block, eos); return Status::OK(); } diff --git a/be/src/pipeline/exec/set_source_operator.h b/be/src/pipeline/exec/set_source_operator.h index 5157a2f9c979fe..53fbe4e5398432 100644 --- a/be/src/pipeline/exec/set_source_operator.h +++ b/be/src/pipeline/exec/set_source_operator.h @@ -46,6 +46,9 @@ class SetSourceLocalState final : public PipelineXLocalState { std::vector _mutable_cols; //record build column type vectorized::DataTypes _left_table_data_types; + + RuntimeProfile::Counter* _get_data_timer = nullptr; + RuntimeProfile::Counter* _filter_timer = nullptr; }; template diff --git a/be/src/pipeline/exec/sort_sink_operator.cpp b/be/src/pipeline/exec/sort_sink_operator.cpp index b07942b9ab1c05..6f67262ef1f3ed 100644 --- a/be/src/pipeline/exec/sort_sink_operator.cpp +++ b/be/src/pipeline/exec/sort_sink_operator.cpp @@ -32,6 +32,8 @@ Status SortSinkLocalState::init(RuntimeState* state, LocalSinkStateInfo& info) { SCOPED_TIMER(_init_timer); _sort_blocks_memory_usage = ADD_CHILD_COUNTER_WITH_LEVEL(_profile, 
"SortBlocks", TUnit::BYTES, "MemoryUsage", 1); + _append_blocks_timer = ADD_TIMER(profile(), "AppendBlockTime"); + _update_runtime_predicate_timer = ADD_TIMER(profile(), "UpdateRuntimePredicateTime"); return Status::OK(); } @@ -90,7 +92,9 @@ SortSinkOperatorX::SortSinkOperatorX(ObjectPool* pool, int operator_id, const TP : std::vector {}), _algorithm(tnode.sort_node.__isset.algorithm ? tnode.sort_node.algorithm : TSortAlgorithm::FULL_SORT), - _reuse_mem(_algorithm != TSortAlgorithm::HEAP_SORT) {} + _reuse_mem(_algorithm != TSortAlgorithm::HEAP_SORT) { + _is_serial_operator = tnode.__isset.is_serial_operator && tnode.is_serial_operator; +} Status SortSinkOperatorX::init(const TPlanNode& tnode, RuntimeState* state) { RETURN_IF_ERROR(DataSinkOperatorX::init(tnode, state)); @@ -118,11 +122,15 @@ Status SortSinkOperatorX::sink(doris::RuntimeState* state, vectorized::Block* in COUNTER_UPDATE(local_state.rows_input_counter(), (int64_t)in_block->rows()); if (in_block->rows() > 0) { COUNTER_UPDATE(local_state._sort_blocks_memory_usage, (int64_t)in_block->bytes()); - RETURN_IF_ERROR(local_state._shared_state->sorter->append_block(in_block)); + { + SCOPED_TIMER(local_state._append_blocks_timer); + RETURN_IF_ERROR(local_state._shared_state->sorter->append_block(in_block)); + } local_state._mem_tracker->set_consumption(local_state._shared_state->sorter->data_size()); RETURN_IF_CANCELLED(state); if (state->get_query_ctx()->has_runtime_predicate(_node_id)) { + SCOPED_TIMER(local_state._update_runtime_predicate_timer); auto& predicate = state->get_query_ctx()->get_runtime_predicate(_node_id); if (predicate.enable()) { vectorized::Field new_top = local_state._shared_state->sorter->get_top_value(); diff --git a/be/src/pipeline/exec/sort_sink_operator.h b/be/src/pipeline/exec/sort_sink_operator.h index 8462472dd02671..6bf87164e71026 100644 --- a/be/src/pipeline/exec/sort_sink_operator.h +++ b/be/src/pipeline/exec/sort_sink_operator.h @@ -46,6 +46,8 @@ class SortSinkLocalState : 
public PipelineXSinkLocalState { // topn top value vectorized::Field old_top {vectorized::Field::Types::Null}; + RuntimeProfile::Counter* _append_blocks_timer = nullptr; + RuntimeProfile::Counter* _update_runtime_predicate_timer = nullptr; }; class SortSinkOperatorX final : public DataSinkOperatorX { @@ -69,10 +71,10 @@ class SortSinkOperatorX final : public DataSinkOperatorX { } else if (_merge_by_exchange) { // The current sort node is used for the ORDER BY return {ExchangeType::PASSTHROUGH}; + } else { + return {ExchangeType::NOOP}; } - return DataSinkOperatorX::required_data_distribution(); } - bool require_shuffled_data_distribution() const override { return _is_analytic_sort; } bool require_data_distribution() const override { return _is_colocate; } size_t get_revocable_mem_size(RuntimeState* state) const; diff --git a/be/src/pipeline/exec/sort_source_operator.cpp b/be/src/pipeline/exec/sort_source_operator.cpp index 02a99e183c852e..7f801b79c0b12b 100644 --- a/be/src/pipeline/exec/sort_source_operator.cpp +++ b/be/src/pipeline/exec/sort_source_operator.cpp @@ -30,7 +30,9 @@ SortSourceOperatorX::SortSourceOperatorX(ObjectPool* pool, const TPlanNode& tnod const DescriptorTbl& descs) : OperatorX(pool, tnode, operator_id, descs), _merge_by_exchange(tnode.sort_node.merge_by_exchange), - _offset(tnode.sort_node.__isset.offset ? tnode.sort_node.offset : 0) {} + _offset(tnode.sort_node.__isset.offset ? 
tnode.sort_node.offset : 0) { + _is_serial_operator = tnode.__isset.is_serial_operator && tnode.is_serial_operator; +} Status SortSourceOperatorX::init(const TPlanNode& tnode, RuntimeState* state) { RETURN_IF_ERROR(Base::init(tnode, state)); diff --git a/be/src/pipeline/exec/streaming_aggregation_operator.cpp b/be/src/pipeline/exec/streaming_aggregation_operator.cpp index dfbe42c637ea56..780bd194ac8b6b 100644 --- a/be/src/pipeline/exec/streaming_aggregation_operator.cpp +++ b/be/src/pipeline/exec/streaming_aggregation_operator.cpp @@ -93,25 +93,18 @@ Status StreamingAggLocalState::init(RuntimeState* state, LocalStateInfo& info) { "SerializeKeyArena", TUnit::BYTES, "MemoryUsage", 1); _build_timer = ADD_TIMER(Base::profile(), "BuildTime"); - _build_table_convert_timer = ADD_TIMER(Base::profile(), "BuildConvertToPartitionedTime"); - _serialize_key_timer = ADD_TIMER(Base::profile(), "SerializeKeyTime"); - _exec_timer = ADD_TIMER(Base::profile(), "ExecTime"); _merge_timer = ADD_TIMER(Base::profile(), "MergeTime"); _expr_timer = ADD_TIMER(Base::profile(), "ExprTime"); - _serialize_data_timer = ADD_TIMER(Base::profile(), "SerializeDataTime"); + _insert_values_to_column_timer = ADD_TIMER(Base::profile(), "InsertValuesToColumnTime"); _deserialize_data_timer = ADD_TIMER(Base::profile(), "DeserializeAndMergeTime"); _hash_table_compute_timer = ADD_TIMER(Base::profile(), "HashTableComputeTime"); _hash_table_emplace_timer = ADD_TIMER(Base::profile(), "HashTableEmplaceTime"); _hash_table_input_counter = ADD_COUNTER(Base::profile(), "HashTableInputCount", TUnit::UNIT); - _max_row_size_counter = ADD_COUNTER(Base::profile(), "MaxRowSizeInBytes", TUnit::UNIT); _hash_table_size_counter = ADD_COUNTER(profile(), "HashTableSize", TUnit::UNIT); - _queue_byte_size_counter = ADD_COUNTER(profile(), "MaxSizeInBlockQueue", TUnit::BYTES); - _queue_size_counter = ADD_COUNTER(profile(), "MaxSizeOfBlockQueue", TUnit::UNIT); _streaming_agg_timer = ADD_TIMER(profile(), "StreamingAggTime"); 
_build_timer = ADD_TIMER(profile(), "BuildTime"); _expr_timer = ADD_TIMER(Base::profile(), "ExprTime"); _get_results_timer = ADD_TIMER(profile(), "GetResultsTime"); - _serialize_result_timer = ADD_TIMER(profile(), "SerializeResultTime"); _hash_table_iterate_timer = ADD_TIMER(profile(), "HashTableIterateTime"); _insert_keys_to_column_timer = ADD_TIMER(profile(), "InsertKeysToColumnTime"); @@ -683,7 +676,7 @@ Status StreamingAggLocalState::_pre_agg_with_serialized_key(doris::vectorized::B } for (int i = 0; i != _aggregate_evaluators.size(); ++i) { - SCOPED_TIMER(_serialize_data_timer); + SCOPED_TIMER(_insert_values_to_column_timer); RETURN_IF_ERROR( _aggregate_evaluators[i]->streaming_agg_serialize_to_column( in_block, value_columns[i], rows, @@ -852,12 +845,12 @@ Status StreamingAggLocalState::_get_with_serialized_key_result(RuntimeState* sta return Status::OK(); } -Status StreamingAggLocalState::_serialize_without_key(RuntimeState* state, vectorized::Block* block, - bool* eos) { +Status StreamingAggLocalState::_get_results_without_key(RuntimeState* state, + vectorized::Block* block, bool* eos) { // 1. 
`child(0)->rows_returned() == 0` mean not data from child // in level two aggregation node should return NULL result // level one aggregation node set `eos = true` return directly - SCOPED_TIMER(_serialize_result_timer); + SCOPED_TIMER(_get_results_timer); if (UNLIKELY(_input_num_rows == 0)) { *eos = true; return Status::OK(); @@ -896,10 +889,10 @@ Status StreamingAggLocalState::_serialize_without_key(RuntimeState* state, vecto return Status::OK(); } -Status StreamingAggLocalState::_serialize_with_serialized_key_result(RuntimeState* state, - vectorized::Block* block, - bool* eos) { - SCOPED_TIMER(_serialize_result_timer); +Status StreamingAggLocalState::_get_results_with_serialized_key(RuntimeState* state, + vectorized::Block* block, + bool* eos) { + SCOPED_TIMER(_get_results_timer); auto& p = _parent->cast(); int key_size = _probe_expr_ctxs.size(); int agg_size = _aggregate_evaluators.size(); @@ -918,7 +911,6 @@ Status StreamingAggLocalState::_serialize_with_serialized_key_result(RuntimeStat } } - SCOPED_TIMER(_get_results_timer); std::visit( vectorized::Overload { [&](std::monostate& arg) -> void { @@ -974,7 +966,7 @@ Status StreamingAggLocalState::_serialize_with_serialized_key_result(RuntimeStat } { - SCOPED_TIMER(_serialize_data_timer); + SCOPED_TIMER(_insert_values_to_column_timer); for (size_t i = 0; i < _aggregate_evaluators.size(); ++i) { value_data_types[i] = _aggregate_evaluators[i]->function()->get_serialized_type(); diff --git a/be/src/pipeline/exec/streaming_aggregation_operator.h b/be/src/pipeline/exec/streaming_aggregation_operator.h index c37fa5cbd881ca..59d5491d10c12f 100644 --- a/be/src/pipeline/exec/streaming_aggregation_operator.h +++ b/be/src/pipeline/exec/streaming_aggregation_operator.h @@ -65,11 +65,11 @@ class StreamingAggLocalState final : public PipelineXLocalState void _update_memusage_with_serialized_key(); Status _init_hash_method(const vectorized::VExprContextSPtrs& probe_exprs); Status _get_without_key_result(RuntimeState* state, 
vectorized::Block* block, bool* eos); - Status _serialize_without_key(RuntimeState* state, vectorized::Block* block, bool* eos); + Status _get_results_without_key(RuntimeState* state, vectorized::Block* block, bool* eos); Status _get_with_serialized_key_result(RuntimeState* state, vectorized::Block* block, bool* eos); - Status _serialize_with_serialized_key_result(RuntimeState* state, vectorized::Block* block, - bool* eos); + Status _get_results_with_serialized_key(RuntimeState* state, vectorized::Block* block, + bool* eos); template Status _merge_with_serialized_key_helper(vectorized::Block* block); @@ -83,25 +83,19 @@ class StreamingAggLocalState final : public PipelineXLocalState Status _create_agg_status(vectorized::AggregateDataPtr data); size_t _get_hash_table_size(); - RuntimeProfile::Counter* _queue_byte_size_counter = nullptr; - RuntimeProfile::Counter* _queue_size_counter = nullptr; RuntimeProfile::Counter* _streaming_agg_timer = nullptr; RuntimeProfile::Counter* _hash_table_compute_timer = nullptr; RuntimeProfile::Counter* _hash_table_emplace_timer = nullptr; RuntimeProfile::Counter* _hash_table_input_counter = nullptr; RuntimeProfile::Counter* _build_timer = nullptr; RuntimeProfile::Counter* _expr_timer = nullptr; - RuntimeProfile::Counter* _build_table_convert_timer = nullptr; - RuntimeProfile::Counter* _serialize_key_timer = nullptr; RuntimeProfile::Counter* _merge_timer = nullptr; - RuntimeProfile::Counter* _serialize_data_timer = nullptr; + RuntimeProfile::Counter* _insert_values_to_column_timer = nullptr; RuntimeProfile::Counter* _deserialize_data_timer = nullptr; - RuntimeProfile::Counter* _max_row_size_counter = nullptr; RuntimeProfile::Counter* _hash_table_memory_usage = nullptr; RuntimeProfile::HighWaterMarkCounter* _serialize_key_arena_memory_usage = nullptr; RuntimeProfile::Counter* _hash_table_size_counter = nullptr; RuntimeProfile::Counter* _get_results_timer = nullptr; - RuntimeProfile::Counter* _serialize_result_timer = nullptr; 
RuntimeProfile::Counter* _hash_table_iterate_timer = nullptr; RuntimeProfile::Counter* _insert_keys_to_column_timer = nullptr; @@ -136,13 +130,13 @@ class StreamingAggLocalState final : public PipelineXLocalState if constexpr (NeedFinalize) { return local_state->_get_without_key_result(state, block, eos); } else { - return local_state->_serialize_without_key(state, block, eos); + return local_state->_get_results_without_key(state, block, eos); } } else { if constexpr (NeedFinalize) { return local_state->_get_with_serialized_key_result(state, block, eos); } else { - return local_state->_serialize_with_serialized_key_result(state, block, eos); + return local_state->_get_results_with_serialized_key(state, block, eos); } } } diff --git a/be/src/pipeline/exec/table_function_operator.cpp b/be/src/pipeline/exec/table_function_operator.cpp index ff9dfe632faec6..c1621470f435b4 100644 --- a/be/src/pipeline/exec/table_function_operator.cpp +++ b/be/src/pipeline/exec/table_function_operator.cpp @@ -32,6 +32,18 @@ namespace doris::pipeline { TableFunctionLocalState::TableFunctionLocalState(RuntimeState* state, OperatorXBase* parent) : PipelineXLocalState<>(state, parent), _child_block(vectorized::Block::create_unique()) {} +Status TableFunctionLocalState::init(RuntimeState* state, LocalStateInfo& info) { + RETURN_IF_ERROR(PipelineXLocalState<>::init(state, info)); + SCOPED_TIMER(exec_time_counter()); + SCOPED_TIMER(_init_timer); + _init_function_timer = ADD_TIMER(_runtime_profile, "InitTableFunctionTime"); + _process_rows_timer = ADD_TIMER(_runtime_profile, "ProcessRowsTime"); + _copy_data_timer = ADD_TIMER(_runtime_profile, "CopyDataTime"); + _filter_timer = ADD_TIMER(_runtime_profile, "FilterTime"); + _repeat_data_timer = ADD_TIMER(_runtime_profile, "RepeatDataTime"); + return Status::OK(); +} + Status TableFunctionLocalState::open(RuntimeState* state) { SCOPED_TIMER(PipelineXLocalState<>::exec_time_counter()); SCOPED_TIMER(PipelineXLocalState<>::_open_timer); @@ -59,6 +71,7 
@@ void TableFunctionLocalState::_copy_output_slots( if (!_current_row_insert_times) { return; } + SCOPED_TIMER(_copy_data_timer); auto& p = _parent->cast(); for (auto index : p._output_slot_indexs) { auto src_column = _child_block->get_by_position(index).column; @@ -197,15 +210,18 @@ Status TableFunctionLocalState::get_expanded_block(RuntimeState* state, columns[index]->insert_many_defaults(row_size - columns[index]->size()); } - // 3. eval conjuncts - RETURN_IF_ERROR(vectorized::VExprContext::filter_block(_conjuncts, output_block, - output_block->columns())); + { + SCOPED_TIMER(_filter_timer); // 3. eval conjuncts + RETURN_IF_ERROR(vectorized::VExprContext::filter_block(_conjuncts, output_block, + output_block->columns())); + } *eos = _child_eos && _cur_child_offset == -1; return Status::OK(); } void TableFunctionLocalState::process_next_child_row() { + SCOPED_TIMER(_process_rows_timer); _cur_child_offset++; if (_cur_child_offset >= _child_block->rows()) { @@ -232,9 +248,6 @@ TableFunctionOperatorX::TableFunctionOperatorX(ObjectPool* pool, const TPlanNode Status TableFunctionOperatorX::_prepare_output_slot_ids(const TPlanNode& tnode) { // Prepare output slot ids - if (tnode.table_function_node.outputSlotIds.empty()) { - return Status::InternalError("Output slots of table function node is empty"); - } SlotId max_id = -1; for (auto slot_id : tnode.table_function_node.outputSlotIds) { if (slot_id > max_id) { diff --git a/be/src/pipeline/exec/table_function_operator.h b/be/src/pipeline/exec/table_function_operator.h index 75b1608fad7112..81160acb7f7611 100644 --- a/be/src/pipeline/exec/table_function_operator.h +++ b/be/src/pipeline/exec/table_function_operator.h @@ -37,6 +37,7 @@ class TableFunctionLocalState final : public PipelineXLocalState<> { TableFunctionLocalState(RuntimeState* state, OperatorXBase* parent); ~TableFunctionLocalState() override = default; + Status init(RuntimeState* state, LocalStateInfo& infos) override; Status open(RuntimeState* state) 
override; Status close(RuntimeState* state) override { for (auto* fn : _fns) { @@ -67,6 +68,12 @@ class TableFunctionLocalState final : public PipelineXLocalState<> { std::unique_ptr _child_block; int _current_row_insert_times = 0; bool _child_eos = false; + + RuntimeProfile::Counter* _init_function_timer = nullptr; + RuntimeProfile::Counter* _process_rows_timer = nullptr; + RuntimeProfile::Counter* _copy_data_timer = nullptr; + RuntimeProfile::Counter* _filter_timer = nullptr; + RuntimeProfile::Counter* _repeat_data_timer = nullptr; }; class TableFunctionOperatorX final : public StatefulOperatorX { @@ -93,6 +100,7 @@ class TableFunctionOperatorX final : public StatefulOperatorXprocess_init(input_block, state)); } local_state.process_next_child_row(); diff --git a/be/src/pipeline/exec/union_sink_operator.cpp b/be/src/pipeline/exec/union_sink_operator.cpp index 288fc131037fab..8467eeb1d5467a 100644 --- a/be/src/pipeline/exec/union_sink_operator.cpp +++ b/be/src/pipeline/exec/union_sink_operator.cpp @@ -32,6 +32,7 @@ Status UnionSinkLocalState::init(RuntimeState* state, LocalSinkStateInfo& info) RETURN_IF_ERROR(Base::init(state, info)); SCOPED_TIMER(exec_time_counter()); SCOPED_TIMER(_init_timer); + _expr_timer = ADD_TIMER(_profile, "ExprTime"); auto& p = _parent->cast(); _shared_state->data_queue.set_sink_dependency(_dependency, p._cur_child_id); return Status::OK(); diff --git a/be/src/pipeline/exec/union_sink_operator.h b/be/src/pipeline/exec/union_sink_operator.h index f939950143ae92..aa94ed9a73038f 100644 --- a/be/src/pipeline/exec/union_sink_operator.h +++ b/be/src/pipeline/exec/union_sink_operator.h @@ -55,6 +55,7 @@ class UnionSinkLocalState final : public PipelineXSinkLocalState { @@ -136,6 +137,7 @@ class UnionSinkOperatorX final : public DataSinkOperatorX { Status materialize_block(RuntimeState* state, vectorized::Block* src_block, int child_idx, vectorized::Block* res_block) { auto& local_state = get_local_state(state); + 
SCOPED_TIMER(local_state._expr_timer); const auto& child_exprs = local_state._child_expr; vectorized::ColumnsWithTypeAndName colunms; for (size_t i = 0; i < child_exprs.size(); ++i) { diff --git a/be/src/pipeline/exec/union_source_operator.h b/be/src/pipeline/exec/union_source_operator.h index 2d112ebf2df579..200e7de8597b91 100644 --- a/be/src/pipeline/exec/union_source_operator.h +++ b/be/src/pipeline/exec/union_source_operator.h @@ -63,7 +63,9 @@ class UnionSourceOperatorX final : public OperatorX { using Base = OperatorX; UnionSourceOperatorX(ObjectPool* pool, const TPlanNode& tnode, int operator_id, const DescriptorTbl& descs) - : Base(pool, tnode, operator_id, descs), _child_size(tnode.num_children) {}; + : Base(pool, tnode, operator_id, descs), _child_size(tnode.num_children) { + _is_serial_operator = tnode.__isset.is_serial_operator && tnode.is_serial_operator; + } ~UnionSourceOperatorX() override = default; Status get_block(RuntimeState* state, vectorized::Block* block, bool* eos) override; diff --git a/be/src/pipeline/local_exchange/local_exchange_sink_operator.cpp b/be/src/pipeline/local_exchange/local_exchange_sink_operator.cpp index d87113ca80a959..ff243186c47c43 100644 --- a/be/src/pipeline/local_exchange/local_exchange_sink_operator.cpp +++ b/be/src/pipeline/local_exchange/local_exchange_sink_operator.cpp @@ -36,17 +36,17 @@ std::vector LocalExchangeSinkLocalState::dependencies() const { } Status LocalExchangeSinkOperatorX::init(ExchangeType type, const int num_buckets, - const bool should_disable_bucket_shuffle, + const bool use_global_hash_shuffle, const std::map& shuffle_idx_to_instance_idx) { _name = "LOCAL_EXCHANGE_SINK_OPERATOR (" + get_exchange_type_name(type) + ")"; _type = type; if (_type == ExchangeType::HASH_SHUFFLE) { - _use_global_shuffle = should_disable_bucket_shuffle; + _use_global_shuffle = use_global_hash_shuffle; // For shuffle join, if data distribution has been broken by previous operator, we // should use a HASH_SHUFFLE local 
exchanger to shuffle data again. To be mentioned, // we should use map shuffle idx to instance idx because all instances will be // distributed to all BEs. Otherwise, we should use shuffle idx directly. - if (should_disable_bucket_shuffle) { + if (use_global_hash_shuffle) { std::for_each(shuffle_idx_to_instance_idx.begin(), shuffle_idx_to_instance_idx.end(), [&](const auto& item) { DCHECK(item.first != -1); diff --git a/be/src/pipeline/local_exchange/local_exchange_sink_operator.h b/be/src/pipeline/local_exchange/local_exchange_sink_operator.h index 1cd9736d4291d6..09b1f2cc3105f2 100644 --- a/be/src/pipeline/local_exchange/local_exchange_sink_operator.h +++ b/be/src/pipeline/local_exchange/local_exchange_sink_operator.h @@ -100,7 +100,7 @@ class LocalExchangeSinkOperatorX final : public DataSinkOperatorX& shuffle_idx_to_instance_idx) override; Status open(RuntimeState* state) override; diff --git a/be/src/pipeline/local_exchange/local_exchange_source_operator.h b/be/src/pipeline/local_exchange/local_exchange_source_operator.h index c0da5c8120c1e9..3c706d50182538 100644 --- a/be/src/pipeline/local_exchange/local_exchange_source_operator.h +++ b/be/src/pipeline/local_exchange/local_exchange_source_operator.h @@ -81,9 +81,6 @@ class LocalExchangeSourceOperatorX final : public OperatorXunref(local_state._shared_state, local_state._channel_id); } } - } else if (_num_senders != _num_sources || _ignore_source_data_distribution) { + } else if (_num_senders != _num_sources) { // In this branch, data just should be distributed equally into all instances. 
new_block_wrapper->ref(_num_partitions); for (size_t i = 0; i < _num_partitions; i++) { diff --git a/be/src/pipeline/local_exchange/local_exchanger.h b/be/src/pipeline/local_exchange/local_exchanger.h index 01b55816ba8aad..cc33efbb934a4d 100644 --- a/be/src/pipeline/local_exchange/local_exchanger.h +++ b/be/src/pipeline/local_exchange/local_exchanger.h @@ -217,24 +217,21 @@ class ShuffleExchanger : public Exchanger { protected: ShuffleExchanger(int running_sink_operators, int num_sources, int num_partitions, - bool ignore_source_data_distribution, int free_block_limit) + int free_block_limit) : Exchanger(running_sink_operators, num_sources, num_partitions, - free_block_limit), - _ignore_source_data_distribution(ignore_source_data_distribution) { + free_block_limit) { _data_queue.resize(num_partitions); } Status _split_rows(RuntimeState* state, const uint32_t* __restrict channel_ids, vectorized::Block* block, LocalExchangeSinkLocalState& local_state); - - const bool _ignore_source_data_distribution = false; }; class BucketShuffleExchanger final : public ShuffleExchanger { ENABLE_FACTORY_CREATOR(BucketShuffleExchanger); BucketShuffleExchanger(int running_sink_operators, int num_sources, int num_partitions, - bool ignore_source_data_distribution, int free_block_limit) + int free_block_limit) : ShuffleExchanger(running_sink_operators, num_sources, num_partitions, - ignore_source_data_distribution, free_block_limit) {} + free_block_limit) {} ~BucketShuffleExchanger() override = default; ExchangeType get_type() const override { return ExchangeType::BUCKET_HASH_SHUFFLE; } }; diff --git a/be/src/pipeline/pipeline.cpp b/be/src/pipeline/pipeline.cpp index 6e83c7805e46fc..96da754daa5d98 100644 --- a/be/src/pipeline/pipeline.cpp +++ b/be/src/pipeline/pipeline.cpp @@ -22,6 +22,7 @@ #include #include "pipeline/exec/operator.h" +#include "pipeline/pipeline_fragment_context.h" #include "pipeline/pipeline_task.h" namespace doris::pipeline { @@ -31,7 +32,48 @@ void 
Pipeline::_init_profile() { _pipeline_profile = std::make_unique(std::move(s)); } -Status Pipeline::add_operator(OperatorPtr& op) { +bool Pipeline::need_to_local_exchange(const DataDistribution target_data_distribution, + const int idx) const { + // If serial operator exists after `idx`-th operator, we should not improve parallelism. + if (std::any_of(_operators.begin() + idx, _operators.end(), + [&](OperatorPtr op) -> bool { return op->is_serial_operator(); })) { + return false; + } + // If all operators are serial and sink is not serial, we should improve parallelism for sink. + if (std::all_of(_operators.begin(), _operators.end(), + [&](OperatorPtr op) -> bool { return op->is_serial_operator(); })) { + if (!_sink->is_serial_operator()) { + return true; + } + } else if (std::any_of(_operators.begin(), _operators.end(), + [&](OperatorPtr op) -> bool { return op->is_serial_operator(); })) { + // If non-serial operators exist, we should improve parallelism for those. + return true; + } + + if (target_data_distribution.distribution_type != ExchangeType::BUCKET_HASH_SHUFFLE && + target_data_distribution.distribution_type != ExchangeType::HASH_SHUFFLE) { + // Always do local exchange if non-hash-partition exchanger is required. + // For example, `PASSTHROUGH` exchanger is always required to distribute data evenly. + return true; + } else if (_operators.front()->is_serial_operator()) { + DCHECK(std::all_of(_operators.begin(), _operators.end(), + [&](OperatorPtr op) -> bool { return op->is_serial_operator(); }) && + _sink->is_serial_operator()) + << debug_string(); + // All operators and sink are serial in this path. 
+ return false; + } else { + return _data_distribution.distribution_type != target_data_distribution.distribution_type && + !(is_hash_exchange(_data_distribution.distribution_type) && + is_hash_exchange(target_data_distribution.distribution_type)); + } +} + +Status Pipeline::add_operator(OperatorPtr& op, const int parallelism) { + if (parallelism > 0 && op->is_serial_operator()) { + set_num_tasks(parallelism); + } op->set_parallel_tasks(num_tasks()); _operators.emplace_back(op); if (op->is_source()) { diff --git a/be/src/pipeline/pipeline.h b/be/src/pipeline/pipeline.h index 8a20ccb631cc47..619848110d4a33 100644 --- a/be/src/pipeline/pipeline.h +++ b/be/src/pipeline/pipeline.h @@ -44,14 +44,16 @@ class Pipeline : public std::enable_shared_from_this { public: explicit Pipeline(PipelineId pipeline_id, int num_tasks, - std::weak_ptr context) - : _pipeline_id(pipeline_id), _num_tasks(num_tasks) { + std::weak_ptr context, int num_tasks_of_parent) + : _pipeline_id(pipeline_id), + _num_tasks(num_tasks), + _num_tasks_of_parent(num_tasks_of_parent) { _init_profile(); _tasks.resize(_num_tasks, nullptr); } // Add operators for pipelineX - Status add_operator(OperatorPtr& op); + Status add_operator(OperatorPtr& op, const int parallelism); // prepare operators for pipelineX Status prepare(RuntimeState* state); @@ -71,28 +73,8 @@ class Pipeline : public std::enable_shared_from_this { return idx == ExchangeType::HASH_SHUFFLE || idx == ExchangeType::BUCKET_HASH_SHUFFLE; } - bool need_to_local_exchange(const DataDistribution target_data_distribution) const { - if (target_data_distribution.distribution_type != ExchangeType::BUCKET_HASH_SHUFFLE && - target_data_distribution.distribution_type != ExchangeType::HASH_SHUFFLE) { - return true; - } else if (_operators.front()->ignore_data_hash_distribution()) { - if (_data_distribution.distribution_type == - target_data_distribution.distribution_type && - (_data_distribution.partition_exprs.empty() || - 
target_data_distribution.partition_exprs.empty())) { - return true; - } - return _data_distribution.distribution_type != - target_data_distribution.distribution_type && - !(is_hash_exchange(_data_distribution.distribution_type) && - is_hash_exchange(target_data_distribution.distribution_type)); - } else { - return _data_distribution.distribution_type != - target_data_distribution.distribution_type && - !(is_hash_exchange(_data_distribution.distribution_type) && - is_hash_exchange(target_data_distribution.distribution_type)); - } - } + bool need_to_local_exchange(const DataDistribution target_data_distribution, + const int idx) const; void init_data_distribution() { set_data_distribution(_operators.front()->required_data_distribution()); } @@ -120,11 +102,19 @@ class Pipeline : public std::enable_shared_from_this { for (auto& op : _operators) { op->set_parallel_tasks(_num_tasks); } + +#ifndef NDEBUG + if (num_tasks > 1 && + std::any_of(_operators.begin(), _operators.end(), + [&](OperatorPtr op) -> bool { return op->is_serial_operator(); })) { + DCHECK(false) << debug_string(); + } +#endif } int num_tasks() const { return _num_tasks; } bool close_task() { return _num_tasks_running.fetch_sub(1) == 1; } - std::string debug_string() { + std::string debug_string() const { fmt::memory_buffer debug_string_buffer; fmt::format_to(debug_string_buffer, "Pipeline [id: {}, _num_tasks: {}, _num_tasks_created: {}]", _pipeline_id, @@ -136,6 +126,8 @@ class Pipeline : public std::enable_shared_from_this { return fmt::to_string(debug_string_buffer); } + int num_tasks_of_parent() const { return _num_tasks_of_parent; } + private: void _init_profile(); @@ -173,6 +165,8 @@ class Pipeline : public std::enable_shared_from_this { std::atomic _num_tasks_running = 0; // Tasks in this pipeline. std::vector _tasks; + // Parallelism of parent pipeline. 
+ const int _num_tasks_of_parent; }; } // namespace doris::pipeline diff --git a/be/src/pipeline/pipeline_fragment_context.cpp b/be/src/pipeline/pipeline_fragment_context.cpp index b1ee5933d27984..4e05a39d77cd62 100644 --- a/be/src/pipeline/pipeline_fragment_context.cpp +++ b/be/src/pipeline/pipeline_fragment_context.cpp @@ -43,6 +43,8 @@ #include "pipeline/exec/analytic_sink_operator.h" #include "pipeline/exec/analytic_source_operator.h" #include "pipeline/exec/assert_num_rows_operator.h" +#include "pipeline/exec/cache_sink_operator.h" +#include "pipeline/exec/cache_source_operator.h" #include "pipeline/exec/datagen_operator.h" #include "pipeline/exec/distinct_streaming_aggregation_operator.h" #include "pipeline/exec/empty_set_operator.h" @@ -110,7 +112,6 @@ #include "vec/runtime/vdata_stream_mgr.h" namespace doris::pipeline { -bvar::Adder g_pipeline_tasks_count("doris_pipeline_tasks_count"); PipelineFragmentContext::PipelineFragmentContext( const TUniqueId& query_id, const int fragment_id, std::shared_ptr query_ctx, @@ -181,9 +182,10 @@ void PipelineFragmentContext::cancel(const Status reason) { LOG(WARNING) << "PipelineFragmentContext is cancelled due to timeout : " << debug_string(); } + // `ILLEGAL_STATE` means queries this fragment belongs to was not found in FE (maybe finished) if (reason.is()) { LOG_WARNING("PipelineFragmentContext is cancelled due to illegal state : {}", - this->debug_string()); + debug_string()); } _query_ctx->cancel(reason, _fragment_id); @@ -208,28 +210,20 @@ void PipelineFragmentContext::cancel(const Status reason) { } } -PipelinePtr PipelineFragmentContext::add_pipeline() { - // _prepared、_submitted, _canceled should do not add pipeline - PipelineId id = _next_pipeline_id++; - auto pipeline = std::make_shared( - id, _num_instances, - std::dynamic_pointer_cast(shared_from_this())); - _pipelines.emplace_back(pipeline); - return pipeline; -} - PipelinePtr PipelineFragmentContext::add_pipeline(PipelinePtr parent, int idx) { - // 
_prepared、_submitted, _canceled should do not add pipeline PipelineId id = _next_pipeline_id++; auto pipeline = std::make_shared( - id, _num_instances, - std::dynamic_pointer_cast(shared_from_this())); + id, parent ? std::min(parent->num_tasks(), _num_instances) : _num_instances, + std::dynamic_pointer_cast(shared_from_this()), + parent ? parent->num_tasks() : _num_instances); if (idx >= 0) { _pipelines.insert(_pipelines.begin() + idx, pipeline); } else { _pipelines.emplace_back(pipeline); } - parent->set_children(pipeline); + if (parent) { + parent->set_children(pipeline); + } return pipeline; } @@ -247,7 +241,7 @@ Status PipelineFragmentContext::prepare(const doris::TPipelineFragmentParams& re SCOPED_TIMER(_prepare_timer); _build_pipelines_timer = ADD_TIMER(_runtime_profile, "BuildPipelinesTime"); _init_context_timer = ADD_TIMER(_runtime_profile, "InitContextTime"); - _plan_local_shuffle_timer = ADD_TIMER(_runtime_profile, "PlanLocalShuffleTime"); + _plan_local_exchanger_timer = ADD_TIMER(_runtime_profile, "PlanLocalLocalExchangerTime"); _build_tasks_timer = ADD_TIMER(_runtime_profile, "BuildTasksTime"); _prepare_all_pipelines_timer = ADD_TIMER(_runtime_profile, "PrepareAllPipelinesTime"); { @@ -334,14 +328,15 @@ Status PipelineFragmentContext::prepare(const doris::TPipelineFragmentParams& re RETURN_IF_ERROR(pipeline->sink()->set_child(pipeline->operators().back())); } } - if (_enable_local_shuffle()) { - SCOPED_TIMER(_plan_local_shuffle_timer); + // 4. Build local exchanger + if (_runtime_state->enable_local_shuffle()) { + SCOPED_TIMER(_plan_local_exchanger_timer); RETURN_IF_ERROR(_plan_local_exchange(request.num_buckets, request.bucket_seq_to_instance_idx, request.shuffle_idx_to_instance_idx)); } - // 4. Initialize global states in pipelines. + // 5. Initialize global states in pipelines. 
for (PipelinePtr& pipeline : _pipelines) { SCOPED_TIMER(_prepare_all_pipelines_timer); pipeline->children().clear(); @@ -350,7 +345,7 @@ Status PipelineFragmentContext::prepare(const doris::TPipelineFragmentParams& re { SCOPED_TIMER(_build_tasks_timer); - // 5. Build pipeline tasks and initialize local state. + // 6. Build pipeline tasks and initialize local state. RETURN_IF_ERROR(_build_pipeline_tasks(request, thread_pool)); } @@ -378,40 +373,6 @@ Status PipelineFragmentContext::_build_pipeline_tasks(const doris::TPipelineFrag const auto& local_params = request.local_params[i]; auto fragment_instance_id = local_params.fragment_instance_id; _fragment_instance_ids[i] = fragment_instance_id; - std::unique_ptr runtime_filter_mgr; - auto init_runtime_state = [&](std::unique_ptr& runtime_state) { - runtime_state->set_query_mem_tracker(_query_ctx->query_mem_tracker); - - runtime_state->set_task_execution_context(shared_from_this()); - runtime_state->set_be_number(local_params.backend_num); - - if (request.__isset.backend_id) { - runtime_state->set_backend_id(request.backend_id); - } - if (request.__isset.import_label) { - runtime_state->set_import_label(request.import_label); - } - if (request.__isset.db_name) { - runtime_state->set_db_name(request.db_name); - } - if (request.__isset.load_job_id) { - runtime_state->set_load_job_id(request.load_job_id); - } - if (request.__isset.wal_id) { - runtime_state->set_wal_id(request.wal_id); - } - - runtime_state->set_desc_tbl(_desc_tbl); - runtime_state->set_per_fragment_instance_idx(local_params.sender_id); - runtime_state->set_num_per_fragment_instances(request.num_senders); - runtime_state->resize_op_id_to_local_state(max_operator_id()); - runtime_state->set_max_operator_id(max_operator_id()); - runtime_state->set_load_stream_per_node(request.load_stream_per_node); - runtime_state->set_total_load_streams(request.total_load_streams); - runtime_state->set_num_local_sink(request.num_local_sink); - DCHECK(runtime_filter_mgr); - 
runtime_state->set_runtime_filter_mgr(runtime_filter_mgr.get()); - }; auto filterparams = std::make_unique(); @@ -430,8 +391,7 @@ Status PipelineFragmentContext::_build_pipeline_tasks(const doris::TPipelineFrag filterparams->query_ctx = _query_ctx.get(); } - // build local_runtime_filter_mgr for each instance - runtime_filter_mgr = std::make_unique( + auto runtime_filter_mgr = std::make_unique( request.query_id, filterparams.get(), _query_ctx->query_mem_tracker); filterparams->runtime_filter_mgr = runtime_filter_mgr.get(); @@ -468,7 +428,41 @@ Status PipelineFragmentContext::_build_pipeline_tasks(const doris::TPipelineFrag request.fragment_id, request.query_options, _query_ctx->query_globals, _exec_env, _query_ctx.get()); auto& task_runtime_state = _task_runtime_states[pip_idx][i]; - init_runtime_state(task_runtime_state); + { + // Initialize runtime state for this task + task_runtime_state->set_query_mem_tracker(_query_ctx->query_mem_tracker); + + task_runtime_state->set_task_execution_context(shared_from_this()); + task_runtime_state->set_be_number(local_params.backend_num); + + if (request.__isset.backend_id) { + task_runtime_state->set_backend_id(request.backend_id); + } + if (request.__isset.import_label) { + task_runtime_state->set_import_label(request.import_label); + } + if (request.__isset.db_name) { + task_runtime_state->set_db_name(request.db_name); + } + if (request.__isset.load_job_id) { + task_runtime_state->set_load_job_id(request.load_job_id); + } + if (request.__isset.wal_id) { + task_runtime_state->set_wal_id(request.wal_id); + } + + task_runtime_state->set_desc_tbl(_desc_tbl); + task_runtime_state->set_per_fragment_instance_idx(local_params.sender_id); + task_runtime_state->set_num_per_fragment_instances(request.num_senders); + task_runtime_state->resize_op_id_to_local_state(max_operator_id()); + task_runtime_state->set_max_operator_id(max_operator_id()); + task_runtime_state->set_load_stream_per_node(request.load_stream_per_node); + 
task_runtime_state->set_total_load_streams(request.total_load_streams); + task_runtime_state->set_num_local_sink(request.num_local_sink); + DCHECK(_runtime_filter_states[i]->runtime_filter_mgr); + task_runtime_state->set_runtime_filter_mgr( + _runtime_filter_states[i]->runtime_filter_mgr); + } auto cur_task_id = _total_tasks++; task_runtime_state->set_task_id(cur_task_id); task_runtime_state->set_task_num(pipeline->num_tasks()); @@ -500,22 +494,12 @@ Status PipelineFragmentContext::_build_pipeline_tasks(const doris::TPipelineFrag * Finally, we have two upstream dependencies in Pipeline1 corresponding to JoinProbeOperator1 * and JoinProbeOperator2. */ - - // First, set up the parent profile,task runtime state - - auto prepare_and_set_parent_profile = [&](PipelineTask* task, size_t pip_idx) { - DCHECK(pipeline_id_to_profile[pip_idx]); - RETURN_IF_ERROR( - task->prepare(local_params, request.fragment.output_sink, _query_ctx.get())); - return Status::OK(); - }; - for (auto& _pipeline : _pipelines) { if (pipeline_id_to_task.contains(_pipeline->id())) { auto* task = pipeline_id_to_task[_pipeline->id()]; DCHECK(task != nullptr); - // if this task has upstream dependency, then record them. + // If this task has upstream dependency, then inject it into this task. 
if (_dag.find(_pipeline->id()) != _dag.end()) { auto& deps = _dag[_pipeline->id()]; for (auto& dep : deps) { @@ -535,7 +519,9 @@ Status PipelineFragmentContext::_build_pipeline_tasks(const doris::TPipelineFrag for (size_t pip_idx = 0; pip_idx < _pipelines.size(); pip_idx++) { if (pipeline_id_to_task.contains(_pipelines[pip_idx]->id())) { auto* task = pipeline_id_to_task[_pipelines[pip_idx]->id()]; - RETURN_IF_ERROR(prepare_and_set_parent_profile(task, pip_idx)); + DCHECK(pipeline_id_to_profile[pip_idx]); + RETURN_IF_ERROR(task->prepare(local_params, request.fragment.output_sink, + _query_ctx.get())); } } { @@ -547,6 +533,7 @@ Status PipelineFragmentContext::_build_pipeline_tasks(const doris::TPipelineFrag if (target_size > 1 && (_runtime_state->query_options().__isset.parallel_prepare_threshold && target_size > _runtime_state->query_options().parallel_prepare_threshold)) { + // If instances parallelism is big enough ( > parallel_prepare_threshold), we will prepare all tasks by multi-threads std::vector prepare_status(target_size); std::mutex m; std::condition_variable cv; @@ -634,8 +621,8 @@ void PipelineFragmentContext::trigger_report_if_necessary() { _runtime_state->load_channel_profile()->pretty_print(&ss); } - VLOG_FILE << "Query " << print_id(this->get_query_id()) << " fragment " - << this->get_fragment_id() << " profile:\n" + VLOG_FILE << "Query " << print_id(get_query_id()) << " fragment " << get_fragment_id() + << " profile:\n" << ss.str(); } auto st = send_report(false); @@ -714,6 +701,9 @@ Status PipelineFragmentContext::_create_tree_helper(ObjectPool* pool, (followed_by_shuffled_operator || op->is_shuffled_operator()) && require_shuffled_data_distribution; + if (num_children == 0) { + _use_serial_source = op->is_serial_operator(); + } // rely on that tnodes is preorder of the plan for (int i = 0; i < num_children; i++) { ++*node_idx; @@ -746,9 +736,7 @@ Status PipelineFragmentContext::_add_local_exchange_impl( int idx, ObjectPool* pool, PipelinePtr 
cur_pipe, PipelinePtr new_pip, DataDistribution data_distribution, bool* do_local_exchange, int num_buckets, const std::map& bucket_seq_to_instance_idx, - const std::map& shuffle_idx_to_instance_idx, - const bool ignore_data_hash_distribution) { - num_buckets = num_buckets != 0 ? num_buckets : _num_instances; + const std::map& shuffle_idx_to_instance_idx) { auto& operators = cur_pipe->operators(); const auto downstream_pipeline_id = cur_pipe->id(); auto local_exchange_id = next_operator_id(); @@ -763,13 +751,12 @@ Status PipelineFragmentContext::_add_local_exchange_impl( const bool followed_by_shuffled_operator = operators.size() > idx ? operators[idx]->followed_by_shuffled_operator() : cur_pipe->sink()->followed_by_shuffled_operator(); - const bool should_disable_bucket_shuffle = + const bool use_global_hash_shuffle = bucket_seq_to_instance_idx.empty() && shuffle_idx_to_instance_idx.find(-1) == shuffle_idx_to_instance_idx.end() && - followed_by_shuffled_operator; + followed_by_shuffled_operator && !_use_serial_source; sink.reset(new LocalExchangeSinkOperatorX( - sink_id, local_exchange_id, - should_disable_bucket_shuffle ? _total_instances : _num_instances, + sink_id, local_exchange_id, use_global_hash_shuffle ? _total_instances : _num_instances, data_distribution.partition_exprs, bucket_seq_to_instance_idx)); if (bucket_seq_to_instance_idx.empty() && data_distribution.distribution_type == ExchangeType::BUCKET_HASH_SHUFFLE) { @@ -777,8 +764,7 @@ Status PipelineFragmentContext::_add_local_exchange_impl( } RETURN_IF_ERROR(new_pip->set_sink(sink)); RETURN_IF_ERROR(new_pip->sink()->init(data_distribution.distribution_type, num_buckets, - should_disable_bucket_shuffle, - shuffle_idx_to_instance_idx)); + use_global_hash_shuffle, shuffle_idx_to_instance_idx)); // 2. Create and initialize LocalExchangeSharedState. 
std::shared_ptr shared_state = @@ -789,7 +775,7 @@ Status PipelineFragmentContext::_add_local_exchange_impl( case ExchangeType::HASH_SHUFFLE: shared_state->exchanger = ShuffleExchanger::create_unique( std::max(cur_pipe->num_tasks(), _num_instances), - should_disable_bucket_shuffle ? _total_instances : _num_instances, + use_global_hash_shuffle ? _total_instances : _num_instances, _runtime_state->query_options().__isset.local_exchange_free_blocks_limit ? _runtime_state->query_options().local_exchange_free_blocks_limit : 0); @@ -797,7 +783,6 @@ Status PipelineFragmentContext::_add_local_exchange_impl( case ExchangeType::BUCKET_HASH_SHUFFLE: shared_state->exchanger = BucketShuffleExchanger::create_unique( std::max(cur_pipe->num_tasks(), _num_instances), _num_instances, num_buckets, - ignore_data_hash_distribution, _runtime_state->query_options().__isset.local_exchange_free_blocks_limit ? _runtime_state->query_options().local_exchange_free_blocks_limit : 0); @@ -927,14 +912,12 @@ Status PipelineFragmentContext::_add_local_exchange( int pip_idx, int idx, int node_id, ObjectPool* pool, PipelinePtr cur_pipe, DataDistribution data_distribution, bool* do_local_exchange, int num_buckets, const std::map& bucket_seq_to_instance_idx, - const std::map& shuffle_idx_to_instance_idx, - const bool ignore_data_distribution) { - DCHECK(_enable_local_shuffle()); - if (_num_instances <= 1) { + const std::map& shuffle_idx_to_instance_idx) { + if (_num_instances <= 1 || cur_pipe->num_tasks_of_parent() <= 1) { return Status::OK(); } - if (!cur_pipe->need_to_local_exchange(data_distribution)) { + if (!cur_pipe->need_to_local_exchange(data_distribution, idx)) { return Status::OK(); } *do_local_exchange = true; @@ -944,7 +927,7 @@ Status PipelineFragmentContext::_add_local_exchange( auto new_pip = add_pipeline(cur_pipe, pip_idx + 1); RETURN_IF_ERROR(_add_local_exchange_impl( idx, pool, cur_pipe, new_pip, data_distribution, do_local_exchange, num_buckets, - bucket_seq_to_instance_idx, 
shuffle_idx_to_instance_idx, ignore_data_distribution)); + bucket_seq_to_instance_idx, shuffle_idx_to_instance_idx)); CHECK(total_op_num + 1 == cur_pipe->operators().size() + new_pip->operators().size()) << "total_op_num: " << total_op_num @@ -957,7 +940,7 @@ Status PipelineFragmentContext::_add_local_exchange( RETURN_IF_ERROR(_add_local_exchange_impl( new_pip->operators().size(), pool, new_pip, add_pipeline(new_pip, pip_idx + 2), DataDistribution(ExchangeType::PASSTHROUGH), do_local_exchange, num_buckets, - bucket_seq_to_instance_idx, shuffle_idx_to_instance_idx, ignore_data_distribution)); + bucket_seq_to_instance_idx, shuffle_idx_to_instance_idx)); } return Status::OK(); } @@ -983,13 +966,8 @@ Status PipelineFragmentContext::_plan_local_exchange( // scan node. so here use `_num_instance` to replace the `num_buckets` to prevent dividing 0 // still keep colocate plan after local shuffle RETURN_IF_ERROR(_plan_local_exchange( - _pipelines[pip_idx]->operators().front()->ignore_data_hash_distribution() || - num_buckets == 0 - ? _num_instances - : num_buckets, - pip_idx, _pipelines[pip_idx], bucket_seq_to_instance_idx, - shuffle_idx_to_instance_idx, - _pipelines[pip_idx]->operators().front()->ignore_data_hash_distribution())); + _use_serial_source || num_buckets == 0 ? 
_num_instances : num_buckets, pip_idx, + _pipelines[pip_idx], bucket_seq_to_instance_idx, shuffle_idx_to_instance_idx)); } return Status::OK(); } @@ -997,8 +975,7 @@ Status PipelineFragmentContext::_plan_local_exchange( Status PipelineFragmentContext::_plan_local_exchange( int num_buckets, int pip_idx, PipelinePtr pip, const std::map& bucket_seq_to_instance_idx, - const std::map& shuffle_idx_to_instance_idx, - const bool ignore_data_hash_distribution) { + const std::map& shuffle_idx_to_instance_idx) { int idx = 1; bool do_local_exchange = false; do { @@ -1010,8 +987,7 @@ Status PipelineFragmentContext::_plan_local_exchange( RETURN_IF_ERROR(_add_local_exchange( pip_idx, idx, ops[idx]->node_id(), _runtime_state->obj_pool(), pip, ops[idx]->required_data_distribution(), &do_local_exchange, num_buckets, - bucket_seq_to_instance_idx, shuffle_idx_to_instance_idx, - ignore_data_hash_distribution)); + bucket_seq_to_instance_idx, shuffle_idx_to_instance_idx)); } if (do_local_exchange) { // If local exchange is needed for current operator, we will split this pipeline to @@ -1028,8 +1004,7 @@ Status PipelineFragmentContext::_plan_local_exchange( RETURN_IF_ERROR(_add_local_exchange( pip_idx, idx, pip->sink()->node_id(), _runtime_state->obj_pool(), pip, pip->sink()->required_data_distribution(), &do_local_exchange, num_buckets, - bucket_seq_to_instance_idx, shuffle_idx_to_instance_idx, - ignore_data_hash_distribution)); + bucket_seq_to_instance_idx, shuffle_idx_to_instance_idx)); } return Status::OK(); } @@ -1168,7 +1143,8 @@ Status PipelineFragmentContext::_create_data_sink(ObjectPool* pool, const TDataS // 1. create and set the source operator of multi_cast_data_stream_source for new pipeline source_op.reset(new MultiCastDataStreamerSourceOperatorX( i, pool, thrift_sink.multi_cast_stream_sink.sinks[i], row_desc, source_id)); - RETURN_IF_ERROR(new_pipeline->add_operator(source_op)); + RETURN_IF_ERROR(new_pipeline->add_operator( + source_op, params.__isset.parallel_instances ? 
params.parallel_instances : 0)); // 2. create and set sink operator of data stream sender for new pipeline DataSinkOperatorPtr sink_op; @@ -1210,15 +1186,17 @@ Status PipelineFragmentContext::_create_operator(ObjectPool* pool, const TPlanNo // Therefore, here we need to use a stack-like structure. _pipeline_parent_map.pop(cur_pipe, parent_idx, child_idx); std::stringstream error_msg; + bool enable_query_cache = request.fragment.__isset.query_cache_param; + bool fe_with_old_version = false; switch (tnode.node_type) { case TPlanNodeType::OLAP_SCAN_NODE: { - op.reset(new OlapScanOperatorX(pool, tnode, next_operator_id(), descs, _num_instances)); - RETURN_IF_ERROR(cur_pipe->add_operator(op)); - if (request.__isset.parallel_instances) { - cur_pipe->set_num_tasks(request.parallel_instances); - op->set_ignore_data_distribution(); - } + op.reset(new OlapScanOperatorX( + pool, tnode, next_operator_id(), descs, _num_instances, + enable_query_cache ? request.fragment.query_cache_param : TQueryCacheParam {})); + RETURN_IF_ERROR(cur_pipe->add_operator( + op, request.__isset.parallel_instances ? request.parallel_instances : 0)); + fe_with_old_version = !tnode.__isset.is_serial_operator; break; } case TPlanNodeType::GROUP_COMMIT_SCAN_NODE: { @@ -1227,56 +1205,46 @@ Status PipelineFragmentContext::_create_operator(ObjectPool* pool, const TPlanNo _query_ctx->query_mem_tracker->is_group_commit_load = true; #endif op.reset(new GroupCommitOperatorX(pool, tnode, next_operator_id(), descs, _num_instances)); - RETURN_IF_ERROR(cur_pipe->add_operator(op)); - if (request.__isset.parallel_instances) { - cur_pipe->set_num_tasks(request.parallel_instances); - op->set_ignore_data_distribution(); - } + RETURN_IF_ERROR(cur_pipe->add_operator( + op, request.__isset.parallel_instances ? 
request.parallel_instances : 0)); + fe_with_old_version = !tnode.__isset.is_serial_operator; break; } case doris::TPlanNodeType::JDBC_SCAN_NODE: { if (config::enable_java_support) { op.reset(new JDBCScanOperatorX(pool, tnode, next_operator_id(), descs, _num_instances)); - RETURN_IF_ERROR(cur_pipe->add_operator(op)); + RETURN_IF_ERROR(cur_pipe->add_operator( + op, request.__isset.parallel_instances ? request.parallel_instances : 0)); } else { return Status::InternalError( "Jdbc scan node is disabled, you can change be config enable_java_support " "to true and restart be."); } - if (request.__isset.parallel_instances) { - cur_pipe->set_num_tasks(request.parallel_instances); - op->set_ignore_data_distribution(); - } + fe_with_old_version = !tnode.__isset.is_serial_operator; break; } case doris::TPlanNodeType::FILE_SCAN_NODE: { op.reset(new FileScanOperatorX(pool, tnode, next_operator_id(), descs, _num_instances)); - RETURN_IF_ERROR(cur_pipe->add_operator(op)); - if (request.__isset.parallel_instances) { - cur_pipe->set_num_tasks(request.parallel_instances); - op->set_ignore_data_distribution(); - } + RETURN_IF_ERROR(cur_pipe->add_operator( + op, request.__isset.parallel_instances ? request.parallel_instances : 0)); + fe_with_old_version = !tnode.__isset.is_serial_operator; break; } case TPlanNodeType::ES_SCAN_NODE: case TPlanNodeType::ES_HTTP_SCAN_NODE: { op.reset(new EsScanOperatorX(pool, tnode, next_operator_id(), descs, _num_instances)); - RETURN_IF_ERROR(cur_pipe->add_operator(op)); - if (request.__isset.parallel_instances) { - cur_pipe->set_num_tasks(request.parallel_instances); - op->set_ignore_data_distribution(); - } + RETURN_IF_ERROR(cur_pipe->add_operator( + op, request.__isset.parallel_instances ? 
request.parallel_instances : 0)); + fe_with_old_version = !tnode.__isset.is_serial_operator; break; } case TPlanNodeType::EXCHANGE_NODE: { int num_senders = find_with_default(request.per_exch_num_senders, tnode.node_id, 0); DCHECK_GT(num_senders, 0); op.reset(new ExchangeSourceOperatorX(pool, tnode, next_operator_id(), descs, num_senders)); - RETURN_IF_ERROR(cur_pipe->add_operator(op)); - if (request.__isset.parallel_instances) { - op->set_ignore_data_distribution(); - cur_pipe->set_num_tasks(request.parallel_instances); - } + RETURN_IF_ERROR(cur_pipe->add_operator( + op, request.__isset.parallel_instances ? request.parallel_instances : 0)); + fe_with_old_version = !tnode.__isset.is_serial_operator; break; } case TPlanNodeType::AGGREGATION_NODE: { @@ -1286,6 +1254,27 @@ Status PipelineFragmentContext::_create_operator(ObjectPool* pool, const TPlanNo ": group by and output is empty"); } + auto create_query_cache_operator = [&](PipelinePtr& new_pipe) { + auto cache_node_id = request.local_params[0].per_node_scan_ranges.begin()->first; + auto cache_source_id = next_operator_id(); + op.reset(new CacheSourceOperatorX(pool, cache_node_id, cache_source_id, + request.fragment.query_cache_param)); + RETURN_IF_ERROR(cur_pipe->add_operator( + op, request.__isset.parallel_instances ? 
request.parallel_instances : 0)); + + const auto downstream_pipeline_id = cur_pipe->id(); + if (_dag.find(downstream_pipeline_id) == _dag.end()) { + _dag.insert({downstream_pipeline_id, {}}); + } + new_pipe = add_pipeline(cur_pipe); + _dag[downstream_pipeline_id].push_back(new_pipe->id()); + + DataSinkOperatorPtr cache_sink( + new CacheSinkOperatorX(next_sink_operator_id(), cache_source_id)); + cache_sink->set_dests_id({op->operator_id()}); + RETURN_IF_ERROR(new_pipe->set_sink(cache_sink)); + return Status::OK(); + }; const bool group_by_limit_opt = tnode.agg_node.__isset.agg_sort_info_by_group_key && tnode.limit > 0; @@ -1298,24 +1287,65 @@ Status PipelineFragmentContext::_create_operator(ObjectPool* pool, const TPlanNo request.query_options.__isset.enable_distinct_streaming_aggregation && request.query_options.enable_distinct_streaming_aggregation && !tnode.agg_node.grouping_exprs.empty() && !group_by_limit_opt) { - op.reset(new DistinctStreamingAggOperatorX(pool, next_operator_id(), tnode, descs, - _require_bucket_distribution)); - op->set_followed_by_shuffled_operator(followed_by_shuffled_operator); - _require_bucket_distribution = - _require_bucket_distribution || op->require_data_distribution(); - RETURN_IF_ERROR(cur_pipe->add_operator(op)); + if (enable_query_cache) { + PipelinePtr new_pipe; + RETURN_IF_ERROR(create_query_cache_operator(new_pipe)); + + op.reset(new DistinctStreamingAggOperatorX(pool, next_operator_id(), tnode, descs, + _require_bucket_distribution)); + op->set_followed_by_shuffled_operator(false); + _require_bucket_distribution = true; + RETURN_IF_ERROR(new_pipe->add_operator( + op, request.__isset.parallel_instances ? 
request.parallel_instances : 0)); + RETURN_IF_ERROR(cur_pipe->operators().front()->set_child(op)); + cur_pipe = new_pipe; + } else { + op.reset(new DistinctStreamingAggOperatorX(pool, next_operator_id(), tnode, descs, + _require_bucket_distribution)); + op->set_followed_by_shuffled_operator(followed_by_shuffled_operator); + _require_bucket_distribution = + _require_bucket_distribution || op->require_data_distribution(); + RETURN_IF_ERROR(cur_pipe->add_operator( + op, request.__isset.parallel_instances ? request.parallel_instances : 0)); + } } else if (tnode.agg_node.__isset.use_streaming_preaggregation && tnode.agg_node.use_streaming_preaggregation && !tnode.agg_node.grouping_exprs.empty()) { - op.reset(new StreamingAggOperatorX(pool, next_operator_id(), tnode, descs)); - RETURN_IF_ERROR(cur_pipe->add_operator(op)); + if (enable_query_cache) { + PipelinePtr new_pipe; + RETURN_IF_ERROR(create_query_cache_operator(new_pipe)); + + op.reset(new StreamingAggOperatorX(pool, next_operator_id(), tnode, descs)); + RETURN_IF_ERROR(cur_pipe->operators().front()->set_child(op)); + RETURN_IF_ERROR(new_pipe->add_operator( + op, request.__isset.parallel_instances ? request.parallel_instances : 0)); + cur_pipe = new_pipe; + } else { + op.reset(new StreamingAggOperatorX(pool, next_operator_id(), tnode, descs)); + RETURN_IF_ERROR(cur_pipe->add_operator( + op, request.__isset.parallel_instances ? 
request.parallel_instances : 0)); + } } else { + // create new pipeline to add query cache operator + PipelinePtr new_pipe; + if (enable_query_cache) { + RETURN_IF_ERROR(create_query_cache_operator(new_pipe)); + } + if (enable_spill) { op.reset(new PartitionedAggSourceOperatorX(pool, tnode, next_operator_id(), descs)); } else { op.reset(new AggSourceOperatorX(pool, tnode, next_operator_id(), descs)); } - RETURN_IF_ERROR(cur_pipe->add_operator(op)); + if (enable_query_cache) { + RETURN_IF_ERROR(cur_pipe->operators().front()->set_child(op)); + RETURN_IF_ERROR(new_pipe->add_operator( + op, request.__isset.parallel_instances ? request.parallel_instances : 0)); + cur_pipe = new_pipe; + } else { + RETURN_IF_ERROR(cur_pipe->add_operator( + op, request.__isset.parallel_instances ? request.parallel_instances : 0)); + } const auto downstream_pipeline_id = cur_pipe->id(); if (_dag.find(downstream_pipeline_id) == _dag.end()) { @@ -1362,7 +1392,8 @@ Status PipelineFragmentContext::_create_operator(ObjectPool* pool, const TPlanNo pool, tnode_, next_operator_id(), descs, partition_count); probe_operator->set_inner_operators(inner_sink_operator, inner_probe_operator); op = std::move(probe_operator); - RETURN_IF_ERROR(cur_pipe->add_operator(op)); + RETURN_IF_ERROR(cur_pipe->add_operator( + op, request.__isset.parallel_instances ? request.parallel_instances : 0)); const auto downstream_pipeline_id = cur_pipe->id(); if (_dag.find(downstream_pipeline_id) == _dag.end()) { @@ -1386,7 +1417,8 @@ Status PipelineFragmentContext::_create_operator(ObjectPool* pool, const TPlanNo op->set_followed_by_shuffled_operator(op->is_shuffled_operator()); } else { op.reset(new HashJoinProbeOperatorX(pool, tnode, next_operator_id(), descs)); - RETURN_IF_ERROR(cur_pipe->add_operator(op)); + RETURN_IF_ERROR(cur_pipe->add_operator( + op, request.__isset.parallel_instances ? 
request.parallel_instances : 0)); const auto downstream_pipeline_id = cur_pipe->id(); if (_dag.find(downstream_pipeline_id) == _dag.end()) { @@ -1413,7 +1445,8 @@ Status PipelineFragmentContext::_create_operator(ObjectPool* pool, const TPlanNo } case TPlanNodeType::CROSS_JOIN_NODE: { op.reset(new NestedLoopJoinProbeOperatorX(pool, tnode, next_operator_id(), descs)); - RETURN_IF_ERROR(cur_pipe->add_operator(op)); + RETURN_IF_ERROR(cur_pipe->add_operator( + op, request.__isset.parallel_instances ? request.parallel_instances : 0)); const auto downstream_pipeline_id = cur_pipe->id(); if (_dag.find(downstream_pipeline_id) == _dag.end()) { @@ -1436,7 +1469,8 @@ Status PipelineFragmentContext::_create_operator(ObjectPool* pool, const TPlanNo int child_count = tnode.num_children; op.reset(new UnionSourceOperatorX(pool, tnode, next_operator_id(), descs)); op->set_followed_by_shuffled_operator(_require_bucket_distribution); - RETURN_IF_ERROR(cur_pipe->add_operator(op)); + RETURN_IF_ERROR(cur_pipe->add_operator( + op, request.__isset.parallel_instances ? request.parallel_instances : 0)); const auto downstream_pipeline_id = cur_pipe->id(); if (_dag.find(downstream_pipeline_id) == _dag.end()) { @@ -1464,7 +1498,8 @@ Status PipelineFragmentContext::_create_operator(ObjectPool* pool, const TPlanNo } else { op.reset(new SortSourceOperatorX(pool, tnode, next_operator_id(), descs)); } - RETURN_IF_ERROR(cur_pipe->add_operator(op)); + RETURN_IF_ERROR(cur_pipe->add_operator( + op, request.__isset.parallel_instances ? 
request.parallel_instances : 0)); const auto downstream_pipeline_id = cur_pipe->id(); if (_dag.find(downstream_pipeline_id) == _dag.end()) { @@ -1491,7 +1526,8 @@ Status PipelineFragmentContext::_create_operator(ObjectPool* pool, const TPlanNo } case doris::TPlanNodeType::PARTITION_SORT_NODE: { op.reset(new PartitionSortSourceOperatorX(pool, tnode, next_operator_id(), descs)); - RETURN_IF_ERROR(cur_pipe->add_operator(op)); + RETURN_IF_ERROR(cur_pipe->add_operator( + op, request.__isset.parallel_instances ? request.parallel_instances : 0)); const auto downstream_pipeline_id = cur_pipe->id(); if (_dag.find(downstream_pipeline_id) == _dag.end()) { @@ -1509,7 +1545,8 @@ Status PipelineFragmentContext::_create_operator(ObjectPool* pool, const TPlanNo } case TPlanNodeType::ANALYTIC_EVAL_NODE: { op.reset(new AnalyticSourceOperatorX(pool, tnode, next_operator_id(), descs)); - RETURN_IF_ERROR(cur_pipe->add_operator(op)); + RETURN_IF_ERROR(cur_pipe->add_operator( + op, request.__isset.parallel_instances ? 
request.parallel_instances : 0)); const auto downstream_pipeline_id = cur_pipe->id(); if (_dag.find(downstream_pipeline_id) == _dag.end()) { @@ -1531,64 +1568,73 @@ Status PipelineFragmentContext::_create_operator(ObjectPool* pool, const TPlanNo } case TPlanNodeType::INTERSECT_NODE: { RETURN_IF_ERROR(_build_operators_for_set_operation_node( - pool, tnode, descs, op, cur_pipe, parent_idx, child_idx)); + pool, tnode, descs, op, cur_pipe, parent_idx, child_idx, request)); op->set_followed_by_shuffled_operator(_require_bucket_distribution); break; } case TPlanNodeType::EXCEPT_NODE: { RETURN_IF_ERROR(_build_operators_for_set_operation_node( - pool, tnode, descs, op, cur_pipe, parent_idx, child_idx)); + pool, tnode, descs, op, cur_pipe, parent_idx, child_idx, request)); op->set_followed_by_shuffled_operator(_require_bucket_distribution); break; } case TPlanNodeType::REPEAT_NODE: { op.reset(new RepeatOperatorX(pool, tnode, next_operator_id(), descs)); - RETURN_IF_ERROR(cur_pipe->add_operator(op)); + RETURN_IF_ERROR(cur_pipe->add_operator( + op, request.__isset.parallel_instances ? request.parallel_instances : 0)); break; } case TPlanNodeType::TABLE_FUNCTION_NODE: { op.reset(new TableFunctionOperatorX(pool, tnode, next_operator_id(), descs)); - RETURN_IF_ERROR(cur_pipe->add_operator(op)); + RETURN_IF_ERROR(cur_pipe->add_operator( + op, request.__isset.parallel_instances ? request.parallel_instances : 0)); break; } case TPlanNodeType::ASSERT_NUM_ROWS_NODE: { op.reset(new AssertNumRowsOperatorX(pool, tnode, next_operator_id(), descs)); - RETURN_IF_ERROR(cur_pipe->add_operator(op)); + RETURN_IF_ERROR(cur_pipe->add_operator( + op, request.__isset.parallel_instances ? request.parallel_instances : 0)); break; } case TPlanNodeType::EMPTY_SET_NODE: { op.reset(new EmptySetSourceOperatorX(pool, tnode, next_operator_id(), descs)); - RETURN_IF_ERROR(cur_pipe->add_operator(op)); + RETURN_IF_ERROR(cur_pipe->add_operator( + op, request.__isset.parallel_instances ? 
request.parallel_instances : 0)); break; } case TPlanNodeType::DATA_GEN_SCAN_NODE: { op.reset(new DataGenSourceOperatorX(pool, tnode, next_operator_id(), descs)); - RETURN_IF_ERROR(cur_pipe->add_operator(op)); - if (request.__isset.parallel_instances) { - cur_pipe->set_num_tasks(request.parallel_instances); - op->set_ignore_data_distribution(); - } + RETURN_IF_ERROR(cur_pipe->add_operator( + op, request.__isset.parallel_instances ? request.parallel_instances : 0)); + fe_with_old_version = !tnode.__isset.is_serial_operator; break; } case TPlanNodeType::SCHEMA_SCAN_NODE: { op.reset(new SchemaScanOperatorX(pool, tnode, next_operator_id(), descs)); - RETURN_IF_ERROR(cur_pipe->add_operator(op)); + RETURN_IF_ERROR(cur_pipe->add_operator( + op, request.__isset.parallel_instances ? request.parallel_instances : 0)); break; } case TPlanNodeType::META_SCAN_NODE: { op.reset(new MetaScanOperatorX(pool, tnode, next_operator_id(), descs)); - RETURN_IF_ERROR(cur_pipe->add_operator(op)); + RETURN_IF_ERROR(cur_pipe->add_operator( + op, request.__isset.parallel_instances ? request.parallel_instances : 0)); break; } case TPlanNodeType::SELECT_NODE: { op.reset(new SelectOperatorX(pool, tnode, next_operator_id(), descs)); - RETURN_IF_ERROR(cur_pipe->add_operator(op)); + RETURN_IF_ERROR(cur_pipe->add_operator( + op, request.__isset.parallel_instances ? 
request.parallel_instances : 0)); break; } default: return Status::InternalError("Unsupported exec type in pipeline: {}", print_plan_node_type(tnode.node_type)); } + if (request.__isset.parallel_instances && fe_with_old_version) { + cur_pipe->set_num_tasks(request.parallel_instances); + op->set_serial_operator(); + } return Status::OK(); } @@ -1598,9 +1644,11 @@ Status PipelineFragmentContext::_create_operator(ObjectPool* pool, const TPlanNo template Status PipelineFragmentContext::_build_operators_for_set_operation_node( ObjectPool* pool, const TPlanNode& tnode, const DescriptorTbl& descs, OperatorPtr& op, - PipelinePtr& cur_pipe, int parent_idx, int child_idx) { + PipelinePtr& cur_pipe, int parent_idx, int child_idx, + const doris::TPipelineFragmentParams& request) { op.reset(new SetSourceOperatorX(pool, tnode, next_operator_id(), descs)); - RETURN_IF_ERROR(cur_pipe->add_operator(op)); + RETURN_IF_ERROR(cur_pipe->add_operator( + op, request.__isset.parallel_instances ? request.parallel_instances : 0)); const auto downstream_pipeline_id = cur_pipe->id(); if (_dag.find(downstream_pipeline_id) == _dag.end()) { @@ -1694,8 +1742,7 @@ void PipelineFragmentContext::_close_fragment_instance() { _runtime_state->load_channel_profile()->pretty_print(&ss); } - LOG_INFO("Query {} fragment {} profile:\n {}", print_id(this->_query_id), - this->_fragment_id, ss.str()); + LOG_INFO("Query {} fragment {} profile:\n {}", print_id(_query_id), _fragment_id, ss.str()); } if (_query_ctx->enable_profile()) { @@ -1719,7 +1766,6 @@ void PipelineFragmentContext::close_a_pipeline(PipelineId pipeline_id) { } } std::lock_guard l(_task_mutex); - g_pipeline_tasks_count << -1; ++_closed_tasks; if (_closed_tasks == _total_tasks) { _close_fragment_instance(); @@ -1789,9 +1835,9 @@ PipelineFragmentContext::collect_realtime_profile() const { // we do not have mutex to protect pipeline_id_to_profile // so we need to make sure this funciton is invoked after fragment context // has already been 
prepared. - if (!this->_prepared) { + if (!_prepared) { std::string msg = - "Query " + print_id(this->_query_id) + " collecting profile, but its not prepared"; + "Query " + print_id(_query_id) + " collecting profile, but its not prepared"; DCHECK(false) << msg; LOG_ERROR(msg); return res; @@ -1812,9 +1858,9 @@ PipelineFragmentContext::collect_realtime_load_channel_profile() const { // we do not have mutex to protect pipeline_id_to_profile // so we need to make sure this funciton is invoked after fragment context // has already been prepared. - if (!this->_prepared) { + if (!_prepared) { std::string msg = - "Query " + print_id(this->_query_id) + " collecting profile, but its not prepared"; + "Query " + print_id(_query_id) + " collecting profile, but its not prepared"; DCHECK(false) << msg; LOG_ERROR(msg); return nullptr; @@ -1822,19 +1868,19 @@ PipelineFragmentContext::collect_realtime_load_channel_profile() const { for (auto& runtime_states : _task_runtime_states) { for (auto& runtime_state : runtime_states) { - if (runtime_state->runtime_profile() == nullptr) { + if (runtime_state == nullptr || runtime_state->runtime_profile() == nullptr) { continue; } auto tmp_load_channel_profile = std::make_shared(); runtime_state->runtime_profile()->to_thrift(tmp_load_channel_profile.get()); - this->_runtime_state->load_channel_profile()->update(*tmp_load_channel_profile); + _runtime_state->load_channel_profile()->update(*tmp_load_channel_profile); } } auto load_channel_profile = std::make_shared(); - this->_runtime_state->load_channel_profile()->to_thrift(load_channel_profile.get()); + _runtime_state->load_channel_profile()->to_thrift(load_channel_profile.get()); return load_channel_profile; } diff --git a/be/src/pipeline/pipeline_fragment_context.h b/be/src/pipeline/pipeline_fragment_context.h index 822a23c54bda4e..2e75aeb414ef0f 100644 --- a/be/src/pipeline/pipeline_fragment_context.h +++ b/be/src/pipeline/pipeline_fragment_context.h @@ -78,9 +78,7 @@ class 
PipelineFragmentContext : public TaskExecutionContext { int timeout_second() const { return _timeout; } - PipelinePtr add_pipeline(); - - PipelinePtr add_pipeline(PipelinePtr parent, int idx = -1); + PipelinePtr add_pipeline(PipelinePtr parent = nullptr, int idx = -1); RuntimeState* get_runtime_state() { return _runtime_state.get(); } @@ -123,7 +121,7 @@ class PipelineFragmentContext : public TaskExecutionContext { _tasks[j][i]->stop_if_finished(); } } - }; + } private: Status _build_pipelines(ObjectPool* pool, const doris::TPipelineFragmentParams& request, @@ -142,7 +140,8 @@ class PipelineFragmentContext : public TaskExecutionContext { Status _build_operators_for_set_operation_node(ObjectPool* pool, const TPlanNode& tnode, const DescriptorTbl& descs, OperatorPtr& op, PipelinePtr& cur_pipe, int parent_idx, - int child_idx); + int child_idx, + const doris::TPipelineFragmentParams& request); Status _create_data_sink(ObjectPool* pool, const TDataSink& thrift_sink, const std::vector& output_exprs, @@ -154,24 +153,19 @@ class PipelineFragmentContext : public TaskExecutionContext { const std::map& shuffle_idx_to_instance_idx); Status _plan_local_exchange(int num_buckets, int pip_idx, PipelinePtr pip, const std::map& bucket_seq_to_instance_idx, - const std::map& shuffle_idx_to_instance_idx, - const bool ignore_data_distribution); + const std::map& shuffle_idx_to_instance_idx); void _inherit_pipeline_properties(const DataDistribution& data_distribution, PipelinePtr pipe_with_source, PipelinePtr pipe_with_sink); Status _add_local_exchange(int pip_idx, int idx, int node_id, ObjectPool* pool, PipelinePtr cur_pipe, DataDistribution data_distribution, bool* do_local_exchange, int num_buckets, const std::map& bucket_seq_to_instance_idx, - const std::map& shuffle_idx_to_instance_idx, - const bool ignore_data_distribution); + const std::map& shuffle_idx_to_instance_idx); Status _add_local_exchange_impl(int idx, ObjectPool* pool, PipelinePtr cur_pipe, PipelinePtr new_pip, 
DataDistribution data_distribution, bool* do_local_exchange, int num_buckets, const std::map& bucket_seq_to_instance_idx, - const std::map& shuffle_idx_to_instance_idx, - const bool ignore_data_hash_distribution); - - bool _enable_local_shuffle() const { return _runtime_state->enable_local_shuffle(); } + const std::map& shuffle_idx_to_instance_idx); Status _build_pipeline_tasks(const doris::TPipelineFragmentParams& request, ThreadPool* thread_pool); @@ -206,7 +200,7 @@ class PipelineFragmentContext : public TaskExecutionContext { RuntimeProfile::Counter* _prepare_timer = nullptr; RuntimeProfile::Counter* _init_context_timer = nullptr; RuntimeProfile::Counter* _build_pipelines_timer = nullptr; - RuntimeProfile::Counter* _plan_local_shuffle_timer = nullptr; + RuntimeProfile::Counter* _plan_local_exchanger_timer = nullptr; RuntimeProfile::Counter* _prepare_all_pipelines_timer = nullptr; RuntimeProfile::Counter* _build_tasks_timer = nullptr; @@ -228,6 +222,7 @@ class PipelineFragmentContext : public TaskExecutionContext { int _num_instances = 1; int _timeout = -1; + bool _use_serial_source = false; OperatorPtr _root_op = nullptr; // this is a [n * m] matrix. n is parallelism of pipeline engine and m is the number of pipelines. @@ -290,7 +285,20 @@ class PipelineFragmentContext : public TaskExecutionContext { // - _task_runtime_states is at the task level, unique to each task. std::vector _fragment_instance_ids; - // Local runtime states for each task + /** + * Local runtime states for each task. + * + * 2-D matrix: + * +-------------------------+------------+-------+ + * | | Instance 0 | Instance 1 | ... | + * +------------+------------+------------+-------+ + * | Pipeline 0 | task 0-0 | task 0-1 | ... | + * +------------+------------+------------+-------+ + * | Pipeline 1 | task 1-0 | task 1-1 | ... | + * +------------+------------+------------+-------+ + * | ... 
| + * +--------------------------------------+-------+ + */ std::vector>> _task_runtime_states; std::vector> _runtime_filter_states; diff --git a/be/src/pipeline/query_cache/query_cache.cpp b/be/src/pipeline/query_cache/query_cache.cpp new file mode 100644 index 00000000000000..20e342e140f156 --- /dev/null +++ b/be/src/pipeline/query_cache/query_cache.cpp @@ -0,0 +1,69 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "query_cache.h" + +namespace doris { + +std::vector* QueryCacheHandle::get_cache_slot_orders() { + DCHECK(_handle); + auto result_ptr = reinterpret_cast(_handle)->value; + return &((QueryCache::CacheValue*)(result_ptr))->slot_orders; +} + +CacheResult* QueryCacheHandle::get_cache_result() { + DCHECK(_handle); + auto result_ptr = reinterpret_cast(_handle)->value; + return &((QueryCache::CacheValue*)(result_ptr))->result; +} + +int64_t QueryCacheHandle::get_cache_version() { + DCHECK(_handle); + auto result_ptr = reinterpret_cast(_handle)->value; + return ((QueryCache::CacheValue*)(result_ptr))->version; +} + +void QueryCache::insert(const CacheKey& key, int64_t version, CacheResult& res, + const std::vector& slot_orders, int64_t cache_size) { + SCOPED_SWITCH_THREAD_MEM_TRACKER_LIMITER(ExecEnv::GetInstance()->query_cache_mem_tracker()); + CacheResult cache_result; + for (auto& block_data : res) { + cache_result.emplace_back(vectorized::Block::create_unique()) + ->swap(block_data->clone_empty()); + (void)vectorized::MutableBlock(cache_result.back().get()).merge(*block_data); + } + auto cache_value_ptr = + std::make_unique(version, std::move(cache_result), slot_orders); + + QueryCacheHandle(this, LRUCachePolicy::insert(key, (void*)cache_value_ptr.release(), cache_size, + cache_size, CachePriority::NORMAL)); +} + +bool QueryCache::lookup(const CacheKey& key, int64_t version, doris::QueryCacheHandle* handle) { + SCOPED_SWITCH_THREAD_MEM_TRACKER_LIMITER(ExecEnv::GetInstance()->query_cache_mem_tracker()); + auto* lru_handle = LRUCachePolicy::lookup(key); + if (lru_handle) { + QueryCacheHandle tmp_handle(this, lru_handle); + if (tmp_handle.get_cache_version() == version) { + *handle = std::move(tmp_handle); + return true; + } + } + return false; +} + +} // namespace doris \ No newline at end of file diff --git a/be/src/pipeline/query_cache/query_cache.h b/be/src/pipeline/query_cache/query_cache.h new file mode 100644 index 00000000000000..827c516ad75f07 --- 
/dev/null +++ b/be/src/pipeline/query_cache/query_cache.h @@ -0,0 +1,152 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include +#include +#include + +#include +#include +#include +#include + +#include "common/config.h" +#include "common/status.h" +#include "io/fs/file_system.h" +#include "io/fs/path.h" +#include "olap/lru_cache.h" +#include "runtime/exec_env.h" +#include "runtime/memory/lru_cache_policy.h" +#include "runtime/memory/mem_tracker.h" +#include "util/slice.h" +#include "util/time.h" +#include "vec/core/block.h" + +namespace doris { + +using CacheResult = std::vector; +// A handle for mid-result from query lru cache. +// The handle will automatically release the cache entry when it is destroyed. +// So the caller need to make sure the handle is valid in lifecycle. 
+class QueryCacheHandle { +public: + QueryCacheHandle() = default; + QueryCacheHandle(LRUCachePolicy* cache, Cache::Handle* handle) + : _cache(cache), _handle(handle) {} + + ~QueryCacheHandle() { + if (_handle != nullptr) { + CHECK(_cache != nullptr); + { + SCOPED_SWITCH_THREAD_MEM_TRACKER_LIMITER( + ExecEnv::GetInstance()->query_cache_mem_tracker()); + _cache->release(_handle); + } + } + } + + QueryCacheHandle(QueryCacheHandle&& other) noexcept { + std::swap(_cache, other._cache); + std::swap(_handle, other._handle); + } + + QueryCacheHandle& operator=(QueryCacheHandle&& other) noexcept { + std::swap(_cache, other._cache); + std::swap(_handle, other._handle); + return *this; + } + + std::vector* get_cache_slot_orders(); + + CacheResult* get_cache_result(); + + int64_t get_cache_version(); + +private: + LRUCachePolicy* _cache = nullptr; + Cache::Handle* _handle = nullptr; + + // Don't allow copy and assign + DISALLOW_COPY_AND_ASSIGN(QueryCacheHandle); +}; + +class QueryCache : public LRUCachePolicy { +public: + using LRUCachePolicy::insert; + + struct CacheValue : public LRUCacheValueBase { + int64_t version; + CacheResult result; + std::vector slot_orders; + + CacheValue(int64_t v, CacheResult&& r, const std::vector& so) + : LRUCacheValueBase(), version(v), result(std::move(r)), slot_orders(so) {} + }; + + // Create global instance of this class + static QueryCache* create_global_cache(size_t capacity, uint32_t num_shards = 16) { + auto* res = new QueryCache(capacity, num_shards); + return res; + } + + static Status build_cache_key(const std::vector& scan_ranges, + const TQueryCacheParam& cache_param, std::string* cache_key, + int64_t* version) { + if (scan_ranges.size() > 1) { + return Status::InternalError( + "CacheSourceOperator only support one scan range, plan error"); + } + auto& scan_range = scan_ranges[0]; + DCHECK(scan_range.scan_range.__isset.palo_scan_range); + auto tablet_id = scan_range.scan_range.palo_scan_range.tablet_id; + + 
std::from_chars(scan_range.scan_range.palo_scan_range.version.data(), + scan_range.scan_range.palo_scan_range.version.data() + + scan_range.scan_range.palo_scan_range.version.size(), + *version); + + auto find_tablet = cache_param.tablet_to_range.find(tablet_id); + if (find_tablet == cache_param.tablet_to_range.end()) { + return Status::InternalError("Not find tablet in partition_to_tablets, plan error"); + } + + *cache_key = cache_param.digest + + std::string(reinterpret_cast(&tablet_id), sizeof(tablet_id)) + + find_tablet->second; + + return Status::OK(); + } + + // Return global instance. + // Client should call create_global_cache before. + static QueryCache* instance() { return ExecEnv::GetInstance()->get_query_cache(); } + + QueryCache() = delete; + + QueryCache(size_t capacity, uint32_t num_shards) + : LRUCachePolicy(CachePolicy::CacheType::QUERY_CACHE, capacity, LRUCacheType::SIZE, + 3600 * 24, num_shards) {} + + bool lookup(const CacheKey& key, int64_t version, QueryCacheHandle* handle); + + void insert(const CacheKey& key, int64_t version, CacheResult& result, + const std::vector& solt_orders, int64_t cache_size); +}; +} // namespace doris diff --git a/be/src/runtime/exec_env.h b/be/src/runtime/exec_env.h index e77a1c7ae41980..afd99a624f2c78 100644 --- a/be/src/runtime/exec_env.h +++ b/be/src/runtime/exec_env.h @@ -65,6 +65,7 @@ class InvertedIndexQueryCache; class TmpFileDirs; } // namespace segment_v2 +class QueryCache; class WorkloadSchedPolicyMgr; class BfdParser; class BrokerMgr; @@ -100,6 +101,7 @@ class FrontendServiceClient; class FileMetaCache; class GroupCommitMgr; class TabletSchemaCache; +class TabletColumnObjectPool; class UserFunctionCache; class SchemaCache; class StoragePageCache; @@ -187,6 +189,9 @@ class ExecEnv { std::shared_ptr point_query_executor_mem_tracker() { return _point_query_executor_mem_tracker; } + std::shared_ptr query_cache_mem_tracker() { + return _query_cache_mem_tracker; + } std::shared_ptr 
block_compression_mem_tracker() { return _block_compression_mem_tracker; } @@ -268,6 +273,9 @@ class ExecEnv { void set_storage_engine(std::unique_ptr&& engine); void set_cache_manager(CacheManager* cm) { this->_cache_manager = cm; } void set_tablet_schema_cache(TabletSchemaCache* c) { this->_tablet_schema_cache = c; } + void set_tablet_column_object_pool(TabletColumnObjectPool* c) { + this->_tablet_column_object_pool = c; + } void set_storage_page_cache(StoragePageCache* c) { this->_storage_page_cache = c; } void set_segment_loader(SegmentLoader* sl) { this->_segment_loader = sl; } void set_routine_load_task_executor(RoutineLoadTaskExecutor* r) { @@ -293,6 +301,7 @@ class ExecEnv { std::map get_running_frontends(); TabletSchemaCache* get_tablet_schema_cache() { return _tablet_schema_cache; } + TabletColumnObjectPool* get_tablet_column_object_pool() { return _tablet_column_object_pool; } SchemaCache* schema_cache() { return _schema_cache; } StoragePageCache* get_storage_page_cache() { return _storage_page_cache; } SegmentLoader* segment_loader() { return _segment_loader; } @@ -305,6 +314,7 @@ class ExecEnv { segment_v2::InvertedIndexQueryCache* get_inverted_index_query_cache() { return _inverted_index_query_cache; } + QueryCache* get_query_cache() { return _query_cache; } std::shared_ptr get_dummy_lru_cache() { return _dummy_lru_cache; } pipeline::RuntimeFilterTimerQueue* runtime_filter_timer_queue() { @@ -366,6 +376,7 @@ class ExecEnv { // Tracking memory may be shared between multiple queries. std::shared_ptr _point_query_executor_mem_tracker; std::shared_ptr _block_compression_mem_tracker; + std::shared_ptr _query_cache_mem_tracker; // TODO, looking forward to more accurate tracking. std::shared_ptr _rowid_storage_reader_tracker; @@ -428,6 +439,7 @@ class ExecEnv { // these redundancy header could introduce potential bug, at least, more header means slow compile. // So we choose to use raw pointer, please remember to delete these pointer in deconstructor. 
TabletSchemaCache* _tablet_schema_cache = nullptr; + TabletColumnObjectPool* _tablet_column_object_pool = nullptr; std::unique_ptr _storage_engine; SchemaCache* _schema_cache = nullptr; StoragePageCache* _storage_page_cache = nullptr; @@ -437,6 +449,7 @@ class ExecEnv { CacheManager* _cache_manager = nullptr; segment_v2::InvertedIndexSearcherCache* _inverted_index_searcher_cache = nullptr; segment_v2::InvertedIndexQueryCache* _inverted_index_query_cache = nullptr; + QueryCache* _query_cache = nullptr; std::shared_ptr _dummy_lru_cache = nullptr; std::unique_ptr _file_cache_open_fd_cache; diff --git a/be/src/runtime/exec_env_init.cpp b/be/src/runtime/exec_env_init.cpp index eb9fa12ea4bbdb..062069044dcd37 100644 --- a/be/src/runtime/exec_env_init.cpp +++ b/be/src/runtime/exec_env_init.cpp @@ -53,9 +53,11 @@ #include "olap/schema_cache.h" #include "olap/segment_loader.h" #include "olap/storage_engine.h" +#include "olap/tablet_column_object_pool.h" #include "olap/tablet_schema_cache.h" #include "olap/wal/wal_manager.h" #include "pipeline/pipeline_tracing.h" +#include "pipeline/query_cache/query_cache.h" #include "pipeline/task_queue.h" #include "pipeline/task_scheduler.h" #include "runtime/broker_mgr.h" @@ -336,6 +338,9 @@ Status ExecEnv::_init(const std::vector& store_paths, _tablet_schema_cache = TabletSchemaCache::create_global_schema_cache(config::tablet_schema_cache_capacity); + _tablet_column_object_pool = TabletColumnObjectPool::create_global_column_cache( + config::tablet_schema_cache_capacity); + // Storage engine doris::EngineOptions options; options.store_paths = store_paths; @@ -584,6 +589,9 @@ Status ExecEnv::_init_mem_env() { _orc_memory_pool = new doris::vectorized::ORCMemoryPool(); _arrow_memory_pool = new doris::vectorized::ArrowMemoryPool(); + _query_cache = QueryCache::create_global_cache(config::query_cache_size * 1024L * 1024L); + LOG(INFO) << "query cache memory limit: " << config::query_cache_size << "MB"; + return Status::OK(); } @@ -600,7 +608,9 
@@ void ExecEnv::init_mem_tracker() { MemTrackerLimiter::create_shared(MemTrackerLimiter::Type::GLOBAL, "SegCompaction"); _point_query_executor_mem_tracker = MemTrackerLimiter::create_shared(MemTrackerLimiter::Type::GLOBAL, "PointQueryExecutor"); - _block_compression_mem_tracker = + _query_cache_mem_tracker = + MemTrackerLimiter::create_shared(MemTrackerLimiter::Type::GLOBAL, "QueryCache"); + _block_compression_mem_tracker = MemTrackerLimiter::create_shared(MemTrackerLimiter::Type::GLOBAL, "BlockCompression"); _rowid_storage_reader_tracker = MemTrackerLimiter::create_shared(MemTrackerLimiter::Type::GLOBAL, "RowIdStorageReader"); @@ -666,7 +676,7 @@ void ExecEnv::destroy() { SAFE_STOP(_write_cooldown_meta_executors); // StorageEngine must be destoried before _page_no_cache_mem_tracker.reset and _cache_manager destory - // shouldn't use SAFE_STOP. otherwise will lead to twice stop. + SAFE_STOP(_storage_engine); _storage_engine.reset(); SAFE_STOP(_spill_stream_mgr); @@ -690,6 +700,7 @@ void ExecEnv::destroy() { SAFE_DELETE(_schema_cache); SAFE_DELETE(_segment_loader); SAFE_DELETE(_row_cache); + SAFE_DELETE(_query_cache); // Free resource after threads are stopped. // Some threads are still running, like threads created by _new_load_stream_mgr ... diff --git a/be/src/runtime/fragment_mgr.cpp b/be/src/runtime/fragment_mgr.cpp index 7ba73442c90168..e683b84e2b4e4e 100644 --- a/be/src/runtime/fragment_mgr.cpp +++ b/be/src/runtime/fragment_mgr.cpp @@ -299,6 +299,10 @@ Status FragmentMgr::trigger_pipeline_context_report( // including the final status when execution finishes. void FragmentMgr::coordinator_callback(const ReportStatusRequest& req) { DCHECK(req.status.ok() || req.done); // if !status.ok() => done + if (req.coord_addr.hostname == "external") { + // External query (flink/spark read tablets) not need to report to FE.
+ return; + } Status exec_status = req.status; Status coord_status; FrontendServiceConnection coord(_exec_env->frontend_client_cache(), req.coord_addr, @@ -836,31 +840,33 @@ Status FragmentMgr::exec_plan_fragment(const TPipelineFragmentParams& params, query_ctx->set_merge_controller_handler(handler); } - for (const auto& local_param : params.local_params) { - const TUniqueId& fragment_instance_id = local_param.fragment_instance_id; + { + // (query_id, fragment_id) is executed only on one BE, locks _pipeline_map. std::lock_guard lock(_lock); - auto iter = _pipeline_map.find({params.query_id, params.fragment_id}); - if (iter != _pipeline_map.end()) { - return Status::InternalError("exec_plan_fragment input duplicated fragment_id({})", - params.fragment_id); + for (const auto& local_param : params.local_params) { + const TUniqueId& fragment_instance_id = local_param.fragment_instance_id; + auto iter = _pipeline_map.find({params.query_id, params.fragment_id}); + if (iter != _pipeline_map.end()) { + return Status::InternalError( + "exec_plan_fragment query_id({}) input duplicated fragment_id({})", + print_id(params.query_id), params.fragment_id); + } + query_ctx->fragment_instance_ids.push_back(fragment_instance_id); } - query_ctx->fragment_instance_ids.push_back(fragment_instance_id); - } - if (!params.__isset.need_wait_execution_trigger || !params.need_wait_execution_trigger) { - query_ctx->set_ready_to_execute_only(); - } - - int64 now = duration_cast( - std::chrono::system_clock::now().time_since_epoch()) - .count(); - { + int64 now = duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); g_fragment_executing_count << 1; g_fragment_last_active_time.set_value(now); - std::lock_guard lock(_lock); // TODO: simplify this mapping _pipeline_map.insert({{params.query_id, params.fragment_id}, context}); } + + if (!params.__isset.need_wait_execution_trigger || !params.need_wait_execution_trigger) { + query_ctx->set_ready_to_execute_only(); + } + 
query_ctx->set_pipeline_context(params.fragment_id, context); RETURN_IF_ERROR(context->submit()); @@ -1070,6 +1076,7 @@ void FragmentMgr::debug(std::stringstream& ss) {} */ Status FragmentMgr::exec_external_plan_fragment(const TScanOpenParams& params, const TQueryPlanInfo& t_query_plan_info, + const TUniqueId& query_id, const TUniqueId& fragment_instance_id, std::vector* selected_columns) { // set up desc tbl @@ -1110,8 +1117,9 @@ Status FragmentMgr::exec_external_plan_fragment(const TScanOpenParams& params, // assign the param used for executing of PlanFragment-self TPipelineInstanceParams fragment_exec_params; - exec_fragment_params.query_id = t_query_plan_info.query_id; + exec_fragment_params.query_id = query_id; fragment_exec_params.fragment_instance_id = fragment_instance_id; + exec_fragment_params.coord.hostname = "external"; std::map<::doris::TPlanNodeId, std::vector> per_node_scan_ranges; std::vector scan_ranges; std::vector tablet_ids = params.tablet_ids; diff --git a/be/src/runtime/fragment_mgr.h b/be/src/runtime/fragment_mgr.h index 41b63db0b23ad9..20b2fd8cdc2063 100644 --- a/be/src/runtime/fragment_mgr.h +++ b/be/src/runtime/fragment_mgr.h @@ -112,6 +112,7 @@ class FragmentMgr : public RestMonitorIface { // execute external query, all query info are packed in TScanOpenParams Status exec_external_plan_fragment(const TScanOpenParams& params, const TQueryPlanInfo& t_query_plan_info, + const TUniqueId& query_id, const TUniqueId& fragment_instance_id, std::vector* selected_columns); diff --git a/be/src/runtime/memory/cache_policy.h b/be/src/runtime/memory/cache_policy.h index c43ca0b2fb7e0a..e7e1c73e7cbb41 100644 --- a/be/src/runtime/memory/cache_policy.h +++ b/be/src/runtime/memory/cache_policy.h @@ -48,6 +48,8 @@ class CachePolicy { CLOUD_TXN_DELETE_BITMAP_CACHE = 17, NONE = 18, // not be used FOR_UT_CACHE_NUMBER = 19, + QUERY_CACHE = 20, + TABLET_COLUMN_OBJECT_POOL = 21, }; static std::string type_string(CacheType type) { @@ -90,6 +92,10 @@ class 
CachePolicy { return "CloudTxnDeleteBitmapCache"; case CacheType::FOR_UT_CACHE_NUMBER: return "ForUTCacheNumber"; + case CacheType::QUERY_CACHE: + return "QueryCache"; + case CacheType::TABLET_COLUMN_OBJECT_POOL: + return "TabletColumnObjectPool"; default: LOG(FATAL) << "not match type of cache policy :" << static_cast(type); } @@ -116,7 +122,8 @@ class CachePolicy { {"CreateTabletRRIdxCache", CacheType::CREATE_TABLET_RR_IDX_CACHE}, {"CloudTabletCache", CacheType::CLOUD_TABLET_CACHE}, {"CloudTxnDeleteBitmapCache", CacheType::CLOUD_TXN_DELETE_BITMAP_CACHE}, - {"ForUTCacheNumber", CacheType::FOR_UT_CACHE_NUMBER}}; + {"ForUTCacheNumber", CacheType::FOR_UT_CACHE_NUMBER}, + {"TabletColumnObjectPool", CacheType::TABLET_COLUMN_OBJECT_POOL}}; static CacheType string_to_type(std::string type) { if (StringToType.contains(type)) { diff --git a/be/src/runtime/query_context.cpp b/be/src/runtime/query_context.cpp index 046de58fe5fcc4..354de1426e9ee7 100644 --- a/be/src/runtime/query_context.cpp +++ b/be/src/runtime/query_context.cpp @@ -150,7 +150,7 @@ QueryContext::~QueryContext() { std::string mem_tracker_msg; if (query_mem_tracker->peak_consumption() != 0) { mem_tracker_msg = fmt::format( - ", deregister query/load memory tracker, queryId={}, Limit={}, CurrUsed={}, " + "deregister query/load memory tracker, queryId={}, Limit={}, CurrUsed={}, " "PeakUsed={}", print_id(_query_id), MemCounter::print_bytes(query_mem_tracker->limit()), MemCounter::print_bytes(query_mem_tracker->consumption()), diff --git a/be/src/runtime/record_batch_queue.cpp b/be/src/runtime/record_batch_queue.cpp index 83982688880948..25db550db3a7f1 100644 --- a/be/src/runtime/record_batch_queue.cpp +++ b/be/src/runtime/record_batch_queue.cpp @@ -23,10 +23,16 @@ namespace doris { bool RecordBatchQueue::blocking_get(std::shared_ptr* result) { - auto res = _queue.blocking_get(result); - if (_dep && size() <= 10) { + if (_dep && size() <= config::max_memory_sink_batch_count) { _dep->set_ready(); } + // Before each 
get queue, will set sink task dependency ready. + // so if the sink task put queue faster than the fetch result get queue, + // the queue size will always be 10. + // be sure to set sink dependency ready before getting queue. + // otherwise, if queue is emptied after sink task put queue and before block dependency, + // get queue will stuck and will never set sink dependency ready. + auto res = _queue.blocking_get(result); return res; } diff --git a/be/src/runtime/result_queue_mgr.cpp b/be/src/runtime/result_queue_mgr.cpp index 8090a3e6ee4787..8a6e5b1093542d 100644 --- a/be/src/runtime/result_queue_mgr.cpp +++ b/be/src/runtime/result_queue_mgr.cpp @@ -82,8 +82,10 @@ void ResultQueueMgr::create_queue(const TUniqueId& fragment_instance_id, if (iter != _fragment_queue_map.end()) { *queue = iter->second; } else { - // the blocking queue size = 20 (default), in this way, one queue have 20 * 1024 rows at most - BlockQueueSharedPtr tmp(new RecordBatchQueue(config::max_memory_sink_batch_count)); + // max_elements will not take effect, because when queue size reaches max_memory_sink_batch_count, + // MemoryScratchSink will block queue dependency, in this way, one queue have 20 * 1024 rows at most. + // use MemoryScratchSink queue dependency instead of BlockingQueue to achieve blocking. 
+ BlockQueueSharedPtr tmp(new RecordBatchQueue(config::max_memory_sink_batch_count * 2)); _fragment_queue_map.insert(std::make_pair(fragment_instance_id, tmp)); *queue = tmp; } diff --git a/be/src/runtime/runtime_state.h b/be/src/runtime/runtime_state.h index e7f2c18b09404a..b44aba5e7314de 100644 --- a/be/src/runtime/runtime_state.h +++ b/be/src/runtime/runtime_state.h @@ -174,7 +174,7 @@ class RuntimeState { _query_options.check_overflow_for_decimal; } - bool enable_decima256() const { + bool enable_decimal256() const { return _query_options.__isset.enable_decimal256 && _query_options.enable_decimal256; } diff --git a/be/src/runtime/stream_load/stream_load_context.h b/be/src/runtime/stream_load/stream_load_context.h index 9d1601372f877d..93f76fad4e613c 100644 --- a/be/src/runtime/stream_load/stream_load_context.h +++ b/be/src/runtime/stream_load/stream_load_context.h @@ -164,9 +164,10 @@ class StreamLoadContext { // the following members control the max progress of a consuming // process. if any of them reach, the consuming will finish. - int64_t max_interval_s = 5; - int64_t max_batch_rows = 100000; - int64_t max_batch_size = 100 * 1024 * 1024; // 100MB + // same as values set in fe/fe-core/src/main/java/org/apache/doris/load/routineload/RoutineLoadJob.java + int64_t max_interval_s = 60; + int64_t max_batch_rows = 20000000; + int64_t max_batch_size = 1024 * 1024 * 1024; // 1GB // for parse json-data std::string data_format = ""; diff --git a/be/src/service/backend_service.cpp b/be/src/service/backend_service.cpp index d56aa49b19b1cf..e6fdfaa87657f8 100644 --- a/be/src/service/backend_service.cpp +++ b/be/src/service/backend_service.cpp @@ -802,6 +802,11 @@ void BaseBackendService::submit_routine_load_task(TStatus& t_status, void BaseBackendService::open_scanner(TScanOpenResult& result_, const TScanOpenParams& params) { TStatus t_status; TUniqueId fragment_instance_id = generate_uuid(); + // A query_id is randomly generated to replace t_query_plan_info.query_id. 
+ // external query does not need to report anything to FE, so the query_id can be changed. + // Otherwise, multiple independent concurrent open tablet scanners have the same query_id. + // when one of the scanners ends, the other scanners will be canceled through FragmentMgr.cancel(query_id). + TUniqueId query_id = generate_uuid(); std::shared_ptr p_context; static_cast(_exec_env->external_scan_context_mgr()->create_scan_context(&p_context)); p_context->fragment_instance_id = fragment_instance_id; @@ -838,13 +843,18 @@ void BaseBackendService::open_scanner(TScanOpenResult& result_, const TScanOpenP << " deserialize error, should not be modified after returned Doris FE processed"; exec_st = Status::InvalidArgument(msg.str()); } - p_context->query_id = t_query_plan_info.query_id; + p_context->query_id = query_id; } std::vector selected_columns; if (exec_st.ok()) { // start the scan procedure + LOG(INFO) << fmt::format( + "exec external scanner, old_query_id = {}, new_query_id = {}, fragment_instance_id " + "= {}", + print_id(t_query_plan_info.query_id), print_id(query_id), + print_id(fragment_instance_id)); exec_st = _exec_env->fragment_mgr()->exec_external_plan_fragment( - params, t_query_plan_info, fragment_instance_id, &selected_columns); + params, t_query_plan_info, query_id, fragment_instance_id, &selected_columns); } exec_st.to_thrift(&t_status); //return status diff --git a/be/src/service/internal_service.cpp b/be/src/service/internal_service.cpp index 8217bd11bb9156..89b43ec5223501 100644 --- a/be/src/service/internal_service.cpp +++ b/be/src/service/internal_service.cpp @@ -886,13 +886,10 @@ void PInternalService::fetch_arrow_flight_schema(google::protobuf::RpcController Status PInternalService::_tablet_fetch_data(const PTabletKeyLookupRequest* request, PTabletKeyLookupResponse* response) { - PointQueryExecutor lookup_util; - RETURN_IF_ERROR(lookup_util.init(request, response)); - RETURN_IF_ERROR(lookup_util.lookup_up()); - if (VLOG_DEBUG_IS_ON) { - 
VLOG_DEBUG << lookup_util.print_profile(); - } - LOG_EVERY_N(INFO, 500) << lookup_util.print_profile(); + PointQueryExecutor executor; + RETURN_IF_ERROR(executor.init(request, response)); + RETURN_IF_ERROR(executor.lookup_up()); + executor.print_profile(); return Status::OK(); } @@ -1159,7 +1156,10 @@ void PInternalService::fetch_remote_tablet_schema(google::protobuf::RpcControlle LOG(WARNING) << "tablet does not exist, tablet id is " << tablet_id; continue; } - tablet_schemas.push_back(res.value()->merged_tablet_schema()); + auto schema = res.value()->merged_tablet_schema(); + if (schema != nullptr) { + tablet_schemas.push_back(schema); + } } if (!tablet_schemas.empty()) { // merge all diff --git a/be/src/service/point_query_executor.cpp b/be/src/service/point_query_executor.cpp index 9719a672b8dff4..2550a53a6ad03d 100644 --- a/be/src/service/point_query_executor.cpp +++ b/be/src/service/point_query_executor.cpp @@ -22,6 +22,7 @@ #include #include #include +#include #include #include @@ -39,6 +40,7 @@ #include "olap/olap_tuple.h" #include "olap/row_cursor.h" #include "olap/rowset/beta_rowset.h" +#include "olap/rowset/rowset_fwd.h" #include "olap/storage_engine.h" #include "olap/tablet_manager.h" #include "olap/tablet_schema.h" @@ -313,34 +315,48 @@ Status PointQueryExecutor::lookup_up() { return Status::OK(); } -std::string PointQueryExecutor::print_profile() { +void PointQueryExecutor::print_profile() { auto init_us = _profile_metrics.init_ns.value() / 1000; auto init_key_us = _profile_metrics.init_key_ns.value() / 1000; auto lookup_key_us = _profile_metrics.lookup_key_ns.value() / 1000; auto lookup_data_us = _profile_metrics.lookup_data_ns.value() / 1000; auto output_data_us = _profile_metrics.output_data_ns.value() / 1000; + auto load_segments_key_us = _profile_metrics.load_segment_key_stage_ns.value() / 1000; + auto load_segments_data_us = _profile_metrics.load_segment_data_stage_ns.value() / 1000; auto total_us = init_us + lookup_key_us + lookup_data_us + 
output_data_us; auto read_stats = _profile_metrics.read_stats; - return fmt::format( - "" + const std::string stats_str = fmt::format( "[lookup profile:{}us] init:{}us, init_key:{}us," - "" - "" - "lookup_key:{}us, lookup_data:{}us, output_data:{}us, hit_lookup_cache:{}" - "" - "" + " lookup_key:{}us, load_segments_key:{}us, lookup_data:{}us, load_segments_data:{}us," + " output_data:{}us, " + "hit_lookup_cache:{}" ", is_binary_row:{}, output_columns:{}, total_keys:{}, row_cache_hits:{}" ", hit_cached_pages:{}, total_pages_read:{}, compressed_bytes_read:{}, " "io_latency:{}ns, " "uncompressed_bytes_read:{}, result_data_bytes:{}, row_hits:{}" - ", rs_column_uid:{}" - "", - total_us, init_us, init_key_us, lookup_key_us, lookup_data_us, output_data_us, - _profile_metrics.hit_lookup_cache, _binary_row_format, _reusable->output_exprs().size(), - _row_read_ctxs.size(), _profile_metrics.row_cache_hits, read_stats.cached_pages_num, + ", rs_column_uid:{}, bytes_read_from_local:{}, bytes_read_from_remote:{}, " + "local_io_timer:{}, remote_io_timer:{}, local_write_timer:{}", + total_us, init_us, init_key_us, lookup_key_us, load_segments_key_us, lookup_data_us, + load_segments_data_us, output_data_us, _profile_metrics.hit_lookup_cache, + _binary_row_format, _reusable->output_exprs().size(), _row_read_ctxs.size(), + _profile_metrics.row_cache_hits, read_stats.cached_pages_num, read_stats.total_pages_num, read_stats.compressed_bytes_read, read_stats.io_ns, read_stats.uncompressed_bytes_read, _profile_metrics.result_data_bytes, _row_hits, - _reusable->rs_column_uid()); + _reusable->rs_column_uid(), + _profile_metrics.read_stats.file_cache_stats.bytes_read_from_local, + _profile_metrics.read_stats.file_cache_stats.bytes_read_from_remote, + _profile_metrics.read_stats.file_cache_stats.local_io_timer, + _profile_metrics.read_stats.file_cache_stats.remote_io_timer, + _profile_metrics.read_stats.file_cache_stats.write_cache_io_timer); + + constexpr static int kSlowThreholdUs = 50 * 
1000; // 50ms + if (total_us > kSlowThreholdUs) { + LOG(WARNING) << "slow query, " << stats_str; + } else if (VLOG_DEBUG_IS_ON) { + VLOG_DEBUG << stats_str; + } else { + LOG_EVERY_N(INFO, 1000) << stats_str; + } } Status PointQueryExecutor::_init_keys(const PTabletKeyLookupRequest* request) { @@ -380,6 +396,17 @@ Status PointQueryExecutor::_lookup_row_key() { specified_rowsets = _tablet->get_rowset_by_ids(nullptr); } std::vector> segment_caches(specified_rowsets.size()); + // init segment_cache + { + SCOPED_TIMER(&_profile_metrics.load_segment_key_stage_ns); + for (size_t i = 0; i < specified_rowsets.size(); i++) { + auto& rs = specified_rowsets[i]; + segment_caches[i] = std::make_unique(); + RETURN_IF_ERROR(SegmentLoader::instance()->load_segments( + std::static_pointer_cast(rs), segment_caches[i].get(), true, true, + &_profile_metrics.read_stats)); + } + } for (size_t i = 0; i < _row_read_ctxs.size(); ++i) { RowLocation location; if (!config::disable_storage_row_cache) { @@ -396,7 +423,8 @@ Status PointQueryExecutor::_lookup_row_key() { auto rowset_ptr = std::make_unique(); st = (_tablet->lookup_row_key(_row_read_ctxs[i]._primary_key, nullptr, false, specified_rowsets, &location, INT32_MAX /*rethink?*/, - segment_caches, rowset_ptr.get(), false)); + segment_caches, rowset_ptr.get(), false, + &_profile_metrics.read_stats)); if (st.is()) { continue; } @@ -459,7 +487,11 @@ Status PointQueryExecutor::_lookup_row_data() { BetaRowsetSharedPtr rowset = std::static_pointer_cast(_tablet->get_rowset(row_loc.rowset_id)); SegmentCacheHandle segment_cache; - RETURN_IF_ERROR(SegmentLoader::instance()->load_segments(rowset, &segment_cache, true)); + { + SCOPED_TIMER(&_profile_metrics.load_segment_data_stage_ns); + RETURN_IF_ERROR( + SegmentLoader::instance()->load_segments(rowset, &segment_cache, true)); + } // find segment auto it = std::find_if(segment_cache.get_segments().cbegin(), segment_cache.get_segments().cend(), diff --git a/be/src/service/point_query_executor.h 
b/be/src/service/point_query_executor.h index b22dc5bfd1d73f..89f4ecff9b137a 100644 --- a/be/src/service/point_query_executor.h +++ b/be/src/service/point_query_executor.h @@ -276,12 +276,16 @@ struct Metrics { init_key_ns(TUnit::TIME_NS), lookup_key_ns(TUnit::TIME_NS), lookup_data_ns(TUnit::TIME_NS), - output_data_ns(TUnit::TIME_NS) {} + output_data_ns(TUnit::TIME_NS), + load_segment_key_stage_ns(TUnit::TIME_NS), + load_segment_data_stage_ns(TUnit::TIME_NS) {} RuntimeProfile::Counter init_ns; RuntimeProfile::Counter init_key_ns; RuntimeProfile::Counter lookup_key_ns; RuntimeProfile::Counter lookup_data_ns; RuntimeProfile::Counter output_data_ns; + RuntimeProfile::Counter load_segment_key_stage_ns; + RuntimeProfile::Counter load_segment_data_stage_ns; OlapReaderStatistics read_stats; size_t row_cache_hits = 0; bool hit_lookup_cache = false; @@ -297,7 +301,9 @@ class PointQueryExecutor { Status lookup_up(); - std::string print_profile(); + void print_profile(); + + const OlapReaderStatistics& read_stats() const { return _read_stats; } private: Status _init_keys(const PTabletKeyLookupRequest* request); diff --git a/be/src/util/arrow/row_batch.cpp b/be/src/util/arrow/row_batch.cpp index 728057667e8505..084765e5aaa9c0 100644 --- a/be/src/util/arrow/row_batch.cpp +++ b/be/src/util/arrow/row_batch.cpp @@ -163,8 +163,7 @@ Status convert_to_arrow_field(SlotDescriptor* desc, std::shared_ptr* result) { +Status get_arrow_schema(const vectorized::Block& block, std::shared_ptr* result) { std::vector> fields; for (const auto& type_and_name : block) { std::shared_ptr arrow_type; @@ -177,20 +176,6 @@ Status convert_block_arrow_schema(const vectorized::Block& block, return Status::OK(); } -Status convert_to_arrow_schema(const RowDescriptor& row_desc, - std::shared_ptr* result) { - std::vector> fields; - for (auto tuple_desc : row_desc.tuple_descriptors()) { - for (auto desc : tuple_desc->slots()) { - std::shared_ptr field; - RETURN_IF_ERROR(convert_to_arrow_field(desc, &field)); - 
fields.push_back(field); - } - } - *result = arrow::schema(std::move(fields)); - return Status::OK(); -} - Status convert_expr_ctxs_arrow_schema(const vectorized::VExprContextSPtrs& output_vexpr_ctxs, std::shared_ptr* result) { std::vector> fields; diff --git a/be/src/util/arrow/row_batch.h b/be/src/util/arrow/row_batch.h index ddffc3324d3451..5dd76ff66d7ff8 100644 --- a/be/src/util/arrow/row_batch.h +++ b/be/src/util/arrow/row_batch.h @@ -43,12 +43,7 @@ class RowDescriptor; Status convert_to_arrow_type(const TypeDescriptor& type, std::shared_ptr* result); -// Convert Doris RowDescriptor to Arrow Schema. -Status convert_to_arrow_schema(const RowDescriptor& row_desc, - std::shared_ptr* result); - -Status convert_block_arrow_schema(const vectorized::Block& block, - std::shared_ptr* result); +Status get_arrow_schema(const vectorized::Block& block, std::shared_ptr* result); Status convert_expr_ctxs_arrow_schema(const vectorized::VExprContextSPtrs& output_vexpr_ctxs, std::shared_ptr* result); diff --git a/be/src/util/block_compression.cpp b/be/src/util/block_compression.cpp index d13c0c091b9ced..d1788b0948a6f2 100644 --- a/be/src/util/block_compression.cpp +++ b/be/src/util/block_compression.cpp @@ -45,6 +45,7 @@ #include #include #include +#include #include #include "common/config.h" diff --git a/be/src/util/doris_metrics.cpp b/be/src/util/doris_metrics.cpp index 3fe6b92c923f92..e9d4f31e5ca137 100644 --- a/be/src/util/doris_metrics.cpp +++ b/be/src/util/doris_metrics.cpp @@ -91,13 +91,13 @@ DEFINE_COUNTER_METRIC_PROTOTYPE_5ARG(base_compaction_deltas_total, MetricUnit::R DEFINE_COUNTER_METRIC_PROTOTYPE_5ARG(cumulative_compaction_deltas_total, MetricUnit::ROWSETS, "", compaction_deltas_total, Labels({{"type", "cumulative"}})); DEFINE_COUNTER_METRIC_PROTOTYPE_5ARG(full_compaction_deltas_total, MetricUnit::ROWSETS, "", - compaction_deltas_total, Labels({{"type", "base"}})); + compaction_deltas_total, Labels({{"type", "full"}})); 
DEFINE_COUNTER_METRIC_PROTOTYPE_5ARG(base_compaction_bytes_total, MetricUnit::BYTES, "", compaction_bytes_total, Labels({{"type", "base"}})); DEFINE_COUNTER_METRIC_PROTOTYPE_5ARG(cumulative_compaction_bytes_total, MetricUnit::BYTES, "", compaction_bytes_total, Labels({{"type", "cumulative"}})); DEFINE_COUNTER_METRIC_PROTOTYPE_5ARG(full_compaction_bytes_total, MetricUnit::BYTES, "", - compaction_bytes_total, Labels({{"type", "base"}})); + compaction_bytes_total, Labels({{"type", "full"}})); DEFINE_COUNTER_METRIC_PROTOTYPE_5ARG(segment_read_total, MetricUnit::OPERATIONS, "(segment_v2) total number of segments read", segment_read, diff --git a/be/src/util/jni-util.cpp b/be/src/util/jni-util.cpp index 02d20ed9a4fe80..6ad0790ef0859e 100644 --- a/be/src/util/jni-util.cpp +++ b/be/src/util/jni-util.cpp @@ -317,6 +317,7 @@ Status JniUtil::GetJniExceptionMsg(JNIEnv* env, bool log_stack, const string& pr } jobject JniUtil::convert_to_java_map(JNIEnv* env, const std::map& map) { + //TODO: ADD EXCEPTION CHECK. jclass hashmap_class = env->FindClass("java/util/HashMap"); jmethodID hashmap_constructor = env->GetMethodID(hashmap_class, "", "(I)V"); jobject hashmap_object = env->NewObject(hashmap_class, hashmap_constructor, map.size()); @@ -399,16 +400,26 @@ std::map JniUtil::convert_to_cpp_map(JNIEnv* env, jobj Status JniUtil::GetGlobalClassRef(JNIEnv* env, const char* class_str, jclass* class_ref) { *class_ref = NULL; - jclass local_cl = env->FindClass(class_str); - RETURN_ERROR_IF_EXC(env); + JNI_CALL_METHOD_CHECK_EXCEPTION_DELETE_REF(jclass, local_cl, env, FindClass(class_str)); RETURN_IF_ERROR(LocalToGlobalRef(env, local_cl, reinterpret_cast(class_ref))); - env->DeleteLocalRef(local_cl); - RETURN_ERROR_IF_EXC(env); return Status::OK(); } Status JniUtil::LocalToGlobalRef(JNIEnv* env, jobject local_ref, jobject* global_ref) { *global_ref = env->NewGlobalRef(local_ref); + // NewGlobalRef: + // Returns a global reference to the given obj. 
+ // + //May return NULL if: + // obj refers to null + // the system has run out of memory + // obj was a weak global reference and has already been garbage collected + if (*global_ref == NULL) { + return Status::InternalError( + "LocalToGlobalRef fail,global ref is NULL,maybe the system has run out of memory."); + } + + //NewGlobalRef not throw exception,maybe we just need check NULL. RETURN_ERROR_IF_EXC(env); return Status::OK(); } diff --git a/be/src/util/jni-util.h b/be/src/util/jni-util.h index 666a5e526dfbda..df332951afebb8 100644 --- a/be/src/util/jni-util.h +++ b/be/src/util/jni-util.h @@ -28,6 +28,7 @@ #include "common/status.h" #include "jni_md.h" +#include "util/defer_op.h" #include "util/thrift_util.h" #ifdef USE_HADOOP_HDFS @@ -38,12 +39,25 @@ extern "C" JNIEnv* getJNIEnv(void); namespace doris { class JniUtil; -#define RETURN_ERROR_IF_EXC(env) \ - do { \ - jthrowable exc = (env)->ExceptionOccurred(); \ - if (exc != nullptr) return JniUtil::GetJniExceptionMsg(env); \ +#define RETURN_ERROR_IF_EXC(env) \ + do { \ + if (env->ExceptionCheck()) [[unlikely]] \ + return JniUtil::GetJniExceptionMsg(env); \ } while (false) +#define JNI_CALL_METHOD_CHECK_EXCEPTION_DELETE_REF(type, result, env, func) \ + type result = env->func; \ + DEFER(env->DeleteLocalRef(result)); \ + RETURN_ERROR_IF_EXC(env) + +#define JNI_CALL_METHOD_CHECK_EXCEPTION(type, result, env, func) \ + type result = env->func; \ + RETURN_ERROR_IF_EXC(env) + +//In order to reduce the potential risks caused by not handling exceptions, +// you need to refer to https://docs.oracle.com/javase/8/docs/technotes/guides/jni/spec/functions.html +// to confirm whether the jni method will throw an exception. + class JniUtil { public: static Status Init() WARN_UNUSED_RESULT; @@ -65,6 +79,10 @@ class JniUtil { return Status::OK(); } + //jclass is generally a local reference. + //Method ID and field ID values are forever. 
+ //If you want to use the jclass across multiple threads or multiple calls into the JNI code you need + // to create a global reference to it with GetGlobalClassRef(). static Status GetGlobalClassRef(JNIEnv* env, const char* class_str, jclass* class_ref) WARN_UNUSED_RESULT; diff --git a/be/src/util/jvm_metrics.cpp b/be/src/util/jvm_metrics.cpp index fc30d1073acdc6..4cb71f5e827878 100644 --- a/be/src/util/jvm_metrics.cpp +++ b/be/src/util/jvm_metrics.cpp @@ -22,7 +22,9 @@ #include #include "common/config.h" +#include "util/defer_op.h" #include "util/metrics.h" + namespace doris { #define DEFINE_JVM_SIZE_BYTES_METRIC(name, type) \ @@ -90,9 +92,13 @@ JvmMetrics::JvmMetrics(MetricRegistry* registry, JNIEnv* env) { break; } try { - _jvm_stats.init(env); + Status st = _jvm_stats.init(env); + if (!st) { + LOG(WARNING) << "jvm Stats Init Fail. " << st.to_string(); + break; + } } catch (...) { - LOG(WARNING) << "JVM STATS INIT FAIL"; + LOG(WARNING) << "jvm Stats Throw Exception Init Fail."; break; } if (!_jvm_stats.init_complete()) { @@ -133,21 +139,22 @@ JvmMetrics::JvmMetrics(MetricRegistry* registry, JNIEnv* env) { void JvmMetrics::update() { static long fail_count = 0; - bool have_exception = false; try { - _jvm_stats.refresh(this); + Status st = _jvm_stats.refresh(this); + if (!st) { + fail_count++; + LOG(WARNING) << "Jvm Stats update Fail! " << st.to_string(); + } else { + fail_count = 0; + } } catch (...) { - have_exception = true; - LOG(WARNING) << "JVM MONITOR UPDATE FAIL!"; + LOG(WARNING) << "Jvm Stats update throw Exception!"; fail_count++; } //When 30 consecutive exceptions occur, turn off jvm information collection. 
- if (!have_exception) { - fail_count = 0; - } if (fail_count >= 30) { - LOG(WARNING) << "JVM MONITOR CLOSE!"; + LOG(WARNING) << "Jvm Stats CLOSE!"; _jvm_stats.set_complete(false); _server_entity->deregister_hook(_s_hook_name); @@ -182,193 +189,257 @@ void JvmMetrics::update() { } } -void JvmStats::init(JNIEnv* ENV) { - env = ENV; - _managementFactoryClass = env->FindClass("java/lang/management/ManagementFactory"); - if (_managementFactoryClass == nullptr) { - LOG(WARNING) - << "Class java/lang/management/ManagementFactory Not Find.JVM monitoring fails."; - return; - } +Status JvmStats::init(JNIEnv* env) { + RETURN_IF_ERROR(JniUtil::GetGlobalClassRef(env, "java/lang/management/ManagementFactory", + &_managementFactoryClass)); - _getMemoryMXBeanMethod = env->GetStaticMethodID(_managementFactoryClass, "getMemoryMXBean", - "()Ljava/lang/management/MemoryMXBean;"); + JNI_CALL_METHOD_CHECK_EXCEPTION(, _getMemoryMXBeanMethod, env, + GetStaticMethodID(_managementFactoryClass, "getMemoryMXBean", + "()Ljava/lang/management/MemoryMXBean;")); - _memoryUsageClass = env->FindClass("java/lang/management/MemoryUsage"); - if (_memoryUsageClass == nullptr) { - LOG(WARNING) << "Class java/lang/management/MemoryUsage Not Find.JVM monitoring fails."; - return; - } - _getMemoryUsageUsedMethod = env->GetMethodID(_memoryUsageClass, "getUsed", "()J"); - _getMemoryUsageCommittedMethod = env->GetMethodID(_memoryUsageClass, "getCommitted", "()J"); - _getMemoryUsageMaxMethod = env->GetMethodID(_memoryUsageClass, "getMax", "()J"); + RETURN_IF_ERROR(JniUtil::GetGlobalClassRef(env, "java/lang/management/MemoryUsage", + &_memoryUsageClass)); - _memoryMXBeanClass = env->FindClass("java/lang/management/MemoryMXBean"); - if (_memoryMXBeanClass == nullptr) { - LOG(WARNING) << "Class java/lang/management/MemoryMXBean Not Find.JVM monitoring fails."; - return; - } - _getHeapMemoryUsageMethod = env->GetMethodID(_memoryMXBeanClass, "getHeapMemoryUsage", - "()Ljava/lang/management/MemoryUsage;"); - 
_getNonHeapMemoryUsageMethod = env->GetMethodID(_memoryMXBeanClass, "getNonHeapMemoryUsage", - "()Ljava/lang/management/MemoryUsage;"); + JNI_CALL_METHOD_CHECK_EXCEPTION(, _getMemoryUsageUsedMethod, env, + GetMethodID(_memoryUsageClass, "getUsed", "()J")); - _getMemoryPoolMXBeansMethod = env->GetStaticMethodID( - _managementFactoryClass, "getMemoryPoolMXBeans", "()Ljava/util/List;"); + JNI_CALL_METHOD_CHECK_EXCEPTION(, _getMemoryUsageCommittedMethod, env, + GetMethodID(_memoryUsageClass, "getCommitted", "()J")); - _listClass = env->FindClass("java/util/List"); - if (_listClass == nullptr) { - LOG(WARNING) << "Class java/util/List Not Find.JVM monitoring fails."; - return; - } - _getListSizeMethod = env->GetMethodID(_listClass, "size", "()I"); - _getListUseIndexMethod = env->GetMethodID(_listClass, "get", "(I)Ljava/lang/Object;"); + JNI_CALL_METHOD_CHECK_EXCEPTION(, _getMemoryUsageMaxMethod, env, + GetMethodID(_memoryUsageClass, "getMax", "()J")); - _memoryPoolMXBeanClass = env->FindClass("java/lang/management/MemoryPoolMXBean"); - if (_memoryPoolMXBeanClass == nullptr) { - LOG(WARNING) - << "Class java/lang/management/MemoryPoolMXBean Not Find.JVM monitoring fails."; - return; - } - _getMemoryPoolMXBeanUsageMethod = env->GetMethodID(_memoryPoolMXBeanClass, "getUsage", - "()Ljava/lang/management/MemoryUsage;"); - _getMemoryPollMXBeanPeakMethod = env->GetMethodID(_memoryPoolMXBeanClass, "getPeakUsage", - "()Ljava/lang/management/MemoryUsage;"); - _getMemoryPollMXBeanNameMethod = - env->GetMethodID(_memoryPoolMXBeanClass, "getName", "()Ljava/lang/String;"); - - _getThreadMXBeanMethod = env->GetStaticMethodID(_managementFactoryClass, "getThreadMXBean", - "()Ljava/lang/management/ThreadMXBean;"); - - _getGarbageCollectorMXBeansMethod = env->GetStaticMethodID( - _managementFactoryClass, "getGarbageCollectorMXBeans", "()Ljava/util/List;"); - - _garbageCollectorMXBeanClass = env->FindClass("java/lang/management/GarbageCollectorMXBean"); - if (_garbageCollectorMXBeanClass 
== nullptr) { - LOG(WARNING) << "Class java/lang/management/GarbageCollectorMXBean Not Find.JVM monitoring " - "fails."; - return; - } - _getGCNameMethod = - env->GetMethodID(_garbageCollectorMXBeanClass, "getName", "()Ljava/lang/String;"); - _getGCCollectionCountMethod = - env->GetMethodID(_garbageCollectorMXBeanClass, "getCollectionCount", "()J"); - _getGCCollectionTimeMethod = - env->GetMethodID(_garbageCollectorMXBeanClass, "getCollectionTime", "()J"); - - _threadMXBeanClass = env->FindClass("java/lang/management/ThreadMXBean"); - if (_threadMXBeanClass == nullptr) { - LOG(WARNING) << "Class java/lang/management/ThreadMXBean Not Find.JVM monitoring fails."; - return; - } - _getAllThreadIdsMethod = env->GetMethodID(_threadMXBeanClass, "getAllThreadIds", "()[J"); - _getThreadInfoMethod = env->GetMethodID(_threadMXBeanClass, "getThreadInfo", - "([JI)[Ljava/lang/management/ThreadInfo;"); - _getPeakThreadCountMethod = env->GetMethodID(_threadMXBeanClass, "getPeakThreadCount", "()I"); - - _threadInfoClass = env->FindClass("java/lang/management/ThreadInfo"); - if (_threadInfoClass == nullptr) { - LOG(WARNING) << "Class java/lang/management/ThreadInfo Not Find.JVM monitoring fails."; - return; - } + RETURN_IF_ERROR(JniUtil::GetGlobalClassRef(env, "java/lang/management/MemoryMXBean", + &_memoryMXBeanClass)); - _getThreadStateMethod = - env->GetMethodID(_threadInfoClass, "getThreadState", "()Ljava/lang/Thread$State;"); + JNI_CALL_METHOD_CHECK_EXCEPTION(, _getHeapMemoryUsageMethod, env, + GetMethodID(_memoryMXBeanClass, "getHeapMemoryUsage", + "()Ljava/lang/management/MemoryUsage;")); + JNI_CALL_METHOD_CHECK_EXCEPTION(, _getNonHeapMemoryUsageMethod, env, + GetMethodID(_memoryMXBeanClass, "getNonHeapMemoryUsage", + "()Ljava/lang/management/MemoryUsage;")); - _threadStateClass = env->FindClass("java/lang/Thread$State"); - if (_threadStateClass == nullptr) { - LOG(WARNING) << "Class java/lang/Thread$State Not Find.JVM monitoring fails."; - return; - } + 
JNI_CALL_METHOD_CHECK_EXCEPTION( + , _getMemoryPoolMXBeansMethod, env, + GetStaticMethodID(_managementFactoryClass, "getMemoryPoolMXBeans", + "()Ljava/util/List;")); - jfieldID newThreadFieldID = - env->GetStaticFieldID(_threadStateClass, "NEW", "Ljava/lang/Thread$State;"); - jfieldID runnableThreadFieldID = - env->GetStaticFieldID(_threadStateClass, "RUNNABLE", "Ljava/lang/Thread$State;"); - jfieldID blockedThreadFieldID = - env->GetStaticFieldID(_threadStateClass, "BLOCKED", "Ljava/lang/Thread$State;"); - jfieldID waitingThreadFieldID = - env->GetStaticFieldID(_threadStateClass, "WAITING", "Ljava/lang/Thread$State;"); - jfieldID timedWaitingThreadFieldID = - env->GetStaticFieldID(_threadStateClass, "TIMED_WAITING", "Ljava/lang/Thread$State;"); - jfieldID terminatedThreadFieldID = - env->GetStaticFieldID(_threadStateClass, "TERMINATED", "Ljava/lang/Thread$State;"); - - _newThreadStateObj = env->GetStaticObjectField(_threadStateClass, newThreadFieldID); - _runnableThreadStateObj = env->GetStaticObjectField(_threadStateClass, runnableThreadFieldID); - _blockedThreadStateObj = env->GetStaticObjectField(_threadStateClass, blockedThreadFieldID); - _waitingThreadStateObj = env->GetStaticObjectField(_threadStateClass, waitingThreadFieldID); - _timedWaitingThreadStateObj = - env->GetStaticObjectField(_threadStateClass, timedWaitingThreadFieldID); - _terminatedThreadStateObj = - env->GetStaticObjectField(_threadStateClass, terminatedThreadFieldID); + RETURN_IF_ERROR(JniUtil::GetGlobalClassRef(env, "java/util/List", &_listClass)); - LOG(INFO) << "Start JVM monitoring."; + JNI_CALL_METHOD_CHECK_EXCEPTION(, _getListSizeMethod, env, + GetMethodID(_listClass, "size", "()I")); + + JNI_CALL_METHOD_CHECK_EXCEPTION(, _getListUseIndexMethod, env, + GetMethodID(_listClass, "get", "(I)Ljava/lang/Object;")); + + RETURN_IF_ERROR(JniUtil::GetGlobalClassRef(env, "java/lang/management/MemoryPoolMXBean", + &_memoryPoolMXBeanClass)); + + JNI_CALL_METHOD_CHECK_EXCEPTION(, 
_getMemoryPoolMXBeanUsageMethod, env, + GetMethodID(_memoryPoolMXBeanClass, "getUsage", + "()Ljava/lang/management/MemoryUsage;")); + + JNI_CALL_METHOD_CHECK_EXCEPTION(, _getMemoryPollMXBeanPeakMethod, env, + GetMethodID(_memoryPoolMXBeanClass, "getPeakUsage", + "()Ljava/lang/management/MemoryUsage;")); + JNI_CALL_METHOD_CHECK_EXCEPTION( + , _getMemoryPollMXBeanNameMethod, env, + GetMethodID(_memoryPoolMXBeanClass, "getName", "()Ljava/lang/String;")); + + JNI_CALL_METHOD_CHECK_EXCEPTION(, _getThreadMXBeanMethod, env, + GetStaticMethodID(_managementFactoryClass, "getThreadMXBean", + "()Ljava/lang/management/ThreadMXBean;")); + + JNI_CALL_METHOD_CHECK_EXCEPTION( + , _getGarbageCollectorMXBeansMethod, env, + GetStaticMethodID(_managementFactoryClass, "getGarbageCollectorMXBeans", + "()Ljava/util/List;")); + + RETURN_IF_ERROR(JniUtil::GetGlobalClassRef(env, "java/lang/management/GarbageCollectorMXBean", + &_garbageCollectorMXBeanClass)); + + JNI_CALL_METHOD_CHECK_EXCEPTION( + , _getGCNameMethod, env, + GetMethodID(_garbageCollectorMXBeanClass, "getName", "()Ljava/lang/String;")); + + JNI_CALL_METHOD_CHECK_EXCEPTION( + , _getGCCollectionCountMethod, env, + GetMethodID(_garbageCollectorMXBeanClass, "getCollectionCount", "()J")); + + JNI_CALL_METHOD_CHECK_EXCEPTION( + , _getGCCollectionTimeMethod, env, + GetMethodID(_garbageCollectorMXBeanClass, "getCollectionTime", "()J")); + + RETURN_IF_ERROR(JniUtil::GetGlobalClassRef(env, "java/lang/management/ThreadMXBean", + &_threadMXBeanClass)); + + JNI_CALL_METHOD_CHECK_EXCEPTION(, + + _getAllThreadIdsMethod, env, + GetMethodID(_threadMXBeanClass, "getAllThreadIds", "()[J")); + + JNI_CALL_METHOD_CHECK_EXCEPTION(, + + _getThreadInfoMethod, env, + GetMethodID(_threadMXBeanClass, "getThreadInfo", + "([JI)[Ljava/lang/management/ThreadInfo;")); + + JNI_CALL_METHOD_CHECK_EXCEPTION(, + + _getPeakThreadCountMethod, env, + GetMethodID(_threadMXBeanClass, "getPeakThreadCount", "()I")); + + RETURN_IF_ERROR( + JniUtil::GetGlobalClassRef(env, 
"java/lang/management/ThreadInfo", &_threadInfoClass)); + + JNI_CALL_METHOD_CHECK_EXCEPTION( + , + + _getThreadStateMethod, env, + GetMethodID(_threadInfoClass, "getThreadState", "()Ljava/lang/Thread$State;")); + + RETURN_IF_ERROR(JniUtil::GetGlobalClassRef(env, "java/lang/Thread$State", &_threadStateClass)); + + JNI_CALL_METHOD_CHECK_EXCEPTION( + jfieldID, newThreadFieldID, env, + GetStaticFieldID(_threadStateClass, "NEW", "Ljava/lang/Thread$State;")); + + JNI_CALL_METHOD_CHECK_EXCEPTION( + jfieldID, runnableThreadFieldID, env, + GetStaticFieldID(_threadStateClass, "RUNNABLE", "Ljava/lang/Thread$State;")); + + JNI_CALL_METHOD_CHECK_EXCEPTION( + jfieldID, blockedThreadFieldID, env, + GetStaticFieldID(_threadStateClass, "BLOCKED", "Ljava/lang/Thread$State;")); + JNI_CALL_METHOD_CHECK_EXCEPTION( + jfieldID, waitingThreadFieldID, env, + GetStaticFieldID(_threadStateClass, "WAITING", "Ljava/lang/Thread$State;")); + + JNI_CALL_METHOD_CHECK_EXCEPTION( + jfieldID, timedWaitingThreadFieldID, env, + GetStaticFieldID(_threadStateClass, "TIMED_WAITING", "Ljava/lang/Thread$State;")); + JNI_CALL_METHOD_CHECK_EXCEPTION( + jfieldID, terminatedThreadFieldID, env, + GetStaticFieldID(_threadStateClass, "TERMINATED", "Ljava/lang/Thread$State;")); + + JNI_CALL_METHOD_CHECK_EXCEPTION(jobject, newThreadStateObj, env, + GetStaticObjectField(_threadStateClass, newThreadFieldID)); + RETURN_IF_ERROR(JniUtil::LocalToGlobalRef(env, newThreadStateObj, &_newThreadStateObj)); + + JNI_CALL_METHOD_CHECK_EXCEPTION(jobject, runnableThreadStateObj, env, + GetStaticObjectField(_threadStateClass, runnableThreadFieldID)); + RETURN_IF_ERROR( + JniUtil::LocalToGlobalRef(env, runnableThreadStateObj, &_runnableThreadStateObj)); + + JNI_CALL_METHOD_CHECK_EXCEPTION(jobject, blockedThreadStateObj, env, + GetStaticObjectField(_threadStateClass, blockedThreadFieldID)); + RETURN_IF_ERROR(JniUtil::LocalToGlobalRef(env, blockedThreadStateObj, &_blockedThreadStateObj)); + + JNI_CALL_METHOD_CHECK_EXCEPTION(jobject, 
waitingThreadStateObj, env, + GetStaticObjectField(_threadStateClass, waitingThreadFieldID)); + RETURN_IF_ERROR(JniUtil::LocalToGlobalRef(env, waitingThreadStateObj, &_waitingThreadStateObj)); + + JNI_CALL_METHOD_CHECK_EXCEPTION( + jobject, timedWaitingThreadStateObj, env, + GetStaticObjectField(_threadStateClass, timedWaitingThreadFieldID)); + RETURN_IF_ERROR(JniUtil::LocalToGlobalRef(env, timedWaitingThreadStateObj, + &_timedWaitingThreadStateObj)); + + JNI_CALL_METHOD_CHECK_EXCEPTION( + jobject, terminatedThreadStateObj, env, + GetStaticObjectField(_threadStateClass, terminatedThreadFieldID)); + RETURN_IF_ERROR( + JniUtil::LocalToGlobalRef(env, terminatedThreadStateObj, &_terminatedThreadStateObj)); _init_complete = true; - return; + + LOG(INFO) << "Start JVM monitoring."; + return Status::OK(); } -void JvmStats::refresh(JvmMetrics* jvm_metrics) { +Status JvmStats::refresh(JvmMetrics* jvm_metrics) const { if (!_init_complete) { - return; + return Status::InternalError("Jvm Stats not init complete."); } - Status st = JniUtil::GetJNIEnv(&env); - if (!st.ok()) { - LOG(WARNING) << "JVM STATS GET JNI ENV FAIL"; - return; - } + JNIEnv* env = nullptr; + RETURN_IF_ERROR(JniUtil::GetJNIEnv(&env)); + + JNI_CALL_METHOD_CHECK_EXCEPTION_DELETE_REF( + jobject, memoryMXBeanObj, env, + CallStaticObjectMethod(_managementFactoryClass, _getMemoryMXBeanMethod)); + + JNI_CALL_METHOD_CHECK_EXCEPTION_DELETE_REF( + jobject, heapMemoryUsageObj, env, + CallObjectMethod(memoryMXBeanObj, _getHeapMemoryUsageMethod)); - jobject memoryMXBeanObj = - env->CallStaticObjectMethod(_managementFactoryClass, _getMemoryMXBeanMethod); + JNI_CALL_METHOD_CHECK_EXCEPTION(jlong, heapMemoryUsed, env, + CallLongMethod(heapMemoryUsageObj, _getMemoryUsageUsedMethod)); - jobject heapMemoryUsageObj = env->CallObjectMethod(memoryMXBeanObj, _getHeapMemoryUsageMethod); + JNI_CALL_METHOD_CHECK_EXCEPTION( + jlong, heapMemoryCommitted, env, + CallLongMethod(heapMemoryUsageObj, _getMemoryUsageCommittedMethod)); - jlong 
heapMemoryUsed = env->CallLongMethod(heapMemoryUsageObj, _getMemoryUsageUsedMethod); - jlong heapMemoryCommitted = - env->CallLongMethod(heapMemoryUsageObj, _getMemoryUsageCommittedMethod); - jlong heapMemoryMax = env->CallLongMethod(heapMemoryUsageObj, _getMemoryUsageMaxMethod); + JNI_CALL_METHOD_CHECK_EXCEPTION(jlong, heapMemoryMax, env, + CallLongMethod(heapMemoryUsageObj, _getMemoryUsageMaxMethod)); jvm_metrics->jvm_heap_size_bytes_used->set_value(heapMemoryUsed < 0 ? 0 : heapMemoryUsed); jvm_metrics->jvm_heap_size_bytes_committed->set_value( heapMemoryCommitted < 0 ? 0 : heapMemoryCommitted); jvm_metrics->jvm_heap_size_bytes_max->set_value(heapMemoryMax < 0 ? 0 : heapMemoryMax); - jobject nonHeapMemoryUsageObj = - env->CallObjectMethod(memoryMXBeanObj, _getNonHeapMemoryUsageMethod); + JNI_CALL_METHOD_CHECK_EXCEPTION_DELETE_REF( + jobject, nonHeapMemoryUsageObj, env, + CallObjectMethod(memoryMXBeanObj, _getNonHeapMemoryUsageMethod)); - jlong nonHeapMemoryCommitted = - env->CallLongMethod(nonHeapMemoryUsageObj, _getMemoryUsageCommittedMethod); - jlong nonHeapMemoryUsed = env->CallLongMethod(nonHeapMemoryUsageObj, _getMemoryUsageUsedMethod); + JNI_CALL_METHOD_CHECK_EXCEPTION( + jlong, nonHeapMemoryCommitted, env, + CallLongMethod(nonHeapMemoryUsageObj, _getMemoryUsageCommittedMethod)); + + JNI_CALL_METHOD_CHECK_EXCEPTION( + jlong, nonHeapMemoryUsed, env, + CallLongMethod(nonHeapMemoryUsageObj, _getMemoryUsageUsedMethod)); jvm_metrics->jvm_non_heap_size_bytes_committed->set_value( nonHeapMemoryCommitted < 0 ? 0 : nonHeapMemoryCommitted); jvm_metrics->jvm_non_heap_size_bytes_used->set_value(nonHeapMemoryUsed < 0 ? 
0 : nonHeapMemoryUsed); - jobject memoryPoolMXBeansList = - env->CallStaticObjectMethod(_managementFactoryClass, _getMemoryPoolMXBeansMethod); + JNI_CALL_METHOD_CHECK_EXCEPTION_DELETE_REF( + jobject, memoryPoolMXBeansList, env, + CallStaticObjectMethod(_managementFactoryClass, _getMemoryPoolMXBeansMethod)); - jint size = env->CallIntMethod(memoryPoolMXBeansList, _getListSizeMethod); + JNI_CALL_METHOD_CHECK_EXCEPTION(jint, size, env, + CallIntMethod(memoryPoolMXBeansList, _getListSizeMethod)); for (int i = 0; i < size; ++i) { - jobject memoryPoolMXBean = - env->CallObjectMethod(memoryPoolMXBeansList, _getListUseIndexMethod, i); - jobject usageObject = - env->CallObjectMethod(memoryPoolMXBean, _getMemoryPoolMXBeanUsageMethod); + JNI_CALL_METHOD_CHECK_EXCEPTION_DELETE_REF( + jobject, memoryPoolMXBean, env, + CallObjectMethod(memoryPoolMXBeansList, _getListUseIndexMethod, i)); + + JNI_CALL_METHOD_CHECK_EXCEPTION_DELETE_REF( + jobject, usageObject, env, + CallObjectMethod(memoryPoolMXBean, _getMemoryPoolMXBeanUsageMethod)); + + JNI_CALL_METHOD_CHECK_EXCEPTION(jlong, used, env, + CallLongMethod(usageObject, _getMemoryUsageUsedMethod)); + + JNI_CALL_METHOD_CHECK_EXCEPTION(jlong, max, env, + CallLongMethod(usageObject, _getMemoryUsageMaxMethod)); - jlong used = env->CallLongMethod(usageObject, _getMemoryUsageUsedMethod); - jlong max = env->CallLongMethod(usageObject, _getMemoryUsageMaxMethod); + JNI_CALL_METHOD_CHECK_EXCEPTION_DELETE_REF( + jobject, peakUsageObject, env, + CallObjectMethod(memoryPoolMXBean, _getMemoryPollMXBeanPeakMethod)); - jobject peakUsageObject = - env->CallObjectMethod(memoryPoolMXBean, _getMemoryPollMXBeanPeakMethod); + JNI_CALL_METHOD_CHECK_EXCEPTION(jlong, peakUsed, env, + CallLongMethod(peakUsageObject, _getMemoryUsageUsedMethod)); - jlong peakUsed = env->CallLongMethod(peakUsageObject, _getMemoryUsageUsedMethod); + JNI_CALL_METHOD_CHECK_EXCEPTION_DELETE_REF( + jobject, name, env, + CallObjectMethod(memoryPoolMXBean, 
_getMemoryPollMXBeanNameMethod)); - jstring name = - (jstring)env->CallObjectMethod(memoryPoolMXBean, _getMemoryPollMXBeanNameMethod); - const char* nameStr = env->GetStringUTFChars(name, nullptr); + const char* nameStr = env->GetStringUTFChars( + (jstring)name, nullptr); // GetStringUTFChars not throw exception if (nameStr != nullptr) { auto it = _memoryPoolName.find(nameStr); if (it == _memoryPoolName.end()) { @@ -385,36 +456,46 @@ void JvmStats::refresh(JvmMetrics* jvm_metrics) { jvm_metrics->jvm_old_size_bytes_max->set_value(max < 0 ? 0 : max); } - env->ReleaseStringUTFChars(name, nameStr); + env->ReleaseStringUTFChars((jstring)name, + nameStr); // ReleaseStringUTFChars not throw exception } - env->DeleteLocalRef(memoryPoolMXBean); - env->DeleteLocalRef(usageObject); - env->DeleteLocalRef(peakUsageObject); } + JNI_CALL_METHOD_CHECK_EXCEPTION_DELETE_REF( + jobject, threadMXBean, env, + CallStaticObjectMethod(_managementFactoryClass, _getThreadMXBeanMethod)); - jobject threadMXBean = - env->CallStaticObjectMethod(_managementFactoryClass, _getThreadMXBeanMethod); + JNI_CALL_METHOD_CHECK_EXCEPTION_DELETE_REF( + jobject, threadIdsObject, env, CallObjectMethod(threadMXBean, _getAllThreadIdsMethod)); - jlongArray threadIds = (jlongArray)env->CallObjectMethod(threadMXBean, _getAllThreadIdsMethod); - jint threadCount = env->GetArrayLength(threadIds); + auto threadIds = (jlongArray)threadIdsObject; - jobjectArray threadInfos = - (jobjectArray)env->CallObjectMethod(threadMXBean, _getThreadInfoMethod, threadIds, 0); + JNI_CALL_METHOD_CHECK_EXCEPTION(jint, threadCount, env, GetArrayLength(threadIds)); + + JNI_CALL_METHOD_CHECK_EXCEPTION_DELETE_REF( + jobject, threadInfos, env, + CallObjectMethod(threadMXBean, _getThreadInfoMethod, (jlongArray)threadIds, 0)); int threadsNew = 0, threadsRunnable = 0, threadsBlocked = 0, threadsWaiting = 0, threadsTimedWaiting = 0, threadsTerminated = 0; - jint peakThreadCount = env->CallIntMethod(threadMXBean, _getPeakThreadCountMethod); + + 
JNI_CALL_METHOD_CHECK_EXCEPTION(jint, peakThreadCount, env, + CallIntMethod(threadMXBean, _getPeakThreadCountMethod)); jvm_metrics->jvm_thread_peak_count->set_value(peakThreadCount < 0 ? 0 : peakThreadCount); jvm_metrics->jvm_thread_count->set_value(threadCount < 0 ? 0 : threadCount); for (int i = 0; i < threadCount; i++) { - jobject threadInfo = env->GetObjectArrayElement(threadInfos, i); + JNI_CALL_METHOD_CHECK_EXCEPTION(jobject, threadInfo, env, + GetObjectArrayElement((jobjectArray)threadInfos, i)); + if (threadInfo == nullptr) { continue; } - jobject threadState = env->CallObjectMethod(threadInfo, _getThreadStateMethod); + JNI_CALL_METHOD_CHECK_EXCEPTION_DELETE_REF( + jobject, threadState, env, CallObjectMethod(threadInfo, _getThreadStateMethod)); + + //IsSameObject not throw exception if (env->IsSameObject(threadState, _newThreadStateObj)) { threadsNew++; } else if (env->IsSameObject(threadState, _runnableThreadStateObj)) { @@ -428,8 +509,6 @@ void JvmStats::refresh(JvmMetrics* jvm_metrics) { } else if (env->IsSameObject(threadState, _terminatedThreadStateObj)) { threadsTerminated++; } - env->DeleteLocalRef(threadInfo); - env->DeleteLocalRef(threadState); } jvm_metrics->jvm_thread_new_count->set_value(threadsNew < 0 ? 0 : threadsNew); @@ -441,18 +520,27 @@ void JvmStats::refresh(JvmMetrics* jvm_metrics) { jvm_metrics->jvm_thread_terminated_count->set_value(threadsTerminated < 0 ? 
0 : threadsTerminated); - jobject gcMXBeansList = - env->CallStaticObjectMethod(_managementFactoryClass, _getGarbageCollectorMXBeansMethod); + JNI_CALL_METHOD_CHECK_EXCEPTION_DELETE_REF( + jobject, gcMXBeansList, env, + CallStaticObjectMethod(_managementFactoryClass, _getGarbageCollectorMXBeansMethod)); - jint numCollectors = env->CallIntMethod(gcMXBeansList, _getListSizeMethod); + JNI_CALL_METHOD_CHECK_EXCEPTION(jint, numCollectors, env, + CallIntMethod(gcMXBeansList, _getListSizeMethod)); for (int i = 0; i < numCollectors; i++) { - jobject gcMXBean = env->CallObjectMethod(gcMXBeansList, _getListUseIndexMethod, i); + JNI_CALL_METHOD_CHECK_EXCEPTION_DELETE_REF( + jobject, gcMXBean, env, CallObjectMethod(gcMXBeansList, _getListUseIndexMethod, i)); + + JNI_CALL_METHOD_CHECK_EXCEPTION_DELETE_REF(jobject, gcName, env, + CallObjectMethod(gcMXBean, _getGCNameMethod)); - jstring gcName = (jstring)env->CallObjectMethod(gcMXBean, _getGCNameMethod); - jlong gcCollectionCount = env->CallLongMethod(gcMXBean, _getGCCollectionCountMethod); - jlong gcCollectionTime = env->CallLongMethod(gcMXBean, _getGCCollectionTimeMethod); - const char* gcNameStr = env->GetStringUTFChars(gcName, NULL); + JNI_CALL_METHOD_CHECK_EXCEPTION(jlong, gcCollectionCount, env, + CallLongMethod(gcMXBean, _getGCCollectionCountMethod)); + + JNI_CALL_METHOD_CHECK_EXCEPTION(jlong, gcCollectionTime, env, + CallLongMethod(gcMXBean, _getGCCollectionTimeMethod)); + + const char* gcNameStr = env->GetStringUTFChars((jstring)gcName, NULL); if (gcNameStr != nullptr) { if (strcmp(gcNameStr, "G1 Young Generation") == 0) { jvm_metrics->jvm_gc_g1_young_generation_count->set_value(gcCollectionCount); @@ -463,31 +551,40 @@ void JvmStats::refresh(JvmMetrics* jvm_metrics) { jvm_metrics->jvm_gc_g1_old_generation_time_ms->set_value(gcCollectionTime); } - env->ReleaseStringUTFChars(gcName, gcNameStr); + env->ReleaseStringUTFChars((jstring)gcName, gcNameStr); } - env->DeleteLocalRef(gcMXBean); } - 
env->DeleteLocalRef(memoryMXBeanObj); - env->DeleteLocalRef(heapMemoryUsageObj); - env->DeleteLocalRef(nonHeapMemoryUsageObj); - env->DeleteLocalRef(memoryPoolMXBeansList); - env->DeleteLocalRef(threadMXBean); - env->DeleteLocalRef(gcMXBeansList); + + return Status::OK(); } JvmStats::~JvmStats() { if (!_init_complete) { return; } try { - env->DeleteLocalRef(_newThreadStateObj); - env->DeleteLocalRef(_runnableThreadStateObj); - env->DeleteLocalRef(_blockedThreadStateObj); - env->DeleteLocalRef(_waitingThreadStateObj); - env->DeleteLocalRef(_timedWaitingThreadStateObj); - env->DeleteLocalRef(_terminatedThreadStateObj); + JNIEnv* env = nullptr; + Status st = JniUtil::GetJNIEnv(&env); + if (!st.ok()) { + return; + } + env->DeleteGlobalRef(_managementFactoryClass); + env->DeleteGlobalRef(_memoryUsageClass); + env->DeleteGlobalRef(_memoryMXBeanClass); + env->DeleteGlobalRef(_listClass); + env->DeleteGlobalRef(_memoryPoolMXBeanClass); + env->DeleteGlobalRef(_threadMXBeanClass); + env->DeleteGlobalRef(_threadInfoClass); + env->DeleteGlobalRef(_threadStateClass); + env->DeleteGlobalRef(_garbageCollectorMXBeanClass); + + env->DeleteGlobalRef(_newThreadStateObj); + env->DeleteGlobalRef(_runnableThreadStateObj); + env->DeleteGlobalRef(_blockedThreadStateObj); + env->DeleteGlobalRef(_waitingThreadStateObj); + env->DeleteGlobalRef(_timedWaitingThreadStateObj); + env->DeleteGlobalRef(_terminatedThreadStateObj); } catch (...) { - // When be is killed, DeleteLocalRef may fail. // In order to exit more gracefully, we catch the exception here. 
} } diff --git a/be/src/util/jvm_metrics.h b/be/src/util/jvm_metrics.h index 459a3cbf938f79..78346c022b0aba 100644 --- a/be/src/util/jvm_metrics.h +++ b/be/src/util/jvm_metrics.h @@ -27,7 +27,6 @@ class JvmMetrics; class JvmStats { private: - JNIEnv* env = nullptr; jclass _managementFactoryClass = nullptr; jmethodID _getMemoryMXBeanMethod = nullptr; jclass _memoryUsageClass = nullptr; @@ -96,11 +95,10 @@ class JvmStats { bool _init_complete = false; public: - // JvmStats(JNIEnv* ENV); - void init(JNIEnv* ENV); + Status init(JNIEnv* env); bool init_complete() const { return _init_complete; } void set_complete(bool val) { _init_complete = val; } - void refresh(JvmMetrics* jvm_metrics); + Status refresh(JvmMetrics* jvm_metrics) const; ~JvmStats(); }; diff --git a/be/src/vec/aggregate_functions/aggregate_function.h b/be/src/vec/aggregate_functions/aggregate_function.h index 05f1bd2a602c68..f67fe14fa42600 100644 --- a/be/src/vec/aggregate_functions/aggregate_function.h +++ b/be/src/vec/aggregate_functions/aggregate_function.h @@ -38,6 +38,10 @@ class Arena; class IColumn; class IDataType; +struct AggregateFunctionAttr { + bool enable_decimal256 {false}; +}; + template class AggregateFunctionBitmapCount; template @@ -111,21 +115,21 @@ class IAggregateFunction { * Additional parameter arena should be used instead of standard memory allocator if the addition requires memory allocation. */ virtual void add(AggregateDataPtr __restrict place, const IColumn** columns, ssize_t row_num, - Arena* arena) const = 0; + Arena*) const = 0; virtual void add_many(AggregateDataPtr __restrict place, const IColumn** columns, - std::vector& rows, Arena* arena) const {} + std::vector& rows, Arena*) const {} /// Merges state (on which place points to) with other state of current aggregation function. 
virtual void merge(AggregateDataPtr __restrict place, ConstAggregateDataPtr rhs, - Arena* arena) const = 0; + Arena*) const = 0; virtual void merge_vec(const AggregateDataPtr* places, size_t offset, ConstAggregateDataPtr rhs, - Arena* arena, const size_t num_rows) const = 0; + Arena*, const size_t num_rows) const = 0; // same as merge_vec, but only call "merge" function when place is not nullptr virtual void merge_vec_selected(const AggregateDataPtr* places, size_t offset, - ConstAggregateDataPtr rhs, Arena* arena, + ConstAggregateDataPtr rhs, Arena*, const size_t num_rows) const = 0; /// Serializes state (to transmit it over the network, for example). @@ -142,21 +146,21 @@ class IAggregateFunction { /// Deserializes state. This function is called only for empty (just created) states. virtual void deserialize(AggregateDataPtr __restrict place, BufferReadable& buf, - Arena* arena) const = 0; + Arena*) const = 0; - virtual void deserialize_vec(AggregateDataPtr places, const ColumnString* column, Arena* arena, + virtual void deserialize_vec(AggregateDataPtr places, const ColumnString* column, Arena*, size_t num_rows) const = 0; virtual void deserialize_and_merge_vec(const AggregateDataPtr* places, size_t offset, - AggregateDataPtr rhs, const IColumn* column, - Arena* arena, const size_t num_rows) const = 0; + AggregateDataPtr rhs, const IColumn* column, Arena*, + const size_t num_rows) const = 0; virtual void deserialize_and_merge_vec_selected(const AggregateDataPtr* places, size_t offset, AggregateDataPtr rhs, const IColumn* column, - Arena* arena, const size_t num_rows) const = 0; + Arena*, const size_t num_rows) const = 0; - virtual void deserialize_from_column(AggregateDataPtr places, const IColumn& column, - Arena* arena, size_t num_rows) const = 0; + virtual void deserialize_from_column(AggregateDataPtr places, const IColumn& column, Arena*, + size_t num_rows) const = 0; /// Deserializes state and merge it with current aggregation function. 
virtual void deserialize_and_merge(AggregateDataPtr __restrict place, @@ -165,10 +169,10 @@ class IAggregateFunction { virtual void deserialize_and_merge_from_column_range(AggregateDataPtr __restrict place, const IColumn& column, size_t begin, - size_t end, Arena* arena) const = 0; + size_t end, Arena*) const = 0; virtual void deserialize_and_merge_from_column(AggregateDataPtr __restrict place, - const IColumn& column, Arena* arena) const = 0; + const IColumn& column, Arena*) const = 0; /// Inserts results into a column. virtual void insert_result_into(ConstAggregateDataPtr __restrict place, IColumn& to) const = 0; @@ -181,33 +185,32 @@ class IAggregateFunction { * and do a single call to "add_batch" for devirtualization and inlining. */ virtual void add_batch(size_t batch_size, AggregateDataPtr* places, size_t place_offset, - const IColumn** columns, Arena* arena, bool agg_many = false) const = 0; + const IColumn** columns, Arena*, bool agg_many = false) const = 0; // same as add_batch, but only call "add" function when place is not nullptr virtual void add_batch_selected(size_t batch_size, AggregateDataPtr* places, - size_t place_offset, const IColumn** columns, - Arena* arena) const = 0; + size_t place_offset, const IColumn** columns, Arena*) const = 0; /** The same for single place. 
*/ virtual void add_batch_single_place(size_t batch_size, AggregateDataPtr place, - const IColumn** columns, Arena* arena) const = 0; + const IColumn** columns, Arena*) const = 0; // only used at agg reader virtual void add_batch_range(size_t batch_begin, size_t batch_end, AggregateDataPtr place, - const IColumn** columns, Arena* arena, bool has_null = false) = 0; + const IColumn** columns, Arena*, bool has_null = false) = 0; // only used at window function virtual void add_range_single_place(int64_t partition_start, int64_t partition_end, int64_t frame_start, int64_t frame_end, AggregateDataPtr place, const IColumn** columns, - Arena* arena) const = 0; + Arena*) const = 0; virtual void streaming_agg_serialize(const IColumn** columns, BufferWritable& buf, - const size_t num_rows, Arena* arena) const = 0; + const size_t num_rows, Arena*) const = 0; virtual void streaming_agg_serialize_to_column(const IColumn** columns, MutableColumnPtr& dst, - const size_t num_rows, Arena* arena) const = 0; + const size_t num_rows, Arena*) const = 0; const DataTypes& get_argument_types() const { return argument_types; } diff --git a/be/src/vec/aggregate_functions/aggregate_function_approx_count_distinct.cpp b/be/src/vec/aggregate_functions/aggregate_function_approx_count_distinct.cpp index 10616be4258477..18662bf66cf38c 100644 --- a/be/src/vec/aggregate_functions/aggregate_function_approx_count_distinct.cpp +++ b/be/src/vec/aggregate_functions/aggregate_function_approx_count_distinct.cpp @@ -31,7 +31,8 @@ namespace doris::vectorized { AggregateFunctionPtr create_aggregate_function_approx_count_distinct( - const std::string& name, const DataTypes& argument_types, const bool result_is_nullable) { + const std::string& name, const DataTypes& argument_types, const bool result_is_nullable, + const AggregateFunctionAttr& attr) { WhichDataType which(remove_nullable(argument_types[0])); #define DISPATCH(TYPE, COLUMN_TYPE) \ diff --git 
a/be/src/vec/aggregate_functions/aggregate_function_avg.cpp b/be/src/vec/aggregate_functions/aggregate_function_avg.cpp index 0f3d0fd3bdad6b..6a6711f90f983e 100644 --- a/be/src/vec/aggregate_functions/aggregate_function_avg.cpp +++ b/be/src/vec/aggregate_functions/aggregate_function_avg.cpp @@ -45,8 +45,17 @@ template using AggregateFuncAvgDecimal256 = typename AvgDecimal256::Function; void register_aggregate_function_avg(AggregateFunctionSimpleFactory& factory) { - factory.register_function_both("avg", creator_with_type::creator); - factory.register_function_both("avg_decimal256", - creator_with_type::creator); + AggregateFunctionCreator creator = [&](const std::string& name, const DataTypes& types, + const bool result_is_nullable, + const AggregateFunctionAttr& attr) { + if (attr.enable_decimal256) { + return creator_with_type::creator(name, types, + result_is_nullable, attr); + } else { + return creator_with_type::creator(name, types, result_is_nullable, + attr); + } + }; + factory.register_function_both("avg", creator); } } // namespace doris::vectorized diff --git a/be/src/vec/aggregate_functions/aggregate_function_avg.h b/be/src/vec/aggregate_functions/aggregate_function_avg.h index 8a18a88839b4db..62fbb8078ea949 100644 --- a/be/src/vec/aggregate_functions/aggregate_function_avg.h +++ b/be/src/vec/aggregate_functions/aggregate_function_avg.h @@ -184,7 +184,7 @@ class AggregateFunctionAvg final column.get_data().push_back(this->data(place).template result()); } - void deserialize_from_column(AggregateDataPtr places, const IColumn& column, Arena* arena, + void deserialize_from_column(AggregateDataPtr places, const IColumn& column, Arena*, size_t num_rows) const override { auto& col = assert_cast(column); DCHECK(col.size() >= num_rows) << "source column's size should greater than num_rows"; @@ -205,7 +205,7 @@ class AggregateFunctionAvg final } void streaming_agg_serialize_to_column(const IColumn** columns, MutableColumnPtr& dst, - const size_t num_rows, Arena* 
arena) const override { + const size_t num_rows, Arena*) const override { auto* src_data = assert_cast(*columns[0]).get_data().data(); auto& dst_col = assert_cast(*dst); dst_col.set_item_size(sizeof(Data)); @@ -219,7 +219,7 @@ class AggregateFunctionAvg final } void deserialize_and_merge_from_column(AggregateDataPtr __restrict place, const IColumn& column, - Arena* arena) const override { + Arena*) const override { auto& col = assert_cast(column); const size_t num_rows = column.size(); DCHECK(col.size() >= num_rows) << "source column's size should greater than num_rows"; @@ -233,7 +233,7 @@ class AggregateFunctionAvg final void deserialize_and_merge_from_column_range(AggregateDataPtr __restrict place, const IColumn& column, size_t begin, size_t end, - Arena* arena) const override { + Arena*) const override { DCHECK(end <= column.size() && begin <= end) << ", begin:" << begin << ", end:" << end << ", column.size():" << column.size(); auto& col = assert_cast(column); @@ -245,19 +245,19 @@ class AggregateFunctionAvg final } void deserialize_and_merge_vec(const AggregateDataPtr* places, size_t offset, - AggregateDataPtr rhs, const IColumn* column, Arena* arena, + AggregateDataPtr rhs, const IColumn* column, Arena*, const size_t num_rows) const override { - this->deserialize_from_column(rhs, *column, arena, num_rows); + this->deserialize_from_column(rhs, *column, nullptr, num_rows); DEFER({ this->destroy_vec(rhs, num_rows); }); - this->merge_vec(places, offset, rhs, arena, num_rows); + this->merge_vec(places, offset, rhs, nullptr, num_rows); } void deserialize_and_merge_vec_selected(const AggregateDataPtr* places, size_t offset, - AggregateDataPtr rhs, const IColumn* column, - Arena* arena, const size_t num_rows) const override { - this->deserialize_from_column(rhs, *column, arena, num_rows); + AggregateDataPtr rhs, const IColumn* column, Arena*, + const size_t num_rows) const override { + this->deserialize_from_column(rhs, *column, nullptr, num_rows); DEFER({ 
this->destroy_vec(rhs, num_rows); }); - this->merge_vec_selected(places, offset, rhs, arena, num_rows); + this->merge_vec_selected(places, offset, rhs, nullptr, num_rows); } void serialize_without_key_to_column(ConstAggregateDataPtr __restrict place, diff --git a/be/src/vec/aggregate_functions/aggregate_function_bitmap.cpp b/be/src/vec/aggregate_functions/aggregate_function_bitmap.cpp index 0676fd5bc27090..e9c86d4b9556da 100644 --- a/be/src/vec/aggregate_functions/aggregate_function_bitmap.cpp +++ b/be/src/vec/aggregate_functions/aggregate_function_bitmap.cpp @@ -40,9 +40,9 @@ AggregateFunctionPtr create_with_int_data_type(const DataTypes& argument_type) { return nullptr; } -AggregateFunctionPtr create_aggregate_function_bitmap_union_count(const std::string& name, - const DataTypes& argument_types, - const bool result_is_nullable) { +AggregateFunctionPtr create_aggregate_function_bitmap_union_count( + const std::string& name, const DataTypes& argument_types, const bool result_is_nullable, + const AggregateFunctionAttr& attr) { const bool arg_is_nullable = argument_types[0]->is_nullable(); if (arg_is_nullable) { return std::make_shared>(argument_types); @@ -53,7 +53,8 @@ AggregateFunctionPtr create_aggregate_function_bitmap_union_count(const std::str AggregateFunctionPtr create_aggregate_function_bitmap_union_int(const std::string& name, const DataTypes& argument_types, - const bool result_is_nullable) { + const bool result_is_nullable, + const AggregateFunctionAttr& attr) { const bool arg_is_nullable = argument_types[0]->is_nullable(); if (arg_is_nullable) { return AggregateFunctionPtr( diff --git a/be/src/vec/aggregate_functions/aggregate_function_bitmap.h b/be/src/vec/aggregate_functions/aggregate_function_bitmap.h index 6c504b91bf4abd..b0619a63e1ffe8 100644 --- a/be/src/vec/aggregate_functions/aggregate_function_bitmap.h +++ b/be/src/vec/aggregate_functions/aggregate_function_bitmap.h @@ -159,7 +159,7 @@ class AggregateFunctionBitmapSerializationHelper : 
IAggregateFunctionDataHelper(argument_types_) {} void streaming_agg_serialize_to_column(const IColumn** columns, MutableColumnPtr& dst, - const size_t num_rows, Arena* arena) const override { + const size_t num_rows, Arena*) const override { if (version >= BITMAP_SERDE) { auto& col = assert_cast(*dst); char place[sizeof(Data)]; @@ -171,11 +171,11 @@ class AggregateFunctionBitmapSerializationHelper assert_cast(this)->destroy(place); }); assert_cast(this)->add(place, columns, - i, arena); + i, nullptr); data[i] = std::move(this->data(place).value); } } else { - BaseHelper::streaming_agg_serialize_to_column(columns, dst, num_rows, arena); + BaseHelper::streaming_agg_serialize_to_column(columns, dst, num_rows, nullptr); } } @@ -194,7 +194,7 @@ class AggregateFunctionBitmapSerializationHelper } void deserialize_and_merge_from_column(AggregateDataPtr __restrict place, const IColumn& column, - Arena* arena) const override { + Arena*) const override { if (version >= BITMAP_SERDE) { auto& col = assert_cast(column); const size_t num_rows = column.size(); @@ -204,13 +204,13 @@ class AggregateFunctionBitmapSerializationHelper this->data(place).merge(data[i]); } } else { - BaseHelper::deserialize_and_merge_from_column(place, column, arena); + BaseHelper::deserialize_and_merge_from_column(place, column, nullptr); } } void deserialize_and_merge_from_column_range(AggregateDataPtr __restrict place, const IColumn& column, size_t begin, size_t end, - Arena* arena) const override { + Arena*) const override { DCHECK(end <= column.size() && begin <= end) << ", begin:" << begin << ", end:" << end << ", column.size():" << column.size(); if (version >= BITMAP_SERDE) { @@ -220,12 +220,12 @@ class AggregateFunctionBitmapSerializationHelper this->data(place).merge(data[i]); } } else { - BaseHelper::deserialize_and_merge_from_column_range(place, column, begin, end, arena); + BaseHelper::deserialize_and_merge_from_column_range(place, column, begin, end, nullptr); } } void 
deserialize_and_merge_vec(const AggregateDataPtr* places, size_t offset, - AggregateDataPtr rhs, const IColumn* column, Arena* arena, + AggregateDataPtr rhs, const IColumn* column, Arena*, const size_t num_rows) const override { if (version >= BITMAP_SERDE) { const auto& col = assert_cast(*column); @@ -234,13 +234,13 @@ class AggregateFunctionBitmapSerializationHelper this->data(places[i] + offset).merge(data[i]); } } else { - BaseHelper::deserialize_and_merge_vec(places, offset, rhs, column, arena, num_rows); + BaseHelper::deserialize_and_merge_vec(places, offset, rhs, column, nullptr, num_rows); } } void deserialize_and_merge_vec_selected(const AggregateDataPtr* places, size_t offset, - AggregateDataPtr rhs, const IColumn* column, - Arena* arena, const size_t num_rows) const override { + AggregateDataPtr rhs, const IColumn* column, Arena*, + const size_t num_rows) const override { if (version >= BITMAP_SERDE) { const auto& col = assert_cast(*column); const auto* data = col.get_data().data(); @@ -250,7 +250,7 @@ class AggregateFunctionBitmapSerializationHelper } } } else { - BaseHelper::deserialize_and_merge_vec_selected(places, offset, rhs, column, arena, + BaseHelper::deserialize_and_merge_vec_selected(places, offset, rhs, column, nullptr, num_rows); } } diff --git a/be/src/vec/aggregate_functions/aggregate_function_bitmap_agg.cpp b/be/src/vec/aggregate_functions/aggregate_function_bitmap_agg.cpp index b8ae4c6530d575..0b95ddfd46f0d5 100644 --- a/be/src/vec/aggregate_functions/aggregate_function_bitmap_agg.cpp +++ b/be/src/vec/aggregate_functions/aggregate_function_bitmap_agg.cpp @@ -41,7 +41,8 @@ AggregateFunctionPtr create_with_int_data_type(const DataTypes& argument_types) AggregateFunctionPtr create_aggregate_function_bitmap_agg(const std::string& name, const DataTypes& argument_types, - const bool result_is_nullable) { + const bool result_is_nullable, + const AggregateFunctionAttr& attr) { const bool arg_is_nullable = argument_types[0]->is_nullable(); if 
(arg_is_nullable) { return AggregateFunctionPtr(create_with_int_data_type(argument_types)); diff --git a/be/src/vec/aggregate_functions/aggregate_function_bitmap_agg.h b/be/src/vec/aggregate_functions/aggregate_function_bitmap_agg.h index 19352e022fa7a2..5747faf1b8e8c1 100644 --- a/be/src/vec/aggregate_functions/aggregate_function_bitmap_agg.h +++ b/be/src/vec/aggregate_functions/aggregate_function_bitmap_agg.h @@ -72,7 +72,7 @@ class AggregateFunctionBitmapAgg final DataTypePtr get_return_type() const override { return std::make_shared(); } void add(AggregateDataPtr __restrict place, const IColumn** columns, ssize_t row_num, - Arena* arena) const override { + Arena*) const override { DCHECK_LT(row_num, columns[0]->size()); if constexpr (arg_nullable) { auto& nullable_col = @@ -90,7 +90,7 @@ class AggregateFunctionBitmapAgg final } void add_batch_single_place(size_t batch_size, AggregateDataPtr place, const IColumn** columns, - Arena* arena) const override { + Arena*) const override { if constexpr (arg_nullable) { auto& nullable_column = assert_cast(*columns[0]); const auto& column = @@ -111,7 +111,7 @@ class AggregateFunctionBitmapAgg final void reset(AggregateDataPtr place) const override { this->data(place).reset(); } void merge(AggregateDataPtr __restrict place, ConstAggregateDataPtr rhs, - Arena* arena) const override { + Arena*) const override { this->data(place).merge(this->data(rhs)); } @@ -130,7 +130,7 @@ class AggregateFunctionBitmapAgg final } void streaming_agg_serialize_to_column(const IColumn** columns, MutableColumnPtr& dst, - const size_t num_rows, Arena* arena) const override { + const size_t num_rows, Arena*) const override { auto& col = assert_cast(*dst); char place[sizeof(Data)]; col.resize(num_rows); @@ -138,12 +138,12 @@ class AggregateFunctionBitmapAgg final for (size_t i = 0; i != num_rows; ++i) { this->create(place); DEFER({ this->destroy(place); }); - this->add(place, columns, i, arena); + this->add(place, columns, i, nullptr); data[i] = 
std::move(this->data(place).value); } } - void deserialize_from_column(AggregateDataPtr places, const IColumn& column, Arena* arena, + void deserialize_from_column(AggregateDataPtr places, const IColumn& column, Arena*, size_t num_rows) const override { auto& col = assert_cast(column); DCHECK(col.size() >= num_rows) << "source column's size should greater than num_rows"; @@ -165,7 +165,7 @@ class AggregateFunctionBitmapAgg final } void deserialize_and_merge_from_column(AggregateDataPtr __restrict place, const IColumn& column, - Arena* arena) const override { + Arena*) const override { auto& col = assert_cast(column); const size_t num_rows = column.size(); auto* data = col.get_data().data(); @@ -177,7 +177,7 @@ class AggregateFunctionBitmapAgg final void deserialize_and_merge_from_column_range(AggregateDataPtr __restrict place, const IColumn& column, size_t begin, size_t end, - Arena* arena) const override { + Arena*) const override { DCHECK(end <= column.size() && begin <= end) << ", begin:" << begin << ", end:" << end << ", column.size():" << column.size(); auto& col = assert_cast(column); @@ -188,7 +188,7 @@ class AggregateFunctionBitmapAgg final } void deserialize_and_merge_vec(const AggregateDataPtr* places, size_t offset, - AggregateDataPtr rhs, const IColumn* column, Arena* arena, + AggregateDataPtr rhs, const IColumn* column, Arena*, const size_t num_rows) const override { const auto& col = assert_cast(*column); const auto* data = col.get_data().data(); @@ -198,8 +198,8 @@ class AggregateFunctionBitmapAgg final } void deserialize_and_merge_vec_selected(const AggregateDataPtr* places, size_t offset, - AggregateDataPtr rhs, const IColumn* column, - Arena* arena, const size_t num_rows) const override { + AggregateDataPtr rhs, const IColumn* column, Arena*, + const size_t num_rows) const override { const auto& col = assert_cast(*column); const auto* data = col.get_data().data(); for (size_t i = 0; i != num_rows; ++i) { diff --git 
a/be/src/vec/aggregate_functions/aggregate_function_collect.cpp b/be/src/vec/aggregate_functions/aggregate_function_collect.cpp index 4fcf09b59b33c6..d726b7c6355318 100644 --- a/be/src/vec/aggregate_functions/aggregate_function_collect.cpp +++ b/be/src/vec/aggregate_functions/aggregate_function_collect.cpp @@ -96,7 +96,8 @@ AggregateFunctionPtr create_aggregate_function_collect_impl(const std::string& n AggregateFunctionPtr create_aggregate_function_collect(const std::string& name, const DataTypes& argument_types, - const bool result_is_nullable) { + const bool result_is_nullable, + const AggregateFunctionAttr& attr) { if (argument_types.size() == 1) { if (name == "array_agg") { return create_aggregate_function_collect_impl( diff --git a/be/src/vec/aggregate_functions/aggregate_function_corr.cpp b/be/src/vec/aggregate_functions/aggregate_function_corr.cpp index a454afb45f22e0..cdaab6e086f4a5 100644 --- a/be/src/vec/aggregate_functions/aggregate_function_corr.cpp +++ b/be/src/vec/aggregate_functions/aggregate_function_corr.cpp @@ -89,7 +89,8 @@ struct CorrMoment { AggregateFunctionPtr create_aggregate_corr_function(const std::string& name, const DataTypes& argument_types, - const bool result_is_nullable) { + const bool result_is_nullable, + const AggregateFunctionAttr& attr) { assert_binary(name, argument_types); return create_with_two_basic_numeric_types(argument_types[0], argument_types[1], argument_types, result_is_nullable); diff --git a/be/src/vec/aggregate_functions/aggregate_function_count.cpp b/be/src/vec/aggregate_functions/aggregate_function_count.cpp index 8c54714b046da1..5cfe5af41982f6 100644 --- a/be/src/vec/aggregate_functions/aggregate_function_count.cpp +++ b/be/src/vec/aggregate_functions/aggregate_function_count.cpp @@ -29,15 +29,16 @@ namespace doris::vectorized { AggregateFunctionPtr create_aggregate_function_count(const std::string& name, const DataTypes& argument_types, - const bool result_is_nullable) { + const bool result_is_nullable, + const 
AggregateFunctionAttr& attr) { assert_arity_at_most<1>(name, argument_types); return std::make_shared(argument_types); } -AggregateFunctionPtr create_aggregate_function_count_not_null_unary(const std::string& name, - const DataTypes& argument_types, - const bool result_is_nullable) { +AggregateFunctionPtr create_aggregate_function_count_not_null_unary( + const std::string& name, const DataTypes& argument_types, const bool result_is_nullable, + const AggregateFunctionAttr& attr) { assert_arity_at_most<1>(name, argument_types); return std::make_shared(argument_types); diff --git a/be/src/vec/aggregate_functions/aggregate_function_count.h b/be/src/vec/aggregate_functions/aggregate_function_count.h index 62aa869771c0a5..7b54d074683b04 100644 --- a/be/src/vec/aggregate_functions/aggregate_function_count.h +++ b/be/src/vec/aggregate_functions/aggregate_function_count.h @@ -91,7 +91,7 @@ class AggregateFunctionCount final assert_cast(to).get_data().push_back(data(place).count); } - void deserialize_from_column(AggregateDataPtr places, const IColumn& column, Arena* arena, + void deserialize_from_column(AggregateDataPtr places, const IColumn& column, Arena*, size_t num_rows) const override { auto data = assert_cast(column).get_data().data(); memcpy(places, data, sizeof(Data) * num_rows); @@ -111,7 +111,7 @@ class AggregateFunctionCount final } void streaming_agg_serialize_to_column(const IColumn** columns, MutableColumnPtr& dst, - const size_t num_rows, Arena* arena) const override { + const size_t num_rows, Arena*) const override { auto& dst_col = assert_cast(*dst); DCHECK(dst_col.item_size() == sizeof(Data)) << "size is not equal: " << dst_col.item_size() << " " << sizeof(Data); @@ -124,7 +124,7 @@ class AggregateFunctionCount final } void deserialize_and_merge_from_column(AggregateDataPtr __restrict place, const IColumn& column, - Arena* arena) const override { + Arena*) const override { auto& col = assert_cast(column); const size_t num_rows = column.size(); auto* data = 
reinterpret_cast(col.get_data().data()); @@ -135,7 +135,7 @@ class AggregateFunctionCount final void deserialize_and_merge_from_column_range(AggregateDataPtr __restrict place, const IColumn& column, size_t begin, size_t end, - Arena* arena) const override { + Arena*) const override { DCHECK(end <= column.size() && begin <= end) << ", begin:" << begin << ", end:" << end << ", column.size():" << column.size(); auto& col = assert_cast(column); @@ -146,19 +146,19 @@ class AggregateFunctionCount final } void deserialize_and_merge_vec(const AggregateDataPtr* places, size_t offset, - AggregateDataPtr rhs, const IColumn* column, Arena* arena, + AggregateDataPtr rhs, const IColumn* column, Arena*, const size_t num_rows) const override { - this->deserialize_from_column(rhs, *column, arena, num_rows); + this->deserialize_from_column(rhs, *column, nullptr, num_rows); DEFER({ this->destroy_vec(rhs, num_rows); }); - this->merge_vec(places, offset, rhs, arena, num_rows); + this->merge_vec(places, offset, rhs, nullptr, num_rows); } void deserialize_and_merge_vec_selected(const AggregateDataPtr* places, size_t offset, - AggregateDataPtr rhs, const IColumn* column, - Arena* arena, const size_t num_rows) const override { - this->deserialize_from_column(rhs, *column, arena, num_rows); + AggregateDataPtr rhs, const IColumn* column, Arena*, + const size_t num_rows) const override { + this->deserialize_from_column(rhs, *column, nullptr, num_rows); DEFER({ this->destroy_vec(rhs, num_rows); }); - this->merge_vec_selected(places, offset, rhs, arena, num_rows); + this->merge_vec_selected(places, offset, rhs, nullptr, num_rows); } void serialize_without_key_to_column(ConstAggregateDataPtr __restrict place, @@ -229,7 +229,7 @@ class AggregateFunctionCountNotNullUnary final } } - void deserialize_from_column(AggregateDataPtr places, const IColumn& column, Arena* arena, + void deserialize_from_column(AggregateDataPtr places, const IColumn& column, Arena*, size_t num_rows) const override { auto 
data = assert_cast(column).get_data().data(); memcpy(places, data, sizeof(Data) * num_rows); @@ -249,7 +249,7 @@ class AggregateFunctionCountNotNullUnary final } void streaming_agg_serialize_to_column(const IColumn** columns, MutableColumnPtr& dst, - const size_t num_rows, Arena* arena) const override { + const size_t num_rows, Arena*) const override { auto& col = assert_cast(*dst); DCHECK(col.item_size() == sizeof(Data)) << "size is not equal: " << col.item_size() << " " << sizeof(Data); @@ -263,7 +263,7 @@ class AggregateFunctionCountNotNullUnary final } void deserialize_and_merge_from_column(AggregateDataPtr __restrict place, const IColumn& column, - Arena* arena) const override { + Arena*) const override { auto& col = assert_cast(column); const size_t num_rows = column.size(); auto* data = reinterpret_cast(col.get_data().data()); @@ -274,7 +274,7 @@ class AggregateFunctionCountNotNullUnary final void deserialize_and_merge_from_column_range(AggregateDataPtr __restrict place, const IColumn& column, size_t begin, size_t end, - Arena* arena) const override { + Arena*) const override { DCHECK(end <= column.size() && begin <= end) << ", begin:" << begin << ", end:" << end << ", column.size():" << column.size(); auto& col = assert_cast(column); @@ -286,19 +286,19 @@ class AggregateFunctionCountNotNullUnary final } void deserialize_and_merge_vec(const AggregateDataPtr* places, size_t offset, - AggregateDataPtr rhs, const IColumn* column, Arena* arena, + AggregateDataPtr rhs, const IColumn* column, Arena*, const size_t num_rows) const override { - this->deserialize_from_column(rhs, *column, arena, num_rows); + this->deserialize_from_column(rhs, *column, nullptr, num_rows); DEFER({ this->destroy_vec(rhs, num_rows); }); - this->merge_vec(places, offset, rhs, arena, num_rows); + this->merge_vec(places, offset, rhs, nullptr, num_rows); } void deserialize_and_merge_vec_selected(const AggregateDataPtr* places, size_t offset, - AggregateDataPtr rhs, const IColumn* column, - 
Arena* arena, const size_t num_rows) const override { - this->deserialize_from_column(rhs, *column, arena, num_rows); + AggregateDataPtr rhs, const IColumn* column, Arena*, + const size_t num_rows) const override { + this->deserialize_from_column(rhs, *column, nullptr, num_rows); DEFER({ this->destroy_vec(rhs, num_rows); }); - this->merge_vec_selected(places, offset, rhs, arena, num_rows); + this->merge_vec_selected(places, offset, rhs, nullptr, num_rows); } void serialize_without_key_to_column(ConstAggregateDataPtr __restrict place, diff --git a/be/src/vec/aggregate_functions/aggregate_function_count_by_enum.cpp b/be/src/vec/aggregate_functions/aggregate_function_count_by_enum.cpp index 1a0bf2518202f3..093b31d57db554 100644 --- a/be/src/vec/aggregate_functions/aggregate_function_count_by_enum.cpp +++ b/be/src/vec/aggregate_functions/aggregate_function_count_by_enum.cpp @@ -29,7 +29,8 @@ namespace doris::vectorized { AggregateFunctionPtr create_aggregate_function_count_by_enum(const std::string& name, const DataTypes& argument_types, - const bool result_is_nullable) { + const bool result_is_nullable, + const AggregateFunctionAttr& attr) { if (argument_types.size() < 1) { LOG(WARNING) << fmt::format("Illegal number {} of argument for aggregate function {}", argument_types.size(), name); diff --git a/be/src/vec/aggregate_functions/aggregate_function_count_by_enum.h b/be/src/vec/aggregate_functions/aggregate_function_count_by_enum.h index 5d4a3dde3550a1..1f5093de68263e 100644 --- a/be/src/vec/aggregate_functions/aggregate_function_count_by_enum.h +++ b/be/src/vec/aggregate_functions/aggregate_function_count_by_enum.h @@ -197,7 +197,7 @@ class AggregateFunctionCountByEnum final DataTypePtr get_return_type() const override { return std::make_shared(); } void add(AggregateDataPtr __restrict place, const IColumn** columns, ssize_t row_num, - Arena* arena) const override { + Arena*) const override { for (int i = 0; i < arg_count; i++) { const auto* nullable_column = 
check_and_get_column(columns[i]); if (nullable_column == nullptr) { @@ -217,7 +217,7 @@ class AggregateFunctionCountByEnum final void reset(AggregateDataPtr place) const override { this->data(place).reset(); } void merge(AggregateDataPtr __restrict place, ConstAggregateDataPtr rhs, - Arena* arena) const override { + Arena*) const override { this->data(place).merge(this->data(rhs)); } diff --git a/be/src/vec/aggregate_functions/aggregate_function_covar.cpp b/be/src/vec/aggregate_functions/aggregate_function_covar.cpp index b02d6ae0e12572..71d09f61de4302 100644 --- a/be/src/vec/aggregate_functions/aggregate_function_covar.cpp +++ b/be/src/vec/aggregate_functions/aggregate_function_covar.cpp @@ -53,14 +53,16 @@ AggregateFunctionPtr create_function_single_value(const String& name, AggregateFunctionPtr create_aggregate_function_covariance_samp(const std::string& name, const DataTypes& argument_types, - const bool result_is_nullable) { + const bool result_is_nullable, + const AggregateFunctionAttr& attr) { return create_function_single_value( name, argument_types, result_is_nullable, NOTNULLABLE); } AggregateFunctionPtr create_aggregate_function_covariance_pop(const std::string& name, const DataTypes& argument_types, - const bool result_is_nullable) { + const bool result_is_nullable, + const AggregateFunctionAttr& attr) { return create_function_single_value( name, argument_types, result_is_nullable, NOTNULLABLE); } diff --git a/be/src/vec/aggregate_functions/aggregate_function_distinct.cpp b/be/src/vec/aggregate_functions/aggregate_function_distinct.cpp index 9bb2954207babb..fce58b38688b28 100644 --- a/be/src/vec/aggregate_functions/aggregate_function_distinct.cpp +++ b/be/src/vec/aggregate_functions/aggregate_function_distinct.cpp @@ -83,7 +83,8 @@ const std::string DISTINCT_FUNCTION_PREFIX = "multi_distinct_"; void register_aggregate_function_combinator_distinct(AggregateFunctionSimpleFactory& factory) { AggregateFunctionCreator creator = [&](const std::string& name, 
const DataTypes& types, - const bool result_is_nullable) { + const bool result_is_nullable, + const AggregateFunctionAttr& attr) { // 1. we should get not nullable types; DataTypes nested_types(types.size()); std::transform(types.begin(), types.end(), nested_types.begin(), @@ -92,7 +93,7 @@ void register_aggregate_function_combinator_distinct(AggregateFunctionSimpleFact auto transform_arguments = function_combinator->transform_arguments(nested_types); auto nested_function_name = name.substr(DISTINCT_FUNCTION_PREFIX.size()); auto nested_function = factory.get(nested_function_name, transform_arguments, false, - BeExecVersionManager::get_newest_version()); + BeExecVersionManager::get_newest_version(), attr); return function_combinator->transform_aggregate_function(nested_function, types, result_is_nullable); }; diff --git a/be/src/vec/aggregate_functions/aggregate_function_foreach.cpp b/be/src/vec/aggregate_functions/aggregate_function_foreach.cpp index ab6d0142f6a8c0..c1cbcc89996caf 100644 --- a/be/src/vec/aggregate_functions/aggregate_function_foreach.cpp +++ b/be/src/vec/aggregate_functions/aggregate_function_foreach.cpp @@ -34,8 +34,9 @@ namespace doris::vectorized { void register_aggregate_function_combinator_foreach(AggregateFunctionSimpleFactory& factory) { - AggregateFunctionCreator creator = [&](const std::string& name, const DataTypes& types, - const bool result_is_nullable) -> AggregateFunctionPtr { + AggregateFunctionCreator creator = + [&](const std::string& name, const DataTypes& types, const bool result_is_nullable, + const AggregateFunctionAttr& attr) -> AggregateFunctionPtr { const std::string& suffix = AggregateFunctionForEach::AGG_FOREACH_SUFFIX; DataTypes transform_arguments; for (const auto& t : types) { @@ -46,7 +47,7 @@ void register_aggregate_function_combinator_foreach(AggregateFunctionSimpleFacto auto nested_function_name = name.substr(0, name.size() - suffix.size()); auto nested_function = factory.get(nested_function_name, 
transform_arguments, result_is_nullable, - BeExecVersionManager::get_newest_version(), false); + BeExecVersionManager::get_newest_version(), attr); if (!nested_function) { throw Exception( ErrorCode::INTERNAL_ERROR, diff --git a/be/src/vec/aggregate_functions/aggregate_function_group_array_intersect.cpp b/be/src/vec/aggregate_functions/aggregate_function_group_array_intersect.cpp index b3b9a8b9af47c6..24faf58b2e1ff9 100644 --- a/be/src/vec/aggregate_functions/aggregate_function_group_array_intersect.cpp +++ b/be/src/vec/aggregate_functions/aggregate_function_group_array_intersect.cpp @@ -70,7 +70,8 @@ inline AggregateFunctionPtr create_aggregate_function_group_array_intersect_impl } AggregateFunctionPtr create_aggregate_function_group_array_intersect( - const std::string& name, const DataTypes& argument_types, const bool result_is_nullable) { + const std::string& name, const DataTypes& argument_types, const bool result_is_nullable, + const AggregateFunctionAttr& attr) { assert_unary(name, argument_types); const DataTypePtr& argument_type = remove_nullable(argument_types[0]); diff --git a/be/src/vec/aggregate_functions/aggregate_function_group_concat.cpp b/be/src/vec/aggregate_functions/aggregate_function_group_concat.cpp index 9661b9c89d5700..286795ea2ba70c 100644 --- a/be/src/vec/aggregate_functions/aggregate_function_group_concat.cpp +++ b/be/src/vec/aggregate_functions/aggregate_function_group_concat.cpp @@ -28,7 +28,8 @@ const std::string AggregateFunctionGroupConcatImplStr::separator = ","; AggregateFunctionPtr create_aggregate_function_group_concat(const std::string& name, const DataTypes& argument_types, - const bool result_is_nullable) { + const bool result_is_nullable, + const AggregateFunctionAttr& attr) { if (argument_types.size() == 1) { return creator_without_type::create< AggregateFunctionGroupConcat>( diff --git a/be/src/vec/aggregate_functions/aggregate_function_histogram.cpp b/be/src/vec/aggregate_functions/aggregate_function_histogram.cpp index 
5b06af28399d71..fb2fa9c2513ec0 100644 --- a/be/src/vec/aggregate_functions/aggregate_function_histogram.cpp +++ b/be/src/vec/aggregate_functions/aggregate_function_histogram.cpp @@ -47,7 +47,8 @@ AggregateFunctionPtr create_agg_function_histogram(const DataTypes& argument_typ AggregateFunctionPtr create_aggregate_function_histogram(const std::string& name, const DataTypes& argument_types, - const bool result_is_nullable) { + const bool result_is_nullable, + const AggregateFunctionAttr& attr) { WhichDataType type(remove_nullable(argument_types[0])); #define DISPATCH(TYPE) \ diff --git a/be/src/vec/aggregate_functions/aggregate_function_histogram.h b/be/src/vec/aggregate_functions/aggregate_function_histogram.h index 25fc6957321586..1d2c5725ed370f 100644 --- a/be/src/vec/aggregate_functions/aggregate_function_histogram.h +++ b/be/src/vec/aggregate_functions/aggregate_function_histogram.h @@ -192,7 +192,7 @@ class AggregateFunctionHistogram final DataTypePtr get_return_type() const override { return std::make_shared(); } void add(AggregateDataPtr __restrict place, const IColumn** columns, ssize_t row_num, - Arena* arena) const override { + Arena*) const override { if constexpr (has_input_param) { Int32 input_max_num_buckets = assert_cast(columns[1])->get_element(row_num); @@ -220,7 +220,7 @@ class AggregateFunctionHistogram final void reset(AggregateDataPtr place) const override { this->data(place).reset(); } void merge(AggregateDataPtr __restrict place, ConstAggregateDataPtr rhs, - Arena* arena) const override { + Arena*) const override { this->data(place).merge(this->data(rhs)); } diff --git a/be/src/vec/aggregate_functions/aggregate_function_hll_union_agg.h b/be/src/vec/aggregate_functions/aggregate_function_hll_union_agg.h index 1cf6dc7f2a29a9..44835194eb4b88 100644 --- a/be/src/vec/aggregate_functions/aggregate_function_hll_union_agg.h +++ b/be/src/vec/aggregate_functions/aggregate_function_hll_union_agg.h @@ -122,7 +122,7 @@ class AggregateFunctionHLLUnion } 
void add(AggregateDataPtr __restrict place, const IColumn** columns, ssize_t row_num, - Arena* arena) const override { + Arena*) const override { this->data(place).add(columns[0], row_num); } diff --git a/be/src/vec/aggregate_functions/aggregate_function_java_udaf.h b/be/src/vec/aggregate_functions/aggregate_function_java_udaf.h index d314cba7a656a9..d16da1a34e66e3 100644 --- a/be/src/vec/aggregate_functions/aggregate_function_java_udaf.h +++ b/be/src/vec/aggregate_functions/aggregate_function_java_udaf.h @@ -148,6 +148,7 @@ struct AggregateJavaUdafData { jbyteArray arr = env->NewByteArray(len); env->SetByteArrayRegion(arr, 0, len, reinterpret_cast(serialize_data.data())); env->CallNonvirtualVoidMethod(executor_obj, executor_cl, executor_merge_id, place, arr); + RETURN_IF_ERROR(JniUtil::GetJniExceptionMsg(env)); jbyte* pBytes = env->GetByteArrayElements(arr, nullptr); env->ReleaseByteArrayElements(arr, pBytes, JNI_ABORT); env->DeleteLocalRef(arr); @@ -332,7 +333,7 @@ class AggregateJavaUdaf final } void add_batch(size_t batch_size, AggregateDataPtr* places, size_t place_offset, - const IColumn** columns, Arena* /*arena*/, bool /*agg_many*/) const override { + const IColumn** columns, Arena*, bool /*agg_many*/) const override { int64_t places_address = reinterpret_cast(places); Status st = this->data(_exec_place) .add(places_address, false, columns, 0, batch_size, argument_types, @@ -343,7 +344,7 @@ class AggregateJavaUdaf final } void add_batch_single_place(size_t batch_size, AggregateDataPtr place, const IColumn** columns, - Arena* /*arena*/) const override { + Arena*) const override { int64_t places_address = reinterpret_cast(place); Status st = this->data(_exec_place) .add(places_address, true, columns, 0, batch_size, argument_types, 0); @@ -354,7 +355,7 @@ class AggregateJavaUdaf final void add_range_single_place(int64_t partition_start, int64_t partition_end, int64_t frame_start, int64_t frame_end, AggregateDataPtr place, const IColumn** columns, - Arena* 
arena) const override { + Arena*) const override { frame_start = std::max(frame_start, partition_start); frame_end = std::min(frame_end, partition_end); int64_t places_address = reinterpret_cast(place); diff --git a/be/src/vec/aggregate_functions/aggregate_function_kurtosis.cpp b/be/src/vec/aggregate_functions/aggregate_function_kurtosis.cpp index 00ad1893eafcf6..a763721f3f4061 100644 --- a/be/src/vec/aggregate_functions/aggregate_function_kurtosis.cpp +++ b/be/src/vec/aggregate_functions/aggregate_function_kurtosis.cpp @@ -45,7 +45,8 @@ AggregateFunctionPtr type_dispatch_for_aggregate_function_kurt(const DataTypes& AggregateFunctionPtr create_aggregate_function_kurt(const std::string& name, const DataTypes& argument_types, - const bool result_is_nullable) { + const bool result_is_nullable, + const AggregateFunctionAttr& attr) { if (argument_types.size() != 1) { LOG(WARNING) << "aggregate function " << name << " requires exactly 1 argument"; return nullptr; diff --git a/be/src/vec/aggregate_functions/aggregate_function_map.cpp b/be/src/vec/aggregate_functions/aggregate_function_map.cpp index bcf3f2d66dfeaf..f289d885f48f52 100644 --- a/be/src/vec/aggregate_functions/aggregate_function_map.cpp +++ b/be/src/vec/aggregate_functions/aggregate_function_map.cpp @@ -32,7 +32,8 @@ AggregateFunctionPtr create_agg_function_map_agg(const DataTypes& argument_types AggregateFunctionPtr create_aggregate_function_map_agg(const std::string& name, const DataTypes& argument_types, - const bool result_is_nullable) { + const bool result_is_nullable, + const AggregateFunctionAttr& attr) { WhichDataType type(remove_nullable(argument_types[0])); #define DISPATCH(TYPE) \ diff --git a/be/src/vec/aggregate_functions/aggregate_function_map.h b/be/src/vec/aggregate_functions/aggregate_function_map.h index d56cbf21f31136..3ec25cdc706152 100644 --- a/be/src/vec/aggregate_functions/aggregate_function_map.h +++ b/be/src/vec/aggregate_functions/aggregate_function_map.h @@ -203,7 +203,7 @@ class 
AggregateFunctionMapAgg final } void add(AggregateDataPtr __restrict place, const IColumn** columns, ssize_t row_num, - Arena* arena) const override { + Arena*) const override { if (columns[0]->is_nullable()) { const auto& nullable_col = assert_cast(*columns[0]); @@ -234,7 +234,7 @@ class AggregateFunctionMapAgg final void reset(AggregateDataPtr place) const override { this->data(place).reset(); } void merge(AggregateDataPtr __restrict place, ConstAggregateDataPtr rhs, - Arena* arena) const override { + Arena*) const override { this->data(place).merge(this->data(rhs)); } @@ -248,7 +248,7 @@ class AggregateFunctionMapAgg final } void streaming_agg_serialize_to_column(const IColumn** columns, MutableColumnPtr& dst, - const size_t num_rows, Arena* arena) const override { + const size_t num_rows, Arena*) const override { auto& col = assert_cast(*dst); for (size_t i = 0; i != num_rows; ++i) { Field key, value; @@ -263,7 +263,7 @@ class AggregateFunctionMapAgg final } } - void deserialize_from_column(AggregateDataPtr places, const IColumn& column, Arena* arena, + void deserialize_from_column(AggregateDataPtr places, const IColumn& column, Arena*, size_t num_rows) const override { const auto& col = assert_cast(column); auto* data = &(this->data(places)); @@ -282,7 +282,7 @@ class AggregateFunctionMapAgg final } void deserialize_and_merge_from_column(AggregateDataPtr __restrict place, const IColumn& column, - Arena* arena) const override { + Arena*) const override { auto& col = assert_cast(column); const size_t num_rows = column.size(); for (size_t i = 0; i != num_rows; ++i) { @@ -293,7 +293,7 @@ class AggregateFunctionMapAgg final void deserialize_and_merge_from_column_range(AggregateDataPtr __restrict place, const IColumn& column, size_t begin, size_t end, - Arena* arena) const override { + Arena*) const override { DCHECK(end <= column.size() && begin <= end) << ", begin:" << begin << ", end:" << end << ", column.size():" << column.size(); const auto& col = 
assert_cast(column); @@ -304,7 +304,7 @@ class AggregateFunctionMapAgg final } void deserialize_and_merge_vec(const AggregateDataPtr* places, size_t offset, - AggregateDataPtr rhs, const IColumn* column, Arena* arena, + AggregateDataPtr rhs, const IColumn* column, Arena*, const size_t num_rows) const override { const auto& col = assert_cast(*column); for (size_t i = 0; i != num_rows; ++i) { @@ -314,8 +314,8 @@ class AggregateFunctionMapAgg final } void deserialize_and_merge_vec_selected(const AggregateDataPtr* places, size_t offset, - AggregateDataPtr rhs, const IColumn* column, - Arena* arena, const size_t num_rows) const override { + AggregateDataPtr rhs, const IColumn* column, Arena*, + const size_t num_rows) const override { const auto& col = assert_cast(*column); for (size_t i = 0; i != num_rows; ++i) { if (places[i]) { diff --git a/be/src/vec/aggregate_functions/aggregate_function_min_max.cpp b/be/src/vec/aggregate_functions/aggregate_function_min_max.cpp index 8aa8850a314d84..c1a72fd52bdd76 100644 --- a/be/src/vec/aggregate_functions/aggregate_function_min_max.cpp +++ b/be/src/vec/aggregate_functions/aggregate_function_min_max.cpp @@ -30,7 +30,8 @@ namespace doris::vectorized { template