diff --git a/.asf.yaml b/.asf.yaml index 1ef19652d399a5..3892aca2eddb77 100644 --- a/.asf.yaml +++ b/.asf.yaml @@ -70,16 +70,29 @@ github: dismiss_stale_reviews: true require_code_owner_reviews: true required_approving_review_count: 1 - branch-1.1-lts: + + branch-3.0: required_status_checks: # if strict is true, means "Require branches to be up to date before merging". strict: false contexts: - License Check - - required_pull_request_reviews: - dismiss_stale_reviews: true - required_approving_review_count: 1 + - Clang Formatter + - CheckStyle + - Build Broker + - ShellCheck + - Build Third Party Libraries (Linux) + - Build Third Party Libraries (macOS) + - FE UT (Doris FE UT) + - BE UT (Doris BE UT) + - Cloud UT (Doris Cloud UT) + - COMPILE (DORIS_COMPILE) + - P0 Regression (Doris Regression) + - External Regression (Doris External Regression) + - cloud_p0 (Doris Cloud Regression) + #required_pull_request_reviews: + # dismiss_stale_reviews: true + # required_approving_review_count: 1 branch-2.1: required_status_checks: @@ -124,17 +137,28 @@ github: dismiss_stale_reviews: true required_approving_review_count: 1 + branch-1.1-lts: + required_status_checks: + # if strict is true, means "Require branches to be up to date before merging". + strict: false + contexts: + - License Check + + required_pull_request_reviews: + dismiss_stale_reviews: true + required_approving_review_count: 1 + collaborators: - LemonLiTree - Yukang-Lian - TangSiyang2001 - - Lchangliang - freemandealer - shuke987 - wm1581066 - KassieZ - yujun777 - - gavinchou + - doris-robot + - LiBinfeng-01 notifications: pullrequests_status: commits@doris.apache.org diff --git a/.dlc.json b/.dlc.json deleted file mode 100644 index 99e425420b0b43..00000000000000 --- a/.dlc.json +++ /dev/null @@ -1,19 +0,0 @@ -{ - "ignorePatterns": [ - { - "pattern": "^http://localhost" - }, - { - "pattern": "^https://twitter.com*" - } - ], - "timeout": "10s", - "retryOn429": true, - "retryCount": 10, - "fallbackRetryDelay": "1000s", - "aliveStatusCodes": [ - 200, - 401, - 403 - ] -} \ No newline at end of file diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index 98febd914c2724..c7c41345761643 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -1,6 +1,39 @@ -## Proposed changes +### What problem does this PR solve? Issue Number: close #xxx - +Related PR: #xxx + +Problem Summary: + +### Release note + +None + +### Check List (For Author) + +- Test + - [ ] Regression test + - [ ] Unit Test + - [ ] Manual test (add detailed scripts or steps below) + - [ ] No need to test or manual test. Explain why: + - [ ] This is a refactor/code format and no logic has been changed. + - [ ] Previous test can cover this change. + - [ ] No code files have been changed. + - [ ] Other reason + +- Behavior changed: + - [ ] No. + - [ ] Yes. + +- Does this need documentation? + - [ ] No. + - [ ] Yes. + +### Check List (For Reviewer who merge this PR) + +- [ ] Confirm the release note +- [ ] Confirm test cases +- [ ] Confirm document +- [ ] Add branch pick label diff --git a/.github/workflows/auto-cherry-pick.yml b/.github/workflows/auto-cherry-pick.yml new file mode 100644 index 00000000000000..f76c88934fdc9e --- /dev/null +++ b/.github/workflows/auto-cherry-pick.yml @@ -0,0 +1,71 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +name: Auto Cherry-Pick to Branch + +on: + pull_request_target: + types: + - closed + branches: + - master +permissions: + checks: write + contents: write + pull-requests: write +jobs: + auto_cherry_pick: + runs-on: ubuntu-latest + if: ${{ (contains(github.event.pull_request.labels.*.name, 'dev/3.0.x') || contains(github.event.pull_request.labels.*.name, 'dev/2.1.x')) && github.event.pull_request.merged == true }} + steps: + - name: Checkout repository + uses: actions/checkout@v3 + + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: '3.x' + + - name: Install dependencies + run: | + pip install PyGithub + - name: Check SHA + run: | + expected_sha="4e4c0d7689b765c7f0677d75d23222555afa9286af46cf77ced66fa247a298d9f8a8c86830d0ce55f70e5f09532b54fbafee040c0343833077cbc7e214d486d2" + calculated_sha=$(sha512sum tools/auto-pick-script.py | awk '{ print $1 }') + if [ "$calculated_sha" != "$expected_sha" ]; then + echo "SHA mismatch! Expected: $expected_sha, but got: $calculated_sha" + exit 1 + else + echo "SHA matches: $calculated_sha" + fi + - name: Auto cherry-pick to branch-3.0 + if: ${{ contains(github.event.pull_request.labels.*.name, 'dev/3.0.x') }} + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + REPO_NAME: ${{ github.repository }} + CONFLICT_LABEL: cherry-pick-conflict-in-3.0 + run: | + python tools/auto-pick-script.py ${{ github.event.pull_request.number }} branch-3.0 + - name: Auto cherry-pick to branch-2.1 + if: ${{ contains(github.event.pull_request.labels.*.name, 'dev/2.1.x') }} + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + REPO_NAME: ${{ github.repository }} + CONFLICT_LABEL: cherry-pick-conflict-in-2.1.x + run: | + python tools/auto-pick-script.py ${{ github.event.pull_request.number }} branch-2.1 diff --git a/.github/workflows/build-extension.yml b/.github/workflows/build-extension.yml index 14998f24144b73..d12fe7d9d71387 100644 --- a/.github/workflows/build-extension.yml +++ b/.github/workflows/build-extension.yml @@ -20,7 +20,9 @@ name: Build Extensions on: pull_request: - + workflow_dispatch: + issue_comment: + types: [ created ] concurrency: group: ${{ github.ref }} (Build Extensions) cancel-in-progress: true @@ -29,6 +31,12 @@ jobs: changes: name: Detect Changes runs-on: ubuntu-latest + if: | + (github.event_name == 'pull_request') || + (github.event_name == 'issue_comment' && + github.event.comment.body == 'run buildall' && + github.actor == 'doris-robot' && + github.event.issue.user.login == 'github-actions[bot]') outputs: broker_changes: ${{ steps.filter.outputs.broker_changes }} docs_changes: ${{ steps.filter.outputs.docs_changes }} diff --git a/.github/workflows/build-thirdparty.yml b/.github/workflows/build-thirdparty.yml index 991b5089035699..7bc5d8a8182a71 100644 --- a/.github/workflows/build-thirdparty.yml +++ b/.github/workflows/build-thirdparty.yml @@ -19,6 +19,9 @@ name: Build Third Party Libraries on: pull_request: + workflow_dispatch: + issue_comment: + 
types: [ created ] concurrency: group: ${{ github.ref }} (Build Third Party Libraries) @@ -28,6 +31,12 @@ jobs: changes: name: Detect Changes runs-on: ubuntu-latest + if: | + (github.event_name == 'pull_request') || + (github.event_name == 'issue_comment' && + github.event.comment.body == 'run buildall' && + github.actor == 'doris-robot' && + github.event.issue.user.login == 'github-actions[bot]') outputs: thirdparty_changes: ${{ steps.filter.outputs.thirdparty_changes }} steps: diff --git a/.github/workflows/checkstyle.yaml b/.github/workflows/checkstyle.yaml index 13ab46b2cd50b2..a53a19d82649b9 100644 --- a/.github/workflows/checkstyle.yaml +++ b/.github/workflows/checkstyle.yaml @@ -20,11 +20,20 @@ name: FE Code Style Checker on: pull_request: + workflow_dispatch: + issue_comment: + types: [ created ] jobs: java-checkstyle: name: "CheckStyle" runs-on: ubuntu-latest + if: | + (github.event_name == 'pull_request') || + (github.event_name == 'issue_comment' && + github.event.comment.body == 'run buildall' && + github.actor == 'doris-robot' && + github.event.issue.user.login == 'github-actions[bot]') steps: - name: Checkout uses: actions/checkout@v3 diff --git a/.github/workflows/clang-format.yml b/.github/workflows/clang-format.yml index adc77450d78c01..a81d64e4e2b1f1 100644 --- a/.github/workflows/clang-format.yml +++ b/.github/workflows/clang-format.yml @@ -19,12 +19,22 @@ --- name: Code Formatter -on: [push, pull_request_target] - +on: + pull_request: + pull_request_target: + workflow_dispatch: + issue_comment: + types: [ created ] jobs: clang-format: name: "Clang Formatter" runs-on: ubuntu-latest + if: | + (github.event_name == 'pull_request') || (github.event_name == 'pull_request_target') || + (github.event_name == 'issue_comment' && + github.event.comment.body == 'run buildall' && + github.actor == 'doris-robot' && + github.event.issue.user.login == 'github-actions[bot]') steps: - name: "Checkout ${{ github.ref }} ( ${{ github.sha }} )" if: ${{ github.event_name != 'pull_request_target' }} diff --git a/.github/workflows/code-checks.yml b/.github/workflows/code-checks.yml index 4fe4090b516a1e..6aaa83f47cd4ff 100644 --- a/.github/workflows/code-checks.yml +++ b/.github/workflows/code-checks.yml @@ -105,7 +105,7 @@ jobs: popd export PATH="${DEFAULT_DIR}/ldb-toolchain/bin/:$(pwd)/thirdparty/installed/bin/:${PATH}" - DISABLE_BE_JAVA_EXTENSIONS=ON DO_NOT_CHECK_JAVA_ENV=ON DORIS_TOOLCHAIN=clang ENABLE_PCH=OFF OUTPUT_BE_BINARY=0 ./build.sh --be --cloud + DISABLE_BE_JAVA_EXTENSIONS=ON DO_NOT_CHECK_JAVA_ENV=ON DORIS_TOOLCHAIN=clang ENABLE_PCH=OFF OUTPUT_BE_BINARY=0 ./build.sh --be fi echo "should_check=${{ steps.filter.outputs.cpp_changes }}" >>${GITHUB_OUTPUT} diff --git a/.github/workflows/deadlink-check.yml b/.github/workflows/deadlink-check.yml deleted file mode 100644 index 9292e17c101154..00000000000000 --- a/.github/workflows/deadlink-check.yml +++ /dev/null @@ -1,37 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -name: Dead Link Checker - -on: - schedule: - - cron: '30 3,14 * * *' - -concurrency: - group: dlc-${{ github.event.pull_request.number || github.ref }} - cancel-in-progress: true - -jobs: - CheckDeadLinks: - runs-on: ubuntu-latest - timeout-minutes: 30 - steps: - - uses: actions/checkout@v3 - - run: sudo npm install -g markdown-link-check@3.10.0 - - run: | - for file in $(find . -name "*.md"); do - markdown-link-check -c .dlc.json -q "$file" - done \ No newline at end of file diff --git a/.github/workflows/license-eyes.yml b/.github/workflows/license-eyes.yml index 890efb2d9d1196..c17081fc75b9e8 100644 --- a/.github/workflows/license-eyes.yml +++ b/.github/workflows/license-eyes.yml @@ -22,10 +22,21 @@ on: push: branches: - master + workflow_dispatch: + issue_comment: + types: [ created ] + jobs: license-check: name: "License Check" runs-on: ubuntu-latest + if: | + (github.event_name == 'pull_request_target') || + (github.event_name == 'push' && github.ref == 'refs/heads/master') || + (github.event_name == 'issue_comment' && + github.event.comment.body == 'run buildall' && + github.actor == 'doris-robot' && + github.event.issue.user.login == 'github-actions[bot]') steps: - name: "Checkout ${{ github.ref }} ( ${{ github.sha }} )" if: ${{ github.event_name != 'pull_request_target' }} diff --git a/.github/workflows/scope-label.yml b/.github/workflows/scope-label.yml index ba88dbbb02f7c0..6a9b094a84afbb 100644 --- a/.github/workflows/scope-label.yml +++ b/.github/workflows/scope-label.yml @@ -18,11 +18,12 @@ --- name: Add Scope Labeler -on: - pull_request_target: - types: - - opened - - synchronize +# This action has some error, skip it temporarily +#on: +# pull_request_target: +# types: +# - opened +# - synchronize jobs: process: diff --git a/be/CMakeLists.txt b/be/CMakeLists.txt index f554ba6053a5e6..1d79048f96511c 100644 --- a/be/CMakeLists.txt +++ b/be/CMakeLists.txt @@ -299,8 +299,6 @@ if (COMPILER_CLANG) -Wno-implicit-float-conversion -Wno-implicit-int-conversion -Wno-sign-conversion - -Wno-missing-field-initializers - -Wno-unused-const-variable -Wno-shorten-64-to-32) if (USE_LIBCPP) add_compile_options($<$:-stdlib=libc++>) @@ -344,6 +342,10 @@ if (ENABLE_INJECTION_POINT) set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -DENABLE_INJECTION_POINT") endif() +if (ENABLE_CACHE_LOCK_DEBUG) + set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -DENABLE_CACHE_LOCK_DEBUG") +endif() + # Enable memory tracker, which allows BE to limit the memory of tasks such as query, load, # and compaction,and observe the memory of BE through be_ip:http_port/MemTracker. 
# Adding the option `USE_MEM_TRACKER=OFF sh build.sh` when compiling can turn off the memory tracker, @@ -784,6 +786,7 @@ install(DIRECTORY DESTINATION ${OUTPUT_DIR}/conf) install(FILES ${BASE_DIR}/../bin/start_be.sh ${BASE_DIR}/../bin/stop_be.sh + ${BASE_DIR}/../tools/jeprof PERMISSIONS OWNER_READ OWNER_WRITE OWNER_EXECUTE GROUP_READ GROUP_WRITE GROUP_EXECUTE WORLD_READ WORLD_EXECUTE diff --git a/be/src/agent/agent_server.cpp b/be/src/agent/agent_server.cpp index 9d36148b64f305..361a8ab93a90a6 100644 --- a/be/src/agent/agent_server.cpp +++ b/be/src/agent/agent_server.cpp @@ -33,6 +33,7 @@ #include "agent/utils.h" #include "agent/workload_group_listener.h" #include "agent/workload_sched_policy_listener.h" +#include "cloud/config.h" #include "common/config.h" #include "common/logging.h" #include "common/status.h" @@ -193,7 +194,7 @@ void AgentServer::start_workers(StorageEngine& engine, ExecEnv* exec_env) { "REPORT_DISK_STATE", _master_info, config::report_disk_state_interval_seconds, [&engine, &master_info = _master_info] { report_disk_callback(engine, master_info); })); _report_workers.push_back(std::make_unique( - "REPORT_OLAP_TABLE", _master_info, config::report_tablet_interval_seconds,[&engine, &master_info = _master_info] { report_tablet_callback(engine, master_info); })); + "REPORT_OLAP_TABLET", _master_info, config::report_tablet_interval_seconds,[&engine, &master_info = _master_info] { report_tablet_callback(engine, master_info); })); // clang-format on } @@ -211,6 +212,10 @@ void AgentServer::cloud_start_workers(CloudStorageEngine& engine, ExecEnv* exec_ "CALC_DBM_TASK", config::calc_delete_bitmap_worker_count, [&engine](auto&& task) { return calc_delete_bitmap_callback(engine, task); }); + // in cloud mode, dropping a tablet only cleans up the cache, so a single thread is enough + _workers[TTaskType::DROP] = std::make_unique( + "DROP_TABLE", 1, [&engine](auto&& task) { return drop_tablet_callback(engine, task); }); + _report_workers.push_back(std::make_unique( "REPORT_TASK", _master_info, config::report_task_interval_seconds, [&master_info = _master_info] { report_task_callback(master_info); })); @@ -218,6 +223,14 @@ void AgentServer::cloud_start_workers(CloudStorageEngine& engine, ExecEnv* exec_ _report_workers.push_back(std::make_unique( "REPORT_DISK_STATE", _master_info, config::report_disk_state_interval_seconds, [&engine, &master_info = _master_info] { report_disk_callback(engine, master_info); })); + + if (config::enable_cloud_tablet_report) { + _report_workers.push_back(std::make_unique( + "REPORT_OLAP_TABLET", _master_info, config::report_tablet_interval_seconds, + [&engine, &master_info = _master_info] { + report_tablet_callback(engine, master_info); + })); + } } // TODO(lingbin): each task in the batch may have it own status or FE must check and diff --git a/be/src/agent/be_exec_version_manager.cpp b/be/src/agent/be_exec_version_manager.cpp index 0bdb55f7bc341e..bfd0745e3166b2 100644 --- a/be/src/agent/be_exec_version_manager.cpp +++ b/be/src/agent/be_exec_version_manager.cpp @@ -34,6 +34,13 @@ Status BeExecVersionManager::check_be_exec_version(int be_exec_version) { int BeExecVersionManager::get_function_compatibility(int be_exec_version, std::string function_name) { + if (_function_restrict_map.contains(function_name) && be_exec_version != get_newest_version()) { + throw Exception(Status::InternalError( + "function {} does not support old be exec version, maybe it's because doris is " + "doing a rolling upgrade. 
newest_version={}, input_be_exec_version={}", + function_name, get_newest_version(), be_exec_version)); + } + auto it = _function_change_map.find(function_name); if (it == _function_change_map.end()) { // 0 means no compatibility issues need to be dealt with @@ -82,7 +89,7 @@ void BeExecVersionManager::check_function_compatibility(int current_be_exec_vers * 3: start from doris 2.0.0 (by some mistakes) * a. aggregation function do not serialize bitmap to string. * b. support window funnel mode. - * 4/5: start from doris 2.1.0 + * 4: start from doris 2.1.0 * a. ignore this line, window funnel mode should be enabled from 2.0. * b. array contains/position/countequal function return nullable in less situations. * c. cleared old version of Version 2. @@ -92,15 +99,22 @@ void BeExecVersionManager::check_function_compatibility(int current_be_exec_vers * g. do local merge of remote runtime filter * h. "now": ALWAYS_NOT_NULLABLE -> DEPEND_ON_ARGUMENTS * - * 7: start from doris 3.0.0 + * 5: start from doris 3.0.0 + * a. change some agg function nullable property: PR #37215 + * + * 6: start from doris 3.0.1 and 2.1.6 * a. change the impl of percentile (need fix) * b. clear old version of version 3->4 * c. change FunctionIsIPAddressInRange from AlwaysNotNullable to DependOnArguments - * d. change some agg function nullable property: PR #37215 - * e. change variant serde to fix PR #38413 - * f. support const column in serialize/deserialize function: PR #41175 + * d. change variant serde to fix PR #38413 + * + * 7: start from doris 3.0.2 + * a. window funnel logic change + * b. support const column in serialize/deserialize function: PR #41175 */ -const int BeExecVersionManager::max_be_exec_version = 7; + +const int BeExecVersionManager::max_be_exec_version = 8; const int BeExecVersionManager::min_be_exec_version = 0; std::map> BeExecVersionManager::_function_change_map {}; +std::set BeExecVersionManager::_function_restrict_map; } // namespace doris diff --git a/be/src/agent/be_exec_version_manager.h b/be/src/agent/be_exec_version_manager.h index a51fb8e36b4008..f4158a40152068 100644 --- a/be/src/agent/be_exec_version_manager.h +++ b/be/src/agent/be_exec_version_manager.h @@ -25,7 +25,6 @@ namespace doris { -constexpr static int AGG_FUNCTION_NEW_WINDOW_FUNNEL = 6; constexpr inline int BITMAP_SERDE = 3; constexpr inline int USE_NEW_SERDE = 4; // release on DORIS version 2.1 constexpr inline int OLD_WAL_SERDE = 3; // use to solve compatibility issues, see pr #32299 @@ -34,7 +33,7 @@ constexpr inline int VARIANT_SERDE = 6; // change variant serde to fix P constexpr inline int AGGREGATION_2_1_VERSION = 6; // some aggregation changed the data format after this version constexpr inline int USE_CONST_SERDE = - 7; // support const column in serialize/deserialize function: PR #41175 + 8; // support const column in serialize/deserialize function: PR #41175 class BeExecVersionManager { public: @@ -59,11 +58,17 @@ class BeExecVersionManager { _function_change_map[function_name].insert(breaking_old_version); } + static void registe_restrict_function_compatibility(std::string function_name) { + _function_restrict_map.insert(function_name); + } + private: static const int max_be_exec_version; static const int min_be_exec_version; // [function name] -> [breaking change start version] static std::map> _function_change_map; + // these functions must be called with the newest be exec version + static std::set _function_restrict_map; }; } // namespace doris diff --git a/be/src/agent/heartbeat_server.cpp 
b/be/src/agent/heartbeat_server.cpp index 146604aaab20f4..78002ed08fe0df 100644 --- a/be/src/agent/heartbeat_server.cpp +++ b/be/src/agent/heartbeat_server.cpp @@ -26,6 +26,7 @@ #include #include +#include "cloud/cloud_tablet_mgr.h" #include "cloud/config.h" #include "common/config.h" #include "common/status.h" @@ -275,6 +276,11 @@ Status HeartbeatServer::_heartbeat(const TMasterInfo& master_info) { LOG(INFO) << "set config cloud_unique_id " << master_info.cloud_unique_id << " " << st; } + if (master_info.__isset.tablet_report_inactive_duration_ms) { + doris::g_tablet_report_inactive_duration_ms = + master_info.tablet_report_inactive_duration_ms; + } + if (need_report) { LOG(INFO) << "Master FE is changed or restarted. report tablet and disk info immediately"; _engine.notify_listeners(); diff --git a/be/src/agent/task_worker_pool.cpp b/be/src/agent/task_worker_pool.cpp index 5906511ce15794..8a034001378f6f 100644 --- a/be/src/agent/task_worker_pool.cpp +++ b/be/src/agent/task_worker_pool.cpp @@ -48,6 +48,8 @@ #include "cloud/cloud_delete_task.h" #include "cloud/cloud_engine_calc_delete_bitmap_task.h" #include "cloud/cloud_schema_change_job.h" +#include "cloud/cloud_tablet_mgr.h" +#include "cloud/config.h" #include "common/config.h" #include "common/logging.h" #include "common/status.h" @@ -116,6 +118,10 @@ bool register_task_info(const TTaskType::type task_type, int64_t signature) { // no need to report task of these types return true; } + if (task_type == TTaskType::type::DROP && config::is_cloud_mode()) { + // cloud no need to report drop task status + return true; + } if (signature == -1) { // No need to report task with unintialized signature return true; @@ -1134,6 +1140,46 @@ void report_tablet_callback(StorageEngine& engine, const TMasterInfo& master_inf } } +void report_tablet_callback(CloudStorageEngine& engine, const TMasterInfo& master_info) { + // Random sleep 1~5 seconds before doing report. + // In order to avoid the problem that the FE receives many report requests at the same time + // and can not be processed. 
+ if (config::report_random_wait) { + random_sleep(5); + } + + TReportRequest request; + request.__set_backend(BackendOptions::get_local_backend()); + request.__isset.tablets = true; + + increase_report_version(); + uint64_t report_version; + uint64_t total_num_tablets = 0; + for (int i = 0; i < 5; i++) { + request.tablets.clear(); + report_version = s_report_version; + engine.tablet_mgr().build_all_report_tablets_info(&request.tablets, &total_num_tablets); + if (report_version == s_report_version) { + break; + } + } + + if (report_version < s_report_version) { + LOG(WARNING) << "report version " << report_version << " change to " << s_report_version; + DorisMetrics::instance()->report_all_tablets_requests_skip->increment(1); + return; + } + + request.__set_report_version(report_version); + request.__set_num_tablets(total_num_tablets); + + bool succ = handle_report(request, master_info, "tablet"); + report_tablet_total << 1; + if (!succ) [[unlikely]] { + report_tablet_failed << 1; + } +} + void upload_callback(StorageEngine& engine, ExecEnv* env, const TAgentTaskRequest& req) { const auto& upload_request = req.upload_req; @@ -1390,15 +1436,7 @@ void update_s3_resource(const TStorageResource& param, io::RemoteFileSystemSPtr DCHECK_EQ(existed_fs->type(), io::FileSystemType::S3) << param.id << ' ' << param.name; auto client = static_cast(existed_fs.get())->client_holder(); auto new_s3_conf = S3Conf::get_s3_conf(param.s3_storage_param); - S3ClientConf conf { - .endpoint {}, - .region {}, - .ak = std::move(new_s3_conf.client_conf.ak), - .sk = std::move(new_s3_conf.client_conf.sk), - .token = std::move(new_s3_conf.client_conf.token), - .bucket {}, - .provider = new_s3_conf.client_conf.provider, - }; + S3ClientConf conf = std::move(new_s3_conf.client_conf); st = client->reset(conf); fs = std::move(existed_fs); } @@ -1406,7 +1444,7 @@ void update_s3_resource(const TStorageResource& param, io::RemoteFileSystemSPtr if (!st.ok()) { LOG(WARNING) << "update s3 resource failed: " << st; } else { - LOG_INFO("successfully update hdfs resource") + LOG_INFO("successfully update s3 resource") .tag("resource_id", param.id) .tag("resource_name", param.name); put_storage_resource(param.id, {std::move(fs)}, param.version); @@ -1610,6 +1648,21 @@ void drop_tablet_callback(StorageEngine& engine, const TAgentTaskRequest& req) { remove_task_info(req.task_type, req.signature); } +void drop_tablet_callback(CloudStorageEngine& engine, const TAgentTaskRequest& req) { + const auto& drop_tablet_req = req.drop_tablet_req; + DBUG_EXECUTE_IF("WorkPoolCloudDropTablet.drop_tablet_callback.failed", { + LOG_WARNING("WorkPoolCloudDropTablet.drop_tablet_callback.failed") + .tag("tablet_id", drop_tablet_req.tablet_id); + return; + }); + // 1. erase lru from tablet mgr + // TODO(dx) clean tablet file cache + // get tablet's info(such as cachekey, tablet id, rsid) + engine.tablet_mgr().erase_tablet(drop_tablet_req.tablet_id); + // 2. 
gen clean file cache task + return; +} + void push_callback(StorageEngine& engine, const TAgentTaskRequest& req) { const auto& push_req = req.push_req; diff --git a/be/src/agent/task_worker_pool.h b/be/src/agent/task_worker_pool.h index f51d6c2a4c0dc0..c50ac57ffe9b74 100644 --- a/be/src/agent/task_worker_pool.h +++ b/be/src/agent/task_worker_pool.h @@ -155,6 +155,8 @@ void create_tablet_callback(StorageEngine& engine, const TAgentTaskRequest& req) void drop_tablet_callback(StorageEngine& engine, const TAgentTaskRequest& req); +void drop_tablet_callback(CloudStorageEngine& engine, const TAgentTaskRequest& req); + void clear_transaction_task_callback(StorageEngine& engine, const TAgentTaskRequest& req); void push_callback(StorageEngine& engine, const TAgentTaskRequest& req); @@ -188,6 +190,8 @@ void report_disk_callback(CloudStorageEngine& engine, const TMasterInfo& master_ void report_tablet_callback(StorageEngine& engine, const TMasterInfo& master_info); +void report_tablet_callback(CloudStorageEngine& engine, const TMasterInfo& master_info); + void calc_delete_bitmap_callback(CloudStorageEngine& engine, const TAgentTaskRequest& req); } // namespace doris diff --git a/be/src/apache-orc b/be/src/apache-orc index 903ea6ccdc463b..db01184f765c03 160000 --- a/be/src/apache-orc +++ b/be/src/apache-orc @@ -1 +1 @@ -Subproject commit 903ea6ccdc463b8a17af2604975107ba7d895380 +Subproject commit db01184f765c03496e4107bd3ac37c077ac4bc5f diff --git a/be/src/cloud/cloud_base_compaction.cpp b/be/src/cloud/cloud_base_compaction.cpp index f431eaf850bbd1..88d83000e95dfa 100644 --- a/be/src/cloud/cloud_base_compaction.cpp +++ b/be/src/cloud/cloud_base_compaction.cpp @@ -124,7 +124,8 @@ Status CloudBaseCompaction::prepare_compact() { for (auto& rs : _input_rowsets) { _input_row_num += rs->num_rows(); _input_segments += rs->num_segments(); - _input_rowsets_size += rs->data_disk_size(); + _input_rowsets_data_size += rs->data_disk_size(); + _input_rowsets_total_size += rs->total_disk_size(); } LOG_INFO("start CloudBaseCompaction, tablet_id={}, range=[{}-{}]", _tablet->tablet_id(), _input_rowsets.front()->start_version(), _input_rowsets.back()->end_version()) @@ -132,7 +133,9 @@ Status CloudBaseCompaction::prepare_compact() { .tag("input_rowsets", _input_rowsets.size()) .tag("input_rows", _input_row_num) .tag("input_segments", _input_segments) - .tag("input_data_size", _input_rowsets_size); + .tag("input_rowsets_data_size", _input_rowsets_data_size) + .tag("input_rowsets_index_size", _input_rowsets_index_size) + .tag("input_rowsets_total_size", _input_rowsets_total_size); return st; } @@ -270,17 +273,21 @@ Status CloudBaseCompaction::execute_compact() { .tag("input_rowsets", _input_rowsets.size()) .tag("input_rows", _input_row_num) .tag("input_segments", _input_segments) - .tag("input_data_size", _input_rowsets_size) + .tag("input_rowsets_data_size", _input_rowsets_data_size) + .tag("input_rowsets_index_size", _input_rowsets_index_size) + .tag("input_rowsets_total", _input_rowsets_total_size) .tag("output_rows", _output_rowset->num_rows()) .tag("output_segments", _output_rowset->num_segments()) - .tag("output_data_size", _output_rowset->data_disk_size()); + .tag("output_rowset_data_size", _output_rowset->data_disk_size()) + .tag("output_rowset_index_size", _output_rowset->index_disk_size()) + .tag("output_rowset_total_size", _output_rowset->total_disk_size()); //_compaction_succeed = true; _state = CompactionState::SUCCESS; DorisMetrics::instance()->base_compaction_deltas_total->increment(_input_rowsets.size()); 
- DorisMetrics::instance()->base_compaction_bytes_total->increment(_input_rowsets_size); - base_output_size << _output_rowset->data_disk_size(); + DorisMetrics::instance()->base_compaction_bytes_total->increment(_input_rowsets_total_size); + base_output_size << _output_rowset->total_disk_size(); return Status::OK(); } @@ -302,8 +309,8 @@ Status CloudBaseCompaction::modify_rowsets() { compaction_job->set_output_cumulative_point(cloud_tablet()->cumulative_layer_point()); compaction_job->set_num_input_rows(_input_row_num); compaction_job->set_num_output_rows(_output_rowset->num_rows()); - compaction_job->set_size_input_rowsets(_input_rowsets_size); - compaction_job->set_size_output_rowsets(_output_rowset->data_disk_size()); + compaction_job->set_size_input_rowsets(_input_rowsets_total_size); + compaction_job->set_size_output_rowsets(_output_rowset->total_disk_size()); compaction_job->set_num_input_segments(_input_segments); compaction_job->set_num_output_segments(_output_rowset->num_segments()); compaction_job->set_num_input_rowsets(_input_rowsets.size()); diff --git a/be/src/cloud/cloud_compaction_action.cpp b/be/src/cloud/cloud_compaction_action.cpp index 13161c32c8e20e..481f7b589fe523 100644 --- a/be/src/cloud/cloud_compaction_action.cpp +++ b/be/src/cloud/cloud_compaction_action.cpp @@ -149,8 +149,9 @@ Status CloudCompactionAction::_handle_run_compaction(HttpRequest* req, std::stri compaction_type != PARAM_COMPACTION_FULL) { return Status::NotSupported("The compaction type '{}' is not supported", compaction_type); } - - CloudTabletSPtr tablet = DORIS_TRY(_engine.tablet_mgr().get_tablet(tablet_id)); + bool sync_delete_bitmap = compaction_type != PARAM_COMPACTION_FULL; + CloudTabletSPtr tablet = + DORIS_TRY(_engine.tablet_mgr().get_tablet(tablet_id, false, sync_delete_bitmap)); if (tablet == nullptr) { return Status::NotFound("Tablet not found. 
tablet_id={}", tablet_id); } diff --git a/be/src/cloud/cloud_cumulative_compaction.cpp b/be/src/cloud/cloud_cumulative_compaction.cpp index aad1bd7bfe7d2d..6b74e70ee1b4b8 100644 --- a/be/src/cloud/cloud_cumulative_compaction.cpp +++ b/be/src/cloud/cloud_cumulative_compaction.cpp @@ -164,7 +164,9 @@ Status CloudCumulativeCompaction::prepare_compact() { for (auto& rs : _input_rowsets) { _input_row_num += rs->num_rows(); _input_segments += rs->num_segments(); - _input_rowsets_size += rs->data_disk_size(); + _input_rowsets_data_size += rs->data_disk_size(); + _input_rowsets_index_size += rs->index_disk_size(); + _input_rowsets_total_size += rs->total_disk_size(); } LOG_INFO("start CloudCumulativeCompaction, tablet_id={}, range=[{}-{}]", _tablet->tablet_id(), _input_rowsets.front()->start_version(), _input_rowsets.back()->end_version()) @@ -172,7 +174,9 @@ Status CloudCumulativeCompaction::prepare_compact() { .tag("input_rowsets", _input_rowsets.size()) .tag("input_rows", _input_row_num) .tag("input_segments", _input_segments) - .tag("input_data_size", _input_rowsets_size) + .tag("input_rowsets_data_size", _input_rowsets_data_size) + .tag("input_rowsets_index_size", _input_rowsets_index_size) + .tag("input_rowsets_total_size", _input_rowsets_total_size) .tag("tablet_max_version", cloud_tablet()->max_version_unlocked()) .tag("cumulative_point", cloud_tablet()->cumulative_layer_point()) .tag("num_rowsets", cloud_tablet()->fetch_add_approximate_num_rowsets(0)) @@ -201,10 +205,14 @@ Status CloudCumulativeCompaction::execute_compact() { .tag("input_rowsets", _input_rowsets.size()) .tag("input_rows", _input_row_num) .tag("input_segments", _input_segments) - .tag("input_data_size", _input_rowsets_size) + .tag("input_rowsets_data_size", _input_rowsets_data_size) + .tag("input_rowsets_index_size", _input_rowsets_index_size) + .tag("input_rowsets_total_size", _input_rowsets_total_size) .tag("output_rows", _output_rowset->num_rows()) .tag("output_segments", _output_rowset->num_segments()) - .tag("output_data_size", _output_rowset->data_disk_size()) + .tag("output_rowset_data_size", _output_rowset->data_disk_size()) + .tag("output_rowset_index_size", _output_rowset->index_disk_size()) + .tag("output_rowset_total_size", _output_rowset->total_disk_size()) .tag("tablet_max_version", _tablet->max_version_unlocked()) .tag("cumulative_point", cloud_tablet()->cumulative_layer_point()) .tag("num_rowsets", cloud_tablet()->fetch_add_approximate_num_rowsets(0)) @@ -213,8 +221,9 @@ Status CloudCumulativeCompaction::execute_compact() { _state = CompactionState::SUCCESS; DorisMetrics::instance()->cumulative_compaction_deltas_total->increment(_input_rowsets.size()); - DorisMetrics::instance()->cumulative_compaction_bytes_total->increment(_input_rowsets_size); - cumu_output_size << _output_rowset->data_disk_size(); + DorisMetrics::instance()->cumulative_compaction_bytes_total->increment( + _input_rowsets_total_size); + cumu_output_size << _output_rowset->total_disk_size(); return Status::OK(); } @@ -243,8 +252,8 @@ Status CloudCumulativeCompaction::modify_rowsets() { compaction_job->set_output_cumulative_point(new_cumulative_point); compaction_job->set_num_input_rows(_input_row_num); compaction_job->set_num_output_rows(_output_rowset->num_rows()); - compaction_job->set_size_input_rowsets(_input_rowsets_size); - compaction_job->set_size_output_rowsets(_output_rowset->data_disk_size()); + compaction_job->set_size_input_rowsets(_input_rowsets_total_size); + 
compaction_job->set_size_output_rowsets(_output_rowset->total_disk_size()); compaction_job->set_num_input_segments(_input_segments); compaction_job->set_num_output_segments(_output_rowset->num_segments()); compaction_job->set_num_input_rowsets(_input_rowsets.size()); @@ -351,14 +360,15 @@ Status CloudCumulativeCompaction::modify_rowsets() { stats.num_rows(), stats.data_size()); } } - if (_tablet->keys_type() == KeysType::UNIQUE_KEYS && + if (config::enable_delete_bitmap_merge_on_compaction && + _tablet->keys_type() == KeysType::UNIQUE_KEYS && _tablet->enable_unique_key_merge_on_write() && _input_rowsets.size() != 1) { - process_old_version_delete_bitmap(); + RETURN_IF_ERROR(process_old_version_delete_bitmap()); } return Status::OK(); } -void CloudCumulativeCompaction::process_old_version_delete_bitmap() { +Status CloudCumulativeCompaction::process_old_version_delete_bitmap() { // agg previously rowset old version delete bitmap std::vector pre_rowsets {}; std::vector pre_rowset_ids {}; @@ -397,40 +407,29 @@ void CloudCumulativeCompaction::process_old_version_delete_bitmap() { } if (!new_delete_bitmap->empty()) { // store agg delete bitmap - Status update_st; DBUG_EXECUTE_IF("CloudCumulativeCompaction.modify_rowsets.update_delete_bitmap_failed", { - update_st = Status::InternalError( + return Status::InternalError( "test fail to update delete bitmap for tablet_id {}", cloud_tablet()->tablet_id()); }); - if (update_st.ok()) { - update_st = _engine.meta_mgr().update_delete_bitmap_without_lock( - *cloud_tablet(), new_delete_bitmap.get()); - } - if (!update_st.ok()) { - std::stringstream ss; - ss << "failed to update delete bitmap for tablet=" << cloud_tablet()->tablet_id() - << " st=" << update_st.to_string(); - std::string msg = ss.str(); - LOG(WARNING) << msg; - } else { - Version version(_input_rowsets.front()->start_version(), - _input_rowsets.back()->end_version()); - for (auto it = new_delete_bitmap->delete_bitmap.begin(); - it != new_delete_bitmap->delete_bitmap.end(); it++) { - _tablet->tablet_meta()->delete_bitmap().set(it->first, it->second); - } - _tablet->tablet_meta()->delete_bitmap().add_to_remove_queue(version.to_string(), - to_remove_vec); - DBUG_EXECUTE_IF( - "CloudCumulativeCompaction.modify_rowsets.delete_expired_stale_rowsets", { - static_cast(_tablet.get()) - ->delete_expired_stale_rowsets(); - }); + RETURN_IF_ERROR(_engine.meta_mgr().cloud_update_delete_bitmap_without_lock( + *cloud_tablet(), new_delete_bitmap.get())); + + Version version(_input_rowsets.front()->start_version(), + _input_rowsets.back()->end_version()); + for (auto it = new_delete_bitmap->delete_bitmap.begin(); + it != new_delete_bitmap->delete_bitmap.end(); it++) { + _tablet->tablet_meta()->delete_bitmap().set(it->first, it->second); } + _tablet->tablet_meta()->delete_bitmap().add_to_remove_queue(version.to_string(), + to_remove_vec); + DBUG_EXECUTE_IF( + "CloudCumulativeCompaction.modify_rowsets.delete_expired_stale_rowsets", + { static_cast(_tablet.get())->delete_expired_stale_rowsets(); }); } } + return Status::OK(); } void CloudCumulativeCompaction::garbage_collection() { diff --git a/be/src/cloud/cloud_cumulative_compaction.h b/be/src/cloud/cloud_cumulative_compaction.h index 62c7cb44ea5bf5..1159dcb59ceef1 100644 --- a/be/src/cloud/cloud_cumulative_compaction.h +++ b/be/src/cloud/cloud_cumulative_compaction.h @@ -47,7 +47,7 @@ class CloudCumulativeCompaction : public CloudCompactionMixin { void update_cumulative_point(); - void process_old_version_delete_bitmap(); + Status 
process_old_version_delete_bitmap(); ReaderType compaction_type() const override { return ReaderType::READER_CUMULATIVE_COMPACTION; } diff --git a/be/src/cloud/cloud_cumulative_compaction_policy.cpp b/be/src/cloud/cloud_cumulative_compaction_policy.cpp index f9af469e56f60a..5a9879387b2327 100644 --- a/be/src/cloud/cloud_cumulative_compaction_policy.cpp +++ b/be/src/cloud/cloud_cumulative_compaction_policy.cpp @@ -209,7 +209,7 @@ int64_t CloudSizeBasedCumulativeCompactionPolicy::new_cumulative_point( // if rowsets have no delete version, check output_rowset total disk size satisfies promotion size. return output_rowset->start_version() == last_cumulative_point && (last_delete_version.first != -1 || - output_rowset->data_disk_size() >= cloud_promotion_size(tablet) || + output_rowset->total_disk_size() >= cloud_promotion_size(tablet) || satisfy_promotion_version) ? output_rowset->end_version() + 1 : last_cumulative_point; diff --git a/be/src/cloud/cloud_full_compaction.cpp b/be/src/cloud/cloud_full_compaction.cpp index 2e11891045c250..c27b728c93d29b 100644 --- a/be/src/cloud/cloud_full_compaction.cpp +++ b/be/src/cloud/cloud_full_compaction.cpp @@ -98,7 +98,9 @@ Status CloudFullCompaction::prepare_compact() { for (auto& rs : _input_rowsets) { _input_row_num += rs->num_rows(); _input_segments += rs->num_segments(); - _input_rowsets_size += rs->data_disk_size(); + _input_rowsets_data_size += rs->data_disk_size(); + _input_rowsets_index_size += rs->index_disk_size(); + _input_rowsets_total_size += rs->total_disk_size(); } LOG_INFO("start CloudFullCompaction, tablet_id={}, range=[{}-{}]", _tablet->tablet_id(), _input_rowsets.front()->start_version(), _input_rowsets.back()->end_version()) @@ -106,7 +108,9 @@ Status CloudFullCompaction::prepare_compact() { .tag("input_rowsets", _input_rowsets.size()) .tag("input_rows", _input_row_num) .tag("input_segments", _input_segments) - .tag("input_data_size", _input_rowsets_size); + .tag("input_rowsets_data_size", _input_rowsets_data_size) + .tag("input_rowsets_index_size", _input_rowsets_index_size) + .tag("input_rowsets_total_size", _input_rowsets_total_size); return st; } @@ -162,16 +166,20 @@ Status CloudFullCompaction::execute_compact() { .tag("input_rowsets", _input_rowsets.size()) .tag("input_rows", _input_row_num) .tag("input_segments", _input_segments) - .tag("input_data_size", _input_rowsets_size) + .tag("input_rowsets_data_size", _input_rowsets_data_size) + .tag("input_rowsets_index_size", _input_rowsets_index_size) + .tag("input_rowsets_total_size", _input_rowsets_total_size) .tag("output_rows", _output_rowset->num_rows()) .tag("output_segments", _output_rowset->num_segments()) - .tag("output_data_size", _output_rowset->data_disk_size()); + .tag("output_rowset_data_size", _output_rowset->data_disk_size()) + .tag("output_rowset_index_size", _output_rowset->index_disk_size()) + .tag("output_rowset_total_size", _output_rowset->total_disk_size()); _state = CompactionState::SUCCESS; DorisMetrics::instance()->full_compaction_deltas_total->increment(_input_rowsets.size()); - DorisMetrics::instance()->full_compaction_bytes_total->increment(_input_rowsets_size); - full_output_size << _output_rowset->data_disk_size(); + DorisMetrics::instance()->full_compaction_bytes_total->increment(_input_rowsets_total_size); + full_output_size << _output_rowset->total_disk_size(); return Status::OK(); } @@ -193,8 +201,12 @@ Status CloudFullCompaction::modify_rowsets() { compaction_job->set_output_cumulative_point(_output_rowset->end_version() + 1); 
compaction_job->set_num_input_rows(_input_row_num); compaction_job->set_num_output_rows(_output_rowset->num_rows()); - compaction_job->set_size_input_rowsets(_input_rowsets_size); - compaction_job->set_size_output_rowsets(_output_rowset->data_disk_size()); + compaction_job->set_size_input_rowsets(_input_rowsets_total_size); + compaction_job->set_size_output_rowsets(_output_rowset->total_disk_size()); + DBUG_EXECUTE_IF("CloudFullCompaction::modify_rowsets.wrong_compaction_data_size", { + compaction_job->set_size_input_rowsets(1); + compaction_job->set_size_output_rowsets(10000001); + }) compaction_job->set_num_input_segments(_input_segments); compaction_job->set_num_output_segments(_output_rowset->num_segments()); compaction_job->set_num_input_rowsets(_input_rowsets.size()); @@ -341,7 +353,7 @@ Status CloudFullCompaction::_cloud_full_compaction_update_delete_bitmap(int64_t .tag("input_rowsets", _input_rowsets.size()) .tag("input_rows", _input_row_num) .tag("input_segments", _input_segments) - .tag("input_data_size", _input_rowsets_size) + .tag("input_rowsets_total_size", _input_rowsets_total_size) .tag("update_bitmap_size", delete_bitmap->delete_bitmap.size()); _tablet->tablet_meta()->delete_bitmap().merge(*delete_bitmap); return Status::OK(); diff --git a/be/src/cloud/cloud_meta_mgr.cpp b/be/src/cloud/cloud_meta_mgr.cpp index 071c15d11e5516..dc1ac169200eec 100644 --- a/be/src/cloud/cloud_meta_mgr.cpp +++ b/be/src/cloud/cloud_meta_mgr.cpp @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include @@ -39,6 +40,7 @@ #include "cloud/cloud_tablet.h" #include "cloud/config.h" #include "cloud/pb_convert.h" +#include "common/config.h" #include "common/logging.h" #include "common/status.h" #include "cpp/sync_point.h" @@ -51,6 +53,7 @@ #include "olap/olap_common.h" #include "olap/rowset/rowset.h" #include "olap/rowset/rowset_factory.h" +#include "olap/rowset/rowset_fwd.h" #include "olap/storage_engine.h" #include "olap/tablet_meta.h" #include "runtime/client_cache.h" @@ -292,6 +295,9 @@ static std::string debug_info(const Request& req) { return fmt::format(" tablet_id={}", req.rowset_meta().tablet_id()); } else if constexpr (is_any_v) { return fmt::format(" tablet_id={}", req.tablet_id()); + } else if constexpr (is_any_v) { + return fmt::format(" table_id={}, tablet_id={}, lock_id={}", req.table_id(), + req.tablet_id(), req.lock_id()); } else { static_assert(!sizeof(Request)); } @@ -378,7 +384,8 @@ Status CloudMetaMgr::get_tablet_meta(int64_t tablet_id, TabletMetaSharedPtr* tab return Status::OK(); } -Status CloudMetaMgr::sync_tablet_rowsets(CloudTablet* tablet, bool warmup_delta_data) { +Status CloudMetaMgr::sync_tablet_rowsets(CloudTablet* tablet, bool warmup_delta_data, + bool sync_delete_bitmap) { using namespace std::chrono; TEST_SYNC_POINT_RETURN_WITH_VALUE("CloudMetaMgr::sync_tablet_rowsets", Status::OK(), tablet); @@ -410,6 +417,10 @@ Status CloudMetaMgr::sync_tablet_rowsets(CloudTablet* tablet, bool warmup_delta_ req.set_cumulative_point(tablet->cumulative_layer_point()); } req.set_end_version(-1); + // backend side use schema dict + if (config::variant_use_cloud_schema_dict) { + req.set_schema_op(GetRowsetRequest::RETURN_DICT); + } VLOG_DEBUG << "send GetRowsetRequest: " << req.ShortDebugString(); stub->get_rowset(&cntl, &req, &resp, nullptr); @@ -455,7 +466,7 @@ Status CloudMetaMgr::sync_tablet_rowsets(CloudTablet* tablet, bool warmup_delta_ // If is mow, the tablet has no delete bitmap in base rowsets. // So dont need to sync it. 
- if (tablet->enable_unique_key_merge_on_write() && + if (sync_delete_bitmap && tablet->enable_unique_key_merge_on_write() && tablet->tablet_state() == TABLET_RUNNING) { DeleteBitmap delete_bitmap(tablet_id); int64_t old_max_version = req.start_version() - 1; @@ -524,7 +535,8 @@ Status CloudMetaMgr::sync_tablet_rowsets(CloudTablet* tablet, bool warmup_delta_ existed_rowset->rowset_id().to_string() == cloud_rs_meta_pb.rowset_id_v2()) { continue; // Same rowset, skip it } - RowsetMetaPB meta_pb = cloud_rowset_meta_to_doris(cloud_rs_meta_pb); + RowsetMetaPB meta_pb = cloud_rowset_meta_to_doris( + cloud_rs_meta_pb, resp.has_schema_dict() ? &resp.schema_dict() : nullptr); auto rs_meta = std::make_shared(); rs_meta->init_from_pb(meta_pb); RowsetSharedPtr rowset; @@ -543,6 +555,7 @@ Status CloudMetaMgr::sync_tablet_rowsets(CloudTablet* tablet, bool warmup_delta_ bool version_overlap = tablet->max_version_unlocked() >= rowsets.front()->start_version(); tablet->add_rowsets(std::move(rowsets), version_overlap, wlock, warmup_delta_data); + RETURN_IF_ERROR(tablet->merge_rowsets_schema()); } tablet->last_base_compaction_success_time_ms = stats.last_base_compaction_time_ms(); tablet->last_cumu_compaction_success_time_ms = stats.last_cumu_compaction_time_ms(); @@ -693,11 +706,19 @@ Status CloudMetaMgr::sync_tablet_delete_bitmap(CloudTablet* tablet, int64_t old_ const auto& segment_ids = res.segment_ids(); const auto& vers = res.versions(); const auto& delete_bitmaps = res.segment_delete_bitmaps(); + if (rowset_ids.size() != segment_ids.size() || rowset_ids.size() != vers.size() || + rowset_ids.size() != delete_bitmaps.size()) { + return Status::Error( + "get delete bitmap data wrong," + "rowset_ids.size={},segment_ids.size={},vers.size={},delete_bitmaps.size={}", + rowset_ids.size(), segment_ids.size(), vers.size(), delete_bitmaps.size()); + } for (size_t i = 0; i < rowset_ids.size(); i++) { RowsetId rst_id; rst_id.init(rowset_ids[i]); - delete_bitmap->merge({rst_id, segment_ids[i], vers[i]}, - roaring::Roaring::read(delete_bitmaps[i].data())); + delete_bitmap->merge( + {rst_id, segment_ids[i], vers[i]}, + roaring::Roaring::readSafe(delete_bitmaps[i].data(), delete_bitmaps[i].length())); } int64_t latency = cntl.latency_us(); if (latency > 100 * 1000) { // 100ms @@ -750,6 +771,7 @@ Status CloudMetaMgr::commit_rowset(const RowsetMeta& rs_meta, Status ret_st; TEST_INJECTION_POINT_RETURN_WITH_VALUE("CloudMetaMgr::commit_rowset", ret_st); } + check_table_size_correctness(rs_meta); CreateRowsetRequest req; CreateRowsetResponse resp; req.set_cloud_unique_id(config::cloud_unique_id); @@ -880,6 +902,7 @@ Status CloudMetaMgr::abort_txn(const StreamLoadContext& ctx) { AbortTxnRequest req; AbortTxnResponse res; req.set_cloud_unique_id(config::cloud_unique_id); + req.set_reason(std::string(ctx.status.msg().substr(0, 1024))); if (ctx.db_id > 0 && !ctx.label.empty()) { req.set_db_id(ctx.db_id); req.set_label(ctx.label); @@ -1048,9 +1071,10 @@ Status CloudMetaMgr::update_delete_bitmap(const CloudTablet& tablet, int64_t loc return st; } -Status CloudMetaMgr::update_delete_bitmap_without_lock(const CloudTablet& tablet, - DeleteBitmap* delete_bitmap) { - VLOG_DEBUG << "update_delete_bitmap_without_lock , tablet_id: " << tablet.tablet_id(); +Status CloudMetaMgr::cloud_update_delete_bitmap_without_lock(const CloudTablet& tablet, + DeleteBitmap* delete_bitmap) { + LOG(INFO) << "cloud_update_delete_bitmap_without_lock , tablet_id: " << tablet.tablet_id() + << ",delete_bitmap size:" << delete_bitmap->delete_bitmap.size(); 
UpdateDeleteBitmapRequest req; UpdateDeleteBitmapResponse res; req.set_cloud_unique_id(config::cloud_unique_id); @@ -1105,6 +1129,25 @@ Status CloudMetaMgr::get_delete_bitmap_update_lock(const CloudTablet& tablet, in return st; } +Status CloudMetaMgr::remove_delete_bitmap_update_lock(const CloudTablet& tablet, int64_t lock_id, + int64_t initiator) { + VLOG_DEBUG << "remove_delete_bitmap_update_lock, tablet_id: " << tablet.tablet_id() + << ", lock_id:" << lock_id; + RemoveDeleteBitmapUpdateLockRequest req; + RemoveDeleteBitmapUpdateLockResponse res; + req.set_cloud_unique_id(config::cloud_unique_id); + req.set_tablet_id(tablet.tablet_id()); + req.set_lock_id(lock_id); + req.set_initiator(initiator); + auto st = retry_rpc("remove delete bitmap update lock", req, &res, + &MetaService_Stub::remove_delete_bitmap_update_lock); + if (!st.ok()) { + LOG(WARNING) << "remove delete bitmap update lock failed, tablet_id=" << tablet.tablet_id() + << " lock_id=" << lock_id << " st=" << st.to_string(); + } + return st; +} + Status CloudMetaMgr::remove_old_version_delete_bitmap( int64_t tablet_id, const std::vector>& to_delete) { @@ -1123,4 +1166,120 @@ Status CloudMetaMgr::remove_old_version_delete_bitmap( return st; } +void CloudMetaMgr::check_table_size_correctness(const RowsetMeta& rs_meta) { + if (!config::enable_table_size_correctness_check) { + return; + } + int64_t total_segment_size = get_segment_file_size(rs_meta); + int64_t total_inverted_index_size = get_inverted_index_file_size(rs_meta); + if (rs_meta.data_disk_size() != total_segment_size || + rs_meta.index_disk_size() != total_inverted_index_size || + rs_meta.data_disk_size() + rs_meta.index_disk_size() != rs_meta.total_disk_size()) { + LOG(WARNING) << "[Cloud table size check failed]:" + << " tablet id: " << rs_meta.tablet_id() + << ", rowset id:" << rs_meta.rowset_id() + << ", rowset data disk size:" << rs_meta.data_disk_size() + << ", rowset real data disk size:" << total_segment_size + << ", rowset index disk size:" << rs_meta.index_disk_size() + << ", rowset real index disk size:" << total_inverted_index_size + << ", rowset total disk size:" << rs_meta.total_disk_size() + << ", rowset segment path:" + << StorageResource().remote_segment_path(rs_meta.tablet_id(), + rs_meta.rowset_id().to_string(), 0); + DCHECK(false); + } +} + +int64_t CloudMetaMgr::get_segment_file_size(const RowsetMeta& rs_meta) { + int64_t total_segment_size = 0; + const auto fs = const_cast(rs_meta).fs(); + if (!fs) { + LOG(WARNING) << "get fs failed, resource_id=" << rs_meta.resource_id(); + } + for (int64_t seg_id = 0; seg_id < rs_meta.num_segments(); seg_id++) { + std::string segment_path = StorageResource().remote_segment_path( + rs_meta.tablet_id(), rs_meta.rowset_id().to_string(), seg_id); + int64_t segment_file_size = 0; + auto st = fs->file_size(segment_path, &segment_file_size); + if (!st.ok()) { + segment_file_size = 0; + if (st.is()) { + LOG(INFO) << "cloud table size correctness check get segment size 0 because " + "file does not exist! msg:" + << st.msg() << ", segment path:" << segment_path; + } else { + LOG(WARNING) << "cloud table size correctness check get segment size failed! 
msg:" + << st.msg() << ", segment path:" << segment_path; + } + } + total_segment_size += segment_file_size; + } + return total_segment_size; +} + +int64_t CloudMetaMgr::get_inverted_index_file_size(const RowsetMeta& rs_meta) { + int64_t total_inverted_index_size = 0; + const auto fs = const_cast(rs_meta).fs(); + if (!fs) { + LOG(WARNING) << "get fs failed, resource_id=" << rs_meta.resource_id(); + } + if (rs_meta.tablet_schema()->get_inverted_index_storage_format() == + InvertedIndexStorageFormatPB::V1) { + const auto& indices = rs_meta.tablet_schema()->inverted_indexes(); + for (auto& index : indices) { + for (int seg_id = 0; seg_id < rs_meta.num_segments(); ++seg_id) { + std::string segment_path = StorageResource().remote_segment_path( + rs_meta.tablet_id(), rs_meta.rowset_id().to_string(), seg_id); + int64_t file_size = 0; + + std::string inverted_index_file_path = + InvertedIndexDescriptor::get_index_file_path_v1( + InvertedIndexDescriptor::get_index_file_path_prefix(segment_path), + index->index_id(), index->get_index_suffix()); + auto st = fs->file_size(inverted_index_file_path, &file_size); + if (!st.ok()) { + file_size = 0; + if (st.is()) { + LOG(INFO) << "cloud table size correctness check get inverted index v1 " + "size 0 because file does not exist! msg:" + << st.msg() + << ", inverted index path:" << inverted_index_file_path; + } else { + LOG(WARNING) + << "cloud table size correctness check get inverted index v1 " + "size failed! msg:" + << st.msg() << ", inverted index path:" << inverted_index_file_path; + } + } + total_inverted_index_size += file_size; + } + } + } else { + for (int seg_id = 0; seg_id < rs_meta.num_segments(); ++seg_id) { + int64_t file_size = 0; + std::string segment_path = StorageResource().remote_segment_path( + rs_meta.tablet_id(), rs_meta.rowset_id().to_string(), seg_id); + + std::string inverted_index_file_path = InvertedIndexDescriptor::get_index_file_path_v2( + InvertedIndexDescriptor::get_index_file_path_prefix(segment_path)); + auto st = fs->file_size(inverted_index_file_path, &file_size); + if (!st.ok()) { + file_size = 0; + if (st.is()) { + LOG(INFO) << "cloud table size correctness check get inverted index v2 " + "size 0 because file does not exist! msg:" + << st.msg() << ", inverted index path:" << inverted_index_file_path; + } else { + LOG(WARNING) << "cloud table size correctness check get inverted index v2 " + "size failed! 
msg:" + << st.msg() + << ", inverted index path:" << inverted_index_file_path; + } + } + total_inverted_index_size += file_size; + } + } + return total_inverted_index_size; +} + } // namespace doris::cloud diff --git a/be/src/cloud/cloud_meta_mgr.h b/be/src/cloud/cloud_meta_mgr.h index 79cdb3fd3d1f8c..a657c0fdd8e350 100644 --- a/be/src/cloud/cloud_meta_mgr.h +++ b/be/src/cloud/cloud_meta_mgr.h @@ -57,7 +57,8 @@ class CloudMetaMgr { Status get_tablet_meta(int64_t tablet_id, std::shared_ptr* tablet_meta); - Status sync_tablet_rowsets(CloudTablet* tablet, bool warmup_delta_data = false); + Status sync_tablet_rowsets(CloudTablet* tablet, bool warmup_delta_data = false, + bool sync_delete_bitmap = true); Status prepare_rowset(const RowsetMeta& rs_meta, std::shared_ptr* existed_rs_meta = nullptr); @@ -95,12 +96,15 @@ class CloudMetaMgr { Status update_delete_bitmap(const CloudTablet& tablet, int64_t lock_id, int64_t initiator, DeleteBitmap* delete_bitmap); - Status update_delete_bitmap_without_lock(const CloudTablet& tablet, - DeleteBitmap* delete_bitmap); + Status cloud_update_delete_bitmap_without_lock(const CloudTablet& tablet, + DeleteBitmap* delete_bitmap); Status get_delete_bitmap_update_lock(const CloudTablet& tablet, int64_t lock_id, int64_t initiator); + Status remove_delete_bitmap_update_lock(const CloudTablet& tablet, int64_t lock_id, + int64_t initiator); + Status remove_old_version_delete_bitmap( int64_t tablet_id, const std::vector>& to_delete); @@ -113,6 +117,9 @@ class CloudMetaMgr { Status sync_tablet_delete_bitmap(CloudTablet* tablet, int64_t old_max_version, std::ranges::range auto&& rs_metas, const TabletStatsPB& stats, const TabletIndexPB& idx, DeleteBitmap* delete_bitmap); + void check_table_size_correctness(const RowsetMeta& rs_meta); + int64_t get_segment_file_size(const RowsetMeta& rs_meta); + int64_t get_inverted_index_file_size(const RowsetMeta& rs_meta); }; } // namespace cloud diff --git a/be/src/cloud/cloud_rowset_builder.cpp b/be/src/cloud/cloud_rowset_builder.cpp index 192da0f17efa82..2e6764b33aa79c 100644 --- a/be/src/cloud/cloud_rowset_builder.cpp +++ b/be/src/cloud/cloud_rowset_builder.cpp @@ -106,7 +106,7 @@ void CloudRowsetBuilder::update_tablet_stats() { tablet->fetch_add_approximate_num_rowsets(1); tablet->fetch_add_approximate_num_segments(_rowset->num_segments()); tablet->fetch_add_approximate_num_rows(_rowset->num_rows()); - tablet->fetch_add_approximate_data_size(_rowset->data_disk_size()); + tablet->fetch_add_approximate_data_size(_rowset->total_disk_size()); tablet->fetch_add_approximate_cumu_num_rowsets(1); tablet->fetch_add_approximate_cumu_num_deltas(_rowset->num_segments()); tablet->write_count.fetch_add(1, std::memory_order_relaxed); diff --git a/be/src/cloud/cloud_rowset_writer.cpp b/be/src/cloud/cloud_rowset_writer.cpp index 5f878f59d5c64d..ebc411697ee4b1 100644 --- a/be/src/cloud/cloud_rowset_writer.cpp +++ b/be/src/cloud/cloud_rowset_writer.cpp @@ -94,7 +94,7 @@ Status CloudRowsetWriter::build(RowsetSharedPtr& rowset) { // transfer 0 (PREPARED -> COMMITTED): finish writing a rowset and the rowset' meta will not be changed // transfer 1 (PREPARED -> BEGIN_PARTIAL_UPDATE): finish writing a rowset, but may append new segments later and the rowset's meta may be changed // transfer 2 (BEGIN_PARTIAL_UPDATE -> VISIBLE): finish adding new segments and the rowset' meta will not be changed, the rowset is visible to users - if (_context.partial_update_info && _context.partial_update_info->is_partial_update) { + if (_context.partial_update_info && 
_context.partial_update_info->is_partial_update()) { _rowset_meta->set_rowset_state(BEGIN_PARTIAL_UPDATE); } else { _rowset_meta->set_rowset_state(COMMITTED); @@ -115,13 +115,14 @@ Status CloudRowsetWriter::build(RowsetSharedPtr& rowset) { } else { _rowset_meta->add_segments_file_size(seg_file_size.value()); } - - if (auto idx_files_info = _idx_files_info.get_inverted_files_info(_segment_start_id); - !idx_files_info.has_value()) [[unlikely]] { - LOG(ERROR) << "expected inverted index files info, but none presents: " - << idx_files_info.error(); - } else { - _rowset_meta->add_inverted_index_files_info(idx_files_info.value()); + if (rowset_schema->has_inverted_index()) { + if (auto idx_files_info = _idx_files.inverted_index_file_info(_segment_start_id); + !idx_files_info.has_value()) [[unlikely]] { + LOG(ERROR) << "expected inverted index files info, but none presents: " + << idx_files_info.error(); + } else { + _rowset_meta->add_inverted_index_files_info(idx_files_info.value()); + } } RETURN_NOT_OK_STATUS_WITH_WARN(RowsetFactory::create_rowset(rowset_schema, _context.tablet_path, diff --git a/be/src/cloud/cloud_schema_change_job.cpp b/be/src/cloud/cloud_schema_change_job.cpp index b7e3be93e853bb..896804578d7db9 100644 --- a/be/src/cloud/cloud_schema_change_job.cpp +++ b/be/src/cloud/cloud_schema_change_job.cpp @@ -344,7 +344,7 @@ Status CloudSchemaChangeJob::_convert_historical_rowsets(const SchemaChangeParam sc_job->add_txn_ids(rs->txn_id()); sc_job->add_output_versions(rs->end_version()); num_output_rows += rs->num_rows(); - size_output_rowsets += rs->data_disk_size(); + size_output_rowsets += rs->total_disk_size(); num_output_segments += rs->num_segments(); } sc_job->set_num_output_rows(num_output_rows); diff --git a/be/src/cloud/cloud_storage_engine.cpp b/be/src/cloud/cloud_storage_engine.cpp index 4f452656a6236b..5d7b445917aa20 100644 --- a/be/src/cloud/cloud_storage_engine.cpp +++ b/be/src/cloud/cloud_storage_engine.cpp @@ -558,14 +558,16 @@ std::vector CloudStorageEngine::_generate_cloud_compaction_task } else if (config::enable_parallel_cumu_compaction) { filter_out = [&tablet_preparing_cumu_compaction](CloudTablet* t) { return tablet_preparing_cumu_compaction.contains(t->tablet_id()) || - (t->tablet_state() != TABLET_RUNNING && t->alter_version() == -1); + (t->tablet_state() != TABLET_RUNNING && + (!config::enable_new_tablet_do_compaction || t->alter_version() == -1)); }; } else { filter_out = [&tablet_preparing_cumu_compaction, &submitted_cumu_compactions](CloudTablet* t) { return tablet_preparing_cumu_compaction.contains(t->tablet_id()) || submitted_cumu_compactions.contains(t->tablet_id()) || - (t->tablet_state() != TABLET_RUNNING && t->alter_version() == -1); + (t->tablet_state() != TABLET_RUNNING && + (!config::enable_new_tablet_do_compaction || t->alter_version() == -1)); }; } diff --git a/be/src/cloud/cloud_stream_load_executor.cpp b/be/src/cloud/cloud_stream_load_executor.cpp index 1352b4aac81a5f..46ceca851e2b4d 100644 --- a/be/src/cloud/cloud_stream_load_executor.cpp +++ b/be/src/cloud/cloud_stream_load_executor.cpp @@ -23,6 +23,7 @@ #include "common/logging.h" #include "common/status.h" #include "runtime/stream_load/stream_load_context.h" +#include "util/debug_points.h" namespace doris { @@ -96,6 +97,7 @@ Status CloudStreamLoadExecutor::operate_txn_2pc(StreamLoadContext* ctx) { } Status CloudStreamLoadExecutor::commit_txn(StreamLoadContext* ctx) { + DBUG_EXECUTE_IF("StreamLoadExecutor.commit_txn.block", DBUG_BLOCK); // forward to fe to excute commit transaction for MoW 
table
     if (ctx->is_mow_table() || !config::enable_stream_load_commit_txn_on_be ||
         ctx->load_type == TLoadType::ROUTINE_LOAD) {
diff --git a/be/src/cloud/cloud_tablet.cpp b/be/src/cloud/cloud_tablet.cpp
index 06f7e97e0c475d..ebd1fea3dd9fac 100644
--- a/be/src/cloud/cloud_tablet.cpp
+++ b/be/src/cloud/cloud_tablet.cpp
@@ -108,6 +108,36 @@ Status CloudTablet::capture_rs_readers(const Version& spec_version,
     return capture_rs_readers_unlocked(version_path, rs_splits);
 }
+Status CloudTablet::merge_rowsets_schema() {
+    // Find the rowset with the max version
+    auto max_version_rowset =
+            std::max_element(
+                    _rs_version_map.begin(), _rs_version_map.end(),
+                    [](const auto& a, const auto& b) {
+                        return !a.second->tablet_schema()
+                                       ? true
+                                       : (!b.second->tablet_schema()
+                                                  ? false
+                                                  : a.second->tablet_schema()->schema_version() <
+                                                            b.second->tablet_schema()
+                                                                    ->schema_version());
+                    })
+                    ->second;
+    TabletSchemaSPtr max_version_schema = max_version_rowset->tablet_schema();
+    // If the schema has variant columns, perform a merge to create a wide tablet schema
+    if (max_version_schema->num_variant_columns() > 0) {
+        std::vector<TabletSchemaSPtr> schemas;
+        std::transform(_rs_version_map.begin(), _rs_version_map.end(), std::back_inserter(schemas),
+                       [](const auto& rs_meta) { return rs_meta.second->tablet_schema(); });
+        // Merge the collected schemas to obtain the least common schema
+        RETURN_IF_ERROR(vectorized::schema_util::get_least_common_schema(schemas, nullptr,
+                                                                         max_version_schema));
+        VLOG_DEBUG << "dump schema: " << max_version_schema->dump_full_schema();
+        _merged_tablet_schema = max_version_schema;
+    }
+    return Status::OK();
+}
+
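The comparator in merge_rowsets_schema above orders rowsets so that schema-less entries sort lowest and the highest schema_version wins. A minimal standalone illustration of that selection (simplified types, not the real TabletSchema; the null-vs-null case is made strict-weak-ordering safe):

#include <algorithm>
#include <iostream>
#include <map>
#include <memory>

struct Schema { int schema_version; };
using SchemaPtr = std::shared_ptr<Schema>;

int main() {
    std::map<int, SchemaPtr> rowsets = {
            {1, nullptr},                             // rowset without a schema
            {2, std::make_shared<Schema>(Schema{3})},
            {3, std::make_shared<Schema>(Schema{7})}, // expected winner
    };
    auto max_it = std::max_element(
            rowsets.begin(), rowsets.end(), [](const auto& a, const auto& b) {
                // a is "less than" b when a has no schema (and b does),
                // or when both have schemas and a's version is lower
                if (!a.second) return static_cast<bool>(b.second);
                if (!b.second) return false;
                return a.second->schema_version < b.second->schema_version;
            });
    std::cout << max_it->second->schema_version << "\n"; // prints 7
}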
 // There are only two tablet_states RUNNING and NOT_READY in cloud mode
 // This function will erase the tablet from `CloudTabletMgr` when it can't find this tablet in MS.
 Status CloudTablet::sync_rowsets(int64_t query_version, bool warmup_delta_data) {
@@ -133,6 +163,7 @@ Status CloudTablet::sync_rowsets(int64_t query_version, bool warmup_delta_data)
     if (st.is<ErrorCode::NOT_FOUND>()) {
         clear_cache();
     }
+
     return st;
 }
@@ -188,16 +219,7 @@ Status CloudTablet::sync_if_not_running() {
 }
 TabletSchemaSPtr CloudTablet::merged_tablet_schema() const {
-    std::shared_lock rdlock(_meta_lock);
-    TabletSchemaSPtr target_schema;
-    std::vector<TabletSchemaSPtr> schemas;
-    for (const auto& [_, rowset] : _rs_version_map) {
-        schemas.push_back(rowset->tablet_schema());
-    }
-    // get the max version schema and merge all schema
-    static_cast<void>(
-            vectorized::schema_util::get_least_common_schema(schemas, nullptr, target_schema));
-    return target_schema;
+    return _merged_tablet_schema;
 }
 void CloudTablet::add_rowsets(std::vector<RowsetSharedPtr> to_add, bool version_overlap,
@@ -263,15 +285,13 @@ void CloudTablet::add_rowsets(std::vector<RowsetSharedPtr> to_add, bool version_
                 auto schema_ptr = rowset_meta->tablet_schema();
                 auto idx_version = schema_ptr->get_inverted_index_storage_format();
                 if (idx_version == InvertedIndexStorageFormatPB::V1) {
-                    for (const auto& index : schema_ptr->indexes()) {
-                        if (index.index_type() == IndexType::INVERTED) {
-                            auto idx_path = storage_resource.value()->remote_idx_v1_path(
-                                    *rowset_meta, seg_id, index.index_id(),
-                                    index.get_index_suffix());
-                            download_idx_file(idx_path);
-                        }
+                    for (const auto& index : schema_ptr->inverted_indexes()) {
+                        auto idx_path = storage_resource.value()->remote_idx_v1_path(
+                                *rowset_meta, seg_id, index->index_id(),
+                                index->get_index_suffix());
+                        download_idx_file(idx_path);
                     }
-                } else if (idx_version == InvertedIndexStorageFormatPB::V2) {
+                } else {
                     if (schema_ptr->has_inverted_index()) {
                         auto idx_path = storage_resource.value()->remote_idx_v2_path(
                                 *rowset_meta, seg_id);
@@ -412,7 +432,7 @@ int CloudTablet::delete_expired_stale_rowsets() {
 void CloudTablet::update_base_size(const Rowset& rs) {
     // Define base rowset as the rowset of version [2-x]
     if (rs.start_version() == 2) {
-        _base_size = rs.data_disk_size();
+        _base_size = rs.total_disk_size();
    }
 }
@@ -433,7 +453,7 @@ void CloudTablet::recycle_cached_data(const std::vector<RowsetSharedPtr>& rowset
             // TODO: Segment::file_cache_key
             auto file_key = Segment::file_cache_key(rs->rowset_id().to_string(), seg_id);
             auto* file_cache = io::FileCacheFactory::instance()->get_by_path(file_key);
-            file_cache->remove_if_cached(file_key);
+            file_cache->remove_if_cached_async(file_key);
         }
     }
 }
@@ -671,7 +691,7 @@ Status CloudTablet::save_delete_bitmap(const TabletTxnInfo* txn_info, int64_t tx
     RETURN_IF_ERROR(_engine.txn_delete_bitmap_cache().update_tablet_txn_info(
             txn_id, tablet_id(), delete_bitmap, cur_rowset_ids, PublishStatus::PREPARE));
-    if (txn_info->partial_update_info && txn_info->partial_update_info->is_partial_update &&
+    if (txn_info->partial_update_info && txn_info->partial_update_info->is_partial_update() &&
         rowset_writer->num_rows() > 0) {
         const auto& rowset_meta = rowset->rowset_meta();
         RETURN_IF_ERROR(_engine.meta_mgr().update_tmp_rowset(*rowset_meta));
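The next hunk rewrites calc_delete_bitmap_for_compaction so that the bookkeeping containers are only allocated when the corresponding correctness checks are enabled, and callees take nullable pointers. A small standalone sketch of that lazy-allocation pattern (toy types; the flag stands in for the correctness-check configs):

#include <iostream>
#include <memory>
#include <set>

// Callee accepts a nullable pointer: bookkeeping is skipped when not allocated.
void collect(std::set<int>* missed) {
    if (missed) missed->insert(42);
}

int main() {
    bool checks_enabled = true; // e.g. a correctness-check config
    std::unique_ptr<std::set<int>> missed;
    if (checks_enabled) missed = std::make_unique<std::set<int>>();
    collect(missed.get()); // passes nullptr when checks are off
    if (missed) std::cout << "missed rows: " << missed->size() << "\n";
}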
@@ -737,38 +757,54 @@ Status CloudTablet::calc_delete_bitmap_for_compaction(
         int64_t filtered_rows, int64_t initiator, DeleteBitmapPtr& output_rowset_delete_bitmap,
         bool allow_delete_in_cumu_compaction) {
     output_rowset_delete_bitmap = std::make_shared<DeleteBitmap>(tablet_id());
-    std::set<RowLocation> missed_rows;
-    std::map<RowsetSharedPtr, std::list<std::pair<RowLocation, RowLocation>>> location_map;
+    std::unique_ptr<std::set<RowLocation>> missed_rows;
+    if ((config::enable_missing_rows_correctness_check ||
+         config::enable_mow_compaction_correctness_check_core) &&
+        !allow_delete_in_cumu_compaction &&
+        compaction_type == ReaderType::READER_CUMULATIVE_COMPACTION) {
+        missed_rows = std::make_unique<std::set<RowLocation>>();
+        LOG(INFO) << "RowLocation set initialized successfully for tablet:" << tablet_id();
+    }
+
+    std::unique_ptr<std::map<RowsetSharedPtr, std::list<std::pair<RowLocation, RowLocation>>>>
+            location_map;
+    if (config::enable_rowid_conversion_correctness_check) {
+        location_map = std::make_unique<
+                std::map<RowsetSharedPtr, std::list<std::pair<RowLocation, RowLocation>>>>();
+        LOG(INFO) << "Location map initialized successfully for tablet:" << tablet_id();
+    }
     // 1. calc delete bitmap for historical data
     RETURN_IF_ERROR(_engine.meta_mgr().sync_tablet_rowsets(this));
     Version version = max_version();
+    std::size_t missed_rows_size = 0;
     calc_compaction_output_rowset_delete_bitmap(
-            input_rowsets, rowid_conversion, 0, version.second + 1, &missed_rows, &location_map,
-            tablet_meta()->delete_bitmap(), output_rowset_delete_bitmap.get());
-    std::size_t missed_rows_size = missed_rows.size();
-    if (!allow_delete_in_cumu_compaction) {
-        if (compaction_type == ReaderType::READER_CUMULATIVE_COMPACTION &&
-            tablet_state() == TABLET_RUNNING) {
-            if (merged_rows + filtered_rows >= 0 &&
-                merged_rows + filtered_rows != missed_rows_size) {
-                std::string err_msg = fmt::format(
-                        "cumulative compaction: the merged rows({}), the filtered rows({}) is not "
-                        "equal to missed rows({}) in rowid conversion, tablet_id: {}, table_id:{}",
-                        merged_rows, filtered_rows, missed_rows_size, tablet_id(), table_id());
-                if (config::enable_mow_compaction_correctness_check_core) {
-                    CHECK(false) << err_msg;
-                } else {
-                    DCHECK(false) << err_msg;
+            input_rowsets, rowid_conversion, 0, version.second + 1, missed_rows.get(),
+            location_map.get(), tablet_meta()->delete_bitmap(), output_rowset_delete_bitmap.get());
+    if (missed_rows) {
+        missed_rows_size = missed_rows->size();
+        if (!allow_delete_in_cumu_compaction) {
+            if (compaction_type == ReaderType::READER_CUMULATIVE_COMPACTION &&
+                tablet_state() == TABLET_RUNNING) {
+                if (merged_rows + filtered_rows >= 0 &&
+                    merged_rows + filtered_rows != missed_rows_size) {
+                    std::string err_msg = fmt::format(
+                            "cumulative compaction: the merged rows({}), the filtered rows({}) is "
+                            "not equal to missed rows({}) in rowid conversion, tablet_id: {}, "
+                            "table_id:{}",
+                            merged_rows, filtered_rows, missed_rows_size, tablet_id(), table_id());
+                    if (config::enable_mow_compaction_correctness_check_core) {
+                        CHECK(false) << err_msg;
+                    } else {
+                        DCHECK(false) << err_msg;
+                    }
+                    LOG(WARNING) << err_msg;
                 }
-                LOG(WARNING) << err_msg;
             }
         }
     }
-    if (config::enable_rowid_conversion_correctness_check) {
-        RETURN_IF_ERROR(check_rowid_conversion(output_rowset, location_map));
+    if (location_map) {
+        RETURN_IF_ERROR(check_rowid_conversion(output_rowset, *location_map));
+        location_map->clear();
     }
-    location_map.clear();
     // 2. calc delete bitmap for incremental data
     RETURN_IF_ERROR(_engine.meta_mgr().get_delete_bitmap_update_lock(
@@ -776,16 +812,16 @@ Status CloudTablet::calc_delete_bitmap_for_compaction(
     RETURN_IF_ERROR(_engine.meta_mgr().sync_tablet_rowsets(this));
     calc_compaction_output_rowset_delete_bitmap(
-            input_rowsets, rowid_conversion, version.second, UINT64_MAX, &missed_rows,
-            &location_map, tablet_meta()->delete_bitmap(), output_rowset_delete_bitmap.get());
-    if (config::enable_rowid_conversion_correctness_check) {
-        RETURN_IF_ERROR(check_rowid_conversion(output_rowset, location_map));
-    }
-    if (compaction_type == ReaderType::READER_CUMULATIVE_COMPACTION) {
-        DCHECK_EQ(missed_rows.size(), missed_rows_size);
-        if (missed_rows.size() != missed_rows_size) {
+            input_rowsets, rowid_conversion, version.second, UINT64_MAX, missed_rows.get(),
+            location_map.get(), tablet_meta()->delete_bitmap(), output_rowset_delete_bitmap.get());
+    if (location_map) {
+        RETURN_IF_ERROR(check_rowid_conversion(output_rowset, *location_map));
+    }
+    if (missed_rows) {
+        DCHECK_EQ(missed_rows->size(), missed_rows_size);
+        if (missed_rows->size() != missed_rows_size) {
             LOG(WARNING) << "missed rows don't match, before: " << missed_rows_size
-                         << " after: " << missed_rows.size();
+                         << " after: " << missed_rows->size();
         }
     }
@@ -872,4 +908,12 @@ Status CloudTablet::sync_meta() {
     return Status::OK();
 }
+void CloudTablet::build_tablet_report_info(TTabletInfo* tablet_info) {
+    std::shared_lock rdlock(_meta_lock);
+    tablet_info->__set_total_version_count(_tablet_meta->version_count());
+    tablet_info->__set_tablet_id(_tablet_meta->tablet_id());
+    // Currently, this information will not be used by the cloud report,
+    // but it may be used in the future.
+}
+
 } // namespace doris
diff --git a/be/src/cloud/cloud_tablet.h b/be/src/cloud/cloud_tablet.h
index 53747dc19e27de..0fde2f5b1d93ff 100644
--- a/be/src/cloud/cloud_tablet.h
+++ b/be/src/cloud/cloud_tablet.h
@@ -191,15 +191,21 @@ class CloudTablet final : public BaseTablet {
     const auto& rowset_map() const { return _rs_version_map; }
+    // Merge all rowset schemas within a CloudTablet
+    Status merge_rowsets_schema();
+
     int64_t last_sync_time_s = 0;
     int64_t last_load_time_ms = 0;
     int64_t last_base_compaction_success_time_ms = 0;
     int64_t last_cumu_compaction_success_time_ms = 0;
     int64_t last_cumu_no_suitable_version_ms = 0;
+    int64_t last_access_time_ms = 0;
     // Return merged extended schema
     TabletSchemaSPtr merged_tablet_schema() const override;
+    void build_tablet_report_info(TTabletInfo* tablet_info);
+
 private:
     // FIXME(plat1ko): No need to record base size if rowsets are ordered by version
     void update_base_size(const Rowset& rs);
@@ -246,6 +252,9 @@ class CloudTablet final : public BaseTablet {
     std::mutex _base_compaction_lock;
     std::mutex _cumulative_compaction_lock;
     mutable std::mutex _rowset_update_lock;
+
+    // Schema will be merged from all rowsets when sync_rowsets
+    TabletSchemaSPtr _merged_tablet_schema;
 };
 using CloudTabletSPtr = std::shared_ptr<CloudTablet>;
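The cloud_tablet_mgr.cpp changes that follow thread the last-access timestamp through a custom shared_ptr deleter, so the stamp is refreshed whenever a caller drops its reference to a cached tablet. A standalone sketch of that deleter pattern (the LRU cache itself is mocked away; only the ownership shape is shown):

#include <chrono>
#include <iostream>
#include <memory>

struct Tablet { int64_t last_access_time_ms = 0; };

int64_t now_ms() {
    using namespace std::chrono;
    return duration_cast<milliseconds>(system_clock::now().time_since_epoch()).count();
}

int main() {
    auto* raw = new Tablet();
    {
        // The deleter stamps the access time and "releases" the cache handle
        // instead of freeing the object; the cache still owns the memory.
        std::shared_ptr<Tablet> guard(raw, [](Tablet* t) {
            t->last_access_time_ms = now_ms();
            std::cout << "handle released\n"; // stand-in for _cache->release(handle)
        });
    } // guard goes out of scope here, deleter runs
    std::cout << "last access: " << raw->last_access_time_ms << "\n";
    delete raw; // in the real code the cache reclaims this
}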
diff --git a/be/src/cloud/cloud_tablet_mgr.cpp b/be/src/cloud/cloud_tablet_mgr.cpp
index e5c31785c1eb1c..e7a7d254f3fa89 100644
--- a/be/src/cloud/cloud_tablet_mgr.cpp
+++ b/be/src/cloud/cloud_tablet_mgr.cpp
@@ -28,6 +28,7 @@
 #include "runtime/memory/cache_policy.h"
 namespace doris {
+uint64_t g_tablet_report_inactive_duration_ms = 0;
 namespace {
 // port from
@@ -142,8 +143,14 @@ CloudTabletMgr::CloudTabletMgr(CloudStorageEngine& engine)
 CloudTabletMgr::~CloudTabletMgr() = default;
-Result<std::shared_ptr<CloudTablet>> CloudTabletMgr::get_tablet(int64_t tablet_id,
-                                                                bool warmup_data) {
+void set_tablet_access_time_ms(CloudTablet* tablet) {
+    using namespace std::chrono;
+    int64_t now = duration_cast<milliseconds>(system_clock::now().time_since_epoch()).count();
+    tablet->last_access_time_ms = now;
+}
+
+Result<std::shared_ptr<CloudTablet>> CloudTabletMgr::get_tablet(int64_t tablet_id, bool warmup_data,
+                                                                bool sync_delete_bitmap) {
     // LRU value type. `Value`'s lifetime MUST NOT be longer than `CloudTabletMgr`
     class Value : public LRUCacheValueBase {
     public:
@@ -161,8 +168,8 @@ Result<std::shared_ptr<CloudTablet>> CloudTabletMgr::get_tablet(int64_t tablet_i
     CacheKey key(tablet_id_str);
     auto* handle = _cache->lookup(key);
     if (handle == nullptr) {
-        auto load_tablet = [this, &key,
-                            warmup_data](int64_t tablet_id) -> std::shared_ptr<CloudTablet> {
+        auto load_tablet = [this, &key, warmup_data,
+                            sync_delete_bitmap](int64_t tablet_id) -> std::shared_ptr<CloudTablet> {
             TabletMetaSharedPtr tablet_meta;
             auto st = _engine.meta_mgr().get_tablet_meta(tablet_id, &tablet_meta);
             if (!st.ok()) {
@@ -173,7 +180,8 @@ Result<std::shared_ptr<CloudTablet>> CloudTabletMgr::get_tablet(int64_t tablet_i
             auto tablet = std::make_shared<CloudTablet>(_engine, std::move(tablet_meta));
             auto value = std::make_unique<Value>(tablet, *_tablet_map);
             // MUST sync stats to let compaction scheduler work correctly
-            st = _engine.meta_mgr().sync_tablet_rowsets(tablet.get(), warmup_data);
+            st = _engine.meta_mgr().sync_tablet_rowsets(tablet.get(), warmup_data,
+                                                        sync_delete_bitmap);
             if (!st.ok()) {
                 LOG(WARNING) << "failed to sync tablet " << tablet_id << ": " << st;
                 return nullptr;
@@ -181,8 +189,11 @@ Result<std::shared_ptr<CloudTablet>> CloudTabletMgr::get_tablet(int64_t tablet_i
             auto* handle = _cache->insert(key, value.release(), 1, sizeof(CloudTablet),
                                           CachePriority::NORMAL);
-            auto ret = std::shared_ptr<CloudTablet>(
-                    tablet.get(), [this, handle](...) { _cache->release(handle); });
+            auto ret =
+                    std::shared_ptr<CloudTablet>(tablet.get(), [this, handle](CloudTablet* tablet) {
+                        set_tablet_access_time_ms(tablet);
+                        _cache->release(handle);
+                    });
             _tablet_map->put(std::move(tablet));
             return ret;
         };
@@ -191,12 +202,16 @@ Result<std::shared_ptr<CloudTablet>> CloudTabletMgr::get_tablet(int64_t tablet_i
         if (tablet == nullptr) {
             return ResultError(Status::InternalError("failed to get tablet {}", tablet_id));
         }
+        set_tablet_access_time_ms(tablet.get());
         return tablet;
     }
     CloudTablet* tablet_raw_ptr = reinterpret_cast<Value*>(_cache->value(handle))->tablet.get();
-    auto tablet = std::shared_ptr<CloudTablet>(tablet_raw_ptr,
-                                               [this, handle](...)
{ _cache->release(handle); });
+    set_tablet_access_time_ms(tablet_raw_ptr);
+    auto tablet = std::shared_ptr<CloudTablet>(tablet_raw_ptr, [this, handle](CloudTablet* tablet) {
+        set_tablet_access_time_ms(tablet);
+        _cache->release(handle);
+    });
     return tablet;
 }
@@ -357,4 +372,54 @@ Status CloudTabletMgr::get_topn_tablets_to_compact(
     return Status::OK();
 }
+void CloudTabletMgr::build_all_report_tablets_info(std::map<TTabletId, TTablet>* tablets_info,
+                                                   uint64_t* tablet_num) {
+    DCHECK(tablets_info != nullptr);
+    VLOG_NOTICE << "begin to build all report cloud tablets info";
+
+    HistogramStat tablet_version_num_hist;
+
+    auto handler = [&](const std::weak_ptr<CloudTablet>& tablet_wk) {
+        auto tablet = tablet_wk.lock();
+        if (!tablet) return;
+        (*tablet_num)++;
+        TTabletInfo tablet_info;
+        tablet->build_tablet_report_info(&tablet_info);
+        using namespace std::chrono;
+        int64_t now = duration_cast<milliseconds>(system_clock::now().time_since_epoch()).count();
+        if (now - g_tablet_report_inactive_duration_ms * 1000 < tablet->last_access_time_ms) {
+            // the tablet has been accessed recently, so do not report it
+            return;
+        }
+        auto& t_tablet = (*tablets_info)[tablet->tablet_id()];
+        // On the cloud, a specific BE has only one tablet replica;
+        // there are no multiple replicas for a specific BE.
+        // This is only to reuse the non-cloud report protocol.
+        tablet_version_num_hist.add(tablet_info.total_version_count);
+        t_tablet.tablet_infos.emplace_back(std::move(tablet_info));
+    };
+
+    auto weak_tablets = get_weak_tablets();
+    std::for_each(weak_tablets.begin(), weak_tablets.end(), handler);
+
+    DorisMetrics::instance()->tablet_version_num_distribution->set_histogram(
+            tablet_version_num_hist);
+    LOG(INFO) << "succeeded to build all cloud report tablets info. all_tablet_count=" << *tablet_num
+              << " exceed drop time limit count=" << tablets_info->size();
+}
+
+void CloudTabletMgr::get_tablet_info(int64_t num_tablets, std::vector<TabletInfo>* tablets_info) {
+    auto weak_tablets = get_weak_tablets();
+    for (auto& weak_tablet : weak_tablets) {
+        auto tablet = weak_tablet.lock();
+        if (tablet == nullptr) {
+            continue;
+        }
+        if (tablets_info->size() >= num_tablets) {
+            return;
+        }
+        tablets_info->push_back(tablet->get_tablet_info());
+    }
+}
+
 } // namespace doris
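build_all_report_tablets_info above skips tablets that were touched within the configured inactivity window. The predicate reduces to the following sketch (the window value, its units, and the sample numbers are assumptions for illustration only):

#include <cstdint>
#include <iostream>
#include <vector>

// Report only tablets that have been idle longer than the inactivity window.
bool should_report(int64_t now_ms, int64_t last_access_ms, int64_t inactive_window_ms) {
    return now_ms - inactive_window_ms >= last_access_ms;
}

int main() {
    int64_t now = 100'000;
    std::vector<int64_t> last_access = {10'000, 99'500}; // idle vs. recently used
    for (int64_t t : last_access) {
        std::cout << (should_report(now, t, 60'000) ? "report" : "skip") << "\n";
    }
    // prints "report" then "skip"
}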
diff --git a/be/src/cloud/cloud_tablet_mgr.h b/be/src/cloud/cloud_tablet_mgr.h
index 976d483b36c143..cbbd119a36b532 100644
--- a/be/src/cloud/cloud_tablet_mgr.h
+++ b/be/src/cloud/cloud_tablet_mgr.h
@@ -17,6 +17,9 @@
 #pragma once
+#include
+#include
+
 #include
 #include
 #include
@@ -31,6 +34,8 @@ class CloudStorageEngine;
 class LRUCachePolicy;
 class CountDownLatch;
+extern uint64_t g_tablet_report_inactive_duration_ms;
+
 class CloudTabletMgr {
 public:
     CloudTabletMgr(CloudStorageEngine& engine);
@@ -38,7 +43,8 @@ class CloudTabletMgr {
     // If the tablet is in cache, return this tablet directly; otherwise will get tablet meta first,
     // sync rowsets after, and download segment data in background if `warmup_data` is true.
-    Result<std::shared_ptr<CloudTablet>> get_tablet(int64_t tablet_id, bool warmup_data = false);
+    Result<std::shared_ptr<CloudTablet>> get_tablet(int64_t tablet_id, bool warmup_data = false,
+                                                    bool sync_delete_bitmap = true);
     void erase_tablet(int64_t tablet_id);
@@ -65,6 +71,17 @@ class CloudTabletMgr {
                                       std::vector<std::shared_ptr<CloudTablet>>* tablets,
                                       int64_t* max_score);
+    /**
+     * Gets the info of the tablets to report and the total tablet count.
+     *
+     * @param tablets_info used by the report
+     * @param tablet_num total number of tablets in the BE tablet manager
+     */
+    void build_all_report_tablets_info(std::map<TTabletId, TTablet>* tablets_info,
+                                       uint64_t* tablet_num);
+
+    void get_tablet_info(int64_t num_tablets, std::vector<TabletInfo>* tablets_info);
+
 private:
     CloudStorageEngine& _engine;
diff --git a/be/src/cloud/cloud_warm_up_manager.cpp b/be/src/cloud/cloud_warm_up_manager.cpp
index 07beeaeb078a46..06d6df11dc4cc3 100644
--- a/be/src/cloud/cloud_warm_up_manager.cpp
+++ b/be/src/cloud/cloud_warm_up_manager.cpp
@@ -63,14 +63,14 @@ void CloudWarmUpManager::handle_jobs() {
 #ifndef BE_TEST
     constexpr int WAIT_TIME_SECONDS = 600;
     while (true) {
-        JobMeta cur_job;
+        std::shared_ptr<JobMeta> cur_job = nullptr;
         {
             std::unique_lock lock(_mtx);
             _cond.wait(lock, [this]() { return _closed || !_pending_job_metas.empty(); });
             if (_closed) break;
-            cur_job = std::move(_pending_job_metas.front());
+            cur_job = _pending_job_metas.front();
         }
-        for (int64_t tablet_id : cur_job.tablet_ids) {
+        for (int64_t tablet_id : cur_job->tablet_ids) {
             if (_cur_job_id == 0) { // The job is canceled
                 break;
             }
@@ -147,15 +147,13 @@ void CloudWarmUpManager::handle_jobs() {
                         auto schema_ptr = rs->tablet_schema();
                         auto idx_version = schema_ptr->get_inverted_index_storage_format();
                         if (idx_version == InvertedIndexStorageFormatPB::V1) {
-                            for (const auto& index : schema_ptr->indexes()) {
-                                if (index.index_type() == IndexType::INVERTED) {
-                                    wait->add_count();
-                                    auto idx_path = storage_resource.value()->remote_idx_v1_path(
-                                            *rs, seg_id, index.index_id(), index.get_index_suffix());
-                                    download_idx_file(idx_path);
-                                }
+                            for (const auto& index : schema_ptr->inverted_indexes()) {
+                                wait->add_count();
+                                auto idx_path = storage_resource.value()->remote_idx_v1_path(
+                                        *rs, seg_id, index->index_id(), index->get_index_suffix());
+                                download_idx_file(idx_path);
                             }
-                        } else if (idx_version == InvertedIndexStorageFormatPB::V2) {
+                        } else {
                             if (schema_ptr->has_inverted_index()) {
                                 wait->add_count();
                                 auto idx_path =
@@ -173,7 +171,7 @@ void CloudWarmUpManager::handle_jobs() {
         }
         {
             std::unique_lock lock(_mtx);
-            _finish_job.push_back(std::move(cur_job));
+            _finish_job.push_back(cur_job);
             _pending_job_metas.pop_front();
         }
     }
@@ -230,8 +228,9 @@ Status CloudWarmUpManager::check_and_set_batch_id(int64_t job_id, int64_t batch_
 void CloudWarmUpManager::add_job(const std::vector<TJobMeta>& job_metas) {
     {
         std::lock_guard lock(_mtx);
-        std::for_each(job_metas.begin(), job_metas.end(),
-                      [this](const TJobMeta& meta) { _pending_job_metas.emplace_back(meta); });
+        std::for_each(job_metas.begin(), job_metas.end(), [this](const TJobMeta& meta) {
+            _pending_job_metas.emplace_back(std::make_shared<JobMeta>(meta));
+        });
     }
     _cond.notify_all();
 }
diff --git a/be/src/cloud/cloud_warm_up_manager.h b/be/src/cloud/cloud_warm_up_manager.h
index fd034b2c5bc38c..219dedc58065a6 100644
--- a/be/src/cloud/cloud_warm_up_manager.h
+++ b/be/src/cloud/cloud_warm_up_manager.h
@@ -74,8 +74,8 @@ class CloudWarmUpManager {
     std::condition_variable _cond;
     int64_t _cur_job_id {0};
     int64_t _cur_batch_id {-1};
-    std::deque<JobMeta> _pending_job_metas;
-    std::vector<JobMeta> _finish_job;
+    std::deque<std::shared_ptr<JobMeta>> _pending_job_metas;
+    std::vector<std::shared_ptr<JobMeta>> _finish_job;
    std::thread _download_thread;
    bool _closed {false};
    // the attribute for compile in ut
diff --git a/be/src/cloud/config.cpp b/be/src/cloud/config.cpp
index e724dbea84e10c..32e3250f87c258 100644
--- a/be/src/cloud/config.cpp
+++ b/be/src/cloud/config.cpp
@@ -75,4 +75,5 @@ DEFINE_mInt32(tablet_txn_info_min_expired_seconds, "120");
 DEFINE_mBool(enable_use_cloud_unique_id_from_fe, "true");
+DEFINE_mBool(enable_cloud_tablet_report, "true");
 } // namespace doris::config
diff --git a/be/src/cloud/config.h b/be/src/cloud/config.h
index 86197f924d0cad..8af967afb8c67b 100644
--- a/be/src/cloud/config.h
+++ b/be/src/cloud/config.h
@@ -108,4 +108,6 @@ DECLARE_mInt32(tablet_txn_info_min_expired_seconds);
 DECLARE_mBool(enable_use_cloud_unique_id_from_fe);
+DECLARE_mBool(enable_cloud_tablet_report);
+
 } // namespace doris::config
diff --git a/be/src/cloud/injection_point_action.cpp b/be/src/cloud/injection_point_action.cpp
index e0f88debf52f6f..bc6676313c1717 100644
--- a/be/src/cloud/injection_point_action.cpp
+++ b/be/src/cloud/injection_point_action.cpp
@@ -108,6 +108,15 @@ void register_suites() {
         sp->set_call_back("VOlapTableSink::close",
                           [](auto&&) { std::this_thread::sleep_for(std::chrono::seconds(5)); });
     });
+    // curl be_ip:http_port/api/injection_point/apply_suite?name=test_ttl_lru_evict
+    suite_map.emplace("test_ttl_lru_evict", [] {
+        auto* sp = SyncPoint::get_instance();
+        sp->set_call_back("BlockFileCache::change_limit1", [](auto&& args) {
+            LOG(INFO) << "BlockFileCache::change_limit1";
+            auto* limit = try_any_cast(args[0]);
+            *limit = 1;
+        });
+    });
     suite_map.emplace("test_file_segment_cache_corruption", [] {
         auto* sp = SyncPoint::get_instance();
         sp->set_call_back("Segment::open:corruption", [](auto&& args) {
diff --git a/be/src/cloud/pb_convert.cpp b/be/src/cloud/pb_convert.cpp
index 550c08c5481d3a..1f780824e32c3d 100644
--- a/be/src/cloud/pb_convert.cpp
+++ b/be/src/cloud/pb_convert.cpp
@@ -17,6 +17,7 @@
 #include "cloud/pb_convert.h"
+#include
 #include
 #include
@@ -138,19 +139,54 @@ void doris_rowset_meta_to_cloud(RowsetMetaCloudPB* out, RowsetMetaPB&& in) {
     out->mutable_inverted_index_file_info()->Swap(in.mutable_inverted_index_file_info());
 }
-RowsetMetaPB cloud_rowset_meta_to_doris(const RowsetMetaCloudPB& in) {
+static void fill_schema_with_dict(const RowsetMetaCloudPB& in, RowsetMetaPB* out,
+                                  const SchemaCloudDictionary& dict) {
+    std::unordered_map<int32_t, ColumnPB*> unique_id_map;
+    // init map
+    for (ColumnPB& column : *out->mutable_tablet_schema()->mutable_column()) {
+        unique_id_map[column.unique_id()] = &column;
+    }
+    // column info
+    for (size_t i = 0; i < in.schema_dict_key_list().column_dict_key_list_size(); ++i) {
+        int dict_key = in.schema_dict_key_list().column_dict_key_list(i);
+        const ColumnPB& dict_val = dict.column_dict().at(dict_key);
+        ColumnPB& to_add = *out->mutable_tablet_schema()->add_column();
+        to_add = dict_val;
+        VLOG_DEBUG << "fill dict column " << dict_val.ShortDebugString();
+    }
+
+    // index info
+    for (size_t i = 0; i < in.schema_dict_key_list().index_info_dict_key_list_size(); ++i) {
+        int dict_key = in.schema_dict_key_list().index_info_dict_key_list(i);
+        const TabletIndexPB& dict_val = dict.index_dict().at(dict_key);
+        *out->mutable_tablet_schema()->add_index() = dict_val;
+        VLOG_DEBUG << "fill dict index " << dict_val.ShortDebugString();
+    }
+
+    // sparse column info
+    for (size_t i = 0; i < in.schema_dict_key_list().sparse_column_dict_key_list_size(); ++i) {
+        int dict_key = in.schema_dict_key_list().sparse_column_dict_key_list(i);
+        const ColumnPB& dict_val =
dict.column_dict().at(dict_key); + *unique_id_map.at(dict_val.parent_unique_id())->add_sparse_columns() = dict_val; + VLOG_DEBUG << "fill dict sparse column" << dict_val.ShortDebugString(); + } +} + +RowsetMetaPB cloud_rowset_meta_to_doris(const RowsetMetaCloudPB& in, + const SchemaCloudDictionary* dict) { RowsetMetaPB out; - cloud_rowset_meta_to_doris(&out, in); + cloud_rowset_meta_to_doris(&out, in, dict); return out; } -RowsetMetaPB cloud_rowset_meta_to_doris(RowsetMetaCloudPB&& in) { +RowsetMetaPB cloud_rowset_meta_to_doris(RowsetMetaCloudPB&& in, const SchemaCloudDictionary* dict) { RowsetMetaPB out; - cloud_rowset_meta_to_doris(&out, std::move(in)); + cloud_rowset_meta_to_doris(&out, std::move(in), dict); return out; } -void cloud_rowset_meta_to_doris(RowsetMetaPB* out, const RowsetMetaCloudPB& in) { +void cloud_rowset_meta_to_doris(RowsetMetaPB* out, const RowsetMetaCloudPB& in, + const SchemaCloudDictionary* dict) { // ATTN: please keep the set order aligned with the definition of proto `TabletSchemaCloudPB`. out->set_rowset_id(in.rowset_id()); out->set_partition_id(in.partition_id()); @@ -185,6 +221,9 @@ void cloud_rowset_meta_to_doris(RowsetMetaPB* out, const RowsetMetaCloudPB& in) if (in.has_tablet_schema()) { cloud_tablet_schema_to_doris(out->mutable_tablet_schema(), in.tablet_schema()); } + if (dict != nullptr) { + fill_schema_with_dict(in, out, *dict); + } out->set_txn_expiration(in.txn_expiration()); out->set_segments_overlap_pb(in.segments_overlap_pb()); out->mutable_segments_file_size()->CopyFrom(in.segments_file_size()); @@ -198,7 +237,8 @@ void cloud_rowset_meta_to_doris(RowsetMetaPB* out, const RowsetMetaCloudPB& in) out->mutable_inverted_index_file_info()->CopyFrom(in.inverted_index_file_info()); } -void cloud_rowset_meta_to_doris(RowsetMetaPB* out, RowsetMetaCloudPB&& in) { +void cloud_rowset_meta_to_doris(RowsetMetaPB* out, RowsetMetaCloudPB&& in, + const SchemaCloudDictionary* dict) { // ATTN: please keep the set order aligned with the definition of proto `TabletSchemaCloudPB`. 
out->set_rowset_id(in.rowset_id()); out->set_partition_id(in.partition_id()); @@ -234,6 +274,9 @@ void cloud_rowset_meta_to_doris(RowsetMetaPB* out, RowsetMetaCloudPB&& in) { cloud_tablet_schema_to_doris(out->mutable_tablet_schema(), std::move(*in.mutable_tablet_schema())); } + if (dict != nullptr) { + fill_schema_with_dict(in, out, *dict); + } out->set_txn_expiration(in.txn_expiration()); out->set_segments_overlap_pb(in.segments_overlap_pb()); out->mutable_segments_file_size()->Swap(in.mutable_segments_file_size()); @@ -286,6 +329,7 @@ void doris_tablet_schema_to_cloud(TabletSchemaCloudPB* out, const TabletSchemaPB out->mutable_row_store_column_unique_ids()->CopyFrom(in.row_store_column_unique_ids()); out->set_inverted_index_storage_format(in.inverted_index_storage_format()); out->set_enable_variant_flatten_nested(in.variant_enable_flatten_nested()); + out->set_skip_bitmap_col_idx(in.skip_bitmap_col_idx()); } void doris_tablet_schema_to_cloud(TabletSchemaCloudPB* out, TabletSchemaPB&& in) { @@ -313,6 +357,7 @@ void doris_tablet_schema_to_cloud(TabletSchemaCloudPB* out, TabletSchemaPB&& in) out->mutable_row_store_column_unique_ids()->Swap(in.mutable_row_store_column_unique_ids()); out->set_inverted_index_storage_format(in.inverted_index_storage_format()); out->set_enable_variant_flatten_nested(in.variant_enable_flatten_nested()); + out->set_skip_bitmap_col_idx(in.skip_bitmap_col_idx()); } TabletSchemaPB cloud_tablet_schema_to_doris(const TabletSchemaCloudPB& in) { @@ -353,6 +398,7 @@ void cloud_tablet_schema_to_doris(TabletSchemaPB* out, const TabletSchemaCloudPB out->mutable_row_store_column_unique_ids()->CopyFrom(in.row_store_column_unique_ids()); out->set_inverted_index_storage_format(in.inverted_index_storage_format()); out->set_variant_enable_flatten_nested(in.enable_variant_flatten_nested()); + out->set_skip_bitmap_col_idx(in.skip_bitmap_col_idx()); } void cloud_tablet_schema_to_doris(TabletSchemaPB* out, TabletSchemaCloudPB&& in) { @@ -381,6 +427,7 @@ void cloud_tablet_schema_to_doris(TabletSchemaPB* out, TabletSchemaCloudPB&& in) out->mutable_row_store_column_unique_ids()->Swap(in.mutable_row_store_column_unique_ids()); out->set_inverted_index_storage_format(in.inverted_index_storage_format()); out->set_variant_enable_flatten_nested(in.enable_variant_flatten_nested()); + out->set_skip_bitmap_col_idx(in.skip_bitmap_col_idx()); } TabletMetaCloudPB doris_tablet_meta_to_cloud(const TabletMetaPB& in) { diff --git a/be/src/cloud/pb_convert.h b/be/src/cloud/pb_convert.h index 0cfa033f2930a0..31fe43adb11a6d 100644 --- a/be/src/cloud/pb_convert.h +++ b/be/src/cloud/pb_convert.h @@ -24,10 +24,14 @@ RowsetMetaCloudPB doris_rowset_meta_to_cloud(const RowsetMetaPB&); RowsetMetaCloudPB doris_rowset_meta_to_cloud(RowsetMetaPB&&); void doris_rowset_meta_to_cloud(RowsetMetaCloudPB* out, const RowsetMetaPB& in); void doris_rowset_meta_to_cloud(RowsetMetaCloudPB* out, RowsetMetaPB&& in); -RowsetMetaPB cloud_rowset_meta_to_doris(const RowsetMetaCloudPB&); -RowsetMetaPB cloud_rowset_meta_to_doris(RowsetMetaCloudPB&&); -void cloud_rowset_meta_to_doris(RowsetMetaPB* out, const RowsetMetaCloudPB& in); -void cloud_rowset_meta_to_doris(RowsetMetaPB* out, RowsetMetaCloudPB&& in); +RowsetMetaPB cloud_rowset_meta_to_doris(const RowsetMetaCloudPB&, + const SchemaCloudDictionary* dict = nullptr); +RowsetMetaPB cloud_rowset_meta_to_doris(RowsetMetaCloudPB&&, + const SchemaCloudDictionary* dict = nullptr); +void cloud_rowset_meta_to_doris(RowsetMetaPB* out, const RowsetMetaCloudPB& in, + const 
SchemaCloudDictionary* dict = nullptr); +void cloud_rowset_meta_to_doris(RowsetMetaPB* out, RowsetMetaCloudPB&& in, + const SchemaCloudDictionary* dict = nullptr); // TabletSchemaPB <=> TabletSchemaCloudPB TabletSchemaCloudPB doris_tablet_schema_to_cloud(const TabletSchemaPB&); diff --git a/be/src/clucene b/be/src/clucene index 5e9566ab364d71..7cf6cf410d41d9 160000 --- a/be/src/clucene +++ b/be/src/clucene @@ -1 +1 @@ -Subproject commit 5e9566ab364d71b64c436ee46e5c848eed0ab7f7 +Subproject commit 7cf6cf410d41d95456edba263cc55b7b6f5ab027 diff --git a/be/src/common/compile_check_begin.h b/be/src/common/compile_check_begin.h index 8f5358fb89c6ed..6da403f2894885 100644 --- a/be/src/common/compile_check_begin.h +++ b/be/src/common/compile_check_begin.h @@ -15,10 +15,16 @@ // specific language governing permissions and limitations // under the License. -#pragma once +#ifdef COMPILE_CHECK +#error The handling of compile_check_begin.h and compile_check_end.h is not done correctly. +#endif +#define COMPILE_CHECK #ifdef __clang__ #pragma clang diagnostic push -#pragma clang diagnostic error "-Wshorten-64-to-32" +#pragma clang diagnostic error "-Wconversion" +#pragma clang diagnostic ignored "-Wsign-conversion" +#pragma clang diagnostic ignored "-Wfloat-conversion" #endif + //#include "common/compile_check_begin.h" \ No newline at end of file diff --git a/be/src/common/compile_check_end.h b/be/src/common/compile_check_end.h index 491f41b6c27c1c..0897965dc74a3d 100644 --- a/be/src/common/compile_check_end.h +++ b/be/src/common/compile_check_end.h @@ -15,9 +15,9 @@ // specific language governing permissions and limitations // under the License. -#pragma once - #ifdef __clang__ #pragma clang diagnostic pop #endif +#undef COMPILE_CHECK + // #include "common/compile_check_end.h" \ No newline at end of file diff --git a/be/src/common/config.cpp b/be/src/common/config.cpp index 20f9402e8f2fa5..7c8abfeb8f46a1 100644 --- a/be/src/common/config.cpp +++ b/be/src/common/config.cpp @@ -150,7 +150,10 @@ DEFINE_mInt64(stacktrace_in_alloc_large_memory_bytes, "2147483648"); DEFINE_mInt64(crash_in_alloc_large_memory_bytes, "-1"); -// If memory tracker value is inaccurate, BE will crash. usually used in test environments, default value is false. +// The actual meaning of this parameter is `debug_memory`. +// 1. crash in memory tracker inaccurate, if memory tracker value is inaccurate, BE will crash. +// usually used in test environments, default value is false. +// 2. print more memory logs. DEFINE_mBool(crash_in_memory_tracker_inaccurate, "false"); // default is true. if any memory tracking in Orphan mem tracker will report error. @@ -393,7 +396,7 @@ DEFINE_mInt64(base_compaction_max_compaction_score, "20"); DEFINE_mDouble(base_compaction_min_data_ratio, "0.3"); DEFINE_mInt64(base_compaction_dup_key_max_file_size_mbytes, "1024"); -DEFINE_Bool(enable_skip_tablet_compaction, "true"); +DEFINE_Bool(enable_skip_tablet_compaction, "false"); // output rowset of cumulative compaction total disk size exceed this config size, // this rowset will be given to base compaction, unit is m byte. DEFINE_mInt64(compaction_promotion_size_mbytes, "1024"); @@ -537,7 +540,6 @@ DEFINE_mInt32(streaming_load_rpc_max_alive_time_sec, "1200"); DEFINE_Int32(tablet_writer_open_rpc_timeout_sec, "60"); // You can ignore brpc error '[E1011]The server is overcrowded' when writing data. 
DEFINE_mBool(tablet_writer_ignore_eovercrowded, "true"); -DEFINE_mBool(exchange_sink_ignore_eovercrowded, "true"); DEFINE_mInt32(slave_replica_writer_rpc_timeout_sec, "60"); // Whether to enable stream load record function, the default is false. // False: disable stream load record @@ -898,9 +900,10 @@ DEFINE_mInt32(orc_natural_read_size_mb, "8"); DEFINE_mInt64(big_column_size_buffer, "65535"); DEFINE_mInt64(small_column_size_buffer, "100"); -// rf will decide whether the next sampling_frequency blocks need to be filtered based on the filtering rate of the current block. +// Perform the always_true check at intervals determined by runtime_filter_sampling_frequency DEFINE_mInt32(runtime_filter_sampling_frequency, "64"); - +DEFINE_mInt32(execution_max_rpc_timeout_sec, "3600"); +DEFINE_mBool(execution_ignore_eovercrowded, "true"); // cooldown task configs DEFINE_Int32(cooldown_thread_num, "5"); DEFINE_mInt64(generate_cooldown_task_interval_sec, "20"); @@ -922,6 +925,9 @@ DEFINE_mBool(enable_query_like_bloom_filter, "true"); DEFINE_Int32(doris_remote_scanner_thread_pool_thread_num, "48"); // number of s3 scanner thread pool queue size DEFINE_Int32(doris_remote_scanner_thread_pool_queue_size, "102400"); +DEFINE_mInt64(block_cache_wait_timeout_ms, "1000"); +DEFINE_mInt64(cache_lock_long_tail_threshold, "1000"); +DEFINE_Int64(file_cache_recycle_keys_size, "1000000"); // limit the queue of pending batches which will be sent by a single nodechannel DEFINE_mInt64(nodechannel_pending_queue_max_bytes, "67108864"); @@ -976,6 +982,8 @@ DEFINE_Int32(pipeline_executor_size, "0"); DEFINE_Bool(enable_workload_group_for_scan, "false"); DEFINE_mInt64(workload_group_scan_task_wait_timeout_ms, "10000"); +// Whether use schema dict in backend side instead of MetaService side(cloud mode) +DEFINE_mBool(variant_use_cloud_schema_dict, "true"); DEFINE_mDouble(variant_ratio_of_defaults_as_sparse_column, "1"); DEFINE_mInt64(variant_threshold_rows_to_estimate_sparse_column, "2048"); DEFINE_mBool(variant_throw_exeception_on_invalid_json, "false"); @@ -995,7 +1003,7 @@ DEFINE_Bool(enable_file_cache, "false"); // or use the default storage value: // {"path": "memory", "total_size":53687091200} // Both will use the directory "memory" on the disk instead of the real RAM. -DEFINE_String(file_cache_path, ""); +DEFINE_String(file_cache_path, "[{\"path\":\"${DORIS_HOME}/file_cache\"}]"); DEFINE_Int64(file_cache_each_block_size, "1048576"); // 1MB DEFINE_Bool(clear_file_cache, "false"); @@ -1003,13 +1011,11 @@ DEFINE_Bool(enable_file_cache_query_limit, "false"); DEFINE_mInt32(file_cache_enter_disk_resource_limit_mode_percent, "90"); DEFINE_mInt32(file_cache_exit_disk_resource_limit_mode_percent, "80"); DEFINE_mBool(enable_read_cache_file_directly, "false"); -DEFINE_mBool(file_cache_enable_evict_from_other_queue_by_size, "false"); +DEFINE_mBool(file_cache_enable_evict_from_other_queue_by_size, "true"); DEFINE_mInt64(file_cache_ttl_valid_check_interval_second, "0"); // zero for not checking // If true, evict the ttl cache using LRU when full. // Otherwise, only expiration can evict ttl and new data won't add to cache when full. 
DEFINE_Bool(enable_ttl_cache_evict_using_lru, "true"); -// rename ttl filename to new format during read, with some performance cost -DEFINE_mBool(translate_to_new_ttl_format_during_read, "false"); DEFINE_mBool(enbale_dump_error_file, "true"); // limit the max size of error log on disk DEFINE_mInt64(file_cache_error_log_limit_bytes, "209715200"); // 200MB @@ -1019,7 +1025,7 @@ DEFINE_mInt32(inverted_index_cache_stale_sweep_time_sec, "600"); // inverted index searcher cache size DEFINE_String(inverted_index_searcher_cache_limit, "10%"); DEFINE_Bool(enable_inverted_index_cache_check_timestamp, "true"); -DEFINE_Int32(inverted_index_fd_number_limit_percent, "40"); // 40% +DEFINE_Int32(inverted_index_fd_number_limit_percent, "20"); // 20% DEFINE_Int32(inverted_index_query_cache_shards, "256"); // inverted index match bitmap cache size @@ -1068,9 +1074,9 @@ DEFINE_mInt32(schema_cache_sweep_time_sec, "100"); // max number of segment cache, default -1 for backward compatibility fd_number*2/5 DEFINE_Int32(segment_cache_capacity, "-1"); -DEFINE_Int32(segment_cache_fd_percentage, "40"); -DEFINE_mInt32(estimated_mem_per_column_reader, "1024"); -DEFINE_Int32(segment_cache_memory_percentage, "2"); +DEFINE_Int32(segment_cache_fd_percentage, "20"); +DEFINE_mInt32(estimated_mem_per_column_reader, "512"); +DEFINE_Int32(segment_cache_memory_percentage, "5"); // enable feature binlog, default false DEFINE_Bool(enable_feature_binlog, "false"); @@ -1284,7 +1290,7 @@ DEFINE_Int64(num_s3_file_upload_thread_pool_min_thread, "16"); // The max thread num for S3FileUploadThreadPool DEFINE_Int64(num_s3_file_upload_thread_pool_max_thread, "64"); // The max ratio for ttl cache's size -DEFINE_mInt64(max_ttl_cache_ratio, "90"); +DEFINE_mInt64(max_ttl_cache_ratio, "50"); // The maximum jvm heap usage ratio for hdfs write workload DEFINE_mDouble(max_hdfs_wirter_jni_heap_usage_ratio, "0.5"); // The sleep milliseconds duration when hdfs write exceeds the maximum usage @@ -1345,8 +1351,14 @@ DEFINE_mInt32(lz4_compression_block_size, "262144"); DEFINE_mBool(enable_pipeline_task_leakage_detect, "false"); +DEFINE_mInt32(check_score_rounds_num, "1000"); + DEFINE_Int32(query_cache_size, "512"); +DEFINE_mBool(enable_delete_bitmap_merge_on_compaction, "false"); +// Enable validation to check the correctness of table size. +DEFINE_Bool(enable_table_size_correctness_check, "false"); + // clang-format off #ifdef BE_TEST // test s3 @@ -1683,6 +1695,13 @@ bool init(const char* conf_file, bool fill_conf_map, bool must_exist, bool set_t SET_FIELD(it.second, std::vector, fill_conf_map, set_to_default); } + if (config::is_cloud_mode()) { + auto st = config::set_config("enable_file_cache", "true", true, true); + LOG(INFO) << "set config enable_file_cache " + << "true" + << " " << st; + } + return true; } diff --git a/be/src/common/config.h b/be/src/common/config.h index 94435bf83fca78..d6a581a7614c8d 100644 --- a/be/src/common/config.h +++ b/be/src/common/config.h @@ -200,7 +200,10 @@ DECLARE_mInt64(stacktrace_in_alloc_large_memory_bytes); // modify this parameter to crash when large memory allocation occur will help DECLARE_mInt64(crash_in_alloc_large_memory_bytes); -// If memory tracker value is inaccurate, BE will crash. usually used in test environments, default value is false. +// The actual meaning of this parameter is `debug_memory`. +// 1. crash in memory tracker inaccurate, if memory tracker value is inaccurate, BE will crash. +// usually used in test environments, default value is false. +// 2. print more memory logs. 
DECLARE_mBool(crash_in_memory_tracker_inaccurate); // default is true. if any memory tracking in Orphan mem tracker will report error. @@ -584,7 +587,6 @@ DECLARE_mInt32(streaming_load_rpc_max_alive_time_sec); DECLARE_Int32(tablet_writer_open_rpc_timeout_sec); // You can ignore brpc error '[E1011]The server is overcrowded' when writing data. DECLARE_mBool(tablet_writer_ignore_eovercrowded); -DECLARE_mBool(exchange_sink_ignore_eovercrowded); DECLARE_mInt32(slave_replica_writer_rpc_timeout_sec); // Whether to enable stream load record function, the default is false. // False: disable stream load record @@ -955,6 +957,8 @@ DECLARE_mInt64(big_column_size_buffer); DECLARE_mInt64(small_column_size_buffer); DECLARE_mInt32(runtime_filter_sampling_frequency); +DECLARE_mInt32(execution_max_rpc_timeout_sec); +DECLARE_mBool(execution_ignore_eovercrowded); // cooldown task configs DECLARE_Int32(cooldown_thread_num); @@ -981,6 +985,9 @@ DECLARE_mInt64(nodechannel_pending_queue_max_bytes); // The batch size for sending data by brpc streaming client DECLARE_mInt64(brpc_streaming_client_batch_bytes); +DECLARE_mInt64(block_cache_wait_timeout_ms); +DECLARE_mInt64(cache_lock_long_tail_threshold); +DECLARE_Int64(file_cache_recycle_keys_size); DECLARE_Bool(enable_brpc_builtin_services); @@ -1056,8 +1063,6 @@ DECLARE_mInt64(file_cache_ttl_valid_check_interval_second); // If true, evict the ttl cache using LRU when full. // Otherwise, only expiration can evict ttl and new data won't add to cache when full. DECLARE_Bool(enable_ttl_cache_evict_using_lru); -// rename ttl filename to new format during read, with some performance cost -DECLARE_Bool(translate_to_new_ttl_format_during_read); DECLARE_mBool(enbale_dump_error_file); // limit the max size of error log on disk DECLARE_mInt64(file_cache_error_log_limit_bytes); @@ -1175,6 +1180,7 @@ DECLARE_mInt64(LZ4_HC_compression_level); // Threshold of a column as sparse column // Notice: TEST ONLY DECLARE_mDouble(variant_ratio_of_defaults_as_sparse_column); +DECLARE_mBool(variant_use_cloud_schema_dict); // Threshold to estimate a column is sparsed // Notice: TEST ONLY DECLARE_mInt64(variant_threshold_rows_to_estimate_sparse_column); @@ -1430,9 +1436,15 @@ DECLARE_mInt32(lz4_compression_block_size); DECLARE_mBool(enable_pipeline_task_leakage_detect); +DECLARE_mInt32(check_score_rounds_num); + // MB DECLARE_Int32(query_cache_size); +DECLARE_mBool(enable_delete_bitmap_merge_on_compaction); +// Enable validation to check the correctness of table size. 
+DECLARE_Bool(enable_table_size_correctness_check); + #ifdef BE_TEST // test s3 DECLARE_String(test_s3_resource); diff --git a/be/src/common/daemon.cpp b/be/src/common/daemon.cpp index 5da49758865c1c..ce2a6878dba034 100644 --- a/be/src/common/daemon.cpp +++ b/be/src/common/daemon.cpp @@ -27,17 +27,13 @@ // IWYU pragma: no_include #include #include -#include #include #include -#include -#include // IWYU pragma: no_include #include // IWYU pragma: keep #include #include -#include #include #include "cloud/config.h" @@ -45,30 +41,23 @@ #include "common/logging.h" #include "common/status.h" #include "olap/memtable_memory_limiter.h" -#include "olap/options.h" #include "olap/storage_engine.h" #include "olap/tablet_manager.h" #include "runtime/be_proc_monitor.h" -#include "runtime/client_cache.h" #include "runtime/exec_env.h" #include "runtime/fragment_mgr.h" #include "runtime/memory/global_memory_arbitrator.h" -#include "runtime/memory/mem_tracker.h" #include "runtime/memory/mem_tracker_limiter.h" #include "runtime/memory/memory_reclamation.h" +#include "runtime/process_profile.h" #include "runtime/runtime_query_statistics_mgr.h" #include "runtime/workload_group/workload_group_manager.h" #include "util/algorithm_util.h" -#include "util/cpu_info.h" -#include "util/debug_util.h" -#include "util/disk_info.h" #include "util/doris_metrics.h" #include "util/mem_info.h" #include "util/metrics.h" -#include "util/network_util.h" #include "util/perf_counters.h" #include "util/system_metrics.h" -#include "util/thrift_util.h" #include "util/time.h" namespace doris { @@ -233,9 +222,8 @@ void refresh_memory_state_after_memory_change() { if (abs(last_print_proc_mem - PerfCounters::get_vm_rss()) > 268435456) { last_print_proc_mem = PerfCounters::get_vm_rss(); doris::MemTrackerLimiter::clean_tracker_limiter_group(); - doris::MemTrackerLimiter::enable_print_log_process_usage(); - // Refresh mem tracker each type counter. - doris::MemTrackerLimiter::refresh_global_counter(); + doris::ProcessProfile::instance()->memory_profile()->enable_print_log_process_usage(); + doris::ProcessProfile::instance()->memory_profile()->refresh_memory_overview_profile(); LOG(INFO) << doris::GlobalMemoryArbitrator:: process_mem_log_str(); // print mem log when memory state by 256M } @@ -296,6 +284,7 @@ void Daemon::memory_maintenance_thread() { // TODO replace memory_gc_thread. // step 6. Refresh weighted memory ratio of workload groups. + doris::ExecEnv::GetInstance()->workload_group_mgr()->do_sweep(); doris::ExecEnv::GetInstance()->workload_group_mgr()->refresh_wg_weighted_memory_limit(); // step 7. Analyze blocking queries. @@ -338,10 +327,12 @@ void Daemon::memory_gc_thread() { memory_full_gc_sleep_time_ms = memory_gc_sleep_time_ms; memory_minor_gc_sleep_time_ms = memory_gc_sleep_time_ms; LOG(INFO) << fmt::format("[MemoryGC] start full GC, {}.", mem_info); - doris::MemTrackerLimiter::print_log_process_usage(); + doris::ProcessProfile::instance()->memory_profile()->print_log_process_usage(); if (doris::MemoryReclamation::process_full_gc(std::move(mem_info))) { // If there is not enough memory to be gc, the process memory usage will not be printed in the next continuous gc. 
- doris::MemTrackerLimiter::enable_print_log_process_usage(); + doris::ProcessProfile::instance() + ->memory_profile() + ->enable_print_log_process_usage(); } } else if (memory_minor_gc_sleep_time_ms <= 0 && (sys_mem_available < doris::MemInfo::sys_mem_available_warning_water_mark() || @@ -351,9 +342,11 @@ void Daemon::memory_gc_thread() { doris::GlobalMemoryArbitrator::process_soft_limit_exceeded_errmsg_str(); memory_minor_gc_sleep_time_ms = memory_gc_sleep_time_ms; LOG(INFO) << fmt::format("[MemoryGC] start minor GC, {}.", mem_info); - doris::MemTrackerLimiter::print_log_process_usage(); + doris::ProcessProfile::instance()->memory_profile()->print_log_process_usage(); if (doris::MemoryReclamation::process_minor_gc(std::move(mem_info))) { - doris::MemTrackerLimiter::enable_print_log_process_usage(); + doris::ProcessProfile::instance() + ->memory_profile() + ->enable_print_log_process_usage(); } } else { if (memory_full_gc_sleep_time_ms > 0) { diff --git a/be/src/common/status.h b/be/src/common/status.h index e95b93431679a2..91386a5887e63e 100644 --- a/be/src/common/status.h +++ b/be/src/common/status.h @@ -76,6 +76,7 @@ namespace ErrorCode { TStatusError(HTTP_ERROR, true); \ TStatusError(TABLET_MISSING, true); \ TStatusError(NOT_MASTER, true); \ + TStatusError(OBTAIN_LOCK_FAILED, false); \ TStatusError(DELETE_BITMAP_LOCK_ERROR, false); // E error_name, error_code, print_stacktrace #define APPLY_FOR_OLAP_ERROR_CODES(E) \ @@ -478,7 +479,7 @@ class [[nodiscard]] Status { ERROR_CTOR_NOSTACK(Cancelled, CANCELLED) ERROR_CTOR(MemoryLimitExceeded, MEM_LIMIT_EXCEEDED) ERROR_CTOR(RpcError, THRIFT_RPC_ERROR) - ERROR_CTOR(TimedOut, TIMEOUT) + ERROR_CTOR_NOSTACK(TimedOut, TIMEOUT) ERROR_CTOR_NOSTACK(TooManyTasks, TOO_MANY_TASKS) ERROR_CTOR(Uninitialized, UNINITIALIZED) ERROR_CTOR(Aborted, ABORTED) @@ -487,6 +488,7 @@ class [[nodiscard]] Status { ERROR_CTOR(HttpError, HTTP_ERROR) ERROR_CTOR_NOSTACK(NeedSendAgain, NEED_SEND_AGAIN) ERROR_CTOR_NOSTACK(CgroupError, CGROUP_ERROR) + ERROR_CTOR_NOSTACK(ObtainLockFailed, OBTAIN_LOCK_FAILED) #undef ERROR_CTOR template diff --git a/be/src/exec/lzo_decompressor.cpp b/be/src/exec/lzo_decompressor.cpp index c8cf0499508f2d..b075509202b70f 100644 --- a/be/src/exec/lzo_decompressor.cpp +++ b/be/src/exec/lzo_decompressor.cpp @@ -15,6 +15,7 @@ // specific language governing permissions and limitations // under the License. +#include "common/logging.h" #include "exec/decompressor.h" #include "olap/utils.h" #include "orc/Exceptions.hh" @@ -197,11 +198,11 @@ Status LzopDecompressor::decompress(uint8_t* input, size_t input_len, size_t* in *decompressed_len = uncompressed_size; *input_bytes_read += ptr - block_start; - LOG(INFO) << "finished decompress lzo block." - << " compressed_size: " << compressed_size - << " decompressed_len: " << *decompressed_len - << " input_bytes_read: " << *input_bytes_read - << " next_uncompressed_size: " << next_uncompressed_size; + VLOG_DEBUG << "finished decompress lzo block." 
+ << " compressed_size: " << compressed_size + << " decompressed_len: " << *decompressed_len + << " input_bytes_read: " << *input_bytes_read + << " next_uncompressed_size: " << next_uncompressed_size; return Status::OK(); } @@ -222,9 +223,9 @@ Status LzopDecompressor::decompress(uint8_t* input, size_t input_len, size_t* in Status LzopDecompressor::parse_header_info(uint8_t* input, size_t input_len, size_t* input_bytes_read, size_t* more_input_bytes) { if (input_len < MIN_HEADER_SIZE) { - LOG(INFO) << "highly recommanded that Lzo header size is larger than " << MIN_HEADER_SIZE - << ", or parsing header info may failed." - << " only given: " << input_len; + VLOG_NOTICE << "highly recommanded that Lzo header size is larger than " << MIN_HEADER_SIZE + << ", or parsing header info may failed." + << " only given: " << input_len; *more_input_bytes = MIN_HEADER_SIZE - input_len; return Status::OK(); } @@ -362,7 +363,7 @@ Status LzopDecompressor::parse_header_info(uint8_t* input, size_t input_len, *input_bytes_read = _header_info.header_size; _is_header_loaded = true; - LOG(INFO) << debug_info(); + VLOG_DEBUG << debug_info(); return Status::OK(); } diff --git a/be/src/exec/olap_common.h b/be/src/exec/olap_common.h index a4180938dfc536..c30adf7d2fbdd1 100644 --- a/be/src/exec/olap_common.h +++ b/be/src/exec/olap_common.h @@ -617,6 +617,18 @@ bool ColumnValueRange::convert_to_avg_range_value( std::vector& begin_scan_keys, std::vector& end_scan_keys, bool& begin_include, bool& end_include, int32_t max_scan_key_num) { if constexpr (!_is_reject_split_type) { + CppType min_value = get_range_min_value(); + CppType max_value = get_range_max_value(); + if constexpr (primitive_type == PrimitiveType::TYPE_DATE) { + min_value.set_type(TimeType::TIME_DATE); + max_value.set_type(TimeType::TIME_DATE); + } + auto empty_range_only_null = min_value > max_value; + if (empty_range_only_null) { + // Not contain null will be disposed in `convert_to_close_range`, return eos. + DCHECK(contain_null()); + } + auto no_split = [&]() -> bool { begin_scan_keys.emplace_back(); begin_scan_keys.back().add_value( @@ -624,18 +636,11 @@ bool ColumnValueRange::convert_to_avg_range_value( contain_null()); end_scan_keys.emplace_back(); end_scan_keys.back().add_value( - cast_to_string(get_range_max_value(), scale())); + cast_to_string(get_range_max_value(), scale()), + empty_range_only_null ? 
true : false); return true; }; - - CppType min_value = get_range_min_value(); - CppType max_value = get_range_max_value(); - if constexpr (primitive_type == PrimitiveType::TYPE_DATE) { - min_value.set_type(TimeType::TIME_DATE); - max_value.set_type(TimeType::TIME_DATE); - } - - if (min_value > max_value || max_scan_key_num == 1) { + if (empty_range_only_null || max_scan_key_num == 1) { return no_split(); } @@ -1028,7 +1033,8 @@ Status OlapScanKeys::extend_scan_key(ColumnValueRange& range, *eos |= range.convert_to_close_range(_begin_scan_keys, _end_scan_keys, _begin_include, _end_include); - if (range.convert_to_avg_range_value(_begin_scan_keys, _end_scan_keys, _begin_include, + if (!(*eos) && + range.convert_to_avg_range_value(_begin_scan_keys, _end_scan_keys, _begin_include, _end_include, max_scan_key_num)) { _has_range_value = true; } diff --git a/be/src/exec/schema_scanner.cpp b/be/src/exec/schema_scanner.cpp index 90140e748f5d6b..39dd45163322ac 100644 --- a/be/src/exec/schema_scanner.cpp +++ b/be/src/exec/schema_scanner.cpp @@ -33,6 +33,7 @@ #include "exec/schema_scanner/schema_collations_scanner.h" #include "exec/schema_scanner/schema_columns_scanner.h" #include "exec/schema_scanner/schema_dummy_scanner.h" +#include "exec/schema_scanner/schema_file_cache_statistics.h" #include "exec/schema_scanner/schema_files_scanner.h" #include "exec/schema_scanner/schema_metadata_name_ids_scanner.h" #include "exec/schema_scanner/schema_partitions_scanner.h" @@ -77,9 +78,6 @@ namespace doris { class ObjectPool; -SchemaScanner::SchemaScanner(const std::vector& columns) - : _is_init(false), _columns(columns), _schema_table_type(TSchemaTableType::SCH_INVALID) {} - SchemaScanner::SchemaScanner(const std::vector& columns, TSchemaTableType::type type) : _is_init(false), _columns(columns), _schema_table_type(type) {} @@ -125,7 +123,6 @@ Status SchemaScanner::get_next_block_async(RuntimeState* state) { return; } SCOPED_ATTACH_TASK(state); - _dependency->block(); _async_thread_running = true; _finish_dependency->block(); if (!_opened) { @@ -150,19 +147,6 @@ Status SchemaScanner::get_next_block_async(RuntimeState* state) { return Status::OK(); } -Status SchemaScanner::get_next_block_internal(vectorized::Block* block, bool* eos) { - if (!_is_init) { - return Status::InternalError("used before initialized."); - } - - if (nullptr == block || nullptr == eos) { - return Status::InternalError("input pointer is nullptr."); - } - - *eos = true; - return Status::OK(); -} - Status SchemaScanner::init(SchemaScannerParam* param, ObjectPool* pool) { if (_is_init) { return Status::OK(); @@ -241,6 +225,8 @@ std::unique_ptr SchemaScanner::create(TSchemaTableType::type type return SchemaBackendWorkloadGroupResourceUsage::create_unique(); case TSchemaTableType::SCH_TABLE_PROPERTIES: return SchemaTablePropertiesScanner::create_unique(); + case TSchemaTableType::SCH_FILE_CACHE_STATISTICS: + return SchemaFileCacheStatisticsScanner::create_unique(); case TSchemaTableType::SCH_CATALOG_META_CACHE_STATISTICS: return SchemaCatalogMetaCacheStatsScanner::create_unique(); default: @@ -426,21 +412,18 @@ Status SchemaScanner::insert_block_column(TCell cell, int col_index, vectorized: case TYPE_BIGINT: { reinterpret_cast*>(col_ptr)->insert_value( cell.longVal); - nullable_column->get_null_map_data().emplace_back(0); break; } case TYPE_INT: { reinterpret_cast*>(col_ptr)->insert_value( cell.intVal); - nullable_column->get_null_map_data().emplace_back(0); break; } case TYPE_BOOLEAN: { reinterpret_cast*>(col_ptr)->insert_value( cell.boolVal); - 
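convert_to_avg_range_value now detects an empty value range (min > max) that can only match NULL rows and emits a single scan key instead of splitting. A toy version of that decision (OlapTuple and the key machinery replaced by plain values; names here are illustrative, not the BE's API):

#include <iostream>

// If min > max the range matches no concrete values; if NULL is still
// possible, emit one null-only key pair instead of splitting, otherwise
// the caller can simply return eos.
int main() {
    int min_v = 10, max_v = 5; // empty range after convert_to_close_range
    bool contains_null = true;
    bool empty_range_only_null = min_v > max_v;
    if (empty_range_only_null && !contains_null) {
        std::cout << "eos\n";
        return 0;
    }
    // The end key carries the null flag when the range is null-only,
    // mirroring add_value(..., empty_range_only_null) above.
    std::cout << "begin=" << min_v << " null=" << contains_null << "\n";
    std::cout << "end=" << max_v << " null=" << empty_range_only_null << "\n";
}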
diff --git a/be/src/exec/schema_scanner.cpp b/be/src/exec/schema_scanner.cpp index 90140e748f5d6b..39dd45163322ac 100644 --- a/be/src/exec/schema_scanner.cpp +++ b/be/src/exec/schema_scanner.cpp @@ -33,6 +33,7 @@ #include "exec/schema_scanner/schema_collations_scanner.h" #include "exec/schema_scanner/schema_columns_scanner.h" #include "exec/schema_scanner/schema_dummy_scanner.h" +#include "exec/schema_scanner/schema_file_cache_statistics.h" #include "exec/schema_scanner/schema_files_scanner.h" #include "exec/schema_scanner/schema_metadata_name_ids_scanner.h" #include "exec/schema_scanner/schema_partitions_scanner.h" @@ -77,9 +78,6 @@ namespace doris { class ObjectPool; -SchemaScanner::SchemaScanner(const std::vector<ColumnDesc>& columns) - : _is_init(false), _columns(columns), _schema_table_type(TSchemaTableType::SCH_INVALID) {} - SchemaScanner::SchemaScanner(const std::vector<ColumnDesc>& columns, TSchemaTableType::type type) : _is_init(false), _columns(columns), _schema_table_type(type) {} @@ -125,7 +123,6 @@ Status SchemaScanner::get_next_block_async(RuntimeState* state) { return; } SCOPED_ATTACH_TASK(state); - _dependency->block(); _async_thread_running = true; _finish_dependency->block(); if (!_opened) { @@ -150,19 +147,6 @@ Status SchemaScanner::get_next_block_async(RuntimeState* state) { return Status::OK(); } -Status SchemaScanner::get_next_block_internal(vectorized::Block* block, bool* eos) { - if (!_is_init) { - return Status::InternalError("used before initialized."); - } - - if (nullptr == block || nullptr == eos) { - return Status::InternalError("input pointer is nullptr."); - } - - *eos = true; - return Status::OK(); -} - Status SchemaScanner::init(SchemaScannerParam* param, ObjectPool* pool) { if (_is_init) { return Status::OK(); } @@ -241,6 +225,8 @@ std::unique_ptr<SchemaScanner> SchemaScanner::create(TSchemaTableType::type type return SchemaBackendWorkloadGroupResourceUsage::create_unique(); case TSchemaTableType::SCH_TABLE_PROPERTIES: return SchemaTablePropertiesScanner::create_unique(); + case TSchemaTableType::SCH_FILE_CACHE_STATISTICS: + return SchemaFileCacheStatisticsScanner::create_unique(); case TSchemaTableType::SCH_CATALOG_META_CACHE_STATISTICS: return SchemaCatalogMetaCacheStatsScanner::create_unique(); default: @@ -426,21 +412,18 @@ Status SchemaScanner::insert_block_column(TCell cell, int col_index, vectorized: case TYPE_BIGINT: { reinterpret_cast<vectorized::ColumnVector<vectorized::Int64>*>(col_ptr)->insert_value( cell.longVal); - nullable_column->get_null_map_data().emplace_back(0); break; } case TYPE_INT: { reinterpret_cast<vectorized::ColumnVector<vectorized::Int32>*>(col_ptr)->insert_value( cell.intVal); - nullable_column->get_null_map_data().emplace_back(0); break; } case TYPE_BOOLEAN: { reinterpret_cast<vectorized::ColumnVector<vectorized::UInt8>*>(col_ptr)->insert_value( cell.boolVal); - nullable_column->get_null_map_data().emplace_back(0); break; } @@ -449,7 +432,6 @@ Status SchemaScanner::insert_block_column(TCell cell, int col_index, vectorized: case TYPE_CHAR: { reinterpret_cast<vectorized::ColumnString*>(col_ptr)->insert_data(cell.stringVal.data(), cell.stringVal.size()); - nullable_column->get_null_map_data().emplace_back(0); break; } @@ -461,7 +443,6 @@ Status SchemaScanner::insert_block_column(TCell cell, int col_index, vectorized: auto data = datas[0]; reinterpret_cast<vectorized::ColumnVector<vectorized::Int64>*>(col_ptr)->insert_data( reinterpret_cast<char*>(data), 0); - nullable_column->get_null_map_data().emplace_back(0); break; } default: { @@ -470,6 +451,7 @@ return Status::InternalError(ss.str()); } } + nullable_column->get_null_map_data().emplace_back(0); return Status::OK(); } diff --git a/be/src/exec/schema_scanner.h b/be/src/exec/schema_scanner.h index da61d58b943fc4..440912bff1d729 100644 --- a/be/src/exec/schema_scanner.h +++ b/be/src/exec/schema_scanner.h @@ -19,10 +19,10 @@ #include #include -#include -#include #include +#include +#include #include #include #include @@ -82,8 +82,6 @@ struct SchemaScannerParam { // virtual scanner for all schema table class SchemaScanner { - ENABLE_FACTORY_CREATOR(SchemaScanner); - public: struct ColumnDesc { const char* name = nullptr; @@ -94,8 +92,8 @@ class SchemaScanner { int precision = -1; int scale = -1; }; - SchemaScanner(const std::vector<ColumnDesc>& columns); - SchemaScanner(const std::vector<ColumnDesc>& columns, TSchemaTableType::type type); + SchemaScanner(const std::vector<ColumnDesc>& columns, + TSchemaTableType::type type = TSchemaTableType::SCH_INVALID); virtual ~SchemaScanner(); // init object need information, schema etc. @@ -103,7 +101,7 @@ class SchemaScanner { Status get_next_block(RuntimeState* state, vectorized::Block* block, bool* eos); // Start to work virtual Status start(RuntimeState* state); - virtual Status get_next_block_internal(vectorized::Block* block, bool* eos); + virtual Status get_next_block_internal(vectorized::Block* block, bool* eos) = 0; const std::vector<ColumnDesc>& get_column_desc() const { return _columns; } // factory function static std::unique_ptr<SchemaScanner> create(TSchemaTableType::type type); diff --git a/be/src/exec/schema_scanner/schema_columns_scanner.cpp b/be/src/exec/schema_scanner/schema_columns_scanner.cpp index f4e15d2aef0af2..8325a7f5dc4f2d 100644 --- a/be/src/exec/schema_scanner/schema_columns_scanner.cpp +++ b/be/src/exec/schema_scanner/schema_columns_scanner.cpp @@ -160,6 +160,12 @@ std::string SchemaColumnsScanner::_to_mysql_data_type_string(TColumnDesc& desc) case TPrimitiveType::STRUCT: { return "struct"; } + case TPrimitiveType::IPV4: + return "ipv4"; + case TPrimitiveType::IPV6: + return "ipv6"; + case TPrimitiveType::VARIANT: + return "variant"; default: return "unknown"; } @@ -272,7 +278,12 @@ std::string SchemaColumnsScanner::_type_to_string(TColumnDesc& desc) { ret += ">"; return ret; } - + case TPrimitiveType::IPV4: + return "ipv4"; + case TPrimitiveType::IPV6: + return "ipv6"; + case TPrimitiveType::VARIANT: + return "variant"; default: return "unknown"; }
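A hedged sketch of the contract every concrete scanner must now satisfy: with `get_next_block_internal()` made pure virtual, the old do-nothing default body has to live in the subclass. The class name here is illustrative, not from the patch.

```cpp
#include "exec/schema_scanner.h"

namespace doris {
class MinimalSchemaScanner : public SchemaScanner {
public:
    MinimalSchemaScanner(const std::vector<ColumnDesc>& columns)
            : SchemaScanner(columns) {} // the table type now defaults to SCH_INVALID

    Status get_next_block_internal(vectorized::Block* block, bool* eos) override {
        if (!_is_init) {
            return Status::InternalError("used before initialized.");
        }
        if (block == nullptr || eos == nullptr) {
            return Status::InternalError("input pointer is nullptr.");
        }
        *eos = true; // emit nothing, mirroring the removed base-class body
        return Status::OK();
    }
};
} // namespace doris
```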
diff --git a/be/src/exec/schema_scanner/schema_file_cache_statistics.cpp b/be/src/exec/schema_scanner/schema_file_cache_statistics.cpp new file mode 100644 index 00000000000000..ecad274d218983 --- /dev/null +++ b/be/src/exec/schema_scanner/schema_file_cache_statistics.cpp @@ -0,0 +1,88 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "exec/schema_scanner/schema_file_cache_statistics.h" + +#include "io/cache/block_file_cache_factory.h" +#include "runtime/exec_env.h" +#include "runtime/runtime_state.h" +#include "vec/common/string_ref.h" +#include "vec/core/block.h" +#include "vec/data_types/data_type_factory.hpp" + +namespace doris { + +std::vector<SchemaScanner::ColumnDesc> SchemaFileCacheStatisticsScanner::_s_tbls_columns = { + // name, type, size + {"BE_ID", TYPE_BIGINT, sizeof(int64_t), false}, + {"BE_IP", TYPE_STRING, sizeof(StringRef), false}, + {"CACHE_PATH", TYPE_STRING, sizeof(StringRef), false}, + {"METRIC_NAME", TYPE_STRING, sizeof(StringRef), false}, + {"METRIC_VALUE", TYPE_STRING, sizeof(StringRef), false}}; + +SchemaFileCacheStatisticsScanner::SchemaFileCacheStatisticsScanner() + : SchemaScanner(_s_tbls_columns, TSchemaTableType::SCH_FILE_CACHE_STATISTICS) {} + +SchemaFileCacheStatisticsScanner::~SchemaFileCacheStatisticsScanner() {} + +Status SchemaFileCacheStatisticsScanner::start(RuntimeState* state) { + _block_rows_limit = state->batch_size(); + return Status::OK(); } + +Status SchemaFileCacheStatisticsScanner::get_next_block_internal(vectorized::Block* block, + bool* eos) { + if (!_is_init) { + return Status::InternalError("Used before initialized."); + } + + if (nullptr == block || nullptr == eos) { + return Status::InternalError("input pointer is nullptr."); + } + + if (_stats_block == nullptr) { + _stats_block = vectorized::Block::create_unique(); + + for (int i = 0; i < _s_tbls_columns.size(); ++i) { + TypeDescriptor descriptor(_s_tbls_columns[i].type); + auto data_type = + vectorized::DataTypeFactory::instance().create_data_type(descriptor, true); + _stats_block->insert(vectorized::ColumnWithTypeAndName( + data_type->create_column(), data_type, _s_tbls_columns[i].name)); + } + + _stats_block->reserve(_block_rows_limit); + + ExecEnv::GetInstance()->file_cache_factory()->get_cache_stats_block(_stats_block.get()); + _total_rows = _stats_block->rows(); + } + + if (_row_idx == _total_rows) { + *eos = true; + return Status::OK(); + } + + int current_batch_rows = std::min(_block_rows_limit, _total_rows - _row_idx); + vectorized::MutableBlock mblock = vectorized::MutableBlock::build_mutable_block(block); + RETURN_IF_ERROR(mblock.add_rows(_stats_block.get(), _row_idx, current_batch_rows)); + _row_idx += current_batch_rows; + + *eos = _row_idx == _total_rows; + return Status::OK(); +} + +} // namespace doris
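The scanner above materializes the whole stats block once, then serves it back in `batch_size` slices; the batching arithmetic boils down to this standalone loop:

```cpp
#include <algorithm>
#include <cstdio>

int main() {
    int total_rows = 10, block_rows_limit = 4, row_idx = 0;
    while (row_idx < total_rows) {
        int batch = std::min(block_rows_limit, total_rows - row_idx);
        // mblock.add_rows(_stats_block.get(), row_idx, batch) in the real code
        std::printf("serve rows [%d, %d)\n", row_idx, row_idx + batch);
        row_idx += batch;
    }
    // eos becomes true exactly when row_idx == total_rows
}
```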
diff --git a/be/src/exec/schema_scanner/schema_file_cache_statistics.h b/be/src/exec/schema_scanner/schema_file_cache_statistics.h new file mode 100644 index 00000000000000..96c6aa9028f0c8 --- /dev/null +++ b/be/src/exec/schema_scanner/schema_file_cache_statistics.h @@ -0,0 +1,49 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include + +#include "common/status.h" +#include "exec/schema_scanner.h" + +namespace doris { +class RuntimeState; +namespace vectorized { +class Block; +} // namespace vectorized + +class SchemaFileCacheStatisticsScanner : public SchemaScanner { + ENABLE_FACTORY_CREATOR(SchemaFileCacheStatisticsScanner); + +public: + SchemaFileCacheStatisticsScanner(); + ~SchemaFileCacheStatisticsScanner() override; + + Status start(RuntimeState* state) override; + Status get_next_block_internal(vectorized::Block* block, bool* eos) override; + + static std::vector<SchemaScanner::ColumnDesc> _s_tbls_columns; + +private: + int _block_rows_limit = 4096; + int _row_idx = 0; + int _total_rows = 0; + std::unique_ptr<vectorized::Block> _stats_block = nullptr; +}; +}; // namespace doris diff --git a/be/src/exec/schema_scanner/schema_rowsets_scanner.cpp b/be/src/exec/schema_scanner/schema_rowsets_scanner.cpp index 16d5f2daba61e7..3aa0e944a822c5 100644 --- a/be/src/exec/schema_scanner/schema_rowsets_scanner.cpp +++ b/be/src/exec/schema_scanner/schema_rowsets_scanner.cpp @@ -26,6 +26,9 @@ #include #include +#include "cloud/cloud_storage_engine.h" +#include "cloud/cloud_tablet.h" +#include "cloud/cloud_tablet_mgr.h" #include "cloud/config.h" #include "common/status.h" #include "olap/olap_common.h" @@ -35,6 +38,7 @@ #include "olap/tablet.h" #include "olap/tablet_manager.h" #include "runtime/define_primitive_type.h" +#include "runtime/exec_env.h" #include "runtime/runtime_state.h" #include "util/runtime_profile.h" #include "vec/common/string_ref.h" @@ -78,7 +82,19 @@ Status SchemaRowsetsScanner::start(RuntimeState* state) { Status SchemaRowsetsScanner::_get_all_rowsets() { if (config::is_cloud_mode()) { - return Status::NotSupported("SchemaRowsetsScanner::_get_all_rowsets is not implemented"); + // only query cloud tablets in lru cache instead of all tablets + std::vector<std::weak_ptr<CloudTablet>> tablets = + ExecEnv::GetInstance()->storage_engine().to_cloud().tablet_mgr().get_weak_tablets(); + for (const std::weak_ptr<CloudTablet>& tablet : tablets) { + if (!tablet.expired()) { + auto t = tablet.lock(); + std::shared_lock rowset_ldlock(t->get_header_lock()); + for (const auto& it : t->rowset_map()) { + rowsets_.emplace_back(it.second); + } + } + } + return Status::OK(); } std::vector<TabletSharedPtr> tablets = ExecEnv::GetInstance()->storage_engine().to_local().tablet_manager()->get_all_tablet(); diff --git a/be/src/exec/schema_scanner/schema_statistics_scanner.cpp b/be/src/exec/schema_scanner/schema_statistics_scanner.cpp deleted file mode 100644 index f4f3d5dba83271..00000000000000 --- a/be/src/exec/schema_scanner/schema_statistics_scanner.cpp +++ /dev/null @@ -1,50 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. 
The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#include "exec/schema_scanner/schema_statistics_scanner.h" - -#include - -#include "runtime/define_primitive_type.h" -#include "vec/common/string_ref.h" - -namespace doris { - -std::vector SchemaStatisticsScanner::_s_cols_statistics = { - // name, type, size, is_null - {"TABLE_CATALOG", TYPE_VARCHAR, sizeof(StringRef), true}, - {"TABLE_SCHEMA", TYPE_VARCHAR, sizeof(StringRef), false}, - {"TABLE_NAME", TYPE_VARCHAR, sizeof(StringRef), false}, - {"NON_UNIQUE", TYPE_BIGINT, sizeof(int64_t), false}, - {"INDEX_SCHEMA", TYPE_VARCHAR, sizeof(StringRef), false}, - {"INDEX_NAME", TYPE_VARCHAR, sizeof(StringRef), false}, - {"SEQ_IN_INDEX", TYPE_BIGINT, sizeof(int64_t), false}, - {"COLUMN_NAME", TYPE_VARCHAR, sizeof(StringRef), false}, - {"COLLATION", TYPE_VARCHAR, sizeof(StringRef), true}, - {"CARDINALITY", TYPE_BIGINT, sizeof(int64_t), true}, - {"SUB_PART", TYPE_BIGINT, sizeof(int64_t), true}, - {"PACKED", TYPE_VARCHAR, sizeof(StringRef), true}, - {"NULLABLE", TYPE_VARCHAR, sizeof(StringRef), false}, - {"INDEX_TYPE", TYPE_VARCHAR, sizeof(StringRef), false}, - {"COMMENT", TYPE_VARCHAR, sizeof(StringRef), true}, -}; - -SchemaStatisticsScanner::SchemaStatisticsScanner() : SchemaScanner(_s_cols_statistics) {} - -SchemaStatisticsScanner::~SchemaStatisticsScanner() {} - -} // namespace doris diff --git a/be/src/exec/tablet_info.cpp b/be/src/exec/tablet_info.cpp index 44846ded868e8f..f1c0ad60e06455 100644 --- a/be/src/exec/tablet_info.cpp +++ b/be/src/exec/tablet_info.cpp @@ -22,6 +22,7 @@ #include #include #include +#include #include #include @@ -29,6 +30,7 @@ #include #include #include +#include #include #include "common/exception.h" @@ -117,9 +119,21 @@ Status OlapTableSchemaParam::init(const POlapTableSchemaParam& pschema) { _db_id = pschema.db_id(); _table_id = pschema.table_id(); _version = pschema.version(); - _is_partial_update = pschema.partial_update(); + if (pschema.has_unique_key_update_mode()) { + _unique_key_update_mode = pschema.unique_key_update_mode(); + if (pschema.has_sequence_map_col_unique_id()) { + _sequence_map_col_uid = pschema.sequence_map_col_unique_id(); + } + } else { + // for backward compatibility + if (pschema.has_partial_update() && pschema.partial_update()) { + _unique_key_update_mode = UniqueKeyUpdateModePB::UPDATE_FIXED_COLUMNS; + } else { + _unique_key_update_mode = UniqueKeyUpdateModePB::UPSERT; + } + } _is_strict_mode = pschema.is_strict_mode(); - if (_is_partial_update) { + if (_unique_key_update_mode == UniqueKeyUpdateModePB::UPDATE_FIXED_COLUMNS) { _auto_increment_column = pschema.auto_increment_column(); if (!_auto_increment_column.empty() && pschema.auto_increment_column_unique_id() == -1) { return Status::InternalError( @@ -137,7 +151,8 @@ Status OlapTableSchemaParam::init(const POlapTableSchemaParam& pschema) { for (const auto& col : pschema.partial_update_input_columns()) { _partial_update_input_columns.insert(col); } - std::unordered_map, SlotDescriptor*> slots_map; + 
std::unordered_map<std::string, SlotDescriptor*> slots_map; + _tuple_desc = _obj_pool.add(new TupleDescriptor(pschema.tuple_desc())); for (const auto& p_slot_desc : pschema.slot_descs()) { @@ -145,8 +160,10 @@ Status OlapTableSchemaParam::init(const POlapTableSchemaParam& pschema) { _tuple_desc->add_slot(slot_desc); string data_type; EnumToString(TPrimitiveType, to_thrift(slot_desc->col_type()), data_type); - slots_map.emplace(std::make_pair(to_lower(slot_desc->col_name()), - TabletColumn::get_field_type_by_string(data_type)), + std::string is_null_str = slot_desc->is_nullable() ? "true" : "false"; + std::string data_type_str = + std::to_string(int64_t(TabletColumn::get_field_type_by_string(data_type))); + slots_map.emplace(to_lower(slot_desc->col_name()) + "+" + data_type_str + is_null_str, slot_desc); } @@ -155,11 +172,13 @@ Status OlapTableSchemaParam::init(const POlapTableSchemaParam& pschema) { index->index_id = p_index.id(); index->schema_hash = p_index.schema_hash(); for (const auto& pcolumn_desc : p_index.columns_desc()) { - if (!_is_partial_update || + if (_unique_key_update_mode != UniqueKeyUpdateModePB::UPDATE_FIXED_COLUMNS || _partial_update_input_columns.contains(pcolumn_desc.name())) { - auto it = slots_map.find(std::make_pair( - to_lower(pcolumn_desc.name()), - TabletColumn::get_field_type_by_string(pcolumn_desc.type()))); + std::string is_null_str = pcolumn_desc.is_nullable() ? "true" : "false"; + std::string data_type_str = std::to_string( + int64_t(TabletColumn::get_field_type_by_string(pcolumn_desc.type()))); + auto it = slots_map.find(to_lower(pcolumn_desc.name()) + "+" + data_type_str + + is_null_str); if (it == std::end(slots_map)) { return Status::InternalError("unknown index column, column={}, type={}", pcolumn_desc.name(), pcolumn_desc.type()); @@ -185,15 +204,51 @@ Status OlapTableSchemaParam::init(const POlapTableSchemaParam& pschema) { return Status::OK(); } +Status OlapTableSchemaParam::init_unique_key_update_mode(const TOlapTableSchemaParam& tschema) { + if (tschema.__isset.unique_key_update_mode) { + switch (tschema.unique_key_update_mode) { + case doris::TUniqueKeyUpdateMode::UPSERT: { + _unique_key_update_mode = UniqueKeyUpdateModePB::UPSERT; + break; + } + case doris::TUniqueKeyUpdateMode::UPDATE_FIXED_COLUMNS: { + _unique_key_update_mode = UniqueKeyUpdateModePB::UPDATE_FIXED_COLUMNS; + break; + } + case doris::TUniqueKeyUpdateMode::UPDATE_FLEXIBLE_COLUMNS: { + _unique_key_update_mode = UniqueKeyUpdateModePB::UPDATE_FLEXIBLE_COLUMNS; + break; + } + default: { + return Status::InternalError( + "Unknown unique_key_update_mode: {}, should be one of " + "UPSERT/UPDATE_FIXED_COLUMNS/UPDATE_FLEXIBLE_COLUMNS", + tschema.unique_key_update_mode); + } + } + if (tschema.__isset.sequence_map_col_unique_id) { + _sequence_map_col_uid = tschema.sequence_map_col_unique_id; + } + } else { + // for backward compatibility + if (tschema.__isset.is_partial_update && tschema.is_partial_update) { + _unique_key_update_mode = UniqueKeyUpdateModePB::UPDATE_FIXED_COLUMNS; + } else { + _unique_key_update_mode = UniqueKeyUpdateModePB::UPSERT; + } + } + return Status::OK(); +} + Status OlapTableSchemaParam::init(const TOlapTableSchemaParam& tschema) { _db_id = tschema.db_id; _table_id = tschema.table_id; _version = tschema.version; - _is_partial_update = tschema.is_partial_update; + RETURN_IF_ERROR(init_unique_key_update_mode(tschema)); if (tschema.__isset.is_strict_mode) { _is_strict_mode = tschema.is_strict_mode; } - if (_is_partial_update) { + if (_unique_key_update_mode == 
UniqueKeyUpdateModePB::UPDATE_FIXED_COLUMNS) { _auto_increment_column = tschema.auto_increment_column; if (!_auto_increment_column.empty() && tschema.auto_increment_column_unique_id == -1) { return Status::InternalError( @@ -206,12 +261,14 @@ Status OlapTableSchemaParam::init(const TOlapTableSchemaParam& tschema) { for (const auto& tcolumn : tschema.partial_update_input_columns) { _partial_update_input_columns.insert(tcolumn); } - std::unordered_map<std::pair<std::string, PrimitiveType>, SlotDescriptor*> slots_map; + std::unordered_map<std::string, SlotDescriptor*> slots_map; _tuple_desc = _obj_pool.add(new TupleDescriptor(tschema.tuple_desc)); for (const auto& t_slot_desc : tschema.slot_descs) { auto* slot_desc = _obj_pool.add(new SlotDescriptor(t_slot_desc)); _tuple_desc->add_slot(slot_desc); - slots_map.emplace(std::make_pair(to_lower(slot_desc->col_name()), slot_desc->col_type()), + std::string is_null_str = slot_desc->is_nullable() ? "true" : "false"; + std::string data_type_str = std::to_string(int64_t(slot_desc->col_type())); + slots_map.emplace(to_lower(slot_desc->col_name()) + "+" + data_type_str + is_null_str, slot_desc); } @@ -221,11 +278,13 @@ Status OlapTableSchemaParam::init(const TOlapTableSchemaParam& tschema) { index->index_id = t_index.id; index->schema_hash = t_index.schema_hash; for (const auto& tcolumn_desc : t_index.columns_desc) { - if (!_is_partial_update || + if (_unique_key_update_mode != UniqueKeyUpdateModePB::UPDATE_FIXED_COLUMNS || _partial_update_input_columns.contains(tcolumn_desc.column_name)) { - auto it = slots_map.find( - std::make_pair(to_lower(tcolumn_desc.column_name), - thrift_to_type(tcolumn_desc.column_type.type))); + std::string is_null_str = tcolumn_desc.is_allow_null ? "true" : "false"; + std::string data_type_str = + std::to_string(int64_t(thrift_to_type(tcolumn_desc.column_type.type))); + auto it = slots_map.find(to_lower(tcolumn_desc.column_name) + "+" + data_type_str + + is_null_str); if (it == slots_map.end()) { return Status::InternalError("unknown index column, column={}, type={}", tcolumn_desc.column_name, @@ -270,13 +329,18 @@ void OlapTableSchemaParam::to_protobuf(POlapTableSchemaParam* pschema) const { pschema->set_db_id(_db_id); pschema->set_table_id(_table_id); pschema->set_version(_version); - pschema->set_partial_update(_is_partial_update); + pschema->set_unique_key_update_mode(_unique_key_update_mode); + if (_unique_key_update_mode == UniqueKeyUpdateModePB::UPDATE_FIXED_COLUMNS) { + // for backward compatibility + pschema->set_partial_update(true); + } pschema->set_is_strict_mode(_is_strict_mode); pschema->set_auto_increment_column(_auto_increment_column); pschema->set_auto_increment_column_unique_id(_auto_increment_column_unique_id); pschema->set_timestamp_ms(_timestamp_ms); pschema->set_timezone(_timezone); pschema->set_nano_seconds(_nano_seconds); + pschema->set_sequence_map_col_unique_id(_sequence_map_col_uid); for (auto col : _partial_update_input_columns) { *pschema->add_partial_update_input_columns() = col; }
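Both `init()` overloads above now collapse (lowercased name, type id, nullability) into a single string key, so two columns that differ only in nullability no longer collide. A standalone sketch of the same scheme (`make_slot_key` is a stand-in name):

```cpp
#include <cctype>
#include <cstdint>
#include <iostream>
#include <string>

static std::string make_slot_key(std::string name, int64_t type_id, bool nullable) {
    for (auto& c : name) c = static_cast<char>(std::tolower(static_cast<unsigned char>(c)));
    return name + "+" + std::to_string(type_id) + (nullable ? "true" : "false");
}

int main() {
    // Same column name and type, different nullability -> distinct keys.
    std::cout << make_slot_key("K1", 5, true) << "\n";  // k1+5true
    std::cout << make_slot_key("K1", 5, false) << "\n"; // k1+5false
}
```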
@@ -724,6 +788,7 @@ Status VOlapTablePartitionParam::replace_partitions( // add new partitions with new id. _partitions.emplace_back(part); + VLOG_NOTICE << "params add new partition " << part->id; // replace items in _partition_maps if (_is_in_partition) { diff --git a/be/src/exec/tablet_info.h b/be/src/exec/tablet_info.h index ff1c2e8e6b072e..fb3c230a036fd0 100644 --- a/be/src/exec/tablet_info.h +++ b/be/src/exec/tablet_info.h @@ -20,6 +20,7 @@ #include #include #include +#include #include #include @@ -88,7 +89,18 @@ class OlapTableSchemaParam { return _proto_schema; } - bool is_partial_update() const { return _is_partial_update; } + UniqueKeyUpdateModePB unique_key_update_mode() const { return _unique_key_update_mode; } + + bool is_partial_update() const { + return _unique_key_update_mode != UniqueKeyUpdateModePB::UPSERT; + } + bool is_fixed_partial_update() const { + return _unique_key_update_mode == UniqueKeyUpdateModePB::UPDATE_FIXED_COLUMNS; + } + bool is_flexible_partial_update() const { + return _unique_key_update_mode == UniqueKeyUpdateModePB::UPDATE_FLEXIBLE_COLUMNS; + } + std::set<std::string> partial_update_input_columns() const { return _partial_update_input_columns; } @@ -101,8 +113,11 @@ class OlapTableSchemaParam { void set_timezone(std::string timezone) { _timezone = timezone; } std::string timezone() const { return _timezone; } bool is_strict_mode() const { return _is_strict_mode; } + int32_t sequence_map_col_uid() const { return _sequence_map_col_uid; } std::string debug_string() const; + Status init_unique_key_update_mode(const TOlapTableSchemaParam& tschema); + private: int64_t _db_id; int64_t _table_id; @@ -112,7 +127,7 @@ class OlapTableSchemaParam { mutable POlapTableSchemaParam* _proto_schema = nullptr; std::vector<OlapTableIndexSchema*> _indexes; mutable ObjectPool _obj_pool; - bool _is_partial_update = false; + UniqueKeyUpdateModePB _unique_key_update_mode {UniqueKeyUpdateModePB::UPSERT}; std::set<std::string> _partial_update_input_columns; bool _is_strict_mode = false; std::string _auto_increment_column; @@ -120,6 +135,7 @@ class OlapTableSchemaParam { int64_t _timestamp_ms = 0; int32_t _nano_seconds {0}; std::string _timezone; + int32_t _sequence_map_col_uid {-1}; }; using OlapTableIndexTablets = TOlapTableIndexTablets; diff --git a/be/src/exprs/bitmapfilter_predicate.h b/be/src/exprs/bitmapfilter_predicate.h index 376453c06817b0..5cb2b812220b10 100644 --- a/be/src/exprs/bitmapfilter_predicate.h +++ b/be/src/exprs/bitmapfilter_predicate.h @@ -37,7 +37,7 @@ class BitmapFilterFuncBase : public RuntimeFilterFuncBase { virtual void light_copy(BitmapFilterFuncBase* other) { _not_in = other->_not_in; } virtual uint16_t find_fixed_len_olap_engine(const char* data, const uint8* nullmap, uint16_t* offsets, int number) = 0; - virtual void find_batch(const char* data, const uint8* nullmap, int number, + virtual void find_batch(const char* data, const uint8* nullmap, size_t number, uint8* results) const = 0; virtual size_t size() const = 0; bool is_not_in() const { return _not_in; } @@ -65,7 +65,7 @@ class BitmapFilterFunc : public BitmapFilterFuncBase { uint16_t find_fixed_len_olap_engine(const char* data, const uint8* nullmap, uint16_t* offsets, int number) override; - void find_batch(const char* data, const uint8* nullmap, int number, + void find_batch(const char* data, const uint8* nullmap, size_t number, uint8* results) const override; bool empty() override { return _bitmap_value->empty(); } @@ -133,9 +133,9 @@ uint16_t BitmapFilterFunc<type>::find_fixed_len_olap_engine(const char* data, co } template <PrimitiveType type> -void BitmapFilterFunc<type>::find_batch(const 
char* data, const uint8* nullmap, size_t number, uint8* results) const { - for (int i = 0; i < number; i++) { + for (size_t i = 0; i < number; i++) { results[i] = false; if (nullmap != nullptr && nullmap[i]) { continue; diff --git a/be/src/exprs/bloom_filter_func.h b/be/src/exprs/bloom_filter_func.h index 6d452bbe9922dc..eaf3a652dbc963 100644 --- a/be/src/exprs/bloom_filter_func.h +++ b/be/src/exprs/bloom_filter_func.h @@ -151,19 +151,25 @@ class BloomFilterFuncBase : public RuntimeFilterFuncBase { } Status merge(BloomFilterFuncBase* bloomfilter_func) { + if (bloomfilter_func == nullptr) { + return Status::InternalError("bloomfilter_func is nullptr"); + } + if (bloomfilter_func->_bloom_filter == nullptr) { + return Status::InternalError("bloomfilter_func->_bloom_filter is nullptr"); + } // If `_inited` is false, there is no memory allocated in bloom filter and this is the first // call for `merge` function. So we just reuse this bloom filter, and we don't need to // allocate memory again. if (!_inited) { auto* other_func = static_cast(bloomfilter_func); - DCHECK(_bloom_filter == nullptr); - DCHECK(bloomfilter_func != nullptr); + if (_bloom_filter != nullptr) { + return Status::InternalError("_bloom_filter must is nullptr"); + } _bloom_filter = bloomfilter_func->_bloom_filter; _bloom_filter_alloced = other_func->_bloom_filter_alloced; _inited = true; return Status::OK(); } - DCHECK(bloomfilter_func != nullptr); auto* other_func = static_cast(bloomfilter_func); if (_bloom_filter_alloced != other_func->_bloom_filter_alloced) { return Status::InternalError( diff --git a/be/src/exprs/hybrid_set.h b/be/src/exprs/hybrid_set.h index f0977a652b1cbe..6536ec2430fe08 100644 --- a/be/src/exprs/hybrid_set.h +++ b/be/src/exprs/hybrid_set.h @@ -333,10 +333,6 @@ class HybridSet : public HybridSetBase { int size() override { return _set.size(); } bool find(const void* data) const override { - if (data == nullptr) { - return false; - } - return _set.find(*reinterpret_cast(data)); } @@ -487,10 +483,6 @@ class StringSet : public HybridSetBase { int size() override { return _set.size(); } bool find(const void* data) const override { - if (data == nullptr) { - return false; - } - const auto* value = reinterpret_cast(data); std::string str_value(const_cast(value->data), value->size); return _set.find(str_value); @@ -654,19 +646,11 @@ class StringValueSet : public HybridSetBase { int size() override { return _set.size(); } bool find(const void* data) const override { - if (data == nullptr) { - return false; - } - const auto* value = reinterpret_cast(data); return _set.find(*value); } bool find(const void* data, size_t size) const override { - if (data == nullptr) { - return false; - } - StringRef sv(reinterpret_cast(data), size); return _set.find(sv); } diff --git a/be/src/exprs/runtime_filter.cpp b/be/src/exprs/runtime_filter.cpp index 8b3f4b197d08cd..85f1c535c7038b 100644 --- a/be/src/exprs/runtime_filter.cpp +++ b/be/src/exprs/runtime_filter.cpp @@ -22,7 +22,6 @@ #include #include #include -#include #include // IWYU pragma: no_include @@ -472,10 +471,10 @@ class RuntimePredicateWrapper { const TExpr& probe_expr); Status merge(const RuntimePredicateWrapper* wrapper) { - if (is_ignored() || wrapper->is_ignored()) { - _context->ignored = true; + if (wrapper->is_ignored()) { return Status::OK(); } + _context->ignored = false; bool can_not_merge_in_or_bloom = _filter_type == RuntimeFilterType::IN_OR_BLOOM_FILTER && @@ -493,7 +492,10 @@ class RuntimePredicateWrapper { switch (_filter_type) { case 
RuntimeFilterType::IN_FILTER: { - // try insert set + if (!_context->hybrid_set) { + _context->ignored = true; + return Status::OK(); + } _context->hybrid_set->insert(wrapper->_context->hybrid_set.get()); if (_max_in_num >= 0 && _context->hybrid_set->size() >= _max_in_num) { _context->ignored = true; @@ -1051,30 +1053,33 @@ class SyncSizeClosure : public AutoReleaseClosure _rf_context; - std::string _rf_debug_info; using Base = AutoReleaseClosure>; ENABLE_FACTORY_CREATOR(SyncSizeClosure); void _process_if_rpc_failed() override { - ((pipeline::CountedFinishDependency*)_dependency.get())->sub(); - LOG(WARNING) << "sync filter size meet rpc error, filter=" << _rf_debug_info; + Defer defer {[&]() { ((pipeline::CountedFinishDependency*)_dependency.get())->sub(); }}; + auto ctx = _rf_context.lock(); + if (!ctx) { + return; + } + + ctx->err_msg = cntl_->ErrorText(); Base::_process_if_rpc_failed(); } void _process_if_meet_error_status(const Status& status) override { - ((pipeline::CountedFinishDependency*)_dependency.get())->sub(); + Defer defer {[&]() { ((pipeline::CountedFinishDependency*)_dependency.get())->sub(); }}; + auto ctx = _rf_context.lock(); + if (!ctx) { + return; + } + if (status.is()) { // rf merger backend may finished before rf's send_filter_size, we just ignore filter in this case. - auto ctx = _rf_context.lock(); - if (ctx) { - ctx->ignored = true; - } else { - LOG(WARNING) << "sync filter size returned but context is released, filter=" - << _rf_debug_info; - } + ctx->ignored = true; } else { - LOG(WARNING) << "sync filter size meet error status, filter=" << _rf_debug_info; + ctx->err_msg = status.to_string(); Base::_process_if_meet_error_status(status); } } @@ -1083,11 +1088,8 @@ class SyncSizeClosure : public AutoReleaseClosure req, std::shared_ptr> callback, std::shared_ptr dependency, - RuntimeFilterContextSPtr rf_context, std::string_view rf_debug_info) - : Base(req, callback), - _dependency(std::move(dependency)), - _rf_context(rf_context), - _rf_debug_info(rf_debug_info) {} + RuntimeFilterContextSPtr rf_context) + : Base(req, callback), _dependency(std::move(dependency)), _rf_context(rf_context) {} }; Status IRuntimeFilter::send_filter_size(RuntimeState* state, uint64_t local_filter_size) { @@ -1131,8 +1133,8 @@ Status IRuntimeFilter::send_filter_size(RuntimeState* state, uint64_t local_filt auto callback = DummyBrpcCallback::create_shared(); // IRuntimeFilter maybe deconstructed before the rpc finished, so that could not use // a raw pointer in closure. Has to use the context's shared ptr. 
- auto closure = SyncSizeClosure::create_unique(request, callback, _dependency, - _wrapper->_context, this->debug_string()); + auto closure = + SyncSizeClosure::create_unique(request, callback, _dependency, _wrapper->_context); auto* pquery_id = request->mutable_query_id(); pquery_id->set_hi(_state->query_id.hi()); pquery_id->set_lo(_state->query_id.lo()); @@ -1143,7 +1145,11 @@ Status IRuntimeFilter::send_filter_size(RuntimeState* state, uint64_t local_filt request->set_filter_size(local_filter_size); request->set_filter_id(_filter_id); - callback->cntl_->set_timeout_ms(std::min(3600, state->execution_timeout()) * 1000); + + callback->cntl_->set_timeout_ms(get_execution_rpc_timeout_ms(state->execution_timeout())); + if (config::execution_ignore_eovercrowded) { + callback->cntl_->ignore_eovercrowded(); + } stub->send_filter_size(closure->cntl_.get(), closure->request_.get(), closure->response_.get(), closure.get()); @@ -1177,10 +1183,14 @@ Status IRuntimeFilter::push_to_remote(const TNetworkAddress* addr) { pfragment_instance_id->set_lo((int64_t)this); merge_filter_request->set_filter_id(_filter_id); - merge_filter_request->set_is_pipeline(true); auto column_type = _wrapper->column_type(); RETURN_IF_CATCH_EXCEPTION(merge_filter_request->set_column_type(to_proto(column_type))); - merge_filter_callback->cntl_->set_timeout_ms(wait_time_ms()); + + merge_filter_callback->cntl_->set_timeout_ms( + get_execution_rpc_timeout_ms(_state->execution_timeout)); + if (config::execution_ignore_eovercrowded) { + merge_filter_callback->cntl_->ignore_eovercrowded(); + } if (get_ignored()) { merge_filter_request->set_filter_type(PFilterType::UNKNOW_FILTER); @@ -1214,7 +1224,7 @@ Status IRuntimeFilter::get_push_expr_ctxs(std::listattach_profile_counter(expr_filtered_rows_counter, expr_input_rows_counter, always_true_counter); @@ -1278,16 +1288,17 @@ void IRuntimeFilter::set_filter_timer(std::shared_ptr dependency) { +void IRuntimeFilter::set_finish_dependency( + const std::shared_ptr& dependency) { _dependency = dependency; - ((pipeline::CountedFinishDependency*)_dependency.get())->add(); + _dependency->add(); CHECK(_dependency); } void IRuntimeFilter::set_synced_size(uint64_t global_size) { _synced_size = global_size; if (_dependency) { - ((pipeline::CountedFinishDependency*)_dependency.get())->sub(); + _dependency->sub(); } } @@ -1307,10 +1318,6 @@ std::string IRuntimeFilter::formatted_state() const { _wrapper->_context->ignored); } -BloomFilterFuncBase* IRuntimeFilter::get_bloomfilter() const { - return _wrapper->get_bloomfilter(); -} - Status IRuntimeFilter::init_with_desc(const TRuntimeFilterDesc* desc, const TQueryOptions* options, int node_id, bool build_bf_exactly) { // if node_id == -1 , it shouldn't be a consumer @@ -1503,9 +1510,9 @@ void IRuntimeFilter::update_runtime_filter_type_to_profile() { std::string IRuntimeFilter::debug_string() const { return fmt::format( "RuntimeFilter: (id = {}, type = {}, need_local_merge: {}, is_broadcast: {}, " - "build_bf_cardinality: {}", + "build_bf_cardinality: {}, error_msg: {}", _filter_id, to_string(_runtime_filter_type), _need_local_merge, _is_broadcast_join, - _wrapper->get_build_bf_cardinality()); + _wrapper->get_build_bf_cardinality(), _wrapper->_context->err_msg); } Status IRuntimeFilter::merge_from(const RuntimePredicateWrapper* wrapper) { diff --git a/be/src/exprs/runtime_filter.h b/be/src/exprs/runtime_filter.h index c4a38517ab4ba0..f5a069d9e55f85 100644 --- a/be/src/exprs/runtime_filter.h +++ b/be/src/exprs/runtime_filter.h @@ -69,6 +69,7 @@ struct 
RuntimeFilterContextSPtr; namespace pipeline { class RuntimeFilterTimer; +class CountedFinishDependency; } // namespace pipeline enum class RuntimeFilterType { @@ -198,7 +199,6 @@ class IRuntimeFilter { _is_broadcast_join(true), _has_remote_target(false), _has_local_target(false), - _rf_state(RuntimeFilterState::NOT_READY), _rf_state_atomic(RuntimeFilterState::NOT_READY), _role(RuntimeFilterRole::PRODUCER), _expr_order(-1), @@ -264,8 +264,6 @@ class IRuntimeFilter { Status init_with_desc(const TRuntimeFilterDesc* desc, const TQueryOptions* options, int node_id = -1, bool build_bf_exactly = false); - BloomFilterFuncBase* get_bloomfilter() const; - // serialize _wrapper to protobuf Status serialize(PMergeFilterRequest* request, void** data, int* len); Status serialize(PPublishFilterRequest* request, void** data = nullptr, int* len = nullptr); @@ -355,7 +353,8 @@ class IRuntimeFilter { void set_synced_size(uint64_t global_size); - void set_dependency(std::shared_ptr<pipeline::Dependency> dependency); + void set_finish_dependency( + const std::shared_ptr<pipeline::CountedFinishDependency>& dependency); int64_t get_synced_size() const { return _synced_size; } @@ -366,9 +365,6 @@ class IRuntimeFilter { void to_protobuf(PInFilter* filter); void to_protobuf(PMinMaxFilter* filter); - template <typename T> - Status _update_filter(const T* param); - template <typename T> Status serialize_impl(T* request, void** data, int* len); @@ -398,7 +394,6 @@ class IRuntimeFilter { // will apply to local node bool _has_local_target; // filter is ready for consumer - RuntimeFilterState _rf_state; std::atomic<RuntimeFilterState> _rf_state_atomic; // role consumer or producer RuntimeFilterRole _role; @@ -429,7 +424,7 @@ class IRuntimeFilter { std::vector<std::shared_ptr<pipeline::RuntimeFilterTimer>> _filter_timer; int64_t _synced_size = -1; - std::shared_ptr<pipeline::Dependency> _dependency; + std::shared_ptr<pipeline::CountedFinishDependency> _dependency; }; // avoid expose RuntimePredicateWrapper diff --git a/be/src/exprs/runtime_filter_slots.h b/be/src/exprs/runtime_filter_slots.h index c0a249cd6b063d..42c5f598633ad9 100644 --- a/be/src/exprs/runtime_filter_slots.h +++ b/be/src/exprs/runtime_filter_slots.h @@ -42,17 +42,17 @@ class VRuntimeFilterSlots { } Status send_filter_size(RuntimeState* state, uint64_t hash_table_size, - std::shared_ptr<pipeline::Dependency> dependency) { + std::shared_ptr<pipeline::CountedFinishDependency> dependency) { if (_runtime_filters.empty()) { return Status::OK(); } for (auto runtime_filter : _runtime_filters) { if (runtime_filter->need_sync_filter_size()) { - runtime_filter->set_dependency(dependency); + runtime_filter->set_finish_dependency(dependency); } } - // send_filter_size may call dependency->sub(), so we call set_dependency firstly for all rf to avoid dependency set_ready repeatedly + // send_filter_size may call dependency->sub(), so we call set_finish_dependency firstly for all rf to avoid dependency set_ready repeatedly for (auto runtime_filter : _runtime_filters) { if (runtime_filter->need_sync_filter_size()) { RETURN_IF_ERROR(runtime_filter->send_filter_size(state, hash_table_size)); @@ -77,6 +77,10 @@ class VRuntimeFilterSlots { if (filter->get_real_type() != RuntimeFilterType::IN_FILTER) { continue; } + if (!filter->need_sync_filter_size() && + filter->type() == RuntimeFilterType::IN_OR_BLOOM_FILTER) { + continue; + } if (has_in_filter.contains(filter->expr_order())) { filter->set_ignored(); continue; @@ -84,7 +88,7 @@ class VRuntimeFilterSlots { has_in_filter.insert(filter->expr_order()); } - // process ignore filter when it has IN_FILTER on same expr, and init bloom filter size + // process ignore filter when it has IN_FILTER on same expr for (auto filter : _runtime_filters) { if (filter->get_ignored()) { continue; } @@ -98,12 +102,16 @@
return Status::OK(); } + Status ignore_all_filters() { + for (auto filter : _runtime_filters) { + filter->set_ignored(); + } + return Status::OK(); + } + Status init_filters(RuntimeState* state, uint64_t local_hash_table_size) { // process IN_OR_BLOOM_FILTER's real type for (auto filter : _runtime_filters) { - if (filter->get_ignored()) { - continue; - } if (filter->type() == RuntimeFilterType::IN_OR_BLOOM_FILTER && get_real_size(filter.get(), local_hash_table_size) > state->runtime_filter_max_in_num()) { @@ -141,7 +149,7 @@ } // publish runtime filter - Status publish(bool publish_local = false) { + Status publish(bool publish_local) { for (auto& pair : _runtime_filters_map) { for (auto& filter : pair.second) { RETURN_IF_ERROR(filter->publish(publish_local)); diff --git a/be/src/gutil/endian.h b/be/src/gutil/endian.h index 4bc04e1e303eb5..f78480b3cf5fec 100644 --- a/be/src/gutil/endian.h +++ b/be/src/gutil/endian.h @@ -60,8 +60,8 @@ inline unsigned __int128 gbswap_128(unsigned __int128 host_int) { } inline wide::UInt256 gbswap_256(wide::UInt256 host_int) { - wide::UInt256 result{gbswap_64(host_int.items[3]), gbswap_64(host_int.items[2]), - gbswap_64(host_int.items[1]), gbswap_64(host_int.items[0])}; + wide::UInt256 result {gbswap_64(host_int.items[3]), gbswap_64(host_int.items[2]), + gbswap_64(host_int.items[1]), gbswap_64(host_int.items[0])}; return result; } @@ -136,6 +139,9 @@ class LittleEndian { static unsigned __int128 FromHost128(unsigned __int128 x) { return x; } static unsigned __int128 ToHost128(unsigned __int128 x) { return x; } + static wide::UInt256 FromHost256(wide::UInt256 x) { return x; } + static wide::UInt256 ToHost256(wide::UInt256 x) { return x; } + static bool IsLittleEndian() { return true; } #elif defined IS_BIG_ENDIAN @@ -149,6 +152,12 @@ class LittleEndian { static uint64 FromHost64(uint64 x) { return gbswap_64(x); } static uint64 ToHost64(uint64 x) { return gbswap_64(x); } + static unsigned __int128 FromHost128(unsigned __int128 x) { return gbswap_128(x); } + static unsigned __int128 ToHost128(unsigned __int128 x) { return gbswap_128(x); } + + static wide::UInt256 FromHost256(wide::UInt256 x) { return gbswap_256(x); } + static wide::UInt256 ToHost256(wide::UInt256 x) { return gbswap_256(x); } + static bool IsLittleEndian() { return false; } #endif /* ENDIAN */
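`gbswap_256` just reverses the order of the four 64-bit limbs and byte-swaps each one. A standalone sketch of the same layout trick (a plain struct stands in for `wide::UInt256`, and `__builtin_bswap64` for `gbswap_64`; both are gcc/clang conventions, which this codebase targets):

```cpp
#include <array>
#include <cstdint>

inline uint64_t bswap64(uint64_t x) { return __builtin_bswap64(x); }

struct U256 { std::array<uint64_t, 4> items; }; // items[0] = least-significant limb

inline U256 bswap256(U256 v) {
    return {{bswap64(v.items[3]), bswap64(v.items[2]),
             bswap64(v.items[1]), bswap64(v.items[0])}};
}

int main() {
    U256 x {{1, 2, 3, 4}};
    U256 y = bswap256(bswap256(x)); // a double swap must round-trip
    return !(y.items[0] == 1 && y.items[3] == 4);
}
```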
diff --git a/be/src/http/action/adjust_log_level.cpp b/be/src/http/action/adjust_log_level.cpp index 687639a9b58dea..a8644a0fb5f52a 100644 --- a/be/src/http/action/adjust_log_level.cpp +++ b/be/src/http/action/adjust_log_level.cpp @@ -17,8 +17,9 @@ #include +#include + #include "common/logging.h" -#include "common/status.h" #include "http/http_channel.h" #include "http/http_request.h" @@ -26,7 +27,7 @@ namespace doris { // **Note**: If the module_name does not exist in the vlog modules, vlog // would create corresponding module for it. -int handle_request(HttpRequest* req) { +std::tuple<std::string, int, int> handle_request(HttpRequest* req) { auto parse_param = [&req](std::string param) { const auto& value = req->param(param); if (value.empty()) { @@ -38,13 +39,16 @@ const auto& module = parse_param("module"); const auto& level = parse_param("level"); int new_level = std::stoi(level); - return google::SetVLOGLevel(module.c_str(), new_level); + return std::make_tuple(module, google::SetVLOGLevel(module.c_str(), new_level), new_level); } void AdjustLogLevelAction::handle(HttpRequest* req) { try { - auto old_level = handle_request(req); - auto msg = fmt::format("adjust log level success, origin level is {}", old_level); + auto handle_result = handle_request(req); + auto msg = + fmt::format("adjust vlog of {} from {} to {} succeed", std::get<0>(handle_result), + std::get<1>(handle_result), std::get<2>(handle_result)); + LOG(INFO) << msg; HttpChannel::send_reply(req, msg); } catch (const std::exception& e) { HttpChannel::send_reply(req, HttpStatus::INTERNAL_SERVER_ERROR, e.what()); diff --git a/be/src/http/action/file_cache_action.cpp b/be/src/http/action/file_cache_action.cpp index f31c040c5cf672..740bac46edf2a7 100644 --- a/be/src/http/action/file_cache_action.cpp +++ b/be/src/http/action/file_cache_action.cpp @@ -17,10 +17,15 @@ #include "file_cache_action.h" +#include + +#include #include #include #include #include +#include +#include #include "common/status.h" #include "http/http_channel.h" @@ -30,6 +35,7 @@ #include "io/cache/block_file_cache.h" #include "io/cache/block_file_cache_factory.h" #include "io/cache/file_cache_common.h" +#include "io/cache/fs_file_cache_storage.h" #include "olap/olap_define.h" #include "olap/tablet_meta.h" #include "util/easy_json.h" @@ -43,6 +49,7 @@ constexpr static std::string_view PATH = "path"; constexpr static std::string_view CLEAR = "clear"; constexpr static std::string_view RESET = "reset"; constexpr static std::string_view HASH = "hash"; +constexpr static std::string_view LIST_CACHE = "list_cache"; constexpr static std::string_view CAPACITY = "capacity"; constexpr static std::string_view RELEASE = "release"; constexpr static std::string_view BASE_PATH = "base_path"; @@ -66,7 +73,14 @@ Status FileCacheAction::_handle_header(HttpRequest* req, std::string* json_metri *json_metrics = json.ToString(); } else if (operation == CLEAR) { const std::string& sync = req->param(SYNC.data()); - auto ret = io::FileCacheFactory::instance()->clear_file_caches(to_lower(sync) == "true"); + const std::string& segment_path = req->param(VALUE.data()); + if (segment_path.empty()) { + io::FileCacheFactory::instance()->clear_file_caches(to_lower(sync) == "true"); + } else { + io::UInt128Wrapper hash = io::BlockFileCache::hash(segment_path); + io::BlockFileCache* cache = io::FileCacheFactory::instance()->get_by_path(hash); + cache->remove_if_cached(hash); + } } else if (operation == RESET) { std::string capacity = req->param(CAPACITY.data()); int64_t new_capacity = 0; @@ -96,6 +110,23 @@ Status FileCacheAction::_handle_header(HttpRequest* req, std::string* json_metri json[HASH.data()] = ret.to_string(); *json_metrics = json.ToString(); } + } else if (operation == LIST_CACHE) { + const std::string& segment_path = req->param(VALUE.data()); + if (segment_path.empty()) { + st = Status::InvalidArgument("missing parameter: {} is required", VALUE.data()); + } else { + io::UInt128Wrapper cache_hash = io::BlockFileCache::hash(segment_path); + std::vector<std::string> cache_files = 
io::FileCacheFactory::instance()->get_cache_file_by_path(cache_hash); + if (cache_files.empty()) { + *json_metrics = "[]"; + } else { + EasyJson json; + std::for_each(cache_files.begin(), cache_files.end(), + [&json](auto& x) { json.PushBack(x); }); + *json_metrics = json.ToString(); + } + } } else { st = Status::InternalError("invalid operation: {}", operation); }
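A hypothetical helper (not in the patch) condensing the flow of the new `list_cache` op; the endpoint path in the comment is an assumption, since the handler registration is not part of this hunk, while the two `io::` calls are the ones the hunk adds:

```cpp
// Assumed usage, if the action stays mounted at /api/file_cache:
//   curl "http://be_host:be_webport/api/file_cache?op=list_cache&value=<segment_path>"
// Replies with a JSON array of cache files, or "[]" when nothing is cached.
#include <string>
#include <vector>

#include "io/cache/block_file_cache.h"
#include "io/cache/block_file_cache_factory.h"

namespace doris {
std::vector<std::string> list_cached_files(const std::string& segment_path) {
    io::UInt128Wrapper cache_hash = io::BlockFileCache::hash(segment_path);
    return io::FileCacheFactory::instance()->get_cache_file_by_path(cache_hash);
}
} // namespace doris
```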
<< rand() << ".heap"; - const std::string& tmp_file_name_str = tmp_jeprof_file_name.str(); - const char* file_name_ptr = tmp_file_name_str.c_str(); - int result = jemallctl("prof.dump", nullptr, nullptr, &file_name_ptr, sizeof(const char*)); - std::stringstream response; - if (result == 0) { - response << "Jemalloc heap dump success, dump file path: " << tmp_jeprof_file_name.str() - << "\n"; - } else { - response << "Jemalloc heap dump failed, je_mallctl return: " << result << "\n"; - } - HttpChannel::send_reply(req, response.str()); + return true; #endif } -Status JeprofileActions::setup(doris::ExecEnv* exec_env, doris::EvHttpServer* http_server, - doris::ObjectPool& pool) { - if (!config::jeprofile_dir.empty()) { - RETURN_IF_ERROR(io::global_local_filesystem()->create_directory(config::jeprofile_dir)); +void SetJeHeapProfileActiveActions::handle(HttpRequest* req) { + req->add_output_header(HttpHeaders::CONTENT_TYPE, HEADER_JSON.c_str()); + if (compile_check(req)) { + if (req->param("prof_value") == "true") { + HeapProfiler::instance()->heap_profiler_start(); + HttpChannel::send_reply( + req, HttpStatus::OK, + "heap profiler started\nJemalloc will only track and sample the memory " + "allocated and freed after the heap profiler started, it cannot analyze the " + "memory allocated and freed before. Therefore, dumping the heap profile " + "immediately after start heap profiler may prompt `No nodes to print`. If you " + "want to analyze the memory that has been allocated in the past, you can only " + "restart the BE process and start heap profiler immediately.\n"); + } else { + HeapProfiler::instance()->heap_profiler_stop(); + HttpChannel::send_reply(req, HttpStatus::OK, "heap profiler stoped\n"); + } + } +} + +void DumpJeHeapProfileToDotActions::handle(HttpRequest* req) { + req->add_output_header(HttpHeaders::CONTENT_TYPE, HEADER_JSON.c_str()); + if (compile_check(req)) { + if (!HeapProfiler::instance()->check_heap_profiler()) { + HttpChannel::send_reply( + req, HttpStatus::INTERNAL_SERVER_ERROR, + "`curl http://be_host:be_webport/jeheap/prof/true` to start heap profiler\n"); + } + std::string dot = HeapProfiler::instance()->dump_heap_profile_to_dot(); + if (dot.empty()) { + HttpChannel::send_reply(req, HttpStatus::INTERNAL_SERVER_ERROR, + "dump heap profile to dot failed, see be.INFO\n"); + } else { + dot += "\n-------------------------------------------------------\n"; + dot += "Copy the text after `digraph` in the above output to " + "http://www.webgraphviz.com to generate a dot graph.\n" + "after start heap profiler, if there is no operation, will print `No nodes to " + "print`." 
+ "If there are many errors: `addr2line: Dwarf Error`," + "or other FAQ, reference doc: " + "https://doris.apache.org/community/developer-guide/debug-tool/#4-qa\n"; + HttpChannel::send_reply(req, HttpStatus::OK, dot); + } + } +} + +void DumpJeHeapProfileActions::handle(HttpRequest* req) { + req->add_output_header(HttpHeaders::CONTENT_TYPE, HEADER_JSON.c_str()); + if (compile_check(req)) { + if (!HeapProfiler::instance()->check_heap_profiler()) { + HttpChannel::send_reply( + req, HttpStatus::INTERNAL_SERVER_ERROR, + "`curl http://be_host:be_webport/jeheap/prof/true` to start heap profiler\n"); + } + std::string profile_file_name = HeapProfiler::instance()->dump_heap_profile(); + if (profile_file_name.empty()) { + HttpChannel::send_reply(req, HttpStatus::INTERNAL_SERVER_ERROR, + "jemalloc heap dump failed\n"); + } else { + HttpChannel::send_reply(req, HttpStatus::OK, + fmt::format("jemalloc heap dump success, dump file path: {}\n", + profile_file_name)); + } } - http_server->register_handler(HttpMethod::GET, "/jeheap/dump", - pool.add(new JeHeapAction(exec_env))); - return Status::OK(); } } // namespace doris diff --git a/be/src/http/action/jeprofile_actions.h b/be/src/http/action/jeprofile_actions.h index 2ebeb3c9ffdc92..f1336ac4691d57 100644 --- a/be/src/http/action/jeprofile_actions.h +++ b/be/src/http/action/jeprofile_actions.h @@ -15,17 +15,35 @@ // specific language governing permissions and limitations // under the License. -#ifndef DORIS_JEPROFILE_ACTIONS_H -#define DORIS_JEPROFILE_ACTIONS_H -#include "common/status.h" +#pragma once + +#include "http/http_handler.h" +#include "http/http_handler_with_auth.h" + namespace doris { -class EvHttpServer; + +class HttpRequest; class ExecEnv; -class ObjectPool; -class JeprofileActions { + +class SetJeHeapProfileActiveActions final : public HttpHandlerWithAuth { +public: + SetJeHeapProfileActiveActions(ExecEnv* exec_env) : HttpHandlerWithAuth(exec_env) {} + ~SetJeHeapProfileActiveActions() override = default; + void handle(HttpRequest* req) override; +}; + +class DumpJeHeapProfileToDotActions final : public HttpHandlerWithAuth { +public: + DumpJeHeapProfileToDotActions(ExecEnv* exec_env) : HttpHandlerWithAuth(exec_env) {} + ~DumpJeHeapProfileToDotActions() override = default; + void handle(HttpRequest* req) override; +}; + +class DumpJeHeapProfileActions final : public HttpHandlerWithAuth { public: - static Status setup(ExecEnv* exec_env, EvHttpServer* http_server, ObjectPool& pool); + DumpJeHeapProfileActions(ExecEnv* exec_env) : HttpHandlerWithAuth(exec_env) {} + ~DumpJeHeapProfileActions() override = default; + void handle(HttpRequest* req) override; }; } // namespace doris -#endif //DORIS_JEPROFILE_ACTIONS_H diff --git a/be/src/http/action/stream_load.cpp b/be/src/http/action/stream_load.cpp index fa23e5e56c12f1..60c9f659fbc4eb 100644 --- a/be/src/http/action/stream_load.cpp +++ b/be/src/http/action/stream_load.cpp @@ -636,13 +636,75 @@ Status StreamLoadAction::_process_put(HttpRequest* http_req, request.__set_enable_profile(false); } } - if (!http_req->header(HTTP_PARTIAL_COLUMNS).empty()) { + + if (!http_req->header(HTTP_UNIQUE_KEY_UPDATE_MODE).empty()) { + static const StringCaseMap unique_key_update_mode_map = { + {"UPSERT", TUniqueKeyUpdateMode::UPSERT}, + {"UPDATE_FIXED_COLUMNS", TUniqueKeyUpdateMode::UPDATE_FIXED_COLUMNS}, + {"UPDATE_FLEXIBLE_COLUMNS", TUniqueKeyUpdateMode::UPDATE_FLEXIBLE_COLUMNS}}; + std::string unique_key_update_mode_str = http_req->header(HTTP_UNIQUE_KEY_UPDATE_MODE); + auto iter = 
diff --git a/be/src/http/action/stream_load.cpp b/be/src/http/action/stream_load.cpp index fa23e5e56c12f1..60c9f659fbc4eb 100644 --- a/be/src/http/action/stream_load.cpp +++ b/be/src/http/action/stream_load.cpp @@ -636,13 +636,75 @@ Status StreamLoadAction::_process_put(HttpRequest* http_req, request.__set_enable_profile(false); } } - if (!http_req->header(HTTP_PARTIAL_COLUMNS).empty()) { + + if (!http_req->header(HTTP_UNIQUE_KEY_UPDATE_MODE).empty()) { + static const StringCaseMap<TUniqueKeyUpdateMode::type> unique_key_update_mode_map = { + {"UPSERT", TUniqueKeyUpdateMode::UPSERT}, + {"UPDATE_FIXED_COLUMNS", TUniqueKeyUpdateMode::UPDATE_FIXED_COLUMNS}, + {"UPDATE_FLEXIBLE_COLUMNS", TUniqueKeyUpdateMode::UPDATE_FLEXIBLE_COLUMNS}}; + std::string unique_key_update_mode_str = http_req->header(HTTP_UNIQUE_KEY_UPDATE_MODE); + auto iter = unique_key_update_mode_map.find(unique_key_update_mode_str); + if (iter != unique_key_update_mode_map.end()) { + TUniqueKeyUpdateMode::type unique_key_update_mode = iter->second; + if (unique_key_update_mode == TUniqueKeyUpdateMode::UPDATE_FLEXIBLE_COLUMNS) { + // check constraints when flexible partial update is enabled + if (ctx->format != TFileFormatType::FORMAT_JSON) { + return Status::InvalidArgument( + "flexible partial update only support json format as input file " + "currently"); + } + if (!http_req->header(HTTP_FUZZY_PARSE).empty() && + iequal(http_req->header(HTTP_FUZZY_PARSE), "true")) { + return Status::InvalidArgument( + "Don't support flexible partial update when 'fuzzy_parse' is enabled"); + } + if (!http_req->header(HTTP_COLUMNS).empty()) { + return Status::InvalidArgument( + "Don't support flexible partial update when 'columns' is specified"); + } + if (!http_req->header(HTTP_JSONPATHS).empty()) { + return Status::InvalidArgument( + "Don't support flexible partial update when 'jsonpaths' is specified"); + } + if (!http_req->header(HTTP_HIDDEN_COLUMNS).empty()) { + return Status::InvalidArgument( + "Don't support flexible partial update when 'hidden_columns' is " + "specified"); + } + if (!http_req->header(HTTP_FUNCTION_COLUMN + "." + HTTP_SEQUENCE_COL).empty()) { + return Status::InvalidArgument( + "Don't support flexible partial update when " + "'function_column.sequence_col' is specified"); + } + if (!http_req->header(HTTP_MERGE_TYPE).empty()) { + return Status::InvalidArgument( + "Don't support flexible partial update when " + "'merge_type' is specified"); + } + if (!http_req->header(HTTP_WHERE).empty()) { + return Status::InvalidArgument( + "Don't support flexible partial update when " + "'where' is specified"); + } + } + request.__set_unique_key_update_mode(unique_key_update_mode); + } else { + return Status::InvalidArgument( + "Invalid unique_key_partial_mode {}, must be one of 'UPSERT', " + "'UPDATE_FIXED_COLUMNS' or 'UPDATE_FLEXIBLE_COLUMNS'", + unique_key_update_mode_str); + } + } + if (http_req->header(HTTP_UNIQUE_KEY_UPDATE_MODE).empty() && + !http_req->header(HTTP_PARTIAL_COLUMNS).empty()) { + // only consider `partial_columns` parameter when `unique_key_update_mode` is not set if (iequal(http_req->header(HTTP_PARTIAL_COLUMNS), "true")) { + request.__set_unique_key_update_mode(TUniqueKeyUpdateMode::UPDATE_FIXED_COLUMNS); + // for backward compatibility request.__set_partial_update(true); - } else { - request.__set_partial_update(false); } } + if (!http_req->header(HTTP_MEMTABLE_ON_SINKNODE).empty()) { bool value = iequal(http_req->header(HTTP_MEMTABLE_ON_SINKNODE), "true"); request.__set_memtable_on_sink_node(value);
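Client-side view of the new header, since stream load is driven over HTTP. The header name and accepted values are the ones the hunk reads; host, database, and table names are illustrative:

```cpp
// Assumed usage sketch:
//   curl -u user: -H "unique_key_update_mode: UPDATE_FLEXIBLE_COLUMNS" \
//        -H "format: json" -T rows.json \
//        http://fe_host:http_port/api/db1/tbl1/_stream_load
//
// Server side, the header must parse into one of exactly three enum values,
// otherwise the load is rejected with InvalidArgument (quoted from the hunk):
static const StringCaseMap<TUniqueKeyUpdateMode::type> unique_key_update_mode_map = {
        {"UPSERT", TUniqueKeyUpdateMode::UPSERT},
        {"UPDATE_FIXED_COLUMNS", TUniqueKeyUpdateMode::UPDATE_FIXED_COLUMNS},
        {"UPDATE_FLEXIBLE_COLUMNS", TUniqueKeyUpdateMode::UPDATE_FLEXIBLE_COLUMNS}};
```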
diff --git a/be/src/http/action/tablets_info_action.cpp b/be/src/http/action/tablets_info_action.cpp index 9c27c1de9a02b3..672b03ce6ceaed 100644 --- a/be/src/http/action/tablets_info_action.cpp +++ b/be/src/http/action/tablets_info_action.cpp @@ -24,6 +24,8 @@ #include #include +#include "cloud/cloud_storage_engine.h" +#include "cloud/cloud_tablet_mgr.h" #include "cloud/config.h" #include "http/http_channel.h" #include "http/http_headers.h" @@ -51,12 +53,6 @@ void TabletsInfoAction::handle(HttpRequest* req) { EasyJson TabletsInfoAction::get_tablets_info(string tablet_num_to_return) { EasyJson tablets_info_ej; - if (config::is_cloud_mode()) { - // TODO(plat1ko): CloudStorageEngine - tablets_info_ej["msg"] = "TabletsInfoAction::get_tablets_info is not implemented"; - tablets_info_ej["code"] = 0; - return tablets_info_ej; - } int64_t number; std::string msg; @@ -74,9 +70,15 @@ EasyJson TabletsInfoAction::get_tablets_info(string tablet_num_to_return) { msg = "Parameter Error"; } std::vector<TabletInfo> tablets_info; - TabletManager* tablet_manager = - ExecEnv::GetInstance()->storage_engine().to_local().tablet_manager(); - tablet_manager->obtain_specific_quantity_tablets(tablets_info, number); + if (!config::is_cloud_mode()) { + TabletManager* tablet_manager = + ExecEnv::GetInstance()->storage_engine().to_local().tablet_manager(); + tablet_manager->obtain_specific_quantity_tablets(tablets_info, number); + } else { + CloudTabletMgr& cloud_tablet_manager = + ExecEnv::GetInstance()->storage_engine().to_cloud().tablet_mgr(); + cloud_tablet_manager.get_tablet_info(number, &tablets_info); + } tablets_info_ej["msg"] = msg; tablets_info_ej["code"] = 0;
"
-              << "Mem Limit: " << PrettyPrinter::print(MemInfo::mem_limit(), TUnit::BYTES)
+void memory_info_handler(std::stringstream* output) {
+    (*output) << "

Memory Info

\n"; + (*output) << "
";
+    (*output) << "

Memory Documents

\n" + << "Memory Management Overview\n" + << "Memory Issue FAQ\n" + << "\n---\n\n"; + + (*output) << "

Memory Properties

\n" + << "System Physical Mem: " + << PrettyPrinter::print(MemInfo::physical_mem(), TUnit::BYTES) << std::endl + << "System Page Size: " << MemInfo::get_page_size() << std::endl + << "Mem Limit: " << MemInfo::mem_limit_str() << std::endl + << "Soft Mem Limit: " << MemInfo::soft_mem_limit_str() << std::endl + << "System Mem Available Low Water Mark: " + << PrettyPrinter::print(MemInfo::sys_mem_available_low_water_mark(), TUnit::BYTES) + << std::endl + << "System Mem Available Warning Water Mark: " + << PrettyPrinter::print(MemInfo::sys_mem_available_warning_water_mark(), TUnit::BYTES) << std::endl - << "Physical Mem From Perf: " - << PrettyPrinter::print(PerfCounters::get_vm_rss(), TUnit::BYTES) << std::endl - << "
"; + << "Cgroup Mem Limit: " + << PrettyPrinter::print(MemInfo::cgroup_mem_limit(), TUnit::BYTES) << std::endl + << "Cgroup Mem Usage: " + << PrettyPrinter::print(MemInfo::cgroup_mem_usage(), TUnit::BYTES) << std::endl + << "Cgroup Mem Refresh State: " << MemInfo::cgroup_mem_refresh_state() << std::endl + << "\n---\n\n"; + + (*output) << "

Memory Option Settings

\n"; + { + std::lock_guard lock(*config::get_mutable_string_config_lock()); + for (const auto& it : *(config::full_conf_map)) { + if (it.first.find("memory") != std::string::npos || + it.first.find("cache") != std::string::npos || + it.first.find("mem") != std::string::npos) { + (*output) << it.first << "=" << it.second << std::endl; + } + } + } + (*output) << "\n---\n\n"; - (*output) << "
";
+    (*output) << "

Jemalloc Profiles

\n"; #if defined(ADDRESS_SANITIZER) || defined(LEAK_SANITIZER) || defined(THREAD_SANITIZER) (*output) << "Memory tracking is not available with address sanitizer builds."; #elif defined(USE_JEMALLOC) @@ -117,15 +148,59 @@ void mem_usage_handler(const WebPageHandler::ArgumentMap& args, std::stringstrea }; jemalloc_stats_print(write_cb, &tmp, "a"); boost::replace_all(tmp, "\n", "
"); - (*output) << tmp << "
"; + (*output) << tmp; #else char buf[2048]; MallocExtension::instance()->GetStats(buf, 2048); // Replace new lines with
for html std::string tmp(buf); boost::replace_all(tmp, "\n", "
"); - (*output) << tmp << "
"; + (*output) << tmp; #endif + (*output) << ""; +} + +// Registered to handle "/profile". +void process_profile_handler(const WebPageHandler::ArgumentMap& args, std::stringstream* output) { + (*output) << "

Copy Process Profile To Clipboard (拷贝 Process Profile 到剪切板)

"; + (*output) << "" << std::endl; + (*output) << "" << std::endl; + + doris::ProcessProfile::instance()->refresh_profile(); + + (*output) << "
" << std::endl; + (*output) << "

Process Profile

" << std::endl; + (*output) << "
"
+              << doris::ProcessProfile::instance()->print_process_profile_no_root() << "
" + << "\n\n---\n\n"; + memory_info_handler(output); + + // TODO, expect more information about process status, CPU, IO, etc. + + (*output) << "
" << std::endl; } void display_tablets_callback(const WebPageHandler::ArgumentMap& args, EasyJson* ej) { @@ -141,76 +216,8 @@ void display_tablets_callback(const WebPageHandler::ArgumentMap& args, EasyJson* // Registered to handle "/mem_tracker", and prints out memory tracker information. void mem_tracker_handler(const WebPageHandler::ArgumentMap& args, std::stringstream* output) { - (*output) << "

Memory usage by subsystem

\n"; - std::vector snapshots; - auto iter = args.find("type"); - if (iter != args.end()) { - if (iter->second == "global") { - MemTrackerLimiter::make_type_snapshots(&snapshots, MemTrackerLimiter::Type::GLOBAL); - } else if (iter->second == "query") { - MemTrackerLimiter::make_type_snapshots(&snapshots, MemTrackerLimiter::Type::QUERY); - } else if (iter->second == "load") { - MemTrackerLimiter::make_type_snapshots(&snapshots, MemTrackerLimiter::Type::LOAD); - } else if (iter->second == "compaction") { - MemTrackerLimiter::make_type_snapshots(&snapshots, MemTrackerLimiter::Type::COMPACTION); - } else if (iter->second == "schema_change") { - MemTrackerLimiter::make_type_snapshots(&snapshots, - MemTrackerLimiter::Type::SCHEMA_CHANGE); - } else if (iter->second == "other") { - MemTrackerLimiter::make_type_snapshots(&snapshots, MemTrackerLimiter::Type::OTHER); - } else if (iter->second == "reserved_memory") { - MemTrackerLimiter::make_all_reserved_trackers_snapshots(&snapshots); - } else if (iter->second == "all") { - MemTrackerLimiter::make_all_memory_state_snapshots(&snapshots); - } - } else { - (*output) << "

*Notice:

\n"; - (*output) << "

1. MemTracker only counts the memory on part of the main execution " - "path, " - "which is usually less than the real process memory.

\n"; - (*output) << "

2. each `type` is the sum of a set of tracker values, " - "`sum of all trackers` is the sum of all trackers of all types, .

\n"; - (*output) << "

3. `process resident memory` is the physical memory of the process, " - "from /proc VmRSS VmHWM.

\n"; - (*output) << "

4. `process virtual memory` is the virtual memory of the process, " - "from /proc VmSize VmPeak.

\n"; - (*output) << "

5.`/mem_tracker?type=` to view the memory details of each " - "type, for example, `/mem_tracker?type=query` will list the memory of all " - "queries; " - "`/mem_tracker?type=global` will list the memory of all Cache, metadata and " - "other " - "global life cycles.

\n"; - (*output) << "

see documentation for details.";
-        MemTrackerLimiter::make_process_snapshots(&snapshots);
-    }
-
-    (*output) << "<table data-sortable>\n";
-    (*output) << "<tr>"
-                 "<th>Type</th>"
-                 "<th>Label</th>"
-                 "<th>Limit</th>"
-                 "<th>Current Consumption(Bytes)</th>"
-                 "<th>Current Consumption(Normalize)</th>"
-                 "<th>Peak Consumption(Bytes)</th>"
-                 "<th>Peak Consumption(Normalize)</th>"
-                 "</tr>";
-    (*output) << "<tbody>\n";
-    for (const auto& item : snapshots) {
-        string limit_str = item.limit == -1 ? "none" : AccurateItoaKMGT(item.limit);
-        string current_consumption_normalize = AccurateItoaKMGT(item.cur_consumption);
-        string peak_consumption_normalize = AccurateItoaKMGT(item.peak_consumption);
-        (*output) << strings::Substitute(
-                "<tr><td>$0</td><td>$1</td><td>$2</td><td>$3</td><td>$4</td><td>$5</td><td>$6</td></tr>
\n"; + (*output) << "

mem_tracker webpage has been taken offline; please click Process Profile and see MemoryProfile and Memory Info

\n"; } void heap_handler(const WebPageHandler::ArgumentMap& args, std::stringstream* output) { @@ -394,14 +401,10 @@ void add_default_path_handlers(WebPageHandler* web_page_handler) { web_page_handler->register_page("/varz", "Configs", config_handler, true /* is_on_nav_bar */); } - web_page_handler->register_page("/memz", "Memory", mem_usage_handler, true /* is_on_nav_bar */); - web_page_handler->register_page( - "/mem_tracker", "MemTracker", - [](auto&& PH1, auto&& PH2) { - return mem_tracker_handler(std::forward(PH1), - std::forward(PH2)); - }, - true /* is_on_nav_bar */); + web_page_handler->register_page("/profile", "Process Profile", process_profile_handler, + true /* is_on_nav_bar */); + web_page_handler->register_page("/mem_tracker", "MemTracker", mem_tracker_handler, + true /* is_on_nav_bar */); web_page_handler->register_page("/heap", "Heap Profile", heap_handler, true /* is_on_nav_bar */); web_page_handler->register_page("/cpu", "CPU Profile", cpu_handler, true /* is_on_nav_bar */); diff --git a/be/src/http/http_client.cpp b/be/src/http/http_client.cpp index e94614788f5236..bf1cd751ae37dd 100644 --- a/be/src/http/http_client.cpp +++ b/be/src/http/http_client.cpp @@ -253,7 +253,13 @@ Status HttpClient::download(const std::string& local_path) { } return true; }; - RETURN_IF_ERROR(execute(callback)); + + if (auto s = execute(callback); !s.ok()) { + status = s; + } + if (!status.ok()) { + remove(local_path.c_str()); + } return status; } diff --git a/be/src/http/http_common.h b/be/src/http/http_common.h index ec2dfc896e48a3..7262ea28a8ddda 100644 --- a/be/src/http/http_common.h +++ b/be/src/http/http_common.h @@ -59,6 +59,7 @@ static const std::string HTTP_SKIP_LINES = "skip_lines"; static const std::string HTTP_COMMENT = "comment"; static const std::string HTTP_ENABLE_PROFILE = "enable_profile"; static const std::string HTTP_PARTIAL_COLUMNS = "partial_columns"; +static const std::string HTTP_UNIQUE_KEY_UPDATE_MODE = "unique_key_update_mode"; static const std::string HTTP_SQL = "sql"; static const std::string HTTP_TWO_PHASE_COMMIT = "two_phase_commit"; static const std::string HTTP_TXN_ID_KEY = "txn_id"; diff --git a/be/src/io/cache/block_file_cache.cpp b/be/src/io/cache/block_file_cache.cpp index 2ff374442d1580..4fb3f3e02cb58c 100644 --- a/be/src/io/cache/block_file_cache.cpp +++ b/be/src/io/cache/block_file_cache.cpp @@ -21,6 +21,7 @@ #include "io/cache/block_file_cache.h" #include "common/status.h" +#include "cpp/sync_point.h" #if defined(__APPLE__) #include @@ -53,6 +54,8 @@ BlockFileCache::BlockFileCache(const std::string& cache_base_path, _max_query_cache_size(cache_settings.max_query_cache_size) { _cur_cache_size_metrics = std::make_shared>(_cache_base_path.c_str(), "file_cache_cache_size", 0); + _cache_capacity_metrics = std::make_shared>( + _cache_base_path.c_str(), "file_cache_capacity", _capacity); _cur_ttl_cache_size_metrics = std::make_shared>( _cache_base_path.c_str(), "file_cache_ttl_cache_size", 0); _cur_normal_queue_element_count_metrics = std::make_shared>( @@ -83,15 +86,131 @@ BlockFileCache::BlockFileCache(const std::string& cache_base_path, _total_evict_size_metrics = std::make_shared>( _cache_base_path.c_str(), "file_cache_total_evict_size"); + _evict_by_heat_metrics_matrix[FileCacheType::DISPOSABLE][FileCacheType::NORMAL] = + std::make_shared>(_cache_base_path.c_str(), + "file_cache_evict_by_heat_disposable_to_normal"); + _evict_by_heat_metrics_matrix[FileCacheType::DISPOSABLE][FileCacheType::INDEX] = + std::make_shared>(_cache_base_path.c_str(), + 
"file_cache_evict_by_heat_disposable_to_index"); + _evict_by_heat_metrics_matrix[FileCacheType::DISPOSABLE][FileCacheType::TTL] = + std::make_shared>(_cache_base_path.c_str(), + "file_cache_evict_by_heat_disposable_to_ttl"); + _evict_by_heat_metrics_matrix[FileCacheType::NORMAL][FileCacheType::DISPOSABLE] = + std::make_shared>(_cache_base_path.c_str(), + "file_cache_evict_by_heat_normal_to_disposable"); + _evict_by_heat_metrics_matrix[FileCacheType::NORMAL][FileCacheType::INDEX] = + std::make_shared>(_cache_base_path.c_str(), + "file_cache_evict_by_heat_normal_to_index"); + _evict_by_heat_metrics_matrix[FileCacheType::NORMAL][FileCacheType::TTL] = + std::make_shared>(_cache_base_path.c_str(), + "file_cache_evict_by_heat_normal_to_ttl"); + _evict_by_heat_metrics_matrix[FileCacheType::INDEX][FileCacheType::DISPOSABLE] = + std::make_shared>(_cache_base_path.c_str(), + "file_cache_evict_by_heat_index_to_disposable"); + _evict_by_heat_metrics_matrix[FileCacheType::INDEX][FileCacheType::NORMAL] = + std::make_shared>(_cache_base_path.c_str(), + "file_cache_evict_by_heat_index_to_normal"); + _evict_by_heat_metrics_matrix[FileCacheType::INDEX][FileCacheType::TTL] = + std::make_shared>(_cache_base_path.c_str(), + "file_cache_evict_by_heat_index_to_ttl"); + _evict_by_heat_metrics_matrix[FileCacheType::TTL][FileCacheType::DISPOSABLE] = + std::make_shared>(_cache_base_path.c_str(), + "file_cache_evict_by_heat_ttl_to_disposable"); + _evict_by_heat_metrics_matrix[FileCacheType::TTL][FileCacheType::NORMAL] = + std::make_shared>(_cache_base_path.c_str(), + "file_cache_evict_by_heat_ttl_to_normal"); + _evict_by_heat_metrics_matrix[FileCacheType::TTL][FileCacheType::INDEX] = + std::make_shared>(_cache_base_path.c_str(), + "file_cache_evict_by_heat_ttl_to_index"); + + _evict_by_self_lru_metrics_matrix[FileCacheType::DISPOSABLE] = + std::make_shared>(_cache_base_path.c_str(), + "file_cache_evict_by_self_lru_disposable"); + _evict_by_self_lru_metrics_matrix[FileCacheType::NORMAL] = + std::make_shared>(_cache_base_path.c_str(), + "file_cache_evict_by_self_lru_normal"); + _evict_by_self_lru_metrics_matrix[FileCacheType::INDEX] = std::make_shared>( + _cache_base_path.c_str(), "file_cache_evict_by_self_lru_index"); + _evict_by_self_lru_metrics_matrix[FileCacheType::TTL] = std::make_shared>( + _cache_base_path.c_str(), "file_cache_evict_by_self_lru_ttl"); + + _evict_by_size_metrics_matrix[FileCacheType::DISPOSABLE][FileCacheType::NORMAL] = + std::make_shared>(_cache_base_path.c_str(), + "file_cache_evict_by_size_disposable_to_normal"); + _evict_by_size_metrics_matrix[FileCacheType::DISPOSABLE][FileCacheType::INDEX] = + std::make_shared>(_cache_base_path.c_str(), + "file_cache_evict_by_size_disposable_to_index"); + _evict_by_size_metrics_matrix[FileCacheType::DISPOSABLE][FileCacheType::TTL] = + std::make_shared>(_cache_base_path.c_str(), + "file_cache_evict_by_size_disposable_to_ttl"); + _evict_by_size_metrics_matrix[FileCacheType::NORMAL][FileCacheType::DISPOSABLE] = + std::make_shared>(_cache_base_path.c_str(), + "file_cache_evict_by_size_normal_to_disposable"); + _evict_by_size_metrics_matrix[FileCacheType::NORMAL][FileCacheType::INDEX] = + std::make_shared>(_cache_base_path.c_str(), + "file_cache_evict_by_size_normal_to_index"); + _evict_by_size_metrics_matrix[FileCacheType::NORMAL][FileCacheType::TTL] = + std::make_shared>(_cache_base_path.c_str(), + "file_cache_evict_by_size_normal_to_ttl"); + _evict_by_size_metrics_matrix[FileCacheType::INDEX][FileCacheType::DISPOSABLE] = + 
std::make_shared>(_cache_base_path.c_str(), + "file_cache_evict_by_size_index_to_disposable"); + _evict_by_size_metrics_matrix[FileCacheType::INDEX][FileCacheType::NORMAL] = + std::make_shared>(_cache_base_path.c_str(), + "file_cache_evict_by_size_index_to_normal"); + _evict_by_size_metrics_matrix[FileCacheType::INDEX][FileCacheType::TTL] = + std::make_shared>(_cache_base_path.c_str(), + "file_cache_evict_by_size_index_to_ttl"); + _evict_by_size_metrics_matrix[FileCacheType::TTL][FileCacheType::DISPOSABLE] = + std::make_shared>(_cache_base_path.c_str(), + "file_cache_evict_by_size_ttl_to_disposable"); + _evict_by_size_metrics_matrix[FileCacheType::TTL][FileCacheType::NORMAL] = + std::make_shared>(_cache_base_path.c_str(), + "file_cache_evict_by_size_ttl_to_normal"); + _evict_by_size_metrics_matrix[FileCacheType::TTL][FileCacheType::INDEX] = + std::make_shared>(_cache_base_path.c_str(), + "file_cache_evict_by_size_ttl_to_index"); + + _evict_by_try_release = std::make_shared>( + _cache_base_path.c_str(), "file_cache_evict_by_try_release"); + + _num_read_blocks = std::make_shared>(_cache_base_path.c_str(), + "file_cache_num_read_blocks"); + _num_hit_blocks = std::make_shared>(_cache_base_path.c_str(), + "file_cache_num_hit_blocks"); + _num_removed_blocks = std::make_shared>(_cache_base_path.c_str(), + "file_cache_num_removed_blocks"); + + _num_hit_blocks_5m = std::make_shared>>( + _cache_base_path.c_str(), "file_cache_num_hit_blocks_5m", _num_hit_blocks.get(), 300); + _num_read_blocks_5m = std::make_shared>>( + _cache_base_path.c_str(), "file_cache_num_read_blocks_5m", _num_read_blocks.get(), 300); + _num_hit_blocks_1h = std::make_shared>>( + _cache_base_path.c_str(), "file_cache_num_hit_blocks_1h", _num_hit_blocks.get(), 3600); + _num_read_blocks_1h = std::make_shared>>( + _cache_base_path.c_str(), "file_cache_num_read_blocks_1h", _num_read_blocks.get(), + 3600); + + _hit_ratio = std::make_shared>(_cache_base_path.c_str(), + "file_cache_hit_ratio", 0.0); + _hit_ratio_5m = std::make_shared>(_cache_base_path.c_str(), + "file_cache_hit_ratio_5m", 0.0); + _hit_ratio_1h = std::make_shared>(_cache_base_path.c_str(), + "file_cache_hit_ratio_1h", 0.0); + _disk_limit_mode_metrics = + std::make_shared>(_cache_base_path.c_str(), "disk_limit_mode", 0); + _disposable_queue = LRUQueue(cache_settings.disposable_queue_size, cache_settings.disposable_queue_elements, 60 * 60); _index_queue = LRUQueue(cache_settings.index_queue_size, cache_settings.index_queue_elements, 7 * 24 * 60 * 60); _normal_queue = LRUQueue(cache_settings.query_queue_size, cache_settings.query_queue_elements, 24 * 60 * 60); - _ttl_queue = LRUQueue(std::numeric_limits::max(), std::numeric_limits::max(), + _ttl_queue = LRUQueue(cache_settings.ttl_queue_size, cache_settings.ttl_queue_elements, std::numeric_limits::max()); + _recycle_keys = std::make_shared>( + config::file_cache_recycle_keys_size); if (cache_settings.storage == "memory") { _storage = std::make_unique(); _cache_base_path = "memory"; @@ -136,8 +255,7 @@ FileCacheType BlockFileCache::string_to_cache_type(const std::string& str) { BlockFileCache::QueryFileCacheContextHolderPtr BlockFileCache::get_query_context_holder( const TUniqueId& query_id) { - std::lock_guard cache_lock(_mutex); - + SCOPED_CACHE_LOCK(_mutex); if (!config::enable_file_cache_query_limit) { return {}; } @@ -155,7 +273,7 @@ BlockFileCache::QueryFileCacheContextPtr BlockFileCache::get_query_context( } void BlockFileCache::remove_query_context(const TUniqueId& query_id) { - std::lock_guard cache_lock(_mutex); + 
SCOPED_CACHE_LOCK(_mutex); const auto& query_iter = _query_map.find(query_id); if (query_iter != _query_map.end() && query_iter->second.use_count() <= 1) { @@ -200,7 +318,7 @@ void BlockFileCache::QueryFileCacheContext::reserve(const UInt128Wrapper& hash, } Status BlockFileCache::initialize() { - std::lock_guard cache_lock(_mutex); + SCOPED_CACHE_LOCK(_mutex); return initialize_unlocked(cache_lock); } @@ -219,15 +337,12 @@ void BlockFileCache::use_cell(const FileBlockCell& cell, FileBlocks* result, boo result->push_back(cell.file_block); } - if (cell.file_block->cache_type() != FileCacheType::TTL || - config::enable_ttl_cache_evict_using_lru) { - auto& queue = get_queue(cell.file_block->cache_type()); - DCHECK(cell.queue_iterator) << "impossible"; - /// Move to the end of the queue. The iterator remains valid. - if (move_iter_flag) { - queue.move_to_end(*cell.queue_iterator, cache_lock); - } + auto& queue = get_queue(cell.file_block->cache_type()); + /// Move to the end of the queue. The iterator remains valid. + if (cell.queue_iterator && move_iter_flag) { + queue.move_to_end(*cell.queue_iterator, cache_lock); } + cell.update_atime(); cell.is_deleted = false; } @@ -292,14 +407,10 @@ FileBlocks BlockFileCache::get_impl(const UInt128Wrapper& hash, const CacheConte if (st.ok()) { auto& queue = get_queue(origin_type); queue.remove(cell.queue_iterator.value(), cache_lock); - if (config::enable_ttl_cache_evict_using_lru) { - auto& ttl_queue = get_queue(FileCacheType::TTL); - cell.queue_iterator = ttl_queue.add( - cell.file_block->get_hash_value(), cell.file_block->offset(), - cell.file_block->range().size(), cache_lock); - } else { - cell.queue_iterator.reset(); - } + auto& ttl_queue = get_queue(FileCacheType::TTL); + cell.queue_iterator = + ttl_queue.add(cell.file_block->get_hash_value(), cell.file_block->offset(), + cell.file_block->range().size(), cache_lock); } else { LOG_WARNING("Failed to change key meta").error(st); } @@ -333,7 +444,7 @@ FileBlocks BlockFileCache::get_impl(const UInt128Wrapper& hash, const CacheConte auto st = cell.file_block->change_cache_type_between_ttl_and_others( FileCacheType::NORMAL); if (st.ok()) { - if (config::enable_ttl_cache_evict_using_lru) { + if (cell.queue_iterator) { auto& ttl_queue = get_queue(FileCacheType::TTL); ttl_queue.remove(cell.queue_iterator.value(), cache_lock); } @@ -414,7 +525,7 @@ std::string BlockFileCache::clear_file_cache_async() { int64_t num_cells_to_delete = 0; int64_t num_files_all = 0; { - std::lock_guard cache_lock(_mutex); + SCOPED_CACHE_LOCK(_mutex); if (!_async_clear_file_cache) { for (auto& [_, offset_to_cell] : _files) { ++num_files_all; @@ -650,7 +761,7 @@ FileBlocksHolder BlockFileCache::get_or_set(const UInt128Wrapper& hash, size_t o CacheContext& context) { FileBlock::Range range(offset, offset + size - 1); - std::lock_guard cache_lock(_mutex); + SCOPED_CACHE_LOCK(_mutex); if (auto iter = _key_to_time.find(hash); context.cache_type == FileCacheType::INDEX && iter != _key_to_time.end()) { context.cache_type = FileCacheType::TTL; @@ -667,10 +778,10 @@ FileBlocksHolder BlockFileCache::get_or_set(const UInt128Wrapper& hash, size_t o fill_holes_with_empty_file_blocks(file_blocks, hash, context, range, cache_lock); } DCHECK(!file_blocks.empty()); - _num_read_blocks += file_blocks.size(); + *_num_read_blocks << file_blocks.size(); for (auto& block : file_blocks) { if (block->state() == FileBlock::State::DOWNLOADED) { - _num_hit_blocks++; + *_num_hit_blocks << 1; } } return FileBlocksHolder(std::move(file_blocks)); @@ -709,11 
+820,10 @@ BlockFileCache::FileBlockCell* BlockFileCache::add_cell(const UInt128Wrapper& ha << " cache_type=" << cache_type_to_string(context.cache_type) << " error=" << st.msg(); } - if (cell.file_block->cache_type() != FileCacheType::TTL || - config::enable_ttl_cache_evict_using_lru) { - auto& queue = get_queue(cell.file_block->cache_type()); - cell.queue_iterator = queue.add(hash, offset, size, cache_lock); - } + + auto& queue = get_queue(cell.file_block->cache_type()); + cell.queue_iterator = queue.add(hash, offset, size, cache_lock); + if (cell.file_block->cache_type() == FileCacheType::TTL) { if (_key_to_time.find(hash) == _key_to_time.end()) { _key_to_time[hash] = context.expiration_time; @@ -727,7 +837,7 @@ BlockFileCache::FileBlockCell* BlockFileCache::add_cell(const UInt128Wrapper& ha } size_t BlockFileCache::try_release() { - std::lock_guard l(_mutex); + SCOPED_CACHE_LOCK(_mutex); std::vector trash; for (auto& [hash, blocks] : _files) { for (auto& [offset, cell] : blocks) { @@ -736,11 +846,14 @@ size_t BlockFileCache::try_release() { } } } + size_t remove_size = 0; for (auto& cell : trash) { FileBlockSPtr file_block = cell->file_block; std::lock_guard lc(cell->file_block->_mutex); - remove(file_block, l, lc); + remove_size += file_block->range().size(); + remove(file_block, cache_lock, lc); } + *_evict_by_try_release << remove_size; LOG(INFO) << "Released " << trash.size() << " blocks in file cache " << _cache_base_path; return trash.size(); } @@ -789,6 +902,18 @@ void BlockFileCache::remove_file_blocks(std::vector& to_evict, std::for_each(to_evict.begin(), to_evict.end(), remove_file_block_if); } +void BlockFileCache::remove_file_blocks_async(std::vector& to_evict, + std::lock_guard& cache_lock) { + auto remove_file_block_if = [&](FileBlockCell* cell) { + FileBlockSPtr file_block = cell->file_block; + if (file_block) { + std::lock_guard block_lock(file_block->_mutex); + remove(file_block, cache_lock, block_lock, /*sync*/ false); + } + }; + std::for_each(to_evict.begin(), to_evict.end(), remove_file_block_if); +} + void BlockFileCache::remove_file_blocks_and_clean_time_maps( std::vector& to_evict, std::lock_guard& cache_lock) { auto remove_file_block_and_clean_time_maps_if = [&](FileBlockCell* cell) { @@ -819,9 +944,10 @@ void BlockFileCache::remove_file_blocks_and_clean_time_maps( void BlockFileCache::find_evict_candidates(LRUQueue& queue, size_t size, size_t cur_cache_size, size_t& removed_size, std::vector& to_evict, - std::lock_guard& cache_lock, bool is_ttl) { + std::lock_guard& cache_lock, + size_t& cur_removed_size) { for (const auto& [entry_key, entry_offset, entry_size] : queue) { - if (!is_overflow(removed_size, size, cur_cache_size, is_ttl)) { + if (!is_overflow(removed_size, size, cur_cache_size)) { break; } auto* cell = get_cell(entry_key, entry_offset, cache_lock); @@ -839,6 +965,7 @@ void BlockFileCache::find_evict_candidates(LRUQueue& queue, size_t size, size_t DCHECK(file_block->_download_state == FileBlock::State::DOWNLOADED); to_evict.push_back(cell); removed_size += cell_size; + cur_removed_size += cell_size; } } } @@ -848,6 +975,9 @@ bool BlockFileCache::try_reserve_for_ttl_without_lru(size_t size, size_t removed_size = 0; size_t cur_cache_size = _cur_cache_size; auto limit = config::max_ttl_cache_ratio * _capacity; + + TEST_INJECTION_POINT_CALLBACK("BlockFileCache::change_limit1", &limit); + if ((_cur_ttl_size + size) * 100 > limit) { return false; } @@ -861,8 +991,9 @@ bool BlockFileCache::try_reserve_for_ttl_without_lru(size_t size, } std::vector 
to_evict; auto collect_eliminate_fragments = [&](LRUQueue& queue) { + size_t cur_removed_size = 0; find_evict_candidates(queue, size, cur_cache_size, removed_size, to_evict, cache_lock, - false); + cur_removed_size); }; if (disposable_queue_size != 0) { collect_eliminate_fragments(get_queue(FileCacheType::DISPOSABLE)); @@ -889,8 +1020,9 @@ bool BlockFileCache::try_reserve_for_ttl(size_t size, std::lock_guard to_evict; + size_t cur_removed_size = 0; find_evict_candidates(queue, size, cur_cache_size, removed_size, to_evict, cache_lock, - true); + cur_removed_size); remove_file_blocks_and_clean_time_maps(to_evict, cache_lock); return !is_overflow(removed_size, size, cur_cache_size); @@ -923,10 +1055,6 @@ bool BlockFileCache::try_reserve(const UInt128Wrapper& hash, const CacheContext& size = 5 * size; } - if (context.cache_type == FileCacheType::TTL) { - return try_reserve_for_ttl(size, cache_lock); - } - auto query_context = config::enable_file_cache_query_limit && (context.query_id.hi != 0 || context.query_id.lo != 0) ? get_query_context(context.query_id, cache_lock) @@ -1028,7 +1156,7 @@ bool BlockFileCache::remove_if_ttl_file_unlock(const UInt128Wrapper& file_key, b auto st = cell.file_block->change_cache_type_between_ttl_and_others( FileCacheType::NORMAL); if (st.ok()) { - if (config::enable_ttl_cache_evict_using_lru) { + if (cell.queue_iterator) { ttl_queue.remove(cell.queue_iterator.value(), cache_lock); } auto& queue = get_queue(FileCacheType::NORMAL); @@ -1071,7 +1199,7 @@ bool BlockFileCache::remove_if_ttl_file_unlock(const UInt128Wrapper& file_key, b } void BlockFileCache::remove_if_cached(const UInt128Wrapper& file_key) { - std::lock_guard cache_lock(_mutex); + SCOPED_CACHE_LOCK(_mutex); bool is_ttl_file = remove_if_ttl_file_unlock(file_key, true, cache_lock); if (!is_ttl_file) { auto iter = _files.find(file_key); @@ -1087,12 +1215,50 @@ void BlockFileCache::remove_if_cached(const UInt128Wrapper& file_key) { } } -std::vector BlockFileCache::get_other_cache_type(FileCacheType cur_cache_type) { +void BlockFileCache::remove_if_cached_async(const UInt128Wrapper& file_key) { + SCOPED_CACHE_LOCK(_mutex); + bool is_ttl_file = remove_if_ttl_file_unlock(file_key, true, cache_lock); + if (!is_ttl_file) { + auto iter = _files.find(file_key); + std::vector to_remove; + if (iter != _files.end()) { + for (auto& [_, cell] : iter->second) { + if (cell.releasable()) { + to_remove.push_back(&cell); + } + } + } + remove_file_blocks_async(to_remove, cache_lock); + } +} + +std::vector BlockFileCache::get_other_cache_type_without_ttl( + FileCacheType cur_cache_type) { switch (cur_cache_type) { + case FileCacheType::TTL: + return {FileCacheType::DISPOSABLE, FileCacheType::NORMAL, FileCacheType::INDEX}; case FileCacheType::INDEX: return {FileCacheType::DISPOSABLE, FileCacheType::NORMAL}; case FileCacheType::NORMAL: return {FileCacheType::DISPOSABLE, FileCacheType::INDEX}; + case FileCacheType::DISPOSABLE: + return {FileCacheType::NORMAL, FileCacheType::INDEX}; + default: + return {}; + } + return {}; +} + +std::vector BlockFileCache::get_other_cache_type(FileCacheType cur_cache_type) { + switch (cur_cache_type) { + case FileCacheType::TTL: + return {FileCacheType::DISPOSABLE, FileCacheType::NORMAL, FileCacheType::INDEX}; + case FileCacheType::INDEX: + return {FileCacheType::DISPOSABLE, FileCacheType::NORMAL, FileCacheType::TTL}; + case FileCacheType::NORMAL: + return {FileCacheType::DISPOSABLE, FileCacheType::INDEX, FileCacheType::TTL}; + case FileCacheType::DISPOSABLE: + return {FileCacheType::NORMAL, 
                FileCacheType::INDEX, FileCacheType::TTL};
    default:
        return {};
    }
    return {};
}
@@ -1105,7 +1271,7 @@ void BlockFileCache::reset_range(const UInt128Wrapper& hash, size_t offset, size
           _files.find(hash)->second.find(offset) != _files.find(hash)->second.end());
    FileBlockCell* cell = get_cell(hash, offset, cache_lock);
    DCHECK(cell != nullptr);
-    if (cell->file_block->cache_type() != FileCacheType::TTL) {
+    if (cell->queue_iterator) {
        auto& queue = get_queue(cell->file_block->cache_type());
        DCHECK(queue.contains(hash, offset, cache_lock));
        auto iter = queue.get(hash, offset, cache_lock);
@@ -1118,13 +1284,14 @@
 }

 bool BlockFileCache::try_reserve_from_other_queue_by_hot_interval(
-        std::vector<FileCacheType> other_cache_types, size_t size, int64_t cur_time,
-        std::lock_guard<std::mutex>& cache_lock) {
+        FileCacheType cur_type, std::vector<FileCacheType> other_cache_types, size_t size,
+        int64_t cur_time, std::lock_guard<std::mutex>& cache_lock) {
     size_t removed_size = 0;
     size_t cur_cache_size = _cur_cache_size;
     std::vector<FileBlockCell*> to_evict;
     for (FileCacheType cache_type : other_cache_types) {
         auto& queue = get_queue(cache_type);
+        size_t remove_size_per_type = 0;
         for (const auto& [entry_key, entry_offset, entry_size] : queue) {
             if (!is_overflow(removed_size, size, cur_cache_size)) {
                 break;
@@ -1146,39 +1313,48 @@
             DCHECK(file_block->_download_state == FileBlock::State::DOWNLOADED);
             to_evict.push_back(cell);
             removed_size += cell_size;
+            remove_size_per_type += cell_size;
         }
         }
+        *(_evict_by_heat_metrics_matrix[cache_type][cur_type]) << remove_size_per_type;
     }
     remove_file_blocks(to_evict, cache_lock);

     return !is_overflow(removed_size, size, cur_cache_size);
 }

-bool BlockFileCache::is_overflow(size_t removed_size, size_t need_size, size_t cur_cache_size,
-                                 bool is_ttl) const {
+bool BlockFileCache::is_overflow(size_t removed_size, size_t need_size,
+                                 size_t cur_cache_size) const {
     bool ret = false;
     if (_disk_resource_limit_mode) {
         ret = (removed_size < need_size);
     } else {
         ret = (cur_cache_size + need_size - removed_size > _capacity);
     }
-    if (is_ttl) {
-        size_t ttl_threshold = config::max_ttl_cache_ratio * _capacity / 100;
-        return (ret || ((cur_cache_size + need_size - removed_size) > ttl_threshold));
-    }
     return ret;
 }

 bool BlockFileCache::try_reserve_from_other_queue_by_size(
-        std::vector<FileCacheType> other_cache_types, size_t size,
+        FileCacheType cur_type, std::vector<FileCacheType> other_cache_types, size_t size,
         std::lock_guard<std::mutex>& cache_lock) {
     size_t removed_size = 0;
     size_t cur_cache_size = _cur_cache_size;
     std::vector<FileBlockCell*> to_evict;
+    // we follow the priority order defined in get_other_cache_type when evicting
     for (FileCacheType cache_type : other_cache_types) {
         auto& queue = get_queue(cache_type);
+
+        // we will not drain each of them to the bottom -- i.e., we only
+        // evict what they have stolen.
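+        // ("stolen" here means growth beyond a queue's configured max size: for
+        // example, if the NORMAL queue is capped at 40% of the cache but currently
+        // holds more, only that overshoot is eligible for eviction in this pass.)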
+ size_t cur_queue_size = queue.get_capacity(cache_lock); + size_t cur_queue_max_size = queue.get_max_size(); + if (cur_queue_size <= cur_queue_max_size) { + continue; + } + size_t cur_removed_size = 0; find_evict_candidates(queue, size, cur_cache_size, removed_size, to_evict, cache_lock, - false); + cur_removed_size); + *(_evict_by_size_metrics_matrix[cache_type][cur_type]) << cur_removed_size; } remove_file_blocks(to_evict, cache_lock); return !is_overflow(removed_size, size, cur_cache_size); @@ -1187,16 +1363,15 @@ bool BlockFileCache::try_reserve_from_other_queue_by_size( bool BlockFileCache::try_reserve_from_other_queue(FileCacheType cur_cache_type, size_t size, int64_t cur_time, std::lock_guard& cache_lock) { - // disposable queue cannot reserve other queues - if (cur_cache_type == FileCacheType::DISPOSABLE) { - return false; - } - auto other_cache_types = get_other_cache_type(cur_cache_type); - bool reserve_success = try_reserve_from_other_queue_by_hot_interval(other_cache_types, size, - cur_time, cache_lock); + // currently, TTL cache is not considered as a candidate + auto other_cache_types = get_other_cache_type_without_ttl(cur_cache_type); + bool reserve_success = try_reserve_from_other_queue_by_hot_interval( + cur_cache_type, other_cache_types, size, cur_time, cache_lock); if (reserve_success || !config::file_cache_enable_evict_from_other_queue_by_size) { return reserve_success; } + + other_cache_types = get_other_cache_type(cur_cache_type); auto& cur_queue = get_queue(cur_cache_type); size_t cur_queue_size = cur_queue.get_capacity(cache_lock); size_t cur_queue_max_size = cur_queue.get_max_size(); @@ -1204,7 +1379,8 @@ bool BlockFileCache::try_reserve_from_other_queue(FileCacheType cur_cache_type, if (_cur_cache_size + size > _capacity && cur_queue_size + size > cur_queue_max_size) { return false; } - return try_reserve_from_other_queue_by_size(other_cache_types, size, cache_lock); + return try_reserve_from_other_queue_by_size(cur_cache_type, other_cache_types, size, + cache_lock); } bool BlockFileCache::try_reserve_for_lru(const UInt128Wrapper& hash, @@ -1220,9 +1396,11 @@ bool BlockFileCache::try_reserve_for_lru(const UInt128Wrapper& hash, size_t cur_cache_size = _cur_cache_size; std::vector to_evict; + size_t cur_removed_size = 0; find_evict_candidates(queue, size, cur_cache_size, removed_size, to_evict, cache_lock, - false); + cur_removed_size); remove_file_blocks(to_evict, cache_lock); + *(_evict_by_self_lru_metrics_matrix[context.cache_type]) << cur_removed_size; if (is_overflow(removed_size, size, cur_cache_size)) { return false; @@ -1237,7 +1415,7 @@ bool BlockFileCache::try_reserve_for_lru(const UInt128Wrapper& hash, template requires IsXLock && IsXLock -void BlockFileCache::remove(FileBlockSPtr file_block, T& cache_lock, U& block_lock) { +void BlockFileCache::remove(FileBlockSPtr file_block, T& cache_lock, U& block_lock, bool sync) { auto hash = file_block->get_hash_value(); auto offset = file_block->offset(); auto type = file_block->cache_type(); @@ -1257,9 +1435,24 @@ void BlockFileCache::remove(FileBlockSPtr file_block, T& cache_lock, U& block_lo key.offset = offset; key.meta.type = type; key.meta.expiration_time = expiration_time; - Status st = _storage->remove(key); - if (!st.ok()) { - LOG_WARNING("").error(st); + if (sync) { + Status st = _storage->remove(key); + if (!st.ok()) { + LOG_WARNING("").error(st); + } + } else { + // the file will be deleted in the bottom half + // so there will be a window that the file is not in the cache but still in the storage + 
// but it's ok, because the rowset is stale already + // in case something unexpected happen, set the _recycle_keys queue to zero to fallback + bool ret = _recycle_keys->push(key); + if (!ret) { + LOG_WARNING("Failed to push recycle key to queue, do it synchronously"); + Status st = _storage->remove(key); + if (!st.ok()) { + LOG_WARNING("").error(st); + } + } } } _cur_cache_size -= file_block->range().size(); @@ -1271,11 +1464,21 @@ void BlockFileCache::remove(FileBlockSPtr file_block, T& cache_lock, U& block_lo if (offsets.empty()) { _files.erase(hash); } - _num_removed_blocks++; + *_num_removed_blocks << 1; +} + +void BlockFileCache::recycle_stale_rowset_async_bottom_half() { + FileCacheKey key; + while (_recycle_keys->pop(key)) { + Status st = _storage->remove(key); + if (!st.ok()) { + LOG_WARNING("").error(st); + } + } } size_t BlockFileCache::get_used_cache_size(FileCacheType cache_type) const { - std::lock_guard cache_lock(_mutex); + SCOPED_CACHE_LOCK(_mutex); return get_used_cache_size_unlocked(cache_type, cache_lock); } @@ -1285,7 +1488,7 @@ size_t BlockFileCache::get_used_cache_size_unlocked(FileCacheType cache_type, } size_t BlockFileCache::get_available_cache_size(FileCacheType cache_type) const { - std::lock_guard cache_lock(_mutex); + SCOPED_CACHE_LOCK(_mutex); return get_available_cache_size_unlocked(cache_type, cache_lock); } @@ -1296,7 +1499,7 @@ size_t BlockFileCache::get_available_cache_size_unlocked( } size_t BlockFileCache::get_file_blocks_num(FileCacheType cache_type) const { - std::lock_guard cache_lock(_mutex); + SCOPED_CACHE_LOCK(_mutex); return get_file_blocks_num_unlocked(cache_type, cache_lock); } @@ -1380,7 +1583,7 @@ std::string BlockFileCache::LRUQueue::to_string( } std::string BlockFileCache::dump_structure(const UInt128Wrapper& hash) { - std::lock_guard cache_lock(_mutex); + SCOPED_CACHE_LOCK(_mutex); return dump_structure_unlocked(hash, cache_lock); } @@ -1398,7 +1601,7 @@ std::string BlockFileCache::dump_structure_unlocked(const UInt128Wrapper& hash, } std::string BlockFileCache::dump_single_cache_type(const UInt128Wrapper& hash, size_t offset) { - std::lock_guard cache_lock(_mutex); + SCOPED_CACHE_LOCK(_mutex); return dump_single_cache_type_unlocked(hash, offset, cache_lock); } @@ -1461,7 +1664,7 @@ std::string BlockFileCache::reset_capacity(size_t new_capacity) { ss << "finish reset_capacity, path=" << _cache_base_path; auto start_time = steady_clock::time_point(); { - std::lock_guard cache_lock(_mutex); + SCOPED_CACHE_LOCK(_mutex); if (new_capacity < _capacity && new_capacity < _cur_cache_size) { int64_t need_remove_size = _cur_cache_size - new_capacity; auto remove_blocks = [&](LRUQueue& queue) -> int64_t { @@ -1497,11 +1700,13 @@ std::string BlockFileCache::reset_capacity(size_t new_capacity) { ss << " ttl_queue released " << queue_released; } _disk_resource_limit_mode = true; + _disk_limit_mode_metrics->set_value(1); _async_clear_file_cache = true; ss << " total_space_released=" << space_released; } old_capacity = _capacity; _capacity = new_capacity; + _cache_capacity_metrics->set_value(_capacity); } auto use_time = duration_cast(steady_clock::time_point() - start_time); LOG(INFO) << "Finish tag deleted block. 
path=" << _cache_base_path @@ -1517,6 +1722,7 @@ void BlockFileCache::check_disk_resource_limit() { } if (_capacity > _cur_cache_size) { _disk_resource_limit_mode = false; + _disk_limit_mode_metrics->set_value(0); } std::pair percent; int ret = disk_used_percentage(_cache_base_path, &percent); @@ -1542,10 +1748,12 @@ void BlockFileCache::check_disk_resource_limit() { if (capacity_percentage >= config::file_cache_enter_disk_resource_limit_mode_percent || inode_is_insufficient(inode_percentage)) { _disk_resource_limit_mode = true; + _disk_limit_mode_metrics->set_value(1); } else if (_disk_resource_limit_mode && (capacity_percentage < config::file_cache_exit_disk_resource_limit_mode_percent) && (inode_percentage < config::file_cache_exit_disk_resource_limit_mode_percent)) { _disk_resource_limit_mode = false; + _disk_limit_mode_metrics->set_value(0); } if (_disk_resource_limit_mode) { // log per mins @@ -1569,10 +1777,11 @@ void BlockFileCache::run_background_operation() { break; } } + recycle_stale_rowset_async_bottom_half(); recycle_deleted_blocks(); // gc int64_t cur_time = UnixSeconds(); - std::lock_guard cache_lock(_mutex); + SCOPED_CACHE_LOCK(_mutex); while (!_time_to_key.empty()) { auto begin = _time_to_key.begin(); if (cur_time < begin->first) { @@ -1600,12 +1809,25 @@ void BlockFileCache::run_background_operation() { _disposable_queue.get_capacity(cache_lock)); _cur_disposable_queue_element_count_metrics->set_value( _disposable_queue.get_elements_num(cache_lock)); + + if (_num_read_blocks->get_value() > 0) { + _hit_ratio->set_value((double)_num_hit_blocks->get_value() / + _num_read_blocks->get_value()); + } + if (_num_read_blocks_5m->get_value() > 0) { + _hit_ratio_5m->set_value((double)_num_hit_blocks_5m->get_value() / + _num_read_blocks_5m->get_value()); + } + if (_num_read_blocks_1h->get_value() > 0) { + _hit_ratio_1h->set_value((double)_num_hit_blocks_1h->get_value() / + _num_read_blocks_1h->get_value()); + } } } void BlockFileCache::modify_expiration_time(const UInt128Wrapper& hash, uint64_t new_expiration_time) { - std::lock_guard cache_lock(_mutex); + SCOPED_CACHE_LOCK(_mutex); // 1. 
If new_expiration_time is equal to zero if (new_expiration_time == 0) { remove_if_ttl_file_unlock(hash, false, cache_lock); @@ -1647,14 +1869,9 @@ void BlockFileCache::modify_expiration_time(const UInt128Wrapper& hash, if (st.ok()) { auto& queue = get_queue(origin_type); queue.remove(cell.queue_iterator.value(), cache_lock); - if (config::enable_ttl_cache_evict_using_lru) { - auto& ttl_queue = get_queue(FileCacheType::TTL); - cell.queue_iterator = - ttl_queue.add(hash, cell.file_block->offset(), - cell.file_block->range().size(), cache_lock); - } else { - cell.queue_iterator.reset(); - } + auto& ttl_queue = get_queue(FileCacheType::TTL); + cell.queue_iterator = ttl_queue.add(hash, cell.file_block->offset(), + cell.file_block->range().size(), cache_lock); } if (!st.ok()) { LOG_WARNING("").error(st); @@ -1670,7 +1887,7 @@ BlockFileCache::get_hot_blocks_meta(const UInt128Wrapper& hash) const { int64_t cur_time = std::chrono::duration_cast( std::chrono::steady_clock::now().time_since_epoch()) .count(); - std::lock_guard cache_lock(_mutex); + SCOPED_CACHE_LOCK(_mutex); std::vector> blocks_meta; if (auto iter = _files.find(hash); iter != _files.end()) { for (auto& pair : _files.find(hash)->second) { @@ -1739,7 +1956,7 @@ std::string BlockFileCache::clear_file_cache_directly() { using namespace std::chrono; std::stringstream ss; auto start = steady_clock::now(); - std::lock_guard cache_lock(_mutex); + SCOPED_CACHE_LOCK(_mutex); LOG_INFO("start clear_file_cache_directly").tag("path", _cache_base_path); std::string clear_msg; @@ -1768,10 +1985,8 @@ std::string BlockFileCache::clear_file_cache_directly() { << " time_elapsed=" << duration_cast(steady_clock::now() - start).count() << " num_files=" << num_files << " cache_size=" << cache_size << " index_queue_size=" << index_queue_size << " normal_queue_size=" << normal_queue_size - << " disposible_queue_size=" << disposible_queue_size; - if (config::enable_ttl_cache_evict_using_lru) { - ss << "ttl_queue_size=" << ttl_queue_size; - } + << " disposible_queue_size=" << disposible_queue_size << "ttl_queue_size=" << ttl_queue_size; + auto msg = ss.str(); LOG(INFO) << msg; return msg; @@ -1779,7 +1994,7 @@ std::string BlockFileCache::clear_file_cache_directly() { std::map BlockFileCache::get_blocks_by_key(const UInt128Wrapper& hash) { std::map offset_to_block; - std::lock_guard cache_lock(_mutex); + SCOPED_CACHE_LOCK(_mutex); if (_files.contains(hash)) { for (auto& [offset, cell] : _files[hash]) { if (cell.file_block->state() == FileBlock::State::DOWNLOADED) { @@ -1794,7 +2009,7 @@ std::map BlockFileCache::get_blocks_by_key(const UInt128W } void BlockFileCache::update_ttl_atime(const UInt128Wrapper& hash) { - std::lock_guard lock(_mutex); + SCOPED_CACHE_LOCK(_mutex); if (auto iter = _files.find(hash); iter != _files.end()) { for (auto& [_, cell] : iter->second) { cell.update_atime(); @@ -1802,7 +2017,71 @@ void BlockFileCache::update_ttl_atime(const UInt128Wrapper& hash) { }; } +std::map BlockFileCache::get_stats() { + std::map stats; + stats["hits_ratio"] = (double)_hit_ratio->get_value(); + stats["hits_ratio_5m"] = (double)_hit_ratio_5m->get_value(); + stats["hits_ratio_1h"] = (double)_hit_ratio_1h->get_value(); + + stats["index_queue_max_size"] = (double)_index_queue.get_max_size(); + stats["index_queue_curr_size"] = (double)_cur_index_queue_element_count_metrics->get_value(); + stats["index_queue_max_elements"] = (double)_index_queue.get_max_element_size(); + stats["index_queue_curr_elements"] = + 
(double)_cur_index_queue_element_count_metrics->get_value(); + + stats["ttl_queue_max_size"] = (double)_ttl_queue.get_max_size(); + stats["ttl_queue_curr_size"] = (double)_cur_ttl_cache_lru_queue_cache_size_metrics->get_value(); + stats["ttl_queue_max_elements"] = (double)_ttl_queue.get_max_element_size(); + stats["ttl_queue_curr_elements"] = + (double)_cur_ttl_cache_lru_queue_element_count_metrics->get_value(); + + stats["normal_queue_max_size"] = (double)_normal_queue.get_max_size(); + stats["normal_queue_curr_size"] = (double)_cur_normal_queue_element_count_metrics->get_value(); + stats["normal_queue_max_elements"] = (double)_normal_queue.get_max_element_size(); + stats["normal_queue_curr_elements"] = + (double)_cur_normal_queue_element_count_metrics->get_value(); + + stats["disposable_queue_max_size"] = (double)_disposable_queue.get_max_size(); + stats["disposable_queue_curr_size"] = + (double)_cur_disposable_queue_element_count_metrics->get_value(); + stats["disposable_queue_max_elements"] = (double)_disposable_queue.get_max_element_size(); + stats["disposable_queue_curr_elements"] = + (double)_cur_disposable_queue_element_count_metrics->get_value(); + + return stats; +} + +// for be UTs +std::map BlockFileCache::get_stats_unsafe() { + std::map stats; + stats["hits_ratio"] = (double)_hit_ratio->get_value(); + stats["hits_ratio_5m"] = (double)_hit_ratio_5m->get_value(); + stats["hits_ratio_1h"] = (double)_hit_ratio_1h->get_value(); + + stats["index_queue_max_size"] = (double)_index_queue.get_max_size(); + stats["index_queue_curr_size"] = (double)_index_queue.get_capacity_unsafe(); + stats["index_queue_max_elements"] = (double)_index_queue.get_max_element_size(); + stats["index_queue_curr_elements"] = (double)_index_queue.get_elements_num_unsafe(); + + stats["ttl_queue_max_size"] = (double)_ttl_queue.get_max_size(); + stats["ttl_queue_curr_size"] = (double)_ttl_queue.get_capacity_unsafe(); + stats["ttl_queue_max_elements"] = (double)_ttl_queue.get_max_element_size(); + stats["ttl_queue_curr_elements"] = (double)_ttl_queue.get_elements_num_unsafe(); + + stats["normal_queue_max_size"] = (double)_normal_queue.get_max_size(); + stats["normal_queue_curr_size"] = (double)_normal_queue.get_capacity_unsafe(); + stats["normal_queue_max_elements"] = (double)_normal_queue.get_max_element_size(); + stats["normal_queue_curr_elements"] = (double)_normal_queue.get_elements_num_unsafe(); + + stats["disposable_queue_max_size"] = (double)_disposable_queue.get_max_size(); + stats["disposable_queue_curr_size"] = (double)_disposable_queue.get_capacity_unsafe(); + stats["disposable_queue_max_elements"] = (double)_disposable_queue.get_max_element_size(); + stats["disposable_queue_curr_elements"] = (double)_disposable_queue.get_elements_num_unsafe(); + + return stats; +} + template void BlockFileCache::remove(FileBlockSPtr file_block, std::lock_guard& cache_lock, - std::lock_guard& block_lock); + std::lock_guard& block_lock, bool sync); } // namespace doris::io diff --git a/be/src/io/cache/block_file_cache.h b/be/src/io/cache/block_file_cache.h index c0a2bce76b17c0..0de33dadc8249d 100644 --- a/be/src/io/cache/block_file_cache.h +++ b/be/src/io/cache/block_file_cache.h @@ -19,6 +19,7 @@ #include +#include #include #include #include @@ -27,15 +28,51 @@ #include "io/cache/file_block.h" #include "io/cache/file_cache_common.h" #include "io/cache/file_cache_storage.h" +#include "util/threadpool.h" namespace doris::io { +// Note: the cache_lock is scoped, so do not add do...while(0) here. 
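+// SCOPED_CACHE_LOCK declares a local `cache_lock` that the enclosing function (and the
+// *_unlocked helpers it calls) refer to by name, which is why the macro must not
+// introduce its own scope. With ENABLE_CACHE_LOCK_DEBUG defined it also logs a stack
+// trace when acquiring the lock stalls past config::cache_lock_long_tail_threshold ms,
+// and LockScopedTimer (below) warns when the lock is then held for more than 500 ms.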
+#ifdef ENABLE_CACHE_LOCK_DEBUG +#define SCOPED_CACHE_LOCK(MUTEX) \ + std::chrono::time_point start_time = \ + std::chrono::steady_clock::now(); \ + std::lock_guard cache_lock(MUTEX); \ + std::chrono::time_point acq_time = \ + std::chrono::steady_clock::now(); \ + auto duration = \ + std::chrono::duration_cast(acq_time - start_time).count(); \ + if (duration > config::cache_lock_long_tail_threshold) \ + LOG(WARNING) << "Lock wait time " << std::to_string(duration) << "ms. " \ + << get_stack_trace_by_boost() << std::endl; \ + LockScopedTimer cache_lock_timer; +#else +#define SCOPED_CACHE_LOCK(MUTEX) std::lock_guard cache_lock(MUTEX); +#endif + template concept IsXLock = std::same_as> || std::same_as>; class FSFileCacheStorage; +class LockScopedTimer { +public: + LockScopedTimer() : start_(std::chrono::steady_clock::now()) {} + + ~LockScopedTimer() { + auto end = std::chrono::steady_clock::now(); + auto duration = std::chrono::duration_cast(end - start_).count(); + if (duration > 500) { + LOG(WARNING) << "Lock held time " << std::to_string(duration) << "ms. " + << get_stack_trace_by_boost(); + } + } + +private: + std::chrono::time_point start_; +}; + // The BlockFileCache is responsible for the management of the blocks // The current strategies are lru and ttl. class BlockFileCache { @@ -119,6 +156,7 @@ class BlockFileCache { // remove all blocks that belong to the key void remove_if_cached(const UInt128Wrapper& key); + void remove_if_cached_async(const UInt128Wrapper& key); // modify the expiration time about the key void modify_expiration_time(const UInt128Wrapper& key, uint64_t new_expiration_time); @@ -143,6 +181,11 @@ class BlockFileCache { void update_ttl_atime(const UInt128Wrapper& hash); + std::map get_stats(); + + // for be UTs + std::map get_stats_unsafe(); + class LRUQueue { public: LRUQueue() = default; @@ -177,6 +220,10 @@ class BlockFileCache { return cache_size; } + size_t get_capacity_unsafe() const { return cache_size; } + + size_t get_elements_num_unsafe() const { return queue.size(); } + size_t get_elements_num(std::lock_guard& /* cache_lock */) const { return queue.size(); } @@ -318,7 +365,7 @@ class BlockFileCache { template requires IsXLock && IsXLock - void remove(FileBlockSPtr file_block, T& cache_lock, U& segment_lock); + void remove(FileBlockSPtr file_block, T& cache_lock, U& segment_lock, bool sync = true); FileBlocks get_impl(const UInt128Wrapper& hash, const CacheContext& context, const FileBlock::Range& range, std::lock_guard& cache_lock); @@ -343,6 +390,7 @@ class BlockFileCache { bool try_reserve_during_async_load(size_t size, std::lock_guard& cache_lock); std::vector get_other_cache_type(FileCacheType cur_cache_type); + std::vector get_other_cache_type_without_ttl(FileCacheType cur_cache_type); bool try_reserve_from_other_queue(FileCacheType cur_cache_type, size_t offset, int64_t cur_time, std::lock_guard& cache_lock); @@ -388,24 +436,30 @@ class BlockFileCache { void recycle_deleted_blocks(); - bool try_reserve_from_other_queue_by_hot_interval(std::vector other_cache_types, + bool try_reserve_from_other_queue_by_hot_interval(FileCacheType cur_type, + std::vector other_cache_types, size_t size, int64_t cur_time, std::lock_guard& cache_lock); - bool try_reserve_from_other_queue_by_size(std::vector other_cache_types, + bool try_reserve_from_other_queue_by_size(FileCacheType cur_type, + std::vector other_cache_types, size_t size, std::lock_guard& cache_lock); - bool is_overflow(size_t removed_size, size_t need_size, size_t cur_cache_size, - bool is_ttl = false) 
const; + bool is_overflow(size_t removed_size, size_t need_size, size_t cur_cache_size) const; void remove_file_blocks(std::vector&, std::lock_guard&); + void remove_file_blocks_async(std::vector&, std::lock_guard&); + void remove_file_blocks_and_clean_time_maps(std::vector&, std::lock_guard&); void find_evict_candidates(LRUQueue& queue, size_t size, size_t cur_cache_size, size_t& removed_size, std::vector& to_evict, - std::lock_guard& cache_lock, bool is_ttl); + std::lock_guard& cache_lock, size_t& cur_removed_size); + + void recycle_stale_rowset_async_bottom_half(); + // info std::string _cache_base_path; size_t _capacity = 0; @@ -444,10 +498,11 @@ class BlockFileCache { LRUQueue _disposable_queue; LRUQueue _ttl_queue; + // keys for async remove + std::shared_ptr> _recycle_keys; + // metrics - size_t _num_read_blocks = 0; - size_t _num_hit_blocks = 0; - size_t _num_removed_blocks = 0; + std::shared_ptr> _cache_capacity_metrics; std::shared_ptr> _cur_cache_size_metrics; std::shared_ptr> _cur_ttl_cache_size_metrics; std::shared_ptr> _cur_ttl_cache_lru_queue_cache_size_metrics; @@ -460,6 +515,24 @@ class BlockFileCache { std::shared_ptr> _cur_disposable_queue_cache_size_metrics; std::array>, 4> _queue_evict_size_metrics; std::shared_ptr> _total_evict_size_metrics; + std::shared_ptr> _evict_by_heat_metrics_matrix[4][4]; + std::shared_ptr> _evict_by_size_metrics_matrix[4][4]; + std::shared_ptr> _evict_by_self_lru_metrics_matrix[4]; + std::shared_ptr> _evict_by_try_release; + + std::shared_ptr>> _num_hit_blocks_5m; + std::shared_ptr>> _num_read_blocks_5m; + std::shared_ptr>> _num_hit_blocks_1h; + std::shared_ptr>> _num_read_blocks_1h; + + std::shared_ptr> _num_read_blocks; + std::shared_ptr> _num_hit_blocks; + std::shared_ptr> _num_removed_blocks; + + std::shared_ptr> _hit_ratio; + std::shared_ptr> _hit_ratio_5m; + std::shared_ptr> _hit_ratio_1h; + std::shared_ptr> _disk_limit_mode_metrics; }; } // namespace doris::io diff --git a/be/src/io/cache/block_file_cache_factory.cpp b/be/src/io/cache/block_file_cache_factory.cpp index ac16bbefa58d74..2d0d25735fe2fd 100644 --- a/be/src/io/cache/block_file_cache_factory.cpp +++ b/be/src/io/cache/block_file_cache_factory.cpp @@ -21,6 +21,9 @@ #include "io/cache/block_file_cache_factory.h" #include + +#include +#include #if defined(__APPLE__) #include #else @@ -32,9 +35,12 @@ #include #include "common/config.h" +#include "exec/schema_scanner/schema_scanner_helper.h" #include "io/cache/file_cache_common.h" #include "io/fs/local_file_system.h" #include "runtime/exec_env.h" +#include "service/backend_options.h" +#include "vec/core/block.h" namespace doris { class TUniqueId; @@ -115,6 +121,20 @@ Status FileCacheFactory::create_file_cache(const std::string& cache_base_path, return Status::OK(); } +std::vector FileCacheFactory::get_cache_file_by_path(const UInt128Wrapper& hash) { + io::BlockFileCache* cache = io::FileCacheFactory::instance()->get_by_path(hash); + auto blocks = cache->get_blocks_by_key(hash); + std::vector ret; + if (blocks.empty()) { + return ret; + } else { + for (auto& [_, fb] : blocks) { + ret.emplace_back(fb->get_cache_file()); + } + } + return ret; +} + BlockFileCache* FileCacheFactory::get_by_path(const UInt128Wrapper& key) { // dont need lock mutex because _caches is immutable after create_file_cache return _caches[KeyHash()(key) % _caches.size()].get(); @@ -169,5 +189,23 @@ std::string FileCacheFactory::reset_capacity(const std::string& path, int64_t ne return "Unknown the cache path " + path; } +void 
FileCacheFactory::get_cache_stats_block(vectorized::Block* block) { + // std::shared_lock read_lock(_qs_ctx_map_lock); + TBackend be = BackendOptions::get_local_backend(); + int64_t be_id = be.id; + std::string be_ip = be.host; + for (auto& cache : _caches) { + std::map stats = cache->get_stats(); + for (auto& [k, v] : stats) { + SchemaScannerHelper::insert_int64_value(0, be_id, block); // be id + SchemaScannerHelper::insert_string_value(1, be_ip, block); // be ip + SchemaScannerHelper::insert_string_value(2, cache->get_base_path(), + block); // cache path + SchemaScannerHelper::insert_string_value(3, k, block); // metric name + SchemaScannerHelper::insert_string_value(4, std::to_string(v), block); // metric value + } + } +} + } // namespace io } // namespace doris diff --git a/be/src/io/cache/block_file_cache_factory.h b/be/src/io/cache/block_file_cache_factory.h index d7b710876ce8e3..b00bd7bdfcb315 100644 --- a/be/src/io/cache/block_file_cache_factory.h +++ b/be/src/io/cache/block_file_cache_factory.h @@ -32,6 +32,10 @@ namespace doris { class TUniqueId; +namespace vectorized { +class Block; +} // namespace vectorized + namespace io { /** @@ -58,6 +62,8 @@ class FileCacheFactory { [[nodiscard]] size_t get_cache_instance_size() const { return _caches.size(); } + std::vector get_cache_file_by_path(const UInt128Wrapper& hash); + BlockFileCache* get_by_path(const UInt128Wrapper& hash); BlockFileCache* get_by_path(const std::string& cache_base_path); std::vector get_query_context_holders( @@ -82,6 +88,8 @@ class FileCacheFactory { */ std::string reset_capacity(const std::string& path, int64_t new_capacity); + void get_cache_stats_block(vectorized::Block* block); + FileCacheFactory() = default; FileCacheFactory& operator=(const FileCacheFactory&) = delete; FileCacheFactory(const FileCacheFactory&) = delete; diff --git a/be/src/io/cache/block_file_cache_profile.cpp b/be/src/io/cache/block_file_cache_profile.cpp index 68e6c1433deaf8..1759d37f9e4314 100644 --- a/be/src/io/cache/block_file_cache_profile.cpp +++ b/be/src/io/cache/block_file_cache_profile.cpp @@ -34,9 +34,9 @@ std::shared_ptr FileCacheProfile::report() { } void FileCacheProfile::update(FileCacheStatistics* stats) { - { - std::lock_guard lock(_mtx); - if (!_profile) { + if (_profile == nullptr) { + std::lock_guard lock(_mtx); + if (_profile == nullptr) { _profile = std::make_shared(); _file_cache_metric = std::make_shared(this); _file_cache_metric->register_entity(); diff --git a/be/src/io/cache/cached_remote_file_reader.cpp b/be/src/io/cache/cached_remote_file_reader.cpp index 0a46c98390e70f..c9a273c5d368a6 100644 --- a/be/src/io/cache/cached_remote_file_reader.cpp +++ b/be/src/io/cache/cached_remote_file_reader.cpp @@ -292,6 +292,8 @@ Status CachedRemoteFileReader::read_at_impl(size_t offset, Slice result, size_t* file_offset); } if (!st || block_state != FileBlock::State::DOWNLOADED) { + LOG(WARNING) << "Read data failed from file cache downloaded by others. 
err=" + << st.msg() << ", block state=" << block_state; size_t bytes_read {0}; stats.hit_cache = false; s3_read_counter << 1; diff --git a/be/src/io/cache/file_block.cpp b/be/src/io/cache/file_block.cpp index b015cbd61110d2..44cad5520ead06 100644 --- a/be/src/io/cache/file_block.cpp +++ b/be/src/io/cache/file_block.cpp @@ -144,7 +144,7 @@ Status FileBlock::append(Slice data) { Status FileBlock::finalize() { if (_downloaded_size != 0 && _downloaded_size != _block_range.size()) { - std::lock_guard cache_lock(_mgr->_mutex); + SCOPED_CACHE_LOCK(_mgr->_mutex); size_t old_size = _block_range.size(); _block_range.right = _block_range.left + _downloaded_size - 1; size_t new_size = _block_range.size(); @@ -179,7 +179,7 @@ Status FileBlock::change_cache_type_between_ttl_and_others(FileCacheType new_typ } Status FileBlock::change_cache_type_between_normal_and_index(FileCacheType new_type) { - std::lock_guard cache_lock(_mgr->_mutex); + SCOPED_CACHE_LOCK(_mgr->_mutex); std::lock_guard block_lock(_mutex); bool expr = (new_type != FileCacheType::TTL && _key.meta.type != FileCacheType::TTL); if (!expr) { @@ -223,7 +223,7 @@ FileBlock::State FileBlock::wait() { if (_download_state == State::DOWNLOADING) { DCHECK(_downloader_id != 0 && _downloader_id != get_caller_id()); - _cv.wait_for(block_lock, std::chrono::seconds(1)); + _cv.wait_for(block_lock, std::chrono::milliseconds(config::block_cache_wait_timeout_ms)); } return _download_state; @@ -272,20 +272,34 @@ std::string FileBlock::state_to_string(FileBlock::State state) { } } +std::string FileBlock::get_cache_file() const { + return _mgr->_storage->get_local_file(this->_key); +} + FileBlocksHolder::~FileBlocksHolder() { for (auto file_block_it = file_blocks.begin(); file_block_it != file_blocks.end();) { auto current_file_block_it = file_block_it; auto& file_block = *current_file_block_it; BlockFileCache* _mgr = file_block->_mgr; { - std::lock_guard cache_lock(_mgr->_mutex); - std::lock_guard block_lock(file_block->_mutex); - file_block->complete_unlocked(block_lock); - if (file_block.use_count() == 2) { - DCHECK(file_block->state_unlock(block_lock) != FileBlock::State::DOWNLOADING); - // one in cache, one in here - if (file_block->state_unlock(block_lock) == FileBlock::State::EMPTY) { - _mgr->remove(file_block, cache_lock, block_lock); + bool should_remove = false; + { + std::lock_guard block_lock(file_block->_mutex); + file_block->complete_unlocked(block_lock); + if (file_block.use_count() == 2 && + file_block->state_unlock(block_lock) == FileBlock::State::EMPTY) { + should_remove = true; + } + } + if (should_remove) { + SCOPED_CACHE_LOCK(_mgr->_mutex); + std::lock_guard block_lock(file_block->_mutex); + if (file_block.use_count() == 2) { + DCHECK(file_block->state_unlock(block_lock) != FileBlock::State::DOWNLOADING); + // one in cache, one in here + if (file_block->state_unlock(block_lock) == FileBlock::State::EMPTY) { + _mgr->remove(file_block, cache_lock, block_lock); + } } } } diff --git a/be/src/io/cache/file_block.h b/be/src/io/cache/file_block.h index 6e49a597b7b95c..3a4490d67a3f9d 100644 --- a/be/src/io/cache/file_block.h +++ b/be/src/io/cache/file_block.h @@ -123,6 +123,8 @@ class FileBlock { uint64_t expiration_time() const { return _key.meta.expiration_time; } + std::string get_cache_file() const; + State state_unlock(std::lock_guard&) const; FileBlock& operator=(const FileBlock&) = delete; diff --git a/be/src/io/cache/file_cache_common.cpp b/be/src/io/cache/file_cache_common.cpp index c569ace0011866..674879300452df 100644 --- 
a/be/src/io/cache/file_cache_common.cpp +++ b/be/src/io/cache/file_cache_common.cpp @@ -34,6 +34,7 @@ std::string FileCacheSettings::to_string() const { << ", disposable_queue_elements: " << disposable_queue_elements << ", index_queue_size: " << index_queue_size << ", index_queue_elements: " << index_queue_elements + << ", ttl_queue_size: " << ttl_queue_size << ", ttl_queue_elements: " << ttl_queue_elements << ", query_queue_size: " << query_queue_size << ", query_queue_elements: " << query_queue_elements << ", storage: " << storage; return ss.str(); @@ -58,6 +59,10 @@ FileCacheSettings get_file_cache_settings(size_t capacity, size_t max_query_cach std::max(settings.index_queue_size / settings.max_file_block_size, REMOTE_FS_OBJECTS_CACHE_DEFAULT_ELEMENTS); + settings.ttl_queue_size = per_size * config::max_ttl_cache_ratio; + settings.ttl_queue_elements = std::max(settings.ttl_queue_size / settings.max_file_block_size, + REMOTE_FS_OBJECTS_CACHE_DEFAULT_ELEMENTS); + settings.query_queue_size = settings.capacity - settings.disposable_queue_size - settings.index_queue_size; settings.query_queue_elements = diff --git a/be/src/io/cache/file_cache_common.h b/be/src/io/cache/file_cache_common.h index 21309831a8284c..30579ba7851b28 100644 --- a/be/src/io/cache/file_cache_common.h +++ b/be/src/io/cache/file_cache_common.h @@ -26,17 +26,17 @@ namespace doris::io { inline static constexpr size_t REMOTE_FS_OBJECTS_CACHE_DEFAULT_ELEMENTS = 100 * 1024; inline static constexpr size_t FILE_CACHE_MAX_FILE_BLOCK_SIZE = 1 * 1024 * 1024; -inline static constexpr size_t DEFAULT_NORMAL_PERCENT = 85; -inline static constexpr size_t DEFAULT_DISPOSABLE_PERCENT = 10; +inline static constexpr size_t DEFAULT_NORMAL_PERCENT = 40; +inline static constexpr size_t DEFAULT_DISPOSABLE_PERCENT = 5; inline static constexpr size_t DEFAULT_INDEX_PERCENT = 5; using uint128_t = vectorized::UInt128; -enum class FileCacheType { - INDEX, - NORMAL, - DISPOSABLE, - TTL, +enum FileCacheType { + INDEX = 2, + NORMAL = 1, + DISPOSABLE = 0, + TTL = 3, }; struct UInt128Wrapper { @@ -93,6 +93,8 @@ struct FileCacheSettings { size_t index_queue_elements {0}; size_t query_queue_size {0}; size_t query_queue_elements {0}; + size_t ttl_queue_size {0}; + size_t ttl_queue_elements {0}; size_t max_file_block_size {0}; size_t max_query_cache_size {0}; std::string storage; diff --git a/be/src/io/cache/file_cache_storage.h b/be/src/io/cache/file_cache_storage.h index 642c4711cf6c62..024e701c6fa08b 100644 --- a/be/src/io/cache/file_cache_storage.h +++ b/be/src/io/cache/file_cache_storage.h @@ -65,6 +65,8 @@ class FileCacheStorage { // force clear all current data in the cache virtual Status clear(std::string& msg) = 0; virtual FileCacheStorageType get_type() = 0; + // get local cached file + virtual std::string get_local_file(const FileCacheKey& key) = 0; }; } // namespace doris::io diff --git a/be/src/io/cache/fs_file_cache_storage.cpp b/be/src/io/cache/fs_file_cache_storage.cpp index ecdf04c88304f0..cf1cd41a537abc 100644 --- a/be/src/io/cache/fs_file_cache_storage.cpp +++ b/be/src/io/cache/fs_file_cache_storage.cpp @@ -160,30 +160,36 @@ Status FSFileCacheStorage::read(const FileCacheKey& key, size_t value_offset, Sl get_path_in_local_cache(get_path_in_local_cache(key.hash, key.meta.expiration_time), key.offset, key.meta.type); Status s = fs->open_file(file, &file_reader); - if (!s.ok()) { - if (!s.is() || key.meta.type != FileCacheType::TTL) { - return s; + + // handle the case that the file is not found but actually exists in other type format + // 
TODO(zhengyu): nasty! better eliminate the type encoding in file name in the future + if (!s.ok() && !s.is()) { + LOG(WARNING) << "open file failed, file=" << file << ", error=" << s.to_string(); + return s; // return other error directly + } else if (!s.ok() && s.is()) { // but handle NOT_FOUND error + auto candidates = get_path_in_local_cache_all_candidates( + get_path_in_local_cache(key.hash, key.meta.expiration_time), key.offset); + for (auto& candidate : candidates) { + s = fs->open_file(candidate, &file_reader); + if (s.ok()) { + break; // success with one of these candidates + } } - std::string file_old_format = get_path_in_local_cache_old_ttl_format( - get_path_in_local_cache(key.hash, key.meta.expiration_time), key.offset, - key.meta.type); - if (config::translate_to_new_ttl_format_during_read) { - // try to rename the file with old ttl format to new and retry - VLOG(7) << "try to rename the file with old ttl format to new and retry" - << " oldformat=" << file_old_format << " original=" << file; - RETURN_IF_ERROR(fs->rename(file_old_format, file)); - RETURN_IF_ERROR(fs->open_file(file, &file_reader)); - } else { - // try to open the file with old ttl format - VLOG(7) << "try to open the file with old ttl format" - << " oldformat=" << file_old_format << " original=" << file; - RETURN_IF_ERROR(fs->open_file(file_old_format, &file_reader)); + if (!s.ok()) { // still not found, return error + LOG(WARNING) << "open file failed, file=" << file << ", error=" << s.to_string(); + return s; } - } + } // else, s.ok() means open file success + FDCache::instance()->insert_file_reader(fd_key, file_reader); } size_t bytes_read = 0; - RETURN_IF_ERROR(file_reader->read_at(value_offset, buffer, &bytes_read)); + auto s = file_reader->read_at(value_offset, buffer, &bytes_read); + if (!s.ok()) { + LOG(WARNING) << "read file failed, file=" << file_reader->path() + << ", error=" << s.to_string(); + return s; + } DCHECK(bytes_read == buffer.get_size()); return Status::OK(); } @@ -270,6 +276,17 @@ std::string FSFileCacheStorage::get_path_in_local_cache_old_ttl_format(const std return Path(dir) / (std::to_string(offset) + BlockFileCache::cache_type_to_string(type)); } +std::vector FSFileCacheStorage::get_path_in_local_cache_all_candidates( + const std::string& dir, size_t offset) { + std::vector candidates; + std::string base = get_path_in_local_cache(dir, offset, FileCacheType::NORMAL); + candidates.push_back(base); + candidates.push_back(base + "_idx"); + candidates.push_back(base + "_ttl"); + candidates.push_back(base + "_disposable"); + return candidates; +} + std::string FSFileCacheStorage::get_path_in_local_cache(const UInt128Wrapper& value, uint64_t expiration_time) const { auto str = value.to_string(); @@ -471,7 +488,8 @@ void FSFileCacheStorage::load_cache_info_into_memory(BlockFileCache* _mgr) const std::vector batch_load_buffer; batch_load_buffer.reserve(scan_length); auto add_cell_batch_func = [&]() { - std::lock_guard cache_lock(_mgr->_mutex); + SCOPED_CACHE_LOCK(_mgr->_mutex); + auto f = [&](const BatchLoadArgs& args) { // in async load mode, a cell may be added twice. 
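// [Editor's note] A hedged illustration of the NOT_FOUND fallback added in
// FSFileCacheStorage::read above; the paths are made up. For a block at offset 42 the
// reader now probes every candidate from get_path_in_local_cache_all_candidates():
//   /cache/<hash>_<expiration>/42              (NORMAL, no suffix)
//   /cache/<hash>_<expiration>/42_idx          (INDEX)
//   /cache/<hash>_<expiration>/42_ttl          (TTL)
//   /cache/<hash>_<expiration>/42_disposable   (DISPOSABLE)
// and keeps the first file_reader that opens; only when every candidate fails is the
// original NOT_FOUND error surfaced to the caller.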
if (_mgr->_files.contains(args.hash) && _mgr->_files[args.hash].contains(args.offset)) { @@ -659,6 +677,11 @@ Status FSFileCacheStorage::clear(std::string& msg) { return Status::OK(); } +std::string FSFileCacheStorage::get_local_file(const FileCacheKey& key) { + return get_path_in_local_cache(get_path_in_local_cache(key.hash, key.meta.expiration_time), + key.offset, key.meta.type, false); +} + FSFileCacheStorage::~FSFileCacheStorage() { if (_cache_background_load_thread.joinable()) { _cache_background_load_thread.join(); diff --git a/be/src/io/cache/fs_file_cache_storage.h b/be/src/io/cache/fs_file_cache_storage.h index 23e98f422ac884..8a97aa109ad741 100644 --- a/be/src/io/cache/fs_file_cache_storage.h +++ b/be/src/io/cache/fs_file_cache_storage.h @@ -70,6 +70,7 @@ class FSFileCacheStorage : public FileCacheStorage { void load_blocks_directly_unlocked(BlockFileCache* _mgr, const FileCacheKey& key, std::lock_guard& cache_lock) override; Status clear(std::string& msg) override; + std::string get_local_file(const FileCacheKey& key) override; [[nodiscard]] static std::string get_path_in_local_cache(const std::string& dir, size_t offset, FileCacheType type, @@ -101,6 +102,9 @@ class FSFileCacheStorage : public FileCacheStorage { void load_cache_info_into_memory(BlockFileCache* _mgr) const; + [[nodiscard]] std::vector get_path_in_local_cache_all_candidates( + const std::string& dir, size_t offset); + std::string _cache_base_path; std::thread _cache_background_load_thread; const std::shared_ptr& fs = global_local_filesystem(); diff --git a/be/src/io/cache/mem_file_cache_storage.cpp b/be/src/io/cache/mem_file_cache_storage.cpp index bffa75ae305b59..7e76dd5f88c565 100644 --- a/be/src/io/cache/mem_file_cache_storage.cpp +++ b/be/src/io/cache/mem_file_cache_storage.cpp @@ -128,4 +128,8 @@ Status MemFileCacheStorage::clear(std::string& msg) { return Status::OK(); } +std::string MemFileCacheStorage::get_local_file(const FileCacheKey& key) { + return ""; +} + } // namespace doris::io diff --git a/be/src/io/cache/mem_file_cache_storage.h b/be/src/io/cache/mem_file_cache_storage.h index 20fdd8ce9f6520..82064c6e9edc78 100644 --- a/be/src/io/cache/mem_file_cache_storage.h +++ b/be/src/io/cache/mem_file_cache_storage.h @@ -44,6 +44,7 @@ class MemFileCacheStorage : public FileCacheStorage { void load_blocks_directly_unlocked(BlockFileCache* _mgr, const FileCacheKey& key, std::lock_guard& cache_lock) override; Status clear(std::string& msg) override; + std::string get_local_file(const FileCacheKey& key) override; FileCacheStorageType get_type() override { return MEMORY; } diff --git a/be/src/io/fs/buffered_reader.cpp b/be/src/io/fs/buffered_reader.cpp index 43445ed42efd3b..7fd85caa43b6c0 100644 --- a/be/src/io/fs/buffered_reader.cpp +++ b/be/src/io/fs/buffered_reader.cpp @@ -23,6 +23,7 @@ #include #include +#include #include "common/compiler_util.h" // IWYU pragma: keep #include "common/config.h" @@ -31,6 +32,7 @@ #include "runtime/thread_context.h" #include "runtime/workload_management/io_throttle.h" #include "util/runtime_profile.h" +#include "util/slice.h" #include "util/threadpool.h" namespace doris { @@ -270,7 +272,7 @@ void MergeRangeFileReader::_read_in_box(RangeCachedData& cached_data, size_t off } if (copy_out != nullptr) { memcpy(copy_out + to_handle - remaining, - _boxes[box_index] + cached_data.box_start_offset[i], box_to_handle); + _boxes[box_index].data() + cached_data.box_start_offset[i], box_to_handle); } remaining -= box_to_handle; cached_data.box_start_offset[i] += box_to_handle; @@ -307,14 
+309,15 @@ void MergeRangeFileReader::_read_in_box(RangeCachedData& cached_data, size_t off Status MergeRangeFileReader::_fill_box(int range_index, size_t start_offset, size_t to_read, size_t* bytes_read, const IOContext* io_ctx) { - if (_read_slice == nullptr) { - _read_slice = new char[READ_SLICE_SIZE]; + if (!_read_slice) { + _read_slice = std::make_unique(READ_SLICE_SIZE); } + *bytes_read = 0; { SCOPED_RAW_TIMER(&_statistics.read_time); - RETURN_IF_ERROR( - _reader->read_at(start_offset, Slice(_read_slice, to_read), bytes_read, io_ctx)); + RETURN_IF_ERROR(_reader->read_at(start_offset, Slice(_read_slice->data(), to_read), + bytes_read, io_ctx)); _statistics.merged_io++; _statistics.merged_bytes += *bytes_read; } @@ -328,8 +331,8 @@ Status MergeRangeFileReader::_fill_box(int range_index, size_t start_offset, siz auto fill_box = [&](int16 fill_box_ref, uint32 box_usage, size_t box_copy_end) { size_t copy_size = std::min(box_copy_end - copy_start, BOX_SIZE - box_usage); - memcpy(_boxes[fill_box_ref] + box_usage, _read_slice + copy_start - start_offset, - copy_size); + memcpy(_boxes[fill_box_ref].data() + box_usage, + _read_slice->data() + copy_start - start_offset, copy_size); filled_boxes.emplace_back(fill_box_ref, box_usage, copy_start, copy_start + copy_size); copy_start += copy_size; _last_box_ref = fill_box_ref; @@ -367,7 +370,7 @@ Status MergeRangeFileReader::_fill_box(int range_index, size_t start_offset, siz } // apply for new box to copy data while (copy_start < range_copy_end && _boxes.size() < NUM_BOX) { - _boxes.emplace_back(new char[BOX_SIZE]); + _boxes.emplace_back(BOX_SIZE); _box_ref.emplace_back(0); fill_box(_boxes.size() - 1, 0, range_copy_end); } @@ -778,8 +781,12 @@ BufferedFileStreamReader::BufferedFileStreamReader(io::FileReaderSPtr file, uint Status BufferedFileStreamReader::read_bytes(const uint8_t** buf, uint64_t offset, const size_t bytes_to_read, const IOContext* io_ctx) { - if (offset < _file_start_offset || offset >= _file_end_offset) { - return Status::IOError("Out-of-bounds Access"); + if (offset < _file_start_offset || offset >= _file_end_offset || + offset + bytes_to_read > _file_end_offset) { + return Status::IOError( + "Out-of-bounds Access: offset={}, bytes_to_read={}, file_start={}, " + "file_end={}", + offset, bytes_to_read, _file_start_offset, _file_end_offset); } int64_t end_offset = offset + bytes_to_read; if (_buf_start_offset <= offset && _buf_end_offset >= end_offset) { @@ -862,5 +869,107 @@ Result DelegateReader::create_file_reader( return reader; }); } + +Status LinearProbeRangeFinder::get_range_for(int64_t desired_offset, + io::PrefetchRange& result_range) { + while (index < _ranges.size()) { + io::PrefetchRange& range = _ranges[index]; + if (range.end_offset > desired_offset) { + if (range.start_offset > desired_offset) [[unlikely]] { + return Status::InvalidArgument("Invalid desiredOffset"); + } + result_range = range; + return Status::OK(); + } + ++index; + } + return Status::InvalidArgument("Invalid desiredOffset"); +} + +RangeCacheFileReader::RangeCacheFileReader(RuntimeProfile* profile, io::FileReaderSPtr inner_reader, + std::shared_ptr range_finder) + : _profile(profile), + _inner_reader(std::move(inner_reader)), + _range_finder(std::move(range_finder)) { + _size = _inner_reader->size(); + uint64_t max_cache_size = + std::max((uint64_t)4096, (uint64_t)_range_finder->get_max_range_size()); + _cache = OwnedSlice(max_cache_size); + + if (_profile != nullptr) { + const char* random_profile = "RangeCacheFileReader"; + 
ADD_TIMER_WITH_LEVEL(_profile, random_profile, 1); + _request_io = + ADD_CHILD_COUNTER_WITH_LEVEL(_profile, "RequestIO", TUnit::UNIT, random_profile, 1); + _request_bytes = ADD_CHILD_COUNTER_WITH_LEVEL(_profile, "RequestBytes", TUnit::BYTES, + random_profile, 1); + _request_time = ADD_CHILD_TIMER_WITH_LEVEL(_profile, "RequestTime", random_profile, 1); + _read_to_cache_time = + ADD_CHILD_TIMER_WITH_LEVEL(_profile, "ReadToCacheTime", random_profile, 1); + _cache_refresh_count = ADD_CHILD_COUNTER_WITH_LEVEL(_profile, "CacheRefreshCount", + TUnit::UNIT, random_profile, 1); + _read_to_cache_bytes = ADD_CHILD_COUNTER_WITH_LEVEL(_profile, "ReadToCacheBytes", + TUnit::BYTES, random_profile, 1); + } +} + +Status RangeCacheFileReader::read_at_impl(size_t offset, Slice result, size_t* bytes_read, + const IOContext* io_ctx) { + auto request_size = result.size; + + _cache_statistics.request_io++; + _cache_statistics.request_bytes += request_size; + SCOPED_RAW_TIMER(&_cache_statistics.request_time); + + PrefetchRange range; + if (_range_finder->get_range_for(offset, range)) [[likely]] { + if (_current_start_offset != range.start_offset) { // need read new range to cache. + auto range_size = range.end_offset - range.start_offset; + + _cache_statistics.cache_refresh_count++; + _cache_statistics.read_to_cache_bytes += range_size; + SCOPED_RAW_TIMER(&_cache_statistics.read_to_cache_time); + + Slice cache_slice = {_cache.data(), range_size}; + RETURN_IF_ERROR( + _inner_reader->read_at(range.start_offset, cache_slice, bytes_read, io_ctx)); + + if (*bytes_read != range_size) [[unlikely]] { + return Status::InternalError( + "RangeCacheFileReader use inner reader read bytes {} not eq expect size {}", + *bytes_read, range_size); + } + + _current_start_offset = range.start_offset; + } + + int64_t buffer_offset = offset - _current_start_offset; + memcpy(result.data, _cache.data() + buffer_offset, request_size); + *bytes_read = request_size; + + return Status::OK(); + } else { + return Status::InternalError("RangeCacheFileReader read not in Ranges. Offset = {}", + offset); + // RETURN_IF_ERROR(_inner_reader->read_at(offset, result , bytes_read, io_ctx)); + // return Status::OK(); + // we think returning an error here is OK, otherwise it would cover up the real error. 
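// [Editor's note] A hedged usage sketch for RangeCacheFileReader; the offsets and the
// profile/inner_reader variables are illustrative, not part of the patch:
//   std::vector<PrefetchRange> stripes = {{0, 4096}, {4100, 8000}, {1 << 20, (1 << 20) + 512}};
//   auto merged = PrefetchRange::merge_adjacent_seq_ranges(stripes,
//                                                          /*max_merge_distance_bytes=*/64 * 1024,
//                                                          /*once_max_read_bytes=*/8 << 20);
//   // merged == {{0, 8000}, {1 MiB, 1 MiB + 512}}: the first two stripes are close enough to fuse
//   auto finder = std::make_shared<LinearProbeRangeFinder>(std::move(merged));
//   auto reader = std::make_shared<RangeCacheFileReader>(profile, inner_reader, finder);
// Reads at offsets 0, 4096 and 7000 then cost one inner read of 8000 bytes
// (CacheRefreshCount = 1) and are served from _cache; offsets must arrive in ascending
// order because LinearProbeRangeFinder only probes forward.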
+ } +} + +void RangeCacheFileReader::_collect_profile_before_close() { + if (_profile != nullptr) { + COUNTER_UPDATE(_request_io, _cache_statistics.request_io); + COUNTER_UPDATE(_request_bytes, _cache_statistics.request_bytes); + COUNTER_UPDATE(_request_time, _cache_statistics.request_time); + COUNTER_UPDATE(_read_to_cache_time, _cache_statistics.read_to_cache_time); + COUNTER_UPDATE(_cache_refresh_count, _cache_statistics.cache_refresh_count); + COUNTER_UPDATE(_read_to_cache_bytes, _cache_statistics.read_to_cache_bytes); + if (_inner_reader != nullptr) { + _inner_reader->collect_profile_before_close(); + } + } +} + } // namespace io } // namespace doris diff --git a/be/src/io/fs/buffered_reader.h b/be/src/io/fs/buffered_reader.h index 70c8445db233e6..67e07665fbfd9f 100644 --- a/be/src/io/fs/buffered_reader.h +++ b/be/src/io/fs/buffered_reader.h @@ -53,6 +53,147 @@ struct PrefetchRange { : start_offset(start_offset), end_offset(end_offset) {} PrefetchRange() : start_offset(0), end_offset(0) {} + + bool operator==(const PrefetchRange& other) const { + return (start_offset == other.start_offset) && (end_offset == other.end_offset); + } + + bool operator!=(const PrefetchRange& other) const { return !(*this == other); } + + PrefetchRange span(const PrefetchRange& other) const { + return {std::min(start_offset, other.end_offset), std::max(start_offset, other.end_offset)}; + } + PrefetchRange seq_span(const PrefetchRange& other) const { + return {start_offset, other.end_offset}; + } + + // Ranges need to be sorted. + static std::vector merge_adjacent_seq_ranges( + const std::vector& seq_ranges, int64_t max_merge_distance_bytes, + int64_t once_max_read_bytes) { + if (seq_ranges.empty()) { + return {}; + } + // Merge overlapping ranges + std::vector result; + PrefetchRange last = seq_ranges.front(); + for (size_t i = 1; i < seq_ranges.size(); ++i) { + PrefetchRange current = seq_ranges[i]; + PrefetchRange merged = last.seq_span(current); + if (merged.end_offset <= once_max_read_bytes + merged.start_offset && + last.end_offset + max_merge_distance_bytes >= current.start_offset) { + last = merged; + } else { + result.push_back(last); + last = current; + } + } + result.push_back(last); + return result; + } +}; + +class RangeFinder { +public: + virtual ~RangeFinder() = default; + virtual Status get_range_for(int64_t desired_offset, io::PrefetchRange& result_range) = 0; + virtual size_t get_max_range_size() const = 0; +}; + +class LinearProbeRangeFinder : public RangeFinder { +public: + LinearProbeRangeFinder(std::vector&& ranges) : _ranges(std::move(ranges)) {} + + Status get_range_for(int64_t desired_offset, io::PrefetchRange& result_range) override; + + size_t get_max_range_size() const override { + size_t max_range_size = 0; + for (const auto& range : _ranges) { + max_range_size = std::max(max_range_size, range.end_offset - range.start_offset); + } + return max_range_size; + } + + ~LinearProbeRangeFinder() override = default; + +private: + std::vector _ranges; + size_t index {0}; +};  + +/** + * The reader provides a solution to read one range at a time. You can customize RangeFinder to meet your scenario.
+ * For example, ORC files often contain tiny stripes; to reduce the number of requests sent to hdfs, + * the accesses to the orc file are merged before reading. This introduces some read amplification, + * but reading more data from hdfs in a single request is still faster than reading from hdfs multiple times. + * Since orc files are in practice read in order from front to back, LinearProbeRangeFinder is provided for that access pattern. + */ +class RangeCacheFileReader : public io::FileReader { + struct RangeCacheReaderStatistics { + int64_t request_io = 0; + int64_t request_bytes = 0; + int64_t request_time = 0; + int64_t read_to_cache_time = 0; + int64_t cache_refresh_count = 0; + int64_t read_to_cache_bytes = 0; + }; + +public: + RangeCacheFileReader(RuntimeProfile* profile, io::FileReaderSPtr inner_reader, + std::shared_ptr range_finder); + + ~RangeCacheFileReader() override = default; + + Status close() override { + if (!_closed) { + _closed = true; + } + return Status::OK(); + } + + const io::Path& path() const override { return _inner_reader->path(); } + + size_t size() const override { return _size; } + + bool closed() const override { return _closed; } + +protected: + Status read_at_impl(size_t offset, Slice result, size_t* bytes_read, + const IOContext* io_ctx) override; + + void _collect_profile_before_close() override; + +private: + RuntimeProfile* _profile = nullptr; + io::FileReaderSPtr _inner_reader; + std::shared_ptr _range_finder; + + OwnedSlice _cache; + int64_t _current_start_offset = -1; + + size_t _size; + bool _closed = false; + + RuntimeProfile::Counter* _request_io = nullptr; + RuntimeProfile::Counter* _request_bytes = nullptr; + RuntimeProfile::Counter* _request_time = nullptr; + RuntimeProfile::Counter* _read_to_cache_time = nullptr; + RuntimeProfile::Counter* _cache_refresh_count = nullptr; + RuntimeProfile::Counter* _read_to_cache_bytes = nullptr; + RangeCacheReaderStatistics _cache_statistics; + /** + * `RangeCacheFileReader`: + * 1. `CacheRefreshCount`: how many IOs are merged + * 2. `ReadToCacheBytes`: how much data is actually read after merging + * 3. `ReadToCacheTime`: how long it takes to read data after merging + * 4. `RequestBytes`: how many bytes does the apache-orc library actually need to read the orc file + * 5. `RequestIO`: how many times the apache-orc library calls this read interface + * 6. `RequestTime`: how long it takes the apache-orc library to call this read interface + * + * It should be noted that `RangeCacheFileReader` is a wrapper of the reader that actually reads data, such as + * the hdfs reader, so strictly speaking, `CacheRefreshCount` is not equal to how many IOs are initiated to hdfs, + * because each time the hdfs reader is requested, the hdfs reader may not be able to read all the data at once. 
+ */ }; /** @@ -168,12 +309,7 @@ class MergeRangeFileReader : public io::FileReader { } } - ~MergeRangeFileReader() override { - delete[] _read_slice; - for (char* box : _boxes) { - delete[] box; - } - } + ~MergeRangeFileReader() override = default; Status close() override { if (!_closed) { @@ -244,8 +380,8 @@ class MergeRangeFileReader : public io::FileReader { bool _closed = false; size_t _remaining; - char* _read_slice = nullptr; - std::vector _boxes; + std::unique_ptr _read_slice; + std::vector _boxes; int16 _last_box_ref = -1; uint32 _last_box_usage = 0; std::vector _box_ref; diff --git a/be/src/io/fs/local_file_reader.cpp b/be/src/io/fs/local_file_reader.cpp index b4f144a633048e..4a41fa479d9808 100644 --- a/be/src/io/fs/local_file_reader.cpp +++ b/be/src/io/fs/local_file_reader.cpp @@ -34,11 +34,13 @@ #include "common/compiler_util.h" // IWYU pragma: keep #include "cpp/sync_point.h" #include "io/fs/err_utils.h" +#include "olap/data_dir.h" #include "olap/olap_common.h" #include "olap/options.h" #include "runtime/thread_context.h" #include "runtime/workload_management/io_throttle.h" #include "util/async_io.h" +#include "util/debug_points.h" #include "util/doris_metrics.h" namespace doris { @@ -139,6 +141,15 @@ Status LocalFileReader::read_at_impl(size_t offset, Slice result, size_t* bytes_ while (bytes_req != 0) { auto res = SYNC_POINT_HOOK_RETURN_VALUE(::pread(_fd, to, bytes_req, offset), "LocalFileReader::pread", _fd, to); + DBUG_EXECUTE_IF("LocalFileReader::read_at_impl.io_error", { + auto sub_path = dp->param("sub_path", ""); + if ((sub_path.empty() && _path.filename().compare(kTestFilePath)) || + (!sub_path.empty() && _path.native().find(sub_path) != std::string::npos)) { + res = -1; + errno = EIO; + LOG(WARNING) << Status::IOError("debug read io error: {}", _path.native()); + } + }); if (UNLIKELY(-1 == res && errno != EINTR)) { return localfs_error(errno, fmt::format("failed to read {}", _path.native())); } diff --git a/be/src/io/fs/local_file_system.cpp b/be/src/io/fs/local_file_system.cpp index 4b44027abbbf2d..0107ed57dc8fb1 100644 --- a/be/src/io/fs/local_file_system.cpp +++ b/be/src/io/fs/local_file_system.cpp @@ -62,9 +62,13 @@ Status LocalFileSystem::create_file_impl(const Path& file, FileWriterPtr* writer int fd = ::open(file.c_str(), O_TRUNC | O_WRONLY | O_CREAT | O_CLOEXEC, 0666); DBUG_EXECUTE_IF("LocalFileSystem.create_file_impl.open_file_failed", { // spare '.testfile' to make bad disk checker happy - if (file.filename().compare(kTestFilePath)) { + auto sub_path = dp->param("sub_path", ""); + if ((sub_path.empty() && file.filename().compare(kTestFilePath)) || + (!sub_path.empty() && file.native().find(sub_path) != std::string::npos)) { ::close(fd); fd = -1; + errno = EIO; + LOG(WARNING) << Status::IOError("debug open io error: {}", file.native()); } }); if (-1 == fd) { @@ -85,6 +89,17 @@ Status LocalFileSystem::open_file_impl(const Path& file, FileReaderSPtr* reader, } int fd = -1; RETRY_ON_EINTR(fd, open(file.c_str(), O_RDONLY)); + DBUG_EXECUTE_IF("LocalFileSystem.create_file_impl.open_file_failed", { + // spare '.testfile' to make bad disk checker happy + auto sub_path = dp->param("sub_path", ""); + if ((sub_path.empty() && file.filename().compare(kTestFilePath)) || + (!sub_path.empty() && file.native().find(sub_path) != std::string::npos)) { + ::close(fd); + fd = -1; + errno = EIO; + LOG(WARNING) << Status::IOError("debug open io error: {}", file.native()); + } + }); if (fd < 0) { return localfs_error(errno, fmt::format("failed to open {}", file.native())); } diff 
--git a/be/src/io/fs/local_file_writer.cpp b/be/src/io/fs/local_file_writer.cpp index 7301ceae588a0b..c65dee2535e79d 100644 --- a/be/src/io/fs/local_file_writer.cpp +++ b/be/src/io/fs/local_file_writer.cpp @@ -147,6 +147,15 @@ Status LocalFileWriter::appendv(const Slice* data, size_t data_cnt) { RETRY_ON_EINTR(res, SYNC_POINT_HOOK_RETURN_VALUE( ::writev(_fd, iov.data() + completed_iov, iov_count), "LocalFileWriter::writev", _fd)); + DBUG_EXECUTE_IF("LocalFileWriter::appendv.io_error", { + auto sub_path = dp->param("sub_path", ""); + if ((sub_path.empty() && _path.filename().compare(kTestFilePath)) || + (!sub_path.empty() && _path.native().find(sub_path) != std::string::npos)) { + res = -1; + errno = EIO; + LOG(WARNING) << Status::IOError("debug write io error: {}", _path.native()); + } + }); if (UNLIKELY(res < 0)) { return localfs_error(errno, fmt::format("failed to write {}", _path.native())); } diff --git a/be/src/io/fs/s3_file_system.cpp b/be/src/io/fs/s3_file_system.cpp index 3a5fffb2549938..d841c79ed66069 100644 --- a/be/src/io/fs/s3_file_system.cpp +++ b/be/src/io/fs/s3_file_system.cpp @@ -86,7 +86,7 @@ Status ObjClientHolder::reset(const S3ClientConf& conf) { S3ClientConf reset_conf; { std::shared_lock lock(_mtx); - if (conf.ak == _conf.ak && conf.sk == _conf.sk && conf.token == _conf.token) { + if (conf.get_hash() == _conf.get_hash()) { return Status::OK(); // Same conf } @@ -95,6 +95,10 @@ Status ObjClientHolder::reset(const S3ClientConf& conf) { reset_conf.sk = conf.sk; reset_conf.token = conf.token; reset_conf.bucket = conf.bucket; + reset_conf.connect_timeout_ms = conf.connect_timeout_ms; + reset_conf.max_connections = conf.max_connections; + reset_conf.request_timeout_ms = conf.request_timeout_ms; + reset_conf.use_virtual_addressing = conf.use_virtual_addressing; // Should check endpoint here? } diff --git a/be/src/io/fs/s3_file_writer.cpp b/be/src/io/fs/s3_file_writer.cpp index 24b72a4b6c902c..e40b9e171eb08f 100644 --- a/be/src/io/fs/s3_file_writer.cpp +++ b/be/src/io/fs/s3_file_writer.cpp @@ -204,12 +204,12 @@ Status S3FileWriter::_build_upload_buffer() { Status S3FileWriter::_close_impl() { VLOG_DEBUG << "S3FileWriter::close, path: " << _obj_storage_path_opts.path.native(); - if (_cur_part_num == 1 && _pending_buf) { + if (_cur_part_num == 1 && _pending_buf) { // data size is less than config::s3_write_buffer_size RETURN_IF_ERROR(_set_upload_to_remote_less_than_buffer_size()); } if (_bytes_appended == 0) { - DCHECK(_cur_part_num == 1); + DCHECK_EQ(_cur_part_num, 1); // No data written, but need to create an empty file RETURN_IF_ERROR(_build_upload_buffer()); if (!_used_by_s3_committer) { @@ -220,10 +220,15 @@ Status S3FileWriter::_close_impl() { } } - if (_pending_buf != nullptr) { + if (_pending_buf != nullptr) { // there is remaining data in buffer need to be uploaded _countdown_event.add_count(); RETURN_IF_ERROR(FileBuffer::submit(std::move(_pending_buf))); _pending_buf = nullptr; + } else if (_bytes_appended != 0) { // Non-empty file and has nothing to be uploaded + // NOTE: When the data size is a multiple of config::s3_write_buffer_size, + // _cur_part_num may exceed the actual number of parts that need to be uploaded. + // This is because it is incremented by 1 in advance within the S3FileWriter::appendv method. 
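// [Editor's note] Worked example for the decrement below, assuming an illustrative
// config::s3_write_buffer_size of 5 MiB: appending exactly 10 MiB fills two buffers;
// each time a buffer fills, appendv() submits it and pre-increments _cur_part_num,
// leaving it at 3 although only parts 1 and 2 exist. At close time _pending_buf is
// null while _bytes_appended != 0, so _cur_part_num is corrected from 3 to 2 and
// _complete() can safely require _completed_parts.size() == _cur_part_num.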
+ _cur_part_num--; } RETURN_IF_ERROR(_complete()); @@ -327,26 +332,29 @@ Status S3FileWriter::_complete() { _wait_until_finish("Complete"); TEST_SYNC_POINT_CALLBACK("S3FileWriter::_complete:1", std::make_pair(&_failed, &_completed_parts)); - if (!_used_by_s3_committer) { // S3 committer will complete multipart upload file on FE side. - if (_failed || _completed_parts.size() != _cur_part_num) { - _st = Status::InternalError( - "error status {}, have failed {}, complete parts {}, cur part num {}, whole " - "parts {}, file path {}, file size {}, has left buffer {}", - _st, _failed, _completed_parts.size(), _cur_part_num, _dump_completed_part(), - _obj_storage_path_opts.path.native(), _bytes_appended, _pending_buf != nullptr); - LOG(WARNING) << _st; - return _st; - } - // make sure _completed_parts are ascending order - std::sort(_completed_parts.begin(), _completed_parts.end(), - [](auto& p1, auto& p2) { return p1.part_num < p2.part_num; }); - TEST_SYNC_POINT_CALLBACK("S3FileWriter::_complete:2", &_completed_parts); - auto resp = client->complete_multipart_upload(_obj_storage_path_opts, _completed_parts); - if (resp.status.code != ErrorCode::OK) { - LOG_WARNING("Compltet multi part upload failed because {}, file path {}", - resp.status.msg, _obj_storage_path_opts.path.native()); - return {resp.status.code, std::move(resp.status.msg)}; - } + if (_used_by_s3_committer) { // S3 committer will complete multipart upload file on FE side. + s3_file_created_total << 1; // Assume that it will be created successfully + return Status::OK(); + } + + if (_failed || _completed_parts.size() != _cur_part_num) { + _st = Status::InternalError( + "error status={} failed={} #complete_parts={} #expected_parts={} " + "completed_parts_list={} file_path={} file_size={} has left buffer not uploaded={}", + _st, _failed, _completed_parts.size(), _cur_part_num, _dump_completed_part(), + _obj_storage_path_opts.path.native(), _bytes_appended, _pending_buf != nullptr); + LOG(WARNING) << _st; + return _st; + } + // make sure _completed_parts are ascending order + std::sort(_completed_parts.begin(), _completed_parts.end(), + [](auto& p1, auto& p2) { return p1.part_num < p2.part_num; }); + TEST_SYNC_POINT_CALLBACK("S3FileWriter::_complete:2", &_completed_parts); + auto resp = client->complete_multipart_upload(_obj_storage_path_opts, _completed_parts); + if (resp.status.code != ErrorCode::OK) { + LOG_WARNING("Complete multipart upload failed because {}, file path {}", resp.status.msg, + _obj_storage_path_opts.path.native()); + return {resp.status.code, std::move(resp.status.msg)}; } s3_file_created_total << 1; return Status::OK(); diff --git a/be/src/io/hdfs_builder.cpp b/be/src/io/hdfs_builder.cpp index 945ef3ab02bd13..59ca46e86944df 100644 --- a/be/src/io/hdfs_builder.cpp +++ b/be/src/io/hdfs_builder.cpp @@ -20,21 +20,83 @@ #include #include +#include #include -#include #include #include -#include "agent/utils.h" #include "common/config.h" #include "common/logging.h" +#ifdef USE_HADOOP_HDFS +#include "hadoop_hdfs/hdfs.h" +#endif #include "io/fs/hdfs.h" #include "util/string_util.h" -#include "util/uid_util.h" namespace doris { +#ifdef USE_HADOOP_HDFS +void err_log_message(const char* fmt, ...) 
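// [Editor's note] The two logging helpers below both rely on the classic two-pass
// vsnprintf idiom; this condensed sketch is editorial, not part of the patch:
//   int size = vsnprintf(nullptr, 0, fmt, args) + 1;  // pass 1: measure only, +1 for '\0'
//   std::vector<char> buffer(size);
//   vsnprintf(buffer.data(), size, fmt, args_copy);   // pass 2: format into the sized buffer
// The va_list must be re-initialized between the two passes (va_end/va_start in
// err_log_message, va_copy in va_err_log_message) because the first vsnprintf consumes it.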
{ + va_list args; + va_start(args, fmt); + + // First, call vsnprintf to get the required buffer size + int size = vsnprintf(nullptr, 0, fmt, args) + 1; // +1 for '\0' + if (size <= 0) { + LOG(ERROR) << "Error formatting log message, invalid size"; + va_end(args); + return; + } + + va_end(args); + va_start(args, fmt); // Reinitialize va_list + + // Allocate a buffer and format the string into it + std::vector buffer(size); + vsnprintf(buffer.data(), size, fmt, args); + + va_end(args); + + // Use glog to log the message + LOG(ERROR) << buffer.data(); +} + +void va_err_log_message(const char* fmt, va_list ap) { + va_list args_copy; + va_copy(args_copy, ap); + + // Call vsnprintf to get the required buffer size + int size = vsnprintf(nullptr, 0, fmt, args_copy) + 1; // +1 for '\0' + va_end(args_copy); // Release the copied va_list + + if (size <= 0) { + LOG(ERROR) << "Error formatting log message, invalid size"; + return; + } + + // Reinitialize va_list for the second vsnprintf call + va_copy(args_copy, ap); + + // Allocate a buffer and format the string into it + std::vector buffer(size); + vsnprintf(buffer.data(), size, fmt, args_copy); + + va_end(args_copy); + + // Use glog to log the message + LOG(ERROR) << buffer.data(); +} + +struct hdfsLogger logger = {.errLogMessage = err_log_message, + .vaErrLogMessage = va_err_log_message}; +#endif // #ifdef USE_HADOOP_HDFS + Status HDFSCommonBuilder::init_hdfs_builder() { +#ifdef USE_HADOOP_HDFS + static std::once_flag flag; + std::call_once(flag, []() { hdfsSetLogger(&logger); }); +#endif // #ifdef USE_HADOOP_HDFS + hdfs_builder = hdfsNewBuilder(); if (hdfs_builder == nullptr) { LOG(INFO) << "failed to init HDFSCommonBuilder, please check check be/conf/hdfs-site.xml"; diff --git a/be/src/io/hdfs_util.cpp b/be/src/io/hdfs_util.cpp index 6c1bbf80a1526f..92d8933d8b5c92 100644 --- a/be/src/io/hdfs_util.cpp +++ b/be/src/io/hdfs_util.cpp @@ -17,10 +17,13 @@ #include "io/hdfs_util.h" +#include +#include #include #include #include +#include #include "common/logging.h" #include "io/fs/err_utils.h" @@ -30,7 +33,7 @@ namespace doris::io { namespace { -Status create_hdfs_fs(const THdfsParams& hdfs_params, const std::string& fs_name, hdfsFS* fs) { +Status _create_hdfs_fs(const THdfsParams& hdfs_params, const std::string& fs_name, hdfsFS* fs) { HDFSCommonBuilder builder; RETURN_IF_ERROR(create_hdfs_builder(hdfs_params, fs_name, &builder)); hdfsFS hdfs_fs = hdfsBuilderConnect(builder.get()); @@ -41,6 +44,39 @@ Status create_hdfs_fs(const THdfsParams& hdfs_params, const std::string& fs_name return Status::OK(); } +// https://brpc.apache.org/docs/server/basics/ +// According to the brpc doc, JNI code checks stack layout and cannot be run in +// bthreads so create a pthread for creating hdfs connection if necessary. +Status create_hdfs_fs(const THdfsParams& hdfs_params, const std::string& fs_name, hdfsFS* fs) { + bool is_pthread = bthread_self() == 0; + LOG(INFO) << "create hdfs fs, is_pthread=" << is_pthread << " fs_name=" << fs_name; + if (is_pthread) { // running in pthread + return _create_hdfs_fs(hdfs_params, fs_name, fs); + } + + // running in bthread, switch to a pthread and wait + Status st; + auto btx = bthread::butex_create(); + *(int*)btx = 0; + std::thread t([&] { + st = _create_hdfs_fs(hdfs_params, fs_name, fs); + *(int*)btx = 1; + bthread::butex_wake_all(btx); + }); + std::unique_ptr> defer((int*)0x01, [&t, &btx](...) 
{ + if (t.joinable()) t.join(); + bthread::butex_destroy(btx); + }); + timespec tmout {.tv_sec = std::chrono::system_clock::now().time_since_epoch().count() + 60, + .tv_nsec = 0}; + if (int ret = bthread::butex_wait(btx, 1, &tmout); ret != 0) { + std::string msg = "failed to wait _create_hdfs_fs fs_name=" + fs_name; + LOG(WARNING) << msg << " error=" << std::strerror(errno); + st = Status::Error(msg); + } + return st; +} + uint64_t hdfs_hash_code(const THdfsParams& hdfs_params, const std::string& fs_name) { uint64_t hash_code = 0; // The specified fsname is used first. diff --git a/be/src/olap/base_compaction.cpp b/be/src/olap/base_compaction.cpp index 8be29383c1e9b1..8b9cbd75ed33b8 100644 --- a/be/src/olap/base_compaction.cpp +++ b/be/src/olap/base_compaction.cpp @@ -80,7 +80,7 @@ Status BaseCompaction::execute_compact() { tablet()->set_last_base_compaction_success_time(UnixMillis()); DorisMetrics::instance()->base_compaction_deltas_total->increment(_input_rowsets.size()); - DorisMetrics::instance()->base_compaction_bytes_total->increment(_input_rowsets_size); + DorisMetrics::instance()->base_compaction_bytes_total->increment(_input_rowsets_total_size); return Status::OK(); } diff --git a/be/src/olap/base_tablet.cpp b/be/src/olap/base_tablet.cpp index 1fd3b785b9072f..e5ec38738155e5 100644 --- a/be/src/olap/base_tablet.cpp +++ b/be/src/olap/base_tablet.cpp @@ -20,6 +20,7 @@ #include #include +#include "common/logging.h" #include "common/status.h" #include "olap/calc_delete_bitmap_executor.h" #include "olap/delete_bitmap_calculator.h" @@ -79,7 +80,8 @@ Status _get_segment_column_iterator(const BetaRowsetSharedPtr& rowset, uint32_t .use_page_cache = !config::disable_storage_page_cache, .file_reader = segment->file_reader().get(), .stats = stats, - .io_ctx = io::IOContext {.reader_type = ReaderType::READER_QUERY}, + .io_ctx = io::IOContext {.reader_type = ReaderType::READER_QUERY, + .file_cache_stats = &stats->file_cache_stats}, }; RETURN_IF_ERROR((*column_iterator)->init(opt)); return Status::OK(); @@ -441,7 +443,8 @@ Status BaseTablet::lookup_row_key(const Slice& encoded_key, TabletSchema* latest const std::vector& specified_rowsets, RowLocation* row_location, uint32_t version, std::vector>& segment_caches, - RowsetSharedPtr* rowset, bool with_rowid) { + RowsetSharedPtr* rowset, bool with_rowid, + std::string* encoded_seq_value, OlapReaderStatistics* stats) { SCOPED_BVAR_LATENCY(g_tablet_lookup_rowkey_latency); size_t seq_col_length = 0; // use the latest tablet schema to decide if the tablet has sequence column currently @@ -465,13 +468,9 @@ Status BaseTablet::lookup_row_key(const Slice& encoded_key, TabletSchema* latest DCHECK_EQ(segments_key_bounds.size(), num_segments); std::vector picked_segments; for (int i = num_segments - 1; i >= 0; i--) { - // If mow table has cluster keys, the key bounds is short keys, not primary keys - // use PrimaryKeyIndexMetaPB in primary key index? 
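// [Editor's note] Illustration of the key-bounds pruning that this change applies
// unconditionally, cluster-key tables included; the keys are made up. With probe key
// "k05" (sequence suffix already stripped) and segments whose key bounds are
//   seg 0: ["a00", "c99"]  -> skipped, "k05" > max_key
//   seg 1: ["j00", "m99"]  -> probed, bounds contain "k05"
//   seg 2: ["p00", "z99"]  -> skipped, "k05" < min_key
// only seg 1 is probed, and segments are visited newest first, exactly as the loop
// below does once the cluster-key special case is removed.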
- if (schema->cluster_key_idxes().empty()) { - if (key_without_seq.compare(segments_key_bounds[i].max_key()) > 0 || - key_without_seq.compare(segments_key_bounds[i].min_key()) < 0) { - continue; - } + if (key_without_seq.compare(segments_key_bounds[i].max_key()) > 0 || + key_without_seq.compare(segments_key_bounds[i].min_key()) < 0) { + continue; } picked_segments.emplace_back(i); } @@ -489,7 +488,7 @@ Status BaseTablet::lookup_row_key(const Slice& encoded_key, TabletSchema* latest for (auto id : picked_segments) { Status s = segments[id]->lookup_row_key(encoded_key, schema, with_seq_col, with_rowid, - &loc); + &loc, encoded_seq_value, stats); if (s.is()) { continue; } @@ -562,16 +561,27 @@ Status BaseTablet::calc_segment_delete_bitmap(RowsetSharedPtr rowset, auto rowset_id = rowset->rowset_id(); Version dummy_version(end_version + 1, end_version + 1); auto rowset_schema = rowset->tablet_schema(); + + PartialUpdateInfo* partial_update_info {nullptr}; bool is_partial_update = rowset_writer && rowset_writer->is_partial_update(); + // `have_input_seq_column` is for fixed partial update only. For flexible partial update, we should use + // the skip bitmap to determine whether a row has specified the sequence column bool have_input_seq_column = false; - if (is_partial_update && rowset_schema->has_sequence_col()) { - std::vector including_cids = - rowset_writer->get_partial_update_info()->update_cids; - have_input_seq_column = - rowset_schema->has_sequence_col() && - (std::find(including_cids.cbegin(), including_cids.cend(), - rowset_schema->sequence_col_idx()) != including_cids.cend()); + // `rids_be_overwritten` is for flexible partial update only, it records row ids that are overwritten by + // another row with a higher sequence value + std::set rids_be_overwritten; + if (is_partial_update) { + partial_update_info = rowset_writer->get_partial_update_info().get(); + if (partial_update_info->is_fixed_partial_update() && rowset_schema->has_sequence_col()) { + std::vector including_cids = + rowset_writer->get_partial_update_info()->update_cids; + have_input_seq_column = + rowset_schema->has_sequence_col() && + (std::find(including_cids.cbegin(), including_cids.cend(), + rowset_schema->sequence_col_idx()) != including_cids.cend()); + } } + if (rowset_schema->num_variant_columns() > 0) { // During partial updates, the extracted columns of a variant should not be included in the rowset schema. // This is because the partial update for a variant needs to ignore the extracted columns. @@ -580,8 +590,8 @@ Status BaseTablet::calc_segment_delete_bitmap(RowsetSharedPtr rowset, rowset_schema = rowset_schema->copy_without_variant_extracted_columns(); } // use for partial update - PartialUpdateReadPlan read_plan_ori; - PartialUpdateReadPlan read_plan_update; + FixedReadPlan read_plan_ori; + FixedReadPlan read_plan_update; int64_t conflict_rows = 0; int64_t new_generated_rows = 0; @@ -674,7 +684,10 @@ Status BaseTablet::calc_segment_delete_bitmap(RowsetSharedPtr rowset, continue; } - if (st.is() && (!is_partial_update || have_input_seq_column)) { + ++conflict_rows; + if (st.is() && + (!is_partial_update || + (partial_update_info->is_fixed_partial_update() && have_input_seq_column))) { // `st.is()` means that there exists a row with the same key and larger value // in seqeunce column. // - If the current load is not a partial update, we just delete current row. // - Otherwise, we should compare the values // of the including columns in the current row into a new row. 
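// [Editor's note] A walk-through of this branch with made-up values: key K is already
// stored with seq=10 and the incoming row carries seq=5.
//   - full row load                                  -> row is marked deleted here at once
//   - fixed partial update that includes the seq col -> same, marked deleted here at once
//   - fixed partial update without the seq col,
//     or flexible partial update                     -> falls through to the read-plan path;
//       flexible mode defers the final decision to the skip-bitmap check in
//       generate_new_block_for_flexible_partial_update()
// mirroring (!is_partial_update || (is_fixed_partial_update() && have_input_seq_column)).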
delete_bitmap->add({rowset_id, seg->id(), DeleteBitmap::TEMP_VERSION_COMMON}, row_id); - ++conflict_rows; continue; } if (is_partial_update && rowset_writer != nullptr) { @@ -700,10 +712,26 @@ // So here we should read version 5's columns and build a new row, which is // consists of version 6's update columns and version 5's origin columns // here we build 2 read plan for ori values and update values + + // - for fixed partial update, we should read update columns from current load's rowset + // and read missing columns from previous rowsets to create the final block + // - for flexible partial update, we should read all columns from current load's rowset + // and read non sort key columns from previous rowsets to create the final block + // So we only need to record rows to read for both partial update modes read_plan_ori.prepare_to_read(loc, pos); read_plan_update.prepare_to_read(RowLocation {rowset_id, seg->id(), row_id}, pos); + + // For flexible partial update, we should use skip bitmap to determine whether + // a row has specified the sequence column. But skip bitmap should be read from the segment. + // So we record these row ids and process and filter them in `generate_new_block_for_flexible_partial_update()` + if (st.is() && + partial_update_info->is_flexible_partial_update()) { + rids_be_overwritten.insert(pos); + } + rsid_to_rowset[rowset_find->rowset_id()] = rowset_find; ++pos; + // delete bitmap will be calculate when memtable flush and // publish. The two stages may see different versions. // When there is sequence column, the currently imported data @@ -717,14 +745,12 @@ Status BaseTablet::calc_segment_delete_bitmap(RowsetSharedPtr rowset, loc.row_id); delete_bitmap->add({rowset_id, seg->id(), DeleteBitmap::TEMP_VERSION_COMMON}, row_id); - ++conflict_rows; ++new_generated_rows; continue; } // when st = ok delete_bitmap->add({loc.rowset_id, loc.segment_id, DeleteBitmap::TEMP_VERSION_COMMON}, loc.row_id); - ++conflict_rows; } remaining -= num_read; } @@ -742,22 +768,31 @@ } if (pos > 0) { - auto partial_update_info = rowset_writer->get_partial_update_info(); DCHECK(partial_update_info); - RETURN_IF_ERROR(generate_new_block_for_partial_update( - rowset_schema, partial_update_info.get(), read_plan_ori, read_plan_update, - rsid_to_rowset, &block)); + if (partial_update_info->is_fixed_partial_update()) { + RETURN_IF_ERROR(generate_new_block_for_partial_update( + rowset_schema, partial_update_info, read_plan_ori, read_plan_update, + rsid_to_rowset, &block)); + } else { + RETURN_IF_ERROR(generate_new_block_for_flexible_partial_update( + rowset_schema, partial_update_info, rids_be_overwritten, read_plan_ori, + read_plan_update, rsid_to_rowset, &block)); + } RETURN_IF_ERROR(sort_block(block, ordered_block)); RETURN_IF_ERROR(rowset_writer->flush_single_block(&ordered_block)); if (new_generated_rows != rowset_writer->num_rows()) { - LOG(WARNING) << "partial update correctness warning: conflict new generated rows (" - << new_generated_rows << ") not equal to the new flushed rows (" - << rowset_writer->num_rows() << "), tablet: " << tablet_id(); + LOG_WARNING( + "{} correctness warning: new_generated_rows != flushed_rows, " + "new_generated_rows={}, flushed_rows={}, filtered_rows={}, tablet={}", + partial_update_info->partial_update_mode_str(), new_generated_rows, + rowset_writer->num_rows(), rids_be_overwritten.size(), tablet_id()); } - LOG(INFO) << 
"calc segment delete bitmap for partial update, tablet: " << tablet_id() + LOG(INFO) << "calc segment delete bitmap for " + << partial_update_info->partial_update_mode_str() << ", tablet: " << tablet_id() << " rowset: " << rowset_id << " seg_id: " << seg->id() << " dummy_version: " << end_version + 1 << " rows: " << seg->num_rows() << " conflict rows: " << conflict_rows + << " filtered rows: " << rids_be_overwritten.size() << " new generated rows: " << new_generated_rows << " bimap num: " << delete_bitmap->delete_bitmap.size() << " cost: " << watch.get_elapse_time_us() << "(us)"; @@ -872,7 +907,7 @@ Status BaseTablet::fetch_value_by_rowids(RowsetSharedPtr input_rowset, uint32_t return Status::OK(); } -const signed char* BaseTablet::get_delete_sign_column_data(vectorized::Block& block, +const signed char* BaseTablet::get_delete_sign_column_data(const vectorized::Block& block, size_t rows_at_least) { if (const vectorized::ColumnWithTypeAndName* delete_sign_column = block.try_get_by_name(DELETE_SIGN); @@ -908,7 +943,7 @@ Status BaseTablet::generate_default_value_block(const TabletSchema& schema, Status BaseTablet::generate_new_block_for_partial_update( TabletSchemaSPtr rowset_schema, const PartialUpdateInfo* partial_update_info, - const PartialUpdateReadPlan& read_plan_ori, const PartialUpdateReadPlan& read_plan_update, + const FixedReadPlan& read_plan_ori, const FixedReadPlan& read_plan_update, const std::map& rsid_to_rowset, vectorized::Block* output_block) { // do partial update related works @@ -934,8 +969,7 @@ Status BaseTablet::generate_new_block_for_partial_update( for (auto i = 0; i < update_cids.size(); ++i) { for (auto idx = 0; idx < update_rows; ++idx) { full_mutable_columns[update_cids[i]]->insert_from( - *update_block.get_columns_with_type_and_name()[i].column.get(), - read_index_update[idx]); + *update_block.get_by_position(i).column, read_index_update[idx]); } } @@ -962,7 +996,6 @@ Status BaseTablet::generate_new_block_for_partial_update( *rowset_schema, missing_cids, partial_update_info->default_values, old_block, default_value_block)); } - auto mutable_default_value_columns = default_value_block.mutate_columns(); CHECK(update_rows >= old_rows); @@ -985,7 +1018,7 @@ Status BaseTablet::generate_new_block_for_partial_update( } else if (old_block_delete_signs != nullptr && old_block_delete_signs[read_index_old[idx]] != 0) { if (rs_column.has_default_value()) { - mutable_column->insert_from(*mutable_default_value_columns[i].get(), 0); + mutable_column->insert_from(*default_value_block.get_by_position(i).column, 0); } else if (rs_column.is_nullable()) { assert_cast( mutable_column.get()) @@ -994,12 +1027,153 @@ Status BaseTablet::generate_new_block_for_partial_update( mutable_column->insert_default(); } } else { - mutable_column->insert_from( - *old_block.get_columns_with_type_and_name()[i].column.get(), - read_index_old[idx]); + mutable_column->insert_from(*old_block.get_by_position(i).column, + read_index_old[idx]); + } + } + } + output_block->set_columns(std::move(full_mutable_columns)); + VLOG_DEBUG << "full block when publish: " << output_block->dump_data(); + return Status::OK(); +} + +Status BaseTablet::generate_new_block_for_flexible_partial_update( + TabletSchemaSPtr rowset_schema, const PartialUpdateInfo* partial_update_info, + std::set& rids_be_overwritten, const FixedReadPlan& read_plan_ori, + const FixedReadPlan& read_plan_update, + const std::map& rsid_to_rowset, + vectorized::Block* output_block) { + CHECK(output_block); + + const auto& non_sort_key_cids = 
partial_update_info->missing_cids; + std::vector all_cids(rowset_schema->num_columns()); + std::iota(all_cids.begin(), all_cids.end(), 0); + auto old_block = rowset_schema->create_block_by_cids(non_sort_key_cids); + auto update_block = rowset_schema->create_block_by_cids(all_cids); + + // rowid in the final block (starts from 0, increases continuously) -> rowid to read in update_block + std::map read_index_update; + + // 1. read the current rowset first, if a row in the current rowset has delete sign mark + // we don't need to read values from old block for that row + RETURN_IF_ERROR(read_plan_update.read_columns_by_plan(*rowset_schema, all_cids, rsid_to_rowset, + update_block, &read_index_update)); + size_t update_rows = read_index_update.size(); + + // TODO(bobhan1): add the delete sign optimization here + // // if there is sequence column in the table, we need to read the sequence column, + // // otherwise it may cause the merge-on-read based compaction policy to produce incorrect results + // const auto* __restrict new_block_delete_signs = + // rowset_schema->has_sequence_col() + // ? nullptr + // : get_delete_sign_column_data(update_block, update_rows); + + // 2. read previous rowsets + // rowid in the final block (starts from 0, increasing but may not be continuous because we skip reading some rows) -> rowid to read in old_block + std::map read_index_old; + RETURN_IF_ERROR(read_plan_ori.read_columns_by_plan(*rowset_schema, non_sort_key_cids, + rsid_to_rowset, old_block, &read_index_old)); + size_t old_rows = read_index_old.size(); + DCHECK(update_rows == old_rows); + const auto* __restrict old_block_delete_signs = + get_delete_sign_column_data(old_block, old_rows); + DCHECK(old_block_delete_signs != nullptr); + + // 3. build default value block + auto default_value_block = old_block.clone_empty(); + RETURN_IF_ERROR(BaseTablet::generate_default_value_block(*rowset_schema, non_sort_key_cids, + partial_update_info->default_values, + old_block, default_value_block)); + + // 4. build the final block + auto full_mutable_columns = output_block->mutate_columns(); + DCHECK(rowset_schema->has_skip_bitmap_col()); + auto skip_bitmap_col_idx = rowset_schema->skip_bitmap_col_idx(); + const std::vector* skip_bitmaps = + &(assert_cast( + update_block.get_by_position(skip_bitmap_col_idx).column->get_ptr().get()) + ->get_data()); + + VLOG_DEBUG << fmt::format( + "BaseTablet::generate_new_block_for_flexible_partial_update: " + "rids_be_overwritten.size()={}", + rids_be_overwritten.size()); + if (rowset_schema->has_sequence_col() && !rids_be_overwritten.empty()) { + int32_t seq_col_unique_id = + rowset_schema->column(rowset_schema->sequence_col_idx()).unique_id(); + // If the row specifies the sequence column, we should delete the current row because the + // flexible partial update on the current row has been `overwritten` by the previous one with a larger sequence + // column value. 
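// [Editor's note] Condensed decision table for the filtering loop below and for the
// fill_one_cell() lambda further down, covering non-key columns only:
//   row is in rids_be_overwritten                 -> take the whole cell from the old row
//   cell's uid not in the row's skip bitmap       -> take the cell from the current load
//   uid in skip bitmap, old row not delete-signed -> take the cell from the old row
//   uid in skip bitmap, old row delete-signed     -> default value, NULL, or insert_default()
// Key columns always come from the current load.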
+ for (auto it = rids_be_overwritten.begin(); it != rids_be_overwritten.end();) { + auto rid = *it; + if (!skip_bitmaps->at(rid).contains(seq_col_unique_id)) { + VLOG_DEBUG << fmt::format( + "BaseTablet::generate_new_block_for_flexible_partial_update: rid={} " + "filtered", + rid); + ++it; + } else { + it = rids_be_overwritten.erase(it); + VLOG_DEBUG << fmt::format( + "BaseTablet::generate_new_block_for_flexible_partial_update: rid={} " + "kept", + rid); + } + } + } + + auto fill_one_cell = [&read_index_old](const TabletColumn& tablet_column, std::size_t idx, + vectorized::MutableColumnPtr& new_col, + const vectorized::IColumn& default_value_col, + const vectorized::IColumn& old_value_col, + const vectorized::IColumn& cur_col, bool skipped, + const signed char* delete_sign_column_data) { + if (skipped) { + if (delete_sign_column_data != nullptr && + delete_sign_column_data[read_index_old[idx]] != 0) { + if (tablet_column.has_default_value()) { + new_col->insert_from(default_value_col, 0); + } else if (tablet_column.is_nullable()) { + assert_cast( + new_col.get()) + ->insert_null_elements(1); + } else { + new_col->insert_default(); + } + } else { + new_col->insert_from(old_value_col, idx); + } + } else { + new_col->insert_from(cur_col, idx); + } + }; + + for (std::size_t cid {0}; cid < rowset_schema->num_columns(); cid++) { + vectorized::MutableColumnPtr& new_col = full_mutable_columns[cid]; + const vectorized::IColumn& cur_col = *update_block.get_by_position(cid).column; + const auto& rs_column = rowset_schema->column(cid); + auto col_uid = rs_column.unique_id(); + for (auto idx = 0; idx < update_rows; ++idx) { + if (cid < rowset_schema->num_key_columns()) { + new_col->insert_from(cur_col, idx); + } else { + const vectorized::IColumn& default_value_col = + *default_value_block.get_by_position(cid - rowset_schema->num_key_columns()) + .column; + const vectorized::IColumn& old_value_col = + *old_block.get_by_position(cid - rowset_schema->num_key_columns()).column; + if (rids_be_overwritten.contains(idx)) { + new_col->insert_from(old_value_col, idx); + } else { + fill_one_cell(rs_column, idx, new_col, default_value_col, old_value_col, + cur_col, skip_bitmaps->at(idx).contains(col_uid), + old_block_delete_signs); + } } } + DCHECK_EQ(full_mutable_columns[cid]->size(), update_rows); } + output_block->set_columns(std::move(full_mutable_columns)); VLOG_DEBUG << "full block when publish: " << output_block->dump_data(); return Status::OK(); @@ -1178,7 +1352,7 @@ Status BaseTablet::update_delete_bitmap(const BaseTabletSPtr& self, TabletTxnInf std::unique_ptr transient_rs_writer; DeleteBitmapPtr delete_bitmap = txn_info->delete_bitmap; bool is_partial_update = - txn_info->partial_update_info && txn_info->partial_update_info->is_partial_update; + txn_info->partial_update_info && txn_info->partial_update_info->is_partial_update(); if (is_partial_update) { transient_rs_writer = DORIS_TRY(self->create_transient_rowset_writer( *rowset, txn_info->partial_update_info, txn_expiration)); @@ -1312,7 +1486,8 @@ Status BaseTablet::update_delete_bitmap(const BaseTabletSPtr& self, TabletTxnInf auto old_segments = rowset->num_segments(); rowset->merge_rowset_meta(*transient_rowset->rowset_meta()); auto new_segments = rowset->num_segments(); - ss << ", partial update flush rowset (old segment num: " << old_segments + ss << ", " << txn_info->partial_update_info->partial_update_mode_str() + << " flush rowset (old segment num: " << old_segments << ", new segment num: " << new_segments << ")" << ", cost:" << 
watch.get_elapse_time_us() - t4 << "(us)"; diff --git a/be/src/olap/base_tablet.h b/be/src/olap/base_tablet.h index 943f815581809a..b5da0e3bf06be1 100644 --- a/be/src/olap/base_tablet.h +++ b/be/src/olap/base_tablet.h @@ -24,6 +24,7 @@ #include "common/status.h" #include "olap/iterators.h" #include "olap/olap_common.h" +#include "olap/partial_update_info.h" #include "olap/rowset/segment_v2/segment.h" #include "olap/tablet_fwd.h" #include "olap/tablet_meta.h" @@ -39,7 +40,7 @@ class CalcDeleteBitmapToken; class SegmentCacheHandle; class RowIdConversion; struct PartialUpdateInfo; -class PartialUpdateReadPlan; +class FixedReadPlan; struct TabletWithVersion { BaseTabletSPtr tablet; @@ -153,7 +154,9 @@ class BaseTablet { const std::vector& specified_rowsets, RowLocation* row_location, uint32_t version, std::vector>& segment_caches, - RowsetSharedPtr* rowset = nullptr, bool with_rowid = true); + RowsetSharedPtr* rowset = nullptr, bool with_rowid = true, + std::string* encoded_seq_value = nullptr, + OlapReaderStatistics* stats = nullptr); // calc delete bitmap when flush memtable, use a fake version to calc // For example, cur max version is 5, and we use version 6 to calc but @@ -191,7 +194,7 @@ class BaseTablet { int64_t txn_id, const RowsetIdUnorderedSet& rowset_ids, std::vector* rowsets = nullptr); - static const signed char* get_delete_sign_column_data(vectorized::Block& block, + static const signed char* get_delete_sign_column_data(const vectorized::Block& block, size_t rows_at_least = 0); static Status generate_default_value_block(const TabletSchema& schema, @@ -202,8 +205,14 @@ class BaseTablet { static Status generate_new_block_for_partial_update( TabletSchemaSPtr rowset_schema, const PartialUpdateInfo* partial_update_info, - const PartialUpdateReadPlan& read_plan_ori, - const PartialUpdateReadPlan& read_plan_update, + const FixedReadPlan& read_plan_ori, const FixedReadPlan& read_plan_update, + const std::map& rsid_to_rowset, + vectorized::Block* output_block); + + static Status generate_new_block_for_flexible_partial_update( + TabletSchemaSPtr rowset_schema, const PartialUpdateInfo* partial_update_info, + std::set& rids_be_overwritten, const FixedReadPlan& read_plan_ori, + const FixedReadPlan& read_plan_update, const std::map& rsid_to_rowset, vectorized::Block* output_block); @@ -284,6 +293,9 @@ class BaseTablet { Status show_nested_index_file(std::string* json_meta); + TabletUid tablet_uid() const { return _tablet_meta->tablet_uid(); } + TabletInfo get_tablet_info() const { return TabletInfo(tablet_id(), tablet_uid()); } + protected: // Find the missed versions until the spec_version. 
// diff --git a/be/src/olap/bitmap_filter_predicate.h b/be/src/olap/bitmap_filter_predicate.h index 431182c4ce821e..48e93642f4c368 100644 --- a/be/src/olap/bitmap_filter_predicate.h +++ b/be/src/olap/bitmap_filter_predicate.h @@ -105,7 +105,7 @@ class BitmapFilterColumnPredicate : public ColumnPredicate { SpecificFilter* _specific_filter; // owned by _filter int get_filter_id() const override { return _filter->get_filter_id(); } - bool is_filter() const override { return true; } + bool is_runtime_filter() const override { return true; } }; template diff --git a/be/src/olap/bloom_filter_predicate.h b/be/src/olap/bloom_filter_predicate.h index 0e2ae500ac6a58..2c49ff2ea8d1a2 100644 --- a/be/src/olap/bloom_filter_predicate.h +++ b/be/src/olap/bloom_filter_predicate.h @@ -96,7 +96,6 @@ class BloomFilterColumnPredicate : public ColumnPredicate { DCHECK(filter_id != -1); return filter_id; } - bool is_filter() const override { return true; } std::shared_ptr _filter; SpecificFilter* _specific_filter; // owned by _filter diff --git a/be/src/olap/cold_data_compaction.cpp b/be/src/olap/cold_data_compaction.cpp index 3c61819903460b..54e21d7d7bcf6a 100644 --- a/be/src/olap/cold_data_compaction.cpp +++ b/be/src/olap/cold_data_compaction.cpp @@ -28,6 +28,7 @@ #include "common/compiler_util.h" // IWYU pragma: keep #include "common/config.h" +#include "common/status.h" #include "io/fs/remote_file_system.h" #include "olap/compaction.h" #include "olap/olap_common.h" @@ -97,7 +98,7 @@ Status ColdDataCompaction::modify_rowsets() { std::lock_guard wlock(_tablet->get_header_lock()); SCOPED_SIMPLE_TRACE_IF_TIMEOUT(TRACE_TABLET_LOCK_THRESHOLD); // Merged cooldowned rowsets MUST NOT be managed by version graph, they will be reclaimed by `remove_unused_remote_files`. - tablet()->delete_rowsets(_input_rowsets, false); + RETURN_IF_ERROR(tablet()->delete_rowsets(_input_rowsets, false)); tablet()->add_rowsets({_output_rowset}); // TODO(plat1ko): process primary key _tablet->tablet_meta()->set_cooldown_meta_id(cooldown_meta_id); diff --git a/be/src/olap/column_predicate.h b/be/src/olap/column_predicate.h index a4a8e637bc63a5..2b76c777228b2c 100644 --- a/be/src/olap/column_predicate.h +++ b/be/src/olap/column_predicate.h @@ -165,6 +165,7 @@ class ColumnPredicate { explicit ColumnPredicate(uint32_t column_id, bool opposite = false) : _column_id(column_id), _opposite(opposite) { _predicate_params = std::make_shared(); + reset_judge_selectivity(); } virtual ~ColumnPredicate() = default; @@ -188,16 +189,15 @@ class ColumnPredicate { // evaluate predicate on IColumn // a short circuit eval way uint16_t evaluate(const vectorized::IColumn& column, uint16_t* sel, uint16_t size) const { - if (always_true(true)) { + if (always_true()) { return size; } uint16_t new_size = _evaluate_inner(column, sel, size); _evaluated_rows += size; _passed_rows += new_size; - if (_can_ignore() && !_judge_counter) { - vectorized::VRuntimeFilterWrapper::judge_selectivity( - get_ignore_threshold(), size - new_size, size, _always_true, _judge_counter); + if (_can_ignore()) { + do_judge_selectivity(size - new_size, size); } return new_size; } @@ -262,8 +262,6 @@ class ColumnPredicate { } virtual int get_filter_id() const { return -1; } - // now InListPredicateBase BloomFilterColumnPredicate BitmapFilterColumnPredicate = true - virtual bool is_filter() const { return false; } PredicateFilterInfo get_filtered_info() const { return PredicateFilterInfo {static_cast(type()), _evaluated_rows - 1, _evaluated_rows - 1 - _passed_rows}; @@ -302,15 +300,14 @@ class 
ColumnPredicate { } } - bool always_true(bool update) const { - if (update) { - _judge_counter--; - if (!_judge_counter) { - _always_true = false; - } - } - return _always_true; - } + bool always_true() const { return _always_true; } + // Return whether the ColumnPredicate was created by a runtime filter. + // If true, it was definitely created by a runtime filter. + // If false, it may still have been created by a runtime filter, + // as certain filters like "in filter" generate key ranges instead of ColumnPredicate. + // is_runtime_filter uses _can_ignore, except for BitmapFilter, + // as BitmapFilter cannot ignore data. + virtual bool is_runtime_filter() const { return _can_ignore(); } protected: virtual std::string _debug_string() const = 0; @@ -326,13 +323,42 @@ class ColumnPredicate { throw Exception(INTERNAL_ERROR, "Not Implemented _evaluate_inner"); } + void reset_judge_selectivity() const { + _always_true = false; + _judge_counter = config::runtime_filter_sampling_frequency; + _judge_input_rows = 0; + _judge_filter_rows = 0; + } + + void do_judge_selectivity(uint64_t filter_rows, uint64_t input_rows) const { + if ((_judge_counter--) == 0) { + reset_judge_selectivity(); + } + if (!_always_true) { + _judge_filter_rows += filter_rows; + _judge_input_rows += input_rows; + vectorized::VRuntimeFilterWrapper::judge_selectivity( + get_ignore_threshold(), _judge_filter_rows, _judge_input_rows, _always_true); + } + } + uint32_t _column_id; // TODO: the value is only in delete condition, better be template value bool _opposite; std::shared_ptr _predicate_params; mutable uint64_t _evaluated_rows = 1; mutable uint64_t _passed_rows = 0; + // VRuntimeFilterWrapper and ColumnPredicate share the same logic, + // but it's challenging to unify them, so the code is duplicated. + // _judge_counter, _judge_input_rows, _judge_filter_rows, and _always_true + // are variables used to implement the _always_true logic, calculated periodically + // based on runtime_filter_sampling_frequency. During each period, if _always_true + // is evaluated as true, the logic for always_true is applied for the rest of that period + // without recalculating. At the beginning of the next period, + // reset_judge_selectivity is used to reset these variables. mutable int _judge_counter = 0; + mutable uint64_t _judge_input_rows = 0; + mutable uint64_t _judge_filter_rows = 0; mutable bool _always_true = false; }; diff --git a/be/src/olap/compaction.cpp b/be/src/olap/compaction.cpp index 8a419c841d1a86..a40e28669e90cc 100644 --- a/be/src/olap/compaction.cpp +++ b/be/src/olap/compaction.cpp @@ -47,6 +47,7 @@ #include "olap/cumulative_compaction_policy.h" #include "olap/cumulative_compaction_time_series_policy.h" #include "olap/data_dir.h" +#include "olap/olap_common.h" #include "olap/olap_define.h" #include "olap/rowset/beta_rowset.h" #include "olap/rowset/beta_rowset_writer.h" @@ -99,14 +100,14 @@ bool is_rowset_tidy(std::string& pre_max_key, const RowsetSharedPtr& rhs) { } } std::string min_key; - auto ret = rhs->min_key(&min_key); + auto ret = rhs->first_key(&min_key); if (!ret) { return false; } if (min_key <= pre_max_key) { return false; } - CHECK(rhs->max_key(&pre_max_key)); + CHECK(rhs->last_key(&pre_max_key)); return true; } @@ -174,10 +175,11 @@ Status Compaction::merge_input_rowsets() { // write merged rows to output rowset // The test results show that merger is low-memory-footprint, there is no need to tracker its mem pool - // if ctx.skip_inverted_index.size() > 0, it means we need to do inverted index compaction. 
+ // if ctx.columns_to_do_index_compaction.size() > 0, it means we need to do inverted index compaction. // the row ID conversion matrix needs to be used for inverted index compaction. - if (!ctx.skip_inverted_index.empty() || (_tablet->keys_type() == KeysType::UNIQUE_KEYS && - _tablet->enable_unique_key_merge_on_write())) { + if (!ctx.columns_to_do_index_compaction.empty() || + (_tablet->keys_type() == KeysType::UNIQUE_KEYS && + _tablet->enable_unique_key_merge_on_write())) { _stats.rowid_conversion = _rowid_conversion.get(); } @@ -186,25 +188,32 @@ Status Compaction::merge_input_rowsets() { Status res; { SCOPED_TIMER(_merge_rowsets_latency_timer); + // 1. Merge segment files and write bkd inverted index if (_is_vertical) { res = Merger::vertical_merge_rowsets(_tablet, compaction_type(), *_cur_tablet_schema, input_rs_readers, _output_rs_writer.get(), get_avg_segment_rows(), way_num, &_stats); } else { + if (!_tablet->tablet_schema()->cluster_key_idxes().empty()) { + return Status::InternalError( + "mow table with cluster keys does not support non vertical compaction"); + } res = Merger::vmerge_rowsets(_tablet, compaction_type(), *_cur_tablet_schema, input_rs_readers, _output_rs_writer.get(), &_stats); } - } - - _tablet->last_compaction_status = res; - if (!res.ok()) { - return res; + _tablet->last_compaction_status = res; + if (!res.ok()) { + return res; + } + // 2. Merge the remaining inverted index files of the string type + RETURN_IF_ERROR(do_inverted_index_compaction()); } COUNTER_UPDATE(_merged_rows_counter, _stats.merged_rows); COUNTER_UPDATE(_filtered_rows_counter, _stats.filtered_rows); + // 3. In the `build`, `_close_file_writers` is called to close the inverted index file writer and write the final compound index file. RETURN_NOT_OK_STATUS_WITH_WARN(_output_rs_writer->build(_output_rowset), fmt::format("rowset writer build failed. 
output_version: {}", _output_version.to_string())); @@ -248,10 +257,10 @@ int64_t Compaction::get_avg_segment_rows() { if (meta->compaction_policy() == CUMULATIVE_TIME_SERIES_POLICY) { int64_t compaction_goal_size_mbytes = meta->time_series_compaction_goal_size_mbytes(); return (compaction_goal_size_mbytes * 1024 * 1024 * 2) / - (_input_rowsets_size / (_input_row_num + 1) + 1); + (_input_rowsets_data_size / (_input_row_num + 1) + 1); } return config::vertical_compaction_max_segment_size / - (_input_rowsets_size / (_input_row_num + 1) + 1); + (_input_rowsets_data_size / (_input_row_num + 1) + 1); } CompactionMixin::CompactionMixin(StorageEngine& engine, TabletSharedPtr tablet, @@ -296,9 +305,9 @@ Status CompactionMixin::do_compact_ordered_rowsets() { // build output rowset RowsetMetaSharedPtr rowset_meta = std::make_shared(); rowset_meta->set_num_rows(_input_row_num); - rowset_meta->set_total_disk_size(_input_rowsets_size); - rowset_meta->set_data_disk_size(_input_rowsets_size); - rowset_meta->set_index_disk_size(_input_index_size); + rowset_meta->set_total_disk_size(_input_rowsets_data_size + _input_rowsets_index_size); + rowset_meta->set_data_disk_size(_input_rowsets_data_size); + rowset_meta->set_index_disk_size(_input_rowsets_index_size); rowset_meta->set_empty(_input_row_num == 0); rowset_meta->set_num_segments(_input_num_segments); rowset_meta->set_segments_overlap(NONOVERLAPPING); @@ -311,12 +320,13 @@ Status CompactionMixin::do_compact_ordered_rowsets() { void CompactionMixin::build_basic_info() { for (auto& rowset : _input_rowsets) { - _input_rowsets_size += rowset->data_disk_size(); - _input_index_size += rowset->index_disk_size(); + _input_rowsets_data_size += rowset->data_disk_size(); + _input_rowsets_index_size += rowset->index_disk_size(); + _input_rowsets_total_size += rowset->total_disk_size(); _input_row_num += rowset->num_rows(); _input_num_segments += rowset->num_segments(); } - COUNTER_UPDATE(_input_rowsets_data_size_counter, _input_rowsets_size); + COUNTER_UPDATE(_input_rowsets_data_size_counter, _input_rowsets_data_size); COUNTER_UPDATE(_input_row_num_counter, _input_row_num); COUNTER_UPDATE(_input_segments_num_counter, _input_num_segments); @@ -435,8 +445,12 @@ Status CompactionMixin::execute_compact_impl(int64_t permits) { << ", disk=" << tablet()->data_dir()->path() << ", segments=" << _input_num_segments << ", input_row_num=" << _input_row_num << ", output_row_num=" << _output_rowset->num_rows() - << ", input_rowset_size=" << _input_rowsets_size - << ", output_rowset_size=" << _output_rowset->data_disk_size() + << ", input_rowsets_data_size=" << _input_rowsets_data_size + << ", input_rowsets_index_size=" << _input_rowsets_index_size + << ", input_rowsets_total_size=" << _input_rowsets_total_size + << ", output_rowset_data_size=" << _output_rowset->data_disk_size() + << ", output_rowset_index_size=" << _output_rowset->index_disk_size() + << ", output_rowset_total_size=" << _output_rowset->total_disk_size() << ". elapsed time=" << watch.get_elapse_second() << "s."; _state = CompactionState::SUCCESS; return Status::OK(); @@ -450,8 +464,6 @@ Status CompactionMixin::execute_compact_impl(int64_t permits) { RETURN_IF_ERROR(merge_input_rowsets()); - RETURN_IF_ERROR(do_inverted_index_compaction()); - RETURN_IF_ERROR(modify_rowsets()); auto* cumu_policy = tablet()->cumulative_compaction_policy(); @@ -460,8 +472,8 @@ Status CompactionMixin::execute_compact_impl(int64_t permits) { << ". 
tablet=" << _tablet->tablet_id() << ", output_version=" << _output_version << ", current_max_version=" << tablet()->max_version().second << ", disk=" << tablet()->data_dir()->path() << ", segments=" << _input_num_segments - << ", input_rowset_size=" << _input_rowsets_size - << ", output_rowset_size=" << _output_rowset->data_disk_size() + << ", input_data_size=" << _input_rowsets_data_size + << ", output_rowset_size=" << _output_rowset->total_disk_size() << ", input_row_num=" << _input_row_num << ", output_row_num=" << _output_rowset->num_rows() << ", filtered_row_num=" << _stats.filtered_rows @@ -478,46 +490,12 @@ Status CompactionMixin::execute_compact_impl(int64_t permits) { Status Compaction::do_inverted_index_compaction() { const auto& ctx = _output_rs_writer->context(); if (!config::inverted_index_compaction_enable || _input_row_num <= 0 || - !_stats.rowid_conversion || ctx.skip_inverted_index.empty()) { + !_stats.rowid_conversion || ctx.columns_to_do_index_compaction.empty()) { return Status::OK(); } OlapStopWatch inverted_watch; - int64_t cur_max_version = 0; - { - std::shared_lock rlock(_tablet->get_header_lock()); - cur_max_version = _tablet->max_version_unlocked(); - } - - DeleteBitmap output_rowset_delete_bitmap(_tablet->tablet_id()); - std::set missed_rows; - std::map>> location_map; - // Convert the delete bitmap of the input rowsets to output rowset. - _tablet->calc_compaction_output_rowset_delete_bitmap( - _input_rowsets, *_rowid_conversion, 0, cur_max_version + 1, &missed_rows, &location_map, - _tablet->tablet_meta()->delete_bitmap(), &output_rowset_delete_bitmap); - - if (!_allow_delete_in_cumu_compaction) { - if (compaction_type() == ReaderType::READER_CUMULATIVE_COMPACTION && - _stats.merged_rows != missed_rows.size() && _tablet->tablet_state() == TABLET_RUNNING) { - std::string err_msg = fmt::format( - "cumulative compaction: the merged rows({}) is not equal to missed " - "rows({}) in rowid conversion, tablet_id: {}, table_id:{}", - _stats.merged_rows, missed_rows.size(), _tablet->tablet_id(), - _tablet->table_id()); - if (config::enable_mow_compaction_correctness_check_core) { - CHECK(false) << err_msg; - } else { - DCHECK(false) << err_msg; - } - // log here just for debugging, do not return error - LOG(WARNING) << err_msg; - } - } - - RETURN_IF_ERROR(_tablet->check_rowid_conversion(_output_rowset, location_map)); - // translation vec // <> // the first level vector: index indicates src segment. @@ -538,10 +516,11 @@ Status Compaction::do_inverted_index_compaction() { auto src_segment_num = src_seg_to_id_map.size(); auto dest_segment_num = dest_segment_num_rows.size(); + DBUG_EXECUTE_IF("Compaction::do_inverted_index_compaction_dest_segment_num_is_zero", + { dest_segment_num = 0; }) if (dest_segment_num <= 0) { LOG(INFO) << "skip doing index compaction due to no output segments" << ". tablet=" << _tablet->tablet_id() << ", input row number=" << _input_row_num - << ", output row number=" << _output_rowset->num_rows() << ". 
elapsed time=" << inverted_watch.get_elapse_second() << "s."; return Status::OK(); } @@ -613,14 +592,17 @@ Status Compaction::do_inverted_index_compaction() { const auto& [rowset_id, seg_id] = m.first; auto find_it = rs_id_to_rowset_map.find(rowset_id); + DBUG_EXECUTE_IF("Compaction::do_inverted_index_compaction_find_rowset_error", + { find_it = rs_id_to_rowset_map.end(); }) if (find_it == rs_id_to_rowset_map.end()) [[unlikely]] { - DCHECK(false) << _tablet->tablet_id() << ' ' << rowset_id; + // DCHECK(false) << _tablet->tablet_id() << ' ' << rowset_id; return Status::InternalError("cannot find rowset. tablet_id={} rowset_id={}", _tablet->tablet_id(), rowset_id.to_string()); } auto* rowset = find_it->second; - const auto& fs = rowset->rowset_meta()->fs(); + auto fs = rowset->rowset_meta()->fs(); + DBUG_EXECUTE_IF("Compaction::do_inverted_index_compaction_get_fs_error", { fs = nullptr; }) if (!fs) { return Status::InternalError("get fs failed, resource_id={}", rowset->rowset_meta()->resource_id()); @@ -641,58 +623,9 @@ Status Compaction::do_inverted_index_compaction() { // dest index files // format: rowsetId_segmentId - std::vector> inverted_index_file_writers( - dest_segment_num); - - // Some columns have already been indexed - // key: seg_id, value: inverted index file size - std::unordered_map compacted_idx_file_size; - for (int seg_id = 0; seg_id < dest_segment_num; ++seg_id) { - std::string index_path_prefix { - InvertedIndexDescriptor::get_index_file_path_prefix(ctx.segment_path(seg_id))}; - auto inverted_index_file_reader = std::make_unique( - ctx.fs(), index_path_prefix, - _cur_tablet_schema->get_inverted_index_storage_format()); - bool open_idx_file_cache = false; - auto st = inverted_index_file_reader->init(config::inverted_index_read_buffer_size, - open_idx_file_cache); - if (st.ok()) { - auto index_not_need_to_compact = - DORIS_TRY(inverted_index_file_reader->get_all_directories()); - // V1: each index is a separate file - // V2: all indexes are in a single file - if (_cur_tablet_schema->get_inverted_index_storage_format() != - doris::InvertedIndexStorageFormatPB::V1) { - int64_t fsize = 0; - st = ctx.fs()->file_size( - InvertedIndexDescriptor::get_index_file_path_v2(index_path_prefix), &fsize); - if (!st.ok()) { - LOG(ERROR) << "file size error in index compaction, error:" << st.msg(); - return st; - } - compacted_idx_file_size[seg_id] = fsize; - } - auto inverted_index_file_writer = std::make_unique( - ctx.fs(), index_path_prefix, ctx.rowset_id.to_string(), seg_id, - _cur_tablet_schema->get_inverted_index_storage_format()); - RETURN_IF_ERROR(inverted_index_file_writer->initialize(index_not_need_to_compact)); - inverted_index_file_writers[seg_id] = std::move(inverted_index_file_writer); - } else if (st.is()) { - auto inverted_index_file_writer = std::make_unique( - ctx.fs(), index_path_prefix, ctx.rowset_id.to_string(), seg_id, - _cur_tablet_schema->get_inverted_index_storage_format()); - inverted_index_file_writers[seg_id] = std::move(inverted_index_file_writer); - // no index file - compacted_idx_file_size[seg_id] = 0; - } else { - LOG(ERROR) << "inverted_index_file_reader init failed in index compaction, error:" - << st; - return st; - } - } - for (const auto& writer : inverted_index_file_writers) { - writer->set_file_writer_opts(ctx.get_file_writer_options()); - } + auto& inverted_index_file_writers = dynamic_cast(_output_rs_writer.get()) + ->inverted_index_file_writers(); + DCHECK_EQ(inverted_index_file_writers.size(), dest_segment_num); // use tmp file dir to store index 
files auto tmp_file_dir = ExecEnv::GetInstance()->get_tmp_file_dirs()->get_tmp_file_dir(); @@ -714,47 +647,30 @@ Status Compaction::do_inverted_index_compaction() { }; Status status = Status::OK(); - for (auto&& column_uniq_id : ctx.skip_inverted_index) { + for (auto&& column_uniq_id : ctx.columns_to_do_index_compaction) { auto col = _cur_tablet_schema->column_by_uid(column_uniq_id); - const auto* index_meta = _cur_tablet_schema->get_inverted_index(col); - - // if index properties are different, index compaction maybe needs to be skipped. - bool is_continue = false; - std::optional> first_properties; - for (const auto& rowset : _input_rowsets) { - const auto* tablet_index = rowset->tablet_schema()->get_inverted_index(col); - const auto& properties = tablet_index->properties(); - if (!first_properties.has_value()) { - first_properties = properties; - } else { - if (properties != first_properties.value()) { - error_handler(index_meta->index_id(), column_uniq_id); - status = Status::Error( - "if index properties are different, index compaction needs to be " - "skipped."); - is_continue = true; - break; - } - } - } - if (is_continue) { - continue; + const auto* index_meta = _cur_tablet_schema->inverted_index(col); + DBUG_EXECUTE_IF("Compaction::do_inverted_index_compaction_can_not_find_index_meta", + { index_meta = nullptr; }) + if (index_meta == nullptr) { + status = Status::Error( + fmt::format("Can not find index_meta for col {}", col.name())); + break; } std::vector dest_index_dirs(dest_segment_num); - std::vector src_index_dirs(src_segment_num); try { + std::vector> src_idx_dirs(src_segment_num); for (int src_segment_id = 0; src_segment_id < src_segment_num; src_segment_id++) { - auto src_dir = + src_idx_dirs[src_segment_id] = DORIS_TRY(inverted_index_file_readers[src_segment_id]->open(index_meta)); - src_index_dirs[src_segment_id] = src_dir.release(); } for (int dest_segment_id = 0; dest_segment_id < dest_segment_num; dest_segment_id++) { auto* dest_dir = DORIS_TRY(inverted_index_file_writers[dest_segment_id]->open(index_meta)); dest_index_dirs[dest_segment_id] = dest_dir; } - auto st = compact_column(index_meta->index_id(), src_index_dirs, dest_index_dirs, + auto st = compact_column(index_meta->index_id(), src_idx_dirs, dest_index_dirs, index_tmp_path.native(), trans_vec, dest_segment_num_rows); if (!st.ok()) { error_handler(index_meta->index_id(), column_uniq_id); @@ -766,54 +682,63 @@ Status Compaction::do_inverted_index_compaction() { } } - std::vector all_inverted_index_file_info(dest_segment_num); - uint64_t inverted_index_file_size = 0; - for (int seg_id = 0; seg_id < dest_segment_num; ++seg_id) { - auto inverted_index_file_writer = inverted_index_file_writers[seg_id].get(); - if (Status st = inverted_index_file_writer->close(); !st.ok()) { - status = Status::Error(st.msg()); - } else { - inverted_index_file_size += inverted_index_file_writer->get_index_file_total_size(); - inverted_index_file_size -= compacted_idx_file_size[seg_id]; - } - all_inverted_index_file_info[seg_id] = inverted_index_file_writer->get_index_file_info(); - } // check index compaction status. If status is not ok, we should return error and end this compaction round. 
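The `trans_vec` handed to `compact_column` above encodes where every source row landed after the merge, which is what lets the inverted index entries be rewritten consistently. Its shape, sketched with illustrative names (the real matrix is produced by the row-id conversion during the rowset merge):

```cpp
#include <cstdint>
#include <utility>
#include <vector>

// trans_vec[src_segment_id][src_row_id] -> {dest_segment_id, dest_row_id}.
// The outer index is the source segment, the inner index the source row.
using RowIdTranslation = std::vector<std::vector<std::pair<uint32_t, uint32_t>>>;

// Callers walk the surviving rows, so a plain positional lookup suffices here;
// rows merged away during compaction simply have no surviving destination.
std::pair<uint32_t, uint32_t> translate(const RowIdTranslation& trans_vec,
                                        uint32_t src_segment, uint32_t src_row) {
    return trans_vec[src_segment][src_row];
}
```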
if (!status.ok()) { return status; } - - // index compaction should update total disk size and index disk size - _output_rowset->rowset_meta()->set_data_disk_size(_output_rowset->data_disk_size() + - inverted_index_file_size); - _output_rowset->rowset_meta()->set_total_disk_size(_output_rowset->data_disk_size() + - inverted_index_file_size); - _output_rowset->rowset_meta()->set_index_disk_size(_output_rowset->index_disk_size() + - inverted_index_file_size); - - _output_rowset->rowset_meta()->update_inverted_index_files_info(all_inverted_index_file_info); - COUNTER_UPDATE(_output_rowset_data_size_counter, _output_rowset->data_disk_size()); - LOG(INFO) << "succeed to do index compaction" - << ". tablet=" << _tablet->tablet_id() << ", input row number=" << _input_row_num - << ", output row number=" << _output_rowset->num_rows() - << ", input_rowset_size=" << _input_rowsets_size - << ", output_rowset_size=" << _output_rowset->data_disk_size() - << ", inverted index file size=" << inverted_index_file_size + << ". tablet=" << _tablet->tablet_id() << ". elapsed time=" << inverted_watch.get_elapse_second() << "s."; return Status::OK(); } -void Compaction::construct_skip_inverted_index(RowsetWriterContext& ctx) { - for (const auto& index : _cur_tablet_schema->indexes()) { - if (index.index_type() != IndexType::INVERTED) { +void Compaction::construct_index_compaction_columns(RowsetWriterContext& ctx) { + for (const auto& index : _cur_tablet_schema->inverted_indexes()) { + auto col_unique_ids = index->col_unique_ids(); + // check if column unique ids is empty to avoid crash + if (col_unique_ids.empty()) { + LOG(WARNING) << "tablet[" << _tablet->tablet_id() << "] index[" << index->index_id() + << "] has no column unique id, will skip index compaction." + << " tablet_schema=" << _cur_tablet_schema->dump_full_schema(); + continue; + } + auto col_unique_id = col_unique_ids[0]; + // Avoid doing inverted index compaction on non-slice type columns + if (!field_is_slice_type(_cur_tablet_schema->column_by_uid(col_unique_id).type())) { continue; } - auto col_unique_id = index.col_unique_ids()[0]; + // if index properties are different, index compaction maybe needs to be skipped. 
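The hunks in this file also add many `DBUG_EXECUTE_IF` fault-injection points (forcing a null `fs`, a missing index meta, a zero destination segment count, and so on). As a rough mental model of the semantics, assuming a registry of enabled debug-point names (this is a sketch, not Doris's actual debug-point implementation):

```cpp
#include <string>
#include <unordered_set>

// Sketch only: the real DBUG_EXECUTE_IF lives in Doris's debug-point utilities
// and is gated by configuration; everything here is an illustrative stand-in.
inline std::unordered_set<std::string>& enabled_debug_points() {
    static std::unordered_set<std::string> points;
    return points;
}

#define DBUG_EXECUTE_IF_SKETCH(name, code)           \
    do {                                             \
        if (enabled_debug_points().contains(name)) { \
            code                                     \
        }                                            \
    } while (0)

// Enabling a point makes the injected block run, e.g. forcing the null-fs path:
//   enabled_debug_points().insert("Compaction::construct_skip_inverted_index_get_fs_error");
//   DBUG_EXECUTE_IF_SKETCH("Compaction::construct_skip_inverted_index_get_fs_error",
//                          { fs = nullptr; });
```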
+ bool is_continue = false; + std::optional> first_properties; + for (const auto& rowset : _input_rowsets) { + const auto* tablet_index = rowset->tablet_schema()->inverted_index(col_unique_id); + // no inverted index or index id is different from current index id + if (tablet_index == nullptr || tablet_index->index_id() != index->index_id()) { + is_continue = true; + break; + } + auto properties = tablet_index->properties(); + if (!first_properties.has_value()) { + first_properties = properties; + } else { + DBUG_EXECUTE_IF( + "Compaction::do_inverted_index_compaction_index_properties_different", + { properties.emplace("dummy_key", "dummy_value"); }) + if (properties != first_properties.value()) { + is_continue = true; + break; + } + } + } + if (is_continue) { + continue; + } auto has_inverted_index = [&](const RowsetSharedPtr& src_rs) { auto* rowset = static_cast(src_rs.get()); + DBUG_EXECUTE_IF("Compaction::construct_skip_inverted_index_is_skip_index_compaction", + { rowset->set_skip_index_compaction(col_unique_id); }) if (rowset->is_skip_index_compaction(col_unique_id)) { LOG(WARNING) << "tablet[" << _tablet->tablet_id() << "] rowset[" << rowset->rowset_id() << "] column_unique_id[" << col_unique_id @@ -821,14 +746,18 @@ void Compaction::construct_skip_inverted_index(RowsetWriterContext& ctx) { return false; } - const auto& fs = rowset->rowset_meta()->fs(); + auto fs = rowset->rowset_meta()->fs(); + DBUG_EXECUTE_IF("Compaction::construct_skip_inverted_index_get_fs_error", + { fs = nullptr; }) if (!fs) { LOG(WARNING) << "get fs failed, resource_id=" << rowset->rowset_meta()->resource_id(); return false; } - const auto* index_meta = rowset->tablet_schema()->get_inverted_index(col_unique_id, ""); + const auto* index_meta = rowset->tablet_schema()->inverted_index(col_unique_id); + DBUG_EXECUTE_IF("Compaction::construct_skip_inverted_index_index_meta_nullptr", + { index_meta = nullptr; }) if (index_meta == nullptr) { LOG(WARNING) << "tablet[" << _tablet->tablet_id() << "] column_unique_id[" << col_unique_id << "] index meta is null, will skip index compaction"; @@ -838,6 +767,9 @@ void Compaction::construct_skip_inverted_index(RowsetWriterContext& ctx) { for (auto i = 0; i < rowset->num_segments(); i++) { // TODO: inverted_index_path auto seg_path = rowset->segment_path(i); + DBUG_EXECUTE_IF("Compaction::construct_skip_inverted_index_seg_path_nullptr", { + seg_path = ResultError(Status::Error("error")); + }) if (!seg_path) { LOG(WARNING) << seg_path.error(); return false; @@ -855,6 +787,16 @@ void Compaction::construct_skip_inverted_index(RowsetWriterContext& ctx) { auto st = inverted_index_file_reader->init( config::inverted_index_read_buffer_size, open_idx_file_cache); index_file_path = inverted_index_file_reader->get_index_file_path(index_meta); + DBUG_EXECUTE_IF( + "Compaction::construct_skip_inverted_index_index_file_reader_init_" + "status_not_ok", + { + st = Status::Error( + "debug point: " + "construct_skip_inverted_index_index_file_reader_init_" + "status_" + "not_ok"); + }) if (!st.ok()) { LOG(WARNING) << "init index " << index_file_path << " error:" << st; return false; @@ -862,6 +804,14 @@ void Compaction::construct_skip_inverted_index(RowsetWriterContext& ctx) { // check index meta auto result = inverted_index_file_reader->open(index_meta); + DBUG_EXECUTE_IF( + "Compaction::construct_skip_inverted_index_index_file_reader_open_" + "error", + { + result = ResultError( + Status::Error( + "CLuceneError occur when open idx file")); + }) if (!result.has_value()) { LOG(WARNING) << 
"open index " << index_file_path << " error:" << result.error(); @@ -871,9 +821,15 @@ void Compaction::construct_skip_inverted_index(RowsetWriterContext& ctx) { std::vector files; reader->list(&files); reader->close(); + DBUG_EXECUTE_IF( + "Compaction::construct_skip_inverted_index_index_reader_close_error", + { _CLTHROWA(CL_ERR_IO, "debug point: reader close error"); }) + + DBUG_EXECUTE_IF("Compaction::construct_skip_inverted_index_index_files_count", + { files.clear(); }) // why is 3? - // bkd index will write at least 3 files + // slice type index file at least has 3 files: null_bitmap, segments_N, segments.gen if (files.size() < 3) { LOG(WARNING) << "tablet[" << _tablet->tablet_id() << "] column_unique_id[" << col_unique_id << "]," << index_file_path @@ -893,9 +849,8 @@ void Compaction::construct_skip_inverted_index(RowsetWriterContext& ctx) { bool all_have_inverted_index = std::all_of(_input_rowsets.begin(), _input_rowsets.end(), std::move(has_inverted_index)); - if (all_have_inverted_index && - field_is_slice_type(_cur_tablet_schema->column_by_uid(col_unique_id).type())) { - ctx.skip_inverted_index.insert(col_unique_id); + if (all_have_inverted_index) { + ctx.columns_to_do_index_compaction.insert(col_unique_id); } } } @@ -905,10 +860,8 @@ Status CompactionMixin::construct_output_rowset_writer(RowsetWriterContext& ctx) if (config::inverted_index_compaction_enable && (((_tablet->keys_type() == KeysType::UNIQUE_KEYS && _tablet->enable_unique_key_merge_on_write()) || - _tablet->keys_type() == KeysType::DUP_KEYS)) && - _cur_tablet_schema->get_inverted_index_storage_format() == - InvertedIndexStorageFormatPB::V1) { - construct_skip_inverted_index(ctx); + _tablet->keys_type() == KeysType::DUP_KEYS))) { + construct_index_compaction_columns(ctx); } ctx.version = _output_version; ctx.rowset_state = VISIBLE; @@ -1089,6 +1042,7 @@ Status CompactionMixin::modify_rowsets() { LOG(WARNING) << "failed to remove old version delete bitmap, st: " << st; } } + return Status::OK(); } @@ -1173,8 +1127,6 @@ Status CloudCompactionMixin::execute_compact_impl(int64_t permits) { RETURN_IF_ERROR(merge_input_rowsets()); - RETURN_IF_ERROR(do_inverted_index_compaction()); - RETURN_IF_ERROR(_engine.meta_mgr().commit_rowset(*_output_rowset->rowset_meta().get())); // 4. 
modify rowsets in memory @@ -1201,10 +1153,8 @@ Status CloudCompactionMixin::construct_output_rowset_writer(RowsetWriterContext& if (config::inverted_index_compaction_enable && (((_tablet->keys_type() == KeysType::UNIQUE_KEYS && _tablet->enable_unique_key_merge_on_write()) || - _tablet->keys_type() == KeysType::DUP_KEYS)) && - _cur_tablet_schema->get_inverted_index_storage_format() == - InvertedIndexStorageFormatPB::V1) { - construct_skip_inverted_index(ctx); + _tablet->keys_type() == KeysType::DUP_KEYS))) { + construct_index_compaction_columns(ctx); } // Use the storage resource of the previous rowset diff --git a/be/src/olap/compaction.h b/be/src/olap/compaction.h index 38d50595ca8f6e..06ef4268529247 100644 --- a/be/src/olap/compaction.h +++ b/be/src/olap/compaction.h @@ -67,9 +67,10 @@ class Compaction { protected: Status merge_input_rowsets(); + // merge inverted index files Status do_inverted_index_compaction(); - void construct_skip_inverted_index(RowsetWriterContext& ctx); + void construct_index_compaction_columns(RowsetWriterContext& ctx); virtual Status construct_output_rowset_writer(RowsetWriterContext& ctx) = 0; @@ -89,10 +90,11 @@ class Compaction { BaseTabletSPtr _tablet; std::vector _input_rowsets; - int64_t _input_rowsets_size {0}; + int64_t _input_rowsets_data_size {0}; + int64_t _input_rowsets_index_size {0}; + int64_t _input_rowsets_total_size {0}; int64_t _input_row_num {0}; int64_t _input_num_segments {0}; - int64_t _input_index_size {0}; Merger::Statistics _stats; diff --git a/be/src/olap/comparison_predicate.h b/be/src/olap/comparison_predicate.h index ece960f0250459..f17dae4a72b6f1 100644 --- a/be/src/olap/comparison_predicate.h +++ b/be/src/olap/comparison_predicate.h @@ -204,12 +204,23 @@ class ComparisonPredicateBase : public ColumnPredicate { return bf->test_bytes(_value.data, _value.size); } else { // DecimalV2 using decimal12_t in bloom filter, should convert value to decimal12_t - // Datev1/DatetimeV1 using VecDatetimeValue in bloom filter, NO need to convert. 
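The new branches below must probe the bloom filter with exactly the bytes the storage layer hashed at write time, hence the conversions through `to_olap_date()` and `to_olap_datetime()`. A hedged sketch of the two encodings involved (assumed layouts based on my reading of the olap V1 date formats; verify against `VecDateTimeValue`):

```cpp
#include <cstdint>

// DateV1: a 24-bit integer, day in the low 5 bits, month in the next 4 bits,
// year above that (assumed layout).
uint32_t to_olap_date_sketch(uint32_t year, uint32_t month, uint32_t day) {
    return (year << 9) | (month << 5) | day; // fits in uint24_t
}

// DatetimeV1: decimal-packed YYYYMMDDHHMMSS stored in an int64_t (assumed).
int64_t to_olap_datetime_sketch(int64_t year, int64_t month, int64_t day,
                                int64_t hour, int64_t minute, int64_t second) {
    return ((year * 10000 + month * 100 + day) * 1000000LL) +
           hour * 10000 + minute * 100 + second;
}
```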
if constexpr (Type == PrimitiveType::TYPE_DECIMALV2) { decimal12_t decimal12_t_val(_value.int_value(), _value.frac_value()); return bf->test_bytes( const_cast(reinterpret_cast(&decimal12_t_val)), sizeof(decimal12_t)); + // Datev1 using uint24_t in bloom filter + } else if constexpr (Type == PrimitiveType::TYPE_DATE) { + uint24_t date_value(_value.to_olap_date()); + return bf->test_bytes( + const_cast(reinterpret_cast(&date_value)), + sizeof(uint24_t)); + // DatetimeV1 using int64_t in bloom filter + } else if constexpr (Type == PrimitiveType::TYPE_DATETIME) { + int64_t datetime_value(_value.to_olap_datetime()); + return bf->test_bytes( + const_cast(reinterpret_cast(&datetime_value)), + sizeof(int64_t)); } else { return bf->test_bytes(const_cast(reinterpret_cast(&_value)), sizeof(T)); @@ -339,14 +350,13 @@ class ComparisonPredicateBase : public ColumnPredicate { } } - if (_can_ignore() && !_judge_counter) { + if (_can_ignore()) { for (uint16_t i = 0; i < size; i++) { current_passed_rows += flags[i]; } _passed_rows += current_passed_rows; - vectorized::VRuntimeFilterWrapper::judge_selectivity( - get_ignore_threshold(), current_evaluated_rows - current_passed_rows, - current_evaluated_rows, _always_true, _judge_counter); + do_judge_selectivity(current_evaluated_rows - current_passed_rows, + current_evaluated_rows); } } diff --git a/be/src/olap/cumulative_compaction.cpp b/be/src/olap/cumulative_compaction.cpp index b762468b3455a4..b961c694ede4d0 100644 --- a/be/src/olap/cumulative_compaction.cpp +++ b/be/src/olap/cumulative_compaction.cpp @@ -125,7 +125,8 @@ Status CumulativeCompaction::execute_compact() { tablet()->set_last_cumu_compaction_success_time(UnixMillis()); } DorisMetrics::instance()->cumulative_compaction_deltas_total->increment(_input_rowsets.size()); - DorisMetrics::instance()->cumulative_compaction_bytes_total->increment(_input_rowsets_size); + DorisMetrics::instance()->cumulative_compaction_bytes_total->increment( + _input_rowsets_total_size); return Status::OK(); } diff --git a/be/src/olap/delete_handler.cpp b/be/src/olap/delete_handler.cpp index 4d5b1ce9add3e0..80fc440ce36a6d 100644 --- a/be/src/olap/delete_handler.cpp +++ b/be/src/olap/delete_handler.cpp @@ -346,6 +346,8 @@ Status DeleteHandler::parse_condition(const std::string& condition_str, TConditi } template + requires(std::is_same_v or + std::is_same_v) Status DeleteHandler::_parse_column_pred(TabletSchemaSPtr complete_schema, TabletSchemaSPtr delete_pred_related_schema, const RepeatedPtrField& sub_pred_list, @@ -353,10 +355,13 @@ Status DeleteHandler::_parse_column_pred(TabletSchemaSPtr complete_schema, for (const auto& sub_predicate : sub_pred_list) { TCondition condition; RETURN_IF_ERROR(parse_condition(sub_predicate, &condition)); - int32_t col_unique_id; - if constexpr (std::is_same_v) { - col_unique_id = sub_predicate.col_unique_id; - } else { + int32_t col_unique_id = -1; + if constexpr (std::is_same_v) { + if (sub_predicate.has_column_unique_id()) [[likely]] { + col_unique_id = sub_predicate.column_unique_id(); + } + } + if (col_unique_id < 0) { const auto& column = *DORIS_TRY(delete_pred_related_schema->column(condition.column_name)); col_unique_id = column.unique_id(); diff --git a/be/src/olap/delete_handler.h b/be/src/olap/delete_handler.h index cc585c0abcf9f6..77de62d31d988e 100644 --- a/be/src/olap/delete_handler.h +++ b/be/src/olap/delete_handler.h @@ -21,6 +21,7 @@ #include #include +#include #include "common/factory_creator.h" #include "common/status.h" @@ -115,6 +116,8 @@ class DeleteHandler { 
private: template + requires(std::is_same_v or + std::is_same_v) Status _parse_column_pred( TabletSchemaSPtr complete_schema, TabletSchemaSPtr delete_pred_related_schema, const ::google::protobuf::RepeatedPtrField& sub_pred_list, diff --git a/be/src/olap/delta_writer.cpp b/be/src/olap/delta_writer.cpp index 00c622df59f170..88277775f96101 100644 --- a/be/src/olap/delta_writer.cpp +++ b/be/src/olap/delta_writer.cpp @@ -103,10 +103,15 @@ Status BaseDeltaWriter::init() { if (_is_init) { return Status::OK(); } + auto* t_ctx = doris::thread_context(true); + std::shared_ptr wg_sptr = nullptr; + if (t_ctx) { + wg_sptr = t_ctx->workload_group().lock(); + } RETURN_IF_ERROR(_rowset_builder->init()); RETURN_IF_ERROR(_memtable_writer->init( _rowset_builder->rowset_writer(), _rowset_builder->tablet_schema(), - _rowset_builder->get_partial_update_info(), nullptr, + _rowset_builder->get_partial_update_info(), wg_sptr, _rowset_builder->tablet()->enable_unique_key_merge_on_write())); ExecEnv::GetInstance()->memtable_memory_limiter()->register_writer(_memtable_writer); _is_init = true; @@ -249,7 +254,7 @@ void DeltaWriter::_request_slave_tablet_pull_rowset(const PNodeInfo& node_info) auto tablet_schema = cur_rowset->rowset_meta()->tablet_schema(); if (!tablet_schema->skip_write_index_on_load()) { for (auto& column : tablet_schema->columns()) { - const TabletIndex* index_meta = tablet_schema->get_inverted_index(*column); + const TabletIndex* index_meta = tablet_schema->inverted_index(*column); if (index_meta) { indices_ids.emplace_back(index_meta->index_id(), index_meta->get_index_suffix()); } diff --git a/be/src/olap/delta_writer_v2.cpp b/be/src/olap/delta_writer_v2.cpp index 73d2fb1d9746a9..a6fb0154489042 100644 --- a/be/src/olap/delta_writer_v2.cpp +++ b/be/src/olap/delta_writer_v2.cpp @@ -127,13 +127,12 @@ Status DeltaWriterV2::init() { _rowset_writer = std::make_shared(_streams); RETURN_IF_ERROR(_rowset_writer->init(context)); - ThreadPool* wg_thread_pool_ptr = nullptr; + std::shared_ptr wg_sptr = nullptr; if (_state->get_query_ctx()) { - wg_thread_pool_ptr = _state->get_query_ctx()->get_memtable_flush_pool(); + wg_sptr = _state->get_query_ctx()->workload_group(); } RETURN_IF_ERROR(_memtable_writer->init(_rowset_writer, _tablet_schema, _partial_update_info, - wg_thread_pool_ptr, - _streams[0]->enable_unique_mow(_req.index_id))); + wg_sptr, _streams[0]->enable_unique_mow(_req.index_id))); ExecEnv::GetInstance()->memtable_memory_limiter()->register_writer(_memtable_writer); _is_init = true; _streams.clear(); @@ -237,7 +236,7 @@ void DeltaWriterV2::_build_current_tablet_schema(int64_t index_id, } // set partial update columns info _partial_update_info = std::make_shared(); - _partial_update_info->init(*_tablet_schema, table_schema_param->is_partial_update(), + _partial_update_info->init(*_tablet_schema, table_schema_param->unique_key_update_mode(), table_schema_param->partial_update_input_columns(), table_schema_param->is_strict_mode(), table_schema_param->timestamp_ms(), diff --git a/be/src/olap/in_list_predicate.h b/be/src/olap/in_list_predicate.h index bd91fe147fbb43..c88ac0cdd6cfae 100644 --- a/be/src/olap/in_list_predicate.h +++ b/be/src/olap/in_list_predicate.h @@ -99,9 +99,9 @@ class InListPredicateBase : public ColumnPredicate { if constexpr (is_string_type(Type)) { HybridSetBase::IteratorBase* iter = hybrid_set->begin(); while (iter->has_next()) { - const StringRef* value = (const StringRef*)(iter->get_value()); + const auto* value = (const StringRef*)(iter->get_value()); if constexpr (Type == 
TYPE_CHAR) { - _temp_datas.push_back(""); + _temp_datas.emplace_back(""); _temp_datas.back().resize(std::max(char_length, value->size)); memcpy(_temp_datas.back().data(), value->data, value->size); const string& str = _temp_datas.back(); @@ -225,18 +225,17 @@ class InListPredicateBase : public ColumnPredicate { } int get_filter_id() const override { return _values->get_filter_id(); } - bool is_filter() const override { return true; } template void _evaluate_bit(const vectorized::IColumn& column, const uint16_t* sel, uint16_t size, bool* flags) const { if (column.is_nullable()) { - auto* nullable_col = + const auto* nullable_col = vectorized::check_and_get_column(column); - auto& null_bitmap = reinterpret_cast( - nullable_col->get_null_map_column()) - .get_data(); - auto& nested_col = nullable_col->get_nested_column(); + const auto& null_bitmap = reinterpret_cast( + nullable_col->get_null_map_column()) + .get_data(); + const auto& nested_col = nullable_col->get_nested_column(); if (_opposite) { return _base_evaluate_bit(&nested_col, &null_bitmap, sel, size, @@ -302,11 +301,13 @@ class InListPredicateBase : public ColumnPredicate { bool evaluate_and(const segment_v2::BloomFilter* bf) const override { if constexpr (PT == PredicateType::IN_LIST) { // IN predicate can not use ngram bf, just return true to accept - if (bf->is_ngram_bf()) return true; + if (bf->is_ngram_bf()) { + return true; + } HybridSetBase::IteratorBase* iter = _values->begin(); while (iter->has_next()) { if constexpr (std::is_same_v) { - const StringRef* value = (const StringRef*)iter->get_value(); + const auto* value = (const StringRef*)iter->get_value(); if (bf->test_bytes(value->data, value->size)) { return true; } @@ -371,8 +372,6 @@ class InListPredicateBase : public ColumnPredicate { new_size = _base_evaluate(&column, nullptr, sel, size); } } - _evaluated_rows += size; - _passed_rows += new_size; return new_size; } @@ -393,9 +392,9 @@ class InListPredicateBase : public ColumnPredicate { if (column->is_column_dictionary()) { if constexpr (std::is_same_v) { - auto* nested_col_ptr = vectorized::check_and_get_column< + const auto* nested_col_ptr = vectorized::check_and_get_column< vectorized::ColumnDictionary>(column); - auto& data_array = nested_col_ptr->get_data(); + const auto& data_array = nested_col_ptr->get_data(); auto segid = column->get_rowset_segment_id(); DCHECK((segid.first.hi | segid.first.mi | segid.first.lo) != 0); auto& value_in_dict_flags = _segment_id_to_value_in_dict_flags[segid]; @@ -460,9 +459,9 @@ class InListPredicateBase : public ColumnPredicate { const uint16_t* sel, uint16_t size, bool* flags) const { if (column->is_column_dictionary()) { if constexpr (std::is_same_v) { - auto* nested_col_ptr = vectorized::check_and_get_column< + const auto* nested_col_ptr = vectorized::check_and_get_column< vectorized::ColumnDictionary>(column); - auto& data_array = nested_col_ptr->get_data(); + const auto& data_array = nested_col_ptr->get_data(); auto& value_in_dict_flags = _segment_id_to_value_in_dict_flags[column->get_rowset_segment_id()]; if (value_in_dict_flags.empty()) { diff --git a/be/src/olap/inverted_index_parser.cpp b/be/src/olap/inverted_index_parser.cpp index a9ed7ec062e162..f7e511970d91f2 100644 --- a/be/src/olap/inverted_index_parser.cpp +++ b/be/src/olap/inverted_index_parser.cpp @@ -128,6 +128,7 @@ std::string get_parser_ignore_above_value_from_properties( std::string get_parser_stopwords_from_properties( const std::map& properties) { + 
DBUG_EXECUTE_IF("inverted_index_parser.get_parser_stopwords_from_properties", { return ""; }) if (properties.find(INVERTED_INDEX_PARSER_STOPWORDS_KEY) != properties.end()) { return properties.at(INVERTED_INDEX_PARSER_STOPWORDS_KEY); } else { diff --git a/be/src/olap/lru_cache.cpp b/be/src/olap/lru_cache.cpp index b0ad59b6c8d15c..e539f4a440ab0c 100644 --- a/be/src/olap/lru_cache.cpp +++ b/be/src/olap/lru_cache.cpp @@ -26,6 +26,8 @@ DEFINE_GAUGE_METRIC_PROTOTYPE_2ARG(cache_element_count, MetricUnit::NOUNIT); DEFINE_GAUGE_METRIC_PROTOTYPE_2ARG(cache_usage_ratio, MetricUnit::NOUNIT); DEFINE_COUNTER_METRIC_PROTOTYPE_2ARG(cache_lookup_count, MetricUnit::OPERATIONS); DEFINE_COUNTER_METRIC_PROTOTYPE_2ARG(cache_hit_count, MetricUnit::OPERATIONS); +DEFINE_COUNTER_METRIC_PROTOTYPE_2ARG(cache_miss_count, MetricUnit::OPERATIONS); +DEFINE_COUNTER_METRIC_PROTOTYPE_2ARG(cache_stampede_count, MetricUnit::OPERATIONS); DEFINE_GAUGE_METRIC_PROTOTYPE_2ARG(cache_hit_ratio, MetricUnit::NOUNIT); uint32_t CacheKey::hash(const char* data, size_t n, uint32_t seed) const { @@ -207,6 +209,16 @@ uint64_t LRUCache::get_hit_count() { return _hit_count; } +uint64_t LRUCache::get_stampede_count() { + std::lock_guard l(_mutex); + return _stampede_count; +} + +uint64_t LRUCache::get_miss_count() { + std::lock_guard l(_mutex); + return _miss_count; +} + size_t LRUCache::get_usage() { std::lock_guard l(_mutex); return _usage; @@ -290,6 +302,8 @@ Cache::Handle* LRUCache::lookup(const CacheKey& key, uint32_t hash) { e->refs++; ++_hit_count; e->last_visit_time = UnixMillis(); + } else { + ++_miss_count; } return reinterpret_cast(e); } @@ -430,6 +444,7 @@ Cache::Handle* LRUCache::insert(const CacheKey& key, uint32_t hash, void* value, auto old = _table.insert(e); _usage += e->total_size; if (old != nullptr) { + _stampede_count++; old->in_cache = false; if (_unref(old)) { _usage -= old->total_size; @@ -592,6 +607,8 @@ ShardedLRUCache::ShardedLRUCache(const std::string& name, size_t capacity, LRUCa INT_DOUBLE_METRIC_REGISTER(_entity, cache_usage_ratio); INT_ATOMIC_COUNTER_METRIC_REGISTER(_entity, cache_lookup_count); INT_ATOMIC_COUNTER_METRIC_REGISTER(_entity, cache_hit_count); + INT_ATOMIC_COUNTER_METRIC_REGISTER(_entity, cache_stampede_count); + INT_ATOMIC_COUNTER_METRIC_REGISTER(_entity, cache_miss_count); INT_DOUBLE_METRIC_REGISTER(_entity, cache_hit_ratio); _hit_count_bvar.reset(new bvar::Adder("doris_cache", _name)); @@ -714,12 +731,17 @@ void ShardedLRUCache::update_cache_metrics() const { size_t total_lookup_count = 0; size_t total_hit_count = 0; size_t total_element_count = 0; + size_t total_miss_count = 0; + size_t total_stampede_count = 0; + for (int i = 0; i < _num_shards; i++) { capacity += _shards[i]->get_capacity(); total_usage += _shards[i]->get_usage(); total_lookup_count += _shards[i]->get_lookup_count(); total_hit_count += _shards[i]->get_hit_count(); total_element_count += _shards[i]->get_element_count(); + total_miss_count += _shards[i]->get_miss_count(); + total_stampede_count += _shards[i]->get_stampede_count(); } cache_capacity->set_value(capacity); @@ -727,6 +749,8 @@ void ShardedLRUCache::update_cache_metrics() const { cache_element_count->set_value(total_element_count); cache_lookup_count->set_value(total_lookup_count); cache_hit_count->set_value(total_hit_count); + cache_miss_count->set_value(total_miss_count); + cache_stampede_count->set_value(total_stampede_count); cache_usage_ratio->set_value(capacity == 0 ? 0 : ((double)total_usage / capacity)); cache_hit_ratio->set_value( total_lookup_count == 0 ? 
0 : ((double)total_hit_count / total_lookup_count)); diff --git a/be/src/olap/lru_cache.h b/be/src/olap/lru_cache.h index ba2dd2b5c52c56..303a4cf2065ef9 100644 --- a/be/src/olap/lru_cache.h +++ b/be/src/olap/lru_cache.h @@ -350,6 +350,9 @@ class LRUCache { uint64_t get_lookup_count(); uint64_t get_hit_count(); + uint64_t get_miss_count(); + uint64_t get_stampede_count(); + size_t get_usage(); size_t get_capacity(); size_t get_element_count(); @@ -384,6 +387,8 @@ class LRUCache { uint64_t _lookup_count = 0; // number of cache lookups uint64_t _hit_count = 0; // number of cache hits + uint64_t _miss_count = 0; // number of cache misses + uint64_t _stampede_count = 0; CacheValueTimeExtractor _cache_value_time_extractor; bool _cache_value_check_timestamp = false; @@ -444,6 +449,8 @@ class ShardedLRUCache : public Cache { DoubleGauge* cache_usage_ratio = nullptr; IntAtomicCounter* cache_lookup_count = nullptr; IntAtomicCounter* cache_hit_count = nullptr; + IntAtomicCounter* cache_miss_count = nullptr; + IntAtomicCounter* cache_stampede_count = nullptr; DoubleGauge* cache_hit_ratio = nullptr; // bvars std::unique_ptr> _hit_count_bvar; diff --git a/be/src/olap/memtable.cpp b/be/src/olap/memtable.cpp index a70486e39b3a4b..e0f19b1624df5b 100644 --- a/be/src/olap/memtable.cpp +++ b/be/src/olap/memtable.cpp @@ -34,7 +34,6 @@ #include "runtime/descriptors.h" #include "runtime/exec_env.h" #include "runtime/thread_context.h" -#include "tablet_meta.h" #include "util/runtime_profile.h" #include "util/stopwatch.hpp" #include "vec/aggregate_functions/aggregate_function_reader.h" @@ -66,8 +65,8 @@ MemTable::MemTable(int64_t tablet_id, std::shared_ptr tablet_schem _vec_row_comparator = std::make_shared(_tablet_schema); _num_columns = _tablet_schema->num_columns(); if (partial_update_info != nullptr) { - _is_partial_update = partial_update_info->is_partial_update; - if (_is_partial_update) { + _partial_update_mode = partial_update_info->update_mode(); + if (_partial_update_mode == UniqueKeyUpdateModePB::UPDATE_FIXED_COLUMNS) { _num_columns = partial_update_info->partial_update_input_columns.size(); if (partial_update_info->is_schema_contains_auto_inc_column && !partial_update_info->is_input_columns_contains_auto_inc_column) { @@ -103,10 +102,16 @@ void MemTable::_init_agg_functions(const vectorized::Block* block) { if (_keys_type == KeysType::UNIQUE_KEYS && _enable_unique_key_mow) { // In such table, non-key column's aggregation type is NONE, so we need to construct // the aggregate function manually. 
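The hunk just below picks `bitmap_intersect` instead of `replace_load` as the aggregate for the hidden skip-bitmap column. The intuition: when several source rows merge into one, a column may be treated as "not provided" only if every merged row skipped it, which is exactly set intersection. A toy illustration with `std::set` standing in for `BitmapValue`:

```cpp
#include <algorithm>
#include <cstdint>
#include <iterator>
#include <set>

using SkipBitmap = std::set<int32_t>; // stand-in for doris::BitmapValue

// A column stays "skipped" in the merged row only if both inputs skipped it;
// if either input supplied a value, the merged row has a value for that column.
SkipBitmap merge_skip_bitmaps(const SkipBitmap& a, const SkipBitmap& b) {
    SkipBitmap out;
    std::set_intersection(a.begin(), a.end(), b.begin(), b.end(),
                          std::inserter(out, out.begin()));
    return out;
}
```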
-            function = vectorized::AggregateFunctionSimpleFactory::instance().get(
-                    "replace_load", {block->get_data_type(cid)},
-                    block->get_data_type(cid)->is_nullable(),
-                    BeExecVersionManager::get_newest_version());
+            if (_skip_bitmap_col_idx != cid) {
+                function = vectorized::AggregateFunctionSimpleFactory::instance().get(
+                        "replace_load", {block->get_data_type(cid)},
+                        block->get_data_type(cid)->is_nullable(),
+                        BeExecVersionManager::get_newest_version());
+            } else {
+                function = vectorized::AggregateFunctionSimpleFactory::instance().get(
+                        "bitmap_intersect", {block->get_data_type(cid)}, false,
+                        BeExecVersionManager::get_newest_version());
+            }
         } else {
             function = _tablet_schema->column(cid).get_aggregate_function(
                     vectorized::AGG_LOAD_SUFFIX, _tablet_schema->column(cid).get_be_exec_version());
@@ -179,20 +184,16 @@ int RowInBlockComparator::operator()(const RowInBlock* left, const RowInBlock* r
 Status MemTable::insert(const vectorized::Block* input_block,
                         const std::vector<uint32_t>& row_idxs) {
     SCOPED_CONSUME_MEM_TRACKER(_mem_tracker);
+
     if (_is_first_insertion) {
         _is_first_insertion = false;
         auto clone_block = input_block->clone_without_columns(&_column_offset);
         _input_mutable_block = vectorized::MutableBlock::build_mutable_block(&clone_block);
         _vec_row_comparator->set_block(&_input_mutable_block);
         _output_mutable_block = vectorized::MutableBlock::build_mutable_block(&clone_block);
-        if (_keys_type != KeysType::DUP_KEYS) {
-            // there may be additional intermediate columns in input_block
-            // we only need columns indicated by column offset in the output
-            RETURN_IF_CATCH_EXCEPTION(_init_agg_functions(&clone_block));
-        }
         if (_tablet_schema->has_sequence_col()) {
-            if (_is_partial_update) {
-                // for unique key partial update, sequence column index in block
+            if (_partial_update_mode == UniqueKeyUpdateModePB::UPDATE_FIXED_COLUMNS) {
+                // for unique key fixed partial update, sequence column index in block
                 // may be different with the index in `_tablet_schema`
                 for (size_t i = 0; i < clone_block.columns(); i++) {
                     if (clone_block.get_by_position(i).name == SEQUENCE_COL) {
@@ -204,6 +205,19 @@ Status MemTable::insert(const vectorized::Block* input_block,
                 _seq_col_idx_in_block = _tablet_schema->sequence_col_idx();
             }
         }
+        if (_partial_update_mode == UniqueKeyUpdateModePB::UPDATE_FLEXIBLE_COLUMNS &&
+            _tablet_schema->has_skip_bitmap_col()) {
+            // init of _skip_bitmap_col_idx must be before _init_agg_functions()
+            _skip_bitmap_col_idx = _tablet_schema->skip_bitmap_col_idx();
+            if (_seq_col_idx_in_block != -1) {
+                _seq_col_unique_id = _tablet_schema->column(_seq_col_idx_in_block).unique_id();
+            }
+        }
+        if (_keys_type != KeysType::DUP_KEYS) {
+            // there may be additional intermediate columns in input_block
+            // we only need columns indicated by column offset in the output
+            RETURN_IF_CATCH_EXCEPTION(_init_agg_functions(&clone_block));
+        }
     }
 
     auto num_rows = row_idxs.size();
@@ -221,8 +235,12 @@ Status MemTable::insert(const vectorized::Block* input_block,
     return Status::OK();
 }
 
+template <bool has_skip_bitmap_col>
 void MemTable::_aggregate_two_row_in_block(vectorized::MutableBlock& mutable_block,
                                            RowInBlock* src_row, RowInBlock* dst_row) {
+    // for flexible partial update, the caller must guarantee that either src_row and dst_row
+    // both specify the sequence column, or src_row and dst_row both don't specify the
+    // sequence column
     if (_tablet_schema->has_sequence_col() && _seq_col_idx_in_block >= 0) {
         DCHECK_LT(_seq_col_idx_in_block, mutable_block.columns());
         auto col_ptr = mutable_block.mutable_columns()[_seq_col_idx_in_block].get();
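Before the next hunk, the elided context compares the incoming row's sequence value with the stored one. The merge rule, as a hedged standalone sketch (`RowRef` is an invented type; the real code works on `RowInBlock` and folds non-key columns together with per-column aggregate functions):

```cpp
#include <cstdint>
#include <optional>

struct RowRef {
    int64_t row_pos;
    std::optional<int64_t> seq; // nullopt => the row does not carry the sequence column
};

// Caller guarantees src and dst either both carry or both lack the sequence
// column (the invariant stated above _aggregate_two_row_in_block).
void merge_into(RowRef& dst, const RowRef& src) {
    // Without a sequence column the newer (incoming) row wins by load order;
    // with one, the larger-or-equal sequence value wins.
    if (!dst.seq.has_value() || (src.seq.has_value() && *src.seq >= *dst.seq)) {
        dst.row_pos = src.row_pos; // src becomes the surviving row
    }
    // ...non-key columns are then aggregated; replace_load keeps the winner's value
}
```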
@@ -236,11 +254,31 @@ void MemTable::_aggregate_two_row_in_blo
         dst_row->_row_pos = src_row->_row_pos;
     }
     // dst is non-sequence row, or dst sequence is smaller
-    for (uint32_t cid = _tablet_schema->num_key_columns(); cid < _num_columns; ++cid) {
-        auto col_ptr = mutable_block.mutable_columns()[cid].get();
-        _agg_functions[cid]->add(dst_row->agg_places(cid),
-                                 const_cast<const doris::vectorized::IColumn**>(&col_ptr),
-                                 src_row->_row_pos, _arena.get());
+    if constexpr (!has_skip_bitmap_col) {
+        DCHECK(_skip_bitmap_col_idx == -1);
+        for (uint32_t cid = _tablet_schema->num_key_columns(); cid < _num_columns; ++cid) {
+            auto* col_ptr = mutable_block.mutable_columns()[cid].get();
+            _agg_functions[cid]->add(dst_row->agg_places(cid),
+                                     const_cast<const doris::vectorized::IColumn**>(&col_ptr),
+                                     src_row->_row_pos, _arena.get());
+        }
+    } else {
+        DCHECK(_skip_bitmap_col_idx != -1);
+        DCHECK_LT(_skip_bitmap_col_idx, mutable_block.columns());
+        const BitmapValue& skip_bitmap =
+                assert_cast<vectorized::ColumnBitmap*>(
+                        mutable_block.mutable_columns()[_skip_bitmap_col_idx].get())
+                        ->get_data()[src_row->_row_pos];
+        for (uint32_t cid = _tablet_schema->num_key_columns(); cid < _num_columns; ++cid) {
+            const auto& col = _tablet_schema->column(cid);
+            if (cid != _skip_bitmap_col_idx && skip_bitmap.contains(col.unique_id())) {
+                continue;
+            }
+            auto* col_ptr = mutable_block.mutable_columns()[cid].get();
+            _agg_functions[cid]->add(dst_row->agg_places(cid),
+                                     const_cast<const doris::vectorized::IColumn**>(&col_ptr),
+                                     src_row->_row_pos, _arena.get());
+        }
     }
 }
 
 Status MemTable::_put_into_output(vectorized::Block& in_block) {
@@ -414,7 +452,7 @@ void MemTable::_finalize_one_row(RowInBlock* row,
     }
 }
 
-template <bool is_final>
+template <bool is_final, bool has_skip_bitmap_col>
 void MemTable::_aggregate() {
     SCOPED_RAW_TIMER(&_stat.agg_ns);
     _stat.agg_times++;
@@ -428,37 +466,104 @@ void MemTable::_aggregate() {
     RowInBlock* prev_row = nullptr;
     int row_pos = -1;
     //only init agg if needed
-    for (int i = 0; i < _row_in_blocks.size(); i++) {
-        if (!temp_row_in_blocks.empty() &&
-            (*_vec_row_comparator)(prev_row, _row_in_blocks[i]) == 0) {
-            if (!prev_row->has_init_agg()) {
-                prev_row->init_agg_places(
-                        _arena->aligned_alloc(_total_size_of_aggregate_states, 16),
-                        _offsets_of_aggregate_states.data());
-                for (auto cid = _tablet_schema->num_key_columns(); cid < _num_columns; cid++) {
-                    auto col_ptr = mutable_block.mutable_columns()[cid].get();
-                    auto data = prev_row->agg_places(cid);
-                    _agg_functions[cid]->create(data);
-                    _agg_functions[cid]->add(
-                            data, const_cast<const doris::vectorized::IColumn**>(&col_ptr),
-                            prev_row->_row_pos, _arena.get());
+
+    auto init_for_agg = [&](RowInBlock* row) {
+        row->init_agg_places(_arena->aligned_alloc(_total_size_of_aggregate_states, 16),
+                             _offsets_of_aggregate_states.data());
+        for (auto cid = _tablet_schema->num_key_columns(); cid < _num_columns; cid++) {
+            auto* col_ptr = mutable_block.mutable_columns()[cid].get();
+            auto* data = prev_row->agg_places(cid);
+            _agg_functions[cid]->create(data);
+            _agg_functions[cid]->add(data, const_cast<const doris::vectorized::IColumn**>(&col_ptr),
+                                     prev_row->_row_pos, _arena.get());
+        }
+    };
+
+    if (!has_skip_bitmap_col || _seq_col_idx_in_block == -1) {
+        for (RowInBlock* cur_row : _row_in_blocks) {
+            if (!temp_row_in_blocks.empty() && (*_vec_row_comparator)(prev_row, cur_row) == 0) {
+                if (!prev_row->has_init_agg()) {
+                    init_for_agg(prev_row);
                 }
+                _stat.merged_rows++;
+                _aggregate_two_row_in_block<has_skip_bitmap_col>(mutable_block, cur_row, prev_row);
+            } else {
+                prev_row = cur_row;
+                if (!temp_row_in_blocks.empty()) {
+                    // no more rows to merge for prev row, finalize it
+                    _finalize_one_row<is_final>(temp_row_in_blocks.back(), block_data, row_pos);
+                }
+                temp_row_in_blocks.push_back(prev_row);
+                row_pos++;
             }
-            _stat.merged_rows++;
-            _aggregate_two_row_in_block(mutable_block, _row_in_blocks[i], prev_row);
-        } else {
-            prev_row = _row_in_blocks[i];
-            if (!temp_row_in_blocks.empty()) {
-                // no more rows to merge for prev row, finalize it
-                _finalize_one_row<is_final>(temp_row_in_blocks.back(), block_data, row_pos);
+        }
+        if (!temp_row_in_blocks.empty()) {
+            // finalize the last row
+            _finalize_one_row<is_final>(temp_row_in_blocks.back(), block_data, row_pos);
+        }
+    } else {
+        // For flexible partial update and the table has sequence column, considering the following situation:
+        // there are multiple rows with the same keys in memtable, some of them specify the sequence column,
+        // some of them don't. We can't do the de-duplication in memtable because we can only know the value
+        // of the sequence column of the row which doesn't specify the sequence column in SegmentWriter after we
+        // probe the historical data. So here we can only merge rows that have the sequence column together and
+        // merge rows without the sequence column together, and finally, perform deduplication on them in SegmentWriter.
+
+        // !!ATTENTION!!: there may be rows with the same keys after MemTable::_aggregate() in this situation.
+        RowInBlock* row_with_seq_col = nullptr;
+        int row_pos_with_seq = -1;
+        RowInBlock* row_without_seq_col = nullptr;
+        int row_pos_without_seq = -1;
+
+        auto finalize_rows = [&]() {
+            if (row_with_seq_col != nullptr) {
+                _finalize_one_row<is_final>(row_with_seq_col, block_data, row_pos_with_seq);
+                row_with_seq_col = nullptr;
             }
-            temp_row_in_blocks.push_back(prev_row);
+            if (row_without_seq_col != nullptr) {
+                _finalize_one_row<is_final>(row_without_seq_col, block_data, row_pos_without_seq);
+                row_without_seq_col = nullptr;
+            }
+        };
+        auto add_row = [&](RowInBlock* row, bool with_seq_col) {
+            temp_row_in_blocks.push_back(row);
+            row_pos++;
+            if (with_seq_col) {
+                row_with_seq_col = row;
+                row_pos_with_seq = row_pos;
+            } else {
+                row_without_seq_col = row;
+                row_pos_without_seq = row_pos;
+            }
+        };
+        auto& skip_bitmaps = assert_cast<vectorized::ColumnBitmap*>(
+                                     mutable_block.mutable_columns()[_skip_bitmap_col_idx].get())
+                                     ->get_data();
+        for (auto* cur_row : _row_in_blocks) {
+            const BitmapValue& skip_bitmap = skip_bitmaps[cur_row->_row_pos];
+            bool with_seq_col = !skip_bitmap.contains(_seq_col_unique_id);
+            // compare keys; the keys of row_with_seq_col and row_without_seq_col are the same,
+            // so choose whichever of them is valid
+            prev_row = (row_with_seq_col == nullptr) ? row_without_seq_col : row_with_seq_col;
+            if (prev_row != nullptr && (*_vec_row_comparator)(prev_row, cur_row) == 0) {
+                prev_row = (with_seq_col ? row_with_seq_col : row_without_seq_col);
+                if (prev_row == nullptr) {
+                    add_row(cur_row, with_seq_col);
+                    continue;
+                }
+                if (!prev_row->has_init_agg()) {
+                    init_for_agg(prev_row);
+                }
+                _stat.merged_rows++;
+                _aggregate_two_row_in_block<has_skip_bitmap_col>(mutable_block, cur_row, prev_row);
+            } else {
+                // no more rows to merge for prev rows, finalize them
+                finalize_rows();
+                add_row(cur_row, with_seq_col);
+            }
         }
-    }
-    if (!temp_row_in_blocks.empty()) {
-        // finalize the last low
-        _finalize_one_row<is_final>(temp_row_in_blocks.back(), block_data, row_pos);
+        // finalize the last rows
+        finalize_rows();
     }
     if constexpr (!is_final) {
         // if is not final, we collect the agg results to input_block and then continue to insert
@@ -480,13 +585,13 @@ void MemTable::shrink_memtable_by_agg() {
     }
     size_t same_keys_num = _sort();
     if (same_keys_num != 0) {
-        _aggregate<false>();
+        (_skip_bitmap_col_idx == -1) ? _aggregate<false>() : _aggregate<false, true>();
_aggregate<false, false>() : _aggregate<false, true>(); } } bool MemTable::need_flush() const { auto max_size = config::write_buffer_size; - if (_is_partial_update) { + if (_partial_update_mode == UniqueKeyUpdateModePB::UPDATE_FIXED_COLUMNS) { auto update_columns_size = _num_columns; max_size = max_size * update_columns_size / _tablet_schema->num_columns(); max_size = max_size > 1048576 ? max_size : 1048576; @@ -512,11 +617,11 @@ Status MemTable::_to_block(std::unique_ptr<vectorized::Block>* res) { RETURN_IF_ERROR(_put_into_output(in_block)); } } else { - _aggregate<true>(); + (_skip_bitmap_col_idx == -1) ? _aggregate<true, false>() : _aggregate<true, true>(); } if (_keys_type == KeysType::UNIQUE_KEYS && _enable_unique_key_mow && !_tablet_schema->cluster_key_idxes().empty()) { - if (_is_partial_update) { + if (_partial_update_mode != UniqueKeyUpdateModePB::UPSERT) { return Status::InternalError( "Partial update for mow with cluster keys is not supported"); } diff --git a/be/src/olap/memtable.h b/be/src/olap/memtable.h index 4ae92c2d2d8949..77ff2e886bff36 100644 --- a/be/src/olap/memtable.h +++ b/be/src/olap/memtable.h @@ -208,6 +208,7 @@ class MemTable { private: // for vectorized + template <bool has_skip_bitmap_col> void _aggregate_two_row_in_block(vectorized::MutableBlock& mutable_block, RowInBlock* new_row, RowInBlock* row_in_skiplist); @@ -218,8 +219,8 @@ class MemTable { std::atomic<MemType> _mem_type; int64_t _tablet_id; bool _enable_unique_key_mow = false; - bool _is_partial_update = false; bool _is_flush_success = false; + UniqueKeyUpdateModePB _partial_update_mode {UniqueKeyUpdateModePB::UPSERT}; const KeysType _keys_type; std::shared_ptr<TabletSchema> _tablet_schema; @@ -255,7 +256,7 @@ class MemTable { template <bool is_final> void _finalize_one_row(RowInBlock* row, const vectorized::ColumnsWithTypeAndName& block_data, int row_pos); - template <bool is_final> + template <bool is_final, bool has_skip_bitmap_col> void _aggregate(); Status _put_into_output(vectorized::Block& in_block); bool _is_first_insertion; @@ -268,6 +269,8 @@ class MemTable { size_t _num_columns; int32_t _seq_col_idx_in_block = -1; + int32_t _skip_bitmap_col_idx {-1}; + int32_t _seq_col_unique_id {-1}; bool _is_partial_update_and_auto_inc = false; }; // class MemTable diff --git a/be/src/olap/memtable_flush_executor.cpp b/be/src/olap/memtable_flush_executor.cpp index dc911647be8f96..5cdb45281b99ee 100644 --- a/be/src/olap/memtable_flush_executor.cpp +++ b/be/src/olap/memtable_flush_executor.cpp @@ -100,7 +100,16 @@ Status FlushToken::submit(std::shared_ptr<MemTable> mem_table) { int64_t submit_task_time = MonotonicNanos(); auto task = MemtableFlushTask::create_shared( shared_from_this(), mem_table, _rowset_writer->allocate_segment_id(), submit_task_time); - Status ret = _thread_pool->submit(std::move(task)); + // NOTE: we should guarantee that the WorkloadGroup is not destructed when submitting a memtable flush task, + // because currently a WorkloadGroup can only be destroyed once all queries in the group are finished, + // which does not take into account whether the load channel has finished. + std::shared_ptr<WorkloadGroup> wg_sptr = _wg_wptr.lock(); + ThreadPool* wg_thread_pool = nullptr; + if (wg_sptr) { + wg_thread_pool = wg_sptr->get_memtable_flush_pool_ptr(); + } + Status ret = wg_thread_pool ? wg_thread_pool->submit(std::move(task)) + : _thread_pool->submit(std::move(task)); if (ret.ok()) { // _wait_running_task_finish was executed after this function, so no need to notify _cond here _stats.flush_running_count++; @@ -236,7 +245,8 @@ void MemTableFlushExecutor::init(int num_disk) { // NOTE: we use SERIAL mode here to ensure all mem-tables from one tablet are flushed in order.
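A note on the pattern above: every call site performs one runtime check on _skip_bitmap_col_idx and then jumps into a fully specialized _aggregate<is_final, has_skip_bitmap_col> instantiation, so the per-row merge loops themselves carry no runtime mode flags. A minimal, self-contained sketch of this dispatch idiom (the Table/aggregate names are illustrative, not the actual MemTable API):

    #include <cstdint>
    #include <iostream>

    struct Table {
        int32_t skip_bitmap_col_idx {-1}; // -1 means there is no skip-bitmap column

        template <bool is_final, bool has_skip_bitmap_col>
        void aggregate() {
            // both conditions are resolved at compile time inside the hot loop
            if constexpr (has_skip_bitmap_col) {
                std::cout << "merge, skipping columns marked absent in each row's bitmap\n";
            } else {
                std::cout << "merge every value column\n";
            }
            if constexpr (!is_final) {
                std::cout << "keep intermediate aggregate states for later merges\n";
            }
        }

        void shrink_by_agg() {
            // a single runtime branch selects the specialization
            (skip_bitmap_col_idx == -1) ? aggregate<false, false>() : aggregate<false, true>();
        }
    };

    int main() {
        Table t;
        t.shrink_by_agg();          // plain path
        t.skip_bitmap_col_idx = 5;
        t.shrink_by_agg();          // flexible partial update path
    }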
Status MemTableFlushExecutor::create_flush_token(std::shared_ptr<FlushToken>& flush_token, std::shared_ptr<RowsetWriter> rowset_writer, - bool is_high_priority) { + bool is_high_priority, + std::shared_ptr<WorkloadGroup> wg_sptr) { switch (rowset_writer->type()) { case ALPHA_ROWSET: // alpha rowset do not support flush in CONCURRENT. and not support alpha rowset now. @@ -244,7 +254,7 @@ Status MemTableFlushExecutor::create_flush_token(std::shared_ptr<FlushToken>& fl case BETA_ROWSET: { // beta rowset can be flush in CONCURRENT, because each memtable using a new segment writer. ThreadPool* pool = is_high_priority ? _high_prio_flush_pool.get() : _flush_pool.get(); - flush_token = FlushToken::create_shared(pool); + flush_token = FlushToken::create_shared(pool, wg_sptr); flush_token->set_rowset_writer(rowset_writer); return Status::OK(); } @@ -253,18 +263,6 @@ Status MemTableFlushExecutor::create_flush_token(std::shared_ptr<FlushToken>& fl } } -Status MemTableFlushExecutor::create_flush_token(std::shared_ptr<FlushToken>& flush_token, - std::shared_ptr<RowsetWriter> rowset_writer, - ThreadPool* wg_flush_pool_ptr) { - if (rowset_writer->type() == BETA_ROWSET) { - flush_token = FlushToken::create_shared(wg_flush_pool_ptr); - } else { - return Status::InternalError("not support alpha rowset load now."); - } - flush_token->set_rowset_writer(rowset_writer); - return Status::OK(); -} - void MemTableFlushExecutor::_register_metrics() { REGISTER_HOOK_METRIC(flush_thread_pool_queue_size, [this]() { return _flush_pool->get_queue_size(); }); diff --git a/be/src/olap/memtable_flush_executor.h b/be/src/olap/memtable_flush_executor.h index 25c5a37afba3ac..27e8e8a9b0ebe0 100644 --- a/be/src/olap/memtable_flush_executor.h +++ b/be/src/olap/memtable_flush_executor.h @@ -34,6 +34,7 @@ namespace doris { class DataDir; class MemTable; class RowsetWriter; +class WorkloadGroup; // the statistic of a certain flush handler. // use atomic because it may be updated by multi threads @@ -59,7 +60,8 @@ class FlushToken : public std::enable_shared_from_this<FlushToken> { ENABLE_FACTORY_CREATOR(FlushToken); public: - FlushToken(ThreadPool* thread_pool) : _flush_status(Status::OK()), _thread_pool(thread_pool) {} + FlushToken(ThreadPool* thread_pool, std::shared_ptr<WorkloadGroup> wg_sptr) + : _flush_status(Status::OK()), _thread_pool(thread_pool), _wg_wptr(wg_sptr) {} Status submit(std::shared_ptr<MemTable> mem_table); @@ -108,6 +110,8 @@ class FlushToken : public std::enable_shared_from_this<FlushToken> { std::mutex _mutex; std::condition_variable _cond; + + std::weak_ptr<WorkloadGroup> _wg_wptr; }; // MemTableFlushExecutor is responsible for flushing memtables to disk.
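With this change FlushToken keeps only a std::weak_ptr<WorkloadGroup> and upgrades it per submission, falling back to the executor's own pool once the group is gone, so the token never extends the group's lifetime. A self-contained sketch of that lock-or-fallback idiom (Pool and Token are invented stand-ins, not the Doris types):

    #include <iostream>
    #include <memory>

    struct Pool {
        const char* name;
        void submit() { std::cout << "submitted on " << name << "\n"; }
    };

    struct Token {
        std::weak_ptr<Pool> wg_pool; // observing only: does not keep the group alive
        Pool* fallback;

        void submit() {
            // upgrade for the duration of the call; nullptr means the group died
            std::shared_ptr<Pool> wg = wg_pool.lock();
            (wg ? wg.get() : fallback)->submit();
        }
    };

    int main() {
        Pool global {"global flush pool"};
        auto wg = std::make_shared<Pool>(Pool {"workload-group pool"});
        Token token {wg, &global};
        token.submit(); // workload-group pool
        wg.reset();     // the group is destroyed
        token.submit(); // global flush pool
    }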
@@ -133,11 +137,8 @@ class MemTableFlushExecutor { void init(int num_disk); Status create_flush_token(std::shared_ptr& flush_token, - std::shared_ptr rowset_writer, bool is_high_priority); - - Status create_flush_token(std::shared_ptr& flush_token, - std::shared_ptr rowset_writer, - ThreadPool* wg_flush_pool_ptr); + std::shared_ptr rowset_writer, bool is_high_priority, + std::shared_ptr wg_sptr); private: void _register_metrics(); diff --git a/be/src/olap/memtable_writer.cpp b/be/src/olap/memtable_writer.cpp index e8123c48eccd29..88532646b66349 100644 --- a/be/src/olap/memtable_writer.cpp +++ b/be/src/olap/memtable_writer.cpp @@ -65,7 +65,7 @@ MemTableWriter::~MemTableWriter() { Status MemTableWriter::init(std::shared_ptr rowset_writer, TabletSchemaSPtr tablet_schema, std::shared_ptr partial_update_info, - ThreadPool* wg_flush_pool_ptr, bool unique_key_mow) { + std::shared_ptr wg_sptr, bool unique_key_mow) { _rowset_writer = rowset_writer; _tablet_schema = tablet_schema; _unique_key_mow = unique_key_mow; @@ -77,19 +77,9 @@ Status MemTableWriter::init(std::shared_ptr rowset_writer, // create flush handler // by assigning segment_id to memtable before submiting to flush executor, // we can make sure same keys sort in the same order in all replicas. - if (wg_flush_pool_ptr) { - RETURN_IF_ERROR( - ExecEnv::GetInstance() - ->storage_engine() - .memtable_flush_executor() - ->create_flush_token(_flush_token, _rowset_writer, wg_flush_pool_ptr)); - } else { - RETURN_IF_ERROR( - ExecEnv::GetInstance() - ->storage_engine() - .memtable_flush_executor() - ->create_flush_token(_flush_token, _rowset_writer, _req.is_high_priority)); - } + RETURN_IF_ERROR( + ExecEnv::GetInstance()->storage_engine().memtable_flush_executor()->create_flush_token( + _flush_token, _rowset_writer, _req.is_high_priority, wg_sptr)); _is_init = true; return Status::OK(); diff --git a/be/src/olap/memtable_writer.h b/be/src/olap/memtable_writer.h index ec44348b4a9e11..fb07e740fa3cf6 100644 --- a/be/src/olap/memtable_writer.h +++ b/be/src/olap/memtable_writer.h @@ -52,6 +52,7 @@ class SlotDescriptor; class OlapTableSchemaParam; class RowsetWriter; struct FlushStatistic; +class WorkloadGroup; namespace vectorized { class Block; @@ -67,7 +68,7 @@ class MemTableWriter { Status init(std::shared_ptr rowset_writer, TabletSchemaSPtr tablet_schema, std::shared_ptr partial_update_info, - ThreadPool* wg_flush_pool_ptr, bool unique_key_mow = false); + std::shared_ptr wg_sptr, bool unique_key_mow = false); Status write(const vectorized::Block* block, const std::vector& row_idxs); diff --git a/be/src/olap/merger.cpp b/be/src/olap/merger.cpp index ab034123ac883c..a79434551b5cc1 100644 --- a/be/src/olap/merger.cpp +++ b/be/src/olap/merger.cpp @@ -20,6 +20,7 @@ #include #include #include +#include #include #include @@ -91,6 +92,8 @@ Status Merger::vmerge_rowsets(BaseTabletSPtr tablet, ReaderType reader_type, if (stats_output && stats_output->rowid_conversion) { reader_params.record_rowids = true; + reader_params.rowid_conversion = stats_output->rowid_conversion; + stats_output->rowid_conversion->set_dst_rowset_id(dst_rowset_writer->rowset_id()); } reader_params.return_columns.resize(cur_tablet_schema.num_columns()); @@ -98,17 +101,6 @@ Status Merger::vmerge_rowsets(BaseTabletSPtr tablet, ReaderType reader_type, reader_params.origin_return_columns = &reader_params.return_columns; RETURN_IF_ERROR(reader.init(reader_params)); - if (reader_params.record_rowids) { - stats_output->rowid_conversion->set_dst_rowset_id(dst_rowset_writer->rowset_id()); - // 
init segment rowid map for rowid conversion - std::vector segment_num_rows; - for (auto& rs_split : reader_params.rs_splits) { - RETURN_IF_ERROR(rs_split.rs_reader->get_segment_num_rows(&segment_num_rows)); - stats_output->rowid_conversion->init_segment_map( - rs_split.rs_reader->rowset()->rowset_id(), segment_num_rows); - } - } - vectorized::Block block = cur_tablet_schema.create_block(reader_params.return_columns); size_t output_rows = 0; bool eof = false; @@ -274,6 +266,8 @@ Status Merger::vertical_compact_one_group( if (is_key && stats_output && stats_output->rowid_conversion) { reader_params.record_rowids = true; + reader_params.rowid_conversion = stats_output->rowid_conversion; + stats_output->rowid_conversion->set_dst_rowset_id(dst_rowset_writer->rowset_id()); } reader_params.return_columns = column_group; @@ -281,17 +275,6 @@ Status Merger::vertical_compact_one_group( reader_params.batch_size = batch_size; RETURN_IF_ERROR(reader.init(reader_params, sample_info)); - if (reader_params.record_rowids) { - stats_output->rowid_conversion->set_dst_rowset_id(dst_rowset_writer->rowset_id()); - // init segment rowid map for rowid conversion - std::vector segment_num_rows; - for (auto& rs_split : reader_params.rs_splits) { - RETURN_IF_ERROR(rs_split.rs_reader->get_segment_num_rows(&segment_num_rows)); - stats_output->rowid_conversion->init_segment_map( - rs_split.rs_reader->rowset()->rowset_id(), segment_num_rows); - } - } - vectorized::Block block = tablet_schema.create_block(reader_params.return_columns); size_t output_rows = 0; bool eof = false; diff --git a/be/src/olap/metadata_adder.h b/be/src/olap/metadata_adder.h new file mode 100644 index 00000000000000..bdc9e7a398d8a3 --- /dev/null +++ b/be/src/olap/metadata_adder.h @@ -0,0 +1,227 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
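The new be/src/olap/metadata_adder.h that begins here tracks, per metadata type, the number of live objects and their bytes through the CRTP: each tracked class derives from MetadataAdder<itself>, and the constructor/destructor pair keeps the per-type counters in sync. A much-reduced, hypothetical sketch of the idiom (Counted and RowsetMetaLike are illustrative names, and plain atomics stand in for the bvar counters):

    #include <atomic>
    #include <cstdint>
    #include <iostream>

    template <typename T>
    struct Counted {
        // one counter pair per tracked type T
        static inline std::atomic<int64_t> num {0};
        static inline std::atomic<int64_t> bytes {0};
        Counted() { ++num; bytes += sizeof(T); }
        Counted(const Counted&) { ++num; bytes += sizeof(T); }
        ~Counted() { --num; bytes -= sizeof(T); }
    };

    struct RowsetMetaLike : Counted<RowsetMetaLike> {
        int64_t payload[4] {};
    };

    int main() {
        {
            RowsetMetaLike a;
            RowsetMetaLike b(a);
            std::cout << Counted<RowsetMetaLike>::num << "\n"; // prints 2
        }
        std::cout << Counted<RowsetMetaLike>::num << "\n";     // prints 0
    }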
+ +#pragma once + +#include +#include + +namespace doris { + +inline bvar::Adder<int64_t> g_rowset_meta_mem_bytes("doris_rowset_meta_mem_bytes"); +inline bvar::Adder<int64_t> g_rowset_meta_num("doris_rowset_meta_num"); + +inline bvar::Adder<int64_t> g_tablet_meta_mem_bytes("doris_tablet_meta_mem_bytes"); +inline bvar::Adder<int64_t> g_tablet_meta_num("doris_tablet_meta_num"); + +inline bvar::Adder<int64_t> g_tablet_column_mem_bytes("doris_tablet_column_mem_bytes"); +inline bvar::Adder<int64_t> g_tablet_column_num("doris_tablet_column_num"); + +inline bvar::Adder<int64_t> g_tablet_index_mem_bytes("doris_tablet_index_mem_bytes"); +inline bvar::Adder<int64_t> g_tablet_index_num("doris_tablet_index_num"); + +inline bvar::Adder<int64_t> g_tablet_schema_mem_bytes("doris_tablet_schema_mem_bytes"); +inline bvar::Adder<int64_t> g_tablet_schema_num("doris_tablet_schema_num"); + +inline bvar::Adder<int64_t> g_segment_mem_bytes("doris_segment_mem_bytes"); +inline bvar::Adder<int64_t> g_segment_num("doris_segment_num"); + +inline bvar::Adder<int64_t> g_column_reader_mem_bytes("doris_column_reader_mem_bytes"); +inline bvar::Adder<int64_t> g_column_reader_num("doris_column_reader_num"); + +inline bvar::Adder<int64_t> g_bitmap_index_reader_mem_bytes("doris_bitmap_index_reader_mem_bytes"); +inline bvar::Adder<int64_t> g_bitmap_index_reader_num("doris_bitmap_index_reader_num"); + +inline bvar::Adder<int64_t> g_bloom_filter_index_reader_mem_bytes( + "doris_bloom_filter_index_reader_mem_bytes"); +inline bvar::Adder<int64_t> g_bloom_filter_index_reader_num("doris_bloom_filter_index_reader_num"); + +inline bvar::Adder<int64_t> g_index_page_reader_mem_bytes("doris_index_page_reader_mem_bytes"); +inline bvar::Adder<int64_t> g_index_page_reader_num("doris_index_page_reader_num"); + +inline bvar::Adder<int64_t> g_indexed_column_reader_mem_bytes( + "doris_indexed_column_reader_mem_bytes"); +inline bvar::Adder<int64_t> g_indexed_column_reader_num("doris_indexed_column_reader_num"); + +inline bvar::Adder<int64_t> g_inverted_index_reader_mem_bytes( + "doris_inverted_index_reader_mem_bytes"); +inline bvar::Adder<int64_t> g_inverted_index_reader_num("doris_inverted_index_reader_num"); + +inline bvar::Adder<int64_t> g_ordinal_index_reader_mem_bytes( + "doris_ordinal_index_reader_mem_bytes"); +inline bvar::Adder<int64_t> g_ordinal_index_reader_num("doris_ordinal_index_reader_num"); + +inline bvar::Adder<int64_t> g_zone_map_index_reader_mem_bytes( + "doris_zone_map_index_reader_mem_bytes"); +inline bvar::Adder<int64_t> g_zone_map_index_reader_num("doris_zone_map_index_reader_num"); + +class RowsetMeta; +class TabletMeta; +class TabletColumn; +class TabletIndex; +class TabletSchema; + +namespace segment_v2 { +class Segment; +class ColumnReader; +class BitmapIndexReader; +class BloomFilterIndexReader; +class IndexPageReader; +class IndexedColumnReader; +class InvertedIndexReader; +class OrdinalIndexReader; +class ZoneMapIndexReader; +}; // namespace segment_v2 + +/* + When a derived class extends MetadataAdder, its instance count and the memory of its fixed-length fields are counted automatically. + But if the class has variable-length fields, you should override get_metadata_size and call update_metadata_size whenever the class's memory changes. + + There are some special situations that need to be noted: + 1. when the derived class overrides the copy constructor, update the memory size (call update_metadata_size) if the derived class's + memory changes in its copy constructor, or if it does not call MetadataAdder's copy constructor; + 2. when the derived class overrides operator=, update the memory size (call update_metadata_size) if the derived class has variable-length fields. + + Anyway, you should update the recorded size whenever the derived class's memory changes.
+*/ + +template <typename T> +class MetadataAdder { +public: + MetadataAdder(); + +protected: + MetadataAdder(const MetadataAdder& other); + + virtual ~MetadataAdder(); + + virtual int64_t get_metadata_size() const { return sizeof(T); } + + void update_metadata_size(); + + MetadataAdder& operator=(const MetadataAdder& other) = default; + +private: + int64_t _current_meta_size {0}; + + void add_mem_size(int64_t val); + + void add_num(int64_t val); +}; + +template <typename T> +MetadataAdder<T>::MetadataAdder(const MetadataAdder<T>& other) { + this->_current_meta_size = other._current_meta_size; + add_num(1); + add_mem_size(this->_current_meta_size); +} + +template <typename T> +MetadataAdder<T>::MetadataAdder() { + this->_current_meta_size = sizeof(T); + add_mem_size(this->_current_meta_size); + add_num(1); +} + +template <typename T> +MetadataAdder<T>::~MetadataAdder() { + add_mem_size(-_current_meta_size); + add_num(-1); +} + +template <typename T> +void MetadataAdder<T>::update_metadata_size() { + int64_t old_size = _current_meta_size; + _current_meta_size = get_metadata_size(); + int64_t size_diff = _current_meta_size - old_size; + + add_mem_size(size_diff); +} + +template <typename T> +void MetadataAdder<T>::add_mem_size(int64_t val) { + if (val == 0) { + return; + } + if constexpr (std::is_same_v<T, RowsetMeta>) { + g_rowset_meta_mem_bytes << val; + } else if constexpr (std::is_same_v<T, TabletMeta>) { + g_tablet_meta_mem_bytes << val; + } else if constexpr (std::is_same_v<T, TabletColumn>) { + g_tablet_column_mem_bytes << val; + } else if constexpr (std::is_same_v<T, TabletIndex>) { + g_tablet_index_mem_bytes << val; + } else if constexpr (std::is_same_v<T, TabletSchema>) { + g_tablet_schema_mem_bytes << val; + } else if constexpr (std::is_same_v<T, segment_v2::Segment>) { + g_segment_mem_bytes << val; + } else if constexpr (std::is_same_v<T, segment_v2::ColumnReader>) { + g_column_reader_mem_bytes << val; + } else if constexpr (std::is_same_v<T, segment_v2::BitmapIndexReader>) { + g_bitmap_index_reader_mem_bytes << val; + } else if constexpr (std::is_same_v<T, segment_v2::BloomFilterIndexReader>) { + g_bloom_filter_index_reader_mem_bytes << val; + } else if constexpr (std::is_same_v<T, segment_v2::IndexPageReader>) { + g_index_page_reader_mem_bytes << val; + } else if constexpr (std::is_same_v<T, segment_v2::IndexedColumnReader>) { + g_indexed_column_reader_mem_bytes << val; + } else if constexpr (std::is_same_v<T, segment_v2::InvertedIndexReader>) { + g_inverted_index_reader_mem_bytes << val; + } else if constexpr (std::is_same_v<T, segment_v2::OrdinalIndexReader>) { + g_ordinal_index_reader_mem_bytes << val; + } else if constexpr (std::is_same_v<T, segment_v2::ZoneMapIndexReader>) { + g_zone_map_index_reader_mem_bytes << val; + } +} + +template <typename T> +void MetadataAdder<T>::add_num(int64_t val) { + if (val == 0) { + return; + } + if constexpr (std::is_same_v<T, RowsetMeta>) { + g_rowset_meta_num << val; + } else if constexpr (std::is_same_v<T, TabletMeta>) { + g_tablet_meta_num << val; + } else if constexpr (std::is_same_v<T, TabletColumn>) { + g_tablet_column_num << val; + } else if constexpr (std::is_same_v<T, TabletIndex>) { + g_tablet_index_num << val; + } else if constexpr (std::is_same_v<T, TabletSchema>) { + g_tablet_schema_num << val; + } else if constexpr (std::is_same_v<T, segment_v2::Segment>) { + g_segment_num << val; + } else if constexpr (std::is_same_v<T, segment_v2::ColumnReader>) { + g_column_reader_num << val; + } else if constexpr (std::is_same_v<T, segment_v2::BitmapIndexReader>) { + g_bitmap_index_reader_num << val; + } else if constexpr (std::is_same_v<T, segment_v2::BloomFilterIndexReader>) { + g_bloom_filter_index_reader_num << val; + } else if constexpr (std::is_same_v<T, segment_v2::IndexPageReader>) { + g_index_page_reader_num << val; + } else if constexpr (std::is_same_v<T, segment_v2::IndexedColumnReader>) { + g_indexed_column_reader_num << val; + } else if constexpr (std::is_same_v<T, segment_v2::InvertedIndexReader>) { + g_inverted_index_reader_num << val; + } else if constexpr (std::is_same_v<T, segment_v2::OrdinalIndexReader>) { + g_ordinal_index_reader_num << val; + } else if constexpr (std::is_same_v<T, segment_v2::ZoneMapIndexReader>) { + g_zone_map_index_reader_num << val; + } +} + +}; // namespace doris \ No newline at end of file diff --git a/be/src/olap/olap_common.h
b/be/src/olap/olap_common.h index c1d3038050fbd4..d3bd0f0a3a2436 100644 --- a/be/src/olap/olap_common.h +++ b/be/src/olap/olap_common.h @@ -305,24 +305,22 @@ struct OlapReaderStatistics { // block_load_ns // block_init_ns // block_init_seek_ns - // block_conditions_filtered_ns - // first_read_ns - // block_first_read_seek_ns + // generate_row_ranges_ns + // predicate_column_read_ns + // predicate_column_read_seek_ns // lazy_read_ns // block_lazy_read_seek_ns int64_t block_init_ns = 0; int64_t block_init_seek_num = 0; int64_t block_init_seek_ns = 0; - int64_t first_read_ns = 0; - int64_t second_read_ns = 0; - int64_t block_first_read_seek_num = 0; - int64_t block_first_read_seek_ns = 0; + int64_t predicate_column_read_ns = 0; + int64_t non_predicate_read_ns = 0; + int64_t predicate_column_read_seek_num = 0; + int64_t predicate_column_read_seek_ns = 0; int64_t lazy_read_ns = 0; int64_t block_lazy_read_seek_num = 0; int64_t block_lazy_read_seek_ns = 0; - int64_t block_convert_ns = 0; - int64_t raw_rows_read = 0; int64_t rows_vec_cond_filtered = 0; @@ -351,11 +349,10 @@ struct OlapReaderStatistics { int64_t rows_del_by_bitmap = 0; // the number of rows filtered by various column indexes. int64_t rows_conditions_filtered = 0; - int64_t block_conditions_filtered_ns = 0; - int64_t block_conditions_filtered_bf_ns = 0; - int64_t block_conditions_filtered_zonemap_ns = 0; - int64_t block_conditions_filtered_zonemap_rp_ns = 0; - int64_t block_conditions_filtered_dict_ns = 0; + int64_t generate_row_ranges_ns = 0; + int64_t generate_row_ranges_by_bf_ns = 0; + int64_t generate_row_ranges_by_zonemap_ns = 0; + int64_t generate_row_ranges_by_dict_ns = 0; int64_t index_load_ns = 0; @@ -372,7 +369,6 @@ struct OlapReaderStatistics { int64_t inverted_index_query_cache_miss = 0; int64_t inverted_index_query_null_bitmap_timer = 0; int64_t inverted_index_query_bitmap_copy_timer = 0; - int64_t inverted_index_query_bitmap_op_timer = 0; int64_t inverted_index_searcher_open_timer = 0; int64_t inverted_index_searcher_search_timer = 0; int64_t inverted_index_searcher_cache_hit = 0; diff --git a/be/src/olap/olap_server.cpp b/be/src/olap/olap_server.cpp index 020d151d16b849..8fae8887d7a772 100644 --- a/be/src/olap/olap_server.cpp +++ b/be/src/olap/olap_server.cpp @@ -78,6 +78,7 @@ #include "runtime/memory/cache_manager.h" #include "runtime/memory/global_memory_arbitrator.h" #include "util/countdown_latch.h" +#include "util/debug_points.h" #include "util/doris_metrics.h" #include "util/mem_info.h" #include "util/thread.h" @@ -1134,6 +1135,8 @@ Status StorageEngine::submit_seg_compaction_task(std::shared_ptrget_tablet(tablet_id); + DBUG_EXECUTE_IF("StorageEngine::process_index_change_task_tablet_nullptr", + { tablet = nullptr; }) if (tablet == nullptr) { LOG(WARNING) << "tablet: " << tablet_id << " not exist"; return Status::InternalError("tablet not exist, tablet_id={}.", tablet_id); diff --git a/be/src/olap/partial_update_info.cpp b/be/src/olap/partial_update_info.cpp index 247353103dfdcf..3e8c5d9750c81a 100644 --- a/be/src/olap/partial_update_info.cpp +++ b/be/src/olap/partial_update_info.cpp @@ -33,42 +33,53 @@ namespace doris { -void PartialUpdateInfo::init(const TabletSchema& tablet_schema, bool partial_update, +void PartialUpdateInfo::init(const TabletSchema& tablet_schema, + UniqueKeyUpdateModePB unique_key_update_mode, const std::set& partial_update_cols, bool is_strict_mode, int64_t timestamp_ms, int32_t nano_seconds, const std::string& timezone, const std::string& auto_increment_column, - int64_t cur_max_version) 
{ - is_partial_update = partial_update; + int32_t sequence_map_col_uid, int64_t cur_max_version) { + partial_update_mode = unique_key_update_mode; partial_update_input_columns = partial_update_cols; max_version_in_flush_phase = cur_max_version; + sequence_map_col_unqiue_id = sequence_map_col_uid; this->timestamp_ms = timestamp_ms; this->nano_seconds = nano_seconds; this->timezone = timezone; missing_cids.clear(); update_cids.clear(); + for (auto i = 0; i < tablet_schema.num_columns(); ++i) { - auto tablet_column = tablet_schema.column(i); - if (!partial_update_input_columns.contains(tablet_column.name())) { - missing_cids.emplace_back(i); - if (!tablet_column.has_default_value() && !tablet_column.is_nullable() && - tablet_schema.auto_increment_column() != tablet_column.name()) { - can_insert_new_rows_in_partial_update = false; + if (partial_update_mode == UniqueKeyUpdateModePB::UPDATE_FIXED_COLUMNS) { + auto tablet_column = tablet_schema.column(i); + if (!partial_update_input_columns.contains(tablet_column.name())) { + missing_cids.emplace_back(i); + if (!tablet_column.has_default_value() && !tablet_column.is_nullable() && + tablet_schema.auto_increment_column() != tablet_column.name()) { + can_insert_new_rows_in_partial_update = false; + } + } else { + update_cids.emplace_back(i); + } + if (auto_increment_column == tablet_column.name()) { + is_schema_contains_auto_inc_column = true; } } else { - update_cids.emplace_back(i); - } - if (auto_increment_column == tablet_column.name()) { - is_schema_contains_auto_inc_column = true; + // in flexible partial update, missing cids is all non sort keys' cid + if (i >= tablet_schema.num_key_columns()) { + missing_cids.emplace_back(i); + } } } this->is_strict_mode = is_strict_mode; is_input_columns_contains_auto_inc_column = - is_partial_update && partial_update_input_columns.contains(auto_increment_column); + is_fixed_partial_update() && + partial_update_input_columns.contains(auto_increment_column); _generate_default_values_for_missing_cids(tablet_schema); } void PartialUpdateInfo::to_pb(PartialUpdateInfoPB* partial_update_info_pb) const { - partial_update_info_pb->set_is_partial_update(is_partial_update); + partial_update_info_pb->set_partial_update_mode(partial_update_mode); partial_update_info_pb->set_max_version_in_flush_phase(max_version_in_flush_phase); for (const auto& col : partial_update_input_columns) { partial_update_info_pb->add_partial_update_input_columns(col); @@ -95,7 +106,16 @@ void PartialUpdateInfo::to_pb(PartialUpdateInfoPB* partial_update_info_pb) const } void PartialUpdateInfo::from_pb(PartialUpdateInfoPB* partial_update_info_pb) { - is_partial_update = partial_update_info_pb->is_partial_update(); + if (!partial_update_info_pb->has_partial_update_mode()) { + // for backward compatibility + if (partial_update_info_pb->is_partial_update()) { + partial_update_mode = UniqueKeyUpdateModePB::UPDATE_FIXED_COLUMNS; + } else { + partial_update_mode = UniqueKeyUpdateModePB::UPSERT; + } + } else { + partial_update_mode = partial_update_info_pb->partial_update_mode(); + } max_version_in_flush_phase = partial_update_info_pb->has_max_version_in_flush_phase() ? 
partial_update_info_pb->max_version_in_flush_phase() : -1; @@ -130,13 +150,27 @@ void PartialUpdateInfo::from_pb(PartialUpdateInfoPB* partial_update_info_pb) { } std::string PartialUpdateInfo::summary() const { + std::string mode; + switch (partial_update_mode) { + case UniqueKeyUpdateModePB::UPSERT: + mode = "upsert"; + break; + case UniqueKeyUpdateModePB::UPDATE_FIXED_COLUMNS: + mode = "fixed partial update"; + break; + case UniqueKeyUpdateModePB::UPDATE_FLEXIBLE_COLUMNS: + mode = "flexible partial update"; + break; + } return fmt::format( - "update_cids={}, missing_cids={}, is_strict_mode={}, max_version_in_flush_phase={}", - update_cids.size(), missing_cids.size(), is_strict_mode, max_version_in_flush_phase); + "mode={}, update_cids={}, missing_cids={}, is_strict_mode={}, " + "max_version_in_flush_phase={}", + mode, update_cids.size(), missing_cids.size(), is_strict_mode, + max_version_in_flush_phase); } -Status PartialUpdateInfo::handle_non_strict_mode_not_found_error( - const TabletSchema& tablet_schema) { +Status PartialUpdateInfo::handle_not_found_error_for_fixed_partial_update( + const TabletSchema& tablet_schema) const { if (!can_insert_new_rows_in_partial_update) { std::string error_column; for (auto cid : missing_cids) { @@ -149,13 +183,45 @@ Status PartialUpdateInfo::handle_non_strict_mode_not_found_error( } return Status::Error( "the unmentioned column `{}` should have default value or be nullable " - "for " - "newly inserted rows in non-strict mode partial update", + "for newly inserted rows in non-strict mode partial update", + error_column); + } + return Status::OK(); +} +Status PartialUpdateInfo::handle_not_found_error_for_flexible_partial_update( + const TabletSchema& tablet_schema, BitmapValue* skip_bitmap) const { + DCHECK(skip_bitmap != nullptr); + bool can_insert_new_rows_in_partial_update = true; + std::string error_column; + for (auto cid : missing_cids) { + const TabletColumn& col = tablet_schema.column(cid); + if (skip_bitmap->contains(col.unique_id()) && !col.has_default_value() && + !col.is_nullable() && col.is_auto_increment()) { + error_column = col.name(); + can_insert_new_rows_in_partial_update = false; + break; + } + } + if (!can_insert_new_rows_in_partial_update) { + return Status::Error( + "the unmentioned column `{}` should have default value or be " + "nullable for newly inserted rows in non-strict mode flexible partial update", error_column); } return Status::OK(); } +Status PartialUpdateInfo::handle_non_strict_mode_not_found_error(const TabletSchema& tablet_schema, + BitmapValue* skip_bitmap) const { + if (partial_update_mode == UniqueKeyUpdateModePB::UPDATE_FIXED_COLUMNS) { + RETURN_IF_ERROR(handle_not_found_error_for_fixed_partial_update(tablet_schema)); + } else if (partial_update_mode == UniqueKeyUpdateModePB::UPDATE_FLEXIBLE_COLUMNS) { + RETURN_IF_ERROR( + handle_not_found_error_for_flexible_partial_update(tablet_schema, skip_bitmap)); + } + return Status::OK(); +} + void PartialUpdateInfo::_generate_default_values_for_missing_cids( const TabletSchema& tablet_schema) { for (unsigned int cur_cid : missing_cids) { @@ -199,16 +265,17 @@ void PartialUpdateInfo::_generate_default_values_for_missing_cids( CHECK_EQ(missing_cids.size(), default_values.size()); } -void PartialUpdateReadPlan::prepare_to_read(const RowLocation& row_location, size_t pos) { +void FixedReadPlan::prepare_to_read(const RowLocation& row_location, size_t pos) { plan[row_location.rowset_id][row_location.segment_id].emplace_back(row_location.row_id, pos); } // read columns by read 
plan // read_index: ori_pos-> block_idx -Status PartialUpdateReadPlan::read_columns_by_plan( +Status FixedReadPlan::read_columns_by_plan( const TabletSchema& tablet_schema, const std::vector cids_to_read, const std::map& rsid_to_rowset, vectorized::Block& block, - std::map* read_index, const signed char* __restrict skip_map) const { + std::map* read_index, + const signed char* __restrict delete_signs) const { bool has_row_column = tablet_schema.has_row_store_for_all_columns(); auto mutable_columns = block.mutate_columns(); size_t read_idx = 0; @@ -218,7 +285,7 @@ Status PartialUpdateReadPlan::read_columns_by_plan( CHECK(rowset_iter != rsid_to_rowset.end()); std::vector rids; for (auto [rid, pos] : mappings) { - if (skip_map && skip_map[pos]) { + if (delete_signs && delete_signs[pos]) { continue; } rids.emplace_back(rid); @@ -249,7 +316,7 @@ Status PartialUpdateReadPlan::read_columns_by_plan( return Status::OK(); } -Status PartialUpdateReadPlan::fill_missing_columns( +Status FixedReadPlan::fill_missing_columns( RowsetWriterContext* rowset_ctx, const std::map& rsid_to_rowset, const TabletSchema& tablet_schema, vectorized::Block& full_block, const std::vector& use_default_or_null_flag, bool has_default_or_nullable, @@ -260,7 +327,7 @@ Status PartialUpdateReadPlan::fill_missing_columns( auto old_value_block = tablet_schema.create_block_by_cids(missing_cids); CHECK_EQ(missing_cids.size(), old_value_block.columns()); - // record real pos, key is input line num, value is old_block line num + // segment pos to write -> rowid to read in old_value_block std::map read_index; RETURN_IF_ERROR(read_columns_by_plan(tablet_schema, missing_cids, rsid_to_rowset, old_value_block, &read_index, nullptr)); @@ -284,7 +351,8 @@ Status PartialUpdateReadPlan::fill_missing_columns( // be found in Tablet::lookup_row_key() and `use_default_or_null_flag[idx]` will be false. But we should not // read values from old rows for missing values in this occasion. So we should read the DELETE_SIGN column // to check if a row REALLY exists in the table. 
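The comment above pins down the subtle case this code handles: a key can be found by the point lookup and still be logically absent, because the old row's hidden DELETE_SIGN column is nonzero. A small sketch of the resulting per-cell fill rule (OldCell and fill_missing_cell are hypothetical names, not the Doris API):

    #include <optional>

    struct OldCell {
        int value;
        signed char delete_sign; // nonzero means the old row was deleted
    };

    int fill_missing_cell(bool use_default_or_null, const std::optional<OldCell>& old_cell,
                          int default_value) {
        // reuse the old value only when the old row really exists: it was found
        // by the lookup and its DELETE_SIGN is 0; otherwise fall back to the
        // column default (or null)
        if (use_default_or_null || !old_cell.has_value() || old_cell->delete_sign != 0) {
            return default_value;
        }
        return old_cell->value;
    }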
- auto pos_in_old_block = read_index[idx + segment_start_pos]; + auto segment_pos = idx + segment_start_pos; + auto pos_in_old_block = read_index[segment_pos]; if (use_default_or_null_flag[idx] || (delete_sign_column_data != nullptr && delete_sign_column_data[pos_in_old_block] != 0)) { for (auto i = 0; i < missing_cids.size(); ++i) { @@ -294,7 +362,7 @@ Status PartialUpdateReadPlan::fill_missing_columns( auto& missing_col = mutable_full_columns[missing_cids[i]]; // clang-format off if (tablet_column.has_default_value()) { - missing_col->insert_from(*mutable_default_value_columns[i].get(), 0); + missing_col->insert_from(*mutable_default_value_columns[i], 0); } else if (tablet_column.is_nullable()) { auto* nullable_column = assert_cast(missing_col.get()); @@ -320,8 +388,271 @@ Status PartialUpdateReadPlan::fill_missing_columns( } for (auto i = 0; i < missing_cids.size(); ++i) { mutable_full_columns[missing_cids[i]]->insert_from( - *old_value_block.get_columns_with_type_and_name()[i].column.get(), - pos_in_old_block); + *old_value_block.get_by_position(i).column, pos_in_old_block); + } + } + full_block.set_columns(std::move(mutable_full_columns)); + return Status::OK(); +} + +void FlexibleReadPlan::prepare_to_read(const RowLocation& row_location, size_t pos, + const BitmapValue& skip_bitmap) { + if (!use_row_store) { + for (uint64_t col_uid : skip_bitmap) { + plan[row_location.rowset_id][row_location.segment_id][col_uid].emplace_back( + row_location.row_id, pos); + } + } else { + row_store_plan[row_location.rowset_id][row_location.segment_id].emplace_back( + row_location.row_id, pos); + } +} + +Status FlexibleReadPlan::read_columns_by_plan( + const TabletSchema& tablet_schema, + const std::map& rsid_to_rowset, + vectorized::Block& old_value_block, + std::map>* read_index) const { + auto mutable_columns = old_value_block.mutate_columns(); + + // cid -> next rid to fill in block + std::map next_read_idx; + for (std::size_t cid {0}; cid < tablet_schema.num_columns(); cid++) { + next_read_idx[cid] = 0; + } + + for (const auto& [rowset_id, segment_mappings] : plan) { + for (const auto& [segment_id, uid_mappings] : segment_mappings) { + for (const auto& [col_uid, mappings] : uid_mappings) { + auto rowset_iter = rsid_to_rowset.find(rowset_id); + CHECK(rowset_iter != rsid_to_rowset.end()); + auto cid = tablet_schema.field_index(col_uid); + DCHECK_NE(cid, -1); + DCHECK_GE(cid, tablet_schema.num_key_columns()); + std::vector rids; + for (auto [rid, pos] : mappings) { + rids.emplace_back(rid); + (*read_index)[cid][pos] = next_read_idx[cid]++; + } + + TabletColumn tablet_column = tablet_schema.column(cid); + auto idx = cid - tablet_schema.num_key_columns(); + RETURN_IF_ERROR(doris::BaseTablet::fetch_value_by_rowids( + rowset_iter->second, segment_id, rids, tablet_column, + mutable_columns[idx])); + } + } + } + // !!!ATTENTION!!!: columns in block may have different size because every row has different columns to update + old_value_block.set_columns(std::move(mutable_columns)); + return Status::OK(); +} + +Status FlexibleReadPlan::read_columns_by_plan( + const TabletSchema& tablet_schema, const std::vector& cids_to_read, + const std::map& rsid_to_rowset, + vectorized::Block& old_value_block, std::map* read_index) const { + DCHECK(use_row_store); + auto mutable_columns = old_value_block.mutate_columns(); + size_t read_idx = 0; + for (const auto& [rowset_id, segment_row_mappings] : row_store_plan) { + for (const auto& [segment_id, mappings] : segment_row_mappings) { + auto rowset_iter = 
rsid_to_rowset.find(rowset_id); + CHECK(rowset_iter != rsid_to_rowset.end()); + std::vector rids; + for (auto [rid, pos] : mappings) { + rids.emplace_back(rid); + (*read_index)[pos] = read_idx++; + } + for (size_t cid = 0; cid < mutable_columns.size(); ++cid) { + TabletColumn tablet_column = tablet_schema.column(cids_to_read[cid]); + auto st = doris::BaseTablet::fetch_value_by_rowids( + rowset_iter->second, segment_id, rids, tablet_column, mutable_columns[cid]); + // set read value to output block + if (!st.ok()) { + LOG(WARNING) << "failed to fetch value"; + return st; + } + } + } + } + old_value_block.set_columns(std::move(mutable_columns)); + return Status::OK(); +} + +Status FlexibleReadPlan::fill_non_primary_key_columns( + RowsetWriterContext* rowset_ctx, const std::map& rsid_to_rowset, + const TabletSchema& tablet_schema, vectorized::Block& full_block, + const std::vector& use_default_or_null_flag, bool has_default_or_nullable, + const std::size_t segment_start_pos, const std::size_t block_start_pos, + const vectorized::Block* block, std::vector* skip_bitmaps) const { + auto mutable_full_columns = full_block.mutate_columns(); + + // missing_cids are all non sort key columns' cids + const auto& non_sort_key_cids = rowset_ctx->partial_update_info->missing_cids; + auto old_value_block = tablet_schema.create_block_by_cids(non_sort_key_cids); + CHECK_EQ(non_sort_key_cids.size(), old_value_block.columns()); + + if (!use_row_store) { + RETURN_IF_ERROR(fill_non_primary_key_columns_for_column_store( + rowset_ctx, rsid_to_rowset, tablet_schema, non_sort_key_cids, old_value_block, + mutable_full_columns, use_default_or_null_flag, has_default_or_nullable, + segment_start_pos, block_start_pos, block, skip_bitmaps)); + } else { + RETURN_IF_ERROR(fill_non_primary_key_columns_for_row_store( + rowset_ctx, rsid_to_rowset, tablet_schema, non_sort_key_cids, old_value_block, + mutable_full_columns, use_default_or_null_flag, has_default_or_nullable, + segment_start_pos, block_start_pos, block, skip_bitmaps)); + } + full_block.set_columns(std::move(mutable_full_columns)); + return Status::OK(); +} + +Status FlexibleReadPlan::fill_non_primary_key_columns_for_column_store( + RowsetWriterContext* rowset_ctx, const std::map& rsid_to_rowset, + const TabletSchema& tablet_schema, const std::vector& non_sort_key_cids, + vectorized::Block& old_value_block, vectorized::MutableColumns& mutable_full_columns, + const std::vector& use_default_or_null_flag, bool has_default_or_nullable, + const std::size_t segment_start_pos, const std::size_t block_start_pos, + const vectorized::Block* block, std::vector* skip_bitmaps) const { + // cid -> segment pos to write -> rowid to read in old_value_block + std::map> read_index; + RETURN_IF_ERROR( + read_columns_by_plan(tablet_schema, rsid_to_rowset, old_value_block, &read_index)); + // !!!ATTENTION!!!: columns in old_value_block may have different size because every row has different columns to update + + const auto* delete_sign_column_data = BaseTablet::get_delete_sign_column_data(old_value_block); + // build default value columns + auto default_value_block = old_value_block.clone_empty(); + if (has_default_or_nullable || delete_sign_column_data != nullptr) { + RETURN_IF_ERROR(BaseTablet::generate_default_value_block( + tablet_schema, non_sort_key_cids, rowset_ctx->partial_update_info->default_values, + old_value_block, default_value_block)); + } + + auto fill_one_cell = [&tablet_schema, &read_index]( + const TabletColumn& tablet_column, uint32_t cid, + 
vectorized::MutableColumnPtr& new_col, + const vectorized::IColumn& default_value_col, + const vectorized::IColumn& old_value_col, + const vectorized::IColumn& cur_col, std::size_t block_pos, + std::size_t segment_pos, bool skipped, bool use_default, + const signed char* delete_sign_column_data) { + if (skipped) { + DCHECK(cid != tablet_schema.skip_bitmap_col_idx()); + DCHECK(cid != tablet_schema.version_col_idx()); + DCHECK(!tablet_column.is_row_store_column()); + + auto delete_sign_pos = read_index[tablet_schema.delete_sign_idx()][segment_pos]; + if (use_default || (delete_sign_column_data != nullptr && + delete_sign_column_data[delete_sign_pos] != 0)) { + if (tablet_column.has_default_value()) { + new_col->insert_from(default_value_col, 0); + } else if (tablet_column.is_nullable()) { + assert_cast( + new_col.get()) + ->insert_null_elements(1); + } else { + new_col->insert_default(); + } + } else { + auto pos_in_old_block = read_index.at(cid).at(segment_pos); + new_col->insert_from(old_value_col, pos_in_old_block); + } + } else { + new_col->insert_from(cur_col, block_pos); + } + }; + + // fill all non sort key columns from mutable_old_columns, need to consider default value and null value + for (std::size_t i {0}; i < non_sort_key_cids.size(); i++) { + auto cid = non_sort_key_cids[i]; + const auto& tablet_column = tablet_schema.column(cid); + auto col_uid = tablet_column.unique_id(); + for (auto idx = 0; idx < use_default_or_null_flag.size(); idx++) { + auto segment_pos = segment_start_pos + idx; + auto block_pos = block_start_pos + idx; + + fill_one_cell(tablet_column, cid, mutable_full_columns[cid], + *default_value_block.get_by_position(i).column, + *old_value_block.get_by_position(i).column, + *block->get_by_position(cid).column, block_pos, segment_pos, + skip_bitmaps->at(block_pos).contains(col_uid), + use_default_or_null_flag[idx], delete_sign_column_data); + } + } + return Status::OK(); +} + +Status FlexibleReadPlan::fill_non_primary_key_columns_for_row_store( + RowsetWriterContext* rowset_ctx, const std::map& rsid_to_rowset, + const TabletSchema& tablet_schema, const std::vector& non_sort_key_cids, + vectorized::Block& old_value_block, vectorized::MutableColumns& mutable_full_columns, + const std::vector& use_default_or_null_flag, bool has_default_or_nullable, + const std::size_t segment_start_pos, const std::size_t block_start_pos, + const vectorized::Block* block, std::vector* skip_bitmaps) const { + // segment pos to write -> rowid to read in old_value_block + std::map read_index; + RETURN_IF_ERROR(read_columns_by_plan(tablet_schema, non_sort_key_cids, rsid_to_rowset, + old_value_block, &read_index)); + + const auto* delete_sign_column_data = BaseTablet::get_delete_sign_column_data(old_value_block); + // build default value columns + auto default_value_block = old_value_block.clone_empty(); + if (has_default_or_nullable || delete_sign_column_data != nullptr) { + RETURN_IF_ERROR(BaseTablet::generate_default_value_block( + tablet_schema, non_sort_key_cids, rowset_ctx->partial_update_info->default_values, + old_value_block, default_value_block)); + } + + auto fill_one_cell = [&tablet_schema](const TabletColumn& tablet_column, uint32_t cid, + vectorized::MutableColumnPtr& new_col, + const vectorized::IColumn& default_value_col, + const vectorized::IColumn& old_value_col, + const vectorized::IColumn& cur_col, std::size_t block_pos, + bool skipped, bool use_default, + const signed char* delete_sign_column_data, + uint32_t pos_in_old_block) { + if (skipped) { + DCHECK(cid != 
tablet_schema.skip_bitmap_col_idx()); + DCHECK(cid != tablet_schema.version_col_idx()); + DCHECK(!tablet_column.is_row_store_column()); + + if (use_default || (delete_sign_column_data != nullptr && + delete_sign_column_data[pos_in_old_block] != 0)) { + if (tablet_column.has_default_value()) { + new_col->insert_from(default_value_col, 0); + } else if (tablet_column.is_nullable()) { + assert_cast( + new_col.get()) + ->insert_null_elements(1); + } else { + new_col->insert_default(); + } + } else { + new_col->insert_from(old_value_col, pos_in_old_block); + } + } else { + new_col->insert_from(cur_col, block_pos); + } + }; + + // fill all non sort key columns from mutable_old_columns, need to consider default value and null value + for (std::size_t i {0}; i < non_sort_key_cids.size(); i++) { + auto cid = non_sort_key_cids[i]; + const auto& tablet_column = tablet_schema.column(cid); + auto col_uid = tablet_column.unique_id(); + for (auto idx = 0; idx < use_default_or_null_flag.size(); idx++) { + auto segment_pos = segment_start_pos + idx; + auto block_pos = block_start_pos + idx; + auto pos_in_old_block = read_index[segment_pos]; + + fill_one_cell(tablet_column, cid, mutable_full_columns[cid], + *default_value_block.get_by_position(i).column, + *old_value_block.get_by_position(i).column, + *block->get_by_position(cid).column, block_pos, + skip_bitmaps->at(block_pos).contains(col_uid), + use_default_or_null_flag[idx], delete_sign_column_data, pos_in_old_block); } } return Status::OK(); diff --git a/be/src/olap/partial_update_info.h b/be/src/olap/partial_update_info.h index 278b027942eb20..c7e69f33184c41 100644 --- a/be/src/olap/partial_update_info.h +++ b/be/src/olap/partial_update_info.h @@ -16,6 +16,8 @@ // under the License. #pragma once +#include + #include #include #include @@ -25,32 +27,62 @@ #include "common/status.h" #include "olap/rowset/rowset_fwd.h" #include "olap/tablet_fwd.h" +#include "vec/columns/column.h" namespace doris { class TabletSchema; class PartialUpdateInfoPB; +class BitmapValue; struct RowLocation; namespace vectorized { class Block; } struct RowsetWriterContext; struct RowsetId; +class BitmapValue; struct PartialUpdateInfo { - void init(const TabletSchema& tablet_schema, bool partial_update, + void init(const TabletSchema& tablet_schema, UniqueKeyUpdateModePB unique_key_update_mode, const std::set& partial_update_cols, bool is_strict_mode, int64_t timestamp_ms, int32_t nano_seconds, const std::string& timezone, - const std::string& auto_increment_column, int64_t cur_max_version = -1); + const std::string& auto_increment_column, int32_t sequence_map_col_uid = -1, + int64_t cur_max_version = -1); void to_pb(PartialUpdateInfoPB* partial_update_info) const; void from_pb(PartialUpdateInfoPB* partial_update_info); - Status handle_non_strict_mode_not_found_error(const TabletSchema& tablet_schema); + Status handle_non_strict_mode_not_found_error(const TabletSchema& tablet_schema, + BitmapValue* skip_bitmap = nullptr) const; + + Status handle_not_found_error_for_fixed_partial_update(const TabletSchema& tablet_schema) const; + Status handle_not_found_error_for_flexible_partial_update(const TabletSchema& tablet_schema, + BitmapValue* skip_bitmap) const; std::string summary() const; + std::string partial_update_mode_str() const { + switch (partial_update_mode) { + case UniqueKeyUpdateModePB::UPSERT: + return "upsert"; + case UniqueKeyUpdateModePB::UPDATE_FIXED_COLUMNS: + return "partial update"; + case UniqueKeyUpdateModePB::UPDATE_FLEXIBLE_COLUMNS: + return "flexible partial 
update"; + } + return ""; + } + bool is_partial_update() const { return partial_update_mode != UniqueKeyUpdateModePB::UPSERT; } + bool is_fixed_partial_update() const { + return partial_update_mode == UniqueKeyUpdateModePB::UPDATE_FIXED_COLUMNS; + } + bool is_flexible_partial_update() const { + return partial_update_mode == UniqueKeyUpdateModePB::UPDATE_FLEXIBLE_COLUMNS; + } + UniqueKeyUpdateModePB update_mode() const { return partial_update_mode; } + int32_t sequence_map_col_uid() const { return sequence_map_col_unqiue_id; } + private: void _generate_default_values_for_missing_cids(const TabletSchema& tablet_schema); public: - bool is_partial_update {false}; + UniqueKeyUpdateModePB partial_update_mode {UniqueKeyUpdateModePB::UPSERT}; int64_t max_version_in_flush_phase {-1}; std::set partial_update_input_columns; std::vector missing_cids; @@ -67,6 +99,8 @@ struct PartialUpdateInfo { // default values for missing cids std::vector default_values; + + int32_t sequence_map_col_unqiue_id {-1}; }; // used in mow partial update @@ -76,14 +110,14 @@ struct RidAndPos { size_t pos; }; -class PartialUpdateReadPlan { +class FixedReadPlan { public: void prepare_to_read(const RowLocation& row_location, size_t pos); Status read_columns_by_plan(const TabletSchema& tablet_schema, const std::vector cids_to_read, const std::map& rsid_to_rowset, vectorized::Block& block, std::map* read_index, - const signed char* __restrict skip_map = nullptr) const; + const signed char* __restrict delete_signs = nullptr) const; Status fill_missing_columns(RowsetWriterContext* rowset_ctx, const std::map& rsid_to_rowset, const TabletSchema& tablet_schema, vectorized::Block& full_block, @@ -92,7 +126,56 @@ class PartialUpdateReadPlan { const vectorized::Block* block) const; private: - std::map>> plan; + std::map>> plan; +}; + +class FlexibleReadPlan { +public: + FlexibleReadPlan(bool has_row_store_for_column) : use_row_store(has_row_store_for_column) {} + void prepare_to_read(const RowLocation& row_location, size_t pos, + const BitmapValue& skip_bitmap); + // for column store + Status read_columns_by_plan(const TabletSchema& tablet_schema, + const std::map& rsid_to_rowset, + vectorized::Block& old_value_block, + std::map>* read_index) const; + + // for row_store + Status read_columns_by_plan(const TabletSchema& tablet_schema, + const std::vector& cids_to_read, + const std::map& rsid_to_rowset, + vectorized::Block& old_value_block, + std::map* read_index) const; + Status fill_non_primary_key_columns( + RowsetWriterContext* rowset_ctx, + const std::map& rsid_to_rowset, + const TabletSchema& tablet_schema, vectorized::Block& full_block, + const std::vector& use_default_or_null_flag, bool has_default_or_nullable, + const std::size_t segment_start_pos, const std::size_t block_start_pos, + const vectorized::Block* block, std::vector* skip_bitmaps) const; + + Status fill_non_primary_key_columns_for_column_store( + RowsetWriterContext* rowset_ctx, + const std::map& rsid_to_rowset, + const TabletSchema& tablet_schema, const std::vector& non_sort_key_cids, + vectorized::Block& old_value_block, vectorized::MutableColumns& mutable_full_columns, + const std::vector& use_default_or_null_flag, bool has_default_or_nullable, + const std::size_t segment_start_pos, const std::size_t block_start_pos, + const vectorized::Block* block, std::vector* skip_bitmaps) const; + Status fill_non_primary_key_columns_for_row_store( + RowsetWriterContext* rowset_ctx, + const std::map& rsid_to_rowset, + const TabletSchema& tablet_schema, const std::vector& 
non_sort_key_cids, + vectorized::Block& old_value_block, vectorized::MutableColumns& mutable_full_columns, + const std::vector& use_default_or_null_flag, bool has_default_or_nullable, + const std::size_t segment_start_pos, const std::size_t block_start_pos, + const vectorized::Block* block, std::vector* skip_bitmaps) const; + +private: + bool use_row_store {false}; + // rowset_id -> segment_id -> column unique id -> mappings + std::map>>> plan; + std::map>> row_store_plan; }; struct PartialUpdateStats { diff --git a/be/src/olap/primary_key_index.cpp b/be/src/olap/primary_key_index.cpp index d3554cae15d66a..9d40ff5a8fad51 100644 --- a/be/src/olap/primary_key_index.cpp +++ b/be/src/olap/primary_key_index.cpp @@ -17,6 +17,7 @@ #include "olap/primary_key_index.h" +#include #include #include @@ -95,7 +96,8 @@ Status PrimaryKeyIndexReader::parse_index(io::FileReaderSPtr file_reader, // parse primary key index _index_reader.reset(new segment_v2::IndexedColumnReader(file_reader, meta.primary_key_index())); _index_reader->set_is_pk_index(true); - RETURN_IF_ERROR(_index_reader->load(!config::disable_pk_storage_page_cache, false)); + RETURN_IF_ERROR(_index_reader->load(!config::disable_pk_storage_page_cache, false, + _pk_index_load_stats)); _index_parsed = true; return Status::OK(); @@ -107,7 +109,8 @@ Status PrimaryKeyIndexReader::parse_bf(io::FileReaderSPtr file_reader, segment_v2::ColumnIndexMetaPB column_index_meta = meta.bloom_filter_index(); segment_v2::BloomFilterIndexReader bf_index_reader(std::move(file_reader), column_index_meta.bloom_filter_index()); - RETURN_IF_ERROR(bf_index_reader.load(!config::disable_pk_storage_page_cache, false)); + RETURN_IF_ERROR(bf_index_reader.load(!config::disable_pk_storage_page_cache, false, + _pk_index_load_stats)); std::unique_ptr bf_iter; RETURN_IF_ERROR(bf_index_reader.new_iterator(&bf_iter)); RETURN_IF_ERROR(bf_iter->read_bloom_filter(0, &_bf)); diff --git a/be/src/olap/primary_key_index.h b/be/src/olap/primary_key_index.h index b5eb13131b73a0..dcbbc5f30625f4 100644 --- a/be/src/olap/primary_key_index.h +++ b/be/src/olap/primary_key_index.h @@ -25,6 +25,7 @@ #include "common/status.h" #include "io/fs/file_reader_writer_fwd.h" +#include "olap/olap_common.h" #include "olap/rowset/segment_v2/bloom_filter.h" #include "olap/rowset/segment_v2/bloom_filter_index_writer.h" #include "olap/rowset/segment_v2/indexed_column_reader.h" @@ -97,7 +98,8 @@ class PrimaryKeyIndexBuilder { class PrimaryKeyIndexReader { public: - PrimaryKeyIndexReader() : _index_parsed(false), _bf_parsed(false) {} + PrimaryKeyIndexReader(OlapReaderStatistics* pk_index_load_stats = nullptr) + : _index_parsed(false), _bf_parsed(false), _pk_index_load_stats(pk_index_load_stats) {} ~PrimaryKeyIndexReader() { segment_v2::g_pk_total_bloom_filter_num << -static_cast(_bf_num); @@ -111,9 +113,10 @@ class PrimaryKeyIndexReader { Status parse_bf(io::FileReaderSPtr file_reader, const segment_v2::PrimaryKeyIndexMetaPB& meta); - Status new_iterator(std::unique_ptr* index_iterator) const { + Status new_iterator(std::unique_ptr* index_iterator, + OlapReaderStatistics* stats = nullptr) const { DCHECK(_index_parsed); - index_iterator->reset(new segment_v2::IndexedColumnIterator(_index_reader.get())); + index_iterator->reset(new segment_v2::IndexedColumnIterator(_index_reader.get(), stats)); return Status::OK(); } @@ -152,6 +155,7 @@ class PrimaryKeyIndexReader { std::unique_ptr _bf; size_t _bf_num = 0; uint64 _bf_bytes = 0; + OlapReaderStatistics* _pk_index_load_stats = nullptr; }; } // namespace doris diff 
--git a/be/src/olap/push_handler.cpp b/be/src/olap/push_handler.cpp index 575b002b2f6086..56d167459f5be7 100644 --- a/be/src/olap/push_handler.cpp +++ b/be/src/olap/push_handler.cpp @@ -471,7 +471,7 @@ Status PushBrokerReader::_init_src_block() { } Status PushBrokerReader::_cast_to_input_block() { - size_t idx = 0; + uint32_t idx = 0; for (auto& slot_desc : _src_slot_descs) { if (_name_to_col_type.find(slot_desc->col_name()) == _name_to_col_type.end()) { continue; diff --git a/be/src/olap/rowset/beta_rowset.cpp b/be/src/olap/rowset/beta_rowset.cpp index 209aca7fb03b4c..bbb2ca72b4ae7f 100644 --- a/be/src/olap/rowset/beta_rowset.cpp +++ b/be/src/olap/rowset/beta_rowset.cpp @@ -81,12 +81,7 @@ Status BetaRowset::get_inverted_index_size(size_t* index_size) { } if (_schema->get_inverted_index_storage_format() == InvertedIndexStorageFormatPB::V1) { - auto indices = _schema->indexes(); - for (auto& index : indices) { - // only get file_size for inverted index - if (index.index_type() != IndexType::INVERTED) { - continue; - } + for (const auto& index : _schema->inverted_indexes()) { for (int seg_id = 0; seg_id < num_segments(); ++seg_id) { auto seg_path = DORIS_TRY(segment_path(seg_id)); int64_t file_size = 0; @@ -94,7 +89,7 @@ Status BetaRowset::get_inverted_index_size(size_t* index_size) { std::string inverted_index_file_path = InvertedIndexDescriptor::get_index_file_path_v1( InvertedIndexDescriptor::get_index_file_path_prefix(seg_path), - index.index_id(), index.get_index_suffix()); + index->index_id(), index->get_index_suffix()); RETURN_IF_ERROR(fs->file_size(inverted_index_file_path, &file_size)); *index_size += file_size; } @@ -122,7 +117,7 @@ void BetaRowset::clear_inverted_index_cache() { auto index_path_prefix = InvertedIndexDescriptor::get_index_file_path_prefix(*seg_path); for (const auto& column : tablet_schema()->columns()) { - const TabletIndex* index_meta = tablet_schema()->get_inverted_index(*column); + const TabletIndex* index_meta = tablet_schema()->inverted_index(*column); if (index_meta) { auto inverted_index_file_cache_key = InvertedIndexDescriptor::get_index_file_cache_key( @@ -183,8 +178,9 @@ Status BetaRowset::load_segment(int64_t seg_id, segment_v2::SegmentSharedPtr* se .file_size = _rowset_meta->segment_file_size(seg_id), }; - auto s = segment_v2::Segment::open(fs, seg_path, seg_id, rowset_id(), _schema, reader_options, - segment, _rowset_meta->inverted_index_file_info(seg_id)); + auto s = segment_v2::Segment::open(fs, seg_path, _rowset_meta->tablet_id(), seg_id, rowset_id(), + _schema, reader_options, segment, + _rowset_meta->inverted_index_file_info(seg_id)); if (!s.ok()) { LOG(WARNING) << "failed to open segment. 
" << seg_path << " under rowset " << rowset_id() << " : " << s.to_string(); @@ -226,7 +222,7 @@ Status BetaRowset::remove() { if (_schema->get_inverted_index_storage_format() == InvertedIndexStorageFormatPB::V1) { for (auto& column : _schema->columns()) { - const TabletIndex* index_meta = _schema->get_inverted_index(*column); + const TabletIndex* index_meta = _schema->inverted_index(*column); if (index_meta) { std::string inverted_index_file = InvertedIndexDescriptor::get_index_file_path_v1( @@ -310,22 +306,19 @@ Status BetaRowset::link_files_to(const std::string& dir, RowsetId new_rowset_id, return status; }); if (_schema->get_inverted_index_storage_format() == InvertedIndexStorageFormatPB::V1) { - for (const auto& index : _schema->indexes()) { - if (index.index_type() != IndexType::INVERTED) { - continue; - } - auto index_id = index.index_id(); + for (const auto& index : _schema->inverted_indexes()) { + auto index_id = index->index_id(); if (without_index_uids != nullptr && without_index_uids->count(index_id)) { continue; } std::string inverted_index_src_file_path = InvertedIndexDescriptor::get_index_file_path_v1( InvertedIndexDescriptor::get_index_file_path_prefix(src_path), - index_id, index.get_index_suffix()); + index_id, index->get_index_suffix()); std::string inverted_index_dst_file_path = InvertedIndexDescriptor::get_index_file_path_v1( InvertedIndexDescriptor::get_index_file_path_prefix(dst_path), - index_id, index.get_index_suffix()); + index_id, index->get_index_suffix()); bool index_file_exists = true; RETURN_IF_ERROR(local_fs->exists(inverted_index_src_file_path, &index_file_exists)); if (index_file_exists) { @@ -404,7 +397,7 @@ Status BetaRowset::copy_files_to(const std::string& dir, const RowsetId& new_row if (_schema->get_inverted_index_storage_format() == InvertedIndexStorageFormatPB::V1) { for (auto& column : _schema->columns()) { // if (column.has_inverted_index()) { - const TabletIndex* index_meta = _schema->get_inverted_index(*column); + const TabletIndex* index_meta = _schema->inverted_index(*column); if (index_meta) { std::string inverted_index_src_file_path = InvertedIndexDescriptor::get_index_file_path_v1( @@ -463,7 +456,7 @@ Status BetaRowset::upload_to(const StorageResource& dest_fs, const RowsetId& new if (_schema->get_inverted_index_storage_format() == InvertedIndexStorageFormatPB::V1) { for (auto& column : _schema->columns()) { // if (column.has_inverted_index()) { - const TabletIndex* index_meta = _schema->get_inverted_index(*column); + const TabletIndex* index_meta = _schema->inverted_index(*column); if (index_meta) { std::string remote_inverted_index_file = InvertedIndexDescriptor::get_index_file_path_v1( @@ -497,7 +490,7 @@ Status BetaRowset::upload_to(const StorageResource& dest_fs, const RowsetId& new auto st = dest_fs.fs->batch_upload(local_paths, dest_paths); if (st.ok()) { DorisMetrics::instance()->upload_rowset_count->increment(1); - DorisMetrics::instance()->upload_total_byte->increment(data_disk_size()); + DorisMetrics::instance()->upload_total_byte->increment(total_disk_size()); } else { DorisMetrics::instance()->upload_fail_count->increment(1); } @@ -543,8 +536,8 @@ Status BetaRowset::check_current_rowset_segment() { .file_size = _rowset_meta->segment_file_size(seg_id), }; - auto s = segment_v2::Segment::open(fs, seg_path, seg_id, rowset_id(), _schema, - reader_options, &segment, + auto s = segment_v2::Segment::open(fs, seg_path, _rowset_meta->tablet_id(), seg_id, + rowset_id(), _schema, reader_options, &segment, 
_rowset_meta->inverted_index_file_info(seg_id)); if (!s.ok()) { LOG(WARNING) << "segment can not be opened. file=" << seg_path; @@ -612,14 +605,11 @@ Status BetaRowset::add_to_binlog() { linked_success_files.push_back(binlog_file); if (_schema->get_inverted_index_storage_format() == InvertedIndexStorageFormatPB::V1) { - for (const auto& index : _schema->indexes()) { - if (index.index_type() != IndexType::INVERTED) { - continue; - } - auto index_id = index.index_id(); + for (const auto& index : _schema->inverted_indexes()) { + auto index_id = index->index_id(); auto index_file = InvertedIndexDescriptor::get_index_file_path_v1( InvertedIndexDescriptor::get_index_file_path_prefix(seg_file), index_id, - index.get_index_suffix()); + index->get_index_suffix()); auto binlog_index_file = (std::filesystem::path(binlog_dir) / std::filesystem::path(index_file).filename()) .string(); @@ -660,7 +650,7 @@ Status BetaRowset::calc_file_crc(uint32_t* crc_value, int64_t* file_count) { file_paths.emplace_back(seg_path); if (_schema->get_inverted_index_storage_format() == InvertedIndexStorageFormatPB::V1) { for (auto& column : _schema->columns()) { - const TabletIndex* index_meta = _schema->get_inverted_index(*column); + const TabletIndex* index_meta = _schema->inverted_index(*column); if (index_meta) { std::string inverted_index_file = InvertedIndexDescriptor::get_index_file_path_v1( @@ -804,7 +794,7 @@ Status BetaRowset::show_nested_index_file(rapidjson::Value* rowset_value, } else { rapidjson::Value indices(rapidjson::kArrayType); for (auto column : _rowset_meta->tablet_schema()->columns()) { - const auto* index_meta = _rowset_meta->tablet_schema()->get_inverted_index(*column); + const auto* index_meta = _rowset_meta->tablet_schema()->inverted_index(*column); if (index_meta == nullptr) { continue; } diff --git a/be/src/olap/rowset/beta_rowset_reader.cpp b/be/src/olap/rowset/beta_rowset_reader.cpp index d2c7023f659c20..042893f1374374 100644 --- a/be/src/olap/rowset/beta_rowset_reader.cpp +++ b/be/src/olap/rowset/beta_rowset_reader.cpp @@ -235,6 +235,12 @@ Status BetaRowsetReader::get_segment_iterators(RowsetReaderContext* read_context for (size_t i = 0; i < segments.size(); i++) { _segments_rows[i] = segments[i]->num_rows(); } + if (_read_context->record_rowids) { + // init segment rowid map for rowid conversion + std::vector<uint32_t> segment_num_rows; + RETURN_IF_ERROR(get_segment_num_rows(&segment_num_rows)); + _read_context->rowid_conversion->init_segment_map(rowset()->rowset_id(), segment_num_rows); + } auto [seg_start, seg_end] = _segment_offsets; if (seg_start == seg_end) { diff --git a/be/src/olap/rowset/beta_rowset_writer.cpp b/be/src/olap/rowset/beta_rowset_writer.cpp index 22fff5608c7d09..198b4e8595ed20 100644 --- a/be/src/olap/rowset/beta_rowset_writer.cpp +++ b/be/src/olap/rowset/beta_rowset_writer.cpp @@ -81,7 +81,7 @@ void build_rowset_meta_with_spec_field(RowsetMeta& rowset_meta, const RowsetMeta& spec_rowset_meta) { rowset_meta.set_num_rows(spec_rowset_meta.num_rows()); rowset_meta.set_total_disk_size(spec_rowset_meta.total_disk_size()); - rowset_meta.set_data_disk_size(spec_rowset_meta.total_disk_size()); + rowset_meta.set_data_disk_size(spec_rowset_meta.data_disk_size()); rowset_meta.set_index_disk_size(spec_rowset_meta.index_disk_size()); // TODO write zonemap to meta rowset_meta.set_empty(spec_rowset_meta.num_rows() == 0); @@ -189,13 +189,67 @@ Result<std::vector<size_t>> SegmentFileCollection::segments_file_size(int seg_id return ResultError(st); } +InvertedIndexFileCollection::~InvertedIndexFileCollection() = 
default; + +Status InvertedIndexFileCollection::add(int seg_id, InvertedIndexFileWriterPtr&& index_writer) { + std::lock_guard lock(_lock); + if (_inverted_index_file_writers.find(seg_id) != _inverted_index_file_writers.end()) + [[unlikely]] { + DCHECK(false); + return Status::InternalError("The seg_id already exists, seg_id is {}", seg_id); + } + _inverted_index_file_writers.emplace(seg_id, std::move(index_writer)); + return Status::OK(); +} + +Status InvertedIndexFileCollection::close() { + std::lock_guard lock(_lock); + for (auto&& [id, writer] : _inverted_index_file_writers) { + RETURN_IF_ERROR(writer->close()); + _total_size += writer->get_index_file_total_size(); + } + + return Status::OK(); +} + +Result<std::vector<const InvertedIndexFileInfo*>> +InvertedIndexFileCollection::inverted_index_file_info(int seg_id_offset) { + std::lock_guard lock(_lock); + + Status st; + std::vector<const InvertedIndexFileInfo*> idx_file_info(_inverted_index_file_writers.size()); + bool succ = std::all_of( + _inverted_index_file_writers.begin(), _inverted_index_file_writers.end(), + [&](auto&& it) { + auto&& [seg_id, writer] = it; + + int idx = seg_id - seg_id_offset; + if (idx >= idx_file_info.size()) [[unlikely]] { + auto err_msg = + fmt::format("invalid seg_id={} num_file_writers={} seg_id_offset={}", + seg_id, idx_file_info.size(), seg_id_offset); + DCHECK(false) << err_msg; + st = Status::InternalError(err_msg); + return false; + } + idx_file_info[idx] = _inverted_index_file_writers[seg_id]->get_index_file_info(); + return true; + }); + + if (succ) { + return idx_file_info; + } + + return ResultError(st); + } + BaseBetaRowsetWriter::BaseBetaRowsetWriter() : _num_segment(0), _segment_start_id(0), _num_rows_written(0), _total_data_size(0), _total_index_size(0), - _segment_creator(_context, _seg_files, _idx_files_info) {} + _segment_creator(_context, _seg_files, _idx_files) {} BetaRowsetWriter::BetaRowsetWriter(StorageEngine& engine) : _engine(engine), _segcompaction_worker(std::make_shared<SegcompactionWorker>(this)) {} @@ -259,7 +313,7 @@ Status BaseBetaRowsetWriter::add_block(const vectorized::Block* block) { Status BaseBetaRowsetWriter::_generate_delete_bitmap(int32_t segment_id) { SCOPED_RAW_TIMER(&_delete_bitmap_ns); if (!_context.tablet->enable_unique_key_merge_on_write() || - (_context.partial_update_info && _context.partial_update_info->is_partial_update)) { + (_context.partial_update_info && _context.partial_update_info->is_partial_update())) { return Status::OK(); } RowsetSharedPtr rowset_ptr; @@ -282,8 +336,7 @@ Status BaseBetaRowsetWriter::_generate_delete_bitmap(int32_t segment_id) { LOG(INFO) << "[Memtable Flush] construct delete bitmap tablet: " << _context.tablet->tablet_id() << ", rowset_ids: " << _context.mow_context->rowset_ids.size() << ", cur max_version: " << _context.mow_context->max_version - << ", transaction_id: " << _context.mow_context->txn_id << ", delete_bitmap_count: " - << _context.tablet->tablet_meta()->delete_bitmap().get_delete_bitmap_count() + << ", transaction_id: " << _context.mow_context->txn_id << ", cost: " << watch.get_elapse_time_us() << "(us), total rows: " << total_rows; return Status::OK(); } @@ -291,7 +344,7 @@ Status BetaRowsetWriter::init(const RowsetWriterContext& rowset_writer_context) { RETURN_IF_ERROR(BaseBetaRowsetWriter::init(rowset_writer_context)); if (_segcompaction_worker) { - _segcompaction_worker->init_mem_tracker(rowset_writer_context.txn_id); + _segcompaction_worker->init_mem_tracker(rowset_writer_context); } return Status::OK(); } @@ -315,7 +368,8 @@ 
Status BetaRowsetWriter::_load_noncompacted_segment(segment_v2::SegmentSharedPtr .is_doris_table = true, .cache_base_path {}, }; - auto s = segment_v2::Segment::open(io::global_local_filesystem(), path, segment_id, rowset_id(), + auto s = segment_v2::Segment::open(io::global_local_filesystem(), path, + _rowset_meta->tablet_id(), segment_id, rowset_id(), _context.tablet_schema, reader_options, &segment); if (!s.ok()) { LOG(WARNING) << "failed to open segment. " << path << ":" << s; @@ -493,8 +547,8 @@ Status BetaRowsetWriter::_rename_compacted_indices(int64_t begin, int64_t end, u } // rename remaining inverted index files for (auto column : _context.tablet_schema->columns()) { - if (_context.tablet_schema->has_inverted_index(*column)) { - const auto* index_info = _context.tablet_schema->get_inverted_index(*column); + if (const auto& index_info = _context.tablet_schema->inverted_index(*column); + index_info != nullptr) { auto index_id = index_info->index_id(); if (_context.tablet_schema->get_inverted_index_storage_format() == InvertedIndexStorageFormatPB::V1) { @@ -728,7 +782,6 @@ Status BetaRowsetWriter::_close_file_writers() { Status BetaRowsetWriter::build(RowsetSharedPtr& rowset) { RETURN_IF_ERROR(_close_file_writers()); - const auto total_segment_num = _num_segment - _segcompacted_point + 1 + _num_segcompacted; RETURN_NOT_OK_STATUS_WITH_WARN(_check_segment_number_limit(total_segment_num), "too many segments when build new rowset"); @@ -748,12 +801,15 @@ Status BetaRowsetWriter::build(RowsetSharedPtr& rowset) { : _context.tablet_schema; _rowset_meta->set_tablet_schema(rowset_schema); - if (auto idx_files_info = _idx_files_info.get_inverted_files_info(_segment_start_id); - !idx_files_info.has_value()) [[unlikely]] { - LOG(ERROR) << "expected inverted index files info, but none presents: " - << idx_files_info.error(); - } else { - _rowset_meta->add_inverted_index_files_info(idx_files_info.value()); + // If segment compaction occurs, the idx file info will become inaccurate. 
+ if (rowset_schema->has_inverted_index() && _num_segcompacted == 0) { + if (auto idx_files_info = _idx_files.inverted_index_file_info(_segment_start_id); + !idx_files_info.has_value()) [[unlikely]] { + LOG(ERROR) << "expected inverted index files info, but none presents: " + << idx_files_info.error(); + } else { + _rowset_meta->add_inverted_index_files_info(idx_files_info.value()); + } } RETURN_NOT_OK_STATUS_WITH_WARN(RowsetFactory::create_rowset(rowset_schema, _context.tablet_path, @@ -830,7 +886,8 @@ Status BaseBetaRowsetWriter::_build_rowset_meta(RowsetMeta* rowset_meta, bool ch rowset_meta->set_num_segments(segment_num); rowset_meta->set_num_rows(num_rows_written + _num_rows_written); - rowset_meta->set_total_disk_size(total_data_size + _total_data_size); + rowset_meta->set_total_disk_size(total_data_size + _total_data_size + total_index_size + + _total_index_size); rowset_meta->set_data_disk_size(total_data_size + _total_data_size); rowset_meta->set_index_disk_size(total_index_size + _total_index_size); rowset_meta->set_segments_key_bounds(segments_encoded_key_bounds); @@ -891,7 +948,15 @@ Status BaseBetaRowsetWriter::create_file_writer(uint32_t segment_id, io::FileWri fmt::format("failed to create file = {}, file type = {}", segment_path, file_type)); } -Status BetaRowsetWriter::_create_segment_writer_for_segcompaction( +Status BaseBetaRowsetWriter::create_inverted_index_file_writer( + uint32_t segment_id, InvertedIndexFileWriterPtr* index_file_writer) { + RETURN_IF_ERROR(RowsetWriter::create_inverted_index_file_writer(segment_id, index_file_writer)); + // used for inverted index format v1 + (*index_file_writer)->set_file_writer_opts(_context.get_file_writer_options()); + return Status::OK(); +} + +Status BetaRowsetWriter::create_segment_writer_for_segcompaction( std::unique_ptr<segment_v2::SegmentWriter>* writer, int64_t begin, int64_t end) { DCHECK(begin >= 0 && end >= 0); std::string path = BetaRowset::local_segment_path_segcompacted(_context.tablet_path, @@ -899,6 +964,22 @@ Status BetaRowsetWriter::_create_segment_writer_for_segcompaction( io::FileWriterPtr file_writer; RETURN_IF_ERROR(_create_file_writer(path, file_writer)); + InvertedIndexFileWriterPtr index_file_writer; + if (_context.tablet_schema->has_inverted_index()) { + io::FileWriterPtr idx_file_writer; + if (_context.tablet_schema->get_inverted_index_storage_format() != + InvertedIndexStorageFormatPB::V1) { + std::string prefix = + std::string {InvertedIndexDescriptor::get_index_file_path_prefix(path)}; + std::string index_path = InvertedIndexDescriptor::get_index_file_path_v2(prefix); + RETURN_IF_ERROR(_create_file_writer(index_path, idx_file_writer)); + } + index_file_writer = std::make_unique<InvertedIndexFileWriter>( + _context.fs(), path, _context.rowset_id.to_string(), _num_segcompacted, + _context.tablet_schema->get_inverted_index_storage_format(), + std::move(idx_file_writer)); + } + segment_v2::SegmentWriterOptions writer_options; writer_options.enable_unique_key_merge_on_write = _context.enable_unique_key_merge_on_write; writer_options.rowset_ctx = &_context; @@ -907,15 +988,19 @@ Status BetaRowsetWriter::_create_segment_writer_for_segcompaction( writer_options.max_rows_per_segment = _context.max_rows_per_segment; writer_options.mow_ctx = _context.mow_context; - *writer = std::make_unique<segment_v2::SegmentWriter>(file_writer.get(), _num_segcompacted, - _context.tablet_schema, _context.tablet, - _context.data_dir, writer_options); + *writer = std::make_unique<segment_v2::SegmentWriter>( + file_writer.get(), _num_segcompacted, _context.tablet_schema, _context.tablet, + _context.data_dir, writer_options, 
index_file_writer.get()); if (auto& seg_writer = _segcompaction_worker->get_file_writer(); seg_writer != nullptr && seg_writer->state() != io::FileWriter::State::CLOSED) { RETURN_IF_ERROR(_segcompaction_worker->get_file_writer()->close()); } _segcompaction_worker->get_file_writer().reset(file_writer.release()); - + if (auto& idx_file_writer = _segcompaction_worker->get_inverted_index_file_writer(); + idx_file_writer != nullptr) { + RETURN_IF_ERROR(idx_file_writer->close()); + } + _segcompaction_worker->get_inverted_index_file_writer().reset(index_file_writer.release()); return Status::OK(); } @@ -1005,11 +1090,13 @@ Status BetaRowsetWriter::flush_segment_writer_for_segcompaction( return Status::Error("failed to finalize segment: {}", s.to_string()); } + int64_t inverted_index_file_size = 0; + RETURN_IF_ERROR((*writer)->close_inverted_index(&inverted_index_file_size)); SegmentStatistics segstat; segstat.row_num = row_num; - segstat.data_size = segment_size + (*writer)->get_inverted_index_total_size(); - segstat.index_size = index_size + (*writer)->get_inverted_index_total_size(); + segstat.data_size = segment_size; + segstat.index_size = inverted_index_file_size; segstat.key_bounds = key_bounds; { std::lock_guard lock(_segid_statistics_map_mutex); diff --git a/be/src/olap/rowset/beta_rowset_writer.h b/be/src/olap/rowset/beta_rowset_writer.h index eda5d03b1d7369..d96301af22630d 100644 --- a/be/src/olap/rowset/beta_rowset_writer.h +++ b/be/src/olap/rowset/beta_rowset_writer.h @@ -42,6 +42,7 @@ #include "olap/rowset/rowset_writer.h" #include "olap/rowset/rowset_writer_context.h" #include "olap/rowset/segment_creator.h" +#include "segment_v2/inverted_index_file_writer.h" #include "segment_v2/segment.h" #include "util/spinlock.h" @@ -84,58 +85,33 @@ class SegmentFileCollection { bool _closed {false}; }; -// Collect the size of the inverted index files -class InvertedIndexFilesInfo { +class InvertedIndexFileCollection { public: + ~InvertedIndexFileCollection(); + + // `seg_id` -> inverted index file writer + Status add(int seg_id, InvertedIndexFileWriterPtr&& writer); + + // Close all file writers + // If the inverted index file writer is not closed, an error will be thrown during destruction + Status close(); + // Get inverted index file info in segment id order. - // Return the info of inverted index files from seg_id_offset to the last one. 
- Result<std::vector<InvertedIndexFileInfo>> get_inverted_files_info(int seg_id_offset) { - std::lock_guard lock(_lock); - - Status st; - std::vector<InvertedIndexFileInfo> inverted_files_info(_inverted_index_files_info.size()); - bool succ = std::all_of( - _inverted_index_files_info.begin(), _inverted_index_files_info.end(), - [&](auto&& it) { - auto&& [seg_id, info] = it; - - int idx = seg_id - seg_id_offset; - if (idx >= inverted_files_info.size()) [[unlikely]] { - auto err_msg = fmt::format( - "invalid seg_id={} num_inverted_files_info={} seg_id_offset={}", - seg_id, inverted_files_info.size(), seg_id_offset); - DCHECK(false) << err_msg; - st = Status::InternalError(err_msg); - return false; - } - - auto& finfo = inverted_files_info[idx]; - if (finfo.has_index_size() || finfo.index_info_size() > 0) [[unlikely]] { - // File size should not have been set - auto err_msg = fmt::format("duplicate seg_id={}", seg_id); - DCHECK(false) << err_msg; - st = Status::InternalError(err_msg); - return false; - } - finfo = info; - return true; - }); - - if (succ) { - return inverted_files_info; - } - - return ResultError(st); - } + // `seg_id_offset` is the offset between a segment id and its index in `_inverted_index_file_writers`; + // for more details, see `Tablet::create_transient_rowset_writer`. + Result<std::vector<const InvertedIndexFileInfo*>> inverted_index_file_info(int seg_id_offset); - void add_file_info(int seg_id, InvertedIndexFileInfo file_info) { - std::lock_guard lock(_lock); - _inverted_index_files_info.emplace(seg_id, file_info); + // return all inverted index file writers + std::unordered_map<int, InvertedIndexFileWriterPtr>& get_file_writers() { + return _inverted_index_file_writers; } + int64_t get_total_index_size() const { return _total_size; } + private: - std::unordered_map<int, InvertedIndexFileInfo> _inverted_index_files_info; mutable SpinLock _lock; + std::unordered_map<int, InvertedIndexFileWriterPtr> _inverted_index_file_writers; + int64_t _total_size = 0; }; class BaseBetaRowsetWriter : public RowsetWriter { @@ -156,6 +132,9 @@ class BaseBetaRowsetWriter : public RowsetWriter { Status create_file_writer(uint32_t segment_id, io::FileWriterPtr& writer, FileType file_type = FileType::SEGMENT_FILE) override; + Status create_inverted_index_file_writer(uint32_t segment_id, + InvertedIndexFileWriterPtr* writer) override; + Status add_segment(uint32_t segment_id, const SegmentStatistics& segstat, TabletSchemaSPtr flush_schema) override; @@ -208,14 +187,16 @@ class BaseBetaRowsetWriter : public RowsetWriter { } bool is_partial_update() override { - return _context.partial_update_info && _context.partial_update_info->is_partial_update; + return _context.partial_update_info && _context.partial_update_info->is_partial_update(); } const std::unordered_map<int, io::FileWriterPtr>& get_file_writers() const { return _seg_files.get_file_writers(); } - InvertedIndexFilesInfo& get_inverted_index_files_info() { return _idx_files_info; } + std::unordered_map<int, InvertedIndexFileWriterPtr>& inverted_index_file_writers() { + return this->_idx_files.get_file_writers(); + } private: void update_rowset_schema(TabletSchemaSPtr flush_schema); @@ -235,6 +216,15 @@ class BaseBetaRowsetWriter : public RowsetWriter { std::lock_guard l(_segid_statistics_map_mutex); return std::accumulate(_segment_num_rows.begin(), _segment_num_rows.end(), uint64_t(0)); } + // This method is called only during vertical compaction. + // Some index files are written during normal compaction and some files are written during index compaction. + // After all index writes are completed, call this method to write the final compound index file. 
+ Status _close_inverted_index_file_writers() { + RETURN_NOT_OK_STATUS_WITH_WARN(_idx_files.close(), + "failed to close index file when build new rowset"); + this->_total_index_size += _idx_files.get_total_index_size(); + return Status::OK(); + } std::atomic<int32_t> _num_segment; // number of consecutive flushed segments roaring::Roaring _segment_set; // bitmap set to record flushed segment id @@ -242,6 +232,7 @@ class BaseBetaRowsetWriter : public RowsetWriter { int32_t _segment_start_id; // basic write start from 0, partial update may be different SegmentFileCollection _seg_files; + InvertedIndexFileCollection _idx_files; // record rows number of every segment already written, using for rowid // conversion when compaction in unique key with MoW model @@ -269,9 +260,6 @@ class BaseBetaRowsetWriter : public RowsetWriter { int64_t _delete_bitmap_ns = 0; int64_t _segment_writer_ns = 0; - - // map<segment_id, InvertedIndexFileInfo> - InvertedIndexFilesInfo _idx_files_info; }; class SegcompactionWorker; @@ -293,6 +281,8 @@ class BetaRowsetWriter : public BaseBetaRowsetWriter { Status flush_segment_writer_for_segcompaction( std::unique_ptr<segment_v2::SegmentWriter>* writer, uint64_t index_size, KeyBoundsPB& key_bounds); + Status create_segment_writer_for_segcompaction( + std::unique_ptr<segment_v2::SegmentWriter>* writer, int64_t begin, int64_t end); bool is_segcompacted() const { return _num_segcompacted > 0; } @@ -303,8 +293,6 @@ class BetaRowsetWriter : public BaseBetaRowsetWriter { Status _check_segment_number_limit(size_t segnum) override; int64_t _num_seg() const override; Status _wait_flying_segcompaction(); - Status _create_segment_writer_for_segcompaction( - std::unique_ptr<segment_v2::SegmentWriter>* writer, int64_t begin, int64_t end); Status _segcompaction_if_necessary(); Status _segcompaction_rename_last_segments(); Status _load_noncompacted_segment(segment_v2::SegmentSharedPtr& segment, int32_t segment_id); diff --git a/be/src/olap/rowset/beta_rowset_writer_v2.cpp b/be/src/olap/rowset/beta_rowset_writer_v2.cpp index 0d0ad435b9efd1..cb5dd5a5ee272d 100644 --- a/be/src/olap/rowset/beta_rowset_writer_v2.cpp +++ b/be/src/olap/rowset/beta_rowset_writer_v2.cpp @@ -58,7 +58,7 @@ namespace doris { using namespace ErrorCode; BetaRowsetWriterV2::BetaRowsetWriterV2(const std::vector<std::shared_ptr<LoadStreamStub>>& streams) - : _segment_creator(_context, _seg_files, _idx_files_info), _streams(streams) {} + : _segment_creator(_context, _seg_files, _idx_files), _streams(streams) {} BetaRowsetWriterV2::~BetaRowsetWriterV2() = default; diff --git a/be/src/olap/rowset/beta_rowset_writer_v2.h b/be/src/olap/rowset/beta_rowset_writer_v2.h index 174b70a072bc17..78ec4a7dce703c 100644 --- a/be/src/olap/rowset/beta_rowset_writer_v2.h +++ b/be/src/olap/rowset/beta_rowset_writer_v2.h @@ -141,7 +141,7 @@ class BetaRowsetWriterV2 : public RowsetWriter { } bool is_partial_update() override { - return _context.partial_update_info && _context.partial_update_info->is_partial_update; + return _context.partial_update_info && _context.partial_update_info->is_partial_update(); } private: @@ -154,11 +154,10 @@ class BetaRowsetWriterV2 : public RowsetWriter { std::vector<KeyBoundsPB> _segments_encoded_key_bounds; SegmentFileCollection _seg_files; + InvertedIndexFileCollection _idx_files; SegmentCreator _segment_creator; - InvertedIndexFilesInfo _idx_files_info; - fmt::memory_buffer vlog_buffer; std::vector<std::shared_ptr<LoadStreamStub>> _streams; diff --git a/be/src/olap/rowset/rowset.h b/be/src/olap/rowset/rowset.h index 6050a33bfc2f5d..e1a2347f6aeaa8 100644 --- a/be/src/olap/rowset/rowset.h +++ b/be/src/olap/rowset/rowset.h @@ -149,7 +149,8 @@ class Rowset : public std::enable_shared_from_this<Rowset> { int64_t 
start_version() const { return rowset_meta()->version().first; } int64_t end_version() const { return rowset_meta()->version().second; } size_t index_disk_size() const { return rowset_meta()->index_disk_size(); } - size_t data_disk_size() const { return rowset_meta()->total_disk_size(); } + size_t data_disk_size() const { return rowset_meta()->data_disk_size(); } + size_t total_disk_size() const { return rowset_meta()->total_disk_size(); } bool empty() const { return rowset_meta()->empty(); } bool zero_num_rows() const { return rowset_meta()->num_rows() == 0; } size_t num_rows() const { return rowset_meta()->num_rows(); } @@ -269,7 +270,9 @@ class Rowset : public std::enable_shared_from_this<Rowset> { _rowset_meta->get_segments_key_bounds(segments_key_bounds); return Status::OK(); } - bool min_key(std::string* min_key) { + + // min key of the first segment + bool first_key(std::string* min_key) { KeyBoundsPB key_bounds; bool ret = _rowset_meta->get_first_segment_key_bound(&key_bounds); if (!ret) { @@ -278,7 +281,9 @@ class Rowset : public std::enable_shared_from_this<Rowset> { *min_key = key_bounds.min_key(); return true; } - bool max_key(std::string* max_key) { + + // max key of the last segment + bool last_key(std::string* max_key) { KeyBoundsPB key_bounds; bool ret = _rowset_meta->get_last_segment_key_bound(&key_bounds); if (!ret) { diff --git a/be/src/olap/rowset/rowset_meta.cpp b/be/src/olap/rowset/rowset_meta.cpp index f053ad26d7efb9..6bed5e800ede4d 100644 --- a/be/src/olap/rowset/rowset_meta.cpp +++ b/be/src/olap/rowset/rowset_meta.cpp @@ -199,6 +199,7 @@ void RowsetMeta::_init() { } else { _rowset_id.init(_rowset_meta_pb.rowset_id_v2()); } + update_metadata_size(); } void RowsetMeta::add_segments_file_size(const std::vector<size_t>& seg_file_size) { @@ -225,6 +226,7 @@ void RowsetMeta::merge_rowset_meta(const RowsetMeta& other) { set_data_disk_size(data_disk_size() + other.data_disk_size()); set_total_disk_size(total_disk_size() + other.total_disk_size()); set_index_disk_size(index_disk_size() + other.index_disk_size()); + set_total_disk_size(data_disk_size() + index_disk_size()); for (auto&& key_bound : other.get_segments_key_bounds()) { add_segment_key_bounds(key_bound); } @@ -255,6 +257,12 @@ void RowsetMeta::merge_rowset_meta(const RowsetMeta& other) { if (rowset_state() == RowsetStatePB::BEGIN_PARTIAL_UPDATE) { set_rowset_state(RowsetStatePB::COMMITTED); } + + update_metadata_size(); +} + +int64_t RowsetMeta::get_metadata_size() const { + return sizeof(RowsetMeta) + _rowset_meta_pb.ByteSizeLong(); } InvertedIndexFileInfo RowsetMeta::inverted_index_file_info(int seg_id) { @@ -266,20 +274,14 @@ InvertedIndexFileInfo RowsetMeta::inverted_index_file_info(int seg_id) { } void RowsetMeta::add_inverted_index_files_info( - const std::vector<InvertedIndexFileInfo>& idx_file_info) { + const std::vector<const InvertedIndexFileInfo*>& idx_file_info) { _rowset_meta_pb.set_enable_inverted_index_file_info(true); for (auto finfo : idx_file_info) { auto* new_file_info = _rowset_meta_pb.add_inverted_index_file_info(); - *new_file_info = finfo; + *new_file_info = *finfo; } } -void RowsetMeta::update_inverted_index_files_info( - const std::vector<InvertedIndexFileInfo>& idx_file_info) { - _rowset_meta_pb.clear_inverted_index_file_info(); - add_inverted_index_files_info(idx_file_info); -} - bool operator==(const RowsetMeta& a, const RowsetMeta& b) { if (a._rowset_id != b._rowset_id) return false; if (a._is_removed_from_rowset_meta != b._is_removed_from_rowset_meta) return false; diff --git a/be/src/olap/rowset/rowset_meta.h b/be/src/olap/rowset/rowset_meta.h index 4f25c676f6bd7f..46121aeae2be6d 
100644 --- a/be/src/olap/rowset/rowset_meta.h +++ b/be/src/olap/rowset/rowset_meta.h @@ -25,6 +25,7 @@ #include #include "io/fs/file_system.h" +#include "olap/metadata_adder.h" #include "olap/olap_common.h" #include "olap/rowset/rowset_fwd.h" #include "olap/storage_policy.h" @@ -33,7 +34,7 @@ namespace doris { -class RowsetMeta { +class RowsetMeta : public MetadataAdder<RowsetMeta> { public: RowsetMeta() = default; ~RowsetMeta(); @@ -363,9 +364,10 @@ class RowsetMeta { return _rowset_meta_pb.inverted_index_file_info(); } - void add_inverted_index_files_info(const std::vector<InvertedIndexFileInfo>& idx_file_info); + void add_inverted_index_files_info( + const std::vector<const InvertedIndexFileInfo*>& idx_file_info); - void update_inverted_index_files_info(const std::vector<InvertedIndexFileInfo>& idx_file_info); + int64_t get_metadata_size() const override; // Because the member field '_handle' is a raw pointer, use member func 'init' to replace copy ctor RowsetMeta(const RowsetMeta&) = delete; diff --git a/be/src/olap/rowset/rowset_reader_context.h b/be/src/olap/rowset/rowset_reader_context.h index 0d4f5897772ad5..fd4fe7a18234f1 100644 --- a/be/src/olap/rowset/rowset_reader_context.h +++ b/be/src/olap/rowset/rowset_reader_context.h @@ -21,6 +21,7 @@ #include "io/io_common.h" #include "olap/column_predicate.h" #include "olap/olap_common.h" +#include "olap/rowid_conversion.h" #include "runtime/runtime_state.h" #include "vec/exprs/vexpr.h" #include "vec/exprs/vexpr_context.h" @@ -75,14 +76,13 @@ struct RowsetReaderContext { bool enable_unique_key_merge_on_write = false; const DeleteBitmap* delete_bitmap = nullptr; bool record_rowids = false; - bool is_vertical_compaction = false; + RowIdConversion* rowid_conversion; bool is_key_column_group = false; const std::set<int32_t>* output_columns = nullptr; RowsetId rowset_id; // slots that cast may be eliminated in storage layer std::map<std::string, PrimitiveType> target_cast_type_for_variants; int64_t ttl_seconds = 0; - size_t topn_limit = 0; }; } // namespace doris diff --git a/be/src/olap/rowset/rowset_writer.h b/be/src/olap/rowset/rowset_writer.h index 6861b8ab7e2ce6..ad42982488b316 100644 --- a/be/src/olap/rowset/rowset_writer.h +++ b/be/src/olap/rowset/rowset_writer.h @@ -31,6 +31,7 @@ #include "olap/column_mapping.h" #include "olap/rowset/rowset.h" #include "olap/rowset/rowset_writer_context.h" +#include "olap/rowset/segment_v2/inverted_index_file_writer.h" #include "olap/tablet_fwd.h" #include "olap/tablet_schema.h" #include "vec/core/block.h" @@ -95,6 +96,24 @@ class RowsetWriter { return Status::NotSupported("RowsetWriter does not support create_file_writer"); } + virtual Status create_inverted_index_file_writer( + uint32_t segment_id, InvertedIndexFileWriterPtr* index_file_writer) { + // Create file writer for the inverted index format v2. + io::FileWriterPtr idx_file_v2_ptr; + if (_context.tablet_schema->get_inverted_index_storage_format() != + InvertedIndexStorageFormatPB::V1) { + RETURN_IF_ERROR( + create_file_writer(segment_id, idx_file_v2_ptr, FileType::INVERTED_INDEX_FILE)); + } + std::string segment_prefix {InvertedIndexDescriptor::get_index_file_path_prefix( + _context.segment_path(segment_id))}; + *index_file_writer = std::make_unique<InvertedIndexFileWriter>( + _context.fs(), segment_prefix, _context.rowset_id.to_string(), segment_id, + _context.tablet_schema->get_inverted_index_storage_format(), + std::move(idx_file_v2_ptr)); + return Status::OK(); + } + // explicit flush all buffered rows into segment file. 
// note that `add_row` could also trigger flush when certain conditions are met virtual Status flush() = 0; diff --git a/be/src/olap/rowset/rowset_writer_context.h b/be/src/olap/rowset/rowset_writer_context.h index e13f7efe6e94fa..cb0fda83e60777 100644 --- a/be/src/olap/rowset/rowset_writer_context.h +++ b/be/src/olap/rowset/rowset_writer_context.h @@ -79,8 +79,8 @@ struct RowsetWriterContext { int64_t newest_write_timestamp = -1; bool enable_unique_key_merge_on_write = false; - // store column_unique_id to skip write inverted index - std::set<int32_t> skip_inverted_index; + // store column_unique_id to do index compaction + std::set<int32_t> columns_to_do_index_compaction; DataWriteType write_type = DataWriteType::TYPE_DEFAULT; BaseTabletSPtr tablet = nullptr; diff --git a/be/src/olap/rowset/segcompaction.cpp b/be/src/olap/rowset/segcompaction.cpp index d6bdb9387e98fd..427236a6119673 100644 --- a/be/src/olap/rowset/segcompaction.cpp +++ b/be/src/olap/rowset/segcompaction.cpp @@ -69,9 +69,17 @@ using namespace ErrorCode; SegcompactionWorker::SegcompactionWorker(BetaRowsetWriter* writer) : _writer(writer) {} -void SegcompactionWorker::init_mem_tracker(int64_t txn_id) { +void SegcompactionWorker::init_mem_tracker(const RowsetWriterContext& rowset_writer_context) { _seg_compact_mem_tracker = MemTrackerLimiter::create_shared( - MemTrackerLimiter::Type::COMPACTION, "segcompaction-" + std::to_string(txn_id)); + MemTrackerLimiter::Type::COMPACTION, + fmt::format("segcompaction-txnID_{}-loadID_{}-tabletID_{}-indexID_{}-" + "partitionID_{}-version_{}", + std::to_string(rowset_writer_context.txn_id), + print_id(rowset_writer_context.load_id), + std::to_string(rowset_writer_context.tablet_id), + std::to_string(rowset_writer_context.index_id), + std::to_string(rowset_writer_context.partition_id), + rowset_writer_context.version.to_string())); } Status SegcompactionWorker::_get_segcompaction_reader( @@ -157,8 +165,7 @@ Status SegcompactionWorker::_delete_original_segments(uint32_t begin, uint32_t e } // Delete inverted index files for (auto&& column : schema->columns()) { - if (schema->has_inverted_index(*column)) { - const auto* index_info = schema->get_inverted_index(*column); + if (const auto* index_info = schema->inverted_index(*column); index_info != nullptr) { auto index_id = index_info->index_id(); if (schema->get_inverted_index_storage_format() == InvertedIndexStorageFormatPB::V1) { @@ -224,7 +231,7 @@ Status SegcompactionWorker::_check_correctness(OlapReaderStatistics& reader_stat Status SegcompactionWorker::_create_segment_writer_for_segcompaction( std::unique_ptr<segment_v2::SegmentWriter>* writer, uint32_t begin, uint32_t end) { - return _writer->_create_segment_writer_for_segcompaction(writer, begin, end); + return _writer->create_segment_writer_for_segcompaction(writer, begin, end); } Status SegcompactionWorker::_do_compact_segments(SegCompactionCandidatesSharedPtr segments) { @@ -316,7 +323,9 @@ Status SegcompactionWorker::_do_compact_segments(SegCompactionCandidatesSharedPt _writer->_num_segcompacted); } RETURN_IF_ERROR(_writer->_rename_compacted_segments(begin, end)); - + if (_inverted_index_file_writer != nullptr) { + _inverted_index_file_writer.reset(); + } if (VLOG_DEBUG_IS_ON) { _writer->vlog_buffer.clear(); for (const auto& entry : std::filesystem::directory_iterator(ctx.tablet_path)) { diff --git a/be/src/olap/rowset/segcompaction.h b/be/src/olap/rowset/segcompaction.h index d498a5b8e33016..5ec74c0e660963 100644 --- a/be/src/olap/rowset/segcompaction.h +++ b/be/src/olap/rowset/segcompaction.h @@ -25,6 +25,7 @@ #include 
"olap/merger.h" #include "olap/simple_rowid_conversion.h" #include "olap/tablet.h" +#include "segment_v2/inverted_index_file_writer.h" #include "segment_v2/segment.h" namespace doris { @@ -69,11 +70,14 @@ class SegcompactionWorker { DeleteBitmapPtr get_converted_delete_bitmap() { return _converted_delete_bitmap; } io::FileWriterPtr& get_file_writer() { return _file_writer; } + InvertedIndexFileWriterPtr& get_inverted_index_file_writer() { + return _inverted_index_file_writer; + } // set the cancel flag, tasks already started will not be cancelled. bool cancel(); - void init_mem_tracker(int64_t txn_id); + void init_mem_tracker(const RowsetWriterContext& rowset_writer_context); private: Status _create_segment_writer_for_segcompaction( @@ -96,6 +100,7 @@ class SegcompactionWorker { // Currently cloud storage engine doesn't need segcompaction BetaRowsetWriter* _writer = nullptr; io::FileWriterPtr _file_writer; + InvertedIndexFileWriterPtr _inverted_index_file_writer = nullptr; // for unique key mow table std::unique_ptr _rowid_conversion = nullptr; diff --git a/be/src/olap/rowset/segment_creator.cpp b/be/src/olap/rowset/segment_creator.cpp index 1afd3215db42f6..e0eb7534123a86 100644 --- a/be/src/olap/rowset/segment_creator.cpp +++ b/be/src/olap/rowset/segment_creator.cpp @@ -53,8 +53,8 @@ namespace doris { using namespace ErrorCode; SegmentFlusher::SegmentFlusher(RowsetWriterContext& context, SegmentFileCollection& seg_files, - InvertedIndexFilesInfo& idx_files_info) - : _context(context), _seg_files(seg_files), _idx_files_info(idx_files_info) {} + InvertedIndexFileCollection& idx_files) + : _context(context), _seg_files(seg_files), _idx_files(idx_files) {} SegmentFlusher::~SegmentFlusher() = default; @@ -140,13 +140,10 @@ Status SegmentFlusher::_create_segment_writer(std::unique_ptrcreate(segment_id, segment_file_writer)); - io::FileWriterPtr inverted_file_writer; - if (_context.tablet_schema->has_inverted_index() && - _context.tablet_schema->get_inverted_index_storage_format() >= - InvertedIndexStorageFormatPB::V2 && - _context.memtable_on_sink_support_index_v2) { - RETURN_IF_ERROR(_context.file_writer_creator->create(segment_id, inverted_file_writer, - FileType::INVERTED_INDEX_FILE)); + InvertedIndexFileWriterPtr inverted_index_file_writer; + if (_context.tablet_schema->has_inverted_index()) { + RETURN_IF_ERROR( + _context.file_writer_creator->create(segment_id, &inverted_index_file_writer)); } segment_v2::SegmentWriterOptions writer_options; @@ -161,8 +158,11 @@ Status SegmentFlusher::_create_segment_writer(std::unique_ptr( segment_file_writer.get(), segment_id, _context.tablet_schema, _context.tablet, - _context.data_dir, writer_options, std::move(inverted_file_writer)); + _context.data_dir, writer_options, inverted_index_file_writer.get()); RETURN_IF_ERROR(_seg_files.add(segment_id, std::move(segment_file_writer))); + if (_context.tablet_schema->has_inverted_index()) { + RETURN_IF_ERROR(_idx_files.add(segment_id, std::move(inverted_index_file_writer))); + } auto s = writer->init(); if (!s.ok()) { LOG(WARNING) << "failed to init segment writer: " << s.to_string(); @@ -178,13 +178,10 @@ Status SegmentFlusher::_create_segment_writer( io::FileWriterPtr segment_file_writer; RETURN_IF_ERROR(_context.file_writer_creator->create(segment_id, segment_file_writer)); - io::FileWriterPtr inverted_file_writer; - if (_context.tablet_schema->has_inverted_index() && - _context.tablet_schema->get_inverted_index_storage_format() >= - InvertedIndexStorageFormatPB::V2 && - 
_context.memtable_on_sink_support_index_v2) { - RETURN_IF_ERROR(_context.file_writer_creator->create(segment_id, inverted_file_writer, - FileType::INVERTED_INDEX_FILE)); + InvertedIndexFileWriterPtr inverted_index_file_writer; + if (_context.tablet_schema->has_inverted_index()) { + RETURN_IF_ERROR( + _context.file_writer_creator->create(segment_id, &inverted_index_file_writer)); } segment_v2::VerticalSegmentWriterOptions writer_options; @@ -198,8 +195,11 @@ writer = std::make_unique<segment_v2::VerticalSegmentWriter>( segment_file_writer.get(), segment_id, _context.tablet_schema, _context.tablet, - _context.data_dir, writer_options, std::move(inverted_file_writer)); + _context.data_dir, writer_options, inverted_index_file_writer.get()); RETURN_IF_ERROR(_seg_files.add(segment_id, std::move(segment_file_writer))); + if (_context.tablet_schema->has_inverted_index()) { + RETURN_IF_ERROR(_idx_files.add(segment_id, std::move(inverted_index_file_writer))); + } auto s = writer->init(); if (!s.ok()) { LOG(WARNING) << "failed to init segment writer: " << s.to_string(); @@ -225,12 +225,16 @@ Status SegmentFlusher::_flush_segment_writer( if (row_num == 0) { return Status::OK(); } - uint64_t segment_size; - uint64_t index_size; - Status s = writer->finalize(&segment_size, &index_size); + uint64_t segment_file_size; + uint64_t common_index_size; + Status s = writer->finalize(&segment_file_size, &common_index_size); if (!s.ok()) { return Status::Error(s.code(), "failed to finalize segment: {}", s.to_string()); } + + int64_t inverted_index_file_size = 0; + RETURN_IF_ERROR(writer->close_inverted_index(&inverted_index_file_size)); + VLOG_DEBUG << "tablet_id:" << _context.tablet_id << " flushing filename: " << writer->data_dir_path() << " rowset_id:" << _context.rowset_id; @@ -245,17 +249,20 @@ Status SegmentFlusher::_flush_segment_writer( uint32_t segment_id = writer->segment_id(); SegmentStatistics segstat; segstat.row_num = row_num; - segstat.data_size = segment_size + (*writer)->get_inverted_index_total_size(); - segstat.index_size = index_size + (*writer)->get_inverted_index_total_size(); + segstat.data_size = segment_file_size; + segstat.index_size = inverted_index_file_size; segstat.key_bounds = key_bounds; + LOG(INFO) << "tablet_id:" << _context.tablet_id + << ", flushing rowset_dir: " << _context.tablet_path + << ", rowset_id:" << _context.rowset_id << ", data size:" << segstat.data_size + << ", index size:" << segstat.index_size; - _idx_files_info.add_file_info(segment_id, writer->get_inverted_index_file_info()); writer.reset(); RETURN_IF_ERROR(_context.segment_collector->add(segment_id, segstat, flush_schema)); if (flush_size) { - *flush_size = segment_size + index_size; + *flush_size = segment_file_size; } return Status::OK(); } @@ -271,12 +278,16 @@ Status SegmentFlusher::_flush_segment_writer(std::unique_ptr<segment_v2::Vertic - uint64_t segment_size; - uint64_t index_size; - Status s = writer->finalize(&segment_size, &index_size); + uint64_t segment_file_size; + uint64_t common_index_size; + Status s = writer->finalize(&segment_file_size, &common_index_size); if (!s.ok()) { return Status::Error(s.code(), "failed to finalize segment: {}", s.to_string()); } + + int64_t inverted_index_file_size = 0; + RETURN_IF_ERROR(writer->close_inverted_index(&inverted_index_file_size)); + VLOG_DEBUG << "tablet_id:" << _context.tablet_id << " flushing rowset_dir: " << _context.tablet_path << " rowset_id:" << _context.rowset_id; @@ -291,17 +302,20 @@ Status SegmentFlusher::_flush_segment_writer(std::unique_ptr<segment_v2::Vertic uint32_t segment_id = writer->get_segment_id(); SegmentStatistics segstat; segstat.row_num = row_num; - 
segstat.data_size = segment_size + writer->get_inverted_index_total_size(); - segstat.index_size = index_size + writer->get_inverted_index_total_size(); + segstat.data_size = segment_file_size; + segstat.index_size = inverted_index_file_size; segstat.key_bounds = key_bounds; + LOG(INFO) << "tablet_id:" << _context.tablet_id + << ", flushing rowset_dir: " << _context.tablet_path + << ", rowset_id:" << _context.rowset_id << ", data size:" << segstat.data_size + << ", index size:" << segstat.index_size; - _idx_files_info.add_file_info(segment_id, writer->get_inverted_index_file_info()); writer.reset(); RETURN_IF_ERROR(_context.segment_collector->add(segment_id, segstat, flush_schema)); if (flush_size) { - *flush_size = segment_size + index_size; + *flush_size = segment_file_size; } return Status::OK(); } @@ -330,8 +344,8 @@ int64_t SegmentFlusher::Writer::max_row_to_add(size_t row_avg_size_in_bytes) { } SegmentCreator::SegmentCreator(RowsetWriterContext& context, SegmentFileCollection& seg_files, - InvertedIndexFilesInfo& idx_files_info) - : _segment_flusher(context, seg_files, idx_files_info) {} + InvertedIndexFileCollection& idx_files) + : _segment_flusher(context, seg_files, idx_files) {} Status SegmentCreator::add_block(const vectorized::Block* block) { if (block->rows() == 0) { diff --git a/be/src/olap/rowset/segment_creator.h b/be/src/olap/rowset/segment_creator.h index c862fce87a43bd..f8afd5798927d4 100644 --- a/be/src/olap/rowset/segment_creator.h +++ b/be/src/olap/rowset/segment_creator.h @@ -29,6 +29,7 @@ #include "io/fs/file_reader_writer_fwd.h" #include "olap/olap_common.h" #include "olap/rowset/rowset_writer_context.h" +#include "olap/rowset/segment_v2/inverted_index_file_writer.h" #include "olap/tablet_fwd.h" #include "util/spinlock.h" #include "vec/core/block.h" @@ -46,7 +47,7 @@ class VerticalSegmentWriter; struct SegmentStatistics; class BetaRowsetWriter; class SegmentFileCollection; -class InvertedIndexFilesInfo; +class InvertedIndexFileCollection; class FileWriterCreator { public: @@ -54,9 +55,12 @@ class FileWriterCreator { virtual Status create(uint32_t segment_id, io::FileWriterPtr& file_writer, FileType file_type = FileType::SEGMENT_FILE) = 0; + + virtual Status create(uint32_t segment_id, InvertedIndexFileWriterPtr* file_writer) = 0; }; template <class T> + requires std::is_base_of_v<RowsetWriter, T> class FileWriterCreatorT : public FileWriterCreator { public: explicit FileWriterCreatorT(T* t) : _t(t) {} @@ -66,6 +70,10 @@ class FileWriterCreatorT : public FileWriterCreator { return _t->create_file_writer(segment_id, file_writer, file_type); } + Status create(uint32_t segment_id, InvertedIndexFileWriterPtr* file_writer) override { + return _t->create_inverted_index_file_writer(segment_id, file_writer); + } + private: T* _t = nullptr; }; @@ -79,6 +87,7 @@ class SegmentCollector { }; template <class T> + requires std::is_base_of_v<RowsetWriter, T> class SegmentCollectorT : public SegmentCollector { public: explicit SegmentCollectorT(T* t) : _t(t) {} @@ -95,7 +104,7 @@ class SegmentCollectorT : public SegmentCollector { class SegmentFlusher { public: SegmentFlusher(RowsetWriterContext& context, SegmentFileCollection& seg_files, - InvertedIndexFilesInfo& idx_files_info); + InvertedIndexFileCollection& idx_files); ~SegmentFlusher(); @@ -164,7 +173,7 @@ class SegmentFlusher { private: RowsetWriterContext& _context; SegmentFileCollection& _seg_files; - InvertedIndexFilesInfo& _idx_files_info; + InvertedIndexFileCollection& _idx_files; // written rows by add_block/add_row std::atomic<int64_t> _num_rows_written = 0; @@ -177,7 +186,7 @@ 
class SegmentFlusher { class SegmentCreator { public: SegmentCreator(RowsetWriterContext& context, SegmentFileCollection& seg_files, - InvertedIndexFilesInfo& idx_files_info); + InvertedIndexFileCollection& idx_files); ~SegmentCreator() = default; diff --git a/be/src/olap/rowset/segment_v2/bitmap_index_reader.h b/be/src/olap/rowset/segment_v2/bitmap_index_reader.h index 9753972583ea20..8d344e43ac727d 100644 --- a/be/src/olap/rowset/segment_v2/bitmap_index_reader.h +++ b/be/src/olap/rowset/segment_v2/bitmap_index_reader.h @@ -41,7 +41,7 @@ namespace segment_v2 { class BitmapIndexIterator; class BitmapIndexPB; -class BitmapIndexReader { +class BitmapIndexReader : public MetadataAdder<BitmapIndexReader> { public: explicit BitmapIndexReader(io::FileReaderSPtr file_reader, const BitmapIndexPB& index_meta) : _file_reader(std::move(file_reader)), diff --git a/be/src/olap/rowset/segment_v2/bloom_filter_index_reader.cpp b/be/src/olap/rowset/segment_v2/bloom_filter_index_reader.cpp index 0857c1890c47ce..609d21ce4f5c22 100644 --- a/be/src/olap/rowset/segment_v2/bloom_filter_index_reader.cpp +++ b/be/src/olap/rowset/segment_v2/bloom_filter_index_reader.cpp @@ -31,18 +31,26 @@ namespace doris { namespace segment_v2 { -Status BloomFilterIndexReader::load(bool use_page_cache, bool kept_in_memory) { +Status BloomFilterIndexReader::load(bool use_page_cache, bool kept_in_memory, + OlapReaderStatistics* index_load_stats) { // TODO yyq: implement a new once flag to avoid status construct. + _index_load_stats = index_load_stats; return _load_once.call([this, use_page_cache, kept_in_memory] { return _load(use_page_cache, kept_in_memory); }); } +int64_t BloomFilterIndexReader::get_metadata_size() const { + return sizeof(BloomFilterIndexReader) + + (_bloom_filter_index_meta ? _bloom_filter_index_meta->ByteSizeLong() : 0); +} + Status BloomFilterIndexReader::_load(bool use_page_cache, bool kept_in_memory) { const IndexedColumnMetaPB& bf_index_meta = _bloom_filter_index_meta->bloom_filter(); _bloom_filter_reader.reset(new IndexedColumnReader(_file_reader, bf_index_meta)); - RETURN_IF_ERROR(_bloom_filter_reader->load(use_page_cache, kept_in_memory)); + RETURN_IF_ERROR(_bloom_filter_reader->load(use_page_cache, kept_in_memory, _index_load_stats)); + update_metadata_size(); return Status::OK(); } diff --git a/be/src/olap/rowset/segment_v2/bloom_filter_index_reader.h b/be/src/olap/rowset/segment_v2/bloom_filter_index_reader.h index c2617ef4e4e980..fcb0239a2440fa 100644 --- a/be/src/olap/rowset/segment_v2/bloom_filter_index_reader.h +++ b/be/src/olap/rowset/segment_v2/bloom_filter_index_reader.h @@ -38,7 +38,7 @@ class BloomFilterIndexIterator; class BloomFilter; class BloomFilterIndexPB; -class BloomFilterIndexReader { +class BloomFilterIndexReader : public MetadataAdder<BloomFilterIndexReader> { public: explicit BloomFilterIndexReader(io::FileReaderSPtr file_reader, const BloomFilterIndexPB& bloom_filter_index_meta) @@ -47,7 +47,8 @@ class BloomFilterIndexReader { _bloom_filter_index_meta.reset(new BloomFilterIndexPB(bloom_filter_index_meta)); } - Status load(bool use_page_cache, bool kept_in_memory); + Status load(bool use_page_cache, bool kept_in_memory, + OlapReaderStatistics* _bf_index_load_stats = nullptr); BloomFilterAlgorithmPB algorithm() { return _bloom_filter_index_meta->algorithm(); } @@ -59,6 +60,8 @@ class BloomFilterIndexReader { private: Status _load(bool use_page_cache, bool kept_in_memory); + int64_t get_metadata_size() const override; + private: friend class BloomFilterIndexIterator; @@ -67,6 +70,7 @@ class BloomFilterIndexReader { const 
TypeInfo* _type_info = nullptr; std::unique_ptr<BloomFilterIndexPB> _bloom_filter_index_meta = nullptr; std::unique_ptr<IndexedColumnReader> _bloom_filter_reader; + OlapReaderStatistics* _index_load_stats = nullptr; }; class BloomFilterIndexIterator { diff --git a/be/src/olap/rowset/segment_v2/column_reader.cpp b/be/src/olap/rowset/segment_v2/column_reader.cpp index 3c9b5b7ce7e5ab..aad3725d5a3f6e 100644 --- a/be/src/olap/rowset/segment_v2/column_reader.cpp +++ b/be/src/olap/rowset/segment_v2/column_reader.cpp @@ -86,8 +86,6 @@ inline bool read_as_string(PrimitiveType type) { type == PrimitiveType::TYPE_OBJECT; } -static bvar::Adder<int64_t> g_column_reader_memory_bytes("doris_column_reader_memory_bytes"); -static bvar::Adder<int64_t> g_column_reader_num("doris_column_reader_num"); Status ColumnReader::create_array(const ColumnReaderOptions& opts, const ColumnMetaPB& meta, const io::FileReaderSPtr& file_reader, std::unique_ptr<ColumnReader>* reader) { @@ -276,14 +274,12 @@ ColumnReader::ColumnReader(const ColumnReaderOptions& opts, const ColumnMetaPB& _meta_is_nullable = meta.is_nullable(); _meta_dict_page = meta.dict_page(); _meta_compression = meta.compression(); - - g_column_reader_memory_bytes << sizeof(*this); - g_column_reader_num << 1; } -ColumnReader::~ColumnReader() { - g_column_reader_memory_bytes << -sizeof(*this); - g_column_reader_num << -1; +ColumnReader::~ColumnReader() = default; + +int64_t ColumnReader::get_metadata_size() const { - return sizeof(ColumnReader) + (_segment_zone_map ? _segment_zone_map->ByteSizeLong() : 0); } Status ColumnReader::init(const ColumnMetaPB* meta) { @@ -323,6 +319,7 @@ Status ColumnReader::init(const ColumnMetaPB* meta) { _file_reader->path().native(), index_meta.type()); } } + update_metadata_size(); // ArrayColumnWriter writes a single empty array and flushes. In this scenario, // the item writer doesn't write any data and the corresponding ordinal index is empty. @@ -1595,28 +1592,15 @@ void DefaultValueColumnIterator::_insert_many_default(vectorized::MutableColumnP } } -Status VariantRootColumnIterator::next_batch(size_t* n, vectorized::MutableColumnPtr& dst, - bool* has_null) { - size_t size = dst->size(); +Status VariantRootColumnIterator::_process_root_column( + vectorized::MutableColumnPtr& dst, vectorized::MutableColumnPtr& root_column, + const vectorized::DataTypePtr& most_common_type) { auto& obj = dst->is_nullable() ? 
assert_cast<vectorized::ColumnObject&>( assert_cast<vectorized::ColumnNullable&>(*dst).get_nested_column()) : assert_cast<vectorized::ColumnObject&>(*dst); - if (obj.is_null_root()) { - obj.create_root(); - } - if (!obj.is_finalized()) { - obj.finalize(); - } - auto root_column = obj.get_root(); - RETURN_IF_ERROR(_inner_iter->next_batch(n, root_column, has_null)); - obj.incr_num_rows(*n); - for (auto& entry : obj.get_subcolumns()) { - if (entry->data.size() != size + *n) { - entry->data.insert_many_defaults(*n); - } - } + // fill nullmap if (root_column->is_nullable() && dst->is_nullable()) { vectorized::ColumnUInt8& dst_null_map = assert_cast<vectorized::ColumnNullable&>(*dst).get_null_map_column(); vectorized::ColumnUInt8& src_null_map = assert_cast<vectorized::ColumnNullable&>(*root_column).get_null_map_column(); dst_null_map.insert_range_from(src_null_map, 0, src_null_map.size()); } + + // add root column to a tmp object column + auto tmp = vectorized::ColumnObject::create(true, false); + auto& tmp_obj = assert_cast<vectorized::ColumnObject&>(*tmp); + tmp_obj.add_sub_column({}, std::move(root_column), most_common_type); + + // merge tmp object column to dst + obj.insert_range_from(*tmp, 0, tmp->size()); + + // finalize object if needed + if (!obj.is_finalized()) { + obj.finalize(); + } + #ifndef NDEBUG obj.check_consistency(); #endif + return Status::OK(); } +Status VariantRootColumnIterator::next_batch(size_t* n, vectorized::MutableColumnPtr& dst, + bool* has_null) { + // read root column + auto& obj = + dst->is_nullable() + ? assert_cast<vectorized::ColumnObject&>( + assert_cast<vectorized::ColumnNullable&>(*dst).get_nested_column()) + : assert_cast<vectorized::ColumnObject&>(*dst); + + auto most_common_type = obj.get_most_common_type(); + auto root_column = most_common_type->create_column(); + RETURN_IF_ERROR(_inner_iter->next_batch(n, root_column, has_null)); + + return _process_root_column(dst, root_column, most_common_type); +} + Status VariantRootColumnIterator::read_by_rowids(const rowid_t* rowids, const size_t count, vectorized::MutableColumnPtr& dst) { - size_t size = dst->size(); + // read root column auto& obj = dst->is_nullable() ? 
assert_cast<vectorized::ColumnObject&>( assert_cast<vectorized::ColumnNullable&>(*dst).get_nested_column()) : assert_cast<vectorized::ColumnObject&>(*dst); - if (obj.is_null_root()) { - obj.create_root(); - } - if (!obj.is_finalized()) { - obj.finalize(); - } - auto root_column = obj.get_root(); + + auto most_common_type = obj.get_most_common_type(); + auto root_column = most_common_type->create_column(); RETURN_IF_ERROR(_inner_iter->read_by_rowids(rowids, count, root_column)); - obj.incr_num_rows(count); - for (auto& entry : obj.get_subcolumns()) { - if (entry->data.size() != (size + count)) { - entry->data.insert_many_defaults(count); - } - } - // fill nullmap - if (root_column->is_nullable() && dst->is_nullable()) { - vectorized::ColumnUInt8& dst_null_map = - assert_cast<vectorized::ColumnNullable&>(*dst).get_null_map_column(); - vectorized::ColumnUInt8& src_null_map = - assert_cast<vectorized::ColumnNullable&>(*root_column).get_null_map_column(); - DCHECK_EQ(src_null_map.size() - size, count); - dst_null_map.insert_range_from(src_null_map, size, count); - } -#ifndef NDEBUG - obj.check_consistency(); -#endif - return Status::OK(); + + return _process_root_column(dst, root_column, most_common_type); } Status DefaultNestedColumnIterator::next_batch(size_t* n, vectorized::MutableColumnPtr& dst) { diff --git a/be/src/olap/rowset/segment_v2/column_reader.h b/be/src/olap/rowset/segment_v2/column_reader.h index 6727ea7dc8182c..d72d802f97769b 100644 --- a/be/src/olap/rowset/segment_v2/column_reader.h +++ b/be/src/olap/rowset/segment_v2/column_reader.h @@ -47,7 +47,6 @@ #include "vec/columns/column.h" #include "vec/columns/column_array.h" // ColumnArray #include "vec/columns/subcolumn_tree.h" -#include "vec/common/hash_table/hash_map_context_creator.h" #include "vec/data_types/data_type.h" #include "vec/json/path_in_data.h" @@ -111,7 +110,7 @@ struct ColumnIteratorOptions { // we should do our best to reduce resource usage through share // same information, such as OrdinalPageIndex and Page data. // This will cache data shared by all reader -class ColumnReader { +class ColumnReader : public MetadataAdder<ColumnReader> { public: // Create an initialized ColumnReader in *reader. // This should be a lightweight operation without I/O. 
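The change just above is the recurring pattern of this patch: BitmapIndexReader, BloomFilterIndexReader, ColumnReader, IndexPageReader, IndexedColumnReader and RowsetMeta all now inherit from MetadataAdder<T>, override get_metadata_size(), and call update_metadata_size() once their protobuf-backed metadata is materialized, replacing the per-class bvar counters (g_column_reader_memory_bytes, g_index_reader_memory_bytes) that the patch deletes. The base class itself lives in olap/metadata_adder.h, which is not part of this diff, so the following is only a minimal sketch of what such a CRTP base could look like, inferred from the call sites here; the counter, its name, and the delta bookkeeping are assumptions, not the actual Doris implementation:

#include <atomic>
#include <cstdint>

// Hypothetical sketch of olap/metadata_adder.h. Only the class name,
// get_metadata_size() and update_metadata_size() appear in the diff;
// everything else is an assumption for illustration.
template <typename T>
class MetadataAdder {
public:
    MetadataAdder() = default;
    // Use the cached size on destruction so no virtual call is needed here.
    virtual ~MetadataAdder() { add_delta(-_last_reported_size); }

    // Subclasses report their current footprint, typically sizeof(*this)
    // plus ByteSizeLong() of any retained protobuf message.
    virtual int64_t get_metadata_size() const { return sizeof(T); }

protected:
    // Called at the end of init()/load()/parse(), once the real size is
    // known; folds the delta into a per-type counter.
    void update_metadata_size() {
        int64_t size = get_metadata_size();
        add_delta(size - _last_reported_size);
        _last_reported_size = size;
    }

private:
    static void add_delta(int64_t delta) {
        // Assumed per-type counter; the real implementation presumably feeds
        // a global metadata-memory metric instead.
        static std::atomic<int64_t> total_bytes {0};
        total_bytes.fetch_add(delta, std::memory_order_relaxed);
    }
    int64_t _last_reported_size = 0;
};

Under this reading, ColumnReader::init(), IndexPageReader::parse(), IndexedColumnReader::load(), BloomFilterIndexReader::_load() and RowsetMeta::merge_rowset_meta() all end with update_metadata_size() precisely because that is the first point at which the protobuf-derived sizes are final.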
@@ -244,6 +243,8 @@ class ColumnReader { Status _calculate_row_ranges(const std::vector<uint32_t>& page_indexes, RowRanges* row_ranges); + int64_t get_metadata_size() const override; + private: int64_t _meta_length; FieldType _meta_type; @@ -654,6 +655,9 @@ class VariantRootColumnIterator : public ColumnIterator { ordinal_t get_current_ordinal() const override { return _inner_iter->get_current_ordinal(); } private: + Status _process_root_column(vectorized::MutableColumnPtr& dst, + vectorized::MutableColumnPtr& root_column, + const vectorized::DataTypePtr& most_common_type); std::unique_ptr<FileColumnIterator> _inner_iter; }; @@ -723,7 +727,7 @@ class DefaultValueColumnIterator : public ColumnIterator { class DefaultNestedColumnIterator : public ColumnIterator { public: DefaultNestedColumnIterator(std::unique_ptr<ColumnIterator>&& sibling, - DataTypePtr file_column_type) + vectorized::DataTypePtr file_column_type) : _sibling_iter(std::move(sibling)), _file_column_type(std::move(file_column_type)) {} Status init(const ColumnIteratorOptions& opts) override { diff --git a/be/src/olap/rowset/segment_v2/column_writer.h b/be/src/olap/rowset/segment_v2/column_writer.h index 62f209db5ad4a5..2d66b940a3893b 100644 --- a/be/src/olap/rowset/segment_v2/column_writer.h +++ b/be/src/olap/rowset/segment_v2/column_writer.h @@ -63,7 +63,7 @@ struct ColumnWriterOptions { bool need_inverted_index = false; uint8_t gram_size; uint16_t gram_bf_size; - std::vector<const TabletIndex*> indexes; + std::vector<const TabletIndex*> indexes; // unused const TabletIndex* inverted_index = nullptr; InvertedIndexFileWriter* inverted_index_file_writer; std::string to_string() const { diff --git a/be/src/olap/rowset/segment_v2/index_page.cpp b/be/src/olap/rowset/segment_v2/index_page.cpp index 9af7047c49b39a..1b033a9ff62c74 100644 --- a/be/src/olap/rowset/segment_v2/index_page.cpp +++ b/be/src/olap/rowset/segment_v2/index_page.cpp @@ -64,6 +64,10 @@ Status IndexPageBuilder::get_first_key(Slice* key) const { /////////////////////////////////////////////////////////////////////////////// +int64_t IndexPageReader::get_metadata_size() const { + return sizeof(IndexPageReader) + _vl_field_mem_size; +} + Status IndexPageReader::parse(const Slice& body, const IndexPageFooterPB& footer) { _footer = footer; size_t num_entries = _footer.num_entries(); @@ -80,8 +84,13 @@ Status IndexPageReader::parse(const Slice& body, const IndexPageFooterPB& footer } _keys.push_back(key); _values.push_back(value); + _vl_field_mem_size += sizeof(char) * key.size; } + _vl_field_mem_size += + _keys.capacity() * sizeof(Slice) + _values.capacity() * sizeof(PagePointer); + _vl_field_mem_size += _footer.ByteSizeLong(); + update_metadata_size(); _parsed = true; return Status::OK(); } diff --git a/be/src/olap/rowset/segment_v2/index_page.h b/be/src/olap/rowset/segment_v2/index_page.h index 7b15ef66391ad7..0ebf425fc5ca99 100644 --- a/be/src/olap/rowset/segment_v2/index_page.h +++ b/be/src/olap/rowset/segment_v2/index_page.h @@ -26,6 +26,7 @@ #include #include "common/status.h" +#include "olap/metadata_adder.h" #include "olap/rowset/segment_v2/page_pointer.h" #include "util/faststring.h" #include "util/slice.h" @@ -79,7 +80,7 @@ class IndexPageBuilder { uint32_t _count = 0; }; -class IndexPageReader { +class IndexPageReader : public MetadataAdder<IndexPageReader> { public: IndexPageReader() : _parsed(false) {} @@ -110,11 +111,14 @@ class IndexPageReader { void reset(); private: + int64_t get_metadata_size() const override; + bool _parsed; IndexPageFooterPB _footer; std::vector<Slice> _keys; std::vector<PagePointer> _values; + int64_t _vl_field_mem_size {0}; }; class IndexPageIterator 
{ diff --git a/be/src/olap/rowset/segment_v2/indexed_column_reader.cpp b/be/src/olap/rowset/segment_v2/indexed_column_reader.cpp index 59251b5595dd07..3028211f266157 100644 --- a/be/src/olap/rowset/segment_v2/indexed_column_reader.cpp +++ b/be/src/olap/rowset/segment_v2/indexed_column_reader.cpp @@ -56,13 +56,17 @@ static bvar::Adder<uint64_t> g_index_reader_pk_pages("doris_pk", "index_reader_p static bvar::PerSecond<bvar::Adder<uint64_t>> g_index_reader_pk_bytes_per_second( "doris_pk", "index_reader_pk_pages_per_second", &g_index_reader_pk_pages, 60); -static bvar::Adder<int64_t> g_index_reader_memory_bytes("doris_index_reader_memory_bytes"); - using strings::Substitute; +int64_t IndexedColumnReader::get_metadata_size() const { + return sizeof(IndexedColumnReader) + _meta.ByteSizeLong(); +} + +Status IndexedColumnReader::load(bool use_page_cache, bool kept_in_memory, + OlapReaderStatistics* index_load_stats) { _use_page_cache = use_page_cache; _kept_in_memory = kept_in_memory; + _index_load_stats = index_load_stats; _type_info = get_scalar_type_info((FieldType)_meta.data_type()); if (_type_info == nullptr) { @@ -94,7 +98,7 @@ Status IndexedColumnReader::load(bool use_page_cache, bool kept_in_memory) { } _num_values = _meta.num_values(); - g_index_reader_memory_bytes << sizeof(*this); + update_metadata_size(); return Status::OK(); } @@ -105,7 +109,7 @@ Status IndexedColumnReader::load_index_page(const PagePointerPB& pp, PageHandle* BlockCompressionCodec* local_compress_codec; RETURN_IF_ERROR(get_block_compression_codec(_meta.compression(), &local_compress_codec)); RETURN_IF_ERROR(read_page(PagePointer(pp), handle, &body, &footer, INDEX_PAGE, - local_compress_codec, false)); + local_compress_codec, false, _index_load_stats)); RETURN_IF_ERROR(reader->parse(body, footer.index_page_footer())); _mem_size += body.get_size(); return Status::OK(); @@ -113,8 +117,10 @@ Status IndexedColumnReader::read_page(const PagePointer& pp, PageHandle* handle, Slice* body, PageFooterPB* footer, PageTypePB type, - BlockCompressionCodec* codec, bool pre_decode) const { + BlockCompressionCodec* codec, bool pre_decode, + OlapReaderStatistics* stats) const { OlapReaderStatistics tmp_stats; + OlapReaderStatistics* stats_ptr = stats != nullptr ? 
stats : &tmp_stats; PageReadOptions opts { .use_page_cache = _use_page_cache, .kept_in_memory = _kept_in_memory, @@ -123,9 +129,10 @@ Status IndexedColumnReader::read_page(const PagePointer& pp, PageHandle* handle, .file_reader = _file_reader.get(), .page_pointer = pp, .codec = codec, - .stats = &tmp_stats, + .stats = stats_ptr, .encoding_info = _encoding_info, - .io_ctx = io::IOContext {.is_index_data = true}, + .io_ctx = io::IOContext {.is_index_data = true, + .file_cache_stats = &stats_ptr->file_cache_stats}, }; if (_is_pk_index) { opts.type = PRIMARY_KEY_INDEX_PAGE; @@ -138,9 +145,7 @@ Status IndexedColumnReader::read_page(const PagePointer& pp, PageHandle* handle, return st; } -IndexedColumnReader::~IndexedColumnReader() { - g_index_reader_memory_bytes << -sizeof(*this); -} +IndexedColumnReader::~IndexedColumnReader() = default; /////////////////////////////////////////////////////////////////////////////// @@ -154,8 +159,8 @@ Status IndexedColumnIterator::_read_data_page(const PagePointer& pp) { PageHandle handle; Slice body; PageFooterPB footer; - RETURN_IF_ERROR( - _reader->read_page(pp, &handle, &body, &footer, DATA_PAGE, _compress_codec, true)); + RETURN_IF_ERROR(_reader->read_page(pp, &handle, &body, &footer, DATA_PAGE, _compress_codec, + true, _stats)); // parse data page // note that page_index is not used in IndexedColumnIterator, so we pass 0 PageDecoderOptions opts; diff --git a/be/src/olap/rowset/segment_v2/indexed_column_reader.h b/be/src/olap/rowset/segment_v2/indexed_column_reader.h index d156643a21c11d..c3469f9f6bed0d 100644 --- a/be/src/olap/rowset/segment_v2/indexed_column_reader.h +++ b/be/src/olap/rowset/segment_v2/indexed_column_reader.h @@ -27,6 +27,7 @@ #include "common/status.h" #include "io/fs/file_reader_writer_fwd.h" +#include "olap/olap_common.h" #include "olap/rowset/segment_v2/common.h" #include "olap/rowset/segment_v2/index_page.h" #include "olap/rowset/segment_v2/page_handle.h" @@ -46,18 +47,20 @@ namespace segment_v2 { class EncodingInfo; // thread-safe reader for IndexedColumn (see comments of `IndexedColumnWriter` to understand what IndexedColumn is) -class IndexedColumnReader { +class IndexedColumnReader : public MetadataAdder<IndexedColumnReader> { public: explicit IndexedColumnReader(io::FileReaderSPtr file_reader, const IndexedColumnMetaPB& meta) : _file_reader(std::move(file_reader)), _meta(meta) {} ~IndexedColumnReader(); - Status load(bool use_page_cache, bool kept_in_memory); + Status load(bool use_page_cache, bool kept_in_memory, + OlapReaderStatistics* index_load_stats = nullptr); // read a page specified by `pp' from `file' into `handle' Status read_page(const PagePointer& pp, PageHandle* handle, Slice* body, PageFooterPB* footer, - PageTypePB type, BlockCompressionCodec* codec, bool pre_decode) const; + PageTypePB type, BlockCompressionCodec* codec, bool pre_decode, + OlapReaderStatistics* stats = nullptr) const; int64_t num_values() const { return _num_values; } const EncodingInfo* encoding_info() const { return _encoding_info; } @@ -72,6 +75,8 @@ class IndexedColumnReader { private: Status load_index_page(const PagePointerPB& pp, PageHandle* handle, IndexPageReader* reader); + int64_t get_metadata_size() const override; + friend class IndexedColumnIterator; io::FileReaderSPtr _file_reader; @@ -95,14 +100,17 @@ class IndexedColumnReader { const KeyCoder* _value_key_coder = nullptr; uint64_t _mem_size = 0; bool _is_pk_index = false; + OlapReaderStatistics* _index_load_stats = nullptr; }; class IndexedColumnIterator { public: - explicit 
IndexedColumnIterator(const IndexedColumnReader* reader) + explicit IndexedColumnIterator(const IndexedColumnReader* reader, + OlapReaderStatistics* stats = nullptr) : _reader(reader), _ordinal_iter(&reader->_ordinal_index_reader), - _value_iter(&reader->_value_index_reader) {} + _value_iter(&reader->_value_index_reader), + _stats(stats) {} // Seek to the given ordinal entry. Entry 0 is the first entry. // Return Status::Error if provided seek point is past the end. @@ -151,6 +159,7 @@ class IndexedColumnIterator { ordinal_t _current_ordinal = 0; // iterator owned compress codec, should NOT be shared by threads, initialized before used BlockCompressionCodec* _compress_codec = nullptr; + OlapReaderStatistics* _stats = nullptr; }; } // namespace segment_v2 diff --git a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/analyzer.cpp b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/analyzer.cpp index 8ad1abb322f01f..94ba8fce0bc9c4 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/analyzer.cpp +++ b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/analyzer.cpp @@ -115,4 +115,21 @@ std::vector InvertedIndexAnalyzer::get_analyse_result( return analyse_result; } +std::vector InvertedIndexAnalyzer::get_analyse_result( + const std::string& search_str, const std::string& field_name, + InvertedIndexQueryType query_type, const std::map& properties) { + InvertedIndexCtxSPtr inverted_index_ctx = std::make_shared( + get_inverted_index_parser_type_from_string( + get_parser_string_from_properties(properties)), + get_parser_mode_string_from_properties(properties), + get_parser_char_filter_map_from_properties(properties), + get_parser_lowercase_from_properties(properties), + get_parser_stopwords_from_properties(properties)); + auto analyzer = create_analyzer(inverted_index_ctx.get()); + inverted_index_ctx->analyzer = analyzer.get(); + auto reader = create_reader(inverted_index_ctx->char_filter_map); + reader->init(search_str.data(), search_str.size(), true); + return get_analyse_result(reader.get(), analyzer.get(), field_name, query_type); +} + } // namespace doris::segment_v2::inverted_index diff --git a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/analyzer.h b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/analyzer.h index ad5d71a536420d..6f369d504b247d 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/analyzer.h +++ b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/analyzer.h @@ -32,6 +32,7 @@ class Analyzer; } // namespace lucene namespace doris::segment_v2::inverted_index { + class InvertedIndexAnalyzer { public: static std::unique_ptr create_reader(CharFilterMap& char_filter_map); @@ -44,5 +45,10 @@ class InvertedIndexAnalyzer { const std::string& field_name, InvertedIndexQueryType query_type, bool drop_duplicates = true); + + static std::vector get_analyse_result( + const std::string& search_str, const std::string& field_name, + InvertedIndexQueryType query_type, + const std::map& properties); }; } // namespace doris::segment_v2::inverted_index \ No newline at end of file diff --git a/be/src/olap/rowset/segment_v2/inverted_index/char_filter/char_filter_factory.h b/be/src/olap/rowset/segment_v2/inverted_index/char_filter/char_filter_factory.h index 561054863d7461..bebbea58f72d86 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index/char_filter/char_filter_factory.h +++ b/be/src/olap/rowset/segment_v2/inverted_index/char_filter/char_filter_factory.h @@ -27,6 +27,7 @@ class CharFilterFactory { public: template static 
lucene::analysis::CharFilter* create(const std::string& name, Args&&... args) { + DBUG_EXECUTE_IF("CharFilterFactory::create_return_nullptr", { return nullptr; }) if (name == INVERTED_INDEX_CHAR_FILTER_CHAR_REPLACE) { return new CharReplaceCharFilter(std::forward(args)...); } diff --git a/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_query.cpp b/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_query.cpp index 0ca2dce94e3dd2..9a3ecc68f89fa0 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_query.cpp +++ b/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_query.cpp @@ -17,8 +17,13 @@ #include "phrase_query.h" +#include +#include #include +#include "CLucene/index/Terms.h" +#include "olap/rowset/segment_v2/inverted_index/analyzer/analyzer.h" + namespace doris::segment_v2 { template @@ -141,19 +146,21 @@ void PhraseQuery::add(const InvertedIndexQueryInfo& query_info) { _slop = query_info.slop; if (_slop == 0 || query_info.ordered) { + if (query_info.ordered) { + _additional_terms = query_info.additional_terms; + } // Logic for no slop query and ordered phrase query add(query_info.field_name, query_info.terms); } else { // Simple slop query follows the default phrase query algorithm - auto query = std::make_unique(); + _phrase_query = std::make_unique(); for (const auto& term : query_info.terms) { std::wstring ws_term = StringUtil::string_to_wstring(term); auto* t = _CLNEW lucene::index::Term(query_info.field_name.c_str(), ws_term.c_str()); - query->add(t); + _phrase_query->add(t); _CLDECDELETE(t); } - query->setSlop(_slop); - _matcher = std::move(query); + _phrase_query->setSlop(_slop); } } @@ -173,13 +180,16 @@ void PhraseQuery::add(const std::wstring& field_name, const std::vector iterators; - auto ensureTermPosition = [this, &iterators, &field_name](const std::string& term) { + auto ensureTermPosition = [this, &iterators, &field_name](const std::string& term, + bool is_save_iter = true) { std::wstring ws_term = StringUtil::string_to_wstring(term); Term* t = _CLNEW Term(field_name.c_str(), ws_term.c_str()); _terms.push_back(t); TermPositions* term_pos = _searcher->getReader()->termPositions(t); _term_docs.push_back(term_pos); - iterators.emplace_back(term_pos); + if (is_save_iter) { + iterators.emplace_back(term_pos); + } return term_pos; }; @@ -190,16 +200,29 @@ void PhraseQuery::add(const std::wstring& field_name, const std::vector(_matcher)) { + if (_phrase_query) { _searcher->_search( - std::get(_matcher).get(), + _phrase_query.get(), [&roaring](const int32_t docid, const float_t /*score*/) { roaring.add(docid); }); } else { if (_lead1.isEmpty()) { @@ -288,17 +311,9 @@ int32_t PhraseQuery::do_next(int32_t doc) { } bool PhraseQuery::matches(int32_t doc) { - return std::visit( - [&doc](auto&& m) -> bool { - using T = std::decay_t; - if constexpr (std::is_same_v) { - _CLTHROWA(CL_ERR_IllegalArgument, - "PhraseQueryPtr does not support matches function"); - } else { - return m.matches(doc); - } - }, - _matcher); + return std::ranges::all_of(_matchers, [&doc](auto&& matcher) { + return std::visit([&doc](auto&& m) -> bool { return m.matches(doc); }, matcher); + }); } void PhraseQuery::parser_slop(std::string& query, InvertedIndexQueryInfo& query_info) { @@ -343,6 +358,24 @@ void PhraseQuery::parser_slop(std::string& query, InvertedIndexQueryInfo& query_ } } +void PhraseQuery::parser_info(std::string& query, const std::string& field_name, + InvertedIndexQueryType query_type, + const std::map& properties, + InvertedIndexQueryInfo& query_info, bool 
sequential_opt) { + parser_slop(query, query_info); + query_info.terms = inverted_index::InvertedIndexAnalyzer::get_analyse_result( + query, field_name, query_type, properties); + if (sequential_opt && query_info.ordered) { + std::vector<std::string> t_querys; + boost::split(t_querys, query, boost::algorithm::is_any_of(" ")); + for (auto& t_query : t_querys) { + auto terms = inverted_index::InvertedIndexAnalyzer::get_analyse_result( + t_query, field_name, query_type, properties); + query_info.additional_terms.emplace_back(std::move(terms)); + } + } +} + template class PhraseMatcherBase<ExactPhraseMatcher>; template class PhraseMatcherBase<OrderedSloppyPhraseMatcher>; diff --git a/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_query.h b/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_query.h index 253ba782b78181..35a479ff7f9781 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_query.h +++ b/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_query.h @@ -24,6 +24,8 @@ #include +#include "olap/rowset/segment_v2/inverted_index_query_type.h" + CL_NS_USE(index) CL_NS_USE(search) @@ -76,11 +78,11 @@ class OrderedSloppyPhraseMatcher : public PhraseMatcherBase<OrderedSloppyPhraseMatcher>; // ExactPhraseMatcher: x match_phrase 'aaa bbb' // PhraseQueryPtr: x match_phrase 'aaa bbb ~2', support slop // OrderedSloppyPhraseMatcher: x match_phrase 'aaa bbb ~2+', ensuring that the words appear in the specified order. -using Matcher = std::variant; +using PhraseQueryPtr = std::unique_ptr; +using Matcher = std::variant; class PhraseQuery : public Query { public: @@ -103,6 +105,10 @@ class PhraseQuery : public Query { public: static void parser_slop(std::string& query, InvertedIndexQueryInfo& query_info); + static void parser_info(std::string& query, const std::string& field_name, + InvertedIndexQueryType query_type, + const std::map<std::string, std::string>& properties, + InvertedIndexQueryInfo& query_info, bool sequential_opt); private: std::shared_ptr _searcher; @@ -117,7 +123,9 @@ class PhraseQuery : public Query { std::vector _term_docs; int32_t _slop = 0; - Matcher _matcher; + std::vector<std::vector<std::string>> _additional_terms; + PhraseQueryPtr _phrase_query = nullptr; + std::vector<Matcher> _matchers; }; } // namespace doris::segment_v2 \ No newline at end of file diff --git a/be/src/olap/rowset/segment_v2/inverted_index/query/query.h b/be/src/olap/rowset/segment_v2/inverted_index/query/query.h index cef7fd51f72b58..c295765ec63478 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index/query/query.h +++ b/be/src/olap/rowset/segment_v2/inverted_index/query/query.h @@ -38,8 +38,18 @@ namespace doris::segment_v2 { struct InvertedIndexQueryInfo { std::wstring field_name; std::vector<std::string> terms; + std::vector<std::vector<std::string>> additional_terms; int32_t slop = 0; bool ordered = false; + + std::string to_string() { + std::string s; + s += std::to_string(terms.size()) + ", "; + s += std::to_string(additional_terms.size()) + ", "; + s += std::to_string(slop) + ", "; + s += std::to_string(ordered); + return s; + } }; class Query { diff --git a/be/src/olap/rowset/segment_v2/inverted_index_compaction.cpp b/be/src/olap/rowset/segment_v2/inverted_index_compaction.cpp index e47189f9137ada..88a8f2417228bc 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index_compaction.cpp +++ b/be/src/olap/rowset/segment_v2/inverted_index_compaction.cpp @@ -24,7 +24,8 @@ #include "util/debug_points.h" namespace doris::segment_v2 { -Status compact_column(int64_t index_id, std::vector& src_index_dirs, +Status compact_column(int64_t index_id, + std::vector>& src_index_dirs, std::vector& dest_index_dirs, std::string_view tmp_path, const 
std::vector>>& trans_vec, @@ -40,28 +41,41 @@ Status compact_column(int64_t index_id, std::vector& "debug point: index compaction error"); } }) - lucene::store::Directory* dir = - DorisFSDirectoryFactory::getDirectory(io::global_local_filesystem(), tmp_path.data()); + bool can_use_ram_dir = true; + lucene::store::Directory* dir = DorisFSDirectoryFactory::getDirectory( + io::global_local_filesystem(), tmp_path.data(), can_use_ram_dir); + DBUG_EXECUTE_IF("compact_column_getDirectory_error", { + _CLTHROWA(CL_ERR_IO, "debug point: compact_column_getDirectory_error in index compaction"); + }) lucene::analysis::SimpleAnalyzer analyzer; auto* index_writer = _CLNEW lucene::index::IndexWriter(dir, &analyzer, true /* create */, true /* closeDirOnShutdown */); - + DBUG_EXECUTE_IF("compact_column_create_index_writer_error", { + _CLTHROWA(CL_ERR_IO, + "debug point: compact_column_create_index_writer_error in index compaction"); + }) DCHECK_EQ(src_index_dirs.size(), trans_vec.size()); - index_writer->indexCompaction(src_index_dirs, dest_index_dirs, trans_vec, + std::vector tmp_src_index_dirs(src_index_dirs.size()); + for (size_t i = 0; i < tmp_src_index_dirs.size(); ++i) { + tmp_src_index_dirs[i] = src_index_dirs[i].get(); + } + index_writer->indexCompaction(tmp_src_index_dirs, dest_index_dirs, trans_vec, dest_segment_num_rows); + DBUG_EXECUTE_IF("compact_column_indexCompaction_error", { + _CLTHROWA(CL_ERR_IO, + "debug point: compact_column_indexCompaction_error in index compaction"); + }) index_writer->close(); + DBUG_EXECUTE_IF("compact_column_index_writer_close_error", { + _CLTHROWA(CL_ERR_IO, + "debug point: compact_column_index_writer_close_error in index compaction"); + }) _CLDELETE(index_writer); // NOTE: need to ref_cnt-- for dir, // when index_writer is destroyed, if closeDir is set, dir will be close // _CLDECDELETE(dir) will try to ref_cnt--, when it decreases to 1, dir will be destroyed. _CLDECDELETE(dir) - for (auto* d : src_index_dirs) { - if (d != nullptr) { - d->close(); - _CLDELETE(d); - } - } for (auto* d : dest_index_dirs) { if (d != nullptr) { // NOTE: DO NOT close dest dir here, because it will be closed when dest index writer finalize. 
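The compact_column() change above moves ownership of the source index directories into smart pointers and builds a temporary raw-pointer vector only because indexCompaction() still takes raw pointers; the manual close()/_CLDELETE cleanup loop over src_index_dirs is dropped accordingly. A minimal sketch of this non-owning-view pattern (Directory here is a hypothetical stand-in for lucene::store::Directory, not the real class):

#include <memory>
#include <vector>

struct Directory {}; // hypothetical stand-in for lucene::store::Directory

// Build a non-owning raw-pointer view over an owning vector, mirroring the
// tmp_src_index_dirs loop above. The unique_ptrs keep ownership, so no manual
// close()/delete of the sources is needed once they go out of scope.
std::vector<Directory*> raw_view(const std::vector<std::unique_ptr<Directory>>& owned) {
    std::vector<Directory*> raw(owned.size());
    for (size_t i = 0; i < owned.size(); ++i) {
        raw[i] = owned[i].get();
    }
    return raw;
}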
@@ -70,8 +84,10 @@ Status compact_column(int64_t index_id, std::vector& } } - // delete temporary segment_path - std::ignore = io::global_local_filesystem()->delete_directory(tmp_path.data()); + // delete temporary segment_path, only when inverted_index_ram_dir_enable is false + if (!config::inverted_index_ram_dir_enable) { + std::ignore = io::global_local_filesystem()->delete_directory(tmp_path.data()); + } return Status::OK(); } } // namespace doris::segment_v2 diff --git a/be/src/olap/rowset/segment_v2/inverted_index_compaction.h b/be/src/olap/rowset/segment_v2/inverted_index_compaction.h index c95a4a7ffae1f8..1a6e4748e033d3 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index_compaction.h +++ b/be/src/olap/rowset/segment_v2/inverted_index_compaction.h @@ -23,6 +23,7 @@ #include #include "common/status.h" +#include "inverted_index_compound_reader.h" namespace doris { class TabletIndex; @@ -30,7 +31,8 @@ namespace segment_v2 { class InvertedIndexFileWriter; class InvertedIndexFileReader; -Status compact_column(int64_t index_id, std::vector& src_index_dirs, +Status compact_column(int64_t index_id, + std::vector>& src_index_dirs, std::vector& dest_index_dirs, std::string_view tmp_path, const std::vector>>& trans_vec, diff --git a/be/src/olap/rowset/segment_v2/inverted_index_file_writer.cpp b/be/src/olap/rowset/segment_v2/inverted_index_file_writer.cpp index d11b9fa54d0421..5599faa351dfd6 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index_file_writer.cpp +++ b/be/src/olap/rowset/segment_v2/inverted_index_file_writer.cpp @@ -17,6 +17,8 @@ #include "olap/rowset/segment_v2/inverted_index_file_writer.h" +#include + #include #include "common/status.h" @@ -44,25 +46,41 @@ Result InvertedIndexFileWriter::open(const TabletIndex* index index_meta->get_index_suffix()); bool exists = false; auto st = local_fs->exists(local_fs_index_path, &exists); + DBUG_EXECUTE_IF("InvertedIndexFileWriter::open_local_fs_exists_error", + { st = Status::Error("debug point: no such file error"); }) if (!st.ok()) { LOG(ERROR) << "index_path:" << local_fs_index_path << " exists error:" << st; return ResultError(st); } - + DBUG_EXECUTE_IF("InvertedIndexFileWriter::open_local_fs_exists_true", { exists = true; }) if (exists) { LOG(ERROR) << "try to init a directory:" << local_fs_index_path << " already exists"; - return ResultError(Status::InternalError("init_fulltext_index directory already exists")); + return ResultError( + Status::InternalError("InvertedIndexFileWriter::open directory already exists")); } bool can_use_ram_dir = true; auto* dir = DorisFSDirectoryFactory::getDirectory(local_fs, local_fs_index_path.c_str(), can_use_ram_dir); - _indices_dirs.emplace(std::make_pair(index_meta->index_id(), index_meta->get_index_suffix()), - std::unique_ptr(dir)); + auto key = std::make_pair(index_meta->index_id(), index_meta->get_index_suffix()); + auto [it, inserted] = _indices_dirs.emplace(key, std::unique_ptr(dir)); + if (!inserted) { + LOG(ERROR) << "InvertedIndexFileWriter::open attempted to insert a duplicate key: (" + << key.first << ", " << key.second << ")"; + LOG(ERROR) << "Directories already in map: "; + for (const auto& entry : _indices_dirs) { + LOG(ERROR) << "Key: (" << entry.first.first << ", " << entry.first.second << ")"; + } + return ResultError(Status::InternalError( + "InvertedIndexFileWriter::open attempted to insert a duplicate dir")); + } + return dir; } Status InvertedIndexFileWriter::delete_index(const TabletIndex* index_meta) { + 
DBUG_EXECUTE_IF("InvertedIndexFileWriter::delete_index_index_meta_nullptr", + { index_meta = nullptr; }); if (!index_meta) { return Status::Error("Index metadata is null."); } @@ -72,6 +90,8 @@ Status InvertedIndexFileWriter::delete_index(const TabletIndex* index_meta) { // Check if the specified index exists auto index_it = _indices_dirs.find(std::make_pair(index_id, index_suffix)); + DBUG_EXECUTE_IF("InvertedIndexFileWriter::delete_index_indices_dirs_reach_end", + { index_it = _indices_dirs.end(); }) if (index_it == _indices_dirs.end()) { std::ostringstream errMsg; errMsg << "No inverted index with id " << index_id << " and suffix " << index_suffix @@ -110,6 +130,8 @@ int64_t InvertedIndexFileWriter::headerLength() { } Status InvertedIndexFileWriter::close() { + DCHECK(!_closed) << debug_string(); + _closed = true; if (_indices_dirs.empty()) { return Status::OK(); } @@ -122,7 +144,7 @@ Status InvertedIndexFileWriter::close() { }) if (_storage_format == InvertedIndexStorageFormatPB::V1) { try { - _total_file_size = write_v1(); + RETURN_IF_ERROR(write_v1()); for (const auto& entry : _indices_dirs) { const auto& dir = entry.second; // delete index path, which contains separated inverted index files @@ -137,7 +159,7 @@ Status InvertedIndexFileWriter::close() { } } else { try { - _total_file_size = write_v2(); + RETURN_IF_ERROR(write_v2()); for (const auto& entry : _indices_dirs) { const auto& dir = entry.second; // delete index path, which contains separated inverted index files @@ -184,7 +206,12 @@ void InvertedIndexFileWriter::copyFile(const char* fileName, lucene::store::Dire int64_t bufferLength) { lucene::store::IndexInput* tmp = nullptr; CLuceneError err; - if (!dir->openInput(fileName, tmp, err)) { + auto open = dir->openInput(fileName, tmp, err); + DBUG_EXECUTE_IF("InvertedIndexFileWriter::copyFile_openInput_error", { + open = false; + err.set(CL_ERR_IO, "debug point: copyFile_openInput_error"); + }); + if (!open) { throw err; } @@ -200,6 +227,7 @@ void InvertedIndexFileWriter::copyFile(const char* fileName, lucene::store::Dire output->writeBytes(buffer, len); remainder -= len; } + DBUG_EXECUTE_IF("InvertedIndexFileWriter::copyFile_remainder_is_not_zero", { remainder = 10; }); if (remainder != 0) { std::ostringstream errMsg; errMsg << "Non-zero remainder length after copying: " << remainder << " (id: " << fileName @@ -210,6 +238,8 @@ void InvertedIndexFileWriter::copyFile(const char* fileName, lucene::store::Dire int64_t end_ptr = output->getFilePointer(); int64_t diff = end_ptr - start_ptr; + DBUG_EXECUTE_IF("InvertedIndexFileWriter::copyFile_diff_not_equals_length", + { diff = length - 10; }); if (diff != length) { std::ostringstream errMsg; errMsg << "Difference in the output file offsets " << diff @@ -220,7 +250,7 @@ void InvertedIndexFileWriter::copyFile(const char* fileName, lucene::store::Dire input->close(); } -int64_t InvertedIndexFileWriter::write_v1() { +Status InvertedIndexFileWriter::write_v1() { int64_t total_size = 0; for (const auto& entry : _indices_dirs) { const int64_t index_id = entry.first.first; @@ -253,6 +283,8 @@ int64_t InvertedIndexFileWriter::write_v1() { // write file entries to ram directory to get header length lucene::store::RAMDirectory ram_dir; auto* out_idx = ram_dir.createOutput(idx_name.c_str()); + DBUG_EXECUTE_IF("InvertedIndexFileWriter::write_v1_ram_output_is_nullptr", + { out_idx = nullptr; }) if (out_idx == nullptr) { LOG(WARNING) << "Write compound file error: RAMDirectory output is nullptr."; _CLTHROWA(CL_ERR_IO, "Create RAMDirectory output 
error"); @@ -286,6 +318,8 @@ int64_t InvertedIndexFileWriter::write_v1() { out_dir->set_file_writer_opts(_opts); auto* out = out_dir->createOutput(idx_name.c_str()); + DBUG_EXECUTE_IF("InvertedIndexFileWriter::write_v1_out_dir_createOutput_nullptr", + { out = nullptr; }); if (out == nullptr) { LOG(WARNING) << "Write compound file error: CompoundDirectory output is nullptr."; _CLTHROWA(CL_ERR_IO, "Create CompoundDirectory output error"); @@ -337,110 +371,125 @@ int64_t InvertedIndexFileWriter::write_v1() { auto* new_index_info = _file_info.add_index_info(); *new_index_info = index_info; } catch (CLuceneError& err) { - LOG(ERROR) << "CLuceneError occur when close idx file " - << InvertedIndexDescriptor::get_index_file_path_v1(_index_path_prefix, - index_id, index_suffix) + auto index_path = InvertedIndexDescriptor::get_index_file_path_v1( + _index_path_prefix, index_id, index_suffix); + LOG(ERROR) << "CLuceneError occur when write_v1 idx file " << index_path << " error msg: " << err.what(); - throw err; + return Status::Error( + "CLuceneError occur when write_v1 idx file: {}, error msg: {}", index_path, + err.what()); } } - return total_size; + _total_file_size = total_size; + return Status::OK(); } -int64_t InvertedIndexFileWriter::write_v2() { - // Create the output stream to write the compound file - int64_t current_offset = headerLength(); - +Status InvertedIndexFileWriter::write_v2() { io::Path index_path {InvertedIndexDescriptor::get_index_file_path_v2(_index_path_prefix)}; + std::unique_ptr compound_file_output; + try { + // Create the output stream to write the compound file + int64_t current_offset = headerLength(); - auto* out_dir = DorisFSDirectoryFactory::getDirectory(_fs, index_path.parent_path().c_str()); - out_dir->set_file_writer_opts(_opts); + io::Path index_path {InvertedIndexDescriptor::get_index_file_path_v2(_index_path_prefix)}; - std::unique_ptr compound_file_output; - // idx v2 writer != nullptr means memtable on sink node now - if (_idx_v2_writer != nullptr) { + auto* out_dir = + DorisFSDirectoryFactory::getDirectory(_fs, index_path.parent_path().c_str()); + out_dir->set_file_writer_opts(_opts); + + std::unique_ptr compound_file_output; + + DCHECK(_idx_v2_writer != nullptr) << "inverted index file writer v2 is nullptr"; compound_file_output = std::unique_ptr( out_dir->createOutputV2(_idx_v2_writer.get())); - } else { - compound_file_output = std::unique_ptr( - out_dir->createOutput(index_path.filename().c_str())); - } - // Write the version number - compound_file_output->writeInt(InvertedIndexStorageFormatPB::V2); + // Write the version number + compound_file_output->writeInt(InvertedIndexStorageFormatPB::V2); - // Write the number of indices - const auto numIndices = static_cast(_indices_dirs.size()); - compound_file_output->writeInt(numIndices); + // Write the number of indices + const auto numIndices = static_cast(_indices_dirs.size()); + compound_file_output->writeInt(numIndices); - std::vector> - file_metadata; // Store file name, offset, file length, and corresponding directory + std::vector> + file_metadata; // Store file name, offset, file length, and corresponding directory - // First, write all index information and file metadata - for (const auto& entry : _indices_dirs) { - const int64_t index_id = entry.first.first; - const auto& index_suffix = entry.first.second; - const auto& dir = entry.second; - std::vector files; - dir->list(&files); + // First, write all index information and file metadata + for (const auto& entry : _indices_dirs) { + const int64_t 
index_id = entry.first.first; + const auto& index_suffix = entry.first.second; + const auto& dir = entry.second; + std::vector files; + dir->list(&files); - auto it = std::find(files.begin(), files.end(), DorisFSDirectory::WRITE_LOCK_FILE); - if (it != files.end()) { - files.erase(it); - } - // sort file list by file length - std::vector> sorted_files; - for (const auto& file : files) { - sorted_files.emplace_back(file, dir->fileLength(file.c_str())); - } + auto it = std::find(files.begin(), files.end(), DorisFSDirectory::WRITE_LOCK_FILE); + if (it != files.end()) { + files.erase(it); + } + // sort file list by file length + std::vector> sorted_files; + for (const auto& file : files) { + sorted_files.emplace_back(file, dir->fileLength(file.c_str())); + } - std::sort(sorted_files.begin(), sorted_files.end(), - [](const std::pair& a, - const std::pair& b) { return (a.second < b.second); }); - - int32_t file_count = sorted_files.size(); - - // Write the index ID and the number of files - compound_file_output->writeLong(index_id); - compound_file_output->writeInt(static_cast(index_suffix.length())); - compound_file_output->writeBytes(reinterpret_cast(index_suffix.data()), - index_suffix.length()); - compound_file_output->writeInt(file_count); - - // Calculate the offset for each file and write the file metadata - for (const auto& file : sorted_files) { - int64_t file_length = dir->fileLength(file.first.c_str()); - compound_file_output->writeInt(static_cast(file.first.length())); - compound_file_output->writeBytes(reinterpret_cast(file.first.data()), - file.first.length()); - compound_file_output->writeLong(current_offset); - compound_file_output->writeLong(file_length); - - file_metadata.emplace_back(file.first, current_offset, file_length, dir.get()); - current_offset += file_length; // Update the data offset + std::sort( + sorted_files.begin(), sorted_files.end(), + [](const std::pair& a, + const std::pair& b) { return (a.second < b.second); }); + + int32_t file_count = sorted_files.size(); + + // Write the index ID and the number of files + compound_file_output->writeLong(index_id); + compound_file_output->writeInt(static_cast(index_suffix.length())); + compound_file_output->writeBytes(reinterpret_cast(index_suffix.data()), + index_suffix.length()); + compound_file_output->writeInt(file_count); + + // Calculate the offset for each file and write the file metadata + for (const auto& file : sorted_files) { + int64_t file_length = dir->fileLength(file.first.c_str()); + compound_file_output->writeInt(static_cast(file.first.length())); + compound_file_output->writeBytes( + reinterpret_cast(file.first.data()), file.first.length()); + compound_file_output->writeLong(current_offset); + compound_file_output->writeLong(file_length); + + file_metadata.emplace_back(file.first, current_offset, file_length, dir.get()); + current_offset += file_length; // Update the data offset + } } - } - const int64_t buffer_length = 16384; - uint8_t header_buffer[buffer_length]; + const int64_t buffer_length = 16384; + uint8_t header_buffer[buffer_length]; - // Next, write the file data - for (const auto& info : file_metadata) { - const std::string& file = std::get<0>(info); - auto* dir = std::get<3>(info); + // Next, write the file data + for (const auto& info : file_metadata) { + const std::string& file = std::get<0>(info); + auto* dir = std::get<3>(info); - // Write the actual file data - copyFile(file.c_str(), dir, compound_file_output.get(), header_buffer, buffer_length); - } + // Write the actual file data + 
copyFile(file.c_str(), dir, compound_file_output.get(), header_buffer, buffer_length); + } - out_dir->close(); - // NOTE: need to decrease ref count, but not to delete here, - // because index cache may get the same directory from DIRECTORIES - _CLDECDELETE(out_dir) - auto compound_file_size = compound_file_output->getFilePointer(); - compound_file_output->close(); - _file_info.set_index_size(compound_file_size); - return compound_file_size; + out_dir->close(); + // NOTE: need to decrease ref count, but not to delete here, + // because index cache may get the same directory from DIRECTORIES + _CLDECDELETE(out_dir) + _total_file_size = compound_file_output->getFilePointer(); + compound_file_output->close(); + _file_info.set_index_size(_total_file_size); + } catch (CLuceneError& err) { + LOG(ERROR) << "CLuceneError occur when close idx file " << index_path + << " error msg: " << err.what(); + if (compound_file_output) { + compound_file_output->close(); + compound_file_output.reset(); + } + return Status::Error( + "CLuceneError occur when close idx file: {}, error msg: {}", index_path.c_str(), + err.what()); + } + return Status::OK(); } -} // namespace doris::segment_v2 \ No newline at end of file +} // namespace doris::segment_v2 diff --git a/be/src/olap/rowset/segment_v2/inverted_index_file_writer.h b/be/src/olap/rowset/segment_v2/inverted_index_file_writer.h index 2aceb671d809a7..31e287d6dd3f71 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index_file_writer.h +++ b/be/src/olap/rowset/segment_v2/inverted_index_file_writer.h @@ -38,6 +38,9 @@ class DorisFSDirectory; using InvertedIndexDirectoryMap = std::map, std::unique_ptr>; +class InvertedIndexFileWriter; +using InvertedIndexFileWriterPtr = std::unique_ptr; + class FileInfo { public: std::string filename; @@ -61,12 +64,18 @@ class InvertedIndexFileWriter { Status delete_index(const TabletIndex* index_meta); Status initialize(InvertedIndexDirectoryMap& indices_dirs); ~InvertedIndexFileWriter() = default; - int64_t write_v2(); - int64_t write_v1(); + Status write_v2(); + Status write_v1(); Status close(); int64_t headerLength(); - InvertedIndexFileInfo get_index_file_info() const { return _file_info; } - int64_t get_index_file_total_size() const { return _total_file_size; } + const InvertedIndexFileInfo* get_index_file_info() const { + DCHECK(_closed) << debug_string(); + return &_file_info; + } + int64_t get_index_file_total_size() const { + DCHECK(_closed) << debug_string(); + return _total_file_size; + } const io::FileSystemSPtr& get_fs() const { return _fs; } void sort_files(std::vector& file_infos); void copyFile(const char* fileName, lucene::store::Directory* dir, @@ -75,6 +84,20 @@ class InvertedIndexFileWriter { void set_file_writer_opts(const io::FileWriterOptions& opts) { _opts = opts; } + std::string debug_string() const { + std::stringstream indices_dirs; + for (const auto& [index, dir] : _indices_dirs) { + indices_dirs << "index id is: " << index.first << " , index suffix is: " << index.second + << " , index dir is: " << dir->toString(); + } + return fmt::format( + "inverted index file writer debug string: index storage format is: {}, index path " + "prefix is: {}, rowset id is: {}, seg id is: {}, closed is: {}, total file size " + "is: {}, index dirs is: {}", + _storage_format, _index_path_prefix, _rowset_id, _seg_id, _closed, _total_file_size, + indices_dirs.str()); + } + private: InvertedIndexDirectoryMap _indices_dirs; const io::FileSystemSPtr _fs; @@ -82,14 +105,18 @@ class InvertedIndexFileWriter { std::string 
_rowset_id; int64_t _seg_id; InvertedIndexStorageFormatPB _storage_format; - // v1: all file size - // v2: file size - int64_t _total_file_size = 0; + // write to disk or stream - io::FileWriterPtr _idx_v2_writer; + io::FileWriterPtr _idx_v2_writer = nullptr; io::FileWriterOptions _opts; + // v1: all file size + // v2: file size + int64_t _total_file_size = 0; InvertedIndexFileInfo _file_info; + + // only once + bool _closed = false; }; } // namespace segment_v2 } // namespace doris diff --git a/be/src/olap/rowset/segment_v2/inverted_index_fs_directory.cpp b/be/src/olap/rowset/segment_v2/inverted_index_fs_directory.cpp index f752c5300204de..ded71c8a6cc73e 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index_fs_directory.cpp +++ b/be/src/olap/rowset/segment_v2/inverted_index_fs_directory.cpp @@ -183,7 +183,10 @@ DorisFSDirectory::FSIndexInput::SharedHandle::SharedHandle(const char* path) { DorisFSDirectory::FSIndexInput::SharedHandle::~SharedHandle() { if (_reader) { - if (_reader->close().ok()) { + auto st = _reader->close(); + DBUG_EXECUTE_IF("FSIndexInput::~SharedHandle_reader_close_error", + { st = Status::Error("failed to close"); }); + if (st.ok()) { _reader = nullptr; } } @@ -238,10 +241,17 @@ void DorisFSDirectory::FSIndexInput::readInternal(uint8_t* b, const int32_t len) Slice result {b, (size_t)len}; size_t bytes_read = 0; - if (!_handle->_reader->read_at(_pos, result, &bytes_read, &_io_ctx).ok()) { + auto st = _handle->_reader->read_at(_pos, result, &bytes_read, &_io_ctx); + DBUG_EXECUTE_IF("DorisFSDirectory::FSIndexInput::readInternal_reader_read_at_error", { + st = Status::InternalError( + "debug point: DorisFSDirectory::FSIndexInput::readInternal_reader_read_at_error"); + }) + if (!st.ok()) { _CLTHROWA(CL_ERR_IO, "read past EOF"); } bufferLength = len; + DBUG_EXECUTE_IF("DorisFSDirectory::FSIndexInput::readInternal_bytes_read_error", + { bytes_read = len + 10; }) if (bytes_read != len) { _CLTHROWA(CL_ERR_IO, "read error"); } @@ -313,6 +323,10 @@ void DorisFSDirectory::FSIndexOutput::flushBuffer(const uint8_t* b, const int32_ _CLTHROWA(CL_ERR_IO, "writer append data when flushBuffer error"); } } else { + DBUG_EXECUTE_IF("DorisFSDirectory::FSIndexOutput::flushBuffer_writer_is_nullptr", + { _writer = nullptr; }) + DBUG_EXECUTE_IF("DorisFSDirectory::FSIndexOutput::flushBuffer_b_is_nullptr", + { b = nullptr; }) if (_writer == nullptr) { LOG(WARNING) << "File writer is nullptr in DorisFSDirectory::FSIndexOutput, " "ignore flush."; @@ -327,8 +341,7 @@ void DorisFSDirectory::FSIndexOutput::close() { try { BufferedIndexOutput::close(); DBUG_EXECUTE_IF( - "DorisFSDirectory::FSIndexOutput._throw_clucene_error_in_bufferedindexoutput_" - "close", + "DorisFSDirectory::FSIndexOutput._throw_clucene_error_in_bufferedindexoutput_close", { _CLTHROWA(CL_ERR_IO, "debug point: test throw error in bufferedindexoutput close"); @@ -342,6 +355,10 @@ void DorisFSDirectory::FSIndexOutput::close() { _writer.reset(nullptr); _CLTHROWA(err.number(), err.what()); } + DBUG_EXECUTE_IF("DorisFSDirectory::FSIndexOutput.set_writer_nullptr", { + LOG(WARNING) << "Dbug execute, set _writer to nullptr"; + _writer = nullptr; + }) if (_writer) { auto ret = _writer->close(); DBUG_EXECUTE_IF("DorisFSDirectory::FSIndexOutput._set_writer_close_status_error", @@ -353,6 +370,7 @@ void DorisFSDirectory::FSIndexOutput::close() { } } else { LOG(WARNING) << "File writer is nullptr, ignore finalize and close."; + _CLTHROWA(CL_ERR_IO, "close file writer error, _writer = nullptr"); } _writer.reset(nullptr); } @@ -364,13 +382,9 
@@ int64_t DorisFSDirectory::FSIndexOutput::length() const { void DorisFSDirectory::FSIndexOutputV2::init(io::FileWriter* file_writer) { _index_v2_file_writer = file_writer; - DBUG_EXECUTE_IF( - "DorisFSDirectory::FSIndexOutput._throw_clucene_error_in_fsindexoutput_" - "init", - { - _CLTHROWA(CL_ERR_IO, - "debug point: test throw error in fsindexoutput init mock error"); - }) + DBUG_EXECUTE_IF("DorisFSDirectory::FSIndexOutput._throw_clucene_error_in_fsindexoutput_init", { + _CLTHROWA(CL_ERR_IO, "debug point: test throw error in fsindexoutput init mock error"); + }) } DorisFSDirectory::FSIndexOutputV2::~FSIndexOutputV2() {} @@ -393,6 +407,10 @@ void DorisFSDirectory::FSIndexOutputV2::flushBuffer(const uint8_t* b, const int3 _CLTHROWA(CL_ERR_IO, "writer append data when flushBuffer error"); } } else { + DBUG_EXECUTE_IF("DorisFSDirectory::FSIndexOutputV2::flushBuffer_file_writer_is_nullptr", + { _index_v2_file_writer = nullptr; }) + DBUG_EXECUTE_IF("DorisFSDirectory::FSIndexOutputV2::flushBuffer_b_is_nullptr", + { b = nullptr; }) if (_index_v2_file_writer == nullptr) { LOG(WARNING) << "File writer is nullptr in DorisFSDirectory::FSIndexOutputV2, " "ignore flush."; @@ -408,8 +426,7 @@ void DorisFSDirectory::FSIndexOutputV2::close() { try { BufferedIndexOutput::close(); DBUG_EXECUTE_IF( - "DorisFSDirectory::FSIndexOutput._throw_clucene_error_in_bufferedindexoutput_" - "close", + "DorisFSDirectory::FSIndexOutput._throw_clucene_error_in_bufferedindexoutput_close", { _CLTHROWA(CL_ERR_IO, "debug point: test throw error in bufferedindexoutput close"); @@ -422,6 +439,10 @@ void DorisFSDirectory::FSIndexOutputV2::close() { } _CLTHROWA(err.number(), err.what()); } + DBUG_EXECUTE_IF("DorisFSDirectory::FSIndexOutput.set_writer_nullptr", { + LOG(WARNING) << "Dbug execute, set _index_v2_file_writer to nullptr"; + _index_v2_file_writer = nullptr; + }) if (_index_v2_file_writer) { auto ret = _index_v2_file_writer->close(); DBUG_EXECUTE_IF("DorisFSDirectory::FSIndexOutput._set_writer_close_status_error", @@ -480,7 +501,16 @@ bool DorisFSDirectory::list(std::vector* names) const { priv_getFN(fl, ""); std::vector files; bool exists; - LOG_AND_THROW_IF_ERROR(_fs->list(fl, true, &files, &exists), "List file IO error"); + auto st = _fs->list(fl, true, &files, &exists); + DBUG_EXECUTE_IF("DorisFSDirectory::list_status_is_not_ok", { + st = Status::Error( + "debug point: DorisFSDirectory::list_status_is_not_ok"); + }) + LOG_AND_THROW_IF_ERROR(st, "List file IO error"); + DBUG_EXECUTE_IF("DorisFSDirectory::list_directory_not_exists", { exists = false; }) + if (!exists) { + LOG_AND_THROW_IF_ERROR(st, fmt::format("Directory {} is not exist", fl)); + } for (auto& file : files) { names->push_back(file.file_name); } @@ -492,7 +522,12 @@ bool DorisFSDirectory::fileExists(const char* name) const { char fl[CL_MAX_DIR]; priv_getFN(fl, name); bool exists = false; - LOG_AND_THROW_IF_ERROR(_fs->exists(fl, &exists), "File exists IO error"); + auto st = _fs->exists(fl, &exists); + DBUG_EXECUTE_IF("DorisFSDirectory::fileExists_status_is_not_ok", { + st = Status::Error( + "debug point: DorisFSDirectory::fileExists_status_is_not_ok"); + }) + LOG_AND_THROW_IF_ERROR(st, "File exists IO error"); return exists; } @@ -518,7 +553,12 @@ void DorisFSDirectory::touchFile(const char* name) { snprintf(buffer, CL_MAX_DIR, "%s%s%s", directory.c_str(), PATH_DELIMITERA, name); io::FileWriterPtr tmp_writer; - LOG_AND_THROW_IF_ERROR(_fs->create_file(buffer, &tmp_writer), "Touch file IO error"); + auto st = _fs->create_file(buffer, &tmp_writer); + 
DBUG_EXECUTE_IF("DorisFSDirectory::touchFile_status_is_not_ok", { + st = Status::Error( + "debug point: DorisFSDirectory::touchFile_status_is_not_ok"); + }) + LOG_AND_THROW_IF_ERROR(st, "Touch file IO error"); } int64_t DorisFSDirectory::fileLength(const char* name) const { @@ -532,6 +572,10 @@ int64_t DorisFSDirectory::fileLength(const char* name) const { if (st.code() == ErrorCode::NOT_FOUND) { _CLTHROWA(CL_ERR_FileNotFound, "File does not exist"); } + DBUG_EXECUTE_IF("DorisFSDirectory::fileLength_status_is_not_ok", { + st = Status::Error( + "debug point: DorisFSDirectory::fileLength_status_is_not_ok"); + }) LOG_AND_THROW_IF_ERROR(st, "Get file size IO error"); return size; } @@ -544,13 +588,21 @@ bool DorisFSDirectory::openInput(const char* name, lucene::store::IndexInput*& r return FSIndexInput::open(_fs, fl, ret, error, bufferSize); } -void DorisFSDirectory::close() {} +void DorisFSDirectory::close() { + DBUG_EXECUTE_IF("DorisFSDirectory::close_close_with_error", + { _CLTHROWA(CL_ERR_IO, "debug_point: close DorisFSDirectory error"); }) +} bool DorisFSDirectory::doDeleteFile(const char* name) { CND_PRECONDITION(directory[0] != 0, "directory is not open"); char fl[CL_MAX_DIR]; priv_getFN(fl, name); - LOG_AND_THROW_IF_ERROR(_fs->delete_file(fl), "Delete file IO error"); + auto st = _fs->delete_file(fl); + DBUG_EXECUTE_IF("DorisFSDirectory::doDeleteFile_status_is_not_ok", { + st = Status::Error( + "debug point: DorisFSDirectory::doDeleteFile_status_is_not_ok"); + }) + LOG_AND_THROW_IF_ERROR(st, "Delete file IO error"); return true; } @@ -558,8 +610,12 @@ bool DorisFSDirectory::deleteDirectory() { CND_PRECONDITION(directory[0] != 0, "directory is not open"); char fl[CL_MAX_DIR]; priv_getFN(fl, ""); - LOG_AND_THROW_IF_ERROR(_fs->delete_directory(fl), - fmt::format("Delete directory {} IO error", fl)); + auto st = _fs->delete_directory(fl); + DBUG_EXECUTE_IF("DorisFSDirectory::deleteDirectory_throw_is_not_directory", { + st = Status::Error( + fmt::format("debug point: {} is not a directory", fl)); + }) + LOG_AND_THROW_IF_ERROR(st, fmt::format("Delete directory {} IO error", fl)); return true; } @@ -573,11 +629,26 @@ void DorisFSDirectory::renameFile(const char* from, const char* to) { priv_getFN(nu, to); bool exists = false; - LOG_AND_THROW_IF_ERROR(_fs->exists(nu, &exists), "File exists IO error"); + auto st = _fs->exists(nu, &exists); + DBUG_EXECUTE_IF("DorisFSDirectory::renameFile_exists_status_is_not_ok", { + st = Status::Error( + "debug point: DorisFSDirectory::renameFile_exists_status_is_not_ok"); + }) + LOG_AND_THROW_IF_ERROR(st, "File exists IO error"); if (exists) { - LOG_AND_THROW_IF_ERROR(_fs->delete_directory(nu), fmt::format("Delete {} IO error", nu)); + st = _fs->delete_directory(nu); + DBUG_EXECUTE_IF("DorisFSDirectory::renameFile_delete_status_is_not_ok", { + st = Status::Error( + "debug point: DorisFSDirectory::renameFile_delete_status_is_not_ok"); + }) + LOG_AND_THROW_IF_ERROR(st, fmt::format("Delete {} IO error", nu)); } - LOG_AND_THROW_IF_ERROR(_fs->rename(old, nu), fmt::format("Rename {} to {} IO error", old, nu)); + st = _fs->rename(old, nu); + DBUG_EXECUTE_IF("DorisFSDirectory::renameFile_rename_status_is_not_ok", { + st = Status::Error( + "debug point: DorisFSDirectory::renameFile_rename_status_is_not_ok"); + }) + LOG_AND_THROW_IF_ERROR(st, fmt::format("Rename {} to {} IO error", old, nu)); } lucene::store::IndexOutput* DorisFSDirectory::createOutput(const char* name) { @@ -585,11 +656,31 @@ lucene::store::IndexOutput* DorisFSDirectory::createOutput(const char* name) 
{ char fl[CL_MAX_DIR]; priv_getFN(fl, name); bool exists = false; - LOG_AND_THROW_IF_ERROR(_fs->exists(fl, &exists), "Create output file exists IO error"); + auto st = _fs->exists(fl, &exists); + DBUG_EXECUTE_IF("DorisFSDirectory::createOutput_exists_status_is_not_ok", { + st = Status::Error( + "debug point: DorisFSDirectory::createOutput_exists_status_is_not_ok"); + }) + LOG_AND_THROW_IF_ERROR(st, "Create output file exists IO error"); if (exists) { - LOG_AND_THROW_IF_ERROR(_fs->delete_file(fl), - fmt::format("Create output delete file {} IO error", fl)); - LOG_AND_THROW_IF_ERROR(_fs->exists(fl, &exists), "Create output file exists IO error"); + st = _fs->delete_file(fl); + DBUG_EXECUTE_IF("DorisFSDirectory::createOutput_delete_status_is_not_ok", { + st = Status::Error( + "debug point: DorisFSDirectory::createOutput_delete_status_is_not_ok"); + }) + LOG_AND_THROW_IF_ERROR(st, fmt::format("Create output delete file {} IO error", fl)); + st = _fs->exists(fl, &exists); + DBUG_EXECUTE_IF("DorisFSDirectory::createOutput_exists_after_delete_status_is_not_ok", { + st = Status::Error( + "debug point: " + "DorisFSDirectory::createOutput_exists_after_delete_status_is_not_ok"); + }) + LOG_AND_THROW_IF_ERROR(st, "Create output file exists IO error"); + DBUG_EXECUTE_IF("DorisFSDirectory::createOutput_exists_after_delete_error", + { exists = true; }) + if (exists) { + _CLTHROWA(CL_ERR_IO, fmt::format("File {} should not exist", fl).c_str()); + } assert(!exists); } auto* ret = _CLNEW FSIndexOutput(); @@ -653,6 +744,10 @@ bool DorisRAMFSDirectory::fileExists(const char* name) const { int64_t DorisRAMFSDirectory::fileModified(const char* name) const { std::lock_guard wlock(_this_lock); auto* f = filesMap->get((char*)name); + DBUG_EXECUTE_IF("DorisRAMFSDirectory::fileModified_file_not_found", { f = nullptr; }) + if (f == nullptr) { + _CLTHROWA(CL_ERR_IO, fmt::format("NOT FOUND File {}.", name).c_str()); + } return f->getLastModified(); } @@ -661,6 +756,10 @@ void DorisRAMFSDirectory::touchFile(const char* name) { { std::lock_guard wlock(_this_lock); file = filesMap->get((char*)name); + DBUG_EXECUTE_IF("DorisRAMFSDirectory::touchFile_file_not_found", { file = nullptr; }) + if (file == nullptr) { + _CLTHROWA(CL_ERR_IO, fmt::format("NOT FOUND File {}.", name).c_str()); + } } const uint64_t ts1 = file->getLastModified(); uint64_t ts2 = lucene::util::Misc::currentTimeMillis(); @@ -677,6 +776,10 @@ void DorisRAMFSDirectory::touchFile(const char* name) { int64_t DorisRAMFSDirectory::fileLength(const char* name) const { std::lock_guard wlock(_this_lock); auto* f = filesMap->get((char*)name); + DBUG_EXECUTE_IF("DorisRAMFSDirectory::fileLength_file_not_found", { f = nullptr; }) + if (f == nullptr) { + _CLTHROWA(CL_ERR_IO, fmt::format("NOT FOUND File {}.", name).c_str()); + } return f->getLength(); } @@ -684,6 +787,7 @@ bool DorisRAMFSDirectory::openInput(const char* name, lucene::store::IndexInput* CLuceneError& error, int32_t bufferSize) { std::lock_guard wlock(_this_lock); auto* file = filesMap->get((char*)name); + DBUG_EXECUTE_IF("DorisRAMFSDirectory::openInput_file_not_found", { file = nullptr; }) if (file == nullptr) { error.set(CL_ERR_IO, "[DorisRAMCompoundDirectory::open] The requested file does not exist."); @@ -695,6 +799,8 @@ bool DorisRAMFSDirectory::openInput(const char* name, lucene::store::IndexInput* void DorisRAMFSDirectory::close() { DorisFSDirectory::close(); + DBUG_EXECUTE_IF("DorisRAMFSDirectory::close_close_with_error", + { _CLTHROWA(CL_ERR_IO, "debug_point: close DorisRAMFSDirectory error"); }) 
} bool DorisRAMFSDirectory::doDeleteFile(const char* name) { @@ -730,6 +836,7 @@ void DorisRAMFSDirectory::renameFile(const char* from, const char* to) { sizeInBytes -= itr1->second->sizeInBytes; filesMap->removeitr(itr1); } + DBUG_EXECUTE_IF("DorisRAMFSDirectory::renameFile_itr_filesMap_end", { itr = filesMap->end(); }) if (itr == filesMap->end()) { char tmp[1024]; snprintf(tmp, 1024, "cannot rename %s, file does not exist", from); @@ -752,6 +859,8 @@ lucene::store::IndexOutput* DorisRAMFSDirectory::createOutput(const char* name) // get the actual pointer to the output name char* n = nullptr; auto itr = filesMap->find(const_cast(name)); + DBUG_EXECUTE_IF("DorisRAMFSDirectory::createOutput_itr_filesMap_end", + { itr = filesMap->end(); }) if (itr != filesMap->end()) { n = itr->first; lucene::store::RAMFile* rf = itr->second; @@ -784,6 +893,7 @@ DorisFSDirectory* DorisFSDirectoryFactory::getDirectory(const io::FileSystemSPtr const char* _file, bool can_use_ram_dir, lucene::store::LockFactory* lock_factory) { DorisFSDirectory* dir = nullptr; + DBUG_EXECUTE_IF("DorisFSDirectoryFactory::getDirectory_file_is_nullptr", { _file = nullptr; }); if (!_file || !*_file) { _CLTHROWA(CL_ERR_IO, "Invalid directory"); } @@ -797,10 +907,22 @@ DorisFSDirectory* DorisFSDirectoryFactory::getDirectory(const io::FileSystemSPtr dir = _CLNEW DorisRAMFSDirectory(); } else { bool exists = false; - LOG_AND_THROW_IF_ERROR(_fs->exists(file, &exists), "Get directory exists IO error"); + auto st = _fs->exists(file, &exists); + DBUG_EXECUTE_IF("DorisFSDirectoryFactory::getDirectory_exists_status_is_not_ok", { + st = Status::Error( + "debug point: DorisFSDirectoryFactory::getDirectory_exists_status_is_not_ok"); + }) + LOG_AND_THROW_IF_ERROR(st, "Get directory exists IO error"); if (!exists) { - LOG_AND_THROW_IF_ERROR(_fs->create_directory(file), - "Get directory create directory IO error"); + st = _fs->create_directory(file); + DBUG_EXECUTE_IF( + "DorisFSDirectoryFactory::getDirectory_create_directory_status_is_not_ok", { + st = Status::Error( + "debug point: " + "DorisFSDirectoryFactory::getDirectory_create_directory_status_is_" + "not_ok"); + }) + LOG_AND_THROW_IF_ERROR(st, "Get directory create directory IO error"); } dir = _CLNEW DorisFSDirectory(); } diff --git a/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp b/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp index 7b8504322d2687..b7cfe7dfaffb31 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp +++ b/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp @@ -266,24 +266,13 @@ Status FullTextIndexReader::query(OlapReaderStatistics* stats, RuntimeState* run query_info.terms.emplace_back(search_str); } else { if (query_type == InvertedIndexQueryType::MATCH_PHRASE_QUERY) { - PhraseQuery::parser_slop(search_str, query_info); + PhraseQuery::parser_info( + search_str, column_name, query_type, _index_meta.properties(), query_info, + runtime_state->query_options().enable_phrase_query_sequential_opt); + } else { + query_info.terms = inverted_index::InvertedIndexAnalyzer::get_analyse_result( + search_str, column_name, query_type, _index_meta.properties()); } - - InvertedIndexCtxSPtr inverted_index_ctx = std::make_shared( - get_inverted_index_parser_type_from_string( - get_parser_string_from_properties(_index_meta.properties())), - get_parser_mode_string_from_properties(_index_meta.properties()), - get_parser_char_filter_map_from_properties(_index_meta.properties()), - get_parser_lowercase_from_properties(_index_meta.properties()), - 
get_parser_stopwords_from_properties(_index_meta.properties())); - auto analyzer = inverted_index::InvertedIndexAnalyzer::create_analyzer( - inverted_index_ctx.get()); - inverted_index_ctx->analyzer = analyzer.get(); - auto reader = inverted_index::InvertedIndexAnalyzer::create_reader( - inverted_index_ctx->char_filter_map); - reader->init(search_str.data(), search_str.size(), true); - query_info.terms = inverted_index::InvertedIndexAnalyzer::get_analyse_result( - reader.get(), analyzer.get(), column_name, query_type); } if (query_info.terms.empty()) { auto msg = fmt::format( diff --git a/be/src/olap/rowset/segment_v2/inverted_index_reader.h b/be/src/olap/rowset/segment_v2/inverted_index_reader.h index 9bd13309fa76e7..d3a0ff3cf118ba 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index_reader.h +++ b/be/src/olap/rowset/segment_v2/inverted_index_reader.h @@ -171,7 +171,8 @@ class InvertedIndexResultBitmap { bool is_empty() const { return (_data_bitmap == nullptr && _null_bitmap == nullptr); } }; -class InvertedIndexReader : public std::enable_shared_from_this<InvertedIndexReader> { +class InvertedIndexReader : public std::enable_shared_from_this<InvertedIndexReader>, + public MetadataAdder<InvertedIndexReader> { public: explicit InvertedIndexReader( const TabletIndex* index_meta, @@ -407,6 +408,10 @@ class InvertedIndexQueryParamFactory { M(PrimitiveType::TYPE_CHAR) M(PrimitiveType::TYPE_VARCHAR) M(PrimitiveType::TYPE_STRING) + M(PrimitiveType::TYPE_DATEV2) + M(PrimitiveType::TYPE_DATETIMEV2) + M(PrimitiveType::TYPE_IPV4) + M(PrimitiveType::TYPE_IPV6) #undef M default: return Status::NotSupported("Unsupported primitive type {} for inverted index reader", diff --git a/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp b/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp index 8729bd0c590276..29fe4609e59e9c 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp +++ b/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp @@ -75,6 +75,23 @@ const int32_t MAX_LEAF_COUNT = 1024; const float MAXMBSortInHeap = 512.0 * 8; const int DIMS = 1; +bool InvertedIndexColumnWriter::check_support_inverted_index(const TabletColumn& column) { + // below types are not supported in inverted index for extracted columns + static std::set<FieldType> invalid_types = { + FieldType::OLAP_FIELD_TYPE_DOUBLE, + FieldType::OLAP_FIELD_TYPE_JSONB, + FieldType::OLAP_FIELD_TYPE_ARRAY, + FieldType::OLAP_FIELD_TYPE_FLOAT, + }; + if (column.is_extracted_column() && (invalid_types.contains(column.type()))) { + return false; + } + if (column.is_variant_type()) { + return false; + } + return true; +} + template <FieldType field_type> class InvertedIndexColumnWriterImpl : public InvertedIndexColumnWriter { public: @@ -101,6 +118,12 @@ class InvertedIndexColumnWriterImpl : public InvertedIndexColumnWriter { Status init() override { try { + DBUG_EXECUTE_IF("InvertedIndexColumnWriter::init_field_type_not_supported", { + return Status::Error( + "Field type not supported"); + }) + DBUG_EXECUTE_IF("InvertedIndexColumnWriter::init_inverted_index_writer_init_error", + { _CLTHROWA(CL_ERR_IO, "debug point: init index error"); }) if constexpr (field_is_slice_type(field_type)) { return init_fulltext_index(); } else if constexpr (field_is_numeric_type(field_type)) { @@ -124,6 +147,8 @@ class InvertedIndexColumnWriterImpl : public InvertedIndexColumnWriter { void close_on_error() override { try { + DBUG_EXECUTE_IF("InvertedIndexColumnWriter::close_on_error_throw_exception", + { _CLTHROWA(CL_ERR_IO, "debug point: close on error"); }) if (_index_writer) { _index_writer->close(); } @@ -143,6 +168,9 @@ class 
InvertedIndexColumnWriterImpl : public InvertedIndexColumnWriter { _bkd_writer = std::make_shared( max_doc, DIMS, DIMS, value_length, MAX_LEAF_COUNT, MAXMBSortInHeap, total_point_count, true, config::max_depth_in_bkd_tree); + DBUG_EXECUTE_IF("InvertedIndexColumnWriter::init_bkd_index_throw_error", { + _CLTHROWA(CL_ERR_IllegalArgument, "debug point: create bkd_writer error"); + }) return open_index_directory(); } @@ -157,6 +185,10 @@ class InvertedIndexColumnWriterImpl : public InvertedIndexColumnWriter { } Status open_index_directory() { + DBUG_EXECUTE_IF("InvertedIndexColumnWriter::open_index_directory_error", { + return Status::Error( + "debug point: open_index_directory_error"); + }) _dir = DORIS_TRY(_index_file_writer->open(_index_meta)); return Status::OK(); } @@ -166,6 +198,12 @@ class InvertedIndexColumnWriterImpl : public InvertedIndexColumnWriter { bool close_dir_on_shutdown = true; auto index_writer = std::make_unique( _dir, _analyzer.get(), create_index, close_dir_on_shutdown); + DBUG_EXECUTE_IF("InvertedIndexColumnWriter::create_index_writer_setRAMBufferSizeMB_error", + { index_writer->setRAMBufferSizeMB(-100); }) + DBUG_EXECUTE_IF("InvertedIndexColumnWriter::create_index_writer_setMaxBufferedDocs_error", + { index_writer->setMaxBufferedDocs(1); }) + DBUG_EXECUTE_IF("InvertedIndexColumnWriter::create_index_writer_setMergeFactor_error", + { index_writer->setMergeFactor(1); }) index_writer->setRAMBufferSizeMB(config::inverted_index_ram_buffer_size); index_writer->setMaxBufferedDocs(config::inverted_index_max_buffered_docs); index_writer->setMaxFieldLength(MAX_FIELD_LEN); @@ -230,6 +268,8 @@ class InvertedIndexColumnWriterImpl : public InvertedIndexColumnWriter { try { _index_writer->addDocument(_doc.get()); + DBUG_EXECUTE_IF("InvertedIndexColumnWriterImpl::add_document_throw_error", + { _CLTHROWA(CL_ERR_IO, "debug point: add_document io error"); }) } catch (const CLuceneError& e) { close_on_error(); return Status::Error( @@ -241,6 +281,8 @@ class InvertedIndexColumnWriterImpl : public InvertedIndexColumnWriter { Status add_null_document() { try { _index_writer->addNullDocument(_doc.get()); + DBUG_EXECUTE_IF("InvertedIndexColumnWriterImpl::add_null_document_throw_error", + { _CLTHROWA(CL_ERR_IO, "debug point: add_null_document io error"); }) } catch (const CLuceneError& e) { close_on_error(); return Status::Error( @@ -253,6 +295,10 @@ class InvertedIndexColumnWriterImpl : public InvertedIndexColumnWriter { _null_bitmap.addRange(_rid, _rid + count); _rid += count; if constexpr (field_is_slice_type(field_type)) { + DBUG_EXECUTE_IF("InvertedIndexColumnWriterImpl::add_nulls_field_nullptr", + { _field = nullptr; }) + DBUG_EXECUTE_IF("InvertedIndexColumnWriterImpl::add_nulls_index_writer_nullptr", + { _index_writer = nullptr; }) if (_field == nullptr || _index_writer == nullptr) { LOG(ERROR) << "field or index writer is null in inverted index writer."; return Status::InternalError( @@ -271,17 +317,30 @@ class InvertedIndexColumnWriterImpl : public InvertedIndexColumnWriter { return Status::OK(); } - void new_inverted_index_field(const char* field_value_data, size_t field_value_size) { - if (_parser_type != InvertedIndexParserType::PARSER_UNKNOWN && - _parser_type != InvertedIndexParserType::PARSER_NONE) { - new_char_token_stream(field_value_data, field_value_size, _field); - } else { - new_field_char_value(field_value_data, field_value_size, _field); + Status new_inverted_index_field(const char* field_value_data, size_t field_value_size) { + try { + if (_parser_type != 
InvertedIndexParserType::PARSER_UNKNOWN && + _parser_type != InvertedIndexParserType::PARSER_NONE) { + new_char_token_stream(field_value_data, field_value_size, _field); + } else { + new_field_char_value(field_value_data, field_value_size, _field); + } + } catch (const CLuceneError& e) { + return Status::Error( + "CLuceneError create new index field error: {}", e.what()); } + return Status::OK(); } void new_char_token_stream(const char* s, size_t len, lucene::document::Field* field) { _char_string_reader->init(s, len, false); + DBUG_EXECUTE_IF( + "InvertedIndexColumnWriterImpl::new_char_token_stream__char_string_reader_init_" + "error", + { + _CLTHROWA(CL_ERR_UnsupportedOperation, + "UnsupportedOperationException: CLStream::init"); + }) auto* stream = _analyzer->reusableTokenStream(field->name(), _char_string_reader.get()); field->setValue(stream); } @@ -299,6 +358,10 @@ class InvertedIndexColumnWriterImpl : public InvertedIndexColumnWriter { Status add_values(const std::string fn, const void* values, size_t count) override { if constexpr (field_is_slice_type(field_type)) { + DBUG_EXECUTE_IF("InvertedIndexColumnWriterImpl::add_values_field_is_nullptr", + { _field = nullptr; }) + DBUG_EXECUTE_IF("InvertedIndexColumnWriterImpl::add_values_index_writer_is_nullptr", + { _index_writer = nullptr; }) if (_field == nullptr || _index_writer == nullptr) { LOG(ERROR) << "field or index writer is null in inverted index writer."; return Status::InternalError( @@ -312,7 +375,7 @@ class InvertedIndexColumnWriterImpl : public InvertedIndexColumnWriter { (_parser_type != InvertedIndexParserType::PARSER_NONE && v->empty())) { RETURN_IF_ERROR(add_null_document()); } else { - new_inverted_index_field(v->get_data(), v->get_size()); + RETURN_IF_ERROR(new_inverted_index_field(v->get_data(), v->get_size())); RETURN_IF_ERROR(add_document()); } ++v; @@ -326,12 +389,17 @@ class InvertedIndexColumnWriterImpl : public InvertedIndexColumnWriter { Status add_array_values(size_t field_size, const void* value_ptr, const uint8_t* null_map, const uint8_t* offsets_ptr, size_t count) override { + DBUG_EXECUTE_IF("InvertedIndexColumnWriterImpl::add_array_values_count_is_zero", + { count = 0; }) if (count == 0) { // no values to add inverted index return Status::OK(); } const auto* offsets = reinterpret_cast<const uint64_t*>(offsets_ptr); if constexpr (field_is_slice_type(field_type)) { + DBUG_EXECUTE_IF( + "InvertedIndexColumnWriterImpl::add_array_values_index_writer_is_nullptr", + { _index_writer = nullptr; }) if (_index_writer == nullptr) { LOG(ERROR) << "index writer is null in inverted index writer."; return Status::InternalError("index writer is null in inverted index writer"); @@ -357,7 +425,15 @@ class InvertedIndexColumnWriterImpl : public InvertedIndexColumnWriter { continue; } else { // now we temporarily create the field; later we can use a pool - if (Status st = create_field(&new_field); st != Status::OK()) { + Status st = create_field(&new_field); + DBUG_EXECUTE_IF( + "InvertedIndexColumnWriterImpl::add_array_values_create_field_" + "error", + { + st = Status::Error( + "debug point: add_array_values_create_field_error"); + }) + if (st != Status::OK()) { LOG(ERROR) << "create field " << string(_field_name.begin(), _field_name.end()) << " error:" << st; @@ -409,7 +485,14 @@ class InvertedIndexColumnWriterImpl : public InvertedIndexColumnWriter { // avoid adding a doc without any field, which may make threadState init skip // initializing fieldDataArray and then cause errors for the next doc with fields in // resetCurrentFieldData - if (Status st = create_field(&new_field); st != Status::OK()) { + Status st = create_field(&new_field); + DBUG_EXECUTE_IF( + "InvertedIndexColumnWriterImpl::add_array_values_create_field_error_2", + { + st = Status::Error( + "debug point: add_array_values_create_field_error_2"); + }) + if (st != Status::OK()) { LOG(ERROR) << "create field " << string(_field_name.begin(), _field_name.end()) << " error:" << st; @@ -443,6 +526,11 @@ class InvertedIndexColumnWriterImpl : public InvertedIndexColumnWriter { Status add_array_values(size_t field_size, const CollectionValue* values, size_t count) override { if constexpr (field_is_slice_type(field_type)) { + DBUG_EXECUTE_IF("InvertedIndexColumnWriterImpl::add_array_values_field_is_nullptr", + { _field = nullptr; }) + DBUG_EXECUTE_IF( + "InvertedIndexColumnWriterImpl::add_array_values_index_writer_is_nullptr", + { _index_writer = nullptr; }) if (_field == nullptr || _index_writer == nullptr) { LOG(ERROR) << "field or index writer is null in inverted index writer."; return Status::InternalError( @@ -461,7 +549,7 @@ class InvertedIndexColumnWriterImpl : public InvertedIndexColumnWriter { item_data_ptr = (uint8_t*)item_data_ptr + field_size; } auto value = join(strings, " "); - new_inverted_index_field(value.c_str(), value.length()); + RETURN_IF_ERROR(new_inverted_index_field(value.c_str(), value.length())); _rid++; RETURN_IF_ERROR(add_document()); values++; @@ -651,6 +739,8 @@ Status InvertedIndexColumnWriter::create(const Field* field, bool single_field = true; if (type == FieldType::OLAP_FIELD_TYPE_ARRAY) { const auto* array_typeinfo = dynamic_cast<const ArrayTypeInfo*>(typeinfo); + DBUG_EXECUTE_IF("InvertedIndexColumnWriter::create_array_typeinfo_is_nullptr", + { array_typeinfo = nullptr; }) if (array_typeinfo != nullptr) { typeinfo = array_typeinfo->item_type_info(); type = typeinfo->type(); @@ -661,6 +751,8 @@ } } + DBUG_EXECUTE_IF("InvertedIndexColumnWriter::create_unsupported_type_for_inverted_index", + { type = FieldType::OLAP_FIELD_TYPE_FLOAT; }) switch (type) { #define M(TYPE) \ case TYPE: \ diff --git a/be/src/olap/rowset/segment_v2/inverted_index_writer.h b/be/src/olap/rowset/segment_v2/inverted_index_writer.h index 63c1e219e649e8..da90752db09168 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index_writer.h +++ b/be/src/olap/rowset/segment_v2/inverted_index_writer.h @@ -33,7 +33,6 @@ #include "io/fs/local_file_system.h" #include "olap/olap_common.h" #include "olap/options.h" -#include "olap/tablet_schema.h" namespace doris { class CollectionValue; @@ -41,6 +40,7 @@ class CollectionValue; class Field; class TabletIndex; +class TabletColumn; namespace segment_v2 { class InvertedIndexFileWriter; @@ -74,22 +74,7 @@ class InvertedIndexColumnWriter { // check if the column is valid for inverted index, some
columns // are generated from variant, but not all of them are supported - static bool check_support_inverted_index(const TabletColumn& column) { - // below types are not supported in inverted index for extracted columns - static std::set<FieldType> invalid_types = { - FieldType::OLAP_FIELD_TYPE_DOUBLE, - FieldType::OLAP_FIELD_TYPE_JSONB, - FieldType::OLAP_FIELD_TYPE_ARRAY, - FieldType::OLAP_FIELD_TYPE_FLOAT, - }; - if (column.is_extracted_column() && (invalid_types.contains(column.type()))) { - return false; - } - if (column.is_variant_type()) { - return false; - } - return true; - } + static bool check_support_inverted_index(const TabletColumn& column); private: DISALLOW_COPY_AND_ASSIGN(InvertedIndexColumnWriter); diff --git a/be/src/olap/rowset/segment_v2/ordinal_page_index.cpp b/be/src/olap/rowset/segment_v2/ordinal_page_index.cpp index 24b2e3379963bc..9ee82bacdd73d2 100644 --- a/be/src/olap/rowset/segment_v2/ordinal_page_index.cpp +++ b/be/src/olap/rowset/segment_v2/ordinal_page_index.cpp @@ -34,8 +34,6 @@ namespace doris { -static bvar::Adder g_ordinal_index_memory_bytes("doris_ordinal_index_memory_bytes"); - namespace segment_v2 { void OrdinalIndexWriter::append_entry(ordinal_t ordinal, const PagePointer& data_pp) { @@ -116,10 +114,6 @@ Status OrdinalIndexReader::_load(bool use_page_cache, bool kept_in_memory, _ordinals.resize(_num_pages + 1); _pages.resize(_num_pages); - g_ordinal_index_memory_bytes << sizeof(*this) + _ordinals.size() * sizeof(ordinal_t) + - _pages.size() * sizeof(PagePointer) + - sizeof(OrdinalIndexReader); - for (int i = 0; i < _num_pages; i++) { Slice key = reader.get_key(i); ordinal_t ordinal = 0; @@ -132,9 +126,16 @@ } _ordinals[_num_pages] = _num_values; + update_metadata_size(); + return Status::OK(); } +int64_t OrdinalIndexReader::get_metadata_size() const { + return sizeof(OrdinalIndexReader) + _ordinals.capacity() * sizeof(ordinal_t) + + _pages.capacity() * sizeof(PagePointer); +} + OrdinalPageIndexIterator OrdinalIndexReader::seek_at_or_before(ordinal_t ordinal) { int32_t left = 0; int32_t right = _num_pages - 1; @@ -156,13 +157,7 @@ OrdinalPageIndexIterator OrdinalIndexReader::seek_at_or_before(ordinal_t ordinal return OrdinalPageIndexIterator(this, left); } -OrdinalIndexReader::~OrdinalIndexReader() { - if (_ordinals.size() > 0) { - g_ordinal_index_memory_bytes << -sizeof(*this) - _ordinals.size() * sizeof(ordinal_t) - - _pages.size() * sizeof(PagePointer) - - sizeof(OrdinalIndexReader); - } -} +OrdinalIndexReader::~OrdinalIndexReader() = default; } // namespace segment_v2 } // namespace doris diff --git a/be/src/olap/rowset/segment_v2/ordinal_page_index.h b/be/src/olap/rowset/segment_v2/ordinal_page_index.h index 8f9e0afe1bf49b..1d74cf989520aa 100644 --- a/be/src/olap/rowset/segment_v2/ordinal_page_index.h +++ b/be/src/olap/rowset/segment_v2/ordinal_page_index.h @@ -64,7 +64,7 @@ class OrdinalIndexWriter { class OrdinalPageIndexIterator; -class OrdinalIndexReader { +class OrdinalIndexReader : public MetadataAdder<OrdinalIndexReader> { public: explicit OrdinalIndexReader(io::FileReaderSPtr file_reader, ordinal_t num_values, const OrdinalIndexPB& meta_pb) @@ -96,6 +96,8 @@ class OrdinalIndexReader { Status _load(bool use_page_cache, bool kept_in_memory, std::unique_ptr<OrdinalIndexPB> index_meta); + int64_t get_metadata_size() const override; + private: friend OrdinalPageIndexIterator; diff --git a/be/src/olap/rowset/segment_v2/segment.cpp b/be/src/olap/rowset/segment_v2/segment.cpp index dca3ba54b9d4bc..0ad799683fc458
100644 --- a/be/src/olap/rowset/segment_v2/segment.cpp +++ b/be/src/olap/rowset/segment_v2/segment.cpp @@ -74,7 +74,7 @@ #include "vec/olap/vgeneric_iterators.h" namespace doris::segment_v2 { -static bvar::Adder g_total_segment_num("doris_total_segment_num"); + class InvertedIndexIterator; io::UInt128Wrapper file_cache_key_from_path(const std::string& seg_path) { @@ -86,10 +86,30 @@ std::string file_cache_key_str(const std::string& seg_path) { return file_cache_key_from_path(seg_path).to_string(); } -Status Segment::open(io::FileSystemSPtr fs, const std::string& path, uint32_t segment_id, - RowsetId rowset_id, TabletSchemaSPtr tablet_schema, +Status Segment::open(io::FileSystemSPtr fs, const std::string& path, int64_t tablet_id, + uint32_t segment_id, RowsetId rowset_id, TabletSchemaSPtr tablet_schema, const io::FileReaderOptions& reader_options, std::shared_ptr* output, InvertedIndexFileInfo idx_file_info) { + auto s = _open(fs, path, segment_id, rowset_id, tablet_schema, reader_options, output, + idx_file_info); + if (!s.ok()) { + if (!config::is_cloud_mode()) { + auto res = ExecEnv::get_tablet(tablet_id); + TabletSharedPtr tablet = + res.has_value() ? std::dynamic_pointer_cast(res.value()) : nullptr; + if (tablet) { + tablet->report_error(s); + } + } + } + + return s; +} + +Status Segment::_open(io::FileSystemSPtr fs, const std::string& path, uint32_t segment_id, + RowsetId rowset_id, TabletSchemaSPtr tablet_schema, + const io::FileReaderOptions& reader_options, std::shared_ptr* output, + InvertedIndexFileInfo idx_file_info) { io::FileReaderSPtr file_reader; RETURN_IF_ERROR(fs->open_file(path, &file_reader, &reader_options)); std::shared_ptr segment( @@ -141,18 +161,19 @@ Segment::Segment(uint32_t segment_id, RowsetId rowset_id, TabletSchemaSPtr table _meta_mem_usage(0), _rowset_id(rowset_id), _tablet_schema(std::move(tablet_schema)), - _idx_file_info(idx_file_info) { - g_total_segment_num << 1; -} + _idx_file_info(idx_file_info) {} -Segment::~Segment() { - g_total_segment_num << -1; -} +Segment::~Segment() = default; io::UInt128Wrapper Segment::file_cache_key(std::string_view rowset_id, uint32_t seg_id) { return io::BlockFileCache::hash(fmt::format("{}_{}.dat", rowset_id, seg_id)); } +int64_t Segment::get_metadata_size() const { + return sizeof(Segment) + (_footer_pb ? _footer_pb->ByteSizeLong() : 0) + + (_pk_index_meta ? 
_pk_index_meta->ByteSizeLong() : 0); } + Status Segment::_open() { _footer_pb = std::make_unique<SegmentFooterPB>(); RETURN_IF_ERROR(_parse_footer(_footer_pb.get())); @@ -169,6 +190,9 @@ Status Segment::_open() { if (_pk_index_meta != nullptr) { _meta_mem_usage += _pk_index_meta->ByteSizeLong(); } + + update_metadata_size(); + _meta_mem_usage += sizeof(*this); _meta_mem_usage += _tablet_schema->num_columns() * config::estimated_mem_per_column_reader; @@ -450,45 +474,25 @@ Status Segment::_load_pk_bloom_filter() { DCHECK(_tablet_schema->keys_type() == UNIQUE_KEYS); DCHECK(_pk_index_meta != nullptr); DCHECK(_pk_index_reader != nullptr); - auto status = [this]() { - return _load_pk_bf_once.call([this] { - RETURN_IF_ERROR(_pk_index_reader->parse_bf(_file_reader, *_pk_index_meta)); - // _meta_mem_usage += _pk_index_reader->get_bf_memory_size(); - return Status::OK(); - }); - }(); - if (!status.ok()) { - remove_from_segment_cache(); - } - return status; -} -void Segment::remove_from_segment_cache() const { - if (config::disable_segment_cache) { - return; - } - SegmentCache::CacheKey cache_key(_rowset_id, _segment_id); - SegmentLoader::instance()->erase_segment(cache_key); + return _load_pk_bf_once.call([this] { + RETURN_IF_ERROR(_pk_index_reader->parse_bf(_file_reader, *_pk_index_meta)); + // _meta_mem_usage += _pk_index_reader->get_bf_memory_size(); + return Status::OK(); + }); } -Status Segment::load_pk_index_and_bf() { +Status Segment::load_pk_index_and_bf(OlapReaderStatistics* index_load_stats) { + _pk_index_load_stats = index_load_stats; RETURN_IF_ERROR(load_index()); RETURN_IF_ERROR(_load_pk_bloom_filter()); return Status::OK(); } Status Segment::load_index() { - auto status = [this]() { return _load_index_impl(); }(); - if (!status.ok()) { - remove_from_segment_cache(); - } - return status; -} - -Status Segment::_load_index_impl() { return _load_index_once.call([this] { if (_tablet_schema->keys_type() == UNIQUE_KEYS && _pk_index_meta != nullptr) { - _pk_index_reader = std::make_unique<PrimaryKeyIndexReader>(); + _pk_index_reader = std::make_unique<PrimaryKeyIndexReader>(_pk_index_load_stats); RETURN_IF_ERROR(_pk_index_reader->parse_index(_file_reader, *_pk_index_meta)); // _meta_mem_usage += _pk_index_reader->get_memory_size(); return Status::OK(); @@ -519,6 +523,32 @@ Status Segment::_load_index_impl() { }); } +Status Segment::healthy_status() { + try { + if (_load_index_once.has_called()) { + RETURN_IF_ERROR(_load_index_once.stored_result()); + } + if (_load_pk_bf_once.has_called()) { + RETURN_IF_ERROR(_load_pk_bf_once.stored_result()); + } + if (_create_column_readers_once_call.has_called()) { + RETURN_IF_ERROR(_create_column_readers_once_call.stored_result()); + } + if (_inverted_index_file_reader_open.has_called()) { + RETURN_IF_ERROR(_inverted_index_file_reader_open.stored_result()); + } + // This status is set at runtime, for example, if something goes wrong while reading the segment iterator. + return _healthy_status.status(); + } catch (const doris::Exception& e) { + // If there is an exception during load_xxx, should not throw the exception directly because + // the caller may not be exception safe. + return e.to_status(); + } catch (const std::exception& e) { + // The exception is not thrown by doris code. + return Status::InternalError("Unexpected error during load segment: {}", e.what()); + } +} + // Return the storage datatype of related column to field.
// Return nullptr meaning no such storage information for this column vectorized::DataTypePtr Segment::get_data_type_of(const ColumnIdentifier& identifier, @@ -921,7 +951,8 @@ Status Segment::new_inverted_index_iterator(const TabletColumn& tablet_column, } Status Segment::lookup_row_key(const Slice& key, const TabletSchema* latest_schema, - bool with_seq_col, bool with_rowid, RowLocation* row_location) { + bool with_seq_col, bool with_rowid, RowLocation* row_location, + std::string* encoded_seq_value, OlapReaderStatistics* stats) { RETURN_IF_ERROR(load_pk_index_and_bf()); bool has_seq_col = latest_schema->has_sequence_col(); bool has_rowid = !latest_schema->cluster_key_idxes().empty(); @@ -941,7 +972,7 @@ } bool exact_match = false; std::unique_ptr<segment_v2::IndexedColumnIterator> index_iterator; - RETURN_IF_ERROR(_pk_index_reader->new_iterator(&index_iterator)); + RETURN_IF_ERROR(_pk_index_reader->new_iterator(&index_iterator, stats)); auto st = index_iterator->seek_at_or_after(&key_without_seq, &exact_match); if (!st.ok() && !st.is()) { return st; } @@ -970,6 +1001,7 @@ Status Segment::lookup_row_key(const Slice& key, const TabletSchema* latest_sche Slice sought_key_without_seq = Slice( sought_key.get_data(), sought_key.get_size() - (segment_has_seq_col ? seq_col_length : 0) - rowid_length); + if (has_seq_col) { // compare key if (key_without_seq.compare(sought_key_without_seq) != 0) { @@ -1007,6 +1039,16 @@ Status Segment::lookup_row_key(const Slice& key, const TabletSchema* latest_sche (uint8_t*)&row_location->row_id)); } + if (encoded_seq_value) { + if (!segment_has_seq_col) { + *encoded_seq_value = std::string {}; + } else { + // include marker + *encoded_seq_value = + Slice(sought_key.get_data() + sought_key_without_seq.get_size(), seq_col_length) + .to_string(); + } + } return Status::OK(); } diff --git a/be/src/olap/rowset/segment_v2/segment.h b/be/src/olap/rowset/segment_v2/segment.h index a4f01873f4c74f..bc5ab1e1fdc80a 100644 --- a/be/src/olap/rowset/segment_v2/segment.h +++ b/be/src/olap/rowset/segment_v2/segment.h @@ -78,10 +78,10 @@ using SegmentSharedPtr = std::shared_ptr<Segment>; // NOTE: This segment is tied to a specified TabletSchema; when the TabletSchema // is changed, this segment can not be used any more. For example, after a schema // change finished, client should disable all cached Segment for old TabletSchema.
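The healthy_status() API introduced above aggregates the stored results of several call-once loaders instead of evicting the segment from SegmentCache on every failure. Below is a minimal, self-contained sketch of that "run once, remember the first outcome" pattern; Status, CallOnce and the demo values are simplified stand-ins for Doris's Status and DorisCallOnce, not the real classes (the real DorisCallOnce is also fully thread-safe and templated on the return type):

#include <atomic>
#include <functional>
#include <iostream>
#include <mutex>
#include <string>

struct Status {
    bool ok_flag = true;
    std::string msg;
    static Status OK() { return {}; }
    static Status Error(std::string m) { return {false, std::move(m)}; }
    bool ok() const { return ok_flag; }
};

class CallOnce {
public:
    // Runs fn at most once; every later call returns the stored result.
    Status call(const std::function<Status()>& fn) {
        std::call_once(_flag, [&] {
            _result = fn();
            _called.store(true);
        });
        return _result;
    }
    bool has_called() const { return _called.load(); }
    Status stored_result() const { return _result; }

private:
    std::once_flag _flag;
    std::atomic<bool> _called {false};
    Status _result;
};

int main() {
    CallOnce load_index_once;
    // First load fails; the failure is remembered.
    Status st = load_index_once.call([] { return Status::Error("parse failed"); });
    // A health check can report the stored failure without re-running the loader.
    if (load_index_once.has_called() && !load_index_once.stored_result().ok()) {
        std::cout << "segment unhealthy: " << load_index_once.stored_result().msg << "\n";
    }
    // Retrying does not re-run the loader; the first result sticks.
    st = load_index_once.call([] { return Status::OK(); });
    std::cout << "second call ok? " << st.ok() << "\n"; // prints 0
    return 0;
}

This is why healthy_status() can simply probe has_called()/stored_result() for each loader: a once-failed load stays visible to every caller until the segment itself is discarded.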
-class Segment : public std::enable_shared_from_this<Segment> { +class Segment : public std::enable_shared_from_this<Segment>, public MetadataAdder<Segment> { public: - static Status open(io::FileSystemSPtr fs, const std::string& path, uint32_t segment_id, - RowsetId rowset_id, TabletSchemaSPtr tablet_schema, + static Status open(io::FileSystemSPtr fs, const std::string& path, int64_t tablet_id, + uint32_t segment_id, RowsetId rowset_id, TabletSchemaSPtr tablet_schema, const io::FileReaderOptions& reader_options, std::shared_ptr<Segment>* output, InvertedIndexFileInfo idx_file_info = {}); @@ -92,6 +92,8 @@ class Segment : public std::enable_shared_from_this<Segment> { ~Segment(); + int64_t get_metadata_size() const override; + Status new_iterator(SchemaSPtr schema, const StorageReadOptions& read_options, std::unique_ptr<RowwiseIterator>* iter); @@ -130,7 +132,9 @@ class Segment : public std::enable_shared_from_this<Segment> { } Status lookup_row_key(const Slice& key, const TabletSchema* latest_schema, bool with_seq_col, - bool with_rowid, RowLocation* row_location); + bool with_rowid, RowLocation* row_location, + std::string* encoded_seq_value = nullptr, + OlapReaderStatistics* stats = nullptr); Status read_key_by_rowid(uint32_t row_id, std::string* key); @@ -140,7 +144,13 @@ class Segment : public std::enable_shared_from_this<Segment> { Status load_index(); - Status load_pk_index_and_bf(); + Status load_pk_index_and_bf(OlapReaderStatistics* index_load_stats = nullptr); + + void update_healthy_status(Status new_status) { _healthy_status.update(new_status); } + // The segment is loaded into SegmentCache and then loads its indices. If something goes wrong + // while loading the indices, the segment should be removed from SegmentCache; otherwise it will keep reporting errors on + // every query. So we add a healthy status API, and the caller should check the healthy status before using the segment. + Status healthy_status(); std::string min_key() { DCHECK(_tablet_schema->keys_type() == UNIQUE_KEYS && _pk_index_meta != nullptr); @@ -155,8 +165,6 @@ class Segment : public std::enable_shared_from_this<Segment> { int64_t meta_mem_usage() const { return _meta_mem_usage; } - void remove_from_segment_cache() const; - // Identify the column by unique id or path info struct ColumnIdentifier { int32_t unique_id = -1; @@ -207,6 +215,10 @@ class Segment : public std::enable_shared_from_this<Segment> { DISALLOW_COPY_AND_ASSIGN(Segment); Segment(uint32_t segment_id, RowsetId rowset_id, TabletSchemaSPtr tablet_schema, InvertedIndexFileInfo idx_file_info = InvertedIndexFileInfo()); + static Status _open(io::FileSystemSPtr fs, const std::string& path, uint32_t segment_id, + RowsetId rowset_id, TabletSchemaSPtr tablet_schema, + const io::FileReaderOptions& reader_options, + std::shared_ptr<Segment>* output, InvertedIndexFileInfo idx_file_info); // open segment file and read the minimum amount of necessary information (footer) Status _open(); Status _parse_footer(SegmentFooterPB* footer); @@ -222,7 +234,6 @@ class Segment : public std::enable_shared_from_this<Segment> { Status _write_error_file(size_t file_size, size_t offset, size_t bytes_read, char* data, io::IOContext& io_ctx); - Status _load_index_impl(); Status _open_inverted_index(); Status _create_column_readers_once(); @@ -233,6 +244,7 @@ class Segment : public std::enable_shared_from_this<Segment> { io::FileReaderSPtr _file_reader; uint32_t _segment_id; uint32_t _num_rows; + AtomicStatus _healthy_status; // 1. Tracking memory use by segment meta data such as footer or index page. // 2.
Tracking memory use by segment column reader @@ -290,6 +302,7 @@ class Segment : public std::enable_shared_from_this { InvertedIndexFileInfo _idx_file_info; int _be_exec_version = BeExecVersionManager::get_newest_version(); + OlapReaderStatistics* _pk_index_load_stats = nullptr; }; } // namespace segment_v2 diff --git a/be/src/olap/rowset/segment_v2/segment_iterator.cpp b/be/src/olap/rowset/segment_v2/segment_iterator.cpp index 2a7da619c7a3ab..5b1bfaf076279f 100644 --- a/be/src/olap/rowset/segment_v2/segment_iterator.cpp +++ b/be/src/olap/rowset/segment_v2/segment_iterator.cpp @@ -270,8 +270,8 @@ SegmentIterator::SegmentIterator(std::shared_ptr segment, SchemaSPtr sc Status SegmentIterator::init(const StorageReadOptions& opts) { auto status = _init_impl(opts); - if (!status.ok() && !config::disable_segment_cache) { - _segment->remove_from_segment_cache(); + if (!status.ok()) { + _segment->update_healthy_status(status); } return status; } @@ -497,7 +497,7 @@ Status SegmentIterator::_prepare_seek(const StorageReadOptions::KeyRange& key_ra } Status SegmentIterator::_get_row_ranges_by_column_conditions() { - SCOPED_RAW_TIMER(&_opts.stats->block_conditions_filtered_ns); + SCOPED_RAW_TIMER(&_opts.stats->generate_row_ranges_ns); if (_row_bitmap.isEmpty()) { return Status::OK(); } @@ -505,7 +505,8 @@ Status SegmentIterator::_get_row_ranges_by_column_conditions() { RETURN_IF_ERROR(_apply_bitmap_index()); { if (_opts.runtime_state && - _opts.runtime_state->query_options().enable_inverted_index_query) { + _opts.runtime_state->query_options().enable_inverted_index_query && + has_inverted_index_in_iterators()) { SCOPED_RAW_TIMER(&_opts.stats->inverted_index_filter_timer); size_t input_rows = _row_bitmap.cardinality(); RETURN_IF_ERROR(_apply_inverted_index()); @@ -564,7 +565,7 @@ Status SegmentIterator::_get_row_ranges_from_conditions(RowRanges* condition_row size_t pre_size = 0; { - SCOPED_RAW_TIMER(&_opts.stats->block_conditions_filtered_bf_ns); + SCOPED_RAW_TIMER(&_opts.stats->generate_row_ranges_by_bf_ns); // first filter data by bloom filter index // bloom filter index only use CondColumn RowRanges bf_row_ranges = RowRanges::create_single(num_rows()); @@ -584,10 +585,17 @@ Status SegmentIterator::_get_row_ranges_from_conditions(RowRanges* condition_row pre_size = condition_row_ranges->count(); RowRanges::ranges_intersection(*condition_row_ranges, bf_row_ranges, condition_row_ranges); _opts.stats->rows_bf_filtered += (pre_size - condition_row_ranges->count()); + + DBUG_EXECUTE_IF("bloom_filter_must_filter_data", { + if (pre_size - condition_row_ranges->count() == 0) { + return Status::Error( + "Bloom filter did not filter the data."); + } + }) } { - SCOPED_RAW_TIMER(&_opts.stats->block_conditions_filtered_zonemap_ns); + SCOPED_RAW_TIMER(&_opts.stats->generate_row_ranges_by_zonemap_ns); RowRanges zone_map_row_ranges = RowRanges::create_single(num_rows()); // second filter data by zone map for (const auto& cid : cids) { @@ -651,7 +659,7 @@ Status SegmentIterator::_get_row_ranges_from_conditions(RowRanges* condition_row } { - SCOPED_RAW_TIMER(&_opts.stats->block_conditions_filtered_dict_ns); + SCOPED_RAW_TIMER(&_opts.stats->generate_row_ranges_by_dict_ns); /// Low cardinality optimization is currently not very stable, so to prevent data corruption, /// we are temporarily disabling its use in data compaction. 
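The generate_row_ranges_* timers above wrap a pruning pipeline in which each index (bloom filter, zone map, dict) produces its own RowRanges and the results are combined with RowRanges::ranges_intersection. The following is a hypothetical, self-contained sketch of that intersection over sorted disjoint [from, to) ranges; the Ranges alias and intersect() are illustrative stand-ins, not the Doris API:

#include <algorithm>
#include <cstdint>
#include <iostream>
#include <utility>
#include <vector>

// Toy stand-in for RowRanges: sorted, disjoint [from, to) pairs.
using Ranges = std::vector<std::pair<uint32_t, uint32_t>>;

// Intersect two range lists with a two-pointer sweep: a row survives
// only if every index (bloom filter, zone map, dict) kept it.
Ranges intersect(const Ranges& a, const Ranges& b) {
    Ranges out;
    size_t i = 0, j = 0;
    while (i < a.size() && j < b.size()) {
        uint32_t lo = std::max(a[i].first, b[j].first);
        uint32_t hi = std::min(a[i].second, b[j].second);
        if (lo < hi) out.emplace_back(lo, hi);
        // Advance whichever range ends first.
        (a[i].second < b[j].second) ? ++i : ++j;
    }
    return out;
}

int main() {
    Ranges bf = {{0, 100}, {200, 300}}; // rows kept by the bloom filter
    Ranges zonemap = {{50, 250}};       // rows kept by the zone map
    for (auto [lo, hi] : intersect(bf, zonemap)) {
        std::cout << "[" << lo << ", " << hi << ")\n"; // [50,100) and [200,250)
    }
    return 0;
}

Counting rows before and after each intersection is exactly how the rows_bf_filtered / zone-map statistics in the hunks above are derived.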
if (_opts.io_ctx.reader_type == ReaderType::READER_QUERY) { @@ -1056,16 +1064,17 @@ Status SegmentIterator::_init_inverted_index_iterators() { return Status::OK(); } for (auto cid : _schema->column_ids()) { + // Use the segment's own index_meta; for compatibility, future index lookups default to lowercase. if (_inverted_index_iterators[cid] == nullptr) { - // Not check type valid, since we need to get inverted index for related variant type when reading the segment. - // If check type valid, we can not get inverted index for variant type, and result nullptr.The result for calling - // get_inverted_index with variant suffix should return corresponding inverted index meta. - bool check_inverted_index_by_type = false; - // Use segment’s own index_meta, for compatibility with future indexing needs to default to lowercase. + // In the _opts.tablet_schema, the sub-column type information for the variant is FieldType::OLAP_FIELD_TYPE_VARIANT. + // This is because the sub-column is created in create_materialized_variant_column. + // We use this column to locate the metadata for the inverted index, which requires a unique_id and path. + const auto& column = _opts.tablet_schema->column(cid); + int32_t col_unique_id = + column.is_extracted_column() ? column.parent_unique_id() : column.unique_id(); RETURN_IF_ERROR(_segment->new_inverted_index_iterator( - _opts.tablet_schema->column(cid), - _segment->_tablet_schema->get_inverted_index(_opts.tablet_schema->column(cid), - check_inverted_index_by_type), + column, + _segment->_tablet_schema->inverted_index(col_unique_id, column.suffix_path()), _opts, &_inverted_index_iterators[cid])); } } @@ -1331,7 +1340,7 @@ Status SegmentIterator::_vec_init_lazy_materialization() { short_cir_pred_col_id_set.insert(cid); _short_cir_eval_predicate.push_back(predicate); } - if (predicate->is_filter()) { + if (predicate->is_runtime_filter()) { _filter_info_id.push_back(predicate); } } @@ -1401,7 +1410,7 @@ Status SegmentIterator::_vec_init_lazy_materialization() { if (!_is_common_expr_column[cid]) { _non_predicate_columns.push_back(cid); } else { - _second_read_column_ids.push_back(cid); + _non_predicate_column_ids.push_back(cid); } } } @@ -1411,13 +1420,13 @@ if (_lazy_materialization_read) { // insert pred cid to first_read_columns for (auto cid : pred_column_ids) { - _first_read_column_ids.push_back(cid); + _predicate_column_ids.push_back(cid); } } else if (!_is_need_vec_eval && !_is_need_short_eval && !_is_need_expr_eval) { // no pred exists, just read and output column for (int i = 0; i < _schema->num_column_ids(); i++) { auto cid = _schema->column_id(i); - _first_read_column_ids.push_back(cid); + _predicate_column_ids.push_back(cid); } } else { if (_is_need_vec_eval || _is_need_short_eval) { @@ -1429,18 +1438,18 @@ _short_cir_pred_column_ids.end()); pred_id_set.insert(_vec_pred_column_ids.begin(), _vec_pred_column_ids.end()); - DCHECK(_second_read_column_ids.empty()); - // _second_read_column_ids must be empty. Otherwise _lazy_materialization_read must not false. + DCHECK(_non_predicate_column_ids.empty()); + // _non_predicate_column_ids must be empty. Otherwise _lazy_materialization_read must not be false.
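The rename from _first_read_column_ids/_second_read_column_ids to _predicate_column_ids/_non_predicate_column_ids reflects the two-phase read under lazy materialization: predicate columns are read and filtered first, and the remaining columns are materialized only for surviving rows. A toy sketch of that split, where the column ids and the predicate set are made up for illustration:

#include <iostream>
#include <set>
#include <vector>

int main() {
    // Hypothetical schema: column ids 0..4, with predicates on columns 1 and 3.
    std::vector<int> schema_cids = {0, 1, 2, 3, 4};
    std::set<int> pred_id_set = {1, 3};

    // Phase 1 reads predicate columns and evaluates the filters;
    // phase 2 reads the remaining columns only for the rows that survived.
    std::vector<int> predicate_column_ids;
    std::vector<int> non_predicate_columns;
    for (int cid : schema_cids) {
        if (pred_id_set.count(cid) != 0) {
            predicate_column_ids.push_back(cid);
        } else {
            non_predicate_columns.push_back(cid);
        }
    }

    std::cout << "phase 1 (predicate): ";
    for (int cid : predicate_column_ids) std::cout << cid << ' ';
    std::cout << "\nphase 2 (lazy):      ";
    for (int cid : non_predicate_columns) std::cout << cid << ' ';
    std::cout << '\n';
    return 0;
}

The new names make the intent explicit: the "first read" set is exactly the set of columns some predicate touches, which is why the old ordinal naming kept confusing readers.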
for (int i = 0; i < _schema->num_column_ids(); i++) { auto cid = _schema->column_id(i); if (pred_id_set.find(cid) != pred_id_set.end()) { - _first_read_column_ids.push_back(cid); + _predicate_column_ids.push_back(cid); } // In the past, if schema columns > pred columns, the _lazy_materialization_read maybe == false, but // we make sure using _lazy_materialization_read= true now, so these logic may never happens. I comment // these lines and we could delete them in the future to make the code more clear. // else if (non_pred_set.find(cid) != non_pred_set.end()) { - // _first_read_column_ids.push_back(cid); + // _predicate_column_ids.push_back(cid); // // when _lazy_materialization_read = false, non-predicate column should also be filtered by sel idx, so we regard it as pred columns // _is_pred_column[cid] = true; // } @@ -1448,7 +1457,7 @@ Status SegmentIterator::_vec_init_lazy_materialization() { } else if (_is_need_expr_eval) { DCHECK(!_is_need_vec_eval && !_is_need_short_eval); for (auto cid : _common_expr_columns) { - _first_read_column_ids.push_back(cid); + _predicate_column_ids.push_back(cid); } } } @@ -1634,7 +1643,7 @@ void SegmentIterator::_output_non_pred_columns(vectorized::Block* block) { * 1. Reads a batch of rowids (up to the specified limit), and checks if they are continuous. * Continuous here means that the rowids form an unbroken sequence (e.g., 1, 2, 3, 4...). * - * 2. For each column that needs to be read (identified by _first_read_column_ids): + * 2. For each column that needs to be read (identified by _predicate_column_ids): * - If the rowids are continuous, the function uses seek_to_ordinal and next_batch * for efficient reading. * - If the rowids are not continuous, the function processes them in smaller batches @@ -1647,13 +1656,13 @@ void SegmentIterator::_output_non_pred_columns(vectorized::Block* block) { */ Status SegmentIterator::_read_columns_by_index(uint32_t nrows_read_limit, uint32_t& nrows_read, bool set_block_rowid) { - SCOPED_RAW_TIMER(&_opts.stats->first_read_ns); + SCOPED_RAW_TIMER(&_opts.stats->predicate_column_read_ns); nrows_read = _range_iter->read_batch_rowids(_block_rowids.data(), nrows_read_limit); bool is_continuous = (nrows_read > 1) && (_block_rowids[nrows_read - 1] - _block_rowids[0] == nrows_read - 1); - for (auto cid : _first_read_column_ids) { + for (auto cid : _predicate_column_ids) { auto& column = _current_return_columns[cid]; if (_no_need_read_key_data(cid, column, nrows_read)) { continue; @@ -1678,9 +1687,9 @@ Status SegmentIterator::_read_columns_by_index(uint32_t nrows_read_limit, uint32 if (is_continuous) { size_t rows_read = nrows_read; - _opts.stats->block_first_read_seek_num += 1; + _opts.stats->predicate_column_read_seek_num += 1; if (_opts.runtime_state && _opts.runtime_state->enable_profile()) { - SCOPED_RAW_TIMER(&_opts.stats->block_first_read_seek_ns); + SCOPED_RAW_TIMER(&_opts.stats->predicate_column_read_seek_ns); RETURN_IF_ERROR(_column_iterators[cid]->seek_to_ordinal(_block_rowids[0])); } else { RETURN_IF_ERROR(_column_iterators[cid]->seek_to_ordinal(_block_rowids[0])); @@ -1702,9 +1711,9 @@ Status SegmentIterator::_read_columns_by_index(uint32_t nrows_read_limit, uint32 if (batch_continuous) { size_t rows_read = current_batch_size; - _opts.stats->block_first_read_seek_num += 1; + _opts.stats->predicate_column_read_seek_num += 1; if (_opts.runtime_state && _opts.runtime_state->enable_profile()) { - SCOPED_RAW_TIMER(&_opts.stats->block_first_read_seek_ns); + SCOPED_RAW_TIMER(&_opts.stats->predicate_column_read_seek_ns); 
RETURN_IF_ERROR( _column_iterators[cid]->seek_to_ordinal(_block_rowids[processed])); } else { @@ -1760,31 +1769,33 @@ uint16_t SegmentIterator::_evaluate_vectorization_predicate(uint16_t* sel_rowid_ SCOPED_RAW_TIMER(&_opts.stats->vec_cond_ns); bool all_pred_always_true = true; for (const auto& pred : _pre_eval_block_predicate) { - if (!pred->always_true(false)) { + if (!pred->always_true()) { all_pred_always_true = false; break; } } if (all_pred_always_true) { for (const auto& pred : _pre_eval_block_predicate) { - pred->always_true(true); + pred->always_true(); } } + const uint16_t original_size = selected_size; //If all predicates are always_true, then return directly. if (all_pred_always_true || !_is_need_vec_eval) { - for (uint16_t i = 0; i < selected_size; ++i) { + for (uint16_t i = 0; i < original_size; ++i) { sel_rowid_idx[i] = i; } - return selected_size; + // All preds are always_true, so return immediately and update the profile statistics here. + _opts.stats->vec_cond_input_rows += original_size; + return original_size; } - uint16_t original_size = selected_size; _ret_flags.resize(original_size); DCHECK(!_pre_eval_block_predicate.empty()); bool is_first = true; for (auto& pred : _pre_eval_block_predicate) { - if (pred->always_true(true)) { + if (pred->always_true()) { continue; } auto column_id = pred->column_id(); @@ -1845,10 +1856,6 @@ uint16_t SegmentIterator::_evaluate_short_circuit_predicate(uint16_t* vec_sel_ro selected_size = predicate->evaluate(*short_cir_column, vec_sel_rowid_idx, selected_size); } - // collect profile - for (auto* p : _filter_info_id) { - _opts.stats->filter_info[p->get_filter_id()] = p->get_filtered_info(); - } _opts.stats->short_circuit_cond_input_rows += original_size; _opts.stats->rows_short_circuit_cond_filtered += original_size - selected_size; @@ -1860,6 +1867,17 @@ uint16_t SegmentIterator::_evaluate_short_circuit_predicate(uint16_t* vec_sel_ro return selected_size; } +void SegmentIterator::_collect_runtime_filter_predicate() { + // collect profile + for (auto* p : _filter_info_id) { + // There is a situation, such as with in or minmax filters, + // where intermediate conversion to a key range or other types + // prevents obtaining the filter id. + if (p->get_filter_id() >= 0) { + _opts.stats->filter_info[p->get_filter_id()] = p->get_filtered_info(); + } + } +} Status SegmentIterator::_read_columns_by_rowids(std::vector& read_column_ids, std::vector& rowid_vector, uint16_t* sel_rowid_idx, size_t select_size, @@ -1926,7 +1944,7 @@ Status SegmentIterator::next_batch(vectorized::Block* block) { // if rows read by batch is 0, will return end of file, we should not remove segment cache in this situation. if (!status.ok() && !status.is()) { - _segment->remove_from_segment_cache(); + _segment->update_healthy_status(status); } return status; } @@ -1988,6 +2006,9 @@ Status SegmentIterator::_next_batch_internal(vectorized::Block* block) { if (UNLIKELY(!_lazy_inited)) { RETURN_IF_ERROR(_lazy_init()); _lazy_inited = true; + // If the row bitmap size is smaller than block_row_max, there's no need to reserve that many column rows. 
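The nrows_reserve_limit / nrows_read_limit clamping added in the next hunk avoids reserving more column rows than the row bitmap can ever yield. A small sketch of the clamping arithmetic, assuming a non-positive topn_limit means the top-n optimization is off; the function name batch_limit is hypothetical:

#include <algorithm>
#include <cstdint>
#include <iostream>

uint32_t batch_limit(uint32_t block_row_max, uint64_t bitmap_cardinality,
                     int64_t topn_limit /* <= 0 means no top-n optimization */) {
    uint32_t limit = block_row_max;
    if (topn_limit > 0) {
        // Top-n reads never need more rows than the limit itself.
        limit = std::min<uint32_t>(static_cast<uint32_t>(topn_limit), limit);
    }
    // No point reserving more rows than the row bitmap can ever produce.
    return static_cast<uint32_t>(std::min<uint64_t>(bitmap_cardinality, limit));
}

int main() {
    std::cout << batch_limit(4096, 123, 0) << "\n";     // 123: tiny bitmap wins
    std::cout << batch_limit(4096, 100000, 10) << "\n"; // 10: top-n wins
    return 0;
}

For highly selective scans this keeps per-column reserve() calls proportional to the surviving row count instead of the configured block size.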
+ auto nrows_reserve_limit = + std::min(_row_bitmap.cardinality(), uint64_t(_opts.block_row_max)); if (_lazy_materialization_read || _opts.record_rowids || _is_need_expr_eval) { _block_rowids.resize(_opts.block_row_max); } @@ -2012,7 +2033,7 @@ Status SegmentIterator::_next_batch_internal(vectorized::Block* block) { storage_column_type->is_nullable(), _opts.io_ctx.reader_type)); _current_return_columns[cid]->set_rowset_segment_id( {_segment->rowset_id(), _segment->id()}); - _current_return_columns[cid]->reserve(_opts.block_row_max); + _current_return_columns[cid]->reserve(nrows_reserve_limit); } else if (i >= block->columns()) { // if i >= block->columns means the column and not the pred_column means `column i` is // a delete condition column. but the column is not effective in the segment. so we just @@ -2023,7 +2044,7 @@ Status SegmentIterator::_next_batch_internal(vectorized::Block* block) { // TODO: skip read the not effective delete column to speed up segment read. _current_return_columns[cid] = Schema::get_data_type_ptr(*column_desc)->create_column(); - _current_return_columns[cid]->reserve(_opts.block_row_max); + _current_return_columns[cid]->reserve(nrows_reserve_limit); } } @@ -2048,7 +2069,8 @@ Status SegmentIterator::_next_batch_internal(vectorized::Block* block) { if (_can_opt_topn_reads()) { nrows_read_limit = std::min(static_cast(_opts.topn_limit), nrows_read_limit); } - + // If the row bitmap size is smaller than nrows_read_limit, there's no need to reserve that many column rows. + nrows_read_limit = std::min(_row_bitmap.cardinality(), uint64_t(nrows_read_limit)); DBUG_EXECUTE_IF("segment_iterator.topn_opt_1", { if (nrows_read_limit != 1) { return Status::Error("topn opt 1 execute failed: {}", @@ -2063,8 +2085,8 @@ Status SegmentIterator::_next_batch_internal(vectorized::Block* block) { RETURN_IF_ERROR(_read_columns_by_index( nrows_read_limit, _current_batch_rows_read, _lazy_materialization_read || _opts.record_rowids || _is_need_expr_eval)); - if (std::find(_first_read_column_ids.begin(), _first_read_column_ids.end(), - _schema->version_col_idx()) != _first_read_column_ids.end()) { + if (std::find(_predicate_column_ids.begin(), _predicate_column_ids.end(), + _schema->version_col_idx()) != _predicate_column_ids.end()) { _replace_version_col(_current_batch_rows_read); } @@ -2089,7 +2111,7 @@ Status SegmentIterator::_next_batch_internal(vectorized::Block* block) { if (_non_predicate_columns.empty()) { return Status::InternalError("_non_predicate_columns is empty"); } - RETURN_IF_ERROR(_convert_to_expected_type(_first_read_column_ids)); + RETURN_IF_ERROR(_convert_to_expected_type(_predicate_column_ids)); RETURN_IF_ERROR(_convert_to_expected_type(_non_predicate_columns)); _output_non_pred_columns(block); } else { @@ -2108,29 +2130,31 @@ Status SegmentIterator::_next_batch_internal(vectorized::Block* block) { // In SSB test, it make no difference; So need more scenarios to test selected_size = _evaluate_short_circuit_predicate(_sel_rowid_idx.data(), selected_size); + _collect_runtime_filter_predicate(); if (selected_size > 0) { // step 3.1: output short circuit and predicate column - // when lazy materialization enables, _first_read_column_ids = distinct(_short_cir_pred_column_ids + _vec_pred_column_ids) + // when lazy materialization enables, _predicate_column_ids = distinct(_short_cir_pred_column_ids + _vec_pred_column_ids) // see _vec_init_lazy_materialization // todo(wb) need to tell input columnids from output columnids - RETURN_IF_ERROR(_output_column_by_sel_idx(block, 
_first_read_column_ids, + RETURN_IF_ERROR(_output_column_by_sel_idx(block, _predicate_column_ids, _sel_rowid_idx.data(), selected_size)); // step 3.2: read remaining expr column and evaluate it. if (_is_need_expr_eval) { // The predicate column contains the remaining expr column, no need second read. - if (!_second_read_column_ids.empty()) { - SCOPED_RAW_TIMER(&_opts.stats->second_read_ns); + if (!_non_predicate_column_ids.empty()) { + SCOPED_RAW_TIMER(&_opts.stats->non_predicate_read_ns); RETURN_IF_ERROR(_read_columns_by_rowids( - _second_read_column_ids, _block_rowids, _sel_rowid_idx.data(), + _non_predicate_column_ids, _block_rowids, _sel_rowid_idx.data(), selected_size, &_current_return_columns)); - if (std::find(_second_read_column_ids.begin(), - _second_read_column_ids.end(), _schema->version_col_idx()) != - _second_read_column_ids.end()) { + if (std::find(_non_predicate_column_ids.begin(), + _non_predicate_column_ids.end(), + _schema->version_col_idx()) != + _non_predicate_column_ids.end()) { _replace_version_col(selected_size); } - RETURN_IF_ERROR(_convert_to_expected_type(_second_read_column_ids)); - for (auto cid : _second_read_column_ids) { + RETURN_IF_ERROR(_convert_to_expected_type(_non_predicate_column_ids)); + for (auto cid : _non_predicate_column_ids) { auto loc = _schema_block_id_map[cid]; block->replace_by_position(loc, std::move(_current_return_columns[cid])); @@ -2163,17 +2187,17 @@ Status SegmentIterator::_next_batch_internal(vectorized::Block* block) { } } } else if (_is_need_expr_eval) { - RETURN_IF_ERROR(_convert_to_expected_type(_second_read_column_ids)); - for (auto cid : _second_read_column_ids) { + RETURN_IF_ERROR(_convert_to_expected_type(_non_predicate_column_ids)); + for (auto cid : _non_predicate_column_ids) { auto loc = _schema_block_id_map[cid]; block->replace_by_position(loc, std::move(_current_return_columns[cid])); } } } else if (_is_need_expr_eval) { - DCHECK(!_first_read_column_ids.empty()); - RETURN_IF_ERROR(_convert_to_expected_type(_first_read_column_ids)); + DCHECK(!_predicate_column_ids.empty()); + RETURN_IF_ERROR(_convert_to_expected_type(_predicate_column_ids)); // first read all rows are insert block, initialize sel_rowid_idx to all rows. 
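_collect_runtime_filter_predicate() above guards on get_filter_id() >= 0 because, as its comment notes, a runtime filter that was rewritten into a key range or min/max form can lose its id. A minimal sketch of that guarded profile collection; RuntimeFilterPred and the id/row counts are invented for illustration:

#include <iostream>
#include <map>
#include <vector>

// Toy stand-in for a runtime-filter predicate: predicates rewritten into
// key ranges or similar forms report -1 because their id was not preserved.
struct RuntimeFilterPred {
    int filter_id;
    long filtered_rows;
};

int main() {
    std::vector<RuntimeFilterPred> filter_info_id = {{3, 1200}, {-1, 999}, {7, 50}};
    std::map<int, long> filter_info; // profile slot keyed by filter id
    for (const auto& p : filter_info_id) {
        // Mirror the guard: skip predicates whose id could not be preserved
        // instead of writing statistics into a bogus slot.
        if (p.filter_id >= 0) {
            filter_info[p.filter_id] = p.filtered_rows;
        }
    }
    for (const auto& [id, rows] : filter_info) {
        std::cout << "filter " << id << " filtered " << rows << " rows\n";
    }
    return 0;
}

Moving this collection out of _evaluate_short_circuit_predicate() into its own helper also means the profile is updated once per batch rather than only on the short-circuit path.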
- for (auto cid : _first_read_column_ids) { + for (auto cid : _predicate_column_ids) { auto loc = _schema_block_id_map[cid]; block->replace_by_position(loc, std::move(_current_return_columns[cid])); } diff --git a/be/src/olap/rowset/segment_v2/segment_iterator.h b/be/src/olap/rowset/segment_v2/segment_iterator.h index 50876cd7c55fe8..5588661302dd06 100644 --- a/be/src/olap/rowset/segment_v2/segment_iterator.h +++ b/be/src/olap/rowset/segment_v2/segment_iterator.h @@ -156,6 +156,11 @@ class SegmentIterator : public RowwiseIterator { return _inverted_index_iterators; } + bool has_inverted_index_in_iterators() const { + return std::any_of(_inverted_index_iterators.begin(), _inverted_index_iterators.end(), + [](const auto& iterator) { return iterator != nullptr; }); + } + private: Status _next_batch_internal(vectorized::Block* block); @@ -229,6 +234,7 @@ class SegmentIterator : public RowwiseIterator { uint32_t nrows_read_limit); uint16_t _evaluate_vectorization_predicate(uint16_t* sel_rowid_idx, uint16_t selected_size); uint16_t _evaluate_short_circuit_predicate(uint16_t* sel_rowid_idx, uint16_t selected_size); + void _collect_runtime_filter_predicate(); void _output_non_pred_columns(vectorized::Block* block); [[nodiscard]] Status _read_columns_by_rowids(std::vector& read_column_ids, std::vector& rowid_vector, @@ -426,8 +432,8 @@ class SegmentIterator : public RowwiseIterator { // first, read predicate columns by various index // second, read non-predicate columns // so we need a field to stand for columns first time to read - std::vector _first_read_column_ids; - std::vector _second_read_column_ids; + std::vector _predicate_column_ids; + std::vector _non_predicate_column_ids; std::vector _columns_to_filter; std::vector _converted_column_ids; std::vector _schema_block_id_map; // map from schema column id to column idx in Block diff --git a/be/src/olap/rowset/segment_v2/segment_writer.cpp b/be/src/olap/rowset/segment_v2/segment_writer.cpp index ab552055790c2c..e4288338d48458 100644 --- a/be/src/olap/rowset/segment_v2/segment_writer.cpp +++ b/be/src/olap/rowset/segment_v2/segment_writer.cpp @@ -85,13 +85,14 @@ inline std::string segment_mem_tracker_name(uint32_t segment_id) { SegmentWriter::SegmentWriter(io::FileWriter* file_writer, uint32_t segment_id, TabletSchemaSPtr tablet_schema, BaseTabletSPtr tablet, DataDir* data_dir, const SegmentWriterOptions& opts, - io::FileWriterPtr inverted_file_writer) + InvertedIndexFileWriter* inverted_file_writer) : _segment_id(segment_id), _tablet_schema(std::move(tablet_schema)), _tablet(std::move(tablet)), _data_dir(data_dir), _opts(opts), _file_writer(file_writer), + _inverted_index_file_writer(inverted_file_writer), _mem_tracker(std::make_unique(segment_mem_tracker_name(segment_id))), _mow_context(std::move(opts.mow_ctx)) { CHECK_NOTNULL(file_writer); @@ -132,17 +133,6 @@ SegmentWriter::SegmentWriter(io::FileWriter* file_writer, uint32_t segment_id, } } } - if (_tablet_schema->has_inverted_index()) { - _inverted_index_file_writer = std::make_unique( - _opts.rowset_ctx->fs(), - std::string {InvertedIndexDescriptor::get_index_file_path_prefix( - file_writer->path().c_str())}, - _opts.rowset_ctx->rowset_id.to_string(), segment_id, - _tablet_schema->get_inverted_index_storage_format(), - std::move(inverted_file_writer)); - _inverted_index_file_writer->set_file_writer_opts( - _opts.rowset_ctx->get_file_writer_options()); - } } SegmentWriter::~SegmentWriter() { @@ -202,57 +192,60 @@ Status SegmentWriter::_create_column_writer(uint32_t cid, const TabletColumn& 
co if (tablet_index) { opts.need_bloom_filter = true; opts.is_ngram_bf_index = true; - opts.gram_size = tablet_index->get_gram_size(); - opts.gram_bf_size = tablet_index->get_gram_bf_size(); + // narrowing conversion from int32_t to uint8_t and uint16_t is dangerous, so validate the ranges first + auto gram_size = tablet_index->get_gram_size(); + auto gram_bf_size = tablet_index->get_gram_bf_size(); + if (gram_size > 255 || gram_size < 1) { + return Status::NotSupported("Do not support ngram bloom filter for ngram_size: ", + gram_size); + } + if (gram_bf_size > 65535 || gram_bf_size < 64) { + return Status::NotSupported("Do not support ngram bloom filter for bf_size: ", + gram_bf_size); + } + opts.gram_size = gram_size; + opts.gram_bf_size = gram_bf_size; } opts.need_bitmap_index = column.has_bitmap_index(); bool skip_inverted_index = false; if (_opts.rowset_ctx != nullptr) { - // skip write inverted index for index compaction - skip_inverted_index = _opts.rowset_ctx->skip_inverted_index.count(column.unique_id()) > 0; + // skip writing the inverted index for index compaction columns + skip_inverted_index = + _opts.rowset_ctx->columns_to_do_index_compaction.count(column.unique_id()) > 0; } // skip write inverted index on load if skip_write_index_on_load is true if (_opts.write_type == DataWriteType::TYPE_DIRECT && schema->skip_write_index_on_load()) { skip_inverted_index = true; } - // indexes for this column - opts.indexes = schema->get_indexes_for_column(column); - if (!InvertedIndexColumnWriter::check_support_inverted_index(column)) { - opts.need_zone_map = false; - opts.need_bloom_filter = false; - opts.need_bitmap_index = false; - } - opts.inverted_index_file_writer = _inverted_index_file_writer.get(); - for (const auto* index : opts.indexes) { - if (!skip_inverted_index && index->index_type() == IndexType::INVERTED) { - opts.inverted_index = index; - opts.need_inverted_index = true; - // TODO support multiple inverted index - break; - } - } -#define CHECK_FIELD_TYPE(TYPE, type_name) \ - if (column.type() == FieldType::OLAP_FIELD_TYPE_##TYPE) { \ - opts.need_zone_map = false; \ - if (opts.need_bloom_filter) { \ - return Status::NotSupported("Do not support bloom filter for " type_name " type"); \ - } \ - if (opts.need_bitmap_index) { \ - return Status::NotSupported("Do not support bitmap index for " type_name " type"); \ - } \ - } - - CHECK_FIELD_TYPE(STRUCT, "struct") - CHECK_FIELD_TYPE(ARRAY, "array") - CHECK_FIELD_TYPE(JSONB, "jsonb") - CHECK_FIELD_TYPE(AGG_STATE, "agg_state") - CHECK_FIELD_TYPE(MAP, "map") - CHECK_FIELD_TYPE(OBJECT, "object") - CHECK_FIELD_TYPE(HLL, "hll") - CHECK_FIELD_TYPE(QUANTILE_STATE, "quantile_state") -#undef CHECK_FIELD_TYPE + // indexes for this column + if (const auto& index = schema->inverted_index(column); + index != nullptr && !skip_inverted_index) { + opts.inverted_index = index; + opts.need_inverted_index = true; + DCHECK(_inverted_index_file_writer != nullptr); + opts.inverted_index_file_writer = _inverted_index_file_writer; + // TODO support multiple inverted index + } +#define DISABLE_INDEX_IF_FIELD_TYPE(TYPE, type_name) \ + if (column.type() == FieldType::OLAP_FIELD_TYPE_##TYPE) { \ + opts.need_zone_map = false; \ + opts.need_bloom_filter = false; \ + opts.need_bitmap_index = false; \ + } + + DISABLE_INDEX_IF_FIELD_TYPE(STRUCT, "struct") + DISABLE_INDEX_IF_FIELD_TYPE(ARRAY, "array") + DISABLE_INDEX_IF_FIELD_TYPE(JSONB, "jsonb") + DISABLE_INDEX_IF_FIELD_TYPE(AGG_STATE, "agg_state") + DISABLE_INDEX_IF_FIELD_TYPE(MAP, "map") + DISABLE_INDEX_IF_FIELD_TYPE(OBJECT, "object") + 
DISABLE_INDEX_IF_FIELD_TYPE(HLL, "hll") + DISABLE_INDEX_IF_FIELD_TYPE(QUANTILE_STATE, "quantile_state") + DISABLE_INDEX_IF_FIELD_TYPE(VARIANT, "variant") + +#undef DISABLE_INDEX_IF_FIELD_TYPE if (column.is_row_store_column()) { // smaller page size for row store column @@ -464,10 +457,11 @@ void SegmentWriter::_serialize_block_to_row_column(vectorized::Block& block) { Status SegmentWriter::probe_key_for_mow( std::string key, std::size_t segment_pos, bool have_input_seq_column, bool have_delete_sign, - PartialUpdateReadPlan& read_plan, const std::vector& specified_rowsets, + const std::vector& specified_rowsets, std::vector>& segment_caches, bool& has_default_or_nullable, std::vector& use_default_or_null_flag, - PartialUpdateStats& stats) { + const std::function& found_cb, + const std::function& not_found_cb, PartialUpdateStats& stats) { RowLocation loc; // save rowset shared ptr so this rowset wouldn't delete RowsetSharedPtr rowset; @@ -482,9 +476,7 @@ Status SegmentWriter::probe_key_for_mow( {_opts.rowset_ctx->rowset_id, _segment_id, DeleteBitmap::TEMP_VERSION_COMMON}, segment_pos); } else if (!have_delete_sign) { - RETURN_IF_ERROR( - _opts.rowset_ctx->partial_update_info->handle_non_strict_mode_not_found_error( - *_tablet_schema)); + RETURN_IF_ERROR(not_found_cb()); } ++stats.num_rows_new_added; has_default_or_nullable = true; @@ -508,7 +500,7 @@ Status SegmentWriter::probe_key_for_mow( // partial update should not contain invisible columns use_default_or_null_flag.emplace_back(false); _rsid_to_rowset.emplace(rowset->rowset_id(), rowset); - read_plan.prepare_to_read(loc, segment_pos); + found_cb(loc); } if (st.is()) { @@ -546,6 +538,7 @@ Status SegmentWriter::append_block_with_partial_content(const vectorized::Block* DCHECK(_is_mow()); DCHECK(_opts.rowset_ctx->partial_update_info); + DCHECK(_opts.rowset_ctx->partial_update_info->is_fixed_partial_update()); DCHECK(row_pos == 0); // find missing column cids @@ -595,7 +588,7 @@ Status SegmentWriter::append_block_with_partial_content(const vectorized::Block* const std::vector& specified_rowsets = _mow_context->rowset_ptrs; std::vector> segment_caches(specified_rowsets.size()); - PartialUpdateReadPlan read_plan; + FixedReadPlan read_plan; // locate rows in base data PartialUpdateStats stats; @@ -625,10 +618,17 @@ Status SegmentWriter::append_block_with_partial_content(const vectorized::Block* bool have_delete_sign = (delete_sign_column_data != nullptr && delete_sign_column_data[block_pos] != 0); - RETURN_IF_ERROR(probe_key_for_mow(key, segment_pos, have_input_seq_column, have_delete_sign, - read_plan, specified_rowsets, segment_caches, + auto not_found_cb = [&]() { + return _opts.rowset_ctx->partial_update_info->handle_non_strict_mode_not_found_error( + *_tablet_schema); + }; + auto update_read_plan = [&](const RowLocation& loc) { + read_plan.prepare_to_read(loc, segment_pos); + }; + RETURN_IF_ERROR(probe_key_for_mow(std::move(key), segment_pos, have_input_seq_column, + have_delete_sign, specified_rowsets, segment_caches, has_default_or_nullable, use_default_or_null_flag, - stats)); + update_read_plan, not_found_cb, stats)); } CHECK_EQ(use_default_or_null_flag.size(), num_rows); @@ -637,7 +637,7 @@ Status SegmentWriter::append_block_with_partial_content(const vectorized::Block* _mow_context->rowset_ids); } - // read and fill block + // read to fill full block RETURN_IF_ERROR(read_plan.fill_missing_columns( _opts.rowset_ctx, _rsid_to_rowset, *_tablet_schema, full_block, use_default_or_null_flag, has_default_or_nullable, segment_start_pos, 
block)); @@ -692,10 +692,17 @@ Status SegmentWriter::append_block_with_partial_content(const vectorized::Block* Status SegmentWriter::append_block(const vectorized::Block* block, size_t row_pos, size_t num_rows) { if (_opts.rowset_ctx->partial_update_info && - _opts.rowset_ctx->partial_update_info->is_partial_update && + _opts.rowset_ctx->partial_update_info->is_partial_update() && _opts.write_type == DataWriteType::TYPE_DIRECT && !_opts.rowset_ctx->is_transient_rowset_writer) { - RETURN_IF_ERROR(append_block_with_partial_content(block, row_pos, num_rows)); + if (_opts.rowset_ctx->partial_update_info->is_fixed_partial_update()) { + RETURN_IF_ERROR(append_block_with_partial_content(block, row_pos, num_rows)); + } else { + return Status::NotSupported( + "SegmentWriter doesn't support flexible partial update, please set " + "enable_vertical_segment_writer=true in be.conf on all BEs to use " + "VerticalSegmentWriter."); + } return Status::OK(); } CHECK(block->columns() >= _column_writers.size()) @@ -1010,10 +1017,6 @@ Status SegmentWriter::finalize_footer(uint64_t* segment_file_size) { if (*segment_file_size == 0) { return Status::Corruption("Bad segment, file size = 0"); } - if (_inverted_index_file_writer != nullptr) { - RETURN_IF_ERROR(_inverted_index_file_writer->close()); - _inverted_index_file_info = _inverted_index_file_writer->get_index_file_info(); - } return Status::OK(); } @@ -1254,13 +1257,6 @@ Status SegmentWriter::_generate_short_key_index( return Status::OK(); } -int64_t SegmentWriter::get_inverted_index_total_size() { - if (_inverted_index_file_writer != nullptr) { - return _inverted_index_file_writer->get_index_file_total_size(); - } - return 0; -} - inline bool SegmentWriter::_is_mow() { return _tablet_schema->keys_type() == UNIQUE_KEYS && _opts.enable_unique_key_merge_on_write; } diff --git a/be/src/olap/rowset/segment_v2/segment_writer.h b/be/src/olap/rowset/segment_v2/segment_writer.h index 37b514e69c7001..9a8af131087f92 100644 --- a/be/src/olap/rowset/segment_v2/segment_writer.h +++ b/be/src/olap/rowset/segment_v2/segment_writer.h @@ -23,6 +23,7 @@ #include #include +#include #include #include // unique_ptr #include @@ -33,6 +34,7 @@ #include "gutil/strings/substitute.h" #include "olap/olap_define.h" #include "olap/rowset/segment_v2/column_writer.h" +#include "olap/rowset/segment_v2/inverted_index_file_writer.h" #include "olap/tablet.h" #include "olap/tablet_schema.h" #include "util/faststring.h" @@ -60,7 +62,6 @@ class FileWriter; } // namespace io namespace segment_v2 { -class InvertedIndexFileWriter; extern const char* k_segment_magic; extern const uint32_t k_segment_magic_length; @@ -83,7 +84,7 @@ class SegmentWriter { explicit SegmentWriter(io::FileWriter* file_writer, uint32_t segment_id, TabletSchemaSPtr tablet_schema, BaseTabletSPtr tablet, DataDir* data_dir, const SegmentWriterOptions& opts, - io::FileWriterPtr inverted_file_writer = nullptr); + InvertedIndexFileWriter* inverted_file_writer); ~SegmentWriter(); Status init(); @@ -96,11 +97,13 @@ class SegmentWriter { Status append_block(const vectorized::Block* block, size_t row_pos, size_t num_rows); Status probe_key_for_mow(std::string key, std::size_t segment_pos, bool have_input_seq_column, - bool have_delete_sign, PartialUpdateReadPlan& read_plan, + bool have_delete_sign, const std::vector& specified_rowsets, std::vector>& segment_caches, bool& has_default_or_nullable, std::vector& use_default_or_null_flag, + const std::function& found_cb, + const std::function& not_found_cb, PartialUpdateStats& stats); 
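probe_key_for_mow() now takes found_cb/not_found_cb callbacks instead of a concrete PartialUpdateReadPlan, so fixed and flexible partial-update paths can plug in their own policies while sharing one probe. A simplified, self-contained sketch of that callback shape; the toy map-based index, the RowLocation fields and the messages are assumptions for illustration, not the Doris types:

#include <functional>
#include <iostream>
#include <map>
#include <string>

struct RowLocation {
    int segment_id;
    int row_id;
};

// Probe a toy "primary key index"; policy is deferred to the callbacks so
// the same probe can serve different partial-update strategies.
bool probe_key(const std::map<std::string, RowLocation>& index, const std::string& key,
               const std::function<void(const RowLocation&)>& found_cb,
               const std::function<void()>& not_found_cb) {
    auto it = index.find(key);
    if (it == index.end()) {
        not_found_cb(); // e.g. fill defaults, or raise in strict mode
        return false;
    }
    found_cb(it->second); // e.g. record a read-plan entry for the old row
    return true;
}

int main() {
    std::map<std::string, RowLocation> pk_index = {{"k1", {0, 42}}};
    auto found = [](const RowLocation& loc) {
        std::cout << "plan read: segment " << loc.segment_id << " row " << loc.row_id << "\n";
    };
    auto not_found = [] { std::cout << "fill defaults for new row\n"; };
    probe_key(pk_index, "k1", found, not_found);
    probe_key(pk_index, "k9", found, not_found);
    return 0;
}

In the diff above, append_block_with_partial_content builds exactly these two lambdas: update_read_plan wraps FixedReadPlan::prepare_to_read, and not_found_cb forwards to the partial-update info's non-strict-mode error handling.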
Status append_block_with_partial_content(const vectorized::Block* block, size_t row_pos, size_t num_rows); @@ -110,9 +113,6 @@ class SegmentWriter { uint64_t estimate_segment_size(); - InvertedIndexFileInfo get_inverted_index_file_info() const { return _inverted_index_file_info; } - int64_t get_inverted_index_total_size(); - uint32_t num_rows_written() const { return _num_rows_written; } // for partial update @@ -144,6 +144,17 @@ class SegmentWriter { void set_mow_context(std::shared_ptr mow_context); + Status close_inverted_index(int64_t* inverted_index_file_size) { + // no inverted index + if (_inverted_index_file_writer == nullptr) { + *inverted_index_file_size = 0; + return Status::OK(); + } + RETURN_IF_ERROR(_inverted_index_file_writer->close()); + *inverted_index_file_size = _inverted_index_file_writer->get_index_file_total_size(); + return Status::OK(); + } + private: DISALLOW_COPY_AND_ASSIGN(SegmentWriter); Status _create_column_writer(uint32_t cid, const TabletColumn& column, @@ -199,13 +210,15 @@ class SegmentWriter { // Not owned. owned by RowsetWriter or SegmentFlusher io::FileWriter* _file_writer = nullptr; - std::unique_ptr _inverted_index_file_writer; + // Not owned. owned by RowsetWriter or SegmentFlusher + InvertedIndexFileWriter* _inverted_index_file_writer = nullptr; + SegmentFooterPB _footer; // for mow tables with cluster key, the sort key is the cluster keys not unique keys // for other tables, the sort key is the keys size_t _num_sort_key_columns; size_t _num_short_key_columns; - InvertedIndexFileInfo _inverted_index_file_info; + std::unique_ptr _short_key_index_builder; std::unique_ptr _primary_key_index_builder; std::vector> _column_writers; diff --git a/be/src/olap/rowset/segment_v2/vertical_segment_writer.cpp b/be/src/olap/rowset/segment_v2/vertical_segment_writer.cpp index 5d9275b7742ee8..e8804d93e78fc2 100644 --- a/be/src/olap/rowset/segment_v2/vertical_segment_writer.cpp +++ b/be/src/olap/rowset/segment_v2/vertical_segment_writer.cpp @@ -32,6 +32,7 @@ #include "common/compiler_util.h" // IWYU pragma: keep #include "common/config.h" #include "common/logging.h" // LOG +#include "common/status.h" #include "gutil/port.h" #include "inverted_index_fs_directory.h" #include "io/fs/file_writer.h" @@ -64,6 +65,7 @@ #include "vec/columns/column_nullable.h" #include "vec/columns/column_vector.h" #include "vec/columns/columns_number.h" +#include "vec/common/assert_cast.h" #include "vec/common/schema_util.h" #include "vec/core/block.h" #include "vec/core/column_with_type_and_name.h" @@ -88,13 +90,14 @@ VerticalSegmentWriter::VerticalSegmentWriter(io::FileWriter* file_writer, uint32 TabletSchemaSPtr tablet_schema, BaseTabletSPtr tablet, DataDir* data_dir, const VerticalSegmentWriterOptions& opts, - io::FileWriterPtr inverted_file_writer) + InvertedIndexFileWriter* inverted_file_writer) : _segment_id(segment_id), _tablet_schema(std::move(tablet_schema)), _tablet(std::move(tablet)), _data_dir(data_dir), _opts(opts), _file_writer(file_writer), + _inverted_index_file_writer(inverted_file_writer), _mem_tracker(std::make_unique( vertical_segment_writer_mem_tracker_name(segment_id))), _mow_context(std::move(opts.mow_ctx)) { @@ -136,17 +139,6 @@ VerticalSegmentWriter::VerticalSegmentWriter(io::FileWriter* file_writer, uint32 } } } - if (_tablet_schema->has_inverted_index()) { - _inverted_index_file_writer = std::make_unique( - _opts.rowset_ctx->fs(), - std::string {InvertedIndexDescriptor::get_index_file_path_prefix( - _opts.rowset_ctx->segment_path(segment_id))}, - 
_opts.rowset_ctx->rowset_id.to_string(), segment_id, - _tablet_schema->get_inverted_index_storage_format(), - std::move(inverted_file_writer)); - _inverted_index_file_writer->set_file_writer_opts( - _opts.rowset_ctx->get_file_writer_options()); - } } VerticalSegmentWriter::~VerticalSegmentWriter() { @@ -193,57 +185,60 @@ Status VerticalSegmentWriter::_create_column_writer(uint32_t cid, const TabletCo if (tablet_index) { opts.need_bloom_filter = true; opts.is_ngram_bf_index = true; + // narrowing conversion from int32_t to uint8_t and uint16_t is dangerous - opts.gram_size = tablet_index->get_gram_size(); - opts.gram_bf_size = tablet_index->get_gram_bf_size(); + auto gram_size = tablet_index->get_gram_size(); + auto gram_bf_size = tablet_index->get_gram_bf_size(); + if (gram_size > 256 || gram_size < 1) { + return Status::NotSupported("Do not support ngram bloom filter for ngram_size: ", + gram_size); + } + if (gram_bf_size > 65535 || gram_bf_size < 64) { + return Status::NotSupported("Do not support ngram bloom filter for bf_size: ", + gram_bf_size); + } + opts.gram_size = gram_size; + opts.gram_bf_size = gram_bf_size; } opts.need_bitmap_index = column.has_bitmap_index(); bool skip_inverted_index = false; if (_opts.rowset_ctx != nullptr) { - // skip write inverted index for index compaction - skip_inverted_index = _opts.rowset_ctx->skip_inverted_index.contains(column.unique_id()); + // skip writing the inverted index for columns under index compaction + skip_inverted_index = + _opts.rowset_ctx->columns_to_do_index_compaction.contains(column.unique_id()); } // skip write inverted index on load if skip_write_index_on_load is true if (_opts.write_type == DataWriteType::TYPE_DIRECT && tablet_schema->skip_write_index_on_load()) { skip_inverted_index = true; } - // indexes for this column - opts.indexes = tablet_schema->get_indexes_for_column(column); - if (!InvertedIndexColumnWriter::check_support_inverted_index(column)) { - opts.need_zone_map = false; - opts.need_bloom_filter = false; - opts.need_bitmap_index = false; - } - for (const auto* index : opts.indexes) { - if (!skip_inverted_index && index->index_type() == IndexType::INVERTED) { - opts.inverted_index = index; - opts.need_inverted_index = true; - // TODO support multiple inverted index - break; - } + if (const auto& index = tablet_schema->inverted_index(column); + index != nullptr && !skip_inverted_index) { + opts.inverted_index = index; + opts.need_inverted_index = true; + DCHECK(_inverted_index_file_writer != nullptr); + opts.inverted_index_file_writer = _inverted_index_file_writer; + // TODO support multiple inverted index } - opts.inverted_index_file_writer = _inverted_index_file_writer.get(); -#define CHECK_FIELD_TYPE(TYPE, type_name) \ if (column.type() == FieldType::OLAP_FIELD_TYPE_##TYPE) { \ opts.need_zone_map = false; \ if (opts.need_bloom_filter) { \ return Status::NotSupported("Do not support bloom filter for " type_name " type"); \ } \ if (opts.need_bitmap_index) { \ return Status::NotSupported("Do not support bitmap index for " type_name " type"); \ } \ +#define DISABLE_INDEX_IF_FIELD_TYPE(TYPE, type_name) \ if (column.type() == FieldType::OLAP_FIELD_TYPE_##TYPE) { \ opts.need_zone_map = false; \ opts.need_bloom_filter = false; \ opts.need_bitmap_index = false; \ } - CHECK_FIELD_TYPE(STRUCT, "struct") - CHECK_FIELD_TYPE(ARRAY, "array") - CHECK_FIELD_TYPE(JSONB, "jsonb") - CHECK_FIELD_TYPE(AGG_STATE, "agg_state") - CHECK_FIELD_TYPE(MAP, "map") - CHECK_FIELD_TYPE(OBJECT, "object") - CHECK_FIELD_TYPE(HLL, "hll") -
CHECK_FIELD_TYPE(QUANTILE_STATE, "quantile_state") + DISABLE_INDEX_IF_FIELD_TYPE(STRUCT, "struct") + DISABLE_INDEX_IF_FIELD_TYPE(ARRAY, "array") + DISABLE_INDEX_IF_FIELD_TYPE(JSONB, "jsonb") + DISABLE_INDEX_IF_FIELD_TYPE(AGG_STATE, "agg_state") + DISABLE_INDEX_IF_FIELD_TYPE(MAP, "map") + DISABLE_INDEX_IF_FIELD_TYPE(OBJECT, "object") + DISABLE_INDEX_IF_FIELD_TYPE(HLL, "hll") + DISABLE_INDEX_IF_FIELD_TYPE(QUANTILE_STATE, "quantile_state") + DISABLE_INDEX_IF_FIELD_TYPE(VARIANT, "variant") + +#undef DISABLE_INDEX_IF_FIELD_TYPE #undef CHECK_FIELD_TYPE @@ -335,10 +330,11 @@ void VerticalSegmentWriter::_serialize_block_to_row_column(vectorized::Block& bl Status VerticalSegmentWriter::_probe_key_for_mow( std::string key, std::size_t segment_pos, bool have_input_seq_column, bool have_delete_sign, - PartialUpdateReadPlan& read_plan, const std::vector& specified_rowsets, + const std::vector& specified_rowsets, std::vector>& segment_caches, bool& has_default_or_nullable, std::vector& use_default_or_null_flag, - PartialUpdateStats& stats) { + const std::function& found_cb, + const std::function& not_found_cb, PartialUpdateStats& stats) { RowLocation loc; // save rowset shared ptr so this rowset wouldn't delete RowsetSharedPtr rowset; @@ -353,9 +349,7 @@ Status VerticalSegmentWriter::_probe_key_for_mow( {_opts.rowset_ctx->rowset_id, _segment_id, DeleteBitmap::TEMP_VERSION_COMMON}, segment_pos); } else if (!have_delete_sign) { - RETURN_IF_ERROR( - _opts.rowset_ctx->partial_update_info->handle_non_strict_mode_not_found_error( - *_tablet_schema)); + RETURN_IF_ERROR(not_found_cb()); } ++stats.num_rows_new_added; has_default_or_nullable = true; @@ -379,7 +373,7 @@ Status VerticalSegmentWriter::_probe_key_for_mow( // partial update should not contain invisible columns use_default_or_null_flag.emplace_back(false); _rsid_to_rowset.emplace(rowset->rowset_id(), rowset); - read_plan.prepare_to_read(loc, segment_pos); + found_cb(loc); } if (st.is()) { @@ -408,6 +402,7 @@ Status VerticalSegmentWriter::_append_block_with_partial_content(RowsInBlock& da vectorized::Block& full_block) { DCHECK(_is_mow()); DCHECK(_opts.rowset_ctx->partial_update_info != nullptr); + DCHECK(_opts.rowset_ctx->partial_update_info->is_fixed_partial_update()); DCHECK(data.row_pos == 0); // create full block and fill with input columns @@ -455,7 +450,7 @@ Status VerticalSegmentWriter::_append_block_with_partial_content(RowsInBlock& da const std::vector& specified_rowsets = _mow_context->rowset_ptrs; std::vector> segment_caches(specified_rowsets.size()); - PartialUpdateReadPlan read_plan; + FixedReadPlan read_plan; // locate rows in base data PartialUpdateStats stats; @@ -485,10 +480,17 @@ Status VerticalSegmentWriter::_append_block_with_partial_content(RowsInBlock& da bool have_delete_sign = (delete_sign_column_data != nullptr && delete_sign_column_data[block_pos] != 0); - RETURN_IF_ERROR(_probe_key_for_mow(key, segment_pos, have_input_seq_column, - have_delete_sign, read_plan, specified_rowsets, - segment_caches, has_default_or_nullable, - use_default_or_null_flag, stats)); + auto not_found_cb = [&]() { + return _opts.rowset_ctx->partial_update_info->handle_non_strict_mode_not_found_error( + *_tablet_schema); + }; + auto update_read_plan = [&](const RowLocation& loc) { + read_plan.prepare_to_read(loc, segment_pos); + }; + RETURN_IF_ERROR(_probe_key_for_mow(std::move(key), segment_pos, have_input_seq_column, + have_delete_sign, specified_rowsets, segment_caches, + has_default_or_nullable, use_default_or_null_flag, + update_read_plan, 
not_found_cb, stats)); } CHECK_EQ(use_default_or_null_flag.size(), data.num_rows); @@ -497,7 +499,7 @@ Status VerticalSegmentWriter::_append_block_with_partial_content(RowsInBlock& da _mow_context->rowset_ids); } - // read and fill block + // read to fill full_block RETURN_IF_ERROR(read_plan.fill_missing_columns( _opts.rowset_ctx, _rsid_to_rowset, *_tablet_schema, full_block, use_default_or_null_flag, has_default_or_nullable, segment_start_pos, data.block)); @@ -549,19 +551,396 @@ Status VerticalSegmentWriter::_append_block_with_partial_content(RowsInBlock& da return Status::OK(); } +Status VerticalSegmentWriter::_append_block_with_flexible_partial_content( + RowsInBlock& data, vectorized::Block& full_block) { + DCHECK(_is_mow()); + DCHECK(_opts.rowset_ctx->partial_update_info != nullptr); + DCHECK(_opts.rowset_ctx->partial_update_info->is_flexible_partial_update()); + DCHECK(data.row_pos == 0); + + // data.block has the same schema with full_block + DCHECK(data.block->columns() == _tablet_schema->num_columns()); + + // create full block and fill with sort key columns + full_block = _tablet_schema->create_block(); + + auto segment_start_pos = _column_writers.front()->get_next_rowid(); + + DCHECK(_tablet_schema->has_skip_bitmap_col()); + auto skip_bitmap_col_idx = _tablet_schema->skip_bitmap_col_idx(); + + auto get_skip_bitmaps = [&skip_bitmap_col_idx](const vectorized::Block* block) { + return &(assert_cast( + block->get_by_position(skip_bitmap_col_idx).column->assume_mutable().get()) + ->get_data()); + }; + std::vector* skip_bitmaps = get_skip_bitmaps(data.block); + + bool has_default_or_nullable = false; + std::vector use_default_or_null_flag; + use_default_or_null_flag.reserve(data.num_rows); + + int32_t seq_map_col_unique_id = _opts.rowset_ctx->partial_update_info->sequence_map_col_uid(); + bool schema_has_sequence_col = _tablet_schema->has_sequence_col(); + + DBUG_EXECUTE_IF("VerticalSegmentWriter._append_block_with_flexible_partial_content.sleep", + { sleep(60); }) + const std::vector& specified_rowsets = _mow_context->rowset_ptrs; + std::vector> segment_caches(specified_rowsets.size()); + + std::vector key_columns {}; + vectorized::IOlapColumnDataAccessor* seq_column {nullptr}; + + auto encode_key_columns = + [&full_block, &data, + this](std::vector& key_columns) -> Status { + key_columns.clear(); + for (std::size_t cid {0}; cid < _num_sort_key_columns; cid++) { + full_block.replace_by_position(cid, data.block->get_by_position(cid).column); + RETURN_IF_ERROR(_olap_data_convertor->set_source_content_with_specifid_column( + full_block.get_by_position(cid), data.row_pos, data.num_rows, cid)); + auto [status, column] = _olap_data_convertor->convert_column_data(cid); + if (!status.ok()) { + return status; + } + key_columns.push_back(column); + } + return Status::OK(); + }; + + auto encode_seq_column = [&data, &schema_has_sequence_col, + this](vectorized::IOlapColumnDataAccessor*& seq_column) -> Status { + seq_column = nullptr; + if (schema_has_sequence_col) { + auto seq_col_idx = _tablet_schema->sequence_col_idx(); + RETURN_IF_ERROR(_olap_data_convertor->set_source_content_with_specifid_column( + data.block->get_by_position(seq_col_idx), data.row_pos, data.num_rows, + seq_col_idx)); + auto [status, column] = _olap_data_convertor->convert_column_data(seq_col_idx); + if (!status.ok()) { + return status; + } + seq_column = column; + } + return Status::OK(); + }; + + // 1. 
encode key columns + // we can only encode sort key columns currently because all non-key columns in flexible partial update + // can have missing cells + RETURN_IF_ERROR(encode_key_columns(key_columns)); + + // 2. encode sequence column + // We encode the sequence column even though it may have invalid values in some rows because we need to + // encode the value of sequence column in key for rows that have a valid value in sequence column during + // lookup_raw_key. We will encode the sequence column again at the end of this method. At that time, we have + // a valid sequence column to encode the key with seq col. + RETURN_IF_ERROR(encode_seq_column(seq_column)); + + // 3. merge duplicate rows when table has sequence column + // When there are multiple rows with the same keys in memtable, some of them specify the sequence column, + // some of them don't. We can't do the de-duplication in memtable. We must de-duplicate them here. + if (schema_has_sequence_col) { + std::size_t origin_rows = data.num_rows; + RETURN_IF_ERROR(_merge_rows_for_sequence_column(data, skip_bitmaps, key_columns, seq_column, + specified_rowsets, segment_caches)); + if (origin_rows != data.num_rows) { + // data in block has changed, should re-encode key columns, sequence column and re-get skip_bitmaps + _olap_data_convertor->clear_source_content(); + RETURN_IF_ERROR(encode_key_columns(key_columns)); + RETURN_IF_ERROR(encode_seq_column(seq_column)); + skip_bitmaps = get_skip_bitmaps(data.block); + } + } + + const auto* delete_sign_column_data = + BaseTablet::get_delete_sign_column_data(*data.block, data.row_pos + data.num_rows); + DCHECK(delete_sign_column_data != nullptr); + + // 4. write key columns data + for (std::size_t cid {0}; cid < _num_sort_key_columns; cid++) { + const auto& column = key_columns[cid]; + DCHECK(_column_writers[cid]->get_next_rowid() == _num_rows_written); + RETURN_IF_ERROR(_column_writers[cid]->append(column->get_nullmap(), column->get_data(), + data.num_rows)); + DCHECK(_column_writers[cid]->get_next_rowid() == _num_rows_written + data.num_rows); + } + + // 5. generate read plan + FlexibleReadPlan read_plan {_tablet_schema->has_row_store_for_all_columns()}; + PartialUpdateStats stats; + RETURN_IF_ERROR(_generate_flexible_read_plan( + read_plan, data, segment_start_pos, schema_has_sequence_col, seq_map_col_unique_id, + skip_bitmaps, key_columns, seq_column, delete_sign_column_data, specified_rowsets, + segment_caches, has_default_or_nullable, use_default_or_null_flag, stats)); + CHECK_EQ(use_default_or_null_flag.size(), data.num_rows); + + if (config::enable_merge_on_write_correctness_check) { + _tablet->add_sentinel_mark_to_delete_bitmap(_mow_context->delete_bitmap.get(), + _mow_context->rowset_ids); + } + + // 6. read according to the plan to fill full_block + RETURN_IF_ERROR(read_plan.fill_non_primary_key_columns( + _opts.rowset_ctx, _rsid_to_rowset, *_tablet_schema, full_block, + use_default_or_null_flag, has_default_or_nullable, segment_start_pos, data.row_pos, + data.block, skip_bitmaps)); + + // TODO(bobhan1): should we replace the skip bitmap column with empty bitmaps to reduce storage occupation? + // this column is not needed in the read path for merge-on-write tables + + // 7. fill row store column + _serialize_block_to_row_column(full_block); + + // 8.
encode and write all non-primary key columns(including sequence column if exists) + for (auto cid = _num_sort_key_columns; cid < _tablet_schema->num_columns(); cid++) { + RETURN_IF_ERROR(_olap_data_convertor->set_source_content_with_specifid_column( + full_block.get_by_position(cid), data.row_pos, data.num_rows, cid)); + auto [status, column] = _olap_data_convertor->convert_column_data(cid); + if (!status.ok()) { + return status; + } + if (cid == _tablet_schema->sequence_col_idx()) { + // should use the latest encoded sequence column to build the primary index + seq_column = column; + } + DCHECK(_column_writers[cid]->get_next_rowid() == _num_rows_written); + RETURN_IF_ERROR(_column_writers[cid]->append(column->get_nullmap(), column->get_data(), + data.num_rows)); + DCHECK(_column_writers[cid]->get_next_rowid() == _num_rows_written + data.num_rows); + } + + _num_rows_updated += stats.num_rows_updated; + _num_rows_deleted += stats.num_rows_deleted; + _num_rows_new_added += stats.num_rows_new_added; + _num_rows_filtered += stats.num_rows_filtered; + + if (_num_rows_written != data.row_pos || + _primary_key_index_builder->num_rows() != _num_rows_written) { + return Status::InternalError( + "Correctness check failed, _num_rows_written: {}, row_pos: {}, primary key " + "index builder num rows: {}", + _num_rows_written, data.row_pos, _primary_key_index_builder->num_rows()); + } + + // 9. build primary key index + RETURN_IF_ERROR(_generate_primary_key_index(_key_coders, key_columns, seq_column, data.num_rows, + false)); + + _num_rows_written += data.num_rows; + DCHECK_EQ(_primary_key_index_builder->num_rows(), _num_rows_written) + << "primary key index builder num rows(" << _primary_key_index_builder->num_rows() + << ") not equal to segment writer's num rows written(" << _num_rows_written << ")"; + _olap_data_convertor->clear_source_content(); + return Status::OK(); +} + +Status VerticalSegmentWriter::_generate_encoded_default_seq_value(const TabletSchema& tablet_schema, + const PartialUpdateInfo& info, + std::string* encoded_value) { + const auto& seq_column = tablet_schema.column(tablet_schema.sequence_col_idx()); + auto block = tablet_schema.create_block_by_cids( + {static_cast(tablet_schema.sequence_col_idx())}); + if (seq_column.has_default_value()) { + auto idx = tablet_schema.sequence_col_idx() - tablet_schema.num_key_columns(); + const auto& default_value = info.default_values[idx]; + vectorized::ReadBuffer rb(const_cast(default_value.c_str()), default_value.size()); + RETURN_IF_ERROR(block.get_by_position(0).type->from_string( + rb, block.get_by_position(0).column->assume_mutable().get())); + + } else { + block.get_by_position(0).column->assume_mutable()->insert_default(); + } + DCHECK_EQ(block.rows(), 1); + auto olap_data_convertor = std::make_unique(); + olap_data_convertor->add_column_data_convertor(seq_column); + olap_data_convertor->set_source_content(&block, 0, 1); + auto [status, column] = olap_data_convertor->convert_column_data(0); + if (!status.ok()) { + return status; + } + // include marker + _encode_seq_column(column, 0, encoded_value); + return Status::OK(); +} + +Status VerticalSegmentWriter::_generate_flexible_read_plan( + FlexibleReadPlan& read_plan, RowsInBlock& data, size_t segment_start_pos, + bool schema_has_sequence_col, int32_t seq_map_col_unique_id, + std::vector* skip_bitmaps, + const std::vector& key_columns, + vectorized::IOlapColumnDataAccessor* seq_column, const signed char* delete_sign_column_data, + const std::vector& specified_rowsets, + std::vector>& 
segment_caches, + bool& has_default_or_nullable, std::vector& use_default_or_null_flag, + PartialUpdateStats& stats) { + int32_t delete_sign_col_unique_id = + _tablet_schema->column(_tablet_schema->delete_sign_idx()).unique_id(); + int32_t seq_col_unique_id = + (_tablet_schema->has_sequence_col() + ? _tablet_schema->column(_tablet_schema->sequence_col_idx()).unique_id() + : -1); + for (size_t block_pos = data.row_pos; block_pos < data.row_pos + data.num_rows; block_pos++) { + size_t delta_pos = block_pos - data.row_pos; + size_t segment_pos = segment_start_pos + delta_pos; + auto& skip_bitmap = skip_bitmaps->at(block_pos); + + // the hidden sequence column should have the same mark with sequence map column + if (seq_map_col_unique_id != -1) { + DCHECK(schema_has_sequence_col); + if (skip_bitmap.contains(seq_map_col_unique_id)) { + skip_bitmap.add(seq_col_unique_id); + } + } + + std::string key = _full_encode_keys(key_columns, delta_pos); + _maybe_invalid_row_cache(key); + bool row_has_sequence_col = + (schema_has_sequence_col && !skip_bitmap.contains(seq_col_unique_id)); + if (row_has_sequence_col) { + _encode_seq_column(seq_column, delta_pos, &key); + } + + // mark key with delete sign as deleted. + bool have_delete_sign = (!skip_bitmap.contains(delete_sign_col_unique_id) && + delete_sign_column_data[block_pos] != 0); + + auto not_found_cb = [&]() { + return _opts.rowset_ctx->partial_update_info->handle_non_strict_mode_not_found_error( + *_tablet_schema, &skip_bitmap); + }; + auto update_read_plan = [&](const RowLocation& loc) { + read_plan.prepare_to_read(loc, segment_pos, skip_bitmap); + }; + + RETURN_IF_ERROR(_probe_key_for_mow(std::move(key), segment_pos, row_has_sequence_col, + have_delete_sign, specified_rowsets, segment_caches, + has_default_or_nullable, use_default_or_null_flag, + update_read_plan, not_found_cb, stats)); + } + return Status::OK(); +} + +Status VerticalSegmentWriter::_merge_rows_for_sequence_column( + RowsInBlock& data, std::vector* skip_bitmaps, + const std::vector& key_columns, + vectorized::IOlapColumnDataAccessor* seq_column, + const std::vector& specified_rowsets, + std::vector>& segment_caches) { + VLOG_DEBUG << fmt::format( + "VerticalSegmentWriter::_merge_rows_for_sequence_column enter: data.block:{}\n", + data.block->dump_data()); + auto seq_col_unique_id = _tablet_schema->column(_tablet_schema->sequence_col_idx()).unique_id(); + std::string previous_key {}; + bool previous_has_seq_col {false}; + int duplicate_keys {0}; + + auto filter_column = vectorized::ColumnUInt8::create(data.num_rows, 1); + auto* __restrict filter_map = filter_column->get_data().data(); + + std::string encoded_default_seq_value {}; + RETURN_IF_ERROR(_generate_encoded_default_seq_value( + *_tablet_schema, *_opts.rowset_ctx->partial_update_info, &encoded_default_seq_value)); + + for (size_t block_pos = data.row_pos; block_pos < data.row_pos + data.num_rows; block_pos++) { + size_t delta_pos = block_pos - data.row_pos; + auto& skip_bitmap = skip_bitmaps->at(block_pos); + std::string key = _full_encode_keys(key_columns, delta_pos); + bool row_has_sequence_col = (!skip_bitmap.contains(seq_col_unique_id)); + Status st; + if (delta_pos > 0 && previous_key == key) { + DCHECK(previous_has_seq_col == !row_has_sequence_col); + ++duplicate_keys; + RowLocation loc; + RowsetSharedPtr rowset; + size_t rid_missing_seq {}; + size_t rid_with_seq {}; + if (row_has_sequence_col) { + rid_missing_seq = block_pos - 1; + rid_with_seq = block_pos; + } else { + rid_missing_seq = block_pos; + rid_with_seq = 
block_pos - 1; + } + std::string previous_encoded_seq_value {}; + st = _tablet->lookup_row_key(key, _tablet_schema.get(), false, specified_rowsets, &loc, + _mow_context->max_version, segment_caches, &rowset, true, + &previous_encoded_seq_value); + DCHECK(st.is() || st.ok()); + + Slice previous_seq_slice {}; + if (st.is()) { + previous_seq_slice = Slice {encoded_default_seq_value}; + } else { + // TODO(bobhan1): we can mark these rows in delete bitmap and eliminate reading them in later phase + _rsid_to_rowset.emplace(rowset->rowset_id(), rowset); + previous_seq_slice = Slice {previous_encoded_seq_value}; + } + std::string cur_encoded_seq_value {}; + _encode_seq_column(seq_column, rid_with_seq, &cur_encoded_seq_value); + // the encoded value is order-preserving, so we can use Slice::compare() to compare them + int res = previous_seq_slice.compare(Slice {cur_encoded_seq_value}); + VLOG_DEBUG << fmt::format( + "VerticalSegmentWriter::_merge_rows_for_sequence_column: rid_with_seq={}, " + "rid_missing_seq={}, res={}", + rid_with_seq, rid_missing_seq, res); + if (res > 0) { + filter_map[rid_with_seq] = 0; + } else if (res < 0) { + filter_map[rid_missing_seq] = 0; + } else { + filter_map[std::min(rid_with_seq, rid_missing_seq)] = 0; + } + } + previous_key = std::move(key); + previous_has_seq_col = row_has_sequence_col; + } + if (duplicate_keys > 0) { + auto num_cols = data.block->columns(); + auto* block = const_cast(data.block); + block->insert({std::move(filter_column), std::make_shared(), + "__dup_key_filter_col__"}); + RETURN_IF_ERROR(vectorized::Block::filter_block(block, num_cols, num_cols)); + int merged_rows = data.num_rows - block->rows(); + VLOG_DEBUG << fmt::format( + "VerticalSegmentWriter::_merge_rows_for_sequence_column after filter: " + "data.block:{}\n", + data.block->dump_data()); + if (duplicate_keys != merged_rows) { + auto msg = fmt::format( + "duplicate_keys != merged_rows, duplicate_keys={}, merged_rows={}, " + "num_rows={}, mutable_block->rows()={}", + duplicate_keys, merged_rows, data.num_rows, block->rows()); + DCHECK(false) << msg; + return Status::InternalError(msg); + } + data.num_rows = block->rows(); + } + return Status::OK(); +} + Status VerticalSegmentWriter::batch_block(const vectorized::Block* block, size_t row_pos, size_t num_rows) { if (_opts.rowset_ctx->partial_update_info && - _opts.rowset_ctx->partial_update_info->is_partial_update && + _opts.rowset_ctx->partial_update_info->is_partial_update() && _opts.write_type == DataWriteType::TYPE_DIRECT && !_opts.rowset_ctx->is_transient_rowset_writer) { - if (block->columns() < _tablet_schema->num_key_columns() || - block->columns() >= _tablet_schema->num_columns()) { - return Status::InvalidArgument(fmt::format( - "illegal partial update block columns: {}, num key columns: {}, total " - "schema columns: {}", - block->columns(), _tablet_schema->num_key_columns(), - _tablet_schema->num_columns())); + if (_opts.rowset_ctx->partial_update_info->is_flexible_partial_update()) { + if (block->columns() != _tablet_schema->num_columns()) { + return Status::InvalidArgument( + "illegal flexible partial update block columns, block columns = {}, " + "tablet_schema columns = {}", + block->dump_structure(), _tablet_schema->dump_structure()); + } + } else { + if (block->columns() < _tablet_schema->num_key_columns() || + block->columns() >= _tablet_schema->num_columns()) { + return Status::InvalidArgument(fmt::format( + "illegal partial update block columns: {}, num key columns: {}, total " + "schema columns: {}", + block->columns(), 
_tablet_schema->num_key_columns(), + _tablet_schema->num_columns())); + } } } else if (block->columns() != _tablet_schema->num_columns()) { return Status::InvalidArgument( @@ -665,8 +1044,10 @@ Status VerticalSegmentWriter::_append_block_with_variant_subcolumns(RowsInBlock& _opts.rowset_ctx->merged_tablet_schema = _opts.rowset_ctx->tablet_schema; } TabletSchemaSPtr update_schema; + bool check_schema_size = true; RETURN_IF_ERROR(vectorized::schema_util::get_least_common_schema( - {_opts.rowset_ctx->merged_tablet_schema, _flush_schema}, nullptr, update_schema)); + {_opts.rowset_ctx->merged_tablet_schema, _flush_schema}, nullptr, update_schema, + check_schema_size)); CHECK_GE(update_schema->num_columns(), _flush_schema->num_columns()) << "Rowset merge schema columns count is " << update_schema->num_columns() << ", but flush_schema is larger " << _flush_schema->num_columns() @@ -681,16 +1062,22 @@ Status VerticalSegmentWriter::_append_block_with_variant_subcolumns(RowsInBlock& Status VerticalSegmentWriter::write_batch() { if (_opts.rowset_ctx->partial_update_info && - _opts.rowset_ctx->partial_update_info->is_partial_update && + _opts.rowset_ctx->partial_update_info->is_partial_update() && _opts.write_type == DataWriteType::TYPE_DIRECT && !_opts.rowset_ctx->is_transient_rowset_writer) { + bool is_flexible_partial_update = + _opts.rowset_ctx->partial_update_info->is_flexible_partial_update(); for (uint32_t cid = 0; cid < _tablet_schema->num_columns(); ++cid) { RETURN_IF_ERROR( _create_column_writer(cid, _tablet_schema->column(cid), _tablet_schema)); } vectorized::Block full_block; for (auto& data : _batched_blocks) { - RETURN_IF_ERROR(_append_block_with_partial_content(data, full_block)); + if (is_flexible_partial_update) { + RETURN_IF_ERROR(_append_block_with_flexible_partial_content(data, full_block)); + } else { + RETURN_IF_ERROR(_append_block_with_partial_content(data, full_block)); + } } for (auto& data : _batched_blocks) { RowsInBlock full_rows_block {&full_block, data.row_pos, data.num_rows}; @@ -993,9 +1380,6 @@ Status VerticalSegmentWriter::finalize_columns_index(uint64_t* index_size) { *index_size = _file_writer->bytes_appended() - index_start; } - if (_inverted_index_file_writer != nullptr) { - _inverted_index_file_info = _inverted_index_file_writer->get_index_file_info(); - } // reset all column writers and data_conveter clear(); @@ -1070,9 +1454,6 @@ Status VerticalSegmentWriter::_write_inverted_index() { for (auto& column_writer : _column_writers) { RETURN_IF_ERROR(column_writer->write_inverted_index()); } - if (_inverted_index_file_writer != nullptr) { - RETURN_IF_ERROR(_inverted_index_file_writer->close()); - } return Status::OK(); } @@ -1159,13 +1540,6 @@ void VerticalSegmentWriter::_set_max_key(const Slice& key) { _max_key.append(key.get_data(), key.get_size()); } -int64_t VerticalSegmentWriter::get_inverted_index_total_size() { - if (_inverted_index_file_writer != nullptr) { - return _inverted_index_file_writer->get_index_file_total_size(); - } - return 0; -} - inline bool VerticalSegmentWriter::_is_mow() { return _tablet_schema->keys_type() == UNIQUE_KEYS && _opts.enable_unique_key_merge_on_write; } diff --git a/be/src/olap/rowset/segment_v2/vertical_segment_writer.h b/be/src/olap/rowset/segment_v2/vertical_segment_writer.h index 56102c5d58d32b..951e9c2e2838c3 100644 --- a/be/src/olap/rowset/segment_v2/vertical_segment_writer.h +++ b/be/src/olap/rowset/segment_v2/vertical_segment_writer.h @@ -34,6 +34,7 @@ #include "gutil/strings/substitute.h" #include "olap/olap_define.h" 
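// Aside on the batch_block() hunk above: it now validates two different block
// shapes. A hedged sketch with a hypothetical helper (not the actual Doris
// code): a fixed partial update ships the key columns plus a strict subset of
// the value columns, while a flexible partial update always ships the full
// schema and masks missing cells per row via the skip bitmap column.

#include <cstddef>

bool block_shape_ok(std::size_t block_cols, std::size_t key_cols,
                    std::size_t schema_cols, bool flexible) {
    if (flexible) {
        return block_cols == schema_cols;  // full schema, per-cell presence in skip bitmap
    }
    return block_cols >= key_cols && block_cols < schema_cols;  // keys + updated subset only
}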
#include "olap/rowset/segment_v2/column_writer.h" +#include "olap/rowset/segment_v2/inverted_index_file_writer.h" #include "olap/tablet.h" #include "olap/tablet_schema.h" #include "util/faststring.h" @@ -82,7 +83,7 @@ class VerticalSegmentWriter { explicit VerticalSegmentWriter(io::FileWriter* file_writer, uint32_t segment_id, TabletSchemaSPtr tablet_schema, BaseTabletSPtr tablet, DataDir* data_dir, const VerticalSegmentWriterOptions& opts, - io::FileWriterPtr inverted_file_writer = nullptr); + InvertedIndexFileWriter* inverted_file_writer); ~VerticalSegmentWriter(); VerticalSegmentWriter(const VerticalSegmentWriter&) = delete; @@ -99,9 +100,7 @@ class VerticalSegmentWriter { [[nodiscard]] std::string data_dir_path() const { return _data_dir == nullptr ? "" : _data_dir->path(); } - [[nodiscard]] InvertedIndexFileInfo get_inverted_index_file_info() const { - return _inverted_index_file_info; - } + [[nodiscard]] uint32_t num_rows_written() const { return _num_rows_written; } // for partial update @@ -122,10 +121,19 @@ class VerticalSegmentWriter { TabletSchemaSPtr flush_schema() const { return _flush_schema; }; - int64_t get_inverted_index_total_size(); - void clear(); + Status close_inverted_index(int64_t* inverted_index_file_size) { + // no inverted index + if (_inverted_index_file_writer == nullptr) { + *inverted_index_file_size = 0; + return Status::OK(); + } + RETURN_IF_ERROR(_inverted_index_file_writer->close()); + *inverted_index_file_size = _inverted_index_file_writer->get_index_file_total_size(); + return Status::OK(); + } + private: void _init_column_meta(ColumnMetaPB* meta, uint32_t column_id, const TabletColumn& column); Status _create_column_writer(uint32_t cid, const TabletColumn& column, @@ -159,13 +167,37 @@ class VerticalSegmentWriter { void _set_max_key(const Slice& key); void _serialize_block_to_row_column(vectorized::Block& block); Status _probe_key_for_mow(std::string key, std::size_t segment_pos, bool have_input_seq_column, - bool have_delete_sign, PartialUpdateReadPlan& read_plan, + bool have_delete_sign, const std::vector& specified_rowsets, std::vector>& segment_caches, bool& has_default_or_nullable, std::vector& use_default_or_null_flag, + const std::function& found_cb, + const std::function& not_found_cb, PartialUpdateStats& stats); Status _append_block_with_partial_content(RowsInBlock& data, vectorized::Block& full_block); + Status _append_block_with_flexible_partial_content(RowsInBlock& data, + vectorized::Block& full_block); + Status _generate_encoded_default_seq_value(const TabletSchema& tablet_schema, + const PartialUpdateInfo& info, + std::string* encoded_value); + Status _generate_flexible_read_plan( + FlexibleReadPlan& read_plan, RowsInBlock& data, size_t segment_start_pos, + bool schema_has_sequence_col, int32_t seq_map_col_unique_id, + std::vector* skip_bitmaps, + const std::vector& key_columns, + vectorized::IOlapColumnDataAccessor* seq_column, + const signed char* delete_sign_column_data, + const std::vector& specified_rowsets, + std::vector>& segment_caches, + bool& has_default_or_nullable, std::vector& use_default_or_null_flag, + PartialUpdateStats& stats); + Status _merge_rows_for_sequence_column( + RowsInBlock& data, std::vector* skip_bitmaps, + const std::vector& key_columns, + vectorized::IOlapColumnDataAccessor* seq_column, + const std::vector& specified_rowsets, + std::vector>& segment_caches); Status _append_block_with_variant_subcolumns(RowsInBlock& data); Status _generate_key_index( RowsInBlock& data, std::vector& key_columns, @@ -189,14 
+221,15 @@ class VerticalSegmentWriter { // Not owned. owned by RowsetWriter io::FileWriter* _file_writer = nullptr; - std::unique_ptr _inverted_index_file_writer; + // Not owned. owned by RowsetWriter or SegmentFlusher + InvertedIndexFileWriter* _inverted_index_file_writer = nullptr; SegmentFooterPB _footer; // for mow tables with cluster key, the sort key is the cluster keys not unique keys // for other tables, the sort key is the keys size_t _num_sort_key_columns; size_t _num_short_key_columns; - InvertedIndexFileInfo _inverted_index_file_info; + std::unique_ptr _short_key_index_builder; std::unique_ptr _primary_key_index_builder; std::vector> _column_writers; diff --git a/be/src/olap/rowset/segment_v2/zone_map_index.cpp b/be/src/olap/rowset/segment_v2/zone_map_index.cpp index 991df2f94755c9..c2139ff0899090 100644 --- a/be/src/olap/rowset/segment_v2/zone_map_index.cpp +++ b/be/src/olap/rowset/segment_v2/zone_map_index.cpp @@ -39,8 +39,6 @@ namespace doris { struct uint24_t; -static bvar::Adder g_zone_map_memory_bytes("doris_zone_map_memory_bytes"); - namespace segment_v2 { template @@ -157,9 +155,6 @@ Status ZoneMapIndexReader::_load(bool use_page_cache, bool kept_in_memory, _page_zone_maps.resize(reader.num_values()); - g_zone_map_memory_bytes << sizeof(*this) + sizeof(ZoneMapPB) * _page_zone_maps.size() + - sizeof(IndexedColumnMetaPB); - // read and cache all page zone maps for (int i = 0; i < reader.num_values(); ++i) { size_t num_to_read = 1; @@ -177,18 +172,18 @@ Status ZoneMapIndexReader::_load(bool use_page_cache, bool kept_in_memory, column->get_data_at(0).size)) { return Status::Corruption("Failed to parse zone map"); } + _pb_meta_size += _page_zone_maps[i].ByteSizeLong(); } + update_metadata_size(); return Status::OK(); } -ZoneMapIndexReader::~ZoneMapIndexReader() { - // Maybe wrong due to load failures. - if (_page_zone_maps.size() > 0) { - g_zone_map_memory_bytes << -sizeof(*this) - sizeof(ZoneMapPB) * _page_zone_maps.size() - - sizeof(IndexedColumnMetaPB); - } +int64_t ZoneMapIndexReader::get_metadata_size() const { + return sizeof(ZoneMapIndexReader) + _pb_meta_size; } + +ZoneMapIndexReader::~ZoneMapIndexReader() = default; #define APPLY_FOR_PRIMITITYPE(M) \ M(TYPE_TINYINT) \ M(TYPE_SMALLINT) \ diff --git a/be/src/olap/rowset/segment_v2/zone_map_index.h b/be/src/olap/rowset/segment_v2/zone_map_index.h index 923bd2c2046da7..34869bbbfeea62 100644 --- a/be/src/olap/rowset/segment_v2/zone_map_index.h +++ b/be/src/olap/rowset/segment_v2/zone_map_index.h @@ -143,7 +143,7 @@ class TypedZoneMapIndexWriter final : public ZoneMapIndexWriter { uint64_t _estimated_size = 0; }; -class ZoneMapIndexReader { +class ZoneMapIndexReader : public MetadataAdder { public: explicit ZoneMapIndexReader(io::FileReaderSPtr file_reader, const IndexedColumnMetaPB& page_zone_maps) @@ -163,12 +163,15 @@ class ZoneMapIndexReader { private: Status _load(bool use_page_cache, bool kept_in_memory, std::unique_ptr); + int64_t get_metadata_size() const override; + private: DorisCallOnce _load_once; // TODO: yyq, we shoud remove file_reader from here. 
io::FileReaderSPtr _file_reader; std::unique_ptr _page_zone_maps_meta; std::vector _page_zone_maps; + int64_t _pb_meta_size {0}; }; } // namespace segment_v2 diff --git a/be/src/olap/rowset/vertical_beta_rowset_writer.cpp b/be/src/olap/rowset/vertical_beta_rowset_writer.cpp index ced0fb880c41fb..46070f8dccd7ce 100644 --- a/be/src/olap/rowset/vertical_beta_rowset_writer.cpp +++ b/be/src/olap/rowset/vertical_beta_rowset_writer.cpp @@ -138,7 +138,6 @@ Status VerticalBetaRowsetWriter::_flush_columns(segment_v2::SegmentWriter* se this->_segment_num_rows.resize(_cur_writer_idx + 1); this->_segment_num_rows[_cur_writer_idx] = _segment_writers[_cur_writer_idx]->row_count(); } - this->_total_index_size += static_cast(index_size); return Status::OK(); } @@ -164,26 +163,28 @@ Status VerticalBetaRowsetWriter::_create_segment_writer( int seg_id = this->_num_segment.fetch_add(1, std::memory_order_relaxed); - io::FileWriterPtr file_writer; - io::FileWriterOptions opts = this->_context.get_file_writer_options(); + io::FileWriterPtr segment_file_writer; + RETURN_IF_ERROR(BaseBetaRowsetWriter::create_file_writer(seg_id, segment_file_writer)); + DCHECK(segment_file_writer != nullptr); - auto path = context.segment_path(seg_id); - auto& fs = context.fs_ref(); - Status st = fs.create_file(path, &file_writer, &opts); - if (!st.ok()) { - LOG(WARNING) << "failed to create writable file. path=" << path << ", err: " << st; - return st; + InvertedIndexFileWriterPtr inverted_index_file_writer; + if (context.tablet_schema->has_inverted_index()) { + RETURN_IF_ERROR(RowsetWriter::create_inverted_index_file_writer( + seg_id, &inverted_index_file_writer)); } - DCHECK(file_writer != nullptr); segment_v2::SegmentWriterOptions writer_options; writer_options.enable_unique_key_merge_on_write = context.enable_unique_key_merge_on_write; writer_options.rowset_ctx = &context; writer_options.max_rows_per_segment = context.max_rows_per_segment; - *writer = std::make_unique(file_writer.get(), seg_id, - context.tablet_schema, context.tablet, - context.data_dir, writer_options); - RETURN_IF_ERROR(this->_seg_files.add(seg_id, std::move(file_writer))); + *writer = std::make_unique( + segment_file_writer.get(), seg_id, context.tablet_schema, context.tablet, + context.data_dir, writer_options, inverted_index_file_writer.get()); + + RETURN_IF_ERROR(this->_seg_files.add(seg_id, std::move(segment_file_writer))); + if (context.tablet_schema->has_inverted_index()) { + RETURN_IF_ERROR(this->_idx_files.add(seg_id, std::move(inverted_index_file_writer))); + } auto s = (*writer)->init(column_ids, is_key); if (!s.ok()) { @@ -205,10 +206,7 @@ Status VerticalBetaRowsetWriter::final_flush() { LOG(WARNING) << "Fail to finalize segment footer, " << st; return st; } - this->_total_data_size += segment_size + segment_writer->get_inverted_index_total_size(); - this->_total_index_size += segment_writer->get_inverted_index_total_size(); - this->_idx_files_info.add_file_info(segment_writer->get_segment_id(), - segment_writer->get_inverted_index_file_info()); + this->_total_data_size += segment_size; segment_writer.reset(); } return Status::OK(); @@ -217,6 +215,7 @@ Status VerticalBetaRowsetWriter::final_flush() { template requires std::is_base_of_v Status VerticalBetaRowsetWriter::_close_file_writers() { + RETURN_IF_ERROR(BaseBetaRowsetWriter::_close_inverted_index_file_writers()); return this->_seg_files.close(); } diff --git a/be/src/olap/rowset_builder.cpp b/be/src/olap/rowset_builder.cpp index 1bf00ec889550f..9bb0df318ee11c 100644 --- 
a/be/src/olap/rowset_builder.cpp +++ b/be/src/olap/rowset_builder.cpp @@ -258,6 +258,17 @@ Status BaseRowsetBuilder::submit_calc_delete_bitmap_task() { } std::lock_guard l(_lock); SCOPED_TIMER(_submit_delete_bitmap_timer); + if (_partial_update_info && _partial_update_info->is_flexible_partial_update()) { + if (_rowset->num_segments() > 1) { + // in flexible partial update, when there is more than one segment in one load, + // we need to align rows with the same keys across segments; we haven't + // implemented that yet, so we just report an error when encountering this situation + return Status::NotSupported( + "too large input data in flexible partial update, please " + "reduce the amount of data imported in a single load."); + } + } + // tablet is under alter process. The delete bitmap will be calculated after conversion. if (_tablet->tablet_state() == TABLET_NOTREADY) { LOG(INFO) << "tablet is under alter process, delete bitmap will be calculated later, " @@ -277,15 +288,15 @@ Status BaseRowsetBuilder::submit_calc_delete_bitmap_task() { // For partial update, we need to fill in the entire row of data, during the calculation // of the delete bitmap. This operation is resource-intensive, and we need to minimize // the number of times it occurs. Therefore, we skip this operation here. - if (_partial_update_info->is_partial_update) { + if (_partial_update_info->is_partial_update()) { // for partial update, the delete bitmap calculation is done while append_block() // we print it's summarize logs here before commit. LOG(INFO) << fmt::format( - "partial update calc delete bitmap summary before commit: tablet({}), txn_id({}), " + "{} calc delete bitmap summary before commit: tablet({}), txn_id({}), " "rowset_ids({}), cur max_version({}), bitmap num({}), num rows updated({}), num " "rows new added({}), num rows deleted({}), total rows({})", - tablet()->tablet_id(), _req.txn_id, _rowset_ids.size(), - rowset_writer()->context().mow_context->max_version, + _partial_update_info->partial_update_mode_str(), tablet()->tablet_id(), _req.txn_id, + _rowset_ids.size(), rowset_writer()->context().mow_context->max_version, _delete_bitmap->delete_bitmap.size(), rowset_writer()->num_rows_updated(), rowset_writer()->num_rows_new_added(), rowset_writer()->num_rows_deleted(), rowset_writer()->num_rows()); @@ -300,7 +311,7 @@ } Status BaseRowsetBuilder::wait_calc_delete_bitmap() { - if (!_tablet->enable_unique_key_merge_on_write() || _partial_update_info->is_partial_update) { + if (!_tablet->enable_unique_key_merge_on_write() || _partial_update_info->is_partial_update()) { return Status::OK(); } std::lock_guard l(_lock); @@ -421,11 +432,12 @@ void BaseRowsetBuilder::_build_current_tablet_schema(int64_t index_id, // set partial update columns info _partial_update_info = std::make_shared(); _partial_update_info->init( - *_tablet_schema, table_schema_param->is_partial_update(), + *_tablet_schema, table_schema_param->unique_key_update_mode(), table_schema_param->partial_update_input_columns(), table_schema_param->is_strict_mode(), table_schema_param->timestamp_ms(), table_schema_param->nano_seconds(), table_schema_param->timezone(), - table_schema_param->auto_increment_coulumn(), _max_version_in_flush_phase); + table_schema_param->auto_increment_coulumn(), + table_schema_param->sequence_map_col_uid(), _max_version_in_flush_phase); } } // namespace doris
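Throughout this patch the old boolean field partial_update_info->is_partial_update becomes a method, because partial update now distinguishes the fixed-column mode from the new flexible mode. A rough sketch of the shape this implies, with hypothetical enum and member names (the real mode value comes from table_schema_param->unique_key_update_mode() and is defined elsewhere in the codebase):

enum class UniqueKeyUpdateMode { UPSERT, UPDATE_FIXED_COLUMNS, UPDATE_FLEXIBLE_COLUMNS };

struct PartialUpdateInfoSketch {
    UniqueKeyUpdateMode mode = UniqueKeyUpdateMode::UPSERT;
    // any non-upsert mode is a partial update
    bool is_partial_update() const { return mode != UniqueKeyUpdateMode::UPSERT; }
    bool is_fixed_partial_update() const {
        return mode == UniqueKeyUpdateMode::UPDATE_FIXED_COLUMNS;
    }
    bool is_flexible_partial_update() const {
        return mode == UniqueKeyUpdateMode::UPDATE_FLEXIBLE_COLUMNS;
    }
};

This is why call sites such as submit_calc_delete_bitmap_task() above can branch on is_flexible_partial_update() while legacy checks simply gained parentheses.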
diff --git a/be/src/olap/schema_change.cpp b/be/src/olap/schema_change.cpp index d40a44fd710936..5ef85dbaf11c19 100644 --- a/be/src/olap/schema_change.cpp +++ b/be/src/olap/schema_change.cpp @@ -81,6 +81,7 @@ #include "vec/columns/column.h" #include "vec/columns/column_nullable.h" #include "vec/common/assert_cast.h" +#include "vec/common/schema_util.h" #include "vec/core/block.h" #include "vec/core/column_with_type_and_name.h" #include "vec/exprs/vexpr.h" @@ -899,7 +900,7 @@ Status SchemaChangeJob::_do_process_alter_tablet(const TAlterTabletReqV2& reques } } std::vector empty_vec; - _new_tablet->delete_rowsets(rowsets_to_delete, false); + RETURN_IF_ERROR(_new_tablet->delete_rowsets(rowsets_to_delete, false)); // inherit cumulative_layer_point from base_tablet // check if new_tablet.ce_point > base_tablet.ce_point? _new_tablet->set_cumulative_layer_point(-1); @@ -1367,13 +1368,9 @@ Status SchemaChangeJob::parse_request(const SchemaChangeParams& sc_params, *sc_directly = true; return Status::OK(); } else if (column_mapping->ref_column_idx >= 0) { - const auto& column_new = new_tablet_schema->column(i); - const auto& column_old = base_tablet_schema->column(column_mapping->ref_column_idx); // index changed - if (column_new.is_bf_column() != column_old.is_bf_column() || - column_new.has_bitmap_index() != column_old.has_bitmap_index() || - new_tablet_schema->has_inverted_index(column_new) != - base_tablet_schema->has_inverted_index(column_old)) { + if (vectorized::schema_util::has_schema_index_diff( + new_tablet_schema, base_tablet_schema, i, column_mapping->ref_column_idx)) { *sc_directly = true; return Status::OK(); } diff --git a/be/src/olap/segment_loader.cpp b/be/src/olap/segment_loader.cpp index fd7e3f476ad082..26ac54c699b81a 100644 --- a/be/src/olap/segment_loader.cpp +++ b/be/src/olap/segment_loader.cpp @@ -17,6 +17,8 @@ #include "olap/segment_loader.h" +#include + #include "common/config.h" #include "common/status.h" #include "olap/olap_define.h" @@ -52,19 +54,26 @@ void SegmentCache::erase(const SegmentCache::CacheKey& key) { Status SegmentLoader::load_segments(const BetaRowsetSharedPtr& rowset, SegmentCacheHandle* cache_handle, bool use_cache, - bool need_load_pk_index_and_bf) { + bool need_load_pk_index_and_bf, + OlapReaderStatistics* index_load_stats) { if (cache_handle->is_inited()) { return Status::OK(); } for (int64_t i = 0; i < rowset->num_segments(); i++) { SegmentCache::CacheKey cache_key(rowset->rowset_id(), i); if (_segment_cache->lookup(cache_key, cache_handle)) { - continue; + // Have to check the segment status here, because the segment in the cache may have gone wrong + // during index load or column reader creation. + // Don't merge this if-branch with the previous one, to keep the logic clear. + if (cache_handle->pop_unhealthy_segment() == nullptr) { + continue; + } } + // If the segment is not healthy, create a new segment and replace the unhealthy one in the SegmentCache. segment_v2::SegmentSharedPtr segment; RETURN_IF_ERROR(rowset->load_segment(i, &segment)); if (need_load_pk_index_and_bf) { - RETURN_IF_ERROR(segment->load_pk_index_and_bf()); + RETURN_IF_ERROR(segment->load_pk_index_and_bf(index_load_stats)); } if (use_cache && !config::disable_segment_cache) { // memory of SegmentCache::CacheValue will be handled by SegmentCache diff --git a/be/src/olap/segment_loader.h b/be/src/olap/segment_loader.h index d177024242db33..834906da93bf74 100644 --- a/be/src/olap/segment_loader.h +++ b/be/src/olap/segment_loader.h @@ -117,7 +117,8 @@ class SegmentLoader { // Load segments of "rowset", return the "cache_handle" which contains segments.
// If use_cache is true, it will be loaded from _cache. Status load_segments(const BetaRowsetSharedPtr& rowset, SegmentCacheHandle* cache_handle, - bool use_cache = false, bool need_load_pk_index_and_bf = false); + bool use_cache = false, bool need_load_pk_index_and_bf = false, + OlapReaderStatistics* index_load_stats = nullptr); void erase_segment(const SegmentCache::CacheKey& key); @@ -161,6 +162,18 @@ class SegmentCacheHandle { _init = true; } + segment_v2::SegmentSharedPtr pop_unhealthy_segment() { + if (segments.empty()) { + return nullptr; + } + segment_v2::SegmentSharedPtr last_segment = segments.back(); + if (last_segment->healthy_status().ok()) { + return nullptr; + } + segments.pop_back(); + return last_segment; + } + private: std::vector segments; bool _init {false}; diff --git a/be/src/olap/single_replica_compaction.cpp b/be/src/olap/single_replica_compaction.cpp index ef93ab25caeac9..7470afe0ef62c7 100644 --- a/be/src/olap/single_replica_compaction.cpp +++ b/be/src/olap/single_replica_compaction.cpp @@ -149,11 +149,15 @@ Status SingleReplicaCompaction::_do_single_replica_compaction_impl() { LOG(INFO) << "succeed to do single replica compaction" << ". tablet=" << _tablet->tablet_id() << ", output_version=" << _output_version << ", current_max_version=" << current_max_version - << ", input_rowset_size=" << _input_rowsets_size + << ", input_rowsets_data_size=" << _input_rowsets_data_size + << ", input_rowsets_index_size=" << _input_rowsets_index_size + << ", input_rowsets_total_size=" << _input_rowsets_total_size << ", input_row_num=" << _input_row_num << ", input_segments_num=" << _input_num_segments - << ", _input_index_size=" << _input_index_size + << ", _input_index_size=" << _input_rowsets_index_size << ", output_rowset_data_size=" << _output_rowset->data_disk_size() + << ", output_rowset_index_size=" << _output_rowset->index_disk_size() + << ", output_rowset_total_size=" << _output_rowset->total_disk_size() << ", output_row_num=" << _output_rowset->num_rows() << ", output_segments_num=" << _output_rowset->num_segments(); return Status::OK(); @@ -264,10 +268,11 @@ bool SingleReplicaCompaction::_find_rowset_to_fetch(const std::vector& return false; } for (auto& rowset : _input_rowsets) { - _input_rowsets_size += rowset->data_disk_size(); + _input_rowsets_data_size += rowset->data_disk_size(); _input_row_num += rowset->num_rows(); _input_num_segments += rowset->num_segments(); - _input_index_size += rowset->index_disk_size(); + _input_rowsets_index_size += rowset->index_disk_size(); + _input_rowsets_total_size += rowset->data_disk_size() + rowset->index_disk_size(); } _output_version = *proper_version; } diff --git a/be/src/olap/snapshot_manager.cpp b/be/src/olap/snapshot_manager.cpp index 2cfa9a8e8b763d..67205835b53947 100644 --- a/be/src/olap/snapshot_manager.cpp +++ b/be/src/olap/snapshot_manager.cpp @@ -698,11 +698,8 @@ Status SnapshotManager::_create_snapshot_files(const TabletSharedPtr& ref_tablet if (tablet_schema.get_inverted_index_storage_format() == InvertedIndexStorageFormatPB::V1) { - for (const auto& index : tablet_schema.indexes()) { - if (index.index_type() != IndexType::INVERTED) { - continue; - } - auto index_id = index.index_id(); + for (const auto& index : tablet_schema.inverted_indexes()) { + auto index_id = index->index_id(); auto index_file = ref_tablet->get_segment_index_filepath( rowset_id, segment_index, index_id); auto snapshot_segment_index_file_path = diff --git a/be/src/olap/storage_engine.cpp b/be/src/olap/storage_engine.cpp index 
ebf40c90bea35b..e00b5b595e20dc 100644 --- a/be/src/olap/storage_engine.cpp +++ b/be/src/olap/storage_engine.cpp @@ -209,7 +209,7 @@ StorageEngine::StorageEngine(const EngineOptions& options) _txn_manager(new TxnManager(*this, config::txn_map_shard_size, config::txn_shard_size)), _default_rowset_type(BETA_ROWSET), _create_tablet_idx_lru_cache( - new CreateTabletIdxCache(config::partition_disk_index_lru_size)), + new CreateTabletRRIdxCache(config::partition_disk_index_lru_size)), _snapshot_mgr(std::make_unique(*this)) { REGISTER_HOOK_METRIC(unused_rowsets_count, [this]() { // std::lock_guard lock(_gc_mutex); @@ -515,7 +515,7 @@ Status StorageEngine::set_cluster_id(int32_t cluster_id) { int StorageEngine::_get_and_set_next_disk_index(int64 partition_id, TStorageMedium::type storage_medium) { - auto key = CreateTabletIdxCache::get_key(partition_id, storage_medium); + auto key = CreateTabletRRIdxCache::get_key(partition_id, storage_medium); int curr_index = _create_tablet_idx_lru_cache->get_index(key); // -1, lru can't find key if (curr_index == -1) { @@ -1511,7 +1511,7 @@ Status StorageEngine::_persist_broken_paths() { return Status::OK(); } -int CreateTabletIdxCache::get_index(const std::string& key) { +int CreateTabletRRIdxCache::get_index(const std::string& key) { auto* lru_handle = lookup(key); if (lru_handle) { Defer release([cache = this, lru_handle] { cache->release(lru_handle); }); @@ -1522,7 +1522,7 @@ int CreateTabletIdxCache::get_index(const std::string& key) { return -1; } -void CreateTabletIdxCache::set_index(const std::string& key, int next_idx) { +void CreateTabletRRIdxCache::set_index(const std::string& key, int next_idx) { assert(next_idx >= 0); auto* value = new CacheValue; value->idx = next_idx; diff --git a/be/src/olap/storage_engine.h b/be/src/olap/storage_engine.h index b2a313adcdbb7e..421c0eb352d712 100644 --- a/be/src/olap/storage_engine.h +++ b/be/src/olap/storage_engine.h @@ -69,7 +69,7 @@ class Thread; class ThreadPool; class TxnManager; class ReportWorker; -class CreateTabletIdxCache; +class CreateTabletRRIdxCache; struct DirInfo; class SnapshotManager; @@ -532,7 +532,7 @@ class StorageEngine final : public BaseStorageEngine { // next index for create tablet std::map _last_use_index; - std::unique_ptr _create_tablet_idx_lru_cache; + std::unique_ptr _create_tablet_idx_lru_cache; std::unique_ptr _snapshot_mgr; }; @@ -540,7 +540,7 @@ class StorageEngine final : public BaseStorageEngine { // lru cache for create tabelt round robin in disks // key: partitionId_medium // value: index -class CreateTabletIdxCache : public LRUCachePolicy { +class CreateTabletRRIdxCache : public LRUCachePolicy { public: // get key, delimiter with DELIMITER '-' static std::string get_key(int64_t partition_id, TStorageMedium::type medium) { @@ -557,10 +557,10 @@ class CreateTabletIdxCache : public LRUCachePolicy { int idx = 0; }; - CreateTabletIdxCache(size_t capacity) + CreateTabletRRIdxCache(size_t capacity) : LRUCachePolicy(CachePolicy::CacheType::CREATE_TABLET_RR_IDX_CACHE, capacity, LRUCacheType::NUMBER, - /*stale_sweep_time_s*/ 30 * 60) {} + /*stale_sweep_time_s*/ 30 * 60, 1) {} }; struct DirInfo { diff --git a/be/src/olap/tablet.cpp b/be/src/olap/tablet.cpp index 51eabe5495ef89..b6b81811091d94 100644 --- a/be/src/olap/tablet.cpp +++ b/be/src/olap/tablet.cpp @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include @@ -35,6 +36,7 @@ #include #include #include +#include #include #include "common/compiler_util.h" // IWYU pragma: keep @@ -57,7 +59,6 @@ #include 
"agent/utils.h" #include "common/config.h" #include "common/consts.h" -#include "common/exception.h" #include "common/logging.h" #include "common/signal_handler.h" #include "common/status.h" @@ -87,6 +88,7 @@ #include "olap/rowset/beta_rowset.h" #include "olap/rowset/rowset.h" #include "olap/rowset/rowset_factory.h" +#include "olap/rowset/rowset_fwd.h" #include "olap/rowset/rowset_meta.h" #include "olap/rowset/rowset_meta_manager.h" #include "olap/rowset/rowset_writer.h" @@ -330,6 +332,7 @@ Status Tablet::init() { // should save tablet meta to remote meta store // if it's a primary replica void Tablet::save_meta() { + check_table_size_correctness(); auto res = _tablet_meta->save_meta(_data_dir); CHECK_EQ(res, Status::OK()) << "fail to save tablet_meta. res=" << res << ", root=" << _data_dir->path(); @@ -425,7 +428,7 @@ Status Tablet::revise_tablet_meta(const std::vector& to_add, // error handling if (!calc_bm_status.ok()) { if (is_incremental_clone) { - delete_rowsets(to_add, false); + RETURN_IF_ERROR(delete_rowsets(to_add, false)); LOG(WARNING) << "incremental clone on tablet: " << tablet_id() << " failed due to " << calc_bm_status.msg() << ", revert " << to_add.size() << " rowsets added before."; @@ -438,7 +441,7 @@ Status Tablet::revise_tablet_meta(const std::vector& to_add, // full clone, calculate delete bitmap succeeded, update rowset if (!is_incremental_clone) { - delete_rowsets(to_delete, false); + RETURN_IF_ERROR(delete_rowsets(to_delete, false)); add_rowsets(to_add); // reconstruct from tablet meta _timestamped_version_tracker.construct_versioned_tracker(_tablet_meta->all_rs_metas()); @@ -489,6 +492,7 @@ Status Tablet::add_rowset(RowsetSharedPtr rowset) { RETURN_IF_ERROR(_tablet_meta->add_rs_meta(rowset->rowset_meta())); _rs_version_map[rowset->version()] = rowset; _timestamped_version_tracker.add_version(rowset->version()); + add_compaction_score(rowset->rowset_meta()->get_compaction_score()); std::vector rowsets_to_delete; // yiguolei: temp code, should remove the rowset contains by this rowset @@ -594,6 +598,17 @@ Status Tablet::modify_rowsets(std::vector& to_add, } } } + + int32_t add_score = 0; + for (auto rs : to_add) { + add_score += rs->rowset_meta()->get_compaction_score(); + } + int32_t sub_score = 0; + for (auto rs : to_delete) { + sub_score += rs->rowset_meta()->get_compaction_score(); + } + add_compaction_score(add_score - sub_score); + return Status::OK(); } @@ -611,30 +626,33 @@ void Tablet::add_rowsets(const std::vector& to_add) { _tablet_meta->modify_rs_metas(rs_metas, {}); } -void Tablet::delete_rowsets(const std::vector& to_delete, bool move_to_stale) { +Status Tablet::delete_rowsets(const std::vector& to_delete, bool move_to_stale) { if (to_delete.empty()) { - return; + return Status::OK(); } std::vector rs_metas; rs_metas.reserve(to_delete.size()); - for (auto& rs : to_delete) { + for (const auto& rs : to_delete) { rs_metas.push_back(rs->rowset_meta()); _rs_version_map.erase(rs->version()); } _tablet_meta->modify_rs_metas({}, rs_metas, !move_to_stale); if (move_to_stale) { - for (auto& rs : to_delete) { + for (const auto& rs : to_delete) { _stale_rs_version_map[rs->version()] = rs; } _timestamped_version_tracker.add_stale_path_version(rs_metas); } else { - for (auto& rs : to_delete) { + for (const auto& rs : to_delete) { _timestamped_version_tracker.delete_version(rs->version()); if (rs->is_local()) { _engine.add_unused_rowset(rs); + RETURN_IF_ERROR(RowsetMetaManager::remove(_data_dir->get_meta(), tablet_uid(), + rs->rowset_meta()->rowset_id())); } } } + 
return Status::OK(); } RowsetSharedPtr Tablet::_rowset_with_largest_size() { @@ -668,6 +686,9 @@ Status Tablet::add_inc_rowset(const RowsetSharedPtr& rowset) { _timestamped_version_tracker.add_version(rowset->version()); ++_newly_created_rowset_num; + + add_compaction_score(rowset->rowset_meta()->get_compaction_score()); + return Status::OK(); } @@ -983,17 +1004,41 @@ bool Tablet::can_do_compaction(size_t path_hash, CompactionType compaction_type) return tablet_state() == TABLET_RUNNING || tablet_state() == TABLET_NOTREADY; } -uint32_t Tablet::calc_compaction_score( +uint32_t Tablet::calc_compaction_score() { + if (_score_check_cnt++ % config::check_score_rounds_num != 0) { + std::shared_lock rdlock(_meta_lock); + if (_compaction_score > 0) { + return _compaction_score; + } + } + + { + // Need meta lock, because it will iterator "all_rs_metas" of tablet meta. + std::shared_lock rdlock(_meta_lock); + int32_t score = get_real_compaction_score(); + if (_compaction_score > 0 && _compaction_score != score) { + LOG(WARNING) << "cumu cache score not equal real score, cache score; " + << _compaction_score << ", real score: " << score + << ", tablet: " << tablet_id(); + } + _compaction_score = score; + return score; + } +} + +bool Tablet::suitable_for_compaction( CompactionType compaction_type, std::shared_ptr cumulative_compaction_policy) { // Need meta lock, because it will iterator "all_rs_metas" of tablet meta. std::shared_lock rdlock(_meta_lock); + int32_t score = -1; if (compaction_type == CompactionType::CUMULATIVE_COMPACTION) { - return _calc_cumulative_compaction_score(cumulative_compaction_policy); + score = _calc_cumulative_compaction_score(cumulative_compaction_policy); } else { DCHECK_EQ(compaction_type, CompactionType::BASE_COMPACTION); - return _calc_base_compaction_score(); + score = _calc_base_compaction_score(); } + return score > 0; } uint32_t Tablet::calc_cold_data_compaction_score() const { @@ -1160,10 +1205,6 @@ Status Tablet::_contains_version(const Version& version) { return Status::OK(); } -TabletInfo Tablet::get_tablet_info() const { - return TabletInfo(tablet_id(), tablet_uid()); -} - std::vector Tablet::pick_candidate_rowsets_to_cumulative_compaction() { std::vector candidate_rowsets; if (_cumulative_point == K_INVALID_CUMULATIVE_POINT) { @@ -1231,7 +1272,7 @@ std::vector Tablet::pick_candidate_rowsets_to_build_inverted_in std::shared_lock rlock(_meta_lock); auto has_alter_inverted_index = [&](RowsetSharedPtr rowset) -> bool { for (const auto& index_id : alter_index_uids) { - if (rowset->tablet_schema()->has_inverted_index_with_index_id(index_id, "")) { + if (rowset->tablet_schema()->has_inverted_index_with_index_id(index_id)) { return true; } } @@ -1653,6 +1694,19 @@ void Tablet::build_tablet_report_info(TTabletInfo* tablet_info, } } +void Tablet::report_error(const Status& st) { + if (st.is()) { + ++_io_error_times; + } else if (st.is()) { + _io_error_times = config::max_tablet_io_errors + 1; + } else if (st.is()) { + check_tablet_path_exists(); + if (!_is_tablet_path_exists.load(std::memory_order_relaxed)) { + _io_error_times = config::max_tablet_io_errors + 1; + } + } +} + Status Tablet::prepare_compaction_and_calculate_permits( CompactionType compaction_type, const TabletSharedPtr& tablet, std::shared_ptr& compaction, int64_t& permits) { @@ -1790,6 +1844,7 @@ void Tablet::execute_compaction(CompactionMixin& compaction) { watch.start(); Status res = [&]() { RETURN_IF_CATCH_EXCEPTION({ return compaction.execute_compact(); }); }(); + if (!res.ok()) [[unlikely]] { 
    set_last_failure_time(this, compaction, UnixMillis());
    LOG(WARNING) << "failed to do " << compaction.compaction_name()
@@ -1992,8 +2047,8 @@ Status Tablet::_cooldown_data(RowsetSharedPtr rowset) {
    LOG(INFO) << "Upload rowset " << old_rowset->version() << " " << new_rowset_id.to_string()
              << " to " << storage_resource.fs->root_path().native()
              << ", tablet_id=" << tablet_id() << ", duration=" << duration.count()
-              << ", capacity=" << old_rowset->data_disk_size()
-              << ", tp=" << old_rowset->data_disk_size() / duration.count()
+              << ", capacity=" << old_rowset->total_disk_size()
+              << ", tp=" << old_rowset->total_disk_size() / duration.count()
              << ", old rowset_id=" << old_rowset->rowset_id().to_string();

    // gen a new rowset
@@ -2011,7 +2066,7 @@ Status Tablet::_cooldown_data(RowsetSharedPtr rowset) {
    std::unique_lock meta_wlock(_meta_lock);
    SCOPED_SIMPLE_TRACE_IF_TIMEOUT(TRACE_TABLET_LOCK_THRESHOLD);
    if (tablet_state() == TABLET_RUNNING) {
-        delete_rowsets({std::move(old_rowset)}, false);
+        RETURN_IF_ERROR(delete_rowsets({std::move(old_rowset)}, false));
        add_rowsets({std::move(new_rowset)});
        // TODO(plat1ko): process primary key
        _tablet_meta->set_cooldown_meta_id(cooldown_meta_id);
@@ -2230,7 +2285,7 @@ Status Tablet::_follow_cooldowned_data() {
        to_add.push_back(std::move(rs));
    }
    // Note: We CANNOT call `modify_rowsets` here because `modify_rowsets` cannot process version graph correctly.
-    delete_rowsets(to_delete, false);
+    RETURN_IF_ERROR(delete_rowsets(to_delete, false));
    add_rowsets(to_add);
    // TODO(plat1ko): process primary key
    _tablet_meta->set_cooldown_meta_id(cooldown_meta_pb.cooldown_meta_id());
@@ -2372,7 +2427,7 @@ RowsetSharedPtr Tablet::need_cooldown(int64_t* cooldown_timestamp, size_t* file_
    // current time or it's datatime is less than current time
    if (newest_cooldown_time != 0 && newest_cooldown_time < UnixSeconds()) {
        *cooldown_timestamp = newest_cooldown_time;
-        *file_size = rowset->data_disk_size();
+        *file_size = rowset->total_disk_size();
        VLOG_DEBUG << "tablet need cooldown, tablet id: " << tablet_id()
                   << " file_size: " << *file_size;
        return rowset;
@@ -2595,12 +2650,9 @@ void Tablet::gc_binlogs(int64_t version) {
        // add binlog segment files and index files
        for (int64_t i = 0; i < num_segments; ++i) {
            wait_for_deleted_binlog_files.emplace_back(get_segment_filepath(rowset_id, i));
-            for (const auto& index : this->tablet_schema()->indexes()) {
-                if (index.index_type() != IndexType::INVERTED) {
-                    continue;
-                }
+            for (const auto& index : this->tablet_schema()->inverted_indexes()) {
                wait_for_deleted_binlog_files.emplace_back(
-                        get_segment_index_filepath(rowset_id, i, index.index_id()));
+                        get_segment_index_filepath(rowset_id, i, index->index_id()));
            }
        }
    };
@@ -2682,4 +2734,124 @@ void Tablet::clear_cache() {
    }
}

+void Tablet::check_table_size_correctness() {
+    if (!config::enable_table_size_correctness_check) {
+        return;
+    }
+    const std::vector& all_rs_metas = _tablet_meta->all_rs_metas();
+    for (const auto& rs_meta : all_rs_metas) {
+        int64_t total_segment_size = get_segment_file_size(rs_meta);
+        int64_t total_inverted_index_size = get_inverted_index_file_size(rs_meta);
+        if (rs_meta->data_disk_size() != total_segment_size ||
+            rs_meta->index_disk_size() != total_inverted_index_size ||
+            rs_meta->data_disk_size() + rs_meta->index_disk_size() != rs_meta->total_disk_size()) {
+            LOG(WARNING) << "[Local table size check failed]:"
+                         << " tablet id: " << rs_meta->tablet_id()
+                         << ", rowset id:" << rs_meta->rowset_id()
+                         << ", rowset data disk size:" << rs_meta->data_disk_size()
+                         << ", rowset real data disk size:" << total_segment_size
+                         << ", rowset index disk size:" << rs_meta->index_disk_size()
+                         << ", rowset real index disk size:" << total_inverted_index_size
+                         << ", rowset total disk size:" << rs_meta->total_disk_size()
+                         << ", rowset segment path:"
+                         << StorageResource().remote_segment_path(
+                                    rs_meta->tablet_id(), rs_meta->rowset_id().to_string(), 0);
+            DCHECK(false);
+        }
+    }
+}
+
+std::string Tablet::get_segment_path(const RowsetMetaSharedPtr& rs_meta, int64_t seg_id) {
+    std::string segment_path;
+    if (rs_meta->is_local()) {
+        segment_path = local_segment_path(_tablet_path, rs_meta->rowset_id().to_string(), seg_id);
+    } else {
+        segment_path = rs_meta->remote_storage_resource().value()->remote_segment_path(
+                rs_meta->tablet_id(), rs_meta->rowset_id().to_string(), seg_id);
+    }
+    return segment_path;
+}
+
+int64_t Tablet::get_segment_file_size(const RowsetMetaSharedPtr& rs_meta) {
+    const auto& fs = rs_meta->fs();
+    if (!fs) {
+        LOG(WARNING) << "get fs failed, resource_id=" << rs_meta->resource_id();
+        return 0; // cannot stat files without a filesystem
+    }
+    int64_t total_segment_size = 0;
+    for (int64_t seg_id = 0; seg_id < rs_meta->num_segments(); seg_id++) {
+        std::string segment_path = get_segment_path(rs_meta, seg_id);
+        int64_t segment_file_size = 0;
+        auto st = fs->file_size(segment_path, &segment_file_size);
+        if (!st.ok()) {
+            segment_file_size = 0;
+            LOG(WARNING) << "table size correctness check get segment size failed! msg:"
+                         << st.to_string() << ", segment path:" << segment_path;
+        }
+        total_segment_size += segment_file_size;
+    }
+    return total_segment_size;
+}
+
+int64_t Tablet::get_inverted_index_file_size(const RowsetMetaSharedPtr& rs_meta) {
+    const auto& fs = rs_meta->fs();
+    if (!fs) {
+        LOG(WARNING) << "get fs failed, resource_id=" << rs_meta->resource_id();
+        return 0; // cannot stat files without a filesystem
+    }
+    int64_t total_inverted_index_size = 0;
+
+    if (rs_meta->tablet_schema()->get_inverted_index_storage_format() ==
+        InvertedIndexStorageFormatPB::V1) {
+        const auto& indices = rs_meta->tablet_schema()->inverted_indexes();
+        for (auto& index : indices) {
+            for (int seg_id = 0; seg_id < rs_meta->num_segments(); ++seg_id) {
+                std::string segment_path = get_segment_path(rs_meta, seg_id);
+                int64_t file_size = 0;
+
+                std::string inverted_index_file_path =
+                        InvertedIndexDescriptor::get_index_file_path_v1(
+                                InvertedIndexDescriptor::get_index_file_path_prefix(segment_path),
+                                index->index_id(), index->get_index_suffix());
+                auto st = fs->file_size(inverted_index_file_path, &file_size);
+                if (!st.ok()) {
+                    file_size = 0;
+                    LOG(WARNING) << " tablet id: " << get_tablet_info().tablet_id
+                                 << ", rowset id:" << rs_meta->rowset_id()
+                                 << ", table size correctness check get inverted index v1 "
+                                    "size failed! msg:"
+                                 << st.to_string()
+                                 << ", inverted index path:" << inverted_index_file_path;
+                }
+                total_inverted_index_size += file_size;
+            }
+        }
+    } else {
+        for (int seg_id = 0; seg_id < rs_meta->num_segments(); ++seg_id) {
+            int64_t file_size = 0;
+            std::string segment_path = get_segment_path(rs_meta, seg_id);
+            std::string inverted_index_file_path = InvertedIndexDescriptor::get_index_file_path_v2(
+                    InvertedIndexDescriptor::get_index_file_path_prefix(segment_path));
+            auto st = fs->file_size(inverted_index_file_path, &file_size);
+            if (!st.ok()) {
+                file_size = 0;
+                if (st.is()) {
+                    LOG(INFO) << " tablet id: " << get_tablet_info().tablet_id
+                              << ", rowset id:" << rs_meta->rowset_id()
+                              << ", table size correctness check get inverted index v2 failed "
+                                 "because file does not exist:"
+                              << inverted_index_file_path;
+                } else {
+                    LOG(WARNING) << " tablet id: " << get_tablet_info().tablet_id
+                                 << ", rowset id:" << rs_meta->rowset_id()
+                                 << ", table size correctness check get inverted index v2 "
+                                    "size failed! msg:"
+                                 << st.to_string()
+                                 << ", inverted index path:" << inverted_index_file_path;
+                }
+            }
+            total_inverted_index_size += file_size;
+        }
+    }
+    return total_inverted_index_size;
+}
+
} // namespace doris
diff --git a/be/src/olap/tablet.h b/be/src/olap/tablet.h
index 33253e82ced2b5..f5866c67641581 100644
--- a/be/src/olap/tablet.h
+++ b/be/src/olap/tablet.h
@@ -115,7 +115,6 @@ class Tablet final : public BaseTablet {
    DataDir* data_dir() const { return _data_dir; }
    int64_t replica_id() const { return _tablet_meta->replica_id(); }

-    TabletUid tablet_uid() const { return _tablet_meta->tablet_uid(); }

    const std::string& tablet_path() const { return _tablet_path; }

@@ -221,10 +220,12 @@ class Tablet final : public BaseTablet {
    // operation for compaction
    bool can_do_compaction(size_t path_hash, CompactionType compaction_type);

-    uint32_t calc_compaction_score(
+    bool suitable_for_compaction(
            CompactionType compaction_type,
            std::shared_ptr cumulative_compaction_policy);

+    uint32_t calc_compaction_score();
+
    // This function to find max continuous version from the beginning.
    // For example: If there are 1, 2, 3, 5, 6, 7 versions belongs tablet, then 3 is target.
    // 3 will be saved in "version", and 7 will be saved in "max_version", if max_version != nullptr
@@ -277,8 +278,6 @@ class Tablet final : public BaseTablet {
    void check_tablet_path_exists();

-    TabletInfo get_tablet_info() const;
-
    std::vector pick_candidate_rowsets_to_cumulative_compaction();
    std::vector pick_candidate_rowsets_to_base_compaction();
    std::vector pick_candidate_rowsets_to_full_compaction();
@@ -360,7 +359,7 @@ class Tablet final : public BaseTablet {
    // MUST hold EXCLUSIVE `_meta_lock`
    void add_rowsets(const std::vector& to_add);
    // MUST hold EXCLUSIVE `_meta_lock`
-    void delete_rowsets(const std::vector& to_delete, bool move_to_stale);
+    Status delete_rowsets(const std::vector& to_delete, bool move_to_stale);
    // MUST hold SHARED `_meta_lock`
    const auto& rowset_map() const { return _rs_version_map; }
@@ -449,13 +448,7 @@ class Tablet final : public BaseTablet {
    void gc_binlogs(int64_t version);
    Status ingest_binlog_metas(RowsetBinlogMetasPB* metas_pb);

-    inline void report_error(const Status& st) {
-        if (st.is()) {
-            ++_io_error_times;
-        } else if (st.is()) {
-            _io_error_times = config::max_tablet_io_errors + 1;
-        }
-    }
+    void report_error(const Status& st);

    inline int64_t get_io_error_times() const { return _io_error_times; }
@@ -482,6 +475,24 @@ class Tablet final : public BaseTablet {
    inline bool is_full_compaction_running() const { return _is_full_compaction_running; }
    void clear_cache() override;

+    int32_t get_compaction_score() const { return _compaction_score; }
+
+    void set_compaction_score(int32_t compaction_score) { _compaction_score = compaction_score; }
+
+    void add_compaction_score(int32_t score) {
+        if (_compaction_score < 0) {
+            return;
+        }
+        _compaction_score += score;
+    }
+
+    void minus_compaction_score(int32_t score) {
+        if (_compaction_score < 0) {
+            return;
+        }
+        _compaction_score -= score;
+    }
+
private:
    Status _init_once_action();
    bool _contains_rowset(const RowsetId rowset_id);
@@ -520,6 +531,10 @@ class Tablet final : public BaseTablet {
    ////////////////////////////////////////////////////////////////////////////
    void _clear_cache_by_rowset(const BetaRowsetSharedPtr& rowset);
+    void check_table_size_correctness();
+    std::string get_segment_path(const RowsetMetaSharedPtr& rs_meta, int64_t seg_id);
+    int64_t get_segment_file_size(const RowsetMetaSharedPtr& rs_meta);
+    int64_t get_inverted_index_file_size(const RowsetMetaSharedPtr& rs_meta);

public:
    static const int64_t K_INVALID_CUMULATIVE_POINT = -1;
@@ -608,6 +623,9 @@ class Tablet final : public BaseTablet {
    std::shared_ptr _visible_version;

    std::atomic_bool _is_full_compaction_running = false;
+
+    int32_t _compaction_score = -1;
+    int32_t _score_check_cnt = 0;
};

inline CumulativeCompactionPolicy* Tablet::cumulative_compaction_policy() {
diff --git a/be/src/olap/tablet_column_object_pool.cpp b/be/src/olap/tablet_column_object_pool.cpp
new file mode 100644
index 00000000000000..6e07fb4e831e60
--- /dev/null
+++ b/be/src/olap/tablet_column_object_pool.cpp
@@ -0,0 +1,57 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "olap/tablet_column_object_pool.h"
+
+#include
+#include
+
+#include "olap/tablet_schema.h"
+
+namespace doris {
+
+bvar::Adder g_tablet_column_cache_count("tablet_column_cache_count");
+bvar::Adder g_tablet_column_cache_hit_count("tablet_column_cache_hit_count");
+
+std::pair TabletColumnObjectPool::insert(const std::string& key) {
+    auto* lru_handle = lookup(key);
+    TabletColumnPtr tablet_column_ptr;
+    if (lru_handle) {
+        auto* value = (CacheValue*)LRUCachePolicy::value(lru_handle);
+        tablet_column_ptr = value->tablet_column;
+        VLOG_DEBUG << "reuse column ";
+        g_tablet_column_cache_hit_count << 1;
+    } else {
+        auto* value = new CacheValue;
+        tablet_column_ptr = std::make_shared();
+        ColumnPB pb;
+        pb.ParseFromString(key);
+        tablet_column_ptr->init_from_pb(pb);
+        VLOG_DEBUG << "create column ";
+        value->tablet_column = tablet_column_ptr;
+        lru_handle = LRUCachePolicy::insert(key, value, 1, 0, CachePriority::NORMAL);
+        g_tablet_column_cache_count << 1;
+    }
+    DCHECK(lru_handle != nullptr);
+    return {lru_handle, tablet_column_ptr};
+}
+
+TabletColumnObjectPool::CacheValue::~CacheValue() {
+    g_tablet_column_cache_count << -1;
+}
+
+} // namespace doris
diff --git a/be/src/olap/tablet_column_object_pool.h b/be/src/olap/tablet_column_object_pool.h
new file mode 100644
index 00000000000000..1eead6a25c9609
--- /dev/null
+++ b/be/src/olap/tablet_column_object_pool.h
@@ -0,0 +1,58 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "olap/tablet_fwd.h"
+#include "olap/tablet_schema.h"
+#include "runtime/exec_env.h"
+#include "runtime/memory/lru_cache_policy.h"
+
+namespace doris {
+
+// TabletColumnObjectPool is a cache for TabletColumn objects. It is used to reduce memory consumption
+// when there are a large number of identical TabletColumns in the cluster, which usually occurs
+// when VARIANT type columns are modified and added, since each Rowset then has an individual TabletSchema.
+// Excessive TabletSchemas can lead to significant memory overhead. Reusing memory for identical
+// TabletColumns would greatly reduce this memory consumption.
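TabletColumnObjectPool::insert() keys the LRU cache by the serialized ColumnPB and hands out one shared TabletColumn per distinct key, so thousands of rowset schemas with identical columns share a single object. A conceptual model of that interning idea; the real pool is an LRUCachePolicy whose handles callers must release, and the types here are simplified:

    #include <memory>
    #include <mutex>
    #include <string>
    #include <unordered_map>

    struct TabletColumn {};  // placeholder for the real column metadata

    class ColumnInternPool {
    public:
        std::shared_ptr<TabletColumn> insert(const std::string& key) {
            std::lock_guard<std::mutex> lock(_mu);
            auto it = _pool.find(key);
            if (it != _pool.end()) {
                if (auto col = it->second.lock()) {
                    return col;  // identical serialized ColumnPB -> one shared object
                }
            }
            auto col = std::make_shared<TabletColumn>();  // real code: init_from_pb(pb)
            _pool[key] = col;
            return col;
        }

    private:
        std::mutex _mu;
        std::unordered_map<std::string, std::weak_ptr<TabletColumn>> _pool;
    };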
+ +class TabletColumnObjectPool : public LRUCachePolicy { +public: + TabletColumnObjectPool(size_t capacity) + : LRUCachePolicy(CachePolicy::CacheType::TABLET_COLUMN_OBJECT_POOL, capacity, + LRUCacheType::NUMBER, config::tablet_schema_cache_recycle_interval) {} + + static TabletColumnObjectPool* create_global_column_cache(size_t capacity) { + auto* res = new TabletColumnObjectPool(capacity); + return res; + } + + static TabletColumnObjectPool* instance() { + return ExecEnv::GetInstance()->get_tablet_column_object_pool(); + } + + std::pair insert(const std::string& key); + +private: + class CacheValue : public LRUCacheValueBase { + public: + ~CacheValue() override; + TabletColumnPtr tablet_column; + }; +}; + +} // namespace doris diff --git a/be/src/olap/tablet_manager.cpp b/be/src/olap/tablet_manager.cpp index 468a6b2fb126f0..b853401855ce94 100644 --- a/be/src/olap/tablet_manager.cpp +++ b/be/src/olap/tablet_manager.cpp @@ -101,7 +101,9 @@ TabletManager::TabletManager(StorageEngine& engine, int32_t tablet_map_lock_shar } TabletManager::~TabletManager() { +#ifndef BE_TEST DEREGISTER_HOOK_METRIC(tablet_meta_mem_consumption); +#endif } Status TabletManager::_add_tablet_unlocked(TTabletId tablet_id, const TabletSharedPtr& tablet, @@ -797,8 +799,7 @@ std::vector TabletManager::find_best_tablets_to_compaction( } auto cumulative_compaction_policy = all_cumulative_compaction_policies.at( tablet_ptr->tablet_meta()->compaction_policy()); - uint32_t current_compaction_score = - tablet_ptr->calc_compaction_score(compaction_type, cumulative_compaction_policy); + uint32_t current_compaction_score = tablet_ptr->calc_compaction_score(); if (current_compaction_score < 5) { tablet_ptr->set_skip_compaction(true, compaction_type, UnixSeconds()); } @@ -806,14 +807,22 @@ std::vector TabletManager::find_best_tablets_to_compaction( // tablet should do single compaction if (current_compaction_score > single_compact_highest_score && tablet_ptr->should_fetch_from_peer()) { - single_compact_highest_score = current_compaction_score; - best_single_compact_tablet = tablet_ptr; + bool ret = tablet_ptr->suitable_for_compaction(compaction_type, + cumulative_compaction_policy); + if (ret) { + single_compact_highest_score = current_compaction_score; + best_single_compact_tablet = tablet_ptr; + } } // tablet should do cumu or base compaction if (current_compaction_score > highest_score && !tablet_ptr->should_fetch_from_peer()) { - highest_score = current_compaction_score; - best_tablet = tablet_ptr; + bool ret = tablet_ptr->suitable_for_compaction(compaction_type, + cumulative_compaction_policy); + if (ret) { + highest_score = current_compaction_score; + best_tablet = tablet_ptr; + } } }; diff --git a/be/src/olap/tablet_meta.cpp b/be/src/olap/tablet_meta.cpp index 6123dc6123184a..91f3b7dd8169bf 100644 --- a/be/src/olap/tablet_meta.cpp +++ b/be/src/olap/tablet_meta.cpp @@ -42,6 +42,7 @@ #include "olap/olap_common.h" #include "olap/olap_define.h" #include "olap/rowset/rowset.h" +#include "olap/rowset/rowset_meta_manager.h" #include "olap/tablet_meta_manager.h" #include "olap/utils.h" #include "util/debug_points.h" @@ -345,7 +346,8 @@ TabletMeta::TabletMeta(int64_t table_id, int64_t partition_id, int64_t tablet_id } TabletMeta::TabletMeta(const TabletMeta& b) - : _table_id(b._table_id), + : MetadataAdder(b), + _table_id(b._table_id), _index_id(b._index_id), _partition_id(b._partition_id), _tablet_id(b._tablet_id), @@ -1187,6 +1189,9 @@ void DeleteBitmap::add_to_remove_queue( } void 
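In find_best_tablets_to_compaction above, the cheap cached score now acts as a pre-filter, and the heavier suitable_for_compaction() check (which takes the meta lock and recomputes the policy score) runs only for tablets that would actually displace the current best. Schematically, with illustrative types rather than TabletManager's real ones:

    #include <cstdint>
    #include <vector>

    struct TabletLite {
        int64_t id = -1;
        uint32_t cached_score = 0;
    };

    // Placeholder for Tablet::suitable_for_compaction().
    bool suitable_for_compaction(const TabletLite& t) { return t.cached_score > 0; }

    int64_t pick_best(const std::vector<TabletLite>& tablets) {
        int64_t best = -1;
        uint32_t best_score = 0;
        for (const auto& t : tablets) {
            if (t.cached_score <= best_score) continue;  // cheap filter first
            if (!suitable_for_compaction(t)) continue;   // expensive check only for contenders
            best_score = t.cached_score;
            best = t.id;
        }
        return best;
    }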
DeleteBitmap::remove_stale_delete_bitmap_from_queue(const std::vector& vector) { + if (!config::enable_delete_bitmap_merge_on_compaction) { + return; + } std::shared_lock l(stale_delete_bitmap_lock); // std::vector> to_delete; diff --git a/be/src/olap/tablet_meta.h b/be/src/olap/tablet_meta.h index a453baf745d602..d56e529e42bf4b 100644 --- a/be/src/olap/tablet_meta.h +++ b/be/src/olap/tablet_meta.h @@ -43,6 +43,7 @@ #include "io/fs/file_system.h" #include "olap/binlog_config.h" #include "olap/lru_cache.h" +#include "olap/metadata_adder.h" #include "olap/olap_common.h" #include "olap/rowset/rowset_meta.h" #include "olap/tablet_schema.h" @@ -90,7 +91,7 @@ class TBinlogConfig; // Class encapsulates meta of tablet. // The concurrency control is handled in Tablet Class, not in this class. -class TabletMeta { +class TabletMeta : public MetadataAdder { public: static TabletMetaSharedPtr create( const TCreateTabletReq& request, const TabletUid& tablet_uid, uint64_t shard_id, @@ -118,6 +119,11 @@ class TabletMeta { TabletMeta(const TabletMeta& tablet_meta); TabletMeta(TabletMeta&& tablet_meta) = delete; +// UT +#ifdef BE_TEST + TabletMeta(TabletSchemaSPtr tablet_schema) : _schema(tablet_schema) {} +#endif + // Function create_from_file is used to be compatible with previous tablet_meta. // Previous tablet_meta is a physical file in tablet dir, which is not stored in rocksdb. Status create_from_file(const std::string& file_path); @@ -573,8 +579,6 @@ class DeleteBitmap { _stale_delete_bitmap; }; -static const std::string SEQUENCE_COL = "__DORIS_SEQUENCE_COL__"; - inline TabletUid TabletMeta::tablet_uid() const { return _tablet_uid; } @@ -638,7 +642,7 @@ inline size_t TabletMeta::num_rows() const { inline size_t TabletMeta::tablet_footprint() const { size_t total_size = 0; for (auto& rs : _rs_metas) { - total_size += rs->data_disk_size(); + total_size += rs->total_disk_size(); } return total_size; } @@ -647,7 +651,7 @@ inline size_t TabletMeta::tablet_local_size() const { size_t total_size = 0; for (auto& rs : _rs_metas) { if (rs->is_local()) { - total_size += rs->data_disk_size(); + total_size += rs->total_disk_size(); } } return total_size; @@ -657,7 +661,7 @@ inline size_t TabletMeta::tablet_remote_size() const { size_t total_size = 0; for (auto& rs : _rs_metas) { if (!rs->is_local()) { - total_size += rs->data_disk_size(); + total_size += rs->total_disk_size(); } } return total_size; diff --git a/be/src/olap/tablet_reader.cpp b/be/src/olap/tablet_reader.cpp index 9ab9e4b1b365f5..7410b70f4aa471 100644 --- a/be/src/olap/tablet_reader.cpp +++ b/be/src/olap/tablet_reader.cpp @@ -254,6 +254,7 @@ Status TabletReader::_capture_rs_readers(const ReaderParams& read_params) { _reader_context.delete_bitmap = read_params.delete_bitmap; _reader_context.enable_unique_key_merge_on_write = tablet()->enable_unique_key_merge_on_write(); _reader_context.record_rowids = read_params.record_rowids; + _reader_context.rowid_conversion = read_params.rowid_conversion; _reader_context.is_key_column_group = read_params.is_key_column_group; _reader_context.remaining_conjunct_roots = read_params.remaining_conjunct_roots; _reader_context.common_expr_ctxs_push_down = read_params.common_expr_ctxs_push_down; diff --git a/be/src/olap/tablet_reader.h b/be/src/olap/tablet_reader.h index 50517e047ba556..dd9d39d9decee0 100644 --- a/be/src/olap/tablet_reader.h +++ b/be/src/olap/tablet_reader.h @@ -39,6 +39,7 @@ #include "olap/olap_common.h" #include "olap/olap_tuple.h" #include "olap/row_cursor.h" +#include "olap/rowid_conversion.h" 
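The tablet_meta.h hunks above switch tablet_footprint, tablet_local_size, and tablet_remote_size from data_disk_size() to total_disk_size(), so index bytes are charged as well. Assuming the invariant enforced by check_table_size_correctness() (total = data + index), the accounting amounts to:

    #include <cstdint>
    #include <vector>

    struct RowsetMetaLite {
        int64_t data_disk_size = 0;   // segment files
        int64_t index_disk_size = 0;  // inverted index files
        int64_t total_disk_size() const { return data_disk_size + index_disk_size; }
    };

    // Footprint now counts both parts for every rowset of the tablet.
    int64_t tablet_footprint(const std::vector<RowsetMetaLite>& rs_metas) {
        int64_t total = 0;
        for (const auto& rs : rs_metas) {
            total += rs.total_disk_size();
        }
        return total;
    }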
#include "olap/rowset/rowset.h" #include "olap/rowset/rowset_meta.h" #include "olap/rowset/rowset_reader.h" @@ -166,6 +167,7 @@ class TabletReader { // used for compaction to record row ids bool record_rowids = false; + RowIdConversion* rowid_conversion = nullptr; std::vector topn_filter_source_node_ids; int topn_filter_target_node_id = -1; // used for special optimization for query : ORDER BY key LIMIT n diff --git a/be/src/olap/tablet_schema.cpp b/be/src/olap/tablet_schema.cpp index a9fcad7690cea1..523944c4d77408 100644 --- a/be/src/olap/tablet_schema.cpp +++ b/be/src/olap/tablet_schema.cpp @@ -38,8 +38,10 @@ #include "exec/tablet_info.h" #include "olap/inverted_index_parser.h" #include "olap/olap_define.h" +#include "olap/tablet_column_object_pool.h" #include "olap/types.h" #include "olap/utils.h" +#include "runtime/memory/lru_cache_policy.h" #include "runtime/thread_context.h" #include "tablet_meta.h" #include "vec/aggregate_functions/aggregate_function_simple_factory.h" @@ -53,8 +55,6 @@ namespace doris { -static bvar::Adder g_total_tablet_schema_num("doris_total_tablet_schema_num"); - FieldType TabletColumn::get_field_type_by_type(PrimitiveType primitiveType) { switch (primitiveType) { case PrimitiveType::INVALID_TYPE: @@ -749,7 +749,15 @@ void TabletIndex::init_from_thrift(const TOlapTableIndex& index, if (column_idx >= 0) { col_unique_ids[i] = tablet_schema.column(column_idx).unique_id(); } else { - col_unique_ids[i] = -1; + // if column unique id not found by column name, find by column unique id + // column unique id can not bigger than tablet schema column size, if bigger than column size means + // this column is a new column added by light schema change + if (index.__isset.column_unique_ids && + index.column_unique_ids[i] < tablet_schema.num_columns()) { + col_unique_ids[i] = index.column_unique_ids[i]; + } else { + col_unique_ids[i] = -1; + } } } _col_unique_ids = std::move(col_unique_ids); @@ -845,12 +853,14 @@ void TabletIndex::to_schema_pb(TabletIndexPB* index) const { } } -TabletSchema::TabletSchema() { - g_total_tablet_schema_num << 1; -} +TabletSchema::TabletSchema() = default; TabletSchema::~TabletSchema() { - g_total_tablet_schema_num << -1; + clear_column_cache_handlers(); +} + +int64_t TabletSchema::get_metadata_size() const { + return sizeof(TabletSchema) + _vl_field_mem_size; } void TabletSchema::append_column(TabletColumn column, ColumnType col_type) { @@ -874,6 +884,8 @@ void TabletSchema::append_column(TabletColumn column, ColumnType col_type) { _sequence_col_idx = _num_columns; } else if (UNLIKELY(column.name() == VERSION_COL)) { _version_col_idx = _num_columns; + } else if (UNLIKELY(column.name() == SKIP_BITMAP_COL)) { + _skip_bitmap_col_idx = _num_columns; } _field_id_to_index[column.unique_id()] = _num_columns; _cols.push_back(std::make_shared(std::move(column))); @@ -894,18 +906,20 @@ void TabletColumn::append_sparse_column(TabletColumn column) { _num_sparse_columns++; } -void TabletSchema::append_index(TabletIndex index) { +void TabletSchema::append_index(TabletIndex&& index) { _indexes.push_back(std::move(index)); } -void TabletSchema::update_index(const TabletColumn& col, TabletIndex index) { - int32_t col_unique_id = col.unique_id(); - const std::string& suffix_path = - col.has_path_info() ? escape_for_path_name(col.path_info_ptr()->get_path()) : ""; +void TabletSchema::update_index(const TabletColumn& col, const IndexType& index_type, + TabletIndex&& index) { + int32_t col_unique_id = col.is_extracted_column() ? 
col.parent_unique_id() : col.unique_id(); + const std::string& suffix_path = escape_for_path_name(col.suffix_path()); for (size_t i = 0; i < _indexes.size(); i++) { for (int32_t id : _indexes[i].col_unique_ids()) { - if (id == col_unique_id && _indexes[i].get_index_suffix() == suffix_path) { - _indexes[i] = index; + if (_indexes[i].index_type() == index_type && id == col_unique_id && + _indexes[i].get_index_suffix() == suffix_path) { + _indexes[i] = std::move(index); + break; } } } @@ -940,9 +954,18 @@ void TabletSchema::clear_columns() { _num_null_columns = 0; _num_key_columns = 0; _cols.clear(); + clear_column_cache_handlers(); } -void TabletSchema::init_from_pb(const TabletSchemaPB& schema, bool ignore_extracted_columns) { +void TabletSchema::clear_column_cache_handlers() { + for (auto* cache_handle : _column_cache_handlers) { + TabletColumnObjectPool::instance()->release(cache_handle); + } + _column_cache_handlers.clear(); +} + +void TabletSchema::init_from_pb(const TabletSchemaPB& schema, bool ignore_extracted_columns, + bool reuse_cache_column) { _keys_type = schema.keys_type(); _num_columns = 0; _num_variant_columns = 0; @@ -953,27 +976,42 @@ void TabletSchema::init_from_pb(const TabletSchemaPB& schema, bool ignore_extrac _field_name_to_index.clear(); _field_id_to_index.clear(); _cluster_key_idxes.clear(); + clear_column_cache_handlers(); for (const auto& i : schema.cluster_key_idxes()) { _cluster_key_idxes.push_back(i); } for (auto& column_pb : schema.column()) { - TabletColumn column; - column.init_from_pb(column_pb); - if (ignore_extracted_columns && column.is_extracted_column()) { + TabletColumnPtr column; + if (reuse_cache_column) { + auto pair = TabletColumnObjectPool::instance()->insert( + deterministic_string_serialize(column_pb)); + column = pair.second; + _column_cache_handlers.push_back(pair.first); + } else { + column = std::make_shared(); + column->init_from_pb(column_pb); + } + if (ignore_extracted_columns && column->is_extracted_column()) { continue; } - if (column.is_key()) { + if (column->is_key()) { _num_key_columns++; } - if (column.is_nullable()) { + if (column->is_nullable()) { _num_null_columns++; } - if (column.is_variant_type()) { + if (column->is_variant_type()) { ++_num_variant_columns; } - _cols.emplace_back(std::make_shared(std::move(column))); - _field_name_to_index.emplace(StringRef(_cols.back()->name()), _num_columns); - _field_id_to_index[_cols.back()->unique_id()] = _num_columns; + + _cols.emplace_back(std::move(column)); + if (!_cols.back()->is_extracted_column()) { + _vl_field_mem_size += + sizeof(StringRef) + sizeof(char) * _cols.back()->name().size() + sizeof(size_t); + _field_name_to_index.emplace(StringRef(_cols.back()->name()), _num_columns); + _vl_field_mem_size += sizeof(int32_t) * 2; + _field_id_to_index[_cols.back()->unique_id()] = _num_columns; + } _num_columns++; } for (auto& index_pb : schema.index()) { @@ -1000,6 +1038,7 @@ void TabletSchema::init_from_pb(const TabletSchemaPB& schema, bool ignore_extrac _delete_sign_idx = schema.delete_sign_idx(); _sequence_col_idx = schema.sequence_col_idx(); _version_col_idx = schema.version_col_idx(); + _skip_bitmap_col_idx = schema.skip_bitmap_col_idx(); _sort_type = schema.sort_type(); _sort_col_num = schema.sort_col_num(); _compression_type = schema.compression_type(); @@ -1015,6 +1054,8 @@ void TabletSchema::init_from_pb(const TabletSchemaPB& schema, bool ignore_extrac _row_store_column_unique_ids.assign(schema.row_store_column_unique_ids().begin(), schema.row_store_column_unique_ids().end()); 
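The pool key above is produced by deterministic_string_serialize, which this PR generalizes into a template in tablet_schema.h. Shown standalone for reference: deterministic serialization fixes the order of map fields, so byte-equal output implies equal messages, which is what makes the bytes safe to use as a cache key.

    #include <string>

    #include <google/protobuf/io/coded_stream.h>
    #include <google/protobuf/io/zero_copy_stream_impl_lite.h>

    template <typename PbType>
    std::string deterministic_string_serialize(const PbType& pb) {
        std::string output;
        google::protobuf::io::StringOutputStream string_output_stream(&output);
        google::protobuf::io::CodedOutputStream output_stream(&string_output_stream);
        output_stream.SetSerializationDeterministic(true);
        pb.SerializeToCodedStream(&output_stream);
        return output;
    }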
_variant_enable_flatten_nested = schema.variant_enable_flatten_nested(); + _vl_field_mem_size += _row_store_column_unique_ids.capacity() * sizeof(int32_t); + update_metadata_size(); } void TabletSchema::copy_from(const TabletSchema& tablet_schema) { @@ -1082,7 +1123,9 @@ void TabletSchema::build_current_tablet_schema(int64_t index_id, int32_t version _delete_sign_idx = -1; _sequence_col_idx = -1; _version_col_idx = -1; + _skip_bitmap_col_idx = -1; _cluster_key_idxes.clear(); + clear_column_cache_handlers(); for (const auto& i : ori_tablet_schema._cluster_key_idxes) { _cluster_key_idxes.push_back(i); } @@ -1105,6 +1148,8 @@ void TabletSchema::build_current_tablet_schema(int64_t index_id, int32_t version _sequence_col_idx = _num_columns; } else if (UNLIKELY(column->name() == VERSION_COL)) { _version_col_idx = _num_columns; + } else if (UNLIKELY(column->name() == SKIP_BITMAP_COL)) { + _skip_bitmap_col_idx = _num_columns; } _cols.emplace_back(std::make_shared(*column)); _field_name_to_index.emplace(StringRef(_cols.back()->name()), _num_columns); @@ -1222,6 +1267,7 @@ void TabletSchema::to_schema_pb(TabletSchemaPB* tablet_schema_pb) const { tablet_schema_pb->set_compression_type(_compression_type); tablet_schema_pb->set_row_store_page_size(_row_store_page_size); tablet_schema_pb->set_version_col_idx(_version_col_idx); + tablet_schema_pb->set_skip_bitmap_col_idx(_skip_bitmap_col_idx); tablet_schema_pb->set_inverted_index_storage_format(_inverted_index_storage_format); tablet_schema_pb->mutable_row_store_column_unique_ids()->Assign( _row_store_column_unique_ids.begin(), _row_store_column_unique_ids.end()); @@ -1319,28 +1365,6 @@ Result TabletSchema::column(const std::string& field_name) return _cols[it->second].get(); } -std::vector TabletSchema::get_indexes_for_column( - const TabletColumn& col) const { - std::vector indexes_for_column; - // Some columns (Float, Double, JSONB ...) from the variant do not support index, but they are listed in TabltetIndex. - if (!segment_v2::InvertedIndexColumnWriter::check_support_inverted_index(col)) { - return indexes_for_column; - } - int32_t col_unique_id = col.is_extracted_column() ? col.parent_unique_id() : col.unique_id(); - const std::string& suffix_path = - col.has_path_info() ? escape_for_path_name(col.path_info_ptr()->get_path()) : ""; - // TODO use more efficient impl - for (size_t i = 0; i < _indexes.size(); i++) { - for (int32_t id : _indexes[i].col_unique_ids()) { - if (id == col_unique_id && _indexes[i].get_index_suffix() == suffix_path) { - indexes_for_column.push_back(&(_indexes[i])); - } - } - } - - return indexes_for_column; -} - void TabletSchema::update_tablet_columns(const TabletSchema& tablet_schema, const std::vector& t_columns) { copy_from(tablet_schema); @@ -1352,49 +1376,17 @@ void TabletSchema::update_tablet_columns(const TabletSchema& tablet_schema, } } -bool TabletSchema::has_inverted_index(const TabletColumn& col) const { - // TODO use more efficient impl - int32_t col_unique_id = col.is_extracted_column() ? col.parent_unique_id() : col.unique_id(); - const std::string& suffix_path = - col.has_path_info() ? 
escape_for_path_name(col.path_info_ptr()->get_path()) : ""; - for (size_t i = 0; i < _indexes.size(); i++) { - if (_indexes[i].index_type() == IndexType::INVERTED) { - for (int32_t id : _indexes[i].col_unique_ids()) { - if (id == col_unique_id && _indexes[i].get_index_suffix() == suffix_path) { - return true; - } - } - } - } - - return false; -} - -bool TabletSchema::has_inverted_index_with_index_id(int64_t index_id, - const std::string& suffix_name) const { +bool TabletSchema::has_inverted_index_with_index_id(int64_t index_id) const { for (size_t i = 0; i < _indexes.size(); i++) { - if (_indexes[i].index_type() == IndexType::INVERTED && - _indexes[i].get_index_suffix() == suffix_name && _indexes[i].index_id() == index_id) { + if (_indexes[i].index_type() == IndexType::INVERTED && _indexes[i].index_id() == index_id) { return true; } } return false; } -const TabletIndex* TabletSchema::get_inverted_index_with_index_id( - int64_t index_id, const std::string& suffix_name) const { - for (size_t i = 0; i < _indexes.size(); i++) { - if (_indexes[i].index_type() == IndexType::INVERTED && - _indexes[i].get_index_suffix() == suffix_name && _indexes[i].index_id() == index_id) { - return &(_indexes[i]); - } - } - - return nullptr; -} - -const TabletIndex* TabletSchema::get_inverted_index(int32_t col_unique_id, - const std::string& suffix_path) const { +const TabletIndex* TabletSchema::inverted_index(int32_t col_unique_id, + const std::string& suffix_path) const { for (size_t i = 0; i < _indexes.size(); i++) { if (_indexes[i].index_type() == IndexType::INVERTED) { for (int32_t id : _indexes[i].col_unique_ids()) { @@ -1408,19 +1400,15 @@ const TabletIndex* TabletSchema::get_inverted_index(int32_t col_unique_id, return nullptr; } -const TabletIndex* TabletSchema::get_inverted_index(const TabletColumn& col, - bool check_valid) const { - // With check_valid set to true by default +const TabletIndex* TabletSchema::inverted_index(const TabletColumn& col) const { // Some columns(Float, Double, JSONB ...) from the variant do not support inverted index - if (check_valid && !segment_v2::InvertedIndexColumnWriter::check_support_inverted_index(col)) { + if (!segment_v2::InvertedIndexColumnWriter::check_support_inverted_index(col)) { return nullptr; } // TODO use more efficient impl // Use parent id if unique not assigned, this could happend when accessing subcolumns of variants int32_t col_unique_id = col.is_extracted_column() ? col.parent_unique_id() : col.unique_id(); - const std::string& suffix_path = - col.has_path_info() ? 
escape_for_path_name(col.path_info_ptr()->get_path()) : ""; - return get_inverted_index(col_unique_id, suffix_path); + return inverted_index(col_unique_id, escape_for_path_name(col.suffix_path())); } bool TabletSchema::has_ngram_bf_index(int32_t col_unique_id) const { @@ -1449,7 +1437,6 @@ const TabletIndex* TabletSchema::get_ngram_bf_index(int32_t col_unique_id) const } } } - return nullptr; } @@ -1555,13 +1542,4 @@ bool operator!=(const TabletSchema& a, const TabletSchema& b) { return !(a == b); } -std::string TabletSchema::deterministic_string_serialize(const TabletSchemaPB& schema_pb) { - std::string output; - google::protobuf::io::StringOutputStream string_output_stream(&output); - google::protobuf::io::CodedOutputStream output_stream(&string_output_stream); - output_stream.SetSerializationDeterministic(true); - schema_pb.SerializeToCodedStream(&output_stream); - return output; -} - } // namespace doris diff --git a/be/src/olap/tablet_schema.h b/be/src/olap/tablet_schema.h index b7fe0e9310183d..1e66e36809d8d4 100644 --- a/be/src/olap/tablet_schema.h +++ b/be/src/olap/tablet_schema.h @@ -35,10 +35,12 @@ #include "common/consts.h" #include "common/status.h" #include "gutil/stringprintf.h" +#include "olap/metadata_adder.h" #include "olap/olap_common.h" #include "olap/rowset/segment_v2/options.h" #include "runtime/define_primitive_type.h" #include "runtime/descriptors.h" +#include "runtime/memory/lru_cache_policy.h" #include "util/string_util.h" #include "vec/aggregate_functions/aggregate_function.h" #include "vec/common/string_ref.h" @@ -60,7 +62,7 @@ class TabletColumn; using TabletColumnPtr = std::shared_ptr; -class TabletColumn { +class TabletColumn : public MetadataAdder { public: TabletColumn(); TabletColumn(const ColumnPB& column); @@ -163,6 +165,9 @@ class TabletColumn { bool is_extracted_column() const { return _column_path != nullptr && !_column_path->empty() && _parent_col_unique_id > 0; }; + std::string suffix_path() const { + return is_extracted_column() ? 
_column_path->get_path() : ""; + } bool is_nested_subcolumn() const { return _column_path != nullptr && _column_path->has_nested_part(); } @@ -223,13 +228,16 @@ class TabletColumn { bool _has_bitmap_index = false; bool _visible = true; - int32_t _parent_col_unique_id = -1; + std::vector _sub_columns; uint32_t _sub_column_count = 0; bool _result_is_nullable = false; int _be_exec_version = -1; - vectorized::PathInDataPtr _column_path; + + // The extracted sub-columns from "variant" contain the following information: + int32_t _parent_col_unique_id = -1; // "variant" -> col_unique_id + vectorized::PathInDataPtr _column_path; // the path of the sub-columns themselves // Record information about columns merged into a sparse column within a variant // `{"id": 100, "name" : "jack", "point" : 3.9}` @@ -246,7 +254,7 @@ bool operator!=(const TabletColumn& a, const TabletColumn& b); class TabletSchema; -class TabletIndex { +class TabletIndex : public MetadataAdder { public: TabletIndex() = default; void init_from_thrift(const TOlapTableIndex& index, const TabletSchema& tablet_schema); @@ -288,7 +296,7 @@ class TabletIndex { std::map _properties; }; -class TabletSchema { +class TabletSchema : public MetadataAdder { public: enum ColumnType { NORMAL = 0, DROPPED = 1, VARIANT = 2 }; // TODO(yingchun): better to make constructor as private to avoid @@ -297,14 +305,26 @@ class TabletSchema { TabletSchema(); virtual ~TabletSchema(); - void init_from_pb(const TabletSchemaPB& schema, bool ignore_extracted_columns = false); + // Init from pb + // ignore_extracted_columns: ignore the extracted columns from variant column + // reuse_cached_column: reuse the cached column in the schema if they are the same, to reduce memory usage + void init_from_pb(const TabletSchemaPB& schema, bool ignore_extracted_columns = false, + bool reuse_cached_column = false); // Notice: Use deterministic way to serialize protobuf, // since serialize Map in protobuf may could lead to un-deterministic by default - static std::string deterministic_string_serialize(const TabletSchemaPB& schema_pb); + template + static std::string deterministic_string_serialize(const PbType& pb) { + std::string output; + google::protobuf::io::StringOutputStream string_output_stream(&output); + google::protobuf::io::CodedOutputStream output_stream(&string_output_stream); + output_stream.SetSerializationDeterministic(true); + pb.SerializeToCodedStream(&output_stream); + return output; + } void to_schema_pb(TabletSchemaPB* tablet_meta_pb) const; void append_column(TabletColumn column, ColumnType col_type = ColumnType::NORMAL); - void append_index(TabletIndex index); - void update_index(const TabletColumn& column, TabletIndex index); + void append_index(TabletIndex&& index); + void update_index(const TabletColumn& column, const IndexType& index_type, TabletIndex&& index); void remove_index(int64_t index_id); void clear_index(); // Must make sure the row column is always the last column @@ -369,11 +389,21 @@ class TabletSchema { int32_t sequence_col_idx() const { return _sequence_col_idx; } void set_version_col_idx(int32_t version_col_idx) { _version_col_idx = version_col_idx; } int32_t version_col_idx() const { return _version_col_idx; } + bool has_skip_bitmap_col() const { return _skip_bitmap_col_idx != -1; } + int32_t skip_bitmap_col_idx() const { return _skip_bitmap_col_idx; } segment_v2::CompressionTypePB compression_type() const { return _compression_type; } void set_row_store_page_size(long page_size) { _row_store_page_size = page_size; } long 
row_store_page_size() const { return _row_store_page_size; } - const std::vector& indexes() const { return _indexes; } + const std::vector inverted_indexes() const { + std::vector inverted_indexes; + for (const auto& index : _indexes) { + if (index.index_type() == IndexType::INVERTED) { + inverted_indexes.emplace_back(&index); + } + } + return inverted_indexes; + } bool has_inverted_index() const { for (const auto& index : _indexes) { if (index.index_type() == IndexType::INVERTED) { @@ -382,17 +412,15 @@ class TabletSchema { } return false; } - std::vector get_indexes_for_column(const TabletColumn& col) const; - bool has_inverted_index(const TabletColumn& col) const; - bool has_inverted_index_with_index_id(int64_t index_id, const std::string& suffix_path) const; - const TabletIndex* get_inverted_index_with_index_id(int64_t index_id, - const std::string& suffix_name) const; - // check_valid: check if this column supports inverted index + bool has_inverted_index_with_index_id(int64_t index_id) const; + // Check whether this column supports inverted index // Some columns (Float, Double, JSONB ...) from the variant do not support index, but they are listed in TabletIndex. - // If returned, the index file will not be found. - const TabletIndex* get_inverted_index(const TabletColumn& col, bool check_valid = true) const; - const TabletIndex* get_inverted_index(int32_t col_unique_id, - const std::string& suffix_path) const; + const TabletIndex* inverted_index(const TabletColumn& col) const; + + // Regardless of whether this column supports inverted index + // TabletIndex information will be returned as long as it exists. + const TabletIndex* inverted_index(int32_t col_unique_id, + const std::string& suffix_path = "") const; bool has_ngram_bf_index(int32_t col_unique_id) const; const TabletIndex* get_ngram_bf_index(int32_t col_unique_id) const; void update_indexes_from_thrift(const std::vector& indexes); @@ -498,14 +526,19 @@ class TabletSchema { const std::vector& row_columns_uids() const { return _row_store_column_unique_ids; } + int64_t get_metadata_size() const override; + private: friend bool operator==(const TabletSchema& a, const TabletSchema& b); friend bool operator!=(const TabletSchema& a, const TabletSchema& b); + void clear_column_cache_handlers(); + KeysType _keys_type = DUP_KEYS; SortType _sort_type = SortType::LEXICAL; size_t _sort_col_num = 0; std::vector _cols; + std::vector _column_cache_handlers; std::vector _indexes; std::unordered_map _field_name_to_index; @@ -531,6 +564,7 @@ class TabletSchema { int32_t _delete_sign_idx = -1; int32_t _sequence_col_idx = -1; int32_t _version_col_idx = -1; + int32_t _skip_bitmap_col_idx = -1; int32_t _schema_version = -1; int64_t _table_id = -1; int64_t _db_id = -1; @@ -545,6 +579,7 @@ class TabletSchema { // ATTN: For compability reason empty cids means all columns of tablet schema are encoded to row column std::vector _row_store_column_unique_ids; bool _variant_enable_flatten_nested = false; + int64_t _vl_field_mem_size {0}; // variable length field }; bool operator==(const TabletSchema& a, const TabletSchema& b); diff --git a/be/src/olap/tablet_schema_cache.cpp b/be/src/olap/tablet_schema_cache.cpp index e339c947bb97a4..fd238fa5affb3f 100644 --- a/be/src/olap/tablet_schema_cache.cpp +++ b/be/src/olap/tablet_schema_cache.cpp @@ -18,30 +18,45 @@ #include "olap/tablet_schema_cache.h" #include +#include +#include #include "bvar/bvar.h" #include "olap/tablet_schema.h" +#include "util/sha.h" bvar::Adder 
g_tablet_schema_cache_count("tablet_schema_cache_count");
bvar::Adder g_tablet_schema_cache_columns_count("tablet_schema_cache_columns_count");
+bvar::Adder g_tablet_schema_cache_hit_count("tablet_schema_cache_hit_count");

namespace doris {

+// To reduce the memory consumption of using the serialized TabletSchema as the key,
+// store a SHA-256 signature of it instead; SHA-256 makes hash collisions practically impossible.
+static std::string get_key_signature(const std::string& origin) {
+    SHA256Digest digest;
+    digest.reset(origin.data(), origin.length());
+    return std::string {digest.digest().data(), digest.digest().length()};
+}
+
std::pair TabletSchemaCache::insert(const std::string& key) {
-    auto* lru_handle = lookup(key);
+    std::string key_signature = get_key_signature(key);
+    auto* lru_handle = lookup(key_signature);
    TabletSchemaSPtr tablet_schema_ptr;
    if (lru_handle) {
        auto* value = (CacheValue*)LRUCachePolicy::value(lru_handle);
        tablet_schema_ptr = value->tablet_schema;
+        g_tablet_schema_cache_hit_count << 1;
    } else {
        auto* value = new CacheValue;
        tablet_schema_ptr = std::make_shared();
        TabletSchemaPB pb;
        pb.ParseFromString(key);
-        tablet_schema_ptr->init_from_pb(pb);
+        // Reuse the memory of identical TabletColumn objects by setting reuse_cached_column to true.
+        tablet_schema_ptr->init_from_pb(pb, false, true);
        value->tablet_schema = tablet_schema_ptr;
-        lru_handle = LRUCachePolicy::insert(key, value, tablet_schema_ptr->num_columns(), 0,
-                                            CachePriority::NORMAL);
+        lru_handle = LRUCachePolicy::insert(key_signature, value, tablet_schema_ptr->num_columns(),
+                                            0, CachePriority::NORMAL);
        g_tablet_schema_cache_count << 1;
        g_tablet_schema_cache_columns_count << tablet_schema_ptr->num_columns();
    }
diff --git a/be/src/olap/task/engine_checksum_task.cpp b/be/src/olap/task/engine_checksum_task.cpp
index d0c4b0e45f468e..05ecfc0401b6d0 100644
--- a/be/src/olap/task/engine_checksum_task.cpp
+++ b/be/src/olap/task/engine_checksum_task.cpp
@@ -93,7 +93,7 @@ Status EngineChecksumTask::_compute_checksum() {
    }
    size_t input_size = 0;
    for (const auto& rowset : input_rowsets) {
-        input_size += rowset->data_disk_size();
+        input_size += rowset->total_disk_size();
    }
    auto res = reader.init(reader_params);
diff --git a/be/src/olap/task/engine_publish_version_task.cpp b/be/src/olap/task/engine_publish_version_task.cpp
index dae4c6be814d5a..75e589f3b97728 100644
--- a/be/src/olap/task/engine_publish_version_task.cpp
+++ b/be/src/olap/task/engine_publish_version_task.cpp
@@ -82,8 +82,10 @@ EnginePublishVersionTask::EnginePublishVersionTask(
        _succ_tablets(succ_tablets),
        _discontinuous_version_tablets(discontinuous_version_tablets),
        _table_id_to_tablet_id_to_num_delta_rows(table_id_to_tablet_id_to_num_delta_rows) {
-    _mem_tracker = MemTrackerLimiter::create_shared(MemTrackerLimiter::Type::OTHER,
-                                                    "TabletPublishTxnTask");
+    _mem_tracker = MemTrackerLimiter::create_shared(
+            MemTrackerLimiter::Type::OTHER,
+            fmt::format("EnginePublishVersionTask-transactionID_{}",
+                        std::to_string(_publish_version_req.transaction_id)));
}

void EnginePublishVersionTask::add_error_tablet_id(int64_t tablet_id) {
@@ -381,8 +383,11 @@ TabletPublishTxnTask::TabletPublishTxnTask(StorageEngine& engine,
        _transaction_id(transaction_id),
        _version(version),
        _tablet_info(tablet_info),
-        _mem_tracker(MemTrackerLimiter::create_shared(MemTrackerLimiter::Type::OTHER,
-                                                      "TabletPublishTxnTask")) {
+        _mem_tracker(MemTrackerLimiter::create_shared(
+                MemTrackerLimiter::Type::OTHER,
+                fmt::format("TabletPublishTxnTask-partitionID_{}-transactionID_{}-version_{}",
+                            std::to_string(partition_id),
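TabletSchemaCache still parses the full serialized TabletSchemaPB to build the cached value, but the LRU key shrinks to a fixed 32-byte SHA-256 signature. A sketch of the same idea using OpenSSL as an illustrative substitute for Doris's SHA256Digest from util/sha.h:

    #include <openssl/sha.h>

    #include <string>

    // 32 raw digest bytes replace a potentially multi-kilobyte serialized
    // TabletSchemaPB as the cache key; collisions are cryptographically negligible.
    std::string get_key_signature(const std::string& origin) {
        unsigned char digest[SHA256_DIGEST_LENGTH];
        SHA256(reinterpret_cast<const unsigned char*>(origin.data()), origin.size(), digest);
        return std::string(reinterpret_cast<const char*>(digest), SHA256_DIGEST_LENGTH);
    }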
std::to_string(transaction_id), + version.to_string()))) { _stats.submit_time_us = MonotonicMicros(); } diff --git a/be/src/olap/task/engine_storage_migration_task.cpp b/be/src/olap/task/engine_storage_migration_task.cpp index 21be34a334dd8d..a300e6e0f09fa3 100644 --- a/be/src/olap/task/engine_storage_migration_task.cpp +++ b/be/src/olap/task/engine_storage_migration_task.cpp @@ -407,11 +407,8 @@ Status EngineStorageMigrationTask::_copy_index_and_data_files( if (tablet_schema.get_inverted_index_storage_format() == InvertedIndexStorageFormatPB::V1) { - for (const auto& index : tablet_schema.indexes()) { - if (index.index_type() != IndexType::INVERTED) { - continue; - } - auto index_id = index.index_id(); + for (const auto& index : tablet_schema.inverted_indexes()) { + auto index_id = index->index_id(); auto index_file = _tablet->get_segment_index_filepath(rowset_id, segment_index, index_id); auto snapshot_segment_index_file_path = diff --git a/be/src/olap/task/index_builder.cpp b/be/src/olap/task/index_builder.cpp index 38a52d1d2118aa..2ce31527f61c64 100644 --- a/be/src/olap/task/index_builder.cpp +++ b/be/src/olap/task/index_builder.cpp @@ -68,8 +68,11 @@ Status IndexBuilder::update_inverted_index_info() { _output_rowsets.reserve(_input_rowsets.size()); _pending_rs_guards.reserve(_input_rowsets.size()); for (auto&& input_rowset : _input_rowsets) { - if (!input_rowset->is_local()) [[unlikely]] { - DCHECK(false) << _tablet->tablet_id() << ' ' << input_rowset->rowset_id(); + bool is_local_rowset = input_rowset->is_local(); + DBUG_EXECUTE_IF("IndexBuilder::update_inverted_index_info_is_local_rowset", + { is_local_rowset = false; }) + if (!is_local_rowset) [[unlikely]] { + // DCHECK(false) << _tablet->tablet_id() << ' ' << input_rowset->rowset_id(); return Status::InternalError("should be local rowset. tablet_id={} rowset_id={}", _tablet->tablet_id(), input_rowset->rowset_id().to_string()); @@ -81,6 +84,9 @@ Status IndexBuilder::update_inverted_index_info() { size_t total_index_size = 0; auto* beta_rowset = reinterpret_cast(input_rowset.get()); auto size_st = beta_rowset->get_inverted_index_size(&total_index_size); + DBUG_EXECUTE_IF("IndexBuilder::update_inverted_index_info_size_st_not_ok", { + size_st = Status::Error("debug point: get fs failed"); + }) if (!size_st.ok() && !size_st.is() && !size_st.is()) { return size_st; @@ -94,13 +100,19 @@ Status IndexBuilder::update_inverted_index_info() { auto column_name = t_inverted_index.columns[0]; auto column_idx = output_rs_tablet_schema->field_index(column_name); if (column_idx < 0) { - LOG(WARNING) << "referenced column was missing. " - << "[column=" << column_name << " referenced_column=" << column_idx - << "]"; - continue; + if (!t_inverted_index.column_unique_ids.empty()) { + auto column_unique_id = t_inverted_index.column_unique_ids[0]; + column_idx = output_rs_tablet_schema->field_index(column_unique_id); + } + if (column_idx < 0) { + LOG(WARNING) << "referenced column was missing. 
" + << "[column=" << column_name + << " referenced_column=" << column_idx << "]"; + continue; + } } auto column = output_rs_tablet_schema->column(column_idx); - const auto* index_meta = output_rs_tablet_schema->get_inverted_index(column); + const auto* index_meta = output_rs_tablet_schema->inverted_index(column); if (index_meta == nullptr) { LOG(ERROR) << "failed to find column: " << column_name << " index_id: " << t_inverted_index.index_id; @@ -136,12 +148,7 @@ Status IndexBuilder::update_inverted_index_info() { return Status::Error( "indexes count cannot be negative"); } - int32_t indexes_size = 0; - for (auto index : output_rs_tablet_schema->indexes()) { - if (index.index_type() == IndexType::INVERTED) { - indexes_size++; - } - } + int32_t indexes_size = output_rs_tablet_schema->inverted_indexes().size(); if (indexes_count != indexes_size) { return Status::Error( "indexes count not equal to expected"); @@ -159,11 +166,11 @@ Status IndexBuilder::update_inverted_index_info() { LOG(WARNING) << "referenced column was missing. " << "[column=" << t_inverted_index.columns[0] << " referenced_column=" << column_uid << "]"; - output_rs_tablet_schema->append_index(index); + output_rs_tablet_schema->append_index(std::move(index)); continue; } const TabletColumn& col = output_rs_tablet_schema->column_by_uid(column_uid); - const TabletIndex* exist_index = output_rs_tablet_schema->get_inverted_index(col); + const TabletIndex* exist_index = output_rs_tablet_schema->inverted_index(col); if (exist_index && exist_index->index_id() != index.index_id()) { LOG(WARNING) << fmt::format( "column: {} has a exist inverted index, but the index id not equal " @@ -173,7 +180,7 @@ Status IndexBuilder::update_inverted_index_info() { without_index_uids.insert(exist_index->index_id()); output_rs_tablet_schema->remove_index(exist_index->index_id()); } - output_rs_tablet_schema->append_index(index); + output_rs_tablet_schema->append_index(std::move(index)); } } // construct input rowset reader @@ -207,13 +214,12 @@ Status IndexBuilder::update_inverted_index_info() { InvertedIndexStorageFormatPB::V1) { if (_is_drop_op) { VLOG_DEBUG << "data_disk_size:" << input_rowset_meta->data_disk_size() - << " total_disk_size:" << input_rowset_meta->data_disk_size() + << " total_disk_size:" << input_rowset_meta->total_disk_size() << " index_disk_size:" << input_rowset_meta->index_disk_size() << " drop_index_size:" << drop_index_size; rowset_meta->set_total_disk_size(input_rowset_meta->total_disk_size() - drop_index_size); - rowset_meta->set_data_disk_size(input_rowset_meta->data_disk_size() - - drop_index_size); + rowset_meta->set_data_disk_size(input_rowset_meta->data_disk_size()); rowset_meta->set_index_disk_size(input_rowset_meta->index_disk_size() - drop_index_size); } else { @@ -229,6 +235,11 @@ Status IndexBuilder::update_inverted_index_info() { std::string {InvertedIndexDescriptor::get_index_file_path_prefix(seg_path)}, output_rs_tablet_schema->get_inverted_index_storage_format()); auto st = idx_file_reader->init(); + DBUG_EXECUTE_IF( + "IndexBuilder::update_inverted_index_info_index_file_reader_init_not_ok", { + st = Status::Error( + "debug point: reader init error"); + }) if (!st.ok() && !st.is()) { return st; } @@ -238,7 +249,7 @@ Status IndexBuilder::update_inverted_index_info() { } rowset_meta->set_total_disk_size(input_rowset_meta->total_disk_size() - total_index_size); - rowset_meta->set_data_disk_size(input_rowset_meta->data_disk_size() - total_index_size); + 
rowset_meta->set_data_disk_size(input_rowset_meta->data_disk_size()); rowset_meta->set_index_disk_size(input_rowset_meta->index_disk_size() - total_index_size); } @@ -262,8 +273,11 @@ Status IndexBuilder::update_inverted_index_info() { Status IndexBuilder::handle_single_rowset(RowsetMetaSharedPtr output_rowset_meta, std::vector& segments) { - if (!output_rowset_meta->is_local()) [[unlikely]] { - DCHECK(false) << _tablet->tablet_id() << ' ' << output_rowset_meta->rowset_id(); + bool is_local_rowset = output_rowset_meta->is_local(); + DBUG_EXECUTE_IF("IndexBuilder::handle_single_rowset_is_local_rowset", + { is_local_rowset = false; }) + if (!is_local_rowset) [[unlikely]] { + // DCHECK(false) << _tablet->tablet_id() << ' ' << output_rowset_meta->rowset_id(); return Status::InternalError("should be local rowset. tablet_id={} rowset_id={}", _tablet->tablet_id(), output_rowset_meta->rowset_id().to_string()); @@ -280,6 +294,8 @@ Status IndexBuilder::handle_single_rowset(RowsetMetaSharedPtr output_rowset_meta for (auto& seg_ptr : segments) { auto idx_file_reader_iter = _inverted_index_file_readers.find( std::make_pair(output_rowset_meta->rowset_id().to_string(), seg_ptr->id())); + DBUG_EXECUTE_IF("IndexBuilder::handle_single_rowset_can_not_find_reader_drop_op", + { idx_file_reader_iter = _inverted_index_file_readers.end(); }) if (idx_file_reader_iter == _inverted_index_file_readers.end()) { LOG(ERROR) << "idx_file_reader_iter" << output_rowset_meta->rowset_id() << ":" << seg_ptr->id() << " cannot be found"; @@ -292,10 +308,20 @@ Status IndexBuilder::handle_single_rowset(RowsetMetaSharedPtr output_rowset_meta _tablet->tablet_path(), output_rowset_meta->rowset_id().to_string(), seg_ptr->id()))}; + std::string index_path = + InvertedIndexDescriptor::get_index_file_path_v2(index_path_prefix); + io::FileWriterPtr file_writer; + Status st = fs->create_file(index_path, &file_writer); + if (!st.ok()) { + LOG(WARNING) << "failed to create writable file. 
path=" << index_path + << ", err: " << st; + return st; + } auto inverted_index_file_writer = std::make_unique( fs, std::move(index_path_prefix), output_rowset_meta->rowset_id().to_string(), seg_ptr->id(), - output_rowset_schema->get_inverted_index_storage_format()); + output_rowset_schema->get_inverted_index_storage_format(), + std::move(file_writer)); RETURN_IF_ERROR(inverted_index_file_writer->initialize(dirs)); // create inverted index writer for (auto& index_meta : _dropped_inverted_indexes) { @@ -313,8 +339,7 @@ Status IndexBuilder::handle_single_rowset(RowsetMetaSharedPtr output_rowset_meta inverted_index_size += inverted_index_writer->get_index_file_total_size(); } _inverted_index_file_writers.clear(); - output_rowset_meta->set_data_disk_size(output_rowset_meta->data_disk_size() + - inverted_index_size); + output_rowset_meta->set_data_disk_size(output_rowset_meta->data_disk_size()); output_rowset_meta->set_total_disk_size(output_rowset_meta->total_disk_size() + inverted_index_size); output_rowset_meta->set_index_disk_size(output_rowset_meta->index_disk_size() + @@ -341,15 +366,27 @@ Status IndexBuilder::handle_single_rowset(RowsetMetaSharedPtr output_rowset_meta InvertedIndexStorageFormatPB::V2) { auto idx_file_reader_iter = _inverted_index_file_readers.find( std::make_pair(output_rowset_meta->rowset_id().to_string(), seg_ptr->id())); + DBUG_EXECUTE_IF("IndexBuilder::handle_single_rowset_can_not_find_reader", + { idx_file_reader_iter = _inverted_index_file_readers.end(); }) if (idx_file_reader_iter == _inverted_index_file_readers.end()) { LOG(ERROR) << "idx_file_reader_iter" << output_rowset_meta->rowset_id() << ":" << seg_ptr->id() << " cannot be found"; continue; } + std::string index_path = + InvertedIndexDescriptor::get_index_file_path_v2(index_path_prefix); + io::FileWriterPtr file_writer; + Status st = fs->create_file(index_path, &file_writer); + if (!st.ok()) { + LOG(WARNING) << "failed to create writable file. path=" << index_path + << ", err: " << st; + return st; + } auto dirs = DORIS_TRY(idx_file_reader_iter->second->get_all_directories()); inverted_index_file_writer = std::make_unique( fs, index_path_prefix, output_rowset_meta->rowset_id().to_string(), - seg_ptr->id(), output_rowset_schema->get_inverted_index_storage_format()); + seg_ptr->id(), output_rowset_schema->get_inverted_index_storage_format(), + std::move(file_writer)); RETURN_IF_ERROR(inverted_index_file_writer->initialize(dirs)); } else { inverted_index_file_writer = std::make_unique( @@ -363,25 +400,42 @@ Status IndexBuilder::handle_single_rowset(RowsetMetaSharedPtr output_rowset_meta auto column_name = inverted_index.columns[0]; auto column_idx = output_rowset_schema->field_index(column_name); if (column_idx < 0) { - LOG(WARNING) << "referenced column was missing. " - << "[column=" << column_name << " referenced_column=" << column_idx - << "]"; - continue; + if (!inverted_index.column_unique_ids.empty()) { + column_idx = output_rowset_schema->field_index( + inverted_index.column_unique_ids[0]); + } + if (column_idx < 0) { + LOG(WARNING) << "referenced column was missing. 
" + << "[column=" << column_name + << " referenced_column=" << column_idx << "]"; + continue; + } } auto column = output_rowset_schema->column(column_idx); - if (!InvertedIndexColumnWriter::check_support_inverted_index(column)) { + // variant column is not support for building index + auto is_support_inverted_index = + InvertedIndexColumnWriter::check_support_inverted_index(column); + DBUG_EXECUTE_IF("IndexBuilder::handle_single_rowset_support_inverted_index", + { is_support_inverted_index = false; }) + if (!is_support_inverted_index) { continue; } - DCHECK(output_rowset_schema->has_inverted_index_with_index_id(index_id, "")); + DCHECK(output_rowset_schema->has_inverted_index_with_index_id(index_id)); _olap_data_convertor->add_column_data_convertor(column); return_columns.emplace_back(column_idx); std::unique_ptr field(FieldFactory::create(column)); - const auto* index_meta = output_rowset_schema->get_inverted_index(column); + const auto* index_meta = output_rowset_schema->inverted_index(column); std::unique_ptr inverted_index_builder; try { RETURN_IF_ERROR(segment_v2::InvertedIndexColumnWriter::create( field.get(), &inverted_index_builder, inverted_index_file_writer.get(), index_meta)); + DBUG_EXECUTE_IF( + "IndexBuilder::handle_single_rowset_index_column_writer_create_error", { + _CLTHROWA(CL_ERR_IO, + "debug point: " + "handle_single_rowset_index_column_writer_create_error"); + }) } catch (const std::exception& e) { return Status::Error( "CLuceneError occured: {}", e.what()); @@ -412,6 +466,10 @@ Status IndexBuilder::handle_single_rowset(RowsetMetaSharedPtr output_rowset_meta std::make_shared(output_rowset_schema->columns(), return_columns); std::unique_ptr iter; auto res = seg_ptr->new_iterator(schema, read_options, &iter); + DBUG_EXECUTE_IF("IndexBuilder::handle_single_rowset_create_iterator_error", { + res = Status::Error( + "debug point: handle_single_rowset_create_iterator_error"); + }) if (!res.ok()) { LOG(WARNING) << "failed to create iterator[" << seg_ptr->id() << "]: " << res.to_string(); @@ -422,7 +480,7 @@ Status IndexBuilder::handle_single_rowset(RowsetMetaSharedPtr output_rowset_meta output_rowset_schema->create_block(return_columns)); while (true) { auto status = iter->next_batch(block.get()); - DBUG_EXECUTE_IF("IndexBuilder::handle_single_rowset", { + DBUG_EXECUTE_IF("IndexBuilder::handle_single_rowset_iterator_next_batch_error", { status = Status::Error( "next_batch fault injection"); }); @@ -437,8 +495,15 @@ Status IndexBuilder::handle_single_rowset(RowsetMetaSharedPtr output_rowset_meta } // write inverted index data - if (_write_inverted_index_data(output_rowset_schema, iter->data_id(), - block.get()) != Status::OK()) { + status = _write_inverted_index_data(output_rowset_schema, iter->data_id(), + block.get()); + DBUG_EXECUTE_IF( + "IndexBuilder::handle_single_rowset_write_inverted_index_data_error", { + status = Status::Error( + "debug point: " + "handle_single_rowset_write_inverted_index_data_error"); + }) + if (!status.ok()) { return Status::Error( "failed to write block."); } @@ -451,6 +516,10 @@ Status IndexBuilder::handle_single_rowset(RowsetMetaSharedPtr output_rowset_meta if (_inverted_index_builders[writer_sign]) { RETURN_IF_ERROR(_inverted_index_builders[writer_sign]->finish()); } + DBUG_EXECUTE_IF("IndexBuilder::handle_single_rowset_index_build_finish_error", { + _CLTHROWA(CL_ERR_IO, + "debug point: handle_single_rowset_index_build_finish_error"); + }) } catch (const std::exception& e) { return Status::Error( "CLuceneError occured: {}", e.what()); @@ -461,6 
+530,10 @@ Status IndexBuilder::handle_single_rowset(RowsetMetaSharedPtr output_rowset_meta } for (auto&& [seg_id, inverted_index_file_writer] : _inverted_index_file_writers) { auto st = inverted_index_file_writer->close(); + DBUG_EXECUTE_IF("IndexBuilder::handle_single_rowset_file_writer_close_error", { + st = Status::Error( + "debug point: handle_single_rowset_file_writer_close_error"); + }) if (!st.ok()) { LOG(ERROR) << "close inverted_index_writer error:" << st; return st; @@ -469,8 +542,7 @@ Status IndexBuilder::handle_single_rowset(RowsetMetaSharedPtr output_rowset_meta } _inverted_index_builders.clear(); _inverted_index_file_writers.clear(); - output_rowset_meta->set_data_disk_size(output_rowset_meta->data_disk_size() + - inverted_index_size); + output_rowset_meta->set_data_disk_size(output_rowset_meta->data_disk_size()); output_rowset_meta->set_total_disk_size(output_rowset_meta->total_disk_size() + inverted_index_size); output_rowset_meta->set_index_disk_size(output_rowset_meta->index_disk_size() + @@ -491,15 +563,28 @@ Status IndexBuilder::_write_inverted_index_data(TabletSchemaSPtr tablet_schema, auto index_id = inverted_index.index_id; auto column_name = inverted_index.columns[0]; auto column_idx = tablet_schema->field_index(column_name); + DBUG_EXECUTE_IF("IndexBuilder::_write_inverted_index_data_column_idx_is_negative", + { column_idx = -1; }) if (column_idx < 0) { - LOG(WARNING) << "referenced column was missing. " - << "[column=" << column_name << " referenced_column=" << column_idx << "]"; - continue; + if (!inverted_index.column_unique_ids.empty()) { + auto column_unique_id = inverted_index.column_unique_ids[0]; + column_idx = tablet_schema->field_index(column_unique_id); + } + if (column_idx < 0) { + LOG(WARNING) << "referenced column was missing. 
" + << "[column=" << column_name << " referenced_column=" << column_idx + << "]"; + continue; + } } auto column = tablet_schema->column(column_idx); auto writer_sign = std::make_pair(segment_idx, index_id); std::unique_ptr field(FieldFactory::create(column)); auto converted_result = _olap_data_convertor->convert_column_data(i); + DBUG_EXECUTE_IF("IndexBuilder::_write_inverted_index_data_convert_column_data_error", { + converted_result.first = Status::Error( + "debug point: _write_inverted_index_data_convert_column_data_error"); + }) if (converted_result.first != Status::OK()) { LOG(WARNING) << "failed to convert block, errcode: " << converted_result.first; return converted_result.first; @@ -551,10 +636,20 @@ Status IndexBuilder::_add_nullable(const std::string& column_name, field->get_sub_field(0)->size(), reinterpret_cast(data), reinterpret_cast(nested_null_map), offsets_ptr, num_rows)); } + DBUG_EXECUTE_IF("IndexBuilder::_add_nullable_add_array_values_error", { + _CLTHROWA(CL_ERR_IO, "debug point: _add_nullable_add_array_values_error"); + }) } catch (const std::exception& e) { return Status::Error( "CLuceneError occured: {}", e.what()); } + // we should refresh nullmap for array + for (int row_id = 0; row_id < num_rows; row_id++) { + if (null_map && null_map[row_id] == 1) { + RETURN_IF_ERROR( + _inverted_index_builders[index_writer_sign]->add_array_nulls(row_id)); + } + } return Status::OK(); } @@ -569,6 +664,8 @@ Status IndexBuilder::_add_nullable(const std::string& column_name, } *ptr += field->size() * step; offset += step; + DBUG_EXECUTE_IF("IndexBuilder::_add_nullable_throw_exception", + { _CLTHROWA(CL_ERR_IO, "debug point: _add_nullable_throw_exception"); }) } while (offset < num_rows); } catch (const std::exception& e) { return Status::Error("CLuceneError occured: {}", @@ -601,6 +698,8 @@ Status IndexBuilder::_add_data(const std::string& column_name, RETURN_IF_ERROR(_inverted_index_builders[index_writer_sign]->add_values( column_name, *ptr, num_rows)); } + DBUG_EXECUTE_IF("IndexBuilder::_add_data_throw_exception", + { _CLTHROWA(CL_ERR_IO, "debug point: _add_data_throw_exception"); }) } catch (const std::exception& e) { return Status::Error("CLuceneError occured: {}", e.what()); @@ -626,6 +725,8 @@ Status IndexBuilder::handle_inverted_index_data() { Status IndexBuilder::do_build_inverted_index() { LOG(INFO) << "begin to do_build_inverted_index, tablet=" << _tablet->tablet_id() << ", is_drop_op=" << _is_drop_op; + DBUG_EXECUTE_IF("IndexBuilder::do_build_inverted_index_alter_inverted_indexes_empty", + { _alter_inverted_indexes.clear(); }) if (_alter_inverted_indexes.empty()) { return Status::OK(); } @@ -633,37 +734,41 @@ Status IndexBuilder::do_build_inverted_index() { std::unique_lock schema_change_lock(_tablet->get_schema_change_lock(), std::try_to_lock); if (!schema_change_lock.owns_lock()) { - return Status::Error("try schema_change_lock failed"); + return Status::ObtainLockFailed("try schema_change_lock failed. tablet={} ", + _tablet->tablet_id()); } // Check executing serially with compaction task. std::unique_lock base_compaction_lock(_tablet->get_base_compaction_lock(), std::try_to_lock); if (!base_compaction_lock.owns_lock()) { - return Status::Error("try base_compaction_lock failed"); + return Status::ObtainLockFailed("try base_compaction_lock failed. 
tablet={} ", + _tablet->tablet_id()); } std::unique_lock cumu_compaction_lock(_tablet->get_cumulative_compaction_lock(), std::try_to_lock); if (!cumu_compaction_lock.owns_lock()) { - return Status::Error("try cumu_compaction_lock failed"); + return Status::ObtainLockFailed("try cumu_compaction_lock failed. tablet={}", + _tablet->tablet_id()); } std::unique_lock cold_compaction_lock(_tablet->get_cold_compaction_lock(), std::try_to_lock); if (!cold_compaction_lock.owns_lock()) { - return Status::Error("try cold_compaction_lock failed"); + return Status::ObtainLockFailed("try cold_compaction_lock failed. tablet={}", + _tablet->tablet_id()); } std::unique_lock build_inverted_index_lock(_tablet->get_build_inverted_index_lock(), std::try_to_lock); if (!build_inverted_index_lock.owns_lock()) { - return Status::Error( - "failed to obtain build inverted index lock. tablet={}", _tablet->tablet_id()); + return Status::ObtainLockFailed("failed to obtain build inverted index lock. tablet={}", + _tablet->tablet_id()); } std::shared_lock migration_rlock(_tablet->get_migration_lock(), std::try_to_lock); if (!migration_rlock.owns_lock()) { - return Status::Error("got migration_rlock failed. tablet={}", - _tablet->tablet_id()); + return Status::ObtainLockFailed("got migration_rlock failed. tablet={}", + _tablet->tablet_id()); } _input_rowsets = @@ -692,6 +797,10 @@ Status IndexBuilder::do_build_inverted_index() { // modify rowsets in memory st = modify_rowsets(); + DBUG_EXECUTE_IF("IndexBuilder::do_build_inverted_index_modify_rowsets_status_error", { + st = Status::Error( + "debug point: do_build_inverted_index_modify_rowsets_status_error"); + }) if (!st.ok()) { LOG(WARNING) << "failed to modify rowsets in memory. " << "tablet=" << _tablet->tablet_id() << ", error=" << st; @@ -749,7 +858,10 @@ Status IndexBuilder::modify_rowsets(const Merger::Statistics* stats) { void IndexBuilder::gc_output_rowset() { for (auto&& output_rowset : _output_rowsets) { - if (!output_rowset->is_local()) { + auto is_local_rowset = output_rowset->is_local(); + DBUG_EXECUTE_IF("IndexBuilder::gc_output_rowset_is_local_rowset", + { is_local_rowset = false; }) + if (!is_local_rowset) { _tablet->record_unused_remote_rowset(output_rowset->rowset_id(), output_rowset->rowset_meta()->resource_id(), output_rowset->num_segments()); diff --git a/be/src/olap/txn_manager.cpp b/be/src/olap/txn_manager.cpp index 1dd2d52f33b8ac..d227f53053128b 100644 --- a/be/src/olap/txn_manager.cpp +++ b/be/src/olap/txn_manager.cpp @@ -89,6 +89,15 @@ TxnManager::TxnManager(StorageEngine& engine, int32_t txn_map_shard_size, int32_ Status TxnManager::prepare_txn(TPartitionId partition_id, const Tablet& tablet, TTransactionId transaction_id, const PUniqueId& load_id, bool ingest) { + // check if the tablet has already been shutdown. If it has, it indicates that + // it is an old tablet, and data should not be imported into the old tablet. + // Otherwise, it may lead to data loss during migration. + if (tablet.tablet_state() == TABLET_SHUTDOWN) { + return Status::InternalError( + "The tablet's state is shutdown, tablet_id: {}. The tablet may have been dropped " + "or migrationed. 
Please check if the table has been dropped or try again.", + tablet.tablet_id()); + } return prepare_txn(partition_id, transaction_id, tablet.tablet_id(), tablet.tablet_uid(), load_id, ingest); } @@ -374,7 +383,7 @@ Status TxnManager::commit_txn(OlapMeta* meta, TPartitionId partition_id, return save_status; } - if (partial_update_info && partial_update_info->is_partial_update) { + if (partial_update_info && partial_update_info->is_partial_update()) { PartialUpdateInfoPB partial_update_info_pb; partial_update_info->to_pb(&partial_update_info_pb); save_status = RowsetMetaManager::save_partial_update_info( @@ -397,7 +406,7 @@ Status TxnManager::commit_txn(OlapMeta* meta, TPartitionId partition_id, if (st.ok()) { decoded_partial_update_info = std::make_shared(); decoded_partial_update_info->from_pb(&partial_update_info_pb); - DCHECK(decoded_partial_update_info->is_partial_update); + DCHECK(decoded_partial_update_info->is_partial_update()); } else if (!st.is()) { // the load is not a partial update return st; @@ -555,7 +564,7 @@ Status TxnManager::publish_txn(OlapMeta* meta, TPartitionId partition_id, } if (tablet_txn_info->unique_key_merge_on_write && tablet_txn_info->partial_update_info && - tablet_txn_info->partial_update_info->is_partial_update) { + tablet_txn_info->partial_update_info->is_partial_update()) { status = RowsetMetaManager::remove_partial_update_info(meta, tablet_id, partition_id, transaction_id); if (!status) { diff --git a/be/src/olap/utils.cpp b/be/src/olap/utils.cpp index 5ae8b7ab9df244..52d05133379143 100644 --- a/be/src/olap/utils.cpp +++ b/be/src/olap/utils.cpp @@ -56,319 +56,6 @@ uint32_t olap_adler32(uint32_t adler, const char* buf, size_t len) { return adler32(adler, reinterpret_cast(buf), len); } -// implement crc32c by looking up 8 tables -static const unsigned int T8_0[256] = { - 0x00000000, 0xF26B8303, 0xE13B70F7, 0x1350F3F4, 0xC79A971F, 0x35F1141C, 0x26A1E7E8, - 0xD4CA64EB, 0x8AD958CF, 0x78B2DBCC, 0x6BE22838, 0x9989AB3B, 0x4D43CFD0, 0xBF284CD3, - 0xAC78BF27, 0x5E133C24, 0x105EC76F, 0xE235446C, 0xF165B798, 0x030E349B, 0xD7C45070, - 0x25AFD373, 0x36FF2087, 0xC494A384, 0x9A879FA0, 0x68EC1CA3, 0x7BBCEF57, 0x89D76C54, - 0x5D1D08BF, 0xAF768BBC, 0xBC267848, 0x4E4DFB4B, 0x20BD8EDE, 0xD2D60DDD, 0xC186FE29, - 0x33ED7D2A, 0xE72719C1, 0x154C9AC2, 0x061C6936, 0xF477EA35, 0xAA64D611, 0x580F5512, - 0x4B5FA6E6, 0xB93425E5, 0x6DFE410E, 0x9F95C20D, 0x8CC531F9, 0x7EAEB2FA, 0x30E349B1, - 0xC288CAB2, 0xD1D83946, 0x23B3BA45, 0xF779DEAE, 0x05125DAD, 0x1642AE59, 0xE4292D5A, - 0xBA3A117E, 0x4851927D, 0x5B016189, 0xA96AE28A, 0x7DA08661, 0x8FCB0562, 0x9C9BF696, - 0x6EF07595, 0x417B1DBC, 0xB3109EBF, 0xA0406D4B, 0x522BEE48, 0x86E18AA3, 0x748A09A0, - 0x67DAFA54, 0x95B17957, 0xCBA24573, 0x39C9C670, 0x2A993584, 0xD8F2B687, 0x0C38D26C, - 0xFE53516F, 0xED03A29B, 0x1F682198, 0x5125DAD3, 0xA34E59D0, 0xB01EAA24, 0x42752927, - 0x96BF4DCC, 0x64D4CECF, 0x77843D3B, 0x85EFBE38, 0xDBFC821C, 0x2997011F, 0x3AC7F2EB, - 0xC8AC71E8, 0x1C661503, 0xEE0D9600, 0xFD5D65F4, 0x0F36E6F7, 0x61C69362, 0x93AD1061, - 0x80FDE395, 0x72966096, 0xA65C047D, 0x5437877E, 0x4767748A, 0xB50CF789, 0xEB1FCBAD, - 0x197448AE, 0x0A24BB5A, 0xF84F3859, 0x2C855CB2, 0xDEEEDFB1, 0xCDBE2C45, 0x3FD5AF46, - 0x7198540D, 0x83F3D70E, 0x90A324FA, 0x62C8A7F9, 0xB602C312, 0x44694011, 0x5739B3E5, - 0xA55230E6, 0xFB410CC2, 0x092A8FC1, 0x1A7A7C35, 0xE811FF36, 0x3CDB9BDD, 0xCEB018DE, - 0xDDE0EB2A, 0x2F8B6829, 0x82F63B78, 0x709DB87B, 0x63CD4B8F, 0x91A6C88C, 0x456CAC67, - 0xB7072F64, 0xA457DC90, 0x563C5F93, 0x082F63B7, 0xFA44E0B4, 0xE9141340, 0x1B7F9043, 
- 0xCFB5F4A8, 0x3DDE77AB, 0x2E8E845F, 0xDCE5075C, 0x92A8FC17, 0x60C37F14, 0x73938CE0, - 0x81F80FE3, 0x55326B08, 0xA759E80B, 0xB4091BFF, 0x466298FC, 0x1871A4D8, 0xEA1A27DB, - 0xF94AD42F, 0x0B21572C, 0xDFEB33C7, 0x2D80B0C4, 0x3ED04330, 0xCCBBC033, 0xA24BB5A6, - 0x502036A5, 0x4370C551, 0xB11B4652, 0x65D122B9, 0x97BAA1BA, 0x84EA524E, 0x7681D14D, - 0x2892ED69, 0xDAF96E6A, 0xC9A99D9E, 0x3BC21E9D, 0xEF087A76, 0x1D63F975, 0x0E330A81, - 0xFC588982, 0xB21572C9, 0x407EF1CA, 0x532E023E, 0xA145813D, 0x758FE5D6, 0x87E466D5, - 0x94B49521, 0x66DF1622, 0x38CC2A06, 0xCAA7A905, 0xD9F75AF1, 0x2B9CD9F2, 0xFF56BD19, - 0x0D3D3E1A, 0x1E6DCDEE, 0xEC064EED, 0xC38D26C4, 0x31E6A5C7, 0x22B65633, 0xD0DDD530, - 0x0417B1DB, 0xF67C32D8, 0xE52CC12C, 0x1747422F, 0x49547E0B, 0xBB3FFD08, 0xA86F0EFC, - 0x5A048DFF, 0x8ECEE914, 0x7CA56A17, 0x6FF599E3, 0x9D9E1AE0, 0xD3D3E1AB, 0x21B862A8, - 0x32E8915C, 0xC083125F, 0x144976B4, 0xE622F5B7, 0xF5720643, 0x07198540, 0x590AB964, - 0xAB613A67, 0xB831C993, 0x4A5A4A90, 0x9E902E7B, 0x6CFBAD78, 0x7FAB5E8C, 0x8DC0DD8F, - 0xE330A81A, 0x115B2B19, 0x020BD8ED, 0xF0605BEE, 0x24AA3F05, 0xD6C1BC06, 0xC5914FF2, - 0x37FACCF1, 0x69E9F0D5, 0x9B8273D6, 0x88D28022, 0x7AB90321, 0xAE7367CA, 0x5C18E4C9, - 0x4F48173D, 0xBD23943E, 0xF36E6F75, 0x0105EC76, 0x12551F82, 0xE03E9C81, 0x34F4F86A, - 0xC69F7B69, 0xD5CF889D, 0x27A40B9E, 0x79B737BA, 0x8BDCB4B9, 0x988C474D, 0x6AE7C44E, - 0xBE2DA0A5, 0x4C4623A6, 0x5F16D052, 0xAD7D5351}; - -static const unsigned int T8_1[256] = { - 0x00000000, 0x13A29877, 0x274530EE, 0x34E7A899, 0x4E8A61DC, 0x5D28F9AB, 0x69CF5132, - 0x7A6DC945, 0x9D14C3B8, 0x8EB65BCF, 0xBA51F356, 0xA9F36B21, 0xD39EA264, 0xC03C3A13, - 0xF4DB928A, 0xE7790AFD, 0x3FC5F181, 0x2C6769F6, 0x1880C16F, 0x0B225918, 0x714F905D, - 0x62ED082A, 0x560AA0B3, 0x45A838C4, 0xA2D13239, 0xB173AA4E, 0x859402D7, 0x96369AA0, - 0xEC5B53E5, 0xFFF9CB92, 0xCB1E630B, 0xD8BCFB7C, 0x7F8BE302, 0x6C297B75, 0x58CED3EC, - 0x4B6C4B9B, 0x310182DE, 0x22A31AA9, 0x1644B230, 0x05E62A47, 0xE29F20BA, 0xF13DB8CD, - 0xC5DA1054, 0xD6788823, 0xAC154166, 0xBFB7D911, 0x8B507188, 0x98F2E9FF, 0x404E1283, - 0x53EC8AF4, 0x670B226D, 0x74A9BA1A, 0x0EC4735F, 0x1D66EB28, 0x298143B1, 0x3A23DBC6, - 0xDD5AD13B, 0xCEF8494C, 0xFA1FE1D5, 0xE9BD79A2, 0x93D0B0E7, 0x80722890, 0xB4958009, - 0xA737187E, 0xFF17C604, 0xECB55E73, 0xD852F6EA, 0xCBF06E9D, 0xB19DA7D8, 0xA23F3FAF, - 0x96D89736, 0x857A0F41, 0x620305BC, 0x71A19DCB, 0x45463552, 0x56E4AD25, 0x2C896460, - 0x3F2BFC17, 0x0BCC548E, 0x186ECCF9, 0xC0D23785, 0xD370AFF2, 0xE797076B, 0xF4359F1C, - 0x8E585659, 0x9DFACE2E, 0xA91D66B7, 0xBABFFEC0, 0x5DC6F43D, 0x4E646C4A, 0x7A83C4D3, - 0x69215CA4, 0x134C95E1, 0x00EE0D96, 0x3409A50F, 0x27AB3D78, 0x809C2506, 0x933EBD71, - 0xA7D915E8, 0xB47B8D9F, 0xCE1644DA, 0xDDB4DCAD, 0xE9537434, 0xFAF1EC43, 0x1D88E6BE, - 0x0E2A7EC9, 0x3ACDD650, 0x296F4E27, 0x53028762, 0x40A01F15, 0x7447B78C, 0x67E52FFB, - 0xBF59D487, 0xACFB4CF0, 0x981CE469, 0x8BBE7C1E, 0xF1D3B55B, 0xE2712D2C, 0xD69685B5, - 0xC5341DC2, 0x224D173F, 0x31EF8F48, 0x050827D1, 0x16AABFA6, 0x6CC776E3, 0x7F65EE94, - 0x4B82460D, 0x5820DE7A, 0xFBC3FAF9, 0xE861628E, 0xDC86CA17, 0xCF245260, 0xB5499B25, - 0xA6EB0352, 0x920CABCB, 0x81AE33BC, 0x66D73941, 0x7575A136, 0x419209AF, 0x523091D8, - 0x285D589D, 0x3BFFC0EA, 0x0F186873, 0x1CBAF004, 0xC4060B78, 0xD7A4930F, 0xE3433B96, - 0xF0E1A3E1, 0x8A8C6AA4, 0x992EF2D3, 0xADC95A4A, 0xBE6BC23D, 0x5912C8C0, 0x4AB050B7, - 0x7E57F82E, 0x6DF56059, 0x1798A91C, 0x043A316B, 0x30DD99F2, 0x237F0185, 0x844819FB, - 0x97EA818C, 0xA30D2915, 0xB0AFB162, 0xCAC27827, 0xD960E050, 0xED8748C9, 0xFE25D0BE, - 0x195CDA43, 
0x0AFE4234, 0x3E19EAAD, 0x2DBB72DA, 0x57D6BB9F, 0x447423E8, 0x70938B71, - 0x63311306, 0xBB8DE87A, 0xA82F700D, 0x9CC8D894, 0x8F6A40E3, 0xF50789A6, 0xE6A511D1, - 0xD242B948, 0xC1E0213F, 0x26992BC2, 0x353BB3B5, 0x01DC1B2C, 0x127E835B, 0x68134A1E, - 0x7BB1D269, 0x4F567AF0, 0x5CF4E287, 0x04D43CFD, 0x1776A48A, 0x23910C13, 0x30339464, - 0x4A5E5D21, 0x59FCC556, 0x6D1B6DCF, 0x7EB9F5B8, 0x99C0FF45, 0x8A626732, 0xBE85CFAB, - 0xAD2757DC, 0xD74A9E99, 0xC4E806EE, 0xF00FAE77, 0xE3AD3600, 0x3B11CD7C, 0x28B3550B, - 0x1C54FD92, 0x0FF665E5, 0x759BACA0, 0x663934D7, 0x52DE9C4E, 0x417C0439, 0xA6050EC4, - 0xB5A796B3, 0x81403E2A, 0x92E2A65D, 0xE88F6F18, 0xFB2DF76F, 0xCFCA5FF6, 0xDC68C781, - 0x7B5FDFFF, 0x68FD4788, 0x5C1AEF11, 0x4FB87766, 0x35D5BE23, 0x26772654, 0x12908ECD, - 0x013216BA, 0xE64B1C47, 0xF5E98430, 0xC10E2CA9, 0xD2ACB4DE, 0xA8C17D9B, 0xBB63E5EC, - 0x8F844D75, 0x9C26D502, 0x449A2E7E, 0x5738B609, 0x63DF1E90, 0x707D86E7, 0x0A104FA2, - 0x19B2D7D5, 0x2D557F4C, 0x3EF7E73B, 0xD98EEDC6, 0xCA2C75B1, 0xFECBDD28, 0xED69455F, - 0x97048C1A, 0x84A6146D, 0xB041BCF4, 0xA3E32483}; - -static const unsigned int T8_2[256] = { - 0x00000000, 0xA541927E, 0x4F6F520D, 0xEA2EC073, 0x9EDEA41A, 0x3B9F3664, 0xD1B1F617, - 0x74F06469, 0x38513EC5, 0x9D10ACBB, 0x773E6CC8, 0xD27FFEB6, 0xA68F9ADF, 0x03CE08A1, - 0xE9E0C8D2, 0x4CA15AAC, 0x70A27D8A, 0xD5E3EFF4, 0x3FCD2F87, 0x9A8CBDF9, 0xEE7CD990, - 0x4B3D4BEE, 0xA1138B9D, 0x045219E3, 0x48F3434F, 0xEDB2D131, 0x079C1142, 0xA2DD833C, - 0xD62DE755, 0x736C752B, 0x9942B558, 0x3C032726, 0xE144FB14, 0x4405696A, 0xAE2BA919, - 0x0B6A3B67, 0x7F9A5F0E, 0xDADBCD70, 0x30F50D03, 0x95B49F7D, 0xD915C5D1, 0x7C5457AF, - 0x967A97DC, 0x333B05A2, 0x47CB61CB, 0xE28AF3B5, 0x08A433C6, 0xADE5A1B8, 0x91E6869E, - 0x34A714E0, 0xDE89D493, 0x7BC846ED, 0x0F382284, 0xAA79B0FA, 0x40577089, 0xE516E2F7, - 0xA9B7B85B, 0x0CF62A25, 0xE6D8EA56, 0x43997828, 0x37691C41, 0x92288E3F, 0x78064E4C, - 0xDD47DC32, 0xC76580D9, 0x622412A7, 0x880AD2D4, 0x2D4B40AA, 0x59BB24C3, 0xFCFAB6BD, - 0x16D476CE, 0xB395E4B0, 0xFF34BE1C, 0x5A752C62, 0xB05BEC11, 0x151A7E6F, 0x61EA1A06, - 0xC4AB8878, 0x2E85480B, 0x8BC4DA75, 0xB7C7FD53, 0x12866F2D, 0xF8A8AF5E, 0x5DE93D20, - 0x29195949, 0x8C58CB37, 0x66760B44, 0xC337993A, 0x8F96C396, 0x2AD751E8, 0xC0F9919B, - 0x65B803E5, 0x1148678C, 0xB409F5F2, 0x5E273581, 0xFB66A7FF, 0x26217BCD, 0x8360E9B3, - 0x694E29C0, 0xCC0FBBBE, 0xB8FFDFD7, 0x1DBE4DA9, 0xF7908DDA, 0x52D11FA4, 0x1E704508, - 0xBB31D776, 0x511F1705, 0xF45E857B, 0x80AEE112, 0x25EF736C, 0xCFC1B31F, 0x6A802161, - 0x56830647, 0xF3C29439, 0x19EC544A, 0xBCADC634, 0xC85DA25D, 0x6D1C3023, 0x8732F050, - 0x2273622E, 0x6ED23882, 0xCB93AAFC, 0x21BD6A8F, 0x84FCF8F1, 0xF00C9C98, 0x554D0EE6, - 0xBF63CE95, 0x1A225CEB, 0x8B277743, 0x2E66E53D, 0xC448254E, 0x6109B730, 0x15F9D359, - 0xB0B84127, 0x5A968154, 0xFFD7132A, 0xB3764986, 0x1637DBF8, 0xFC191B8B, 0x595889F5, - 0x2DA8ED9C, 0x88E97FE2, 0x62C7BF91, 0xC7862DEF, 0xFB850AC9, 0x5EC498B7, 0xB4EA58C4, - 0x11ABCABA, 0x655BAED3, 0xC01A3CAD, 0x2A34FCDE, 0x8F756EA0, 0xC3D4340C, 0x6695A672, - 0x8CBB6601, 0x29FAF47F, 0x5D0A9016, 0xF84B0268, 0x1265C21B, 0xB7245065, 0x6A638C57, - 0xCF221E29, 0x250CDE5A, 0x804D4C24, 0xF4BD284D, 0x51FCBA33, 0xBBD27A40, 0x1E93E83E, - 0x5232B292, 0xF77320EC, 0x1D5DE09F, 0xB81C72E1, 0xCCEC1688, 0x69AD84F6, 0x83834485, - 0x26C2D6FB, 0x1AC1F1DD, 0xBF8063A3, 0x55AEA3D0, 0xF0EF31AE, 0x841F55C7, 0x215EC7B9, - 0xCB7007CA, 0x6E3195B4, 0x2290CF18, 0x87D15D66, 0x6DFF9D15, 0xC8BE0F6B, 0xBC4E6B02, - 0x190FF97C, 0xF321390F, 0x5660AB71, 0x4C42F79A, 0xE90365E4, 0x032DA597, 0xA66C37E9, - 0xD29C5380, 0x77DDC1FE, 
0x9DF3018D, 0x38B293F3, 0x7413C95F, 0xD1525B21, 0x3B7C9B52, - 0x9E3D092C, 0xEACD6D45, 0x4F8CFF3B, 0xA5A23F48, 0x00E3AD36, 0x3CE08A10, 0x99A1186E, - 0x738FD81D, 0xD6CE4A63, 0xA23E2E0A, 0x077FBC74, 0xED517C07, 0x4810EE79, 0x04B1B4D5, - 0xA1F026AB, 0x4BDEE6D8, 0xEE9F74A6, 0x9A6F10CF, 0x3F2E82B1, 0xD50042C2, 0x7041D0BC, - 0xAD060C8E, 0x08479EF0, 0xE2695E83, 0x4728CCFD, 0x33D8A894, 0x96993AEA, 0x7CB7FA99, - 0xD9F668E7, 0x9557324B, 0x3016A035, 0xDA386046, 0x7F79F238, 0x0B899651, 0xAEC8042F, - 0x44E6C45C, 0xE1A75622, 0xDDA47104, 0x78E5E37A, 0x92CB2309, 0x378AB177, 0x437AD51E, - 0xE63B4760, 0x0C158713, 0xA954156D, 0xE5F54FC1, 0x40B4DDBF, 0xAA9A1DCC, 0x0FDB8FB2, - 0x7B2BEBDB, 0xDE6A79A5, 0x3444B9D6, 0x91052BA8}; - -static const unsigned int T8_3[256] = { - 0x00000000, 0xDD45AAB8, 0xBF672381, 0x62228939, 0x7B2231F3, 0xA6679B4B, 0xC4451272, - 0x1900B8CA, 0xF64463E6, 0x2B01C95E, 0x49234067, 0x9466EADF, 0x8D665215, 0x5023F8AD, - 0x32017194, 0xEF44DB2C, 0xE964B13D, 0x34211B85, 0x560392BC, 0x8B463804, 0x924680CE, - 0x4F032A76, 0x2D21A34F, 0xF06409F7, 0x1F20D2DB, 0xC2657863, 0xA047F15A, 0x7D025BE2, - 0x6402E328, 0xB9474990, 0xDB65C0A9, 0x06206A11, 0xD725148B, 0x0A60BE33, 0x6842370A, - 0xB5079DB2, 0xAC072578, 0x71428FC0, 0x136006F9, 0xCE25AC41, 0x2161776D, 0xFC24DDD5, - 0x9E0654EC, 0x4343FE54, 0x5A43469E, 0x8706EC26, 0xE524651F, 0x3861CFA7, 0x3E41A5B6, - 0xE3040F0E, 0x81268637, 0x5C632C8F, 0x45639445, 0x98263EFD, 0xFA04B7C4, 0x27411D7C, - 0xC805C650, 0x15406CE8, 0x7762E5D1, 0xAA274F69, 0xB327F7A3, 0x6E625D1B, 0x0C40D422, - 0xD1057E9A, 0xABA65FE7, 0x76E3F55F, 0x14C17C66, 0xC984D6DE, 0xD0846E14, 0x0DC1C4AC, - 0x6FE34D95, 0xB2A6E72D, 0x5DE23C01, 0x80A796B9, 0xE2851F80, 0x3FC0B538, 0x26C00DF2, - 0xFB85A74A, 0x99A72E73, 0x44E284CB, 0x42C2EEDA, 0x9F874462, 0xFDA5CD5B, 0x20E067E3, - 0x39E0DF29, 0xE4A57591, 0x8687FCA8, 0x5BC25610, 0xB4868D3C, 0x69C32784, 0x0BE1AEBD, - 0xD6A40405, 0xCFA4BCCF, 0x12E11677, 0x70C39F4E, 0xAD8635F6, 0x7C834B6C, 0xA1C6E1D4, - 0xC3E468ED, 0x1EA1C255, 0x07A17A9F, 0xDAE4D027, 0xB8C6591E, 0x6583F3A6, 0x8AC7288A, - 0x57828232, 0x35A00B0B, 0xE8E5A1B3, 0xF1E51979, 0x2CA0B3C1, 0x4E823AF8, 0x93C79040, - 0x95E7FA51, 0x48A250E9, 0x2A80D9D0, 0xF7C57368, 0xEEC5CBA2, 0x3380611A, 0x51A2E823, - 0x8CE7429B, 0x63A399B7, 0xBEE6330F, 0xDCC4BA36, 0x0181108E, 0x1881A844, 0xC5C402FC, - 0xA7E68BC5, 0x7AA3217D, 0x52A0C93F, 0x8FE56387, 0xEDC7EABE, 0x30824006, 0x2982F8CC, - 0xF4C75274, 0x96E5DB4D, 0x4BA071F5, 0xA4E4AAD9, 0x79A10061, 0x1B838958, 0xC6C623E0, - 0xDFC69B2A, 0x02833192, 0x60A1B8AB, 0xBDE41213, 0xBBC47802, 0x6681D2BA, 0x04A35B83, - 0xD9E6F13B, 0xC0E649F1, 0x1DA3E349, 0x7F816A70, 0xA2C4C0C8, 0x4D801BE4, 0x90C5B15C, - 0xF2E73865, 0x2FA292DD, 0x36A22A17, 0xEBE780AF, 0x89C50996, 0x5480A32E, 0x8585DDB4, - 0x58C0770C, 0x3AE2FE35, 0xE7A7548D, 0xFEA7EC47, 0x23E246FF, 0x41C0CFC6, 0x9C85657E, - 0x73C1BE52, 0xAE8414EA, 0xCCA69DD3, 0x11E3376B, 0x08E38FA1, 0xD5A62519, 0xB784AC20, - 0x6AC10698, 0x6CE16C89, 0xB1A4C631, 0xD3864F08, 0x0EC3E5B0, 0x17C35D7A, 0xCA86F7C2, - 0xA8A47EFB, 0x75E1D443, 0x9AA50F6F, 0x47E0A5D7, 0x25C22CEE, 0xF8878656, 0xE1873E9C, - 0x3CC29424, 0x5EE01D1D, 0x83A5B7A5, 0xF90696D8, 0x24433C60, 0x4661B559, 0x9B241FE1, - 0x8224A72B, 0x5F610D93, 0x3D4384AA, 0xE0062E12, 0x0F42F53E, 0xD2075F86, 0xB025D6BF, - 0x6D607C07, 0x7460C4CD, 0xA9256E75, 0xCB07E74C, 0x16424DF4, 0x106227E5, 0xCD278D5D, - 0xAF050464, 0x7240AEDC, 0x6B401616, 0xB605BCAE, 0xD4273597, 0x09629F2F, 0xE6264403, - 0x3B63EEBB, 0x59416782, 0x8404CD3A, 0x9D0475F0, 0x4041DF48, 0x22635671, 0xFF26FCC9, - 0x2E238253, 0xF36628EB, 0x9144A1D2, 
0x4C010B6A, 0x5501B3A0, 0x88441918, 0xEA669021, - 0x37233A99, 0xD867E1B5, 0x05224B0D, 0x6700C234, 0xBA45688C, 0xA345D046, 0x7E007AFE, - 0x1C22F3C7, 0xC167597F, 0xC747336E, 0x1A0299D6, 0x782010EF, 0xA565BA57, 0xBC65029D, - 0x6120A825, 0x0302211C, 0xDE478BA4, 0x31035088, 0xEC46FA30, 0x8E647309, 0x5321D9B1, - 0x4A21617B, 0x9764CBC3, 0xF54642FA, 0x2803E842}; - -static const unsigned int T8_4[256] = { - 0x00000000, 0x38116FAC, 0x7022DF58, 0x4833B0F4, 0xE045BEB0, 0xD854D11C, 0x906761E8, - 0xA8760E44, 0xC5670B91, 0xFD76643D, 0xB545D4C9, 0x8D54BB65, 0x2522B521, 0x1D33DA8D, - 0x55006A79, 0x6D1105D5, 0x8F2261D3, 0xB7330E7F, 0xFF00BE8B, 0xC711D127, 0x6F67DF63, - 0x5776B0CF, 0x1F45003B, 0x27546F97, 0x4A456A42, 0x725405EE, 0x3A67B51A, 0x0276DAB6, - 0xAA00D4F2, 0x9211BB5E, 0xDA220BAA, 0xE2336406, 0x1BA8B557, 0x23B9DAFB, 0x6B8A6A0F, - 0x539B05A3, 0xFBED0BE7, 0xC3FC644B, 0x8BCFD4BF, 0xB3DEBB13, 0xDECFBEC6, 0xE6DED16A, - 0xAEED619E, 0x96FC0E32, 0x3E8A0076, 0x069B6FDA, 0x4EA8DF2E, 0x76B9B082, 0x948AD484, - 0xAC9BBB28, 0xE4A80BDC, 0xDCB96470, 0x74CF6A34, 0x4CDE0598, 0x04EDB56C, 0x3CFCDAC0, - 0x51EDDF15, 0x69FCB0B9, 0x21CF004D, 0x19DE6FE1, 0xB1A861A5, 0x89B90E09, 0xC18ABEFD, - 0xF99BD151, 0x37516AAE, 0x0F400502, 0x4773B5F6, 0x7F62DA5A, 0xD714D41E, 0xEF05BBB2, - 0xA7360B46, 0x9F2764EA, 0xF236613F, 0xCA270E93, 0x8214BE67, 0xBA05D1CB, 0x1273DF8F, - 0x2A62B023, 0x625100D7, 0x5A406F7B, 0xB8730B7D, 0x806264D1, 0xC851D425, 0xF040BB89, - 0x5836B5CD, 0x6027DA61, 0x28146A95, 0x10050539, 0x7D1400EC, 0x45056F40, 0x0D36DFB4, - 0x3527B018, 0x9D51BE5C, 0xA540D1F0, 0xED736104, 0xD5620EA8, 0x2CF9DFF9, 0x14E8B055, - 0x5CDB00A1, 0x64CA6F0D, 0xCCBC6149, 0xF4AD0EE5, 0xBC9EBE11, 0x848FD1BD, 0xE99ED468, - 0xD18FBBC4, 0x99BC0B30, 0xA1AD649C, 0x09DB6AD8, 0x31CA0574, 0x79F9B580, 0x41E8DA2C, - 0xA3DBBE2A, 0x9BCAD186, 0xD3F96172, 0xEBE80EDE, 0x439E009A, 0x7B8F6F36, 0x33BCDFC2, - 0x0BADB06E, 0x66BCB5BB, 0x5EADDA17, 0x169E6AE3, 0x2E8F054F, 0x86F90B0B, 0xBEE864A7, - 0xF6DBD453, 0xCECABBFF, 0x6EA2D55C, 0x56B3BAF0, 0x1E800A04, 0x269165A8, 0x8EE76BEC, - 0xB6F60440, 0xFEC5B4B4, 0xC6D4DB18, 0xABC5DECD, 0x93D4B161, 0xDBE70195, 0xE3F66E39, - 0x4B80607D, 0x73910FD1, 0x3BA2BF25, 0x03B3D089, 0xE180B48F, 0xD991DB23, 0x91A26BD7, - 0xA9B3047B, 0x01C50A3F, 0x39D46593, 0x71E7D567, 0x49F6BACB, 0x24E7BF1E, 0x1CF6D0B2, - 0x54C56046, 0x6CD40FEA, 0xC4A201AE, 0xFCB36E02, 0xB480DEF6, 0x8C91B15A, 0x750A600B, - 0x4D1B0FA7, 0x0528BF53, 0x3D39D0FF, 0x954FDEBB, 0xAD5EB117, 0xE56D01E3, 0xDD7C6E4F, - 0xB06D6B9A, 0x887C0436, 0xC04FB4C2, 0xF85EDB6E, 0x5028D52A, 0x6839BA86, 0x200A0A72, - 0x181B65DE, 0xFA2801D8, 0xC2396E74, 0x8A0ADE80, 0xB21BB12C, 0x1A6DBF68, 0x227CD0C4, - 0x6A4F6030, 0x525E0F9C, 0x3F4F0A49, 0x075E65E5, 0x4F6DD511, 0x777CBABD, 0xDF0AB4F9, - 0xE71BDB55, 0xAF286BA1, 0x9739040D, 0x59F3BFF2, 0x61E2D05E, 0x29D160AA, 0x11C00F06, - 0xB9B60142, 0x81A76EEE, 0xC994DE1A, 0xF185B1B6, 0x9C94B463, 0xA485DBCF, 0xECB66B3B, - 0xD4A70497, 0x7CD10AD3, 0x44C0657F, 0x0CF3D58B, 0x34E2BA27, 0xD6D1DE21, 0xEEC0B18D, - 0xA6F30179, 0x9EE26ED5, 0x36946091, 0x0E850F3D, 0x46B6BFC9, 0x7EA7D065, 0x13B6D5B0, - 0x2BA7BA1C, 0x63940AE8, 0x5B856544, 0xF3F36B00, 0xCBE204AC, 0x83D1B458, 0xBBC0DBF4, - 0x425B0AA5, 0x7A4A6509, 0x3279D5FD, 0x0A68BA51, 0xA21EB415, 0x9A0FDBB9, 0xD23C6B4D, - 0xEA2D04E1, 0x873C0134, 0xBF2D6E98, 0xF71EDE6C, 0xCF0FB1C0, 0x6779BF84, 0x5F68D028, - 0x175B60DC, 0x2F4A0F70, 0xCD796B76, 0xF56804DA, 0xBD5BB42E, 0x854ADB82, 0x2D3CD5C6, - 0x152DBA6A, 0x5D1E0A9E, 0x650F6532, 0x081E60E7, 0x300F0F4B, 0x783CBFBF, 0x402DD013, - 0xE85BDE57, 0xD04AB1FB, 0x9879010F, 0xA0686EA3}; - 
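
The T8_0 through T8_7 arrays deleted in this hunk are the lookup tables of a slicing-by-8 CRC-32C (reflected Castagnoli polynomial 0x82F63B78); each table covers one byte position of an eight-byte stride. For reference, a one-table version of the same algorithm, which the eight-table form unrolls to consume eight input bytes per iteration; this sketch only illustrates what the tables encode and is not the code the patch switches to:

#include <cstdint>
#include <cstdio>
#include <cstring>

static uint32_t crc_table[256]; // equivalent to T8_0 above

static void init_crc_table() {
    for (uint32_t i = 0; i < 256; ++i) {
        uint32_t crc = i;
        for (int k = 0; k < 8; ++k) {
            // Reflected CRC-32C polynomial 0x82F63B78.
            crc = (crc >> 1) ^ ((crc & 1) ? 0x82F63B78u : 0u);
        }
        crc_table[i] = crc;
    }
}

static uint32_t crc32c(uint32_t crc, const char* buf, size_t len) {
    crc = ~crc;
    for (size_t i = 0; i < len; ++i) {
        crc = (crc >> 8) ^ crc_table[(crc ^ static_cast<uint8_t>(buf[i])) & 0xFF];
    }
    return ~crc;
}

int main() {
    init_crc_table();
    // Standard CRC-32C check value: crc32c("123456789") == 0xE3069283.
    std::printf("%08X\n", crc32c(0, "123456789", std::strlen("123456789")));
}
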
-static const unsigned int T8_5[256] = { - 0x00000000, 0xEF306B19, 0xDB8CA0C3, 0x34BCCBDA, 0xB2F53777, 0x5DC55C6E, 0x697997B4, - 0x8649FCAD, 0x6006181F, 0x8F367306, 0xBB8AB8DC, 0x54BAD3C5, 0xD2F32F68, 0x3DC34471, - 0x097F8FAB, 0xE64FE4B2, 0xC00C303E, 0x2F3C5B27, 0x1B8090FD, 0xF4B0FBE4, 0x72F90749, - 0x9DC96C50, 0xA975A78A, 0x4645CC93, 0xA00A2821, 0x4F3A4338, 0x7B8688E2, 0x94B6E3FB, - 0x12FF1F56, 0xFDCF744F, 0xC973BF95, 0x2643D48C, 0x85F4168D, 0x6AC47D94, 0x5E78B64E, - 0xB148DD57, 0x370121FA, 0xD8314AE3, 0xEC8D8139, 0x03BDEA20, 0xE5F20E92, 0x0AC2658B, - 0x3E7EAE51, 0xD14EC548, 0x570739E5, 0xB83752FC, 0x8C8B9926, 0x63BBF23F, 0x45F826B3, - 0xAAC84DAA, 0x9E748670, 0x7144ED69, 0xF70D11C4, 0x183D7ADD, 0x2C81B107, 0xC3B1DA1E, - 0x25FE3EAC, 0xCACE55B5, 0xFE729E6F, 0x1142F576, 0x970B09DB, 0x783B62C2, 0x4C87A918, - 0xA3B7C201, 0x0E045BEB, 0xE13430F2, 0xD588FB28, 0x3AB89031, 0xBCF16C9C, 0x53C10785, - 0x677DCC5F, 0x884DA746, 0x6E0243F4, 0x813228ED, 0xB58EE337, 0x5ABE882E, 0xDCF77483, - 0x33C71F9A, 0x077BD440, 0xE84BBF59, 0xCE086BD5, 0x213800CC, 0x1584CB16, 0xFAB4A00F, - 0x7CFD5CA2, 0x93CD37BB, 0xA771FC61, 0x48419778, 0xAE0E73CA, 0x413E18D3, 0x7582D309, - 0x9AB2B810, 0x1CFB44BD, 0xF3CB2FA4, 0xC777E47E, 0x28478F67, 0x8BF04D66, 0x64C0267F, - 0x507CEDA5, 0xBF4C86BC, 0x39057A11, 0xD6351108, 0xE289DAD2, 0x0DB9B1CB, 0xEBF65579, - 0x04C63E60, 0x307AF5BA, 0xDF4A9EA3, 0x5903620E, 0xB6330917, 0x828FC2CD, 0x6DBFA9D4, - 0x4BFC7D58, 0xA4CC1641, 0x9070DD9B, 0x7F40B682, 0xF9094A2F, 0x16392136, 0x2285EAEC, - 0xCDB581F5, 0x2BFA6547, 0xC4CA0E5E, 0xF076C584, 0x1F46AE9D, 0x990F5230, 0x763F3929, - 0x4283F2F3, 0xADB399EA, 0x1C08B7D6, 0xF338DCCF, 0xC7841715, 0x28B47C0C, 0xAEFD80A1, - 0x41CDEBB8, 0x75712062, 0x9A414B7B, 0x7C0EAFC9, 0x933EC4D0, 0xA7820F0A, 0x48B26413, - 0xCEFB98BE, 0x21CBF3A7, 0x1577387D, 0xFA475364, 0xDC0487E8, 0x3334ECF1, 0x0788272B, - 0xE8B84C32, 0x6EF1B09F, 0x81C1DB86, 0xB57D105C, 0x5A4D7B45, 0xBC029FF7, 0x5332F4EE, - 0x678E3F34, 0x88BE542D, 0x0EF7A880, 0xE1C7C399, 0xD57B0843, 0x3A4B635A, 0x99FCA15B, - 0x76CCCA42, 0x42700198, 0xAD406A81, 0x2B09962C, 0xC439FD35, 0xF08536EF, 0x1FB55DF6, - 0xF9FAB944, 0x16CAD25D, 0x22761987, 0xCD46729E, 0x4B0F8E33, 0xA43FE52A, 0x90832EF0, - 0x7FB345E9, 0x59F09165, 0xB6C0FA7C, 0x827C31A6, 0x6D4C5ABF, 0xEB05A612, 0x0435CD0B, - 0x308906D1, 0xDFB96DC8, 0x39F6897A, 0xD6C6E263, 0xE27A29B9, 0x0D4A42A0, 0x8B03BE0D, - 0x6433D514, 0x508F1ECE, 0xBFBF75D7, 0x120CEC3D, 0xFD3C8724, 0xC9804CFE, 0x26B027E7, - 0xA0F9DB4A, 0x4FC9B053, 0x7B757B89, 0x94451090, 0x720AF422, 0x9D3A9F3B, 0xA98654E1, - 0x46B63FF8, 0xC0FFC355, 0x2FCFA84C, 0x1B736396, 0xF443088F, 0xD200DC03, 0x3D30B71A, - 0x098C7CC0, 0xE6BC17D9, 0x60F5EB74, 0x8FC5806D, 0xBB794BB7, 0x544920AE, 0xB206C41C, - 0x5D36AF05, 0x698A64DF, 0x86BA0FC6, 0x00F3F36B, 0xEFC39872, 0xDB7F53A8, 0x344F38B1, - 0x97F8FAB0, 0x78C891A9, 0x4C745A73, 0xA344316A, 0x250DCDC7, 0xCA3DA6DE, 0xFE816D04, - 0x11B1061D, 0xF7FEE2AF, 0x18CE89B6, 0x2C72426C, 0xC3422975, 0x450BD5D8, 0xAA3BBEC1, - 0x9E87751B, 0x71B71E02, 0x57F4CA8E, 0xB8C4A197, 0x8C786A4D, 0x63480154, 0xE501FDF9, - 0x0A3196E0, 0x3E8D5D3A, 0xD1BD3623, 0x37F2D291, 0xD8C2B988, 0xEC7E7252, 0x034E194B, - 0x8507E5E6, 0x6A378EFF, 0x5E8B4525, 0xB1BB2E3C}; - -static const unsigned int T8_6[256] = { - 0x00000000, 0x68032CC8, 0xD0065990, 0xB8057558, 0xA5E0C5D1, 0xCDE3E919, 0x75E69C41, - 0x1DE5B089, 0x4E2DFD53, 0x262ED19B, 0x9E2BA4C3, 0xF628880B, 0xEBCD3882, 0x83CE144A, - 0x3BCB6112, 0x53C84DDA, 0x9C5BFAA6, 0xF458D66E, 0x4C5DA336, 0x245E8FFE, 0x39BB3F77, - 0x51B813BF, 0xE9BD66E7, 0x81BE4A2F, 0xD27607F5, 0xBA752B3D, 
0x02705E65, 0x6A7372AD, - 0x7796C224, 0x1F95EEEC, 0xA7909BB4, 0xCF93B77C, 0x3D5B83BD, 0x5558AF75, 0xED5DDA2D, - 0x855EF6E5, 0x98BB466C, 0xF0B86AA4, 0x48BD1FFC, 0x20BE3334, 0x73767EEE, 0x1B755226, - 0xA370277E, 0xCB730BB6, 0xD696BB3F, 0xBE9597F7, 0x0690E2AF, 0x6E93CE67, 0xA100791B, - 0xC90355D3, 0x7106208B, 0x19050C43, 0x04E0BCCA, 0x6CE39002, 0xD4E6E55A, 0xBCE5C992, - 0xEF2D8448, 0x872EA880, 0x3F2BDDD8, 0x5728F110, 0x4ACD4199, 0x22CE6D51, 0x9ACB1809, - 0xF2C834C1, 0x7AB7077A, 0x12B42BB2, 0xAAB15EEA, 0xC2B27222, 0xDF57C2AB, 0xB754EE63, - 0x0F519B3B, 0x6752B7F3, 0x349AFA29, 0x5C99D6E1, 0xE49CA3B9, 0x8C9F8F71, 0x917A3FF8, - 0xF9791330, 0x417C6668, 0x297F4AA0, 0xE6ECFDDC, 0x8EEFD114, 0x36EAA44C, 0x5EE98884, - 0x430C380D, 0x2B0F14C5, 0x930A619D, 0xFB094D55, 0xA8C1008F, 0xC0C22C47, 0x78C7591F, - 0x10C475D7, 0x0D21C55E, 0x6522E996, 0xDD279CCE, 0xB524B006, 0x47EC84C7, 0x2FEFA80F, - 0x97EADD57, 0xFFE9F19F, 0xE20C4116, 0x8A0F6DDE, 0x320A1886, 0x5A09344E, 0x09C17994, - 0x61C2555C, 0xD9C72004, 0xB1C40CCC, 0xAC21BC45, 0xC422908D, 0x7C27E5D5, 0x1424C91D, - 0xDBB77E61, 0xB3B452A9, 0x0BB127F1, 0x63B20B39, 0x7E57BBB0, 0x16549778, 0xAE51E220, - 0xC652CEE8, 0x959A8332, 0xFD99AFFA, 0x459CDAA2, 0x2D9FF66A, 0x307A46E3, 0x58796A2B, - 0xE07C1F73, 0x887F33BB, 0xF56E0EF4, 0x9D6D223C, 0x25685764, 0x4D6B7BAC, 0x508ECB25, - 0x388DE7ED, 0x808892B5, 0xE88BBE7D, 0xBB43F3A7, 0xD340DF6F, 0x6B45AA37, 0x034686FF, - 0x1EA33676, 0x76A01ABE, 0xCEA56FE6, 0xA6A6432E, 0x6935F452, 0x0136D89A, 0xB933ADC2, - 0xD130810A, 0xCCD53183, 0xA4D61D4B, 0x1CD36813, 0x74D044DB, 0x27180901, 0x4F1B25C9, - 0xF71E5091, 0x9F1D7C59, 0x82F8CCD0, 0xEAFBE018, 0x52FE9540, 0x3AFDB988, 0xC8358D49, - 0xA036A181, 0x1833D4D9, 0x7030F811, 0x6DD54898, 0x05D66450, 0xBDD31108, 0xD5D03DC0, - 0x8618701A, 0xEE1B5CD2, 0x561E298A, 0x3E1D0542, 0x23F8B5CB, 0x4BFB9903, 0xF3FEEC5B, - 0x9BFDC093, 0x546E77EF, 0x3C6D5B27, 0x84682E7F, 0xEC6B02B7, 0xF18EB23E, 0x998D9EF6, - 0x2188EBAE, 0x498BC766, 0x1A438ABC, 0x7240A674, 0xCA45D32C, 0xA246FFE4, 0xBFA34F6D, - 0xD7A063A5, 0x6FA516FD, 0x07A63A35, 0x8FD9098E, 0xE7DA2546, 0x5FDF501E, 0x37DC7CD6, - 0x2A39CC5F, 0x423AE097, 0xFA3F95CF, 0x923CB907, 0xC1F4F4DD, 0xA9F7D815, 0x11F2AD4D, - 0x79F18185, 0x6414310C, 0x0C171DC4, 0xB412689C, 0xDC114454, 0x1382F328, 0x7B81DFE0, - 0xC384AAB8, 0xAB878670, 0xB66236F9, 0xDE611A31, 0x66646F69, 0x0E6743A1, 0x5DAF0E7B, - 0x35AC22B3, 0x8DA957EB, 0xE5AA7B23, 0xF84FCBAA, 0x904CE762, 0x2849923A, 0x404ABEF2, - 0xB2828A33, 0xDA81A6FB, 0x6284D3A3, 0x0A87FF6B, 0x17624FE2, 0x7F61632A, 0xC7641672, - 0xAF673ABA, 0xFCAF7760, 0x94AC5BA8, 0x2CA92EF0, 0x44AA0238, 0x594FB2B1, 0x314C9E79, - 0x8949EB21, 0xE14AC7E9, 0x2ED97095, 0x46DA5C5D, 0xFEDF2905, 0x96DC05CD, 0x8B39B544, - 0xE33A998C, 0x5B3FECD4, 0x333CC01C, 0x60F48DC6, 0x08F7A10E, 0xB0F2D456, 0xD8F1F89E, - 0xC5144817, 0xAD1764DF, 0x15121187, 0x7D113D4F}; - -static const unsigned int T8_7[256] = { - 0x00000000, 0x493C7D27, 0x9278FA4E, 0xDB448769, 0x211D826D, 0x6821FF4A, 0xB3657823, - 0xFA590504, 0x423B04DA, 0x0B0779FD, 0xD043FE94, 0x997F83B3, 0x632686B7, 0x2A1AFB90, - 0xF15E7CF9, 0xB86201DE, 0x847609B4, 0xCD4A7493, 0x160EF3FA, 0x5F328EDD, 0xA56B8BD9, - 0xEC57F6FE, 0x37137197, 0x7E2F0CB0, 0xC64D0D6E, 0x8F717049, 0x5435F720, 0x1D098A07, - 0xE7508F03, 0xAE6CF224, 0x7528754D, 0x3C14086A, 0x0D006599, 0x443C18BE, 0x9F789FD7, - 0xD644E2F0, 0x2C1DE7F4, 0x65219AD3, 0xBE651DBA, 0xF759609D, 0x4F3B6143, 0x06071C64, - 0xDD439B0D, 0x947FE62A, 0x6E26E32E, 0x271A9E09, 0xFC5E1960, 0xB5626447, 0x89766C2D, - 0xC04A110A, 0x1B0E9663, 0x5232EB44, 0xA86BEE40, 0xE1579367, 0x3A13140E, 
0x732F6929, - 0xCB4D68F7, 0x827115D0, 0x593592B9, 0x1009EF9E, 0xEA50EA9A, 0xA36C97BD, 0x782810D4, - 0x31146DF3, 0x1A00CB32, 0x533CB615, 0x8878317C, 0xC1444C5B, 0x3B1D495F, 0x72213478, - 0xA965B311, 0xE059CE36, 0x583BCFE8, 0x1107B2CF, 0xCA4335A6, 0x837F4881, 0x79264D85, - 0x301A30A2, 0xEB5EB7CB, 0xA262CAEC, 0x9E76C286, 0xD74ABFA1, 0x0C0E38C8, 0x453245EF, - 0xBF6B40EB, 0xF6573DCC, 0x2D13BAA5, 0x642FC782, 0xDC4DC65C, 0x9571BB7B, 0x4E353C12, - 0x07094135, 0xFD504431, 0xB46C3916, 0x6F28BE7F, 0x2614C358, 0x1700AEAB, 0x5E3CD38C, - 0x857854E5, 0xCC4429C2, 0x361D2CC6, 0x7F2151E1, 0xA465D688, 0xED59ABAF, 0x553BAA71, - 0x1C07D756, 0xC743503F, 0x8E7F2D18, 0x7426281C, 0x3D1A553B, 0xE65ED252, 0xAF62AF75, - 0x9376A71F, 0xDA4ADA38, 0x010E5D51, 0x48322076, 0xB26B2572, 0xFB575855, 0x2013DF3C, - 0x692FA21B, 0xD14DA3C5, 0x9871DEE2, 0x4335598B, 0x0A0924AC, 0xF05021A8, 0xB96C5C8F, - 0x6228DBE6, 0x2B14A6C1, 0x34019664, 0x7D3DEB43, 0xA6796C2A, 0xEF45110D, 0x151C1409, - 0x5C20692E, 0x8764EE47, 0xCE589360, 0x763A92BE, 0x3F06EF99, 0xE44268F0, 0xAD7E15D7, - 0x572710D3, 0x1E1B6DF4, 0xC55FEA9D, 0x8C6397BA, 0xB0779FD0, 0xF94BE2F7, 0x220F659E, - 0x6B3318B9, 0x916A1DBD, 0xD856609A, 0x0312E7F3, 0x4A2E9AD4, 0xF24C9B0A, 0xBB70E62D, - 0x60346144, 0x29081C63, 0xD3511967, 0x9A6D6440, 0x4129E329, 0x08159E0E, 0x3901F3FD, - 0x703D8EDA, 0xAB7909B3, 0xE2457494, 0x181C7190, 0x51200CB7, 0x8A648BDE, 0xC358F6F9, - 0x7B3AF727, 0x32068A00, 0xE9420D69, 0xA07E704E, 0x5A27754A, 0x131B086D, 0xC85F8F04, - 0x8163F223, 0xBD77FA49, 0xF44B876E, 0x2F0F0007, 0x66337D20, 0x9C6A7824, 0xD5560503, - 0x0E12826A, 0x472EFF4D, 0xFF4CFE93, 0xB67083B4, 0x6D3404DD, 0x240879FA, 0xDE517CFE, - 0x976D01D9, 0x4C2986B0, 0x0515FB97, 0x2E015D56, 0x673D2071, 0xBC79A718, 0xF545DA3F, - 0x0F1CDF3B, 0x4620A21C, 0x9D642575, 0xD4585852, 0x6C3A598C, 0x250624AB, 0xFE42A3C2, - 0xB77EDEE5, 0x4D27DBE1, 0x041BA6C6, 0xDF5F21AF, 0x96635C88, 0xAA7754E2, 0xE34B29C5, - 0x380FAEAC, 0x7133D38B, 0x8B6AD68F, 0xC256ABA8, 0x19122CC1, 0x502E51E6, 0xE84C5038, - 0xA1702D1F, 0x7A34AA76, 0x3308D751, 0xC951D255, 0x806DAF72, 0x5B29281B, 0x1215553C, - 0x230138CF, 0x6A3D45E8, 0xB179C281, 0xF845BFA6, 0x021CBAA2, 0x4B20C785, 0x906440EC, - 0xD9583DCB, 0x613A3C15, 0x28064132, 0xF342C65B, 0xBA7EBB7C, 0x4027BE78, 0x091BC35F, - 0xD25F4436, 0x9B633911, 0xA777317B, 0xEE4B4C5C, 0x350FCB35, 0x7C33B612, 0x866AB316, - 0xCF56CE31, 0x14124958, 0x5D2E347F, 0xE54C35A1, 0xAC704886, 0x7734CFEF, 0x3E08B2C8, - 0xC451B7CC, 0x8D6DCAEB, 0x56294D82, 0x1F1530A5}; - Status gen_timestamp_string(std::string* out_string) { time_t now = time(nullptr); tm local_tm; diff --git a/be/src/olap/utils.h b/be/src/olap/utils.h index 8c848639147bd7..c163aad11488b5 100644 --- a/be/src/olap/utils.h +++ b/be/src/olap/utils.h @@ -37,6 +37,8 @@ namespace doris { static const std::string DELETE_SIGN = "__DORIS_DELETE_SIGN__"; static const std::string WHERE_SIGN = "__DORIS_WHERE_SIGN__"; static const std::string VERSION_COL = "__DORIS_VERSION_COL__"; +static const std::string SKIP_BITMAP_COL = "__DORIS_SKIP_BITMAP_COL__"; +static const std::string SEQUENCE_COL = "__DORIS_SEQUENCE_COL__"; // 用来加速运算 const static int32_t g_power_table[] = {1, 10, 100, 1000, 10000, diff --git a/be/src/pipeline/common/agg_utils.h b/be/src/pipeline/common/agg_utils.h index e0435954b8bec6..135bc67712345f 100644 --- a/be/src/pipeline/common/agg_utils.h +++ b/be/src/pipeline/common/agg_utils.h @@ -22,29 +22,20 @@ #include "vec/common/arena.h" #include "vec/common/hash_table/hash_map_context.h" -#include "vec/common/hash_table/hash_map_context_creator.h" #include 
"vec/common/hash_table/hash_map_util.h" #include "vec/common/hash_table/ph_hash_map.h" #include "vec/common/hash_table/string_hash_map.h" namespace doris { -namespace pipeline { + +template +using AggData = PHHashMap>; +template +using AggDataNullable = vectorized::DataWithNullKey>; using AggregatedDataWithoutKey = vectorized::AggregateDataPtr; using AggregatedDataWithStringKey = PHHashMap; using AggregatedDataWithShortStringKey = StringHashMap; -using AggregatedDataWithUInt8Key = PHHashMap; -using AggregatedDataWithUInt16Key = PHHashMap; -using AggregatedDataWithUInt32Key = - PHHashMap>; -using AggregatedDataWithUInt64Key = - PHHashMap>; -using AggregatedDataWithUInt128Key = PHHashMap>; -using AggregatedDataWithUInt256Key = PHHashMap>; -using AggregatedDataWithUInt136Key = PHHashMap>; using AggregatedDataWithUInt32KeyPhase2 = PHHashMap>; -using AggregatedDataWithUInt128KeyPhase2 = - PHHashMap>; -using AggregatedDataWithUInt256KeyPhase2 = - PHHashMap>; - -using AggregatedDataWithUInt136KeyPhase2 = - PHHashMap>; - -using AggregatedDataWithNullableUInt8Key = vectorized::DataWithNullKey; -using AggregatedDataWithNullableUInt16Key = - vectorized::DataWithNullKey; -using AggregatedDataWithNullableUInt32Key = - vectorized::DataWithNullKey; -using AggregatedDataWithNullableUInt64Key = - vectorized::DataWithNullKey; + using AggregatedDataWithNullableUInt32KeyPhase2 = vectorized::DataWithNullKey; using AggregatedDataWithNullableUInt64KeyPhase2 = vectorized::DataWithNullKey; using AggregatedDataWithNullableShortStringKey = vectorized::DataWithNullKey; -using AggregatedDataWithNullableUInt128Key = - vectorized::DataWithNullKey; -using AggregatedDataWithNullableUInt128KeyPhase2 = - vectorized::DataWithNullKey; using AggregatedMethodVariants = std::variant< std::monostate, vectorized::MethodSerialized, - vectorized::MethodOneNumber, - vectorized::MethodOneNumber, - vectorized::MethodOneNumber, - vectorized::MethodOneNumber, + vectorized::MethodOneNumber>, + vectorized::MethodOneNumber>, + vectorized::MethodOneNumber>, + vectorized::MethodOneNumber>, vectorized::MethodStringNoCache, - vectorized::MethodOneNumber, + vectorized::MethodOneNumber>, + vectorized::MethodOneNumber>, vectorized::MethodOneNumber, vectorized::MethodOneNumber, - vectorized::MethodOneNumber, vectorized::MethodSingleNullableColumn< - vectorized::MethodOneNumber>, + vectorized::MethodOneNumber>>, vectorized::MethodSingleNullableColumn>, + vectorized::UInt16, AggDataNullable>>, vectorized::MethodSingleNullableColumn>, + vectorized::UInt32, AggDataNullable>>, vectorized::MethodSingleNullableColumn>, + vectorized::UInt64, AggDataNullable>>, vectorized::MethodSingleNullableColumn>, vectorized::MethodSingleNullableColumn>, vectorized::MethodSingleNullableColumn>, + vectorized::UInt128, AggDataNullable>>, vectorized::MethodSingleNullableColumn>, + vectorized::UInt256, AggDataNullable>>, vectorized::MethodSingleNullableColumn< vectorized::MethodStringNoCache>, - vectorized::MethodKeysFixed, - vectorized::MethodKeysFixed, - vectorized::MethodKeysFixed, - vectorized::MethodKeysFixed, - vectorized::MethodKeysFixed, - vectorized::MethodKeysFixed, - vectorized::MethodKeysFixed, - vectorized::MethodKeysFixed, - vectorized::MethodKeysFixed, - vectorized::MethodKeysFixed, - vectorized::MethodKeysFixed, - vectorized::MethodKeysFixed, - vectorized::MethodKeysFixed, - vectorized::MethodKeysFixed, - vectorized::MethodKeysFixed, - vectorized::MethodKeysFixed>; + vectorized::MethodKeysFixed>, + vectorized::MethodKeysFixed>, + 
vectorized::MethodKeysFixed>, + vectorized::MethodKeysFixed>>; struct AggregatedDataVariants - : public vectorized::DataVariants { + : public DataVariants { AggregatedDataWithoutKey without_key = nullptr; - template - void init(Type type) { - _type = type; - switch (_type) { - case Type::without_key: + void init(const std::vector& data_types, HashKeyType type) { + bool nullable = data_types.size() == 1 && data_types[0]->is_nullable(); + + switch (type) { + case HashKeyType::without_key: break; - case Type::serialized: + case HashKeyType::serialized: method_variant.emplace>(); break; - case Type::int8_key: - emplace_single(); + case HashKeyType::int8_key: + emplace_single>(nullable); break; - case Type::int16_key: - emplace_single(); + case HashKeyType::int16_key: + emplace_single>(nullable); break; - case Type::int32_key: - emplace_single(); + case HashKeyType::int32_key: + emplace_single>(nullable); break; - case Type::int32_key_phase2: - emplace_single(); + case HashKeyType::int32_key_phase2: + emplace_single(nullable); break; - case Type::int64_key: - emplace_single(); + case HashKeyType::int64_key: + emplace_single>(nullable); break; - case Type::int64_key_phase2: - emplace_single(); + case HashKeyType::int64_key_phase2: + emplace_single(nullable); break; - case Type::int128_key: - emplace_single(); + case HashKeyType::int128_key: + emplace_single>(nullable); break; - case Type::int128_key_phase2: - emplace_single(); + case HashKeyType::int256_key: + emplace_single>(nullable); break; - case Type::string_key: + case HashKeyType::string_key: if (nullable) { method_variant.emplace< vectorized::MethodSingleNullableColumn>(); } break; + case HashKeyType::fixed64: + method_variant.emplace>>( + get_key_sizes(data_types)); + break; + case HashKeyType::fixed128: + method_variant.emplace>>( + get_key_sizes(data_types)); + break; + case HashKeyType::fixed136: + method_variant.emplace>>( + get_key_sizes(data_types)); + break; + case HashKeyType::fixed256: + method_variant.emplace>>( + get_key_sizes(data_types)); + break; default: - throw Exception(ErrorCode::INTERNAL_ERROR, "meet invalid key type, type={}", type); - } - } - - void init(Type type, bool is_nullable = false) { - if (is_nullable) { - init(type); - } else { - init(type); + throw Exception(ErrorCode::INTERNAL_ERROR, + "AggregatedDataVariants meet invalid key type, type={}", type); } } }; @@ -210,7 +175,7 @@ struct AggregateDataContainer { } *reinterpret_cast(_current_keys) = key; - auto aggregate_data = _current_agg_data; + auto* aggregate_data = _current_agg_data; ++_total_count; ++_index_in_sub_container; _current_agg_data += _size_of_aggregate_states; @@ -275,15 +240,15 @@ struct AggregateDataContainer { using IteratorBase::IteratorBase; }; - ConstIterator begin() const { return ConstIterator(this, 0); } + ConstIterator begin() const { return {this, 0}; } ConstIterator cbegin() const { return begin(); } - Iterator begin() { return Iterator(this, 0); } + Iterator begin() { return {this, 0}; } - ConstIterator end() const { return ConstIterator(this, _total_count); } + ConstIterator end() const { return {this, _total_count}; } ConstIterator cend() const { return end(); } - Iterator end() { return Iterator(this, _total_count); } + Iterator end() { return {this, _total_count}; } void init_once() { if (_inited) { @@ -331,10 +296,4 @@ struct AggregateDataContainer { uint32_t _total_count {}; bool _inited = false; }; - -} // namespace pipeline - -constexpr auto init_agg_hash_method = - init_hash_method; - } // namespace doris diff --git 
a/be/src/pipeline/common/distinct_agg_utils.h b/be/src/pipeline/common/distinct_agg_utils.h new file mode 100644 index 00000000000000..806039d5a36a4b --- /dev/null +++ b/be/src/pipeline/common/distinct_agg_utils.h @@ -0,0 +1,146 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include + +#include "vec/common/arena.h" +#include "vec/common/hash_table/hash_map_context.h" +#include "vec/common/hash_table/hash_map_util.h" +#include "vec/common/hash_table/ph_hash_map.h" +#include "vec/common/hash_table/ph_hash_set.h" +#include "vec/common/hash_table/string_hash_map.h" + +namespace doris { +template +using DistinctData = PHHashSet>; + +template +using DistinctDataPhase2 = PHHashSet>; + +using DistinctDataWithStringKey = PHHashSet; + +// todo: Need to implement StringHashSet like StringHashMap +using DistinctDataWithShortStringKey = PHHashSet; + +using DistinctMethodVariants = std::variant< + std::monostate, vectorized::MethodSerialized, + vectorized::MethodOneNumber>, + vectorized::MethodOneNumber>, + vectorized::MethodOneNumber>, + vectorized::MethodOneNumber>, + vectorized::MethodStringNoCache, + vectorized::MethodOneNumber>, + vectorized::MethodOneNumber>, + vectorized::MethodOneNumber>, + vectorized::MethodOneNumber>, + vectorized::MethodSingleNullableColumn>>>, + vectorized::MethodSingleNullableColumn>>>, + vectorized::MethodSingleNullableColumn>>>, + vectorized::MethodSingleNullableColumn>>>, + vectorized::MethodSingleNullableColumn>>>, + vectorized::MethodSingleNullableColumn>>>, + vectorized::MethodSingleNullableColumn>>>, + vectorized::MethodSingleNullableColumn>>>, + vectorized::MethodSingleNullableColumn>>, + vectorized::MethodKeysFixed>, + vectorized::MethodKeysFixed>, + vectorized::MethodKeysFixed>, + vectorized::MethodKeysFixed>>; + +struct DistinctDataVariants + : public DataVariants { + void init(const std::vector& data_types, HashKeyType type) { + bool nullable = data_types.size() == 1 && data_types[0]->is_nullable(); + switch (type) { + case HashKeyType::serialized: + method_variant.emplace>(); + break; + case HashKeyType::int8_key: + emplace_single>(nullable); + break; + case HashKeyType::int16_key: + emplace_single>(nullable); + break; + case HashKeyType::int32_key: + emplace_single>(nullable); + break; + case HashKeyType::int32_key_phase2: + emplace_single>(nullable); + break; + case HashKeyType::int64_key: + emplace_single>(nullable); + break; + case HashKeyType::int64_key_phase2: + emplace_single>(nullable); + break; + case HashKeyType::int128_key: + emplace_single>(nullable); + break; + case HashKeyType::int256_key: + emplace_single>(nullable); + break; + case HashKeyType::string_key: + if (nullable) { + method_variant.emplace< + vectorized::MethodSingleNullableColumn>>>(); + } else { + 
method_variant + .emplace>(); + } + break; + case HashKeyType::fixed64: + method_variant.emplace>>( + get_key_sizes(data_types)); + break; + case HashKeyType::fixed128: + method_variant.emplace>>( + get_key_sizes(data_types)); + break; + case HashKeyType::fixed136: + method_variant.emplace>>( + get_key_sizes(data_types)); + break; + case HashKeyType::fixed256: + method_variant.emplace>>( + get_key_sizes(data_types)); + break; + default: + throw Exception(ErrorCode::INTERNAL_ERROR, + "AggregatedDataVariants meet invalid key type, type={}", type); + } + } +}; + +} // namespace doris diff --git a/be/src/pipeline/common/join_utils.h b/be/src/pipeline/common/join_utils.h index 7fcf669d42e7dc..e214d1a52931a9 100644 --- a/be/src/pipeline/common/join_utils.h +++ b/be/src/pipeline/common/join_utils.h @@ -20,10 +20,9 @@ #include #include -#include "vec/common/hash_table/hash_map_context_creator.h" #include "vec/common/hash_table/hash_map_util.h" -namespace doris::pipeline { +namespace doris { using JoinOpVariants = std::variant, std::integral_constant, @@ -37,32 +36,77 @@ using JoinOpVariants = std::integral_constant, std::integral_constant>; -using I8HashTableContext = vectorized::PrimaryTypeHashTableContext; -using I16HashTableContext = vectorized::PrimaryTypeHashTableContext; -using I32HashTableContext = vectorized::PrimaryTypeHashTableContext; -using I64HashTableContext = vectorized::PrimaryTypeHashTableContext; -using I128HashTableContext = vectorized::PrimaryTypeHashTableContext; -using I256HashTableContext = vectorized::PrimaryTypeHashTableContext; -using MethodOneString = vectorized::MethodStringNoCache>; -template -using I64FixedKeyHashTableContext = vectorized::FixedKeyHashTableContext; +template +using PrimaryTypeHashTableContext = vectorized::MethodOneNumber>>; + +template +using FixedKeyHashTableContext = vectorized::MethodKeysFixed>>; -template -using I128FixedKeyHashTableContext = vectorized::FixedKeyHashTableContext; +using SerializedHashTableContext = vectorized::MethodSerialized>; +using MethodOneString = vectorized::MethodStringNoCache>; -template -using I256FixedKeyHashTableContext = vectorized::FixedKeyHashTableContext; +using HashTableVariants = std::variant< + std::monostate, SerializedHashTableContext, PrimaryTypeHashTableContext, + PrimaryTypeHashTableContext, + PrimaryTypeHashTableContext, + PrimaryTypeHashTableContext, + PrimaryTypeHashTableContext, + PrimaryTypeHashTableContext, + FixedKeyHashTableContext, FixedKeyHashTableContext, + FixedKeyHashTableContext, + FixedKeyHashTableContext, MethodOneString>; -template -using I136FixedKeyHashTableContext = vectorized::FixedKeyHashTableContext; +struct JoinDataVariants { + HashTableVariants method_variant; -using HashTableVariants = - std::variant, - I64FixedKeyHashTableContext, I128FixedKeyHashTableContext, - I128FixedKeyHashTableContext, I256FixedKeyHashTableContext, - I256FixedKeyHashTableContext, I136FixedKeyHashTableContext, - I136FixedKeyHashTableContext, MethodOneString>; + void init(const std::vector& data_types, HashKeyType type) { + // todo: support single column nullable context + switch (type) { + case HashKeyType::serialized: + method_variant.emplace(); + break; + case HashKeyType::int8_key: + method_variant.emplace>(); + break; + case HashKeyType::int16_key: + method_variant.emplace>(); + break; + case HashKeyType::int32_key: + method_variant.emplace>(); + break; + case HashKeyType::int64_key: + method_variant.emplace>(); + break; + case HashKeyType::int128_key: + method_variant.emplace>(); + break; + case 
HashKeyType::int256_key: + method_variant.emplace>(); + break; + case HashKeyType::string_key: + method_variant.emplace(); + break; + case HashKeyType::fixed64: + method_variant.emplace>( + get_key_sizes(data_types)); + break; + case HashKeyType::fixed128: + method_variant.emplace>( + get_key_sizes(data_types)); + break; + case HashKeyType::fixed136: + method_variant.emplace>( + get_key_sizes(data_types)); + break; + case HashKeyType::fixed256: + method_variant.emplace>( + get_key_sizes(data_types)); + break; + default: + throw Exception(ErrorCode::INTERNAL_ERROR, + "JoinDataVariants meet invalid key type, type={}", type); + } + } +}; -} // namespace doris::pipeline +} // namespace doris diff --git a/be/src/pipeline/common/partition_sort_utils.cpp b/be/src/pipeline/common/partition_sort_utils.cpp new file mode 100644 index 00000000000000..b9656ac2a52d13 --- /dev/null +++ b/be/src/pipeline/common/partition_sort_utils.cpp @@ -0,0 +1,95 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "pipeline/common/partition_sort_utils.h" + +namespace doris { + +Status PartitionBlocks::append_block_by_selector(const vectorized::Block* input_block, bool eos) { + if (_blocks.empty() || reach_limit()) { + _init_rows = _partition_sort_info->_runtime_state->batch_size(); + _blocks.push_back(vectorized::Block::create_unique( + vectorized::VectorizedUtils::create_empty_block(_partition_sort_info->_row_desc))); + } + auto columns = input_block->get_columns(); + auto mutable_columns = _blocks.back()->mutate_columns(); + DCHECK(columns.size() == mutable_columns.size()); + for (int i = 0; i < mutable_columns.size(); ++i) { + columns[i]->append_data_by_selector(mutable_columns[i], _selector); + } + _blocks.back()->set_columns(std::move(mutable_columns)); + auto selector_rows = _selector.size(); + _init_rows = _init_rows - selector_rows; + _total_rows = _total_rows + selector_rows; + _current_input_rows = _current_input_rows + selector_rows; + _selector.clear(); + // maybe better could change by user PARTITION_SORT_ROWS_THRESHOLD + if (!eos && _partition_sort_info->_partition_inner_limit != -1 && + _current_input_rows >= PARTITION_SORT_ROWS_THRESHOLD && + _partition_sort_info->_topn_phase != TPartTopNPhase::TWO_PHASE_GLOBAL) { + create_or_reset_sorter_state(); + RETURN_IF_ERROR(do_partition_topn_sort()); + _current_input_rows = 0; // reset record + _do_partition_topn_count++; + } + return Status::OK(); +} + +void PartitionBlocks::create_or_reset_sorter_state() { + if (_partition_topn_sorter == nullptr) { + _previous_row = std::make_unique(); + _partition_topn_sorter = vectorized::PartitionSorter::create_unique( + *_partition_sort_info->_vsort_exec_exprs, _partition_sort_info->_limit, + _partition_sort_info->_offset, _partition_sort_info->_pool, + 
+                _partition_sort_info->_is_asc_order, _partition_sort_info->_nulls_first,
+                _partition_sort_info->_row_desc, _partition_sort_info->_runtime_state,
+                _is_first_sorter ? _partition_sort_info->_runtime_profile : nullptr,
+                _partition_sort_info->_has_global_limit,
+                _partition_sort_info->_partition_inner_limit,
+                _partition_sort_info->_top_n_algorithm, _previous_row.get());
+        _partition_topn_sorter->init_profile(_partition_sort_info->_runtime_profile);
+    } else {
+        _partition_topn_sorter->reset_sorter_state(_partition_sort_info->_runtime_state);
+    }
+}
+
+Status PartitionBlocks::do_partition_topn_sort() {
+    for (const auto& block : _blocks) {
+        RETURN_IF_ERROR(_partition_topn_sorter->append_block(block.get()));
+    }
+    _blocks.clear();
+    RETURN_IF_ERROR(_partition_topn_sorter->prepare_for_read());
+    bool current_eos = false;
+    size_t current_output_rows = 0;
+    while (!current_eos) {
+        // output_block maybe need better way
+        auto output_block = vectorized::Block::create_unique(
+                vectorized::VectorizedUtils::create_empty_block(_partition_sort_info->_row_desc));
+        RETURN_IF_ERROR(_partition_topn_sorter->get_next(_partition_sort_info->_runtime_state,
+                                                         output_block.get(), &current_eos));
+        auto rows = output_block->rows();
+        if (rows > 0) {
+            current_output_rows += rows;
+            _blocks.emplace_back(std::move(output_block));
+        }
+    }
+
+    _topn_filter_rows += (_current_input_rows - current_output_rows);
+    return Status::OK();
+}
+
+} // namespace doris
diff --git a/be/src/pipeline/common/partition_sort_utils.h b/be/src/pipeline/common/partition_sort_utils.h
new file mode 100644
index 00000000000000..9317a783ba68bf
--- /dev/null
+++ b/be/src/pipeline/common/partition_sort_utils.h
@@ -0,0 +1,227 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
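For orientation, the control flow that partition_sort_utils.cpp implements above condenses to the sketch below. This is an illustrative restatement using the patch's own names (PartitionBlocks, PARTITION_SORT_ROWS_THRESHOLD, TPartTopNPhase); the helpers copy_rows_by_selector(), selector_size and has_partition_inner_limit() are hypothetical stand-ins for the inline logic, not functions from the patch:

    // Buffer selected rows per partition; for intermediate (non TWO_PHASE_GLOBAL)
    // top-n phases, eagerly shrink the buffered blocks once enough rows arrived.
    Status sink_rows(const vectorized::Block* input, bool eos) {
        copy_rows_by_selector(input);                  // _selector -> _blocks.back()
        _current_input_rows += selector_size;
        if (!eos && has_partition_inner_limit() &&
            _current_input_rows >= PARTITION_SORT_ROWS_THRESHOLD &&
            _partition_sort_info->_topn_phase != TPartTopNPhase::TWO_PHASE_GLOBAL) {
            create_or_reset_sorter_state();            // lazily creates PartitionSorter
            RETURN_IF_ERROR(do_partition_topn_sort()); // sort _blocks, keep only top rows
            _current_input_rows = 0;                   // start counting the next batch
        }
        return Status::OK();
    }

The debug build lowers PARTITION_SORT_ROWS_THRESHOLD to 10 (see the header below) so this early top-n path is exercised by small regression cases.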
+
+#pragma once
+
+#include <memory>
+#include <vector>
+
+#include "vec/common/arena.h"
+#include "vec/common/hash_table/hash_map_context.h"
+#include "vec/common/hash_table/hash_map_util.h"
+#include "vec/common/hash_table/ph_hash_map.h"
+#include "vec/common/hash_table/string_hash_map.h"
+#include "vec/common/sort/partition_sorter.h"
+#include "vec/common/sort/vsort_exec_exprs.h"
+
+namespace doris {
+
+struct PartitionSortInfo {
+    ~PartitionSortInfo() = default;
+
+    PartitionSortInfo(vectorized::VSortExecExprs* vsort_exec_exprs, int64_t limit, int64_t offset,
+                      ObjectPool* pool, const std::vector<bool>& is_asc_order,
+                      const std::vector<bool>& nulls_first, const RowDescriptor& row_desc,
+                      RuntimeState* runtime_state, RuntimeProfile* runtime_profile,
+                      bool has_global_limit, int64_t partition_inner_limit,
+                      TopNAlgorithm::type top_n_algorithm, TPartTopNPhase::type topn_phase)
+            : _vsort_exec_exprs(vsort_exec_exprs),
+              _limit(limit),
+              _offset(offset),
+              _pool(pool),
+              _is_asc_order(is_asc_order),
+              _nulls_first(nulls_first),
+              _row_desc(row_desc),
+              _runtime_state(runtime_state),
+              _runtime_profile(runtime_profile),
+              _has_global_limit(has_global_limit),
+              _partition_inner_limit(partition_inner_limit),
+              _top_n_algorithm(top_n_algorithm),
+              _topn_phase(topn_phase) {}
+
+public:
+    vectorized::VSortExecExprs* _vsort_exec_exprs = nullptr;
+    int64_t _limit = -1;
+    int64_t _offset = 0;
+    ObjectPool* _pool = nullptr;
+    std::vector<bool> _is_asc_order;
+    std::vector<bool> _nulls_first;
+    const RowDescriptor& _row_desc;
+    RuntimeState* _runtime_state = nullptr;
+    RuntimeProfile* _runtime_profile = nullptr;
+    bool _has_global_limit = false;
+    int64_t _partition_inner_limit = 0;
+    TopNAlgorithm::type _top_n_algorithm = TopNAlgorithm::ROW_NUMBER;
+    TPartTopNPhase::type _topn_phase = TPartTopNPhase::TWO_PHASE_GLOBAL;
+};
+
+static constexpr size_t INITIAL_BUFFERED_BLOCK_BYTES = 64 << 20;
+
+#ifndef NDEBUG
+static constexpr size_t PARTITION_SORT_ROWS_THRESHOLD = 10;
+#else
+static constexpr size_t PARTITION_SORT_ROWS_THRESHOLD = 20000;
+#endif
+
+struct PartitionBlocks {
+public:
+    PartitionBlocks(std::shared_ptr<PartitionSortInfo> partition_sort_info, bool is_first_sorter)
+            : _is_first_sorter(is_first_sorter), _partition_sort_info(partition_sort_info) {}
+    ~PartitionBlocks() = default;
+
+    void add_row_idx(size_t row) { _selector.push_back(row); }
+
+    Status append_block_by_selector(const vectorized::Block* input_block, bool eos);
+
+    Status do_partition_topn_sort();
+
+    void create_or_reset_sorter_state();
+
+    void append_whole_block(vectorized::Block* input_block, const RowDescriptor& row_desc) {
+        auto empty_block = vectorized::Block::create_unique(
+                vectorized::VectorizedUtils::create_empty_block(row_desc));
+        empty_block->swap(*input_block);
+        _blocks.emplace_back(std::move(empty_block));
+    }
+
+    bool reach_limit() {
+        return _init_rows <= 0 || _blocks.back()->bytes() > INITIAL_BUFFERED_BLOCK_BYTES;
+    }
+
+    size_t get_total_rows() const { return _total_rows; }
+    size_t get_topn_filter_rows() const { return _topn_filter_rows; }
+    size_t get_do_topn_count() const { return _do_partition_topn_count; }
+
+    vectorized::IColumn::Selector _selector;
+    std::vector<std::unique_ptr<vectorized::Block>> _blocks;
+    size_t _total_rows = 0;
+    size_t _current_input_rows = 0;
+    size_t _topn_filter_rows = 0;
+    size_t _do_partition_topn_count = 0;
+    int _init_rows = 4096;
+    bool _is_first_sorter = false;
+
+    std::unique_ptr<vectorized::SortCursorCmp> _previous_row;
+    std::unique_ptr<vectorized::PartitionSorter> _partition_topn_sorter = nullptr;
+    std::shared_ptr<PartitionSortInfo> _partition_sort_info = nullptr;
+};
+
+using PartitionDataPtr = PartitionBlocks*;
+using PartitionDataWithStringKey = PHHashMap<StringRef, PartitionDataPtr>;
+using PartitionDataWithShortStringKey = StringHashMap<PartitionDataPtr>;
+
+template <typename T>
+using PartitionData = PHHashMap<T, PartitionDataPtr, HashCRC32<T>>;
+
+template <typename T>
+using PartitionDataSingle = vectorized::MethodOneNumber<T, PartitionData<T>>;
+
+template <typename T>
+using PartitionDataSingleNullable = vectorized::MethodSingleNullableColumn<
+        vectorized::MethodOneNumber<T, vectorized::DataWithNullKey<PartitionData<T>>>>;
+
+using PartitionedMethodVariants = std::variant<
+        std::monostate, vectorized::MethodSerialized<PartitionDataWithStringKey>,
+        PartitionDataSingle<vectorized::UInt8>, PartitionDataSingle<vectorized::UInt16>,
+        PartitionDataSingle<vectorized::UInt32>, PartitionDataSingle<vectorized::UInt64>,
+        PartitionDataSingle<vectorized::UInt128>, PartitionDataSingle<vectorized::UInt256>,
+        PartitionDataSingleNullable<vectorized::UInt8>,
+        PartitionDataSingleNullable<vectorized::UInt16>,
+        PartitionDataSingleNullable<vectorized::UInt32>,
+        PartitionDataSingleNullable<vectorized::UInt64>,
+        PartitionDataSingleNullable<vectorized::UInt128>,
+        PartitionDataSingleNullable<vectorized::UInt256>,
+        vectorized::MethodKeysFixed<PartitionData<vectorized::UInt64>>,
+        vectorized::MethodKeysFixed<PartitionData<vectorized::UInt128>>,
+        vectorized::MethodKeysFixed<PartitionData<vectorized::UInt136>>,
+        vectorized::MethodKeysFixed<PartitionData<vectorized::UInt256>>,
+        vectorized::MethodStringNoCache<PartitionDataWithShortStringKey>,
+        vectorized::MethodSingleNullableColumn<vectorized::MethodStringNoCache<
+                vectorized::DataWithNullKey<PartitionDataWithShortStringKey>>>>;
+
+struct PartitionedHashMapVariants
+        : public DataVariants<PartitionedMethodVariants, vectorized::MethodSingleNullableColumn,
+                              vectorized::MethodOneNumber, vectorized::DataWithNullKey> {
+    void init(const std::vector<vectorized::DataTypePtr>& data_types, HashKeyType type) {
+        bool nullable = data_types.size() == 1 && data_types[0]->is_nullable();
+        switch (type) {
+        case HashKeyType::without_key: {
+            break;
+        }
+        case HashKeyType::serialized: {
+            method_variant.emplace<vectorized::MethodSerialized<PartitionDataWithStringKey>>();
+            break;
+        }
+        case HashKeyType::int8_key: {
+            emplace_single<vectorized::UInt8, PartitionData<vectorized::UInt8>>(nullable);
+            break;
+        }
+        case HashKeyType::int16_key: {
+            emplace_single<vectorized::UInt16, PartitionData<vectorized::UInt16>>(nullable);
+            break;
+        }
+        case HashKeyType::int32_key: {
+            emplace_single<vectorized::UInt32, PartitionData<vectorized::UInt32>>(nullable);
+            break;
+        }
+        case HashKeyType::int64_key: {
+            emplace_single<vectorized::UInt64, PartitionData<vectorized::UInt64>>(nullable);
+            break;
+        }
+        case HashKeyType::int128_key: {
+            emplace_single<vectorized::UInt128, PartitionData<vectorized::UInt128>>(nullable);
+            break;
+        }
+        case HashKeyType::int256_key: {
+            emplace_single<vectorized::UInt256, PartitionData<vectorized::UInt256>>(nullable);
+            break;
+        }
+        case HashKeyType::string_key: {
+            if (nullable) {
+                method_variant.emplace<
+                        vectorized::MethodSingleNullableColumn<vectorized::MethodStringNoCache<
+                                vectorized::DataWithNullKey<PartitionDataWithShortStringKey>>>>();
+            } else {
+                method_variant.emplace<
+                        vectorized::MethodStringNoCache<PartitionDataWithShortStringKey>>();
+            }
+            break;
+        }
+        case HashKeyType::fixed64:
+            method_variant.emplace<vectorized::MethodKeysFixed<PartitionData<vectorized::UInt64>>>(
+                    get_key_sizes(data_types));
+            break;
+        case HashKeyType::fixed128:
+            method_variant.emplace<vectorized::MethodKeysFixed<PartitionData<vectorized::UInt128>>>(
+                    get_key_sizes(data_types));
+            break;
+        case HashKeyType::fixed136:
+            method_variant.emplace<vectorized::MethodKeysFixed<PartitionData<vectorized::UInt136>>>(
+                    get_key_sizes(data_types));
+            break;
+        case HashKeyType::fixed256:
+            method_variant.emplace<vectorized::MethodKeysFixed<PartitionData<vectorized::UInt256>>>(
+                    get_key_sizes(data_types));
+            break;
+        default:
+            throw Exception(ErrorCode::INTERNAL_ERROR,
+                            "PartitionedHashMapVariants meet invalid key type, type={}", type);
+        }
+    }
+};
+
+} // namespace doris
diff --git a/be/src/pipeline/common/runtime_filter_consumer.cpp b/be/src/pipeline/common/runtime_filter_consumer.cpp
index 817c76a79af47c..29279824964e68 100644
--- a/be/src/pipeline/common/runtime_filter_consumer.cpp
+++ b/be/src/pipeline/common/runtime_filter_consumer.cpp
@@ -76,7 +76,6 @@ void RuntimeFilterConsumer::init_runtime_filter_dependency(
         auto runtime_filter = _runtime_filter_ctxs[i].runtime_filter;
         runtime_filter_dependencies[i] = std::make_shared<pipeline::RuntimeFilterDependency>(
                 id, node_id, name, runtime_filter.get());
-        _runtime_filter_ctxs[i].runtime_filter_dependency = runtime_filter_dependencies[i].get();
         runtime_filter_timers[i] = std::make_shared<pipeline::RuntimeFilterTimer>(
                 runtime_filter->registration_time(), runtime_filter->wait_time_ms(),
                 runtime_filter_dependencies[i]);
diff --git a/be/src/pipeline/common/runtime_filter_consumer.h b/be/src/pipeline/common/runtime_filter_consumer.h
index 03868355875454..6d8978bc83e992 100644
--- a/be/src/pipeline/common/runtime_filter_consumer.h
+++ b/be/src/pipeline/common/runtime_filter_consumer.h
@@ -61,7 +61,6 @@ class RuntimeFilterConsumer {
         // set to true if this runtime filter is already applied to vconjunct_ctx_ptr
         bool apply_mark = false;
         std::shared_ptr<IRuntimeFilter> runtime_filter;
-        pipeline::RuntimeFilterDependency* runtime_filter_dependency = nullptr;
     };
 
     std::vector<RuntimeFilterContext> _runtime_filter_ctxs;
@@ -79,7 +78,7 @@ class RuntimeFilterConsumer {
 
     const RowDescriptor& _row_descriptor_ref;
 
-    VExprContextSPtrs& _conjuncts_ref;
+    vectorized::VExprContextSPtrs& _conjuncts_ref;
 
     // True means all runtime filters are applied to scanners
     bool _is_all_rf_applied = true;
diff --git a/be/src/pipeline/common/set_utils.h b/be/src/pipeline/common/set_utils.h
new file mode 100644
index 00000000000000..2caf5b7d0b814c
--- /dev/null
+++ b/be/src/pipeline/common/set_utils.h
@@ -0,0 +1,118 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <variant>
+#include <vector>
+
+#include "pipeline/exec/join/join_op.h"
+#include "vec/common/hash_table/hash_map_util.h"
+
+namespace doris {
+
+template <typename T>
+using SetData = PHHashMap<T, RowRefListWithFlags, HashCRC32<T>>;
+
+template <typename T>
+using SetFixedKeyHashTableContext = vectorized::MethodKeysFixed<SetData<T>>;
+
+template <typename T>
+using SetPrimaryTypeHashTableContext = vectorized::MethodOneNumber<T, SetData<T>>;
+
+template <typename T>
+using SetPrimaryTypeHashTableContextNullable = vectorized::MethodSingleNullableColumn<
+        vectorized::MethodOneNumber<T, vectorized::DataWithNullKey<SetData<T>>>>;
+
+using SetSerializedHashTableContext =
+        vectorized::MethodSerialized<SetData<StringRef>>;
+using SetMethodOneString =
+        vectorized::MethodStringNoCache<SetData<StringRef>>;
+
+using SetHashTableVariants =
+        std::variant<std::monostate, SetSerializedHashTableContext, SetMethodOneString,
+                     SetPrimaryTypeHashTableContextNullable<vectorized::UInt8>,
+                     SetPrimaryTypeHashTableContextNullable<vectorized::UInt16>,
+                     SetPrimaryTypeHashTableContextNullable<vectorized::UInt32>,
+                     SetPrimaryTypeHashTableContextNullable<vectorized::UInt64>,
+                     SetPrimaryTypeHashTableContextNullable<vectorized::UInt128>,
+                     SetPrimaryTypeHashTableContextNullable<vectorized::UInt256>,
+                     SetPrimaryTypeHashTableContext<vectorized::UInt8>,
+                     SetPrimaryTypeHashTableContext<vectorized::UInt16>,
+                     SetPrimaryTypeHashTableContext<vectorized::UInt32>,
+                     SetPrimaryTypeHashTableContext<vectorized::UInt64>,
+                     SetPrimaryTypeHashTableContext<vectorized::UInt128>,
+                     SetPrimaryTypeHashTableContext<vectorized::UInt256>,
+                     SetFixedKeyHashTableContext<vectorized::UInt64>,
+                     SetFixedKeyHashTableContext<vectorized::UInt128>,
+                     SetFixedKeyHashTableContext<vectorized::UInt136>,
+                     SetFixedKeyHashTableContext<vectorized::UInt256>>;
+
+struct SetDataVariants
+        : public DataVariants<SetHashTableVariants, vectorized::MethodSingleNullableColumn,
+                              vectorized::MethodOneNumber, vectorized::DataWithNullKey> {
+    void init(const std::vector<vectorized::DataTypePtr>& data_types, HashKeyType type) {
+        bool nullable = data_types.size() == 1 && data_types[0]->is_nullable();
+        switch (type) {
+        case HashKeyType::serialized:
+            method_variant.emplace<SetSerializedHashTableContext>();
+            break;
+        case HashKeyType::int8_key:
+            emplace_single<vectorized::UInt8, SetData<vectorized::UInt8>>(nullable);
+            break;
+        case HashKeyType::int16_key:
+            emplace_single<vectorized::UInt16, SetData<vectorized::UInt16>>(nullable);
+            break;
+        case HashKeyType::int32_key:
+            emplace_single<vectorized::UInt32, SetData<vectorized::UInt32>>(nullable);
+            break;
+        case HashKeyType::int64_key:
+            emplace_single<vectorized::UInt64, SetData<vectorized::UInt64>>(nullable);
+            break;
+        case HashKeyType::int128_key:
+            emplace_single<vectorized::UInt128, SetData<vectorized::UInt128>>(nullable);
+            break;
+        case HashKeyType::int256_key:
+            emplace_single<vectorized::UInt256, SetData<vectorized::UInt256>>(nullable);
+            break;
+        case HashKeyType::string_key:
+            method_variant.emplace<SetMethodOneString>();
+            break;
+        case HashKeyType::fixed64:
+            method_variant.emplace<SetFixedKeyHashTableContext<vectorized::UInt64>>(
+                    get_key_sizes(data_types));
+            break;
+        case HashKeyType::fixed128:
+            method_variant.emplace<SetFixedKeyHashTableContext<vectorized::UInt128>>(
+                    get_key_sizes(data_types));
+            break;
+        case HashKeyType::fixed136:
+            method_variant.emplace<SetFixedKeyHashTableContext<vectorized::UInt136>>(
+                    get_key_sizes(data_types));
+            break;
+        case HashKeyType::fixed256:
+            method_variant.emplace<SetFixedKeyHashTableContext<vectorized::UInt256>>(
+                    get_key_sizes(data_types));
+            break;
+        default:
+            throw Exception(ErrorCode::INTERNAL_ERROR,
+                            "SetDataVariants meet invalid key type, type={}", type);
+        }
+    }
+};
+
+} // namespace doris
diff --git a/be/src/pipeline/dependency.cpp b/be/src/pipeline/dependency.cpp
index 5f8d5c8c494142..5fef018423df25 100644
--- a/be/src/pipeline/dependency.cpp
+++ b/be/src/pipeline/dependency.cpp
@@ -32,15 +32,16 @@
 #include "vec/spill/spill_stream_manager.h"
 
 namespace doris::pipeline {
-
+#include "common/compile_check_begin.h"
 Dependency* BasicSharedState::create_source_dependency(int operator_id, int node_id,
-                                                       std::string name) {
+                                                       const std::string& name) {
     source_deps.push_back(std::make_shared<Dependency>(operator_id, node_id, name + "_DEPENDENCY"));
     source_deps.back()->set_shared_state(this);
     return source_deps.back().get();
 }
 
-Dependency* BasicSharedState::create_sink_dependency(int dest_id, int node_id, std::string name) {
+Dependency* BasicSharedState::create_sink_dependency(int dest_id, int node_id,
+                                                     const std::string& name) {
     sink_deps.push_back(std::make_shared<Dependency>(dest_id, node_id, name + "_DEPENDENCY", true));
     sink_deps.back()->set_shared_state(this);
     return sink_deps.back().get();
@@ -105,16 +106,6 @@ std::string RuntimeFilterDependency::debug_string(int indentation_level) {
     return fmt::to_string(debug_string_buffer);
 }
 
-Dependency* RuntimeFilterDependency::is_blocked_by(PipelineTask* task) {
-    std::unique_lock<std::mutex> lc(_task_lock);
-    auto ready = _ready.load();
-    if (!ready && task) {
-        _add_block_task(task);
-        task->_blocked_dep = this;
-    }
-    return ready ?
nullptr : this; -} - void RuntimeFilterTimer::call_timeout() { _parent->set_ready(); } @@ -199,7 +190,7 @@ void LocalExchangeSharedState::sub_running_source_operators( LocalExchangeSharedState::LocalExchangeSharedState(int num_instances) { source_deps.resize(num_instances, nullptr); - mem_trackers.resize(num_instances, nullptr); + mem_counters.resize(num_instances, nullptr); } vectorized::MutableColumns AggSharedState::_get_keys_hash_table() { @@ -267,8 +258,8 @@ bool AggSharedState::do_limit_filter(vectorized::Block* block, size_t num_rows, need_computes.data()); } - auto set_computes_arr = [](auto* __restrict res, auto* __restrict computes, int rows) { - for (int i = 0; i < rows; ++i) { + auto set_computes_arr = [](auto* __restrict res, auto* __restrict computes, size_t rows) { + for (size_t i = 0; i < rows; ++i) { computes[i] = computes[i] == res[i]; } }; @@ -413,4 +404,17 @@ Status SetSharedState::update_build_not_ignore_null(const vectorized::VExprConte return Status::OK(); } +Status SetSharedState::hash_table_init() { + std::vector data_types; + for (size_t i = 0; i != child_exprs_lists[0].size(); ++i) { + auto& ctx = child_exprs_lists[0][i]; + auto data_type = ctx->root()->data_type(); + if (build_not_ignore_null[i]) { + data_type = vectorized::make_nullable(data_type); + } + data_types.emplace_back(std::move(data_type)); + } + return init_hash_method(hash_table_variants.get(), data_types, true); +} + } // namespace doris::pipeline diff --git a/be/src/pipeline/dependency.h b/be/src/pipeline/dependency.h index 863458d3bdec3f..4cc3aceaeebdfa 100644 --- a/be/src/pipeline/dependency.h +++ b/be/src/pipeline/dependency.h @@ -17,6 +17,7 @@ #pragma once +#include #include #include @@ -27,10 +28,10 @@ #include #include "common/logging.h" -#include "concurrentqueue.h" #include "gutil/integral_types.h" #include "pipeline/common/agg_utils.h" #include "pipeline/common/join_utils.h" +#include "pipeline/common/set_utils.h" #include "pipeline/exec/data_queue.h" #include "pipeline/exec/join/process_hash_table_probe.h" #include "vec/common/sort/partition_sorter.h" @@ -45,7 +46,7 @@ class VSlotRef; } // namespace doris::vectorized namespace doris::pipeline { - +#include "common/compile_check_begin.h" class Dependency; class PipelineTask; struct BasicSharedState; @@ -80,17 +81,15 @@ struct BasicSharedState { virtual ~BasicSharedState() = default; - Dependency* create_source_dependency(int operator_id, int node_id, std::string name); + Dependency* create_source_dependency(int operator_id, int node_id, const std::string& name); - Dependency* create_sink_dependency(int dest_id, int node_id, std::string name); + Dependency* create_sink_dependency(int dest_id, int node_id, const std::string& name); }; class Dependency : public std::enable_shared_from_this { public: ENABLE_FACTORY_CREATOR(Dependency); - Dependency(int id, int node_id, std::string name) - : _id(id), _node_id(node_id), _name(std::move(name)), _ready(false) {} - Dependency(int id, int node_id, std::string name, bool ready) + Dependency(int id, int node_id, std::string name, bool ready = false) : _id(id), _node_id(node_id), _name(std::move(name)), _ready(ready) {} virtual ~Dependency() = default; @@ -110,19 +109,19 @@ class Dependency : public std::enable_shared_from_this { // Notify downstream pipeline tasks this dependency is ready. 
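    // (Illustrative sketch by the editor, not part of this patch.) The intended
    // protocol around set_ready()/block() is: a PipelineTask asks each of its
    // dependencies `is_blocked_by(task)`; a dependency that is not ready records
    // the task via _add_block_task() and returns itself, parking the task until a
    // producer calls set_ready(), which wakes every recorded task. Roughly:
    //
    //     if (Dependency* dep = dependency->is_blocked_by(task)) {
    //         return;            // parked; dep->set_ready() reschedules the task
    //     }
    //     // all dependencies ready: safe to touch the shared state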
void set_ready(); void set_ready_to_read() { - DCHECK(_shared_state->source_deps.size() == 1) << debug_string(); + DCHECK_EQ(_shared_state->source_deps.size(), 1) << debug_string(); _shared_state->source_deps.front()->set_ready(); } void set_block_to_read() { - DCHECK(_shared_state->source_deps.size() == 1) << debug_string(); + DCHECK_EQ(_shared_state->source_deps.size(), 1) << debug_string(); _shared_state->source_deps.front()->block(); } void set_ready_to_write() { - DCHECK(_shared_state->sink_deps.size() == 1) << debug_string(); + DCHECK_EQ(_shared_state->sink_deps.size(), 1) << debug_string(); _shared_state->sink_deps.front()->set_ready(); } void set_block_to_write() { - DCHECK(_shared_state->sink_deps.size() == 1) << debug_string(); + DCHECK_EQ(_shared_state->sink_deps.size(), 1) << debug_string(); _shared_state->sink_deps.front()->block(); } @@ -173,7 +172,7 @@ struct FakeSharedState final : public BasicSharedState { ENABLE_FACTORY_CREATOR(FakeSharedState) }; -struct CountedFinishDependency final : public Dependency { +class CountedFinishDependency final : public Dependency { public: using SharedState = FakeSharedState; CountedFinishDependency(int id, int node_id, std::string name) @@ -277,8 +276,6 @@ class RuntimeFilterDependency final : public Dependency { : Dependency(id, node_id, name), _runtime_filter(runtime_filter) {} std::string debug_string(int indentation_level = 0) override; - Dependency* is_blocked_by(PipelineTask* task) override; - private: const IRuntimeFilter* _runtime_filter = nullptr; }; @@ -324,11 +321,6 @@ struct AggSharedState : public BasicSharedState { vectorized::Sizes offsets_of_aggregate_states; std::vector make_nullable_keys; - struct MemoryRecord { - int64_t used_in_arena {}; - int64_t used_in_state {}; - }; - MemoryRecord mem_usage_record; bool agg_data_created_without_key = false; bool enable_spill = false; bool reach_limit = false; @@ -508,7 +500,7 @@ struct SpillSortSharedState : public BasicSharedState, ~SpillSortSharedState() override = default; // This number specifies the maximum size of sub blocks - static constexpr int SORT_BLOCK_SPILL_BATCH_BYTES = 8 * 1024 * 1024; + static constexpr size_t SORT_BLOCK_SPILL_BATCH_BYTES = 8 * 1024 * 1024; void update_spill_block_batch_row_count(const vectorized::Block* block) { auto rows = block->rows(); if (rows > 0 && 0 == avg_row_bytes) { @@ -529,7 +521,7 @@ struct SpillSortSharedState : public BasicSharedState, std::deque sorted_streams; size_t avg_row_bytes = 0; - int spill_block_batch_row_count; + size_t spill_block_batch_row_count; }; struct UnionSharedState : public BasicSharedState { @@ -610,12 +602,13 @@ struct HashJoinSharedState : public JoinSharedState { ENABLE_FACTORY_CREATOR(HashJoinSharedState) // mark the join column whether support null eq std::vector is_null_safe_eq_join; + // mark the build hash table whether it needs to store null value - std::vector store_null_in_hash_table; + std::vector serialize_null_into_key; std::shared_ptr arena = std::make_shared(); // maybe share hash table with other fragment instances - std::shared_ptr hash_table_variants = std::make_shared(); + std::shared_ptr hash_table_variants = std::make_shared(); const std::vector build_side_child_desc; size_t build_exprs_size = 0; std::shared_ptr build_block; @@ -655,24 +648,6 @@ struct PartitionSortNodeSharedState : public BasicSharedState { std::mutex sink_eos_lock; }; -using SetHashTableVariants = - std::variant, - vectorized::SetPrimaryTypeHashTableContext, - vectorized::SetPrimaryTypeHashTableContext, - 
vectorized::SetPrimaryTypeHashTableContext, - vectorized::SetPrimaryTypeHashTableContext, - vectorized::SetPrimaryTypeHashTableContext, - vectorized::SetFixedKeyHashTableContext, - vectorized::SetFixedKeyHashTableContext, - vectorized::SetFixedKeyHashTableContext, - vectorized::SetFixedKeyHashTableContext, - vectorized::SetFixedKeyHashTableContext, - vectorized::SetFixedKeyHashTableContext, - vectorized::SetFixedKeyHashTableContext, - vectorized::SetFixedKeyHashTableContext>; - struct SetSharedState : public BasicSharedState { ENABLE_FACTORY_CREATOR(SetSharedState) public: @@ -687,7 +662,7 @@ struct SetSharedState : public BasicSharedState { //// shared static states (shared, decided in prepare/open...) /// init in setup_local_state - std::unique_ptr hash_table_variants = nullptr; // the real data HERE. + std::unique_ptr hash_table_variants = nullptr; // the real data HERE. std::vector build_not_ignore_null; // The SET operator's child might have different nullable attributes. @@ -699,7 +674,7 @@ struct SetSharedState : public BasicSharedState { std::vector child_exprs_lists; /// init in build side - int child_quantity; + size_t child_quantity; vectorized::VExprContextSPtrs build_child_exprs; std::vector probe_finished_children_dependency; @@ -709,64 +684,7 @@ struct SetSharedState : public BasicSharedState { std::atomic ready_for_read = false; /// called in setup_local_state - void hash_table_init() { - using namespace vectorized; - if (child_exprs_lists[0].size() == 1 && (!build_not_ignore_null[0])) { - // Single column optimization - switch (child_exprs_lists[0][0]->root()->result_type()) { - case TYPE_BOOLEAN: - case TYPE_TINYINT: - hash_table_variants->emplace>(); - break; - case TYPE_SMALLINT: - hash_table_variants->emplace>(); - break; - case TYPE_INT: - case TYPE_FLOAT: - case TYPE_DATEV2: - case TYPE_DECIMAL32: - hash_table_variants->emplace>(); - break; - case TYPE_BIGINT: - case TYPE_DOUBLE: - case TYPE_DATETIME: - case TYPE_DATE: - case TYPE_DECIMAL64: - case TYPE_DATETIMEV2: - hash_table_variants->emplace>(); - break; - case TYPE_CHAR: - case TYPE_VARCHAR: - case TYPE_STRING: { - hash_table_variants->emplace(); - break; - } - case TYPE_LARGEINT: - case TYPE_DECIMALV2: - case TYPE_DECIMAL128I: - hash_table_variants->emplace>(); - break; - default: - hash_table_variants->emplace(); - } - return; - } - - // here need to change type to nullable, because some case eg: - // (select 0) intersect (select null) the build side hash table should not - // ignore null value. - std::vector data_types; - for (int i = 0; i < child_exprs_lists[0].size(); i++) { - const auto& ctx = child_exprs_lists[0][i]; - data_types.emplace_back(build_not_ignore_null[i] - ? make_nullable(ctx->root()->data_type()) - : ctx->root()->data_type()); - } - if (!try_get_hash_map_context_fixed( - *hash_table_variants, data_types)) { - hash_table_variants->emplace(); - } - } + Status hash_table_init(); }; enum class ExchangeType : uint8_t { @@ -829,7 +747,7 @@ struct LocalExchangeSharedState : public BasicSharedState { LocalExchangeSharedState(int num_instances); ~LocalExchangeSharedState() override; std::unique_ptr exchanger {}; - std::vector mem_trackers; + std::vector mem_counters; std::atomic mem_usage = 0; // We need to make sure to add mem_usage first and then enqueue, otherwise sub mem_usage may cause negative mem_usage during concurrent dequeue. 
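    // (Illustrative sketch by the editor, not part of this patch.) The ordering
    // constraint above means a producer must do:
    //
    //     add_mem_usage(channel_id, bytes);   // 1. account the block first
    //     queue.enqueue(std::move(block));    // 2. then make it visible
    //
    // while a consumer calls sub_mem_usage(channel_id, bytes) only after a
    // successful dequeue; otherwise a concurrent dequeue could subtract before
    // the matching add and drive mem_usage transiently negative.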
std::mutex le_lock; @@ -865,13 +783,15 @@ struct LocalExchangeSharedState : public BasicSharedState { } void add_mem_usage(int channel_id, size_t delta, bool update_total_mem_usage = true) { - mem_trackers[channel_id]->consume(delta); + mem_counters[channel_id]->update(delta); if (update_total_mem_usage) { add_total_mem_usage(delta, channel_id); } } - void sub_mem_usage(int channel_id, size_t delta) { mem_trackers[channel_id]->release(delta); } + void sub_mem_usage(int channel_id, size_t delta) { + mem_counters[channel_id]->update(-(int64_t)delta); + } virtual void add_total_mem_usage(size_t delta, int channel_id) { if (mem_usage.fetch_add(delta) + delta > config::local_exchange_buffer_mem_limit) { @@ -944,5 +864,5 @@ struct LocalMergeExchangeSharedState : public LocalExchangeSharedState { std::vector _queues_mem_usage; const int64_t _each_queue_limit; }; - +#include "common/compile_check_end.h" } // namespace doris::pipeline diff --git a/be/src/pipeline/exec/aggregation_sink_operator.cpp b/be/src/pipeline/exec/aggregation_sink_operator.cpp index 0bf3f8c4e0931d..27400fba474eef 100644 --- a/be/src/pipeline/exec/aggregation_sink_operator.cpp +++ b/be/src/pipeline/exec/aggregation_sink_operator.cpp @@ -20,6 +20,7 @@ #include #include +#include "common/cast_set.h" #include "common/status.h" #include "pipeline/exec/operator.h" #include "runtime/primitive_type.h" @@ -57,23 +58,19 @@ Status AggSinkLocalState::init(RuntimeState* state, LocalSinkStateInfo& info) { _agg_data = Base::_shared_state->agg_data.get(); _agg_arena_pool = Base::_shared_state->agg_arena_pool.get(); _hash_table_size_counter = ADD_COUNTER(profile(), "HashTableSize", TUnit::UNIT); - _hash_table_memory_usage = ADD_CHILD_COUNTER_WITH_LEVEL(Base::profile(), "HashTable", - TUnit::BYTES, "MemoryUsage", 1); - _serialize_key_arena_memory_usage = Base::profile()->AddHighWaterMarkCounter( - "SerializeKeyArena", TUnit::BYTES, "MemoryUsage", 1); + _hash_table_memory_usage = + ADD_COUNTER_WITH_LEVEL(Base::profile(), "MemoryUsageHashTable", TUnit::BYTES, 1); + _serialize_key_arena_memory_usage = ADD_COUNTER_WITH_LEVEL( + Base::profile(), "MemoryUsageSerializeKeyArena", TUnit::BYTES, 1); _build_timer = ADD_TIMER(Base::profile(), "BuildTime"); - _serialize_key_timer = ADD_TIMER(Base::profile(), "SerializeKeyTime"); - _exec_timer = ADD_TIMER(Base::profile(), "ExecTime"); _merge_timer = ADD_TIMER(Base::profile(), "MergeTime"); _expr_timer = ADD_TIMER(Base::profile(), "ExprTime"); - _serialize_data_timer = ADD_TIMER(Base::profile(), "SerializeDataTime"); _deserialize_data_timer = ADD_TIMER(Base::profile(), "DeserializeAndMergeTime"); _hash_table_compute_timer = ADD_TIMER(Base::profile(), "HashTableComputeTime"); _hash_table_limit_compute_timer = ADD_TIMER(Base::profile(), "DoLimitComputeTime"); _hash_table_emplace_timer = ADD_TIMER(Base::profile(), "HashTableEmplaceTime"); _hash_table_input_counter = ADD_COUNTER(Base::profile(), "HashTableInputCount", TUnit::UNIT); - _max_row_size_counter = ADD_COUNTER(Base::profile(), "MaxRowSizeInBytes", TUnit::UNIT); return Status::OK(); } @@ -227,24 +224,17 @@ void AggSinkLocalState::_update_memusage_with_serialized_key() { }, [&](auto& agg_method) -> void { auto& data = *agg_method.hash_table; - auto arena_memory_usage = + int64_t arena_memory_usage = _agg_arena_pool->size() + - Base::_shared_state->aggregate_data_container->memory_usage() - - Base::_shared_state->mem_usage_record.used_in_arena; - Base::_mem_tracker->consume(arena_memory_usage); - Base::_mem_tracker->consume( - 
data.get_buffer_size_in_bytes() - - Base::_shared_state->mem_usage_record.used_in_state); - _serialize_key_arena_memory_usage->add(arena_memory_usage); - COUNTER_UPDATE( - _hash_table_memory_usage, - data.get_buffer_size_in_bytes() - - Base::_shared_state->mem_usage_record.used_in_state); - Base::_shared_state->mem_usage_record.used_in_state = - data.get_buffer_size_in_bytes(); - Base::_shared_state->mem_usage_record.used_in_arena = - _agg_arena_pool->size() + - Base::_shared_state->aggregate_data_container->memory_usage(); + _shared_state->aggregate_data_container->memory_usage(); + int64_t hash_table_memory_usage = data.get_buffer_size_in_bytes(); + + COUNTER_SET(_memory_used_counter, + arena_memory_usage + hash_table_memory_usage); + COUNTER_SET(_peak_memory_usage_counter, _memory_used_counter->value()); + + COUNTER_SET(_serialize_key_arena_memory_usage, arena_memory_usage); + COUNTER_SET(_hash_table_memory_usage, hash_table_memory_usage); }}, _agg_data->method_variant); } @@ -423,11 +413,10 @@ Status AggSinkLocalState::_merge_without_key(vectorized::Block* block) { } void AggSinkLocalState::_update_memusage_without_key() { - auto arena_memory_usage = - _agg_arena_pool->size() - Base::_shared_state->mem_usage_record.used_in_arena; - Base::_mem_tracker->consume(arena_memory_usage); - _serialize_key_arena_memory_usage->add(arena_memory_usage); - Base::_shared_state->mem_usage_record.used_in_arena = _agg_arena_pool->size(); + int64_t arena_memory_usage = _agg_arena_pool->size(); + COUNTER_SET(_memory_used_counter, arena_memory_usage); + COUNTER_SET(_peak_memory_usage_counter, arena_memory_usage); + COUNTER_SET(_serialize_key_arena_memory_usage, arena_memory_usage); } Status AggSinkLocalState::_execute_with_serialized_key(vectorized::Block* block) { @@ -562,8 +551,8 @@ void AggSinkLocalState::_emplace_into_hash_table(vectorized::AggregateDataPtr* p SCOPED_TIMER(_hash_table_emplace_timer); for (size_t i = 0; i < num_rows; ++i) { - places[i] = agg_method.lazy_emplace(state, i, creator, - creator_for_null_key); + places[i] = *agg_method.lazy_emplace(state, i, creator, + creator_for_null_key); } COUNTER_UPDATE(_hash_table_input_counter, num_rows); @@ -663,8 +652,8 @@ bool AggSinkLocalState::_emplace_into_hash_table_limit(vectorized::AggregateData SCOPED_TIMER(_hash_table_emplace_timer); for (i = 0; i < num_rows; ++i) { - places[i] = agg_method.lazy_emplace(state, i, creator, - creator_for_null_key); + places[i] = *agg_method.lazy_emplace(state, i, creator, + creator_for_null_key); } COUNTER_UPDATE(_hash_table_input_counter, num_rows); return true; @@ -702,9 +691,9 @@ void AggSinkLocalState::_find_in_hash_table(vectorized::AggregateDataPtr* places } Status AggSinkLocalState::_init_hash_method(const vectorized::VExprContextSPtrs& probe_exprs) { - RETURN_IF_ERROR( - init_agg_hash_method(_agg_data, probe_exprs, - Base::_parent->template cast()._is_first_phase)); + RETURN_IF_ERROR(init_hash_method( + _agg_data, get_data_types(probe_exprs), + Base::_parent->template cast()._is_first_phase)); return Status::OK(); } @@ -725,7 +714,10 @@ AggSinkOperatorX::AggSinkOperatorX(ObjectPool* pool, int operator_id, const TPla : tnode.agg_node.grouping_exprs), _is_colocate(tnode.agg_node.__isset.is_colocate && tnode.agg_node.is_colocate), _require_bucket_distribution(require_bucket_distribution), - _agg_fn_output_row_descriptor(descs, tnode.row_tuples, tnode.nullable_tuples) {} + _agg_fn_output_row_descriptor(descs, tnode.row_tuples, tnode.nullable_tuples), + _without_key(tnode.agg_node.grouping_exprs.empty()) { 
+ _is_serial_operator = tnode.__isset.is_serial_operator && tnode.is_serial_operator; +} Status AggSinkOperatorX::init(const TPlanNode& tnode, RuntimeState* state) { RETURN_IF_ERROR(DataSinkOperatorX::init(tnode, state)); @@ -742,7 +734,7 @@ Status AggSinkOperatorX::init(const TPlanNode& tnode, RuntimeState* state) { RETURN_IF_ERROR(vectorized::AggFnEvaluator::create( _pool, tnode.agg_node.aggregate_functions[i], tnode.agg_node.__isset.agg_sort_infos ? tnode.agg_node.agg_sort_infos[i] : dummy, - &evaluator)); + tnode.agg_node.grouping_exprs.empty(), &evaluator)); _aggregate_evaluators.push_back(evaluator); } @@ -823,7 +815,8 @@ Status AggSinkOperatorX::open(RuntimeState* state) { // check output type if (_needs_finalize) { RETURN_IF_ERROR(vectorized::AggFnEvaluator::check_agg_fn_output( - _probe_expr_ctxs.size(), _aggregate_evaluators, _agg_fn_output_row_descriptor)); + cast_set(_probe_expr_ctxs.size()), _aggregate_evaluators, + _agg_fn_output_row_descriptor)); } RETURN_IF_ERROR(vectorized::VExpr::open(_probe_expr_ctxs, state)); @@ -876,8 +869,6 @@ Status AggSinkLocalState::close(RuntimeState* state, Status exec_status) { std::vector tmp_deserialize_buffer; _deserialize_buffer.swap(tmp_deserialize_buffer); - Base::_mem_tracker->release(Base::_shared_state->mem_usage_record.used_in_state + - Base::_shared_state->mem_usage_record.used_in_arena); return Base::close(state, exec_status); } diff --git a/be/src/pipeline/exec/aggregation_sink_operator.h b/be/src/pipeline/exec/aggregation_sink_operator.h index 97440de3f09e4c..21ee640613789e 100644 --- a/be/src/pipeline/exec/aggregation_sink_operator.h +++ b/be/src/pipeline/exec/aggregation_sink_operator.h @@ -23,7 +23,7 @@ #include "runtime/exec_env.h" namespace doris::pipeline { - +#include "common/compile_check_begin.h" class AggSinkOperatorX; class AggSinkLocalState : public PipelineXSinkLocalState { @@ -102,14 +102,11 @@ class AggSinkLocalState : public PipelineXSinkLocalState { RuntimeProfile::Counter* _hash_table_input_counter = nullptr; RuntimeProfile::Counter* _build_timer = nullptr; RuntimeProfile::Counter* _expr_timer = nullptr; - RuntimeProfile::Counter* _serialize_key_timer = nullptr; RuntimeProfile::Counter* _merge_timer = nullptr; - RuntimeProfile::Counter* _serialize_data_timer = nullptr; RuntimeProfile::Counter* _deserialize_data_timer = nullptr; - RuntimeProfile::Counter* _max_row_size_counter = nullptr; RuntimeProfile::Counter* _hash_table_memory_usage = nullptr; RuntimeProfile::Counter* _hash_table_size_counter = nullptr; - RuntimeProfile::HighWaterMarkCounter* _serialize_key_arena_memory_usage = nullptr; + RuntimeProfile::Counter* _serialize_key_arena_memory_usage = nullptr; bool _should_limit_output = false; @@ -143,17 +140,15 @@ class AggSinkOperatorX final : public DataSinkOperatorX { DataDistribution required_data_distribution() const override { if (_probe_expr_ctxs.empty()) { - return _needs_finalize || DataSinkOperatorX::_child - ->ignore_data_distribution() - ? DataDistribution(ExchangeType::PASSTHROUGH) + return _needs_finalize + ? DataDistribution(ExchangeType::NOOP) : DataSinkOperatorX::required_data_distribution(); } - return _is_colocate && _require_bucket_distribution && !_followed_by_shuffled_join + return _is_colocate && _require_bucket_distribution && !_followed_by_shuffled_operator ? 
DataDistribution(ExchangeType::BUCKET_HASH_SHUFFLE, _partition_exprs) : DataDistribution(ExchangeType::HASH_SHUFFLE, _partition_exprs); } bool require_data_distribution() const override { return _is_colocate; } - bool require_shuffled_data_distribution() const override { return !_probe_expr_ctxs.empty(); } size_t get_revocable_mem_size(RuntimeState* state) const; AggregatedDataVariants* get_agg_data(RuntimeState* state) { @@ -204,8 +199,9 @@ class AggSinkOperatorX final : public DataSinkOperatorX { const std::vector _partition_exprs; const bool _is_colocate; const bool _require_bucket_distribution; - RowDescriptor _agg_fn_output_row_descriptor; + const bool _without_key; }; } // namespace doris::pipeline +#include "common/compile_check_end.h" diff --git a/be/src/pipeline/exec/aggregation_source_operator.cpp b/be/src/pipeline/exec/aggregation_source_operator.cpp index 3bdda31308ff86..9feb3493068f97 100644 --- a/be/src/pipeline/exec/aggregation_source_operator.cpp +++ b/be/src/pipeline/exec/aggregation_source_operator.cpp @@ -30,20 +30,18 @@ namespace doris::pipeline { AggLocalState::AggLocalState(RuntimeState* state, OperatorXBase* parent) : Base(state, parent), _get_results_timer(nullptr), - _serialize_result_timer(nullptr), _hash_table_iterate_timer(nullptr), _insert_keys_to_column_timer(nullptr), - _serialize_data_timer(nullptr) {} + _insert_values_to_column_timer(nullptr) {} Status AggLocalState::init(RuntimeState* state, LocalStateInfo& info) { RETURN_IF_ERROR(Base::init(state, info)); SCOPED_TIMER(exec_time_counter()); SCOPED_TIMER(_init_timer); _get_results_timer = ADD_TIMER(profile(), "GetResultsTime"); - _serialize_result_timer = ADD_TIMER(profile(), "SerializeResultTime"); _hash_table_iterate_timer = ADD_TIMER(profile(), "HashTableIterateTime"); _insert_keys_to_column_timer = ADD_TIMER(profile(), "InsertKeysToColumnTime"); - _serialize_data_timer = ADD_TIMER(profile(), "SerializeDataTime"); + _insert_values_to_column_timer = ADD_TIMER(profile(), "InsertValuesToColumnTime"); _merge_timer = ADD_TIMER(Base::profile(), "MergeTime"); _deserialize_data_timer = ADD_TIMER(Base::profile(), "DeserializeAndMergeTime"); @@ -58,7 +56,7 @@ Status AggLocalState::init(RuntimeState* state, LocalStateInfo& info) { std::placeholders::_1, std::placeholders::_2, std::placeholders::_3); } else { - _executor.get_result = std::bind(&AggLocalState::_serialize_without_key, this, + _executor.get_result = std::bind(&AggLocalState::_get_results_without_key, this, std::placeholders::_1, std::placeholders::_2, std::placeholders::_3); } @@ -69,8 +67,8 @@ Status AggLocalState::init(RuntimeState* state, LocalStateInfo& info) { std::placeholders::_2, std::placeholders::_3); } else { _executor.get_result = std::bind( - &AggLocalState::_serialize_with_serialized_key_result, this, - std::placeholders::_1, std::placeholders::_2, std::placeholders::_3); + &AggLocalState::_get_results_with_serialized_key, this, std::placeholders::_1, + std::placeholders::_2, std::placeholders::_3); } } @@ -94,18 +92,9 @@ Status AggLocalState::_create_agg_status(vectorized::AggregateDataPtr data) { return Status::OK(); } -Status AggLocalState::_destroy_agg_status(vectorized::AggregateDataPtr data) { - auto& shared_state = *Base::_shared_state; - for (int i = 0; i < shared_state.aggregate_evaluators.size(); ++i) { - shared_state.aggregate_evaluators[i]->function()->destroy( - data + shared_state.offsets_of_aggregate_states[i]); - } - return Status::OK(); -} - -Status AggLocalState::_serialize_with_serialized_key_result(RuntimeState* state, 
- vectorized::Block* block, bool* eos) { - SCOPED_TIMER(_serialize_result_timer); +Status AggLocalState::_get_results_with_serialized_key(RuntimeState* state, + vectorized::Block* block, bool* eos) { + SCOPED_TIMER(_get_results_timer); auto& shared_state = *_shared_state; size_t key_size = _shared_state->probe_expr_ctxs.size(); size_t agg_size = _shared_state->aggregate_evaluators.size(); @@ -125,7 +114,6 @@ Status AggLocalState::_serialize_with_serialized_key_result(RuntimeState* state, } } - SCOPED_TIMER(_get_results_timer); std::visit( vectorized::Overload { [&](std::monostate& arg) -> void { @@ -181,7 +169,7 @@ Status AggLocalState::_serialize_with_serialized_key_result(RuntimeState* state, } { - SCOPED_TIMER(_serialize_data_timer); + SCOPED_TIMER(_insert_values_to_column_timer); for (size_t i = 0; i < shared_state.aggregate_evaluators.size(); ++i) { value_data_types[i] = shared_state.aggregate_evaluators[i] ->function() @@ -333,13 +321,13 @@ Status AggLocalState::_get_with_serialized_key_result(RuntimeState* state, vecto return Status::OK(); } -Status AggLocalState::_serialize_without_key(RuntimeState* state, vectorized::Block* block, - bool* eos) { +Status AggLocalState::_get_results_without_key(RuntimeState* state, vectorized::Block* block, + bool* eos) { + SCOPED_TIMER(_get_results_timer); auto& shared_state = *_shared_state; // 1. `child(0)->rows_returned() == 0` mean not data from child // in level two aggregation node should return NULL result // level one aggregation node set `eos = true` return directly - SCOPED_TIMER(_serialize_result_timer); if (UNLIKELY(_shared_state->input_num_rows == 0)) { *eos = true; return Status::OK(); @@ -441,7 +429,9 @@ AggSourceOperatorX::AggSourceOperatorX(ObjectPool* pool, const TPlanNode& tnode, const DescriptorTbl& descs) : Base(pool, tnode, operator_id, descs), _needs_finalize(tnode.agg_node.need_finalize), - _without_key(tnode.agg_node.grouping_exprs.empty()) {} + _without_key(tnode.agg_node.grouping_exprs.empty()) { + _is_serial_operator = tnode.__isset.is_serial_operator && tnode.is_serial_operator; +} Status AggSourceOperatorX::get_block(RuntimeState* state, vectorized::Block* block, bool* eos) { auto& local_state = get_local_state(state); @@ -461,8 +451,6 @@ void AggLocalState::do_agg_limit(vectorized::Block* block, bool* eos) { vectorized::Block::filter_block_internal(block, _shared_state->need_computes); if (auto rows = block->rows()) { _num_rows_returned += rows; - COUNTER_UPDATE(_blocks_returned_counter, 1); - COUNTER_SET(_rows_returned_counter, _num_rows_returned); } } else { reached_limit(block, eos); @@ -470,8 +458,6 @@ void AggLocalState::do_agg_limit(vectorized::Block* block, bool* eos) { } else { if (auto rows = block->rows()) { _num_rows_returned += rows; - COUNTER_UPDATE(_blocks_returned_counter, 1); - COUNTER_SET(_rows_returned_counter, _num_rows_returned); } } } @@ -575,17 +561,6 @@ template Status AggSourceOperatorX::merge_with_serialized_key_helper( template Status AggSourceOperatorX::merge_with_serialized_key_helper( RuntimeState* state, vectorized::Block* block); -size_t AggLocalState::_get_hash_table_size() { - return std::visit( - vectorized::Overload {[&](std::monostate& arg) -> size_t { - throw doris::Exception(ErrorCode::INTERNAL_ERROR, - "uninited hash table"); - return 0; - }, - [&](auto& agg_method) { return agg_method.hash_table->size(); }}, - _shared_state->agg_data->method_variant); -} - void AggLocalState::_emplace_into_hash_table(vectorized::AggregateDataPtr* places, vectorized::ColumnRawPtrs& key_columns, 
size_t num_rows) { @@ -625,8 +600,8 @@ void AggLocalState::_emplace_into_hash_table(vectorized::AggregateDataPtr* place SCOPED_TIMER(_hash_table_emplace_timer); for (size_t i = 0; i < num_rows; ++i) { - places[i] = agg_method.lazy_emplace(state, i, creator, - creator_for_null_key); + places[i] = *agg_method.lazy_emplace(state, i, creator, + creator_for_null_key); } COUNTER_UPDATE(_hash_table_input_counter, num_rows); diff --git a/be/src/pipeline/exec/aggregation_source_operator.h b/be/src/pipeline/exec/aggregation_source_operator.h index a3824a381eb49c..6de2bf93dbc758 100644 --- a/be/src/pipeline/exec/aggregation_source_operator.h +++ b/be/src/pipeline/exec/aggregation_source_operator.h @@ -25,7 +25,7 @@ namespace doris { class RuntimeState; namespace pipeline { - +#include "common/compile_check_begin.h" class AggSourceOperatorX; class AggLocalState final : public PipelineXLocalState { @@ -47,13 +47,12 @@ class AggLocalState final : public PipelineXLocalState { friend class AggSourceOperatorX; Status _get_without_key_result(RuntimeState* state, vectorized::Block* block, bool* eos); - Status _serialize_without_key(RuntimeState* state, vectorized::Block* block, bool* eos); + Status _get_results_without_key(RuntimeState* state, vectorized::Block* block, bool* eos); Status _get_with_serialized_key_result(RuntimeState* state, vectorized::Block* block, bool* eos); - Status _serialize_with_serialized_key_result(RuntimeState* state, vectorized::Block* block, - bool* eos); + Status _get_results_with_serialized_key(RuntimeState* state, vectorized::Block* block, + bool* eos); Status _create_agg_status(vectorized::AggregateDataPtr data); - Status _destroy_agg_status(vectorized::AggregateDataPtr data); void _make_nullable_output_key(vectorized::Block* block) { if (block->rows() != 0) { auto& shared_state = *Base ::_shared_state; @@ -68,16 +67,14 @@ class AggLocalState final : public PipelineXLocalState { vectorized::ColumnRawPtrs& key_columns, size_t num_rows); void _emplace_into_hash_table(vectorized::AggregateDataPtr* places, vectorized::ColumnRawPtrs& key_columns, size_t num_rows); - size_t _get_hash_table_size(); vectorized::PODArray _places; std::vector _deserialize_buffer; RuntimeProfile::Counter* _get_results_timer = nullptr; - RuntimeProfile::Counter* _serialize_result_timer = nullptr; RuntimeProfile::Counter* _hash_table_iterate_timer = nullptr; RuntimeProfile::Counter* _insert_keys_to_column_timer = nullptr; - RuntimeProfile::Counter* _serialize_data_timer = nullptr; + RuntimeProfile::Counter* _insert_values_to_column_timer = nullptr; RuntimeProfile::Counter* _hash_table_compute_timer = nullptr; RuntimeProfile::Counter* _hash_table_emplace_timer = nullptr; @@ -122,3 +119,4 @@ class AggSourceOperatorX : public OperatorX { } // namespace pipeline } // namespace doris +#include "common/compile_check_end.h" diff --git a/be/src/pipeline/exec/analytic_sink_operator.cpp b/be/src/pipeline/exec/analytic_sink_operator.cpp index e9276e4fa048c8..abde34a1d0255b 100644 --- a/be/src/pipeline/exec/analytic_sink_operator.cpp +++ b/be/src/pipeline/exec/analytic_sink_operator.cpp @@ -30,9 +30,10 @@ Status AnalyticSinkLocalState::init(RuntimeState* state, LocalSinkStateInfo& inf RETURN_IF_ERROR(PipelineXSinkLocalState::init(state, info)); SCOPED_TIMER(exec_time_counter()); SCOPED_TIMER(_init_timer); - _blocks_memory_usage = - _profile->AddHighWaterMarkCounter("Blocks", TUnit::BYTES, "MemoryUsage", 1); - _evaluation_timer = ADD_TIMER(profile(), "EvaluationTime"); + _evaluation_timer = ADD_TIMER(profile(), 
"GetPartitionBoundTime"); + _compute_agg_data_timer = ADD_TIMER(profile(), "ComputeAggDataTime"); + _compute_partition_by_timer = ADD_TIMER(profile(), "ComputePartitionByTime"); + _compute_order_by_timer = ADD_TIMER(profile(), "ComputeOrderByTime"); return Status::OK(); } @@ -202,7 +203,9 @@ AnalyticSinkOperatorX::AnalyticSinkOperatorX(ObjectPool* pool, int operator_id, _require_bucket_distribution(require_bucket_distribution), _partition_exprs(tnode.__isset.distribute_expr_lists && require_bucket_distribution ? tnode.distribute_expr_lists[0] - : tnode.analytic_node.partition_exprs) {} + : tnode.analytic_node.partition_exprs) { + _is_serial_operator = tnode.__isset.is_serial_operator && tnode.is_serial_operator; +} Status AnalyticSinkOperatorX::init(const TPlanNode& tnode, RuntimeState* state) { RETURN_IF_ERROR(DataSinkOperatorX::init(tnode, state)); @@ -287,33 +290,41 @@ Status AnalyticSinkOperatorX::sink(doris::RuntimeState* state, vectorized::Block } } - for (size_t i = 0; i < _agg_functions_size; - ++i) { //insert _agg_input_columns, execute calculate for its - for (size_t j = 0; j < local_state._agg_expr_ctxs[i].size(); ++j) { - RETURN_IF_ERROR(_insert_range_column( - input_block, local_state._agg_expr_ctxs[i][j], - local_state._shared_state->agg_input_columns[i][j].get(), block_rows)); + { + SCOPED_TIMER(local_state._compute_agg_data_timer); + for (size_t i = 0; i < _agg_functions_size; + ++i) { //insert _agg_input_columns, execute calculate for its + for (size_t j = 0; j < local_state._agg_expr_ctxs[i].size(); ++j) { + RETURN_IF_ERROR(_insert_range_column( + input_block, local_state._agg_expr_ctxs[i][j], + local_state._shared_state->agg_input_columns[i][j].get(), block_rows)); + } } } - //record column idx in block - for (size_t i = 0; i < local_state._shared_state->partition_by_eq_expr_ctxs.size(); ++i) { - int result_col_id = -1; - RETURN_IF_ERROR(local_state._shared_state->partition_by_eq_expr_ctxs[i]->execute( - input_block, &result_col_id)); - DCHECK_GE(result_col_id, 0); - local_state._shared_state->partition_by_column_idxs[i] = result_col_id; + { + SCOPED_TIMER(local_state._compute_partition_by_timer); + for (size_t i = 0; i < local_state._shared_state->partition_by_eq_expr_ctxs.size(); ++i) { + int result_col_id = -1; + RETURN_IF_ERROR(local_state._shared_state->partition_by_eq_expr_ctxs[i]->execute( + input_block, &result_col_id)); + DCHECK_GE(result_col_id, 0); + local_state._shared_state->partition_by_column_idxs[i] = result_col_id; + } } - for (size_t i = 0; i < local_state._shared_state->order_by_eq_expr_ctxs.size(); ++i) { - int result_col_id = -1; - RETURN_IF_ERROR(local_state._shared_state->order_by_eq_expr_ctxs[i]->execute( - input_block, &result_col_id)); - DCHECK_GE(result_col_id, 0); - local_state._shared_state->ordey_by_column_idxs[i] = result_col_id; + { + SCOPED_TIMER(local_state._compute_order_by_timer); + for (size_t i = 0; i < local_state._shared_state->order_by_eq_expr_ctxs.size(); ++i) { + int result_col_id = -1; + RETURN_IF_ERROR(local_state._shared_state->order_by_eq_expr_ctxs[i]->execute( + input_block, &result_col_id)); + DCHECK_GE(result_col_id, 0); + local_state._shared_state->ordey_by_column_idxs[i] = result_col_id; + } } - local_state.mem_tracker()->consume(input_block->allocated_bytes()); - local_state._blocks_memory_usage->add(input_block->allocated_bytes()); + COUNTER_UPDATE(local_state._memory_used_counter, input_block->allocated_bytes()); + COUNTER_SET(local_state._peak_memory_usage_counter, local_state._memory_used_counter->value()); //TODO: 
if need improvement, the is a tips to maintain a free queue, //so the memory could reuse, no need to new/delete again; diff --git a/be/src/pipeline/exec/analytic_sink_operator.h b/be/src/pipeline/exec/analytic_sink_operator.h index cf2892eb7e6ceb..e04b220ee351e7 100644 --- a/be/src/pipeline/exec/analytic_sink_operator.h +++ b/be/src/pipeline/exec/analytic_sink_operator.h @@ -24,7 +24,7 @@ #include "pipeline/dependency.h" namespace doris { - +#include "common/compile_check_begin.h" namespace pipeline { class AnalyticSinkOperatorX; @@ -58,7 +58,9 @@ class AnalyticSinkLocalState : public PipelineXSinkLocalState _agg_expr_ctxs; }; @@ -80,18 +82,14 @@ class AnalyticSinkOperatorX final : public DataSinkOperatorX::required_data_distribution(); } bool require_data_distribution() const override { return true; } - bool require_shuffled_data_distribution() const override { - return !_partition_by_eq_expr_ctxs.empty() && _order_by_eq_expr_ctxs.empty(); - } private: Status _insert_range_column(vectorized::Block* block, const vectorized::VExprContextSPtr& expr, @@ -115,3 +113,4 @@ class AnalyticSinkOperatorX final : public DataSinkOperatorXAddHighWaterMarkCounter("Blocks", TUnit::BYTES, "MemoryUsage", 1); - _evaluation_timer = ADD_TIMER(profile(), "EvaluationTime"); + profile()->AddHighWaterMarkCounter("MemoryUsageBlocks", TUnit::BYTES, "", 1); + _evaluation_timer = ADD_TIMER(profile(), "GetPartitionBoundTime"); + _execute_timer = ADD_TIMER(profile(), "ExecuteTime"); + _get_next_timer = ADD_TIMER(profile(), "GetNextTime"); + _get_result_timer = ADD_TIMER(profile(), "GetResultsTime"); return Status::OK(); } @@ -233,12 +236,6 @@ Status AnalyticLocalState::open(RuntimeState* state) { std::placeholders::_1); } } - _executor.insert_result = - std::bind(&AnalyticLocalState::_insert_result_info, this, std::placeholders::_1); - _executor.execute = - std::bind(&AnalyticLocalState::_execute_for_win_func, this, std::placeholders::_1, - std::placeholders::_2, std::placeholders::_3, std::placeholders::_4); - _create_agg_status(); return Status::OK(); } @@ -282,6 +279,7 @@ void AnalyticLocalState::_destroy_agg_status() { void AnalyticLocalState::_execute_for_win_func(int64_t partition_start, int64_t partition_end, int64_t frame_start, int64_t frame_end) { + SCOPED_TIMER(_execute_timer); for (size_t i = 0; i < _agg_functions_size; ++i) { std::vector agg_columns; for (int j = 0; j < _shared_state->agg_input_columns[i].size(); ++j) { @@ -300,6 +298,7 @@ void AnalyticLocalState::_execute_for_win_func(int64_t partition_start, int64_t } void AnalyticLocalState::_insert_result_info(int64_t current_block_rows) { + SCOPED_TIMER(_get_result_timer); int64_t current_block_row_pos = _shared_state->input_block_first_row_positions[_output_block_index]; int64_t get_result_start = _shared_state->current_row_position - current_block_row_pos; @@ -344,6 +343,7 @@ void AnalyticLocalState::_insert_result_info(int64_t current_block_rows) { } Status AnalyticLocalState::_get_next_for_rows(size_t current_block_rows) { + SCOPED_TIMER(_get_next_timer); while (_shared_state->current_row_position < _shared_state->partition_by_end.pos && _window_end_position < current_block_rows) { int64_t range_start, range_end; @@ -367,31 +367,33 @@ Status AnalyticLocalState::_get_next_for_rows(size_t current_block_rows) { // Make sure range_start <= range_end range_start = std::min(range_start, range_end); } - _executor.execute(_partition_by_start.pos, _shared_state->partition_by_end.pos, range_start, - range_end); - _executor.insert_result(current_block_rows); 
+ _execute_for_win_func(_partition_by_start.pos, _shared_state->partition_by_end.pos, + range_start, range_end); + _insert_result_info(current_block_rows); } return Status::OK(); } Status AnalyticLocalState::_get_next_for_partition(size_t current_block_rows) { + SCOPED_TIMER(_get_next_timer); if (_next_partition) { - _executor.execute(_partition_by_start.pos, _shared_state->partition_by_end.pos, - _partition_by_start.pos, _shared_state->partition_by_end.pos); + _execute_for_win_func(_partition_by_start.pos, _shared_state->partition_by_end.pos, + _partition_by_start.pos, _shared_state->partition_by_end.pos); } - _executor.insert_result(current_block_rows); + _insert_result_info(current_block_rows); return Status::OK(); } Status AnalyticLocalState::_get_next_for_range(size_t current_block_rows) { + SCOPED_TIMER(_get_next_timer); while (_shared_state->current_row_position < _shared_state->partition_by_end.pos && _window_end_position < current_block_rows) { if (_shared_state->current_row_position >= _order_by_end.pos) { _update_order_by_range(); - _executor.execute(_partition_by_start.pos, _shared_state->partition_by_end.pos, - _order_by_start.pos, _order_by_end.pos); + _execute_for_win_func(_partition_by_start.pos, _shared_state->partition_by_end.pos, + _order_by_start.pos, _order_by_end.pos); } - _executor.insert_result(current_block_rows); + _insert_result_info(current_block_rows); } return Status::OK(); } @@ -443,7 +445,6 @@ bool AnalyticLocalState::init_next_partition(BlockRowPos found_partition_end) { Status AnalyticLocalState::output_current_block(vectorized::Block* block) { block->swap(std::move(_shared_state->input_blocks[_output_block_index])); _blocks_memory_usage->add(-block->allocated_bytes()); - mem_tracker()->consume(-block->allocated_bytes()); if (_shared_state->origin_cols.size() < block->columns()) { block->erase_not_in(_shared_state->origin_cols); } @@ -476,6 +477,7 @@ AnalyticSourceOperatorX::AnalyticSourceOperatorX(ObjectPool* pool, const TPlanNo _has_range_window(tnode.analytic_node.window.type == TAnalyticWindowType::RANGE), _has_window_start(tnode.analytic_node.window.__isset.window_start), _has_window_end(tnode.analytic_node.window.__isset.window_end) { + _is_serial_operator = tnode.__isset.is_serial_operator && tnode.is_serial_operator; _fn_scope = AnalyticFnScope::PARTITION; if (tnode.analytic_node.__isset.window && tnode.analytic_node.window.type == TAnalyticWindowType::RANGE) { @@ -500,11 +502,13 @@ Status AnalyticSourceOperatorX::init(const TPlanNode& tnode, RuntimeState* state RETURN_IF_ERROR(OperatorX::init(tnode, state)); const TAnalyticNode& analytic_node = tnode.analytic_node; size_t agg_size = analytic_node.analytic_functions.size(); - for (int i = 0; i < agg_size; ++i) { vectorized::AggFnEvaluator* evaluator = nullptr; + // Window function treats all NullableAggregateFunction as AlwaysNullable. + // Its behavior is same with executed without group by key. 
+ // https://github.com/apache/doris/pull/40693 RETURN_IF_ERROR(vectorized::AggFnEvaluator::create( - _pool, analytic_node.analytic_functions[i], {}, &evaluator)); + _pool, analytic_node.analytic_functions[i], {}, /*wihout_key*/ true, &evaluator)); _agg_functions.emplace_back(evaluator); } @@ -536,7 +540,7 @@ Status AnalyticSourceOperatorX::get_block(RuntimeState* state, vectorized::Block local_state.init_result_columns(); size_t current_block_rows = local_state._shared_state->input_blocks[local_state._output_block_index].rows(); - static_cast(local_state._executor.get_next(current_block_rows)); + RETURN_IF_ERROR(local_state._executor.get_next(current_block_rows)); if (local_state._window_end_position == current_block_rows) { break; } diff --git a/be/src/pipeline/exec/analytic_source_operator.h b/be/src/pipeline/exec/analytic_source_operator.h index d8dafa875dee6b..8f44b77f567e55 100644 --- a/be/src/pipeline/exec/analytic_source_operator.h +++ b/be/src/pipeline/exec/analytic_source_operator.h @@ -26,7 +26,7 @@ namespace doris { class RuntimeState; namespace pipeline { - +#include "common/compile_check_begin.h" enum AnalyticFnScope { PARTITION, RANGE, ROWS }; class AnalyticSourceOperatorX; @@ -96,17 +96,15 @@ class AnalyticLocalState final : public PipelineXLocalState std::vector _agg_functions; RuntimeProfile::Counter* _evaluation_timer = nullptr; + RuntimeProfile::Counter* _execute_timer = nullptr; + RuntimeProfile::Counter* _get_next_timer = nullptr; + RuntimeProfile::Counter* _get_result_timer = nullptr; RuntimeProfile::HighWaterMarkCounter* _blocks_memory_usage = nullptr; - using vectorized_execute = std::function; using vectorized_get_next = std::function; - using vectorized_get_result = std::function; struct executor { - vectorized_execute execute; vectorized_get_next get_next; - vectorized_get_result insert_result; }; executor _executor; @@ -156,3 +154,4 @@ class AnalyticSourceOperatorX final : public OperatorX { } // namespace pipeline } // namespace doris +#include "common/compile_check_end.h" \ No newline at end of file diff --git a/be/src/pipeline/exec/assert_num_rows_operator.cpp b/be/src/pipeline/exec/assert_num_rows_operator.cpp index 5aa27b51c45095..345e42b7d96837 100644 --- a/be/src/pipeline/exec/assert_num_rows_operator.cpp +++ b/be/src/pipeline/exec/assert_num_rows_operator.cpp @@ -21,12 +21,13 @@ #include "vec/utils/util.hpp" namespace doris::pipeline { - +#include "common/compile_check_begin.h" AssertNumRowsOperatorX::AssertNumRowsOperatorX(ObjectPool* pool, const TPlanNode& tnode, int operator_id, const DescriptorTbl& descs) : StreamingOperatorX(pool, tnode, operator_id, descs), _desired_num_rows(tnode.assert_num_rows_node.desired_num_rows), _subquery_string(tnode.assert_num_rows_node.subquery_string) { + _is_serial_operator = true; if (tnode.assert_num_rows_node.__isset.assertion) { _assertion = tnode.assert_num_rows_node.assertion; } else { @@ -114,8 +115,6 @@ Status AssertNumRowsOperatorX::pull(doris::RuntimeState* state, vectorized::Bloc return Status::Cancelled("Expected {} {} to be returned by expression {}", to_string_lambda(_assertion), _desired_num_rows, _subquery_string); } - COUNTER_SET(local_state.rows_returned_counter(), local_state.num_rows_returned()); - COUNTER_UPDATE(local_state.blocks_returned_counter(), 1); RETURN_IF_ERROR(vectorized::VExprContext::filter_block(local_state._conjuncts, block, block->columns())); return Status::OK(); diff --git a/be/src/pipeline/exec/cache_sink_operator.cpp b/be/src/pipeline/exec/cache_sink_operator.cpp index 
b8b5b5346591c8..b09921245bbfe0 100644 --- a/be/src/pipeline/exec/cache_sink_operator.cpp +++ b/be/src/pipeline/exec/cache_sink_operator.cpp @@ -27,7 +27,7 @@ #include "util/runtime_profile.h" namespace doris::pipeline { - +#include "common/compile_check_begin.h" Status CacheSinkLocalState::init(RuntimeState* state, LocalSinkStateInfo& info) { RETURN_IF_ERROR(Base::init(state, info)); SCOPED_TIMER(exec_time_counter()); diff --git a/be/src/pipeline/exec/cache_source_operator.cpp b/be/src/pipeline/exec/cache_source_operator.cpp index 5f8c5befc6a2b9..cace8465fc2d46 100644 --- a/be/src/pipeline/exec/cache_source_operator.cpp +++ b/be/src/pipeline/exec/cache_source_operator.cpp @@ -29,7 +29,7 @@ namespace doris { class RuntimeState; namespace pipeline { - +#include "common/compile_check_begin.h" Status CacheSourceLocalState::init(RuntimeState* state, LocalStateInfo& info) { RETURN_IF_ERROR(Base::init(state, info)); SCOPED_TIMER(exec_time_counter()); @@ -65,7 +65,7 @@ Status CacheSourceLocalState::init(RuntimeState* state, LocalStateInfo& info) { // 3. lookup the cache and find proper slot order hit_cache = QueryCache::instance()->lookup(_cache_key, _version, &_query_cache_handle); - _runtime_profile->add_info_string("HitCache", hit_cache ? "1" : "0"); + _runtime_profile->add_info_string("HitCache", std::to_string(hit_cache)); if (hit_cache && !cache_param.force_refresh_query_cache) { _hit_cache_results = _query_cache_handle.get_cache_result(); auto hit_cache_slot_orders = _query_cache_handle.get_cache_slot_orders(); @@ -125,13 +125,16 @@ Status CacheSourceOperatorX::get_block(RuntimeState* state, vectorized::Block* b if (local_state._hit_cache_results == nullptr) { Defer insert_cache([&] { - if (*eos && local_state._need_insert_cache) { - local_state._runtime_profile->add_info_string("InsertCache", "1"); - local_state._global_cache->insert(local_state._cache_key, local_state._version, - local_state._local_cache_blocks, - local_state._slot_orders, - local_state._current_query_cache_bytes); - local_state._local_cache_blocks.clear(); + if (*eos) { + local_state._runtime_profile->add_info_string( + "InsertCache", std::to_string(local_state._need_insert_cache)); + if (local_state._need_insert_cache) { + local_state._global_cache->insert(local_state._cache_key, local_state._version, + local_state._local_cache_blocks, + local_state._slot_orders, + local_state._current_query_cache_bytes); + local_state._local_cache_blocks.clear(); + } } }); @@ -156,14 +159,12 @@ Status CacheSourceOperatorX::get_block(RuntimeState* state, vectorized::Block* b local_state._current_query_cache_rows += output_block->rows(); auto mem_consume = output_block->allocated_bytes(); local_state._current_query_cache_bytes += mem_consume; - local_state._mem_tracker->consume(mem_consume); if (_cache_param.entry_max_bytes < local_state._current_query_cache_bytes || _cache_param.entry_max_rows < local_state._current_query_cache_rows) { // over the max bytes, pass through the data, no need to do cache local_state._local_cache_blocks.clear(); local_state._need_insert_cache = false; - local_state._runtime_profile->add_info_string("InsertCache", "0"); } else { local_state._local_cache_blocks.emplace_back(std::move(output_block)); } diff --git a/be/src/pipeline/exec/data_queue.cpp b/be/src/pipeline/exec/data_queue.cpp index 20fae125e3bd98..436a98e6b0369e 100644 --- a/be/src/pipeline/exec/data_queue.cpp +++ b/be/src/pipeline/exec/data_queue.cpp @@ -29,7 +29,7 @@ namespace doris { namespace pipeline { - +#include "common/compile_check_begin.h" 
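The cache-source change above moves the `InsertCache` profile string into the eos path and keeps the size checks that abandon caching once an entry grows past `entry_max_bytes`/`entry_max_rows`. A minimal sketch of that accumulate-then-insert-on-eos flow, under hypothetical names (`CacheEntryBuilder` is not a Doris type):

```cpp
#include <cstddef>
#include <string>
#include <utility>
#include <vector>

// Blocks are buffered while scanning; once the entry exceeds the configured
// limits the data just passes through and caching is abandoned; otherwise
// everything is inserted when the stream hits eos.
struct CacheEntryBuilder {
    size_t max_bytes;
    size_t max_rows;
    size_t cur_bytes = 0;
    size_t cur_rows = 0;
    bool need_insert = true;
    std::vector<std::string> blocks; // stand-in for the buffered result blocks

    void on_block(std::string block, size_t rows) {
        cur_bytes += block.size();
        cur_rows += rows;
        if (cur_bytes > max_bytes || cur_rows > max_rows) {
            blocks.clear();      // over the limit: stop buffering,
            need_insert = false; // pass data through from now on
        } else if (need_insert) {
            blocks.push_back(std::move(block));
        }
    }

    // Called once at eos; the return value is what the profile records
    // (cf. the "InsertCache" info string written in the Defer above).
    bool on_eos() {
        if (!need_insert) {
            return false;
        }
        // cache->insert(key, version, blocks, ...) would happen here
        blocks.clear();
        return true;
    }
};
```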
DataQueue::DataQueue(int child_count) : _queue_blocks_lock(child_count), _queue_blocks(child_count), diff --git a/be/src/pipeline/exec/datagen_operator.cpp b/be/src/pipeline/exec/datagen_operator.cpp index 93b3d058154e62..d400953799e5bb 100644 --- a/be/src/pipeline/exec/datagen_operator.cpp +++ b/be/src/pipeline/exec/datagen_operator.cpp @@ -30,13 +30,15 @@ class RuntimeState; } // namespace doris namespace doris::pipeline { - +#include "common/compile_check_begin.h" DataGenSourceOperatorX::DataGenSourceOperatorX(ObjectPool* pool, const TPlanNode& tnode, int operator_id, const DescriptorTbl& descs) : OperatorX(pool, tnode, operator_id, descs), _tuple_id(tnode.data_gen_scan_node.tuple_id), _tuple_desc(nullptr), - _runtime_filter_descs(tnode.runtime_filters) {} + _runtime_filter_descs(tnode.runtime_filters) { + _is_serial_operator = tnode.__isset.is_serial_operator && tnode.is_serial_operator; +} Status DataGenSourceOperatorX::init(const TPlanNode& tnode, RuntimeState* state) { RETURN_IF_ERROR(OperatorX::init(tnode, state)); @@ -68,17 +70,25 @@ Status DataGenSourceOperatorX::get_block(RuntimeState* state, vectorized::Block* RETURN_IF_CANCELLED(state); auto& local_state = get_local_state(state); SCOPED_TIMER(local_state.exec_time_counter()); - Status res = local_state._table_func->get_next(state, block, eos); - RETURN_IF_ERROR(vectorized::VExprContext::filter_block(local_state._conjuncts, block, - block->columns())); + { + SCOPED_TIMER(local_state._table_function_execution_timer); + RETURN_IF_ERROR(local_state._table_func->get_next(state, block, eos)); + } + { + SCOPED_TIMER(local_state._filter_timer); + RETURN_IF_ERROR(vectorized::VExprContext::filter_block(local_state._conjuncts, block, + block->columns())); + } local_state.reached_limit(block, eos); - return res; + return Status::OK(); } Status DataGenLocalState::init(RuntimeState* state, LocalStateInfo& info) { SCOPED_TIMER(exec_time_counter()); SCOPED_TIMER(_init_timer); RETURN_IF_ERROR(PipelineXLocalState<>::init(state, info)); + _table_function_execution_timer = ADD_TIMER(profile(), "TableFunctionExecutionTime"); + _filter_timer = ADD_TIMER(profile(), "FilterTime"); auto& p = _parent->cast(); _table_func = std::make_shared(p._tuple_id, p._tuple_desc); _table_func->set_tuple_desc(p._tuple_desc); @@ -87,8 +97,8 @@ Status DataGenLocalState::init(RuntimeState* state, LocalStateInfo& info) { // TODO: use runtime filter to filte result block, maybe this node need derive from vscan_node. 
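The `SCOPED_TIMER` blocks added above split the operator's time between table-function execution and conjunct filtering. A sketch of the underlying RAII idiom, assuming a plain atomic counter rather than the real `RuntimeProfile` machinery:

```cpp
#include <atomic>
#include <chrono>
#include <cstdint>
#include <iostream>

// Each scope adds its elapsed nanoseconds to one counter, so separate
// counters can attribute time to distinct phases of get_block().
class ScopedTimer {
public:
    explicit ScopedTimer(std::atomic<int64_t>& counter)
            : _counter(counter), _start(std::chrono::steady_clock::now()) {}
    ~ScopedTimer() {
        auto elapsed = std::chrono::steady_clock::now() - _start;
        _counter.fetch_add(
                std::chrono::duration_cast<std::chrono::nanoseconds>(elapsed).count());
    }

private:
    std::atomic<int64_t>& _counter;
    std::chrono::steady_clock::time_point _start;
};

int main() {
    std::atomic<int64_t> table_function_time_ns {0};
    {
        ScopedTimer t(table_function_time_ns); // times this block only
        volatile long x = 0;
        for (int i = 0; i < 1000000; ++i) x += i;
    }
    std::cout << "TableFunctionExecutionTime: " << table_function_time_ns.load() << "ns\n";
}
```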
for (const auto& filter_desc : p._runtime_filter_descs) { std::shared_ptr runtime_filter; - RETURN_IF_ERROR(state->register_consumer_runtime_filter( - filter_desc, p.ignore_data_distribution(), p.node_id(), &runtime_filter)); + RETURN_IF_ERROR(state->register_consumer_runtime_filter(filter_desc, p.is_serial_operator(), + p.node_id(), &runtime_filter)); runtime_filter->init_profile(_runtime_profile.get()); } return Status::OK(); diff --git a/be/src/pipeline/exec/datagen_operator.h b/be/src/pipeline/exec/datagen_operator.h index c63ef97bb7a40f..bada5ec4080d08 100644 --- a/be/src/pipeline/exec/datagen_operator.h +++ b/be/src/pipeline/exec/datagen_operator.h @@ -44,6 +44,8 @@ class DataGenLocalState final : public PipelineXLocalState<> { private: friend class DataGenSourceOperatorX; std::shared_ptr _table_func; + RuntimeProfile::Counter* _table_function_execution_timer = nullptr; + RuntimeProfile::Counter* _filter_timer = nullptr; }; class DataGenSourceOperatorX final : public OperatorX { diff --git a/be/src/pipeline/exec/distinct_streaming_aggregation_operator.cpp b/be/src/pipeline/exec/distinct_streaming_aggregation_operator.cpp index 70b73225f060e8..bb282fd118e5c0 100644 --- a/be/src/pipeline/exec/distinct_streaming_aggregation_operator.cpp +++ b/be/src/pipeline/exec/distinct_streaming_aggregation_operator.cpp @@ -31,7 +31,7 @@ class RuntimeState; } // namespace doris namespace doris::pipeline { - +#include "common/compile_check_begin.h" struct StreamingHtMinReductionEntry { // Use 'streaming_ht_min_reduction' if the total size of hash table bucket directories in // bytes is greater than this threshold. @@ -59,10 +59,9 @@ static constexpr int STREAMING_HT_MIN_REDUCTION_SIZE = DistinctStreamingAggLocalState::DistinctStreamingAggLocalState(RuntimeState* state, OperatorXBase* parent) : PipelineXLocalState(state, parent), - dummy_mapped_data(std::make_shared('A')), batch_size(state->batch_size()), _agg_arena_pool(std::make_unique()), - _agg_data(std::make_unique()), + _agg_data(std::make_unique()), _agg_profile_arena(std::make_unique()), _child_block(vectorized::Block::create_unique()), _aggregated_block(vectorized::Block::create_unique()) {} @@ -72,7 +71,6 @@ Status DistinctStreamingAggLocalState::init(RuntimeState* state, LocalStateInfo& SCOPED_TIMER(Base::exec_time_counter()); SCOPED_TIMER(Base::_init_timer); _build_timer = ADD_TIMER(Base::profile(), "BuildTime"); - _exec_timer = ADD_TIMER(Base::profile(), "ExecTime"); _hash_table_compute_timer = ADD_TIMER(Base::profile(), "HashTableComputeTime"); _hash_table_emplace_timer = ADD_TIMER(Base::profile(), "HashTableEmplaceTime"); _hash_table_input_counter = ADD_COUNTER(Base::profile(), "HashTableInputCount", TUnit::UNIT); @@ -95,12 +93,8 @@ Status DistinctStreamingAggLocalState::open(RuntimeState* state) { RETURN_IF_ERROR(p._probe_expr_ctxs[i]->clone(state, _probe_expr_ctxs[i])); } - if (_probe_expr_ctxs.empty()) { - _agg_data->without_key = reinterpret_cast( - _agg_profile_arena->alloc(p._total_size_of_aggregate_states)); - } else { - RETURN_IF_ERROR(_init_hash_method(_probe_expr_ctxs)); - } + DCHECK_EQ(p._total_size_of_aggregate_states, 0); + RETURN_IF_ERROR(_init_hash_method(_probe_expr_ctxs)); return Status::OK(); } @@ -139,8 +133,8 @@ bool DistinctStreamingAggLocalState::_should_expand_preagg_hash_tables() { const int64_t aggregated_input_rows = input_rows - _num_rows_returned; // TODO chenhao // const int64_t expected_input_rows = estimated_input_cardinality_ - num_rows_returned_; - double current_reduction = - 
static_cast(aggregated_input_rows) / ht_rows; + double current_reduction = static_cast(aggregated_input_rows) / + static_cast(ht_rows); // TODO: workaround for IMPALA-2490: subplan node rows_returned counter may be // inaccurate, which could lead to a divide by zero below. @@ -171,8 +165,8 @@ bool DistinctStreamingAggLocalState::_should_expand_preagg_hash_tables() { Status DistinctStreamingAggLocalState::_init_hash_method( const vectorized::VExprContextSPtrs& probe_exprs) { - RETURN_IF_ERROR(init_agg_hash_method( - _agg_data.get(), probe_exprs, + RETURN_IF_ERROR(init_hash_method( + _agg_data.get(), get_data_types(probe_exprs), Base::_parent->template cast()._is_first_phase)); return Status::OK(); } @@ -198,7 +192,7 @@ Status DistinctStreamingAggLocalState::_distinct_pre_agg_with_serialized_key( } } - int rows = in_block->rows(); + size_t rows = in_block->rows(); _distinct_row.clear(); _distinct_row.reserve(rows); @@ -303,13 +297,10 @@ void DistinctStreamingAggLocalState::_emplace_into_hash_table_to_distinct( size_t row = 0; auto creator = [&](const auto& ctor, auto& key, auto& origin) { HashMethodType::try_presis_key(key, origin, _arena); - ctor(key, dummy_mapped_data.get()); - distinct_row.push_back(row); - }; - auto creator_for_null_key = [&](auto& mapped) { - mapped = dummy_mapped_data.get(); + ctor(key); distinct_row.push_back(row); }; + auto creator_for_null_key = [&]() { distinct_row.push_back(row); }; SCOPED_TIMER(_hash_table_emplace_timer); for (; row < num_rows; ++row) { @@ -334,7 +325,9 @@ DistinctStreamingAggOperatorX::DistinctStreamingAggOperatorX(ObjectPool* pool, i ? tnode.distribute_expr_lists[0] : tnode.agg_node.grouping_exprs), _is_colocate(tnode.agg_node.__isset.is_colocate && tnode.agg_node.is_colocate), - _require_bucket_distribution(require_bucket_distribution) { + _require_bucket_distribution(require_bucket_distribution), + _without_key(tnode.agg_node.grouping_exprs.empty()) { + _is_serial_operator = tnode.__isset.is_serial_operator && tnode.is_serial_operator; if (tnode.agg_node.__isset.use_streaming_preaggregation) { _is_streaming_preagg = tnode.agg_node.use_streaming_preaggregation; if (_is_streaming_preagg) { @@ -361,7 +354,7 @@ Status DistinctStreamingAggOperatorX::init(const TPlanNode& tnode, RuntimeState* RETURN_IF_ERROR(vectorized::AggFnEvaluator::create( _pool, tnode.agg_node.aggregate_functions[i], tnode.agg_node.__isset.agg_sort_infos ? tnode.agg_node.agg_sort_infos[i] : dummy, - &evaluator)); + tnode.agg_node.grouping_exprs.empty(), &evaluator)); _aggregate_evaluators.push_back(evaluator); } @@ -376,8 +369,8 @@ Status DistinctStreamingAggOperatorX::open(RuntimeState* state) { DCHECK_EQ(_intermediate_tuple_desc->slots().size(), _output_tuple_desc->slots().size()); RETURN_IF_ERROR(vectorized::VExpr::prepare(_probe_expr_ctxs, state, _child->row_desc())); - int j = _probe_expr_ctxs.size(); - for (int i = 0; i < j; ++i) { + size_t j = _probe_expr_ctxs.size(); + for (size_t i = 0; i < j; ++i) { auto nullable_output = _output_tuple_desc->slots()[i]->is_nullable(); auto nullable_input = _probe_expr_ctxs[i]->root()->is_nullable(); if (nullable_output != nullable_input) { @@ -460,7 +453,6 @@ Status DistinctStreamingAggOperatorX::pull(RuntimeState* state, vectorized::Bloc block->columns())); } local_state.add_num_rows_returned(block->rows()); - COUNTER_UPDATE(local_state.blocks_returned_counter(), 1); // If the limit is not reached, it is important to ensure that _aggregated_block is empty // because it may still contain data. 
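The `creator`/`creator_for_null_key` callbacks above fire only when a key is inserted into the hash table for the first time, which is what builds the `_distinct_row` selector. The same idiom with `std::unordered_set` standing in for the hash-method table (a sketch, not the actual code path):

```cpp
#include <cstdint>
#include <string>
#include <unordered_set>
#include <vector>

// A row index is kept only when its key is seen for the first time, which is
// exactly the moment the ctor/creator callback would run in the operator.
std::vector<uint32_t> select_distinct_rows(const std::vector<std::string>& keys) {
    std::unordered_set<std::string> seen;
    std::vector<uint32_t> distinct_row;
    distinct_row.reserve(keys.size());
    for (uint32_t row = 0; row < keys.size(); ++row) {
        if (seen.insert(keys[row]).second) { // true only on first insertion,
            distinct_row.push_back(row);     // i.e. when the creator would fire
        }
    }
    return distinct_row; // selector later used to copy only distinct rows out
}
```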
// However, if the limit is reached, there is no need to output data even if some exists. diff --git a/be/src/pipeline/exec/distinct_streaming_aggregation_operator.h b/be/src/pipeline/exec/distinct_streaming_aggregation_operator.h index e5476e89421a0f..4c5fcd5efa74b9 100644 --- a/be/src/pipeline/exec/distinct_streaming_aggregation_operator.h +++ b/be/src/pipeline/exec/distinct_streaming_aggregation_operator.h @@ -23,6 +23,7 @@ #include #include "common/status.h" +#include "pipeline/common/distinct_agg_utils.h" #include "pipeline/exec/operator.h" #include "util/runtime_profile.h" #include "vec/core/block.h" @@ -32,7 +33,7 @@ class ExecNode; class RuntimeState; namespace pipeline { - +#include "common/compile_check_begin.h" class DistinctStreamingAggOperatorX; class DistinctStreamingAggLocalState final : public PipelineXLocalState { @@ -65,7 +66,6 @@ class DistinctStreamingAggLocalState final : public PipelineXLocalStateclone_empty(); } - std::shared_ptr dummy_mapped_data; vectorized::IColumn::Selector _distinct_row; vectorized::Arena _arena; size_t _input_num_rows = 0; @@ -73,7 +73,7 @@ class DistinctStreamingAggLocalState final : public PipelineXLocalState _agg_arena_pool = nullptr; - AggregatedDataVariantsUPtr _agg_data = nullptr; + std::unique_ptr _agg_data = nullptr; std::vector _aggregate_evaluators; // group by k1,k2 vectorized::VExprContextSPtrs _probe_expr_ctxs; @@ -104,8 +104,11 @@ class DistinctStreamingAggOperatorX final bool need_more_input_data(RuntimeState* state) const override; DataDistribution required_data_distribution() const override { + if (_needs_finalize && _probe_expr_ctxs.empty()) { + return {ExchangeType::NOOP}; + } if (_needs_finalize || (!_probe_expr_ctxs.empty() && !_is_streaming_preagg)) { - return _is_colocate && _require_bucket_distribution && !_followed_by_shuffled_join + return _is_colocate && _require_bucket_distribution && !_followed_by_shuffled_operator ? DataDistribution(ExchangeType::BUCKET_HASH_SHUFFLE, _partition_exprs) : DataDistribution(ExchangeType::HASH_SHUFFLE, _partition_exprs); } @@ -113,9 +116,6 @@ class DistinctStreamingAggOperatorX final } bool require_data_distribution() const override { return _is_colocate; } - bool require_shuffled_data_distribution() const override { - return _needs_finalize || (!_probe_expr_ctxs.empty() && !_is_streaming_preagg); - } private: friend class DistinctStreamingAggLocalState; @@ -136,7 +136,9 @@ class DistinctStreamingAggOperatorX final /// The total size of the row from the aggregate functions. 
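The `required_data_distribution()` change above adds a NOOP (serial) short-circuit for a finalizing aggregation without group-by keys. A toy mirror of the branch order, with the colocation conditions folded into a single hypothetical flag (the real method also consults `_require_bucket_distribution` and `_followed_by_shuffled_operator`):

```cpp
enum class ExchangeKind { NOOP, PASSTHROUGH, HASH_SHUFFLE, BUCKET_HASH_SHUFFLE };

ExchangeKind required_distribution(bool needs_finalize, bool has_group_by_keys,
                                   bool is_streaming_preagg, bool colocate_buckets) {
    if (needs_finalize && !has_group_by_keys) {
        return ExchangeKind::NOOP; // serial: a global agg without keys runs in one instance
    }
    if (needs_finalize || (has_group_by_keys && !is_streaming_preagg)) {
        return colocate_buckets ? ExchangeKind::BUCKET_HASH_SHUFFLE
                                : ExchangeKind::HASH_SHUFFLE;
    }
    return ExchangeKind::PASSTHROUGH; // streaming pre-agg: keep data where it is
}
```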
size_t _total_size_of_aggregate_states = 0; bool _is_streaming_preagg = false; + const bool _without_key; }; } // namespace pipeline } // namespace doris +#include "common/compile_check_end.h" \ No newline at end of file diff --git a/be/src/pipeline/exec/es_scan_operator.cpp b/be/src/pipeline/exec/es_scan_operator.cpp index c7e953a7fa3201..2cb3cd5e0b29ce 100644 --- a/be/src/pipeline/exec/es_scan_operator.cpp +++ b/be/src/pipeline/exec/es_scan_operator.cpp @@ -22,7 +22,7 @@ #include "vec/exec/scan/new_es_scanner.h" namespace doris::pipeline { - +#include "common/compile_check_begin.h" // Prefer to the local host static std::string get_host_and_port(const std::vector& es_hosts) { std::string host_port; @@ -44,12 +44,10 @@ static std::string get_host_and_port(const std::vector& Status EsScanLocalState::_init_profile() { RETURN_IF_ERROR(Base::_init_profile()); - _es_profile.reset(new RuntimeProfile("EsIterator")); - Base::_scanner_profile->add_child(_es_profile.get(), true, nullptr); - _rows_read_counter = ADD_COUNTER(_es_profile, "RowsRead", TUnit::UNIT); - _read_timer = ADD_TIMER(_es_profile, "TotalRawReadTime(*)"); - _materialize_timer = ADD_TIMER(_es_profile, "MaterializeTupleTime(*)"); + _blocks_read_counter = ADD_COUNTER(_runtime_profile, "BlocksRead", TUnit::UNIT); + _read_timer = ADD_TIMER(_runtime_profile, "TotalRawReadTime(*)"); + _materialize_timer = ADD_TIMER(_runtime_profile, "MaterializeTupleTime(*)"); return Status::OK(); } diff --git a/be/src/pipeline/exec/es_scan_operator.h b/be/src/pipeline/exec/es_scan_operator.h index 4e80150d0ba8c6..2ae562e4fc7f32 100644 --- a/be/src/pipeline/exec/es_scan_operator.h +++ b/be/src/pipeline/exec/es_scan_operator.h @@ -52,13 +52,12 @@ class EsScanLocalState final : public ScanLocalState { Status _init_scanners(std::list* scanners) override; std::vector> _scan_ranges; - std::unique_ptr _es_profile; // FIXME: non-static data member '_rows_read_counter' of 'EsScanLocalState' shadows member inherited from type 'ScanLocalStateBase' #ifdef __clang__ #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wshadow-field" #endif - RuntimeProfile::Counter* _rows_read_counter = nullptr; + RuntimeProfile::Counter* _blocks_read_counter = nullptr; #ifdef __clang__ #pragma clang diagnostic pop #endif diff --git a/be/src/pipeline/exec/exchange_sink_buffer.cpp b/be/src/pipeline/exec/exchange_sink_buffer.cpp index e0a3725ad65e6d..7163299d766f4e 100644 --- a/be/src/pipeline/exec/exchange_sink_buffer.cpp +++ b/be/src/pipeline/exec/exchange_sink_buffer.cpp @@ -86,14 +86,13 @@ void BroadcastPBlockHolderMemLimiter::release(const BroadcastPBlockHolder& holde } // namespace vectorized namespace pipeline { - ExchangeSinkBuffer::ExchangeSinkBuffer(PUniqueId query_id, PlanNodeId dest_node_id, int send_id, int be_number, RuntimeState* state, ExchangeSinkLocalState* parent) : HasTaskExecutionCtx(state), _queue_capacity(0), _is_finishing(false), - _query_id(query_id), + _query_id(std::move(query_id)), _dest_node_id(dest_node_id), _sender_id(send_id), _be_number(be_number), @@ -110,12 +109,6 @@ void ExchangeSinkBuffer::close() { //_instance_to_request.clear(); } -void ExchangeSinkBuffer::_set_ready_to_finish(bool all_done) { - if (_finish_dependency && _should_stop && all_done) { - _finish_dependency->set_ready(); - } -} - void ExchangeSinkBuffer::register_sink(TUniqueId fragment_instance_id) { if (_is_finishing) { return; @@ -135,7 +128,6 @@ void ExchangeSinkBuffer::register_sink(TUniqueId fragment_instance_id) { finst_id.set_hi(fragment_instance_id.hi); 
finst_id.set_lo(fragment_instance_id.lo); _rpc_channel_is_idle[low_id] = true; - _instance_to_rpc_ctx[low_id] = {}; _instance_to_receiver_eof[low_id] = false; _instance_to_rpc_time[low_id] = 0; _construct_request(low_id, finst_id); @@ -160,11 +152,13 @@ Status ExchangeSinkBuffer::add_block(TransmitInfo&& request) { if (_rpc_channel_is_idle[ins_id]) { send_now = true; _rpc_channel_is_idle[ins_id] = false; - _busy_channels++; } if (request.block) { RETURN_IF_ERROR( BeExecVersionManager::check_be_exec_version(request.block->be_exec_version())); + COUNTER_UPDATE(_parent->memory_used_counter(), request.block->ByteSizeLong()); + COUNTER_SET(_parent->peak_memory_usage_counter(), + _parent->memory_used_counter()->value()); } _instance_to_package_queue[ins_id].emplace(std::move(request)); _total_queue_size++; @@ -198,7 +192,6 @@ Status ExchangeSinkBuffer::add_block(BroadcastTransmitInfo&& request) { if (_rpc_channel_is_idle[ins_id]) { send_now = true; _rpc_channel_is_idle[ins_id] = false; - _busy_channels++; } if (request.block_holder->get_block()) { RETURN_IF_ERROR(BeExecVersionManager::check_be_exec_version( @@ -223,7 +216,7 @@ Status ExchangeSinkBuffer::_send_rpc(InstanceLoId id) { _instance_to_broadcast_package_queue[id]; if (_is_finishing) { - _turn_off_channel(id); + _turn_off_channel(id, lock); return Status::OK(); } @@ -241,11 +234,8 @@ Status ExchangeSinkBuffer::_send_rpc(InstanceLoId id) { } auto send_callback = request.channel->get_send_callback(id, request.eos); - _instance_to_rpc_ctx[id]._send_callback = send_callback; - _instance_to_rpc_ctx[id].is_cancelled = false; - send_callback->cntl_->set_timeout_ms(request.channel->_brpc_timeout_ms); - if (config::exchange_sink_ignore_eovercrowded) { + if (config::execution_ignore_eovercrowded) { send_callback->cntl_->ignore_eovercrowded(); } send_callback->addFailedHandler([&, weak_task_ctx = weak_task_exec_ctx()]( @@ -303,6 +293,7 @@ Status ExchangeSinkBuffer::_send_rpc(InstanceLoId id) { } } if (request.block) { + COUNTER_UPDATE(_parent->memory_used_counter(), -request.block->ByteSizeLong()); static_cast(brpc_request->release_block()); } q.pop(); @@ -321,14 +312,8 @@ Status ExchangeSinkBuffer::_send_rpc(InstanceLoId id) { brpc_request->set_allocated_block(request.block_holder->get_block()); } auto send_callback = request.channel->get_send_callback(id, request.eos); - - ExchangeRpcContext rpc_ctx; - rpc_ctx._send_callback = send_callback; - rpc_ctx.is_cancelled = false; - _instance_to_rpc_ctx[id] = rpc_ctx; - send_callback->cntl_->set_timeout_ms(request.channel->_brpc_timeout_ms); - if (config::exchange_sink_ignore_eovercrowded) { + if (config::execution_ignore_eovercrowded) { send_callback->cntl_->ignore_eovercrowded(); } send_callback->addFailedHandler([&, weak_task_ctx = weak_task_exec_ctx()]( @@ -390,7 +375,7 @@ Status ExchangeSinkBuffer::_send_rpc(InstanceLoId id) { } broadcast_q.pop(); } else { - _turn_off_channel(id); + _rpc_channel_is_idle[id] = true; } return Status::OK(); @@ -420,23 +405,43 @@ void ExchangeSinkBuffer::_ended(InstanceLoId id) { __builtin_unreachable(); } else { std::unique_lock lock(*_instance_to_package_queue_mutex[id]); - _turn_off_channel(id); + _turn_off_channel(id, lock); } } void ExchangeSinkBuffer::_failed(InstanceLoId id, const std::string& err) { _is_finishing = true; _context->cancel(Status::Cancelled(err)); - std::unique_lock lock(*_instance_to_package_queue_mutex[id]); - _turn_off_channel(id, true); } void ExchangeSinkBuffer::_set_receiver_eof(InstanceLoId id) { std::unique_lock 
lock(*_instance_to_package_queue_mutex[id]); _instance_to_receiver_eof[id] = true; - _turn_off_channel(id, true); - std::queue> empty; - swap(empty, _instance_to_broadcast_package_queue[id]); + _turn_off_channel(id, lock); + std::queue>& broadcast_q = + _instance_to_broadcast_package_queue[id]; + for (; !broadcast_q.empty(); broadcast_q.pop()) { + if (broadcast_q.front().block_holder->get_block()) { + COUNTER_UPDATE(_parent->memory_used_counter(), + -broadcast_q.front().block_holder->get_block()->ByteSizeLong()); + } + } + { + std::queue> empty; + swap(empty, broadcast_q); + } + + std::queue>& q = _instance_to_package_queue[id]; + for (; !q.empty(); q.pop()) { + if (q.front().block) { + COUNTER_UPDATE(_parent->memory_used_counter(), -q.front().block->ByteSizeLong()); + } + } + + { + std::queue> empty; + swap(empty, q); + } } bool ExchangeSinkBuffer::_is_receiver_eof(InstanceLoId id) { @@ -444,17 +449,17 @@ bool ExchangeSinkBuffer::_is_receiver_eof(InstanceLoId id) { return _instance_to_receiver_eof[id]; } -void ExchangeSinkBuffer::_turn_off_channel(InstanceLoId id, bool cleanup) { +// The unused parameter `with_lock` is to ensure that the function is called when the lock is held. +void ExchangeSinkBuffer::_turn_off_channel(InstanceLoId id, + std::unique_lock& /*with_lock*/) { if (!_rpc_channel_is_idle[id]) { _rpc_channel_is_idle[id] = true; - auto all_done = _busy_channels.fetch_sub(1) == 1; - _set_ready_to_finish(all_done); - if (cleanup && all_done) { - auto weak_task_ctx = weak_task_exec_ctx(); - if (auto pip_ctx = weak_task_ctx.lock()) { - _parent->set_reach_limit(); - } - } + } + _instance_to_receiver_eof[id] = true; + + auto weak_task_ctx = weak_task_exec_ctx(); + if (auto pip_ctx = weak_task_ctx.lock()) { + _parent->on_channel_finished(id); } } diff --git a/be/src/pipeline/exec/exchange_sink_buffer.h b/be/src/pipeline/exec/exchange_sink_buffer.h index 2d30a492a0d8f9..13692532a335a4 100644 --- a/be/src/pipeline/exec/exchange_sink_buffer.h +++ b/be/src/pipeline/exec/exchange_sink_buffer.h @@ -22,9 +22,9 @@ #include #include #include -#include #include +#include #include #include #include @@ -51,7 +51,7 @@ class ExchangeSinkLocalState; } // namespace pipeline namespace vectorized { -class PipChannel; +class Channel; // We use BroadcastPBlockHolder to hold a broadcasted PBlock. 
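The otherwise-unused `std::unique_lock&` parameter introduced for `_turn_off_channel` is a compile-time witness that the caller holds the queue mutex. The idiom in isolation (a sketch):

```cpp
#include <mutex>

// The private function takes a reference to a held std::unique_lock purely to
// prove the caller owns the mutex; the parameter is never read.
class Buffer {
public:
    void turn_off(int id) {
        std::unique_lock<std::mutex> lock(_mutex);
        _turn_off(id, lock); // impossible to call without naming a lock
    }

private:
    void _turn_off(int /*id*/, std::unique_lock<std::mutex>& /*with_lock*/) {
        // ... mutate state protected by _mutex ...
    }

    std::mutex _mutex;
};
```

Since the reference must name a live `unique_lock`, there is no way to reach the private function without taking the mutex first, and the extra parameter has no runtime cost.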
For broadcast shuffle, one PBlock // will be shared between different channel, so we have to use a ref count to mark if this @@ -102,14 +102,14 @@ class BroadcastPBlockHolderMemLimiter namespace pipeline { struct TransmitInfo { - vectorized::PipChannel* channel = nullptr; + vectorized::Channel* channel = nullptr; std::unique_ptr block; bool eos; Status exec_status; }; struct BroadcastTransmitInfo { - vectorized::PipChannel* channel = nullptr; + vectorized::Channel* channel = nullptr; std::shared_ptr block_holder = nullptr; bool eos; }; @@ -169,11 +169,6 @@ class ExchangeSendCallback : public ::doris::DummyBrpcCallback { bool _eos; }; -struct ExchangeRpcContext { - std::shared_ptr> _send_callback; - bool is_cancelled = false; -}; - // Each ExchangeSinkOperator have one ExchangeSinkBuffer class ExchangeSinkBuffer final : public HasTaskExecutionCtx { public: @@ -198,14 +193,8 @@ class ExchangeSinkBuffer final : public HasTaskExecutionCtx { _broadcast_dependency = broadcast_dependency; } - void set_should_stop() { - _should_stop = true; - _set_ready_to_finish(_busy_channels == 0); - } - private: friend class ExchangeSinkLocalState; - void _set_ready_to_finish(bool all_done); phmap::flat_hash_map> _instance_to_package_queue_mutex; @@ -224,11 +213,9 @@ class ExchangeSinkBuffer final : public HasTaskExecutionCtx { phmap::flat_hash_map> _instance_to_request; // One channel is corresponding to a downstream instance. phmap::flat_hash_map _rpc_channel_is_idle; - // Number of busy channels; - std::atomic _busy_channels = 0; + phmap::flat_hash_map _instance_to_receiver_eof; phmap::flat_hash_map _instance_to_rpc_time; - phmap::flat_hash_map _instance_to_rpc_ctx; std::atomic _is_finishing; PUniqueId _query_id; @@ -247,7 +234,7 @@ class ExchangeSinkBuffer final : public HasTaskExecutionCtx { inline void _failed(InstanceLoId id, const std::string& err); inline void _set_receiver_eof(InstanceLoId id); inline bool _is_receiver_eof(InstanceLoId id); - inline void _turn_off_channel(InstanceLoId id, bool cleanup = false); + inline void _turn_off_channel(InstanceLoId id, std::unique_lock& with_lock); void get_max_min_rpc_time(int64_t* max_time, int64_t* min_time); int64_t get_sum_rpc_time(); @@ -255,7 +242,6 @@ class ExchangeSinkBuffer final : public HasTaskExecutionCtx { std::shared_ptr _queue_dependency = nullptr; std::shared_ptr _finish_dependency = nullptr; std::shared_ptr _broadcast_dependency = nullptr; - std::atomic _should_stop = false; ExchangeSinkLocalState* _parent = nullptr; }; diff --git a/be/src/pipeline/exec/exchange_sink_operator.cpp b/be/src/pipeline/exec/exchange_sink_operator.cpp index 518620ba6b4d3e..1f91af01aa1f6b 100644 --- a/be/src/pipeline/exec/exchange_sink_operator.cpp +++ b/be/src/pipeline/exec/exchange_sink_operator.cpp @@ -23,6 +23,7 @@ #include #include +#include #include #include "common/status.h" @@ -31,16 +32,13 @@ #include "pipeline/exec/operator.h" #include "pipeline/exec/sort_source_operator.h" #include "pipeline/local_exchange/local_exchange_sink_operator.h" +#include "util/runtime_profile.h" +#include "util/uid_util.h" #include "vec/columns/column_const.h" #include "vec/exprs/vexpr.h" namespace doris::pipeline { - -Status ExchangeSinkLocalState::serialize_block(vectorized::Block* src, PBlock* dest, - int num_receivers) { - return _parent->cast().serialize_block(*this, src, dest, num_receivers); -} - +#include "common/compile_check_begin.h" bool ExchangeSinkLocalState::transfer_large_data_by_brpc() const { return _parent->cast()._transfer_large_data_by_brpc; } @@ -58,18 
+56,16 @@ Status ExchangeSinkLocalState::init(RuntimeState* state, LocalSinkStateInfo& inf _local_sent_rows = ADD_COUNTER(_profile, "LocalSentRows", TUnit::UNIT); _serialize_batch_timer = ADD_TIMER(_profile, "SerializeBatchTime"); _compress_timer = ADD_TIMER(_profile, "CompressTime"); - _brpc_send_timer = ADD_TIMER(_profile, "BrpcSendTime"); - _brpc_wait_timer = ADD_TIMER(_profile, "BrpcSendTime.Wait"); _local_send_timer = ADD_TIMER(_profile, "LocalSendTime"); _split_block_hash_compute_timer = ADD_TIMER(_profile, "SplitBlockHashComputeTime"); - _split_block_distribute_by_channel_timer = - ADD_TIMER(_profile, "SplitBlockDistributeByChannelTime"); + _distribute_rows_into_channels_timer = ADD_TIMER(_profile, "DistributeRowsIntoChannelsTime"); _blocks_sent_counter = ADD_COUNTER_WITH_LEVEL(_profile, "BlocksProduced", TUnit::UNIT, 1); - _rows_sent_counter = ADD_COUNTER_WITH_LEVEL(_profile, "RowsProduced", TUnit::UNIT, 1); _overall_throughput = _profile->add_derived_counter( "OverallThroughput", TUnit::BYTES_PER_SECOND, - std::bind(&RuntimeProfile::units_per_second, _bytes_sent_counter, - _profile->total_time_counter()), + [this]() { + return RuntimeProfile::units_per_second(_bytes_sent_counter, + _profile->total_time_counter()); + }, ""); _merge_block_timer = ADD_TIMER(profile(), "MergeBlockTime"); _local_bytes_send_counter = ADD_COUNTER(_profile, "LocalBytesSent", TUnit::BYTES); @@ -84,18 +80,17 @@ Status ExchangeSinkLocalState::init(RuntimeState* state, LocalSinkStateInfo& inf const auto& fragment_instance_id = p._dests[i].fragment_instance_id; if (fragment_id_to_channel_index.find(fragment_instance_id.lo) == fragment_id_to_channel_index.end()) { - channel_shared_ptrs.emplace_back( - new vectorized::PipChannel(this, p._row_desc, p._dests[i].brpc_server, - fragment_instance_id, p._dest_node_id)); - fragment_id_to_channel_index.emplace(fragment_instance_id.lo, - channel_shared_ptrs.size() - 1); - channels.push_back(channel_shared_ptrs.back().get()); + channels.push_back(std::make_shared( + this, p._dests[i].brpc_server, fragment_instance_id, p._dest_node_id)); + fragment_id_to_channel_index.emplace(fragment_instance_id.lo, channels.size() - 1); + + if (fragment_instance_id.hi != -1 && fragment_instance_id.lo != -1) { + _working_channels_count++; + } } else { - channel_shared_ptrs.emplace_back( - channel_shared_ptrs[fragment_id_to_channel_index[fragment_instance_id.lo]]); + channels.emplace_back(channels[fragment_id_to_channel_index[fragment_instance_id.lo]]); } } - SCOPED_CONSUME_MEM_TRACKER(_mem_tracker.get()); // Make sure brpc stub is ready before execution. 
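Replacing `std::bind` with a lambda for the derived `OverallThroughput` counter keeps the same lazy evaluation: the callback is invoked each time the profile is rendered, so the value always reflects the current counters. A sketch with illustrative profile types (not the `RuntimeProfile` API):

```cpp
#include <cstdint>
#include <functional>
#include <iostream>

// The lambda captures `this` and is evaluated on demand, computing the
// derived value from whatever the underlying counters hold at render time.
struct DerivedCounter {
    std::function<int64_t()> eval;
};

struct Profile {
    int64_t bytes_sent = 0;
    int64_t total_time_ns = 1;

    DerivedCounter make_throughput_counter() {
        return DerivedCounter {[this]() {
            return bytes_sent * 1'000'000'000 / total_time_ns; // bytes per second
        }};
    }
};

int main() {
    Profile p;
    auto throughput = p.make_throughput_counter();
    p.bytes_sent = 4096;
    p.total_time_ns = 2'000'000'000; // 2 seconds
    std::cout << throughput.eval() << " bytes/sec\n"; // prints 2048
}
```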
for (int i = 0; i < channels.size(); ++i) { @@ -107,28 +102,48 @@ Status ExchangeSinkLocalState::init(RuntimeState* state, LocalSinkStateInfo& inf return Status::OK(); } +void ExchangeSinkLocalState::on_channel_finished(InstanceLoId channel_id) { + std::lock_guard lock(_finished_channels_mutex); + + if (_finished_channels.contains(channel_id)) { + LOG(WARNING) << "query: " << print_id(_state->query_id()) + << ", on_channel_finished on already finished channel: " << channel_id; + return; + } else { + _finished_channels.emplace(channel_id); + if (_working_channels_count.fetch_sub(1) == 1) { + set_reach_limit(); + if (_finish_dependency) { + _finish_dependency->set_ready(); + } + } + } +} + Status ExchangeSinkLocalState::open(RuntimeState* state) { SCOPED_TIMER(exec_time_counter()); SCOPED_TIMER(_open_timer); RETURN_IF_ERROR(Base::open(state)); auto& p = _parent->cast(); - SCOPED_CONSUME_MEM_TRACKER(_mem_tracker.get()); - int local_size = 0; - for (int i = 0; i < channels.size(); ++i) { - RETURN_IF_ERROR(channels[i]->open(state)); - if (channels[i]->is_local()) { - local_size++; - } - } if (_part_type == TPartitionType::UNPARTITIONED || _part_type == TPartitionType::RANDOM || _part_type == TPartitionType::TABLE_SINK_RANDOM_PARTITIONED) { std::random_device rd; std::mt19937 g(rd()); shuffle(channels.begin(), channels.end(), g); } + size_t local_size = 0; + for (int i = 0; i < channels.size(); ++i) { + RETURN_IF_ERROR(channels[i]->open(state)); + if (channels[i]->is_local()) { + local_size++; + _last_local_channel_idx = i; + } + } only_local_exchange = local_size == channels.size(); + _rpc_channels_num = channels.size() - local_size; + PUniqueId id; id.set_hi(_state->query_id().hi); id.set_lo(_state->query_id().lo); @@ -140,7 +155,6 @@ Status ExchangeSinkLocalState::open(RuntimeState* state) { _queue_dependency = Dependency::create_shared(_parent->operator_id(), _parent->node_id(), "ExchangeSinkQueueDependency", true); _sink_buffer->set_dependency(_queue_dependency, _finish_dependency); - _finish_dependency->block(); } if ((_part_type == TPartitionType::UNPARTITIONED || channels.size() == 1) && @@ -152,7 +166,7 @@ Status ExchangeSinkLocalState::open(RuntimeState* state) { vectorized::BroadcastPBlockHolderMemLimiter::create_shared(_broadcast_dependency); } else if (local_size > 0) { size_t dep_id = 0; - for (auto* channel : channels) { + for (auto& channel : channels) { if (channel->is_local()) { if (auto dep = channel->get_local_channel_dependency()) { _local_channels_dependency.push_back(dep); @@ -167,16 +181,18 @@ Status ExchangeSinkLocalState::open(RuntimeState* state) { } if (_part_type == TPartitionType::HASH_PARTITIONED) { _partition_count = channels.size(); - _partitioner.reset(new vectorized::Crc32HashPartitioner( - channels.size())); + _partitioner = + std::make_unique>( + channels.size()); RETURN_IF_ERROR(_partitioner->init(p._texprs)); RETURN_IF_ERROR(_partitioner->prepare(state, p._row_desc)); _profile->add_info_string("Partitioner", fmt::format("Crc32HashPartitioner({})", _partition_count)); } else if (_part_type == TPartitionType::BUCKET_SHFFULE_HASH_PARTITIONED) { - _partition_count = channel_shared_ptrs.size(); - _partitioner.reset(new vectorized::Crc32HashPartitioner( - channel_shared_ptrs.size())); + _partition_count = channels.size(); + _partitioner = + std::make_unique>( + channels.size()); RETURN_IF_ERROR(_partitioner->init(p._texprs)); RETURN_IF_ERROR(_partitioner->prepare(state, p._row_desc)); _profile->add_info_string("Partitioner", @@ -222,12 +238,13 @@ Status 
ExchangeSinkLocalState::open(RuntimeState* state) { } else if (_part_type == TPartitionType::TABLE_SINK_HASH_PARTITIONED) { _partition_count = channels.size() * config::table_sink_partition_write_max_partition_nums_per_writer; - _partitioner.reset(new vectorized::Crc32HashPartitioner( - _partition_count)); - _partition_function.reset(new HashPartitionFunction(_partitioner.get())); + _partitioner = + std::make_unique>( + _partition_count); + _partition_function = std::make_unique(_partitioner.get()); - scale_writer_partitioning_exchanger.reset(new vectorized::ScaleWriterPartitioningExchanger< - HashPartitionFunction>( + scale_writer_partitioning_exchanger = std::make_unique< + vectorized::ScaleWriterPartitioningExchanger>( channels.size(), *_partition_function, _partition_count, channels.size(), 1, config::table_sink_partition_write_min_partition_data_processed_rebalance_threshold / state->task_num() == @@ -240,7 +257,7 @@ Status ExchangeSinkLocalState::open(RuntimeState* state) { 0 ? config::table_sink_partition_write_min_data_processed_rebalance_threshold : config::table_sink_partition_write_min_data_processed_rebalance_threshold / - state->task_num())); + state->task_num()); RETURN_IF_ERROR(_partitioner->init(p._texprs)); RETURN_IF_ERROR(_partitioner->prepare(state, p._row_desc)); @@ -264,7 +281,7 @@ Status ExchangeSinkLocalState::_send_new_partition_batch() { vectorized::Block tmp_block = _row_distribution._batching_block->to_block(); // Borrow out, for lval ref auto& p = _parent->cast(); - // these order is only. + // these order is unique. // 1. clear batching stats(and flag goes true) so that we won't make a new batching process in dealing batched block. // 2. deal batched block // 3. now reuse the column of lval block. cuz write doesn't real adjust it. it generate a new block from that. @@ -338,7 +355,6 @@ Status ExchangeSinkOperatorX::init(const TDataSink& tsink) { Status ExchangeSinkOperatorX::open(RuntimeState* state) { RETURN_IF_ERROR(DataSinkOperatorX::open(state)); _state = state; - _mem_tracker = std::make_unique("ExchangeSinkOperatorX:"); _compression_type = state->fragement_transmission_compression_type(); if (_part_type == TPartitionType::TABLET_SINK_SHUFFLE_PARTITIONED) { if (_output_tuple_id == -1) { @@ -360,17 +376,15 @@ void ExchangeSinkOperatorX::_handle_eof_channel(RuntimeState* state, ChannelPtrT Status st) { channel->set_receiver_eof(st); // Chanel will not send RPC to the downstream when eof, so close chanel by OK status. 
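The `on_channel_finished()` logic shown earlier retires each channel exactly once and lets the last `fetch_sub` flip the sink to finished. A condensed sketch of that countdown (hypothetical `ChannelTracker`, standing in for the local state):

```cpp
#include <atomic>
#include <cstdint>
#include <mutex>
#include <set>

// Every channel must report completion exactly once; the call that retires
// the last working channel flips the whole sink to "finished" (in the real
// code: set_reach_limit() plus waking the finish dependency).
class ChannelTracker {
public:
    explicit ChannelTracker(int working_channels) : _working(working_channels) {}

    // Returns true iff this call retired the last working channel.
    bool on_channel_finished(int64_t channel_id) {
        std::lock_guard<std::mutex> lock(_mutex);
        if (!_finished.insert(channel_id).second) {
            return false; // duplicate report; the real code logs a warning
        }
        return _working.fetch_sub(1) == 1;
    }

private:
    std::mutex _mutex;
    std::set<int64_t> _finished;
    std::atomic_int _working;
};
```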
- static_cast(channel->close(state, Status::OK())); + static_cast(channel->close(state)); } Status ExchangeSinkOperatorX::sink(RuntimeState* state, vectorized::Block* block, bool eos) { auto& local_state = get_local_state(state); COUNTER_UPDATE(local_state.rows_input_counter(), (int64_t)block->rows()); - COUNTER_UPDATE(local_state.rows_sent_counter(), (int64_t)block->rows()); SCOPED_TIMER(local_state.exec_time_counter()); - local_state._peak_memory_usage_counter->set(local_state._mem_tracker->peak_consumption()); bool all_receiver_eof = true; - for (auto* channel : local_state.channels) { + for (auto& channel : local_state.channels) { if (!channel->is_receiver_eof()) { all_receiver_eof = false; break; @@ -379,6 +393,10 @@ Status ExchangeSinkOperatorX::sink(RuntimeState* state, vectorized::Block* block if (all_receiver_eof) { return Status::EndOfFile("all data stream channels EOF"); } + Defer defer([&]() { + COUNTER_SET(local_state._peak_memory_usage_counter, + local_state._memory_used_counter->value()); + }); if (_part_type == TPartitionType::UNPARTITIONED || local_state.channels.size() == 1) { // 1. serialize depends on it is not local exchange @@ -387,68 +405,82 @@ Status ExchangeSinkOperatorX::sink(RuntimeState* state, vectorized::Block* block if (local_state.only_local_exchange) { if (!block->empty()) { Status status; - for (auto* channel : local_state.channels) { + size_t idx = 0; + for (auto& channel : local_state.channels) { if (!channel->is_receiver_eof()) { - status = channel->send_local_block(block); + // If this channel is the last, we can move this block to downstream pipeline. + // Otherwise, this block also need to be broadcasted to other channels so should be copied. + DCHECK_GE(local_state._last_local_channel_idx, 0); + status = channel->send_local_block( + block, eos, idx == local_state._last_local_channel_idx); HANDLE_CHANNEL_STATUS(state, channel, status); } + idx++; } } } else { auto block_holder = vectorized::BroadcastPBlockHolder::create_shared(); { - SCOPED_CONSUME_MEM_TRACKER(_mem_tracker.get()); bool serialized = false; RETURN_IF_ERROR(local_state._serializer.next_serialized_block( - block, block_holder->get_block(), local_state.channels.size(), &serialized, - eos)); + block, block_holder->get_block(), local_state._rpc_channels_num, + &serialized, eos)); if (serialized) { auto cur_block = local_state._serializer.get_block()->to_block(); if (!cur_block.empty()) { + DCHECK(eos || local_state._serializer.is_local()) << debug_string(state, 0); RETURN_IF_ERROR(local_state._serializer.serialize_block( &cur_block, block_holder->get_block(), - local_state.channels.size())); + local_state._rpc_channels_num)); } else { block_holder->reset_block(); } local_state._broadcast_pb_mem_limiter->acquire(*block_holder); - for (auto* channel : local_state.channels) { + size_t idx = 0; + bool moved = false; + for (auto& channel : local_state.channels) { if (!channel->is_receiver_eof()) { Status status; if (channel->is_local()) { - status = channel->send_local_block(&cur_block); + // If this channel is the last, we can move this block to downstream pipeline. + // Otherwise, this block also need to be broadcasted to other channels so should be copied. 
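The repeated `idx == local_state._last_local_channel_idx` checks implement a copy-vs-move rule: every local receiver except the last needs its own copy of the block, while the last one may take it by move. In isolation (hypothetical `LocalChannel`; the real code tracks the index explicitly because remote channels are interleaved):

```cpp
#include <string>
#include <utility>
#include <vector>

struct LocalChannel {
    std::vector<std::string> inbox;
};

// Every receiver except the last still needs the block afterwards, so it
// gets a copy; the final receiver takes the block by move and saves one copy.
void broadcast_local(std::vector<LocalChannel*>& channels, std::string block) {
    for (size_t idx = 0; idx < channels.size(); ++idx) {
        bool is_last = (idx + 1 == channels.size());
        if (is_last) {
            channels[idx]->inbox.push_back(std::move(block)); // last: move
        } else {
            channels[idx]->inbox.push_back(block); // still needed later: copy
        }
    }
}
```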
+ DCHECK_GE(local_state._last_local_channel_idx, 0); + status = channel->send_local_block( + &cur_block, eos, + idx == local_state._last_local_channel_idx); + moved = idx == local_state._last_local_channel_idx; } else { - SCOPED_CONSUME_MEM_TRACKER(_mem_tracker.get()); status = channel->send_broadcast_block(block_holder, eos); } HANDLE_CHANNEL_STATUS(state, channel, status); } + idx++; + } + if (moved) { + local_state._serializer.reset_block(); + } else { + cur_block.clear_column_data(); + local_state._serializer.get_block()->set_mutable_columns( + cur_block.mutate_columns()); } - cur_block.clear_column_data(); - local_state._serializer.get_block()->set_mutable_columns( - cur_block.mutate_columns()); } } } } else if (_part_type == TPartitionType::RANDOM) { // 1. select channel - vectorized::PipChannel* current_channel = - local_state.channels[local_state.current_channel_idx]; + auto& current_channel = local_state.channels[local_state.current_channel_idx]; if (!current_channel->is_receiver_eof()) { // 2. serialize, send and rollover block if (current_channel->is_local()) { - auto status = current_channel->send_local_block(block); + auto status = current_channel->send_local_block(block, eos, true); HANDLE_CHANNEL_STATUS(state, current_channel, status); } else { - SCOPED_CONSUME_MEM_TRACKER(_mem_tracker.get()); - RETURN_IF_ERROR(local_state._serializer.serialize_block( - block, current_channel->ch_cur_pb_block())); - auto status = - current_channel->send_remote_block(current_channel->ch_cur_pb_block(), eos); + auto pblock = std::make_unique(); + RETURN_IF_ERROR(local_state._serializer.serialize_block(block, pblock.get())); + auto status = current_channel->send_remote_block(std::move(pblock), eos); HANDLE_CHANNEL_STATUS(state, current_channel, status); - current_channel->ch_roll_pb_block(); } } local_state.current_channel_idx = @@ -458,19 +490,36 @@ Status ExchangeSinkOperatorX::sink(RuntimeState* state, vectorized::Block* block auto rows = block->rows(); { SCOPED_TIMER(local_state._split_block_hash_compute_timer); - RETURN_IF_ERROR( - local_state._partitioner->do_partitioning(state, block, _mem_tracker.get())); + RETURN_IF_ERROR(local_state._partitioner->do_partitioning(state, block)); + } + int64_t old_channel_mem_usage = 0; + for (const auto& channel : local_state.channels) { + old_channel_mem_usage += channel->mem_usage(); } if (_part_type == TPartitionType::HASH_PARTITIONED) { + SCOPED_TIMER(local_state._distribute_rows_into_channels_timer); RETURN_IF_ERROR(channel_add_rows( state, local_state.channels, local_state._partition_count, local_state._partitioner->get_channel_ids().get(), rows, block, eos)); } else { + SCOPED_TIMER(local_state._distribute_rows_into_channels_timer); RETURN_IF_ERROR(channel_add_rows( - state, local_state.channel_shared_ptrs, local_state._partition_count, + state, local_state.channels, local_state._partition_count, local_state._partitioner->get_channel_ids().get(), rows, block, eos)); } + int64_t new_channel_mem_usage = 0; + for (const auto& channel : local_state.channels) { + new_channel_mem_usage += channel->mem_usage(); + } + COUNTER_UPDATE(local_state.memory_used_counter(), + new_channel_mem_usage - old_channel_mem_usage); + COUNTER_SET(local_state.peak_memory_usage_counter(), + local_state.memory_used_counter()->value()); } else if (_part_type == TPartitionType::TABLET_SINK_SHUFFLE_PARTITIONED) { + int64_t old_channel_mem_usage = 0; + for (const auto& channel : local_state.channels) { + old_channel_mem_usage += channel->mem_usage(); + } // check out of limit 
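The old/new `mem_usage()` sums wrapped around `channel_add_rows` charge only the delta to the operator's memory counter instead of tracking every allocation individually. A sketch of that accounting (hypothetical `Channel`):

```cpp
#include <cstddef>
#include <cstdint>
#include <vector>

struct Channel {
    size_t pending_bytes = 0;
    size_t mem_usage() const { return pending_bytes; }
};

// Sum the channels' memory before and after distributing a block, and charge
// only the difference to the operator's memory counter.
int64_t distribute_and_account(std::vector<Channel>& channels,
                               const std::vector<size_t>& bytes_per_channel) {
    int64_t old_usage = 0;
    for (const auto& ch : channels) old_usage += (int64_t)ch.mem_usage();

    for (size_t i = 0; i < channels.size(); ++i) {
        channels[i].pending_bytes += bytes_per_channel[i]; // stand-in for adding rows
    }

    int64_t new_usage = 0;
    for (const auto& ch : channels) new_usage += (int64_t)ch.mem_usage();
    return new_usage - old_usage; // value passed to COUNTER_UPDATE(memory_used_counter, ...)
}
```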
RETURN_IF_ERROR(local_state._send_new_partition_batch()); std::shared_ptr convert_block = std::make_shared(); @@ -502,39 +551,61 @@ Status ExchangeSinkOperatorX::sink(RuntimeState* state, vectorized::Block* block local_state._row_distribution._deal_batched = true; RETURN_IF_ERROR(local_state._send_new_partition_batch()); } - // the convert_block maybe different with block after execute exprs - // when send data we still use block - RETURN_IF_ERROR(channel_add_rows_with_idx(state, local_state.channels, num_channels, - channel2rows, block, eos)); + { + SCOPED_TIMER(local_state._distribute_rows_into_channels_timer); + // the convert_block maybe different with block after execute exprs + // when send data we still use block + RETURN_IF_ERROR(channel_add_rows_with_idx(state, local_state.channels, num_channels, + channel2rows, block, eos)); + } + int64_t new_channel_mem_usage = 0; + for (const auto& channel : local_state.channels) { + new_channel_mem_usage += channel->mem_usage(); + } + COUNTER_UPDATE(local_state.memory_used_counter(), + new_channel_mem_usage - old_channel_mem_usage); + COUNTER_SET(local_state.peak_memory_usage_counter(), + local_state.memory_used_counter()->value()); } else if (_part_type == TPartitionType::TABLE_SINK_HASH_PARTITIONED) { + int64_t old_channel_mem_usage = 0; + for (const auto& channel : local_state.channels) { + old_channel_mem_usage += channel->mem_usage(); + } { SCOPED_TIMER(local_state._split_block_hash_compute_timer); - RETURN_IF_ERROR( - local_state._partitioner->do_partitioning(state, block, _mem_tracker.get())); + RETURN_IF_ERROR(local_state._partitioner->do_partitioning(state, block)); } std::vector> assignments = local_state.scale_writer_partitioning_exchanger->accept(block); - RETURN_IF_ERROR(channel_add_rows_with_idx( - state, local_state.channels, local_state.channels.size(), assignments, block, eos)); + { + SCOPED_TIMER(local_state._distribute_rows_into_channels_timer); + RETURN_IF_ERROR(channel_add_rows_with_idx(state, local_state.channels, + local_state.channels.size(), assignments, + block, eos)); + } + int64_t new_channel_mem_usage = 0; + for (const auto& channel : local_state.channels) { + new_channel_mem_usage += channel->mem_usage(); + } + COUNTER_UPDATE(local_state.memory_used_counter(), + new_channel_mem_usage - old_channel_mem_usage); + COUNTER_SET(local_state.peak_memory_usage_counter(), + local_state.memory_used_counter()->value()); } else if (_part_type == TPartitionType::TABLE_SINK_RANDOM_PARTITIONED) { // Control the number of channels according to the flow, thereby controlling the number of table sink writers. // 1. select channel - vectorized::PipChannel* current_channel = - local_state.channels[local_state.current_channel_idx]; + auto& current_channel = local_state.channels[local_state.current_channel_idx]; if (!current_channel->is_receiver_eof()) { // 2. 
serialize, send and rollover block if (current_channel->is_local()) { - auto status = current_channel->send_local_block(block); + auto status = current_channel->send_local_block(block, eos, true); HANDLE_CHANNEL_STATUS(state, current_channel, status); } else { - SCOPED_CONSUME_MEM_TRACKER(_mem_tracker.get()); - RETURN_IF_ERROR(local_state._serializer.serialize_block( - block, current_channel->ch_cur_pb_block())); - auto status = - current_channel->send_remote_block(current_channel->ch_cur_pb_block(), eos); + auto pblock = std::make_unique(); + RETURN_IF_ERROR(local_state._serializer.serialize_block(block, pblock.get())); + auto status = current_channel->send_remote_block(std::move(pblock), eos); HANDLE_CHANNEL_STATUS(state, current_channel, status); - current_channel->ch_roll_pb_block(); } _data_processed += block->bytes(); } @@ -556,48 +627,26 @@ Status ExchangeSinkOperatorX::sink(RuntimeState* state, vectorized::Block* block Status final_st = Status::OK(); if (eos) { local_state._serializer.reset_block(); - for (int i = 0; i < local_state.channels.size(); ++i) { - Status st = local_state.channels[i]->close(state, Status::OK()); + for (auto& channel : local_state.channels) { + Status st = channel->close(state); if (!st.ok() && final_st.ok()) { final_st = st; } } - if (local_state._sink_buffer) { - local_state._sink_buffer->set_should_stop(); - } } return final_st; } -Status ExchangeSinkOperatorX::serialize_block(ExchangeSinkLocalState& state, vectorized::Block* src, - PBlock* dest, int num_receivers) { - { - SCOPED_TIMER(state.serialize_batch_timer()); - dest->Clear(); - size_t uncompressed_bytes = 0; - size_t compressed_bytes = 0; - RETURN_IF_ERROR(src->serialize(_state->be_exec_version(), dest, &uncompressed_bytes, - &compressed_bytes, _compression_type, - _transfer_large_data_by_brpc)); - COUNTER_UPDATE(state.bytes_sent_counter(), compressed_bytes * num_receivers); - COUNTER_UPDATE(state.uncompressed_bytes_counter(), uncompressed_bytes * num_receivers); - COUNTER_UPDATE(state.compress_timer(), src->get_compress_time()); - } - - return Status::OK(); -} - void ExchangeSinkLocalState::register_channels(pipeline::ExchangeSinkBuffer* buffer) { - for (auto channel : channels) { - ((vectorized::PipChannel*)channel)->register_exchange_buffer(buffer); + for (auto& channel : channels) { + channel->register_exchange_buffer(buffer); } } -template -Status ExchangeSinkOperatorX::channel_add_rows(RuntimeState* state, Channels& channels, - int num_channels, - const HashValueType* __restrict channel_ids, - int rows, vectorized::Block* block, bool eos) { +Status ExchangeSinkOperatorX::channel_add_rows( + RuntimeState* state, std::vector>& channels, + size_t num_channels, const uint32_t* __restrict channel_ids, size_t rows, + vectorized::Block* block, bool eos) { std::vector> channel2rows; channel2rows.resize(num_channels); for (uint32_t i = 0; i < rows; i++) { @@ -609,10 +658,10 @@ Status ExchangeSinkOperatorX::channel_add_rows(RuntimeState* state, Channels& ch return Status::OK(); } -template Status ExchangeSinkOperatorX::channel_add_rows_with_idx( - RuntimeState* state, Channels& channels, int num_channels, - std::vector>& channel2rows, vectorized::Block* block, bool eos) { + RuntimeState* state, std::vector>& channels, + size_t num_channels, std::vector>& channel2rows, + vectorized::Block* block, bool eos) { Status status = Status::OK(); for (int i = 0; i < num_channels; ++i) { if (!channels[i]->is_receiver_eof() && !channel2rows[i].empty()) { @@ -636,12 +685,12 @@ std::string 
ExchangeSinkLocalState::debug_string(int indentation_level) const { fmt::memory_buffer debug_string_buffer; fmt::format_to(debug_string_buffer, "{}", Base::debug_string(indentation_level)); if (_sink_buffer) { - fmt::format_to( - debug_string_buffer, - ", Sink Buffer: (_should_stop = {}, _busy_channels = {}, _is_finishing = {}), " - "_reach_limit: {}", - _sink_buffer->_should_stop.load(), _sink_buffer->_busy_channels.load(), - _sink_buffer->_is_finishing.load(), _reach_limit.load()); + fmt::format_to(debug_string_buffer, + ", Sink Buffer: (_is_finishing = {}, blocks in queue: {}, queue capacity: " + "{}, queue dep: {}), _reach_limit: {}, working channels: {}", + _sink_buffer->_is_finishing.load(), _sink_buffer->_total_queue_size, + _sink_buffer->_queue_capacity, (void*)_sink_buffer->_queue_dependency.get(), + _reach_limit.load(), _working_channels_count.load()); } return fmt::to_string(debug_string_buffer); } diff --git a/be/src/pipeline/exec/exchange_sink_operator.h b/be/src/pipeline/exec/exchange_sink_operator.h index adf8a3424706d2..63d50290005470 100644 --- a/be/src/pipeline/exec/exchange_sink_operator.h +++ b/be/src/pipeline/exec/exchange_sink_operator.h @@ -19,7 +19,9 @@ #include +#include #include +#include #include "common/status.h" #include "exchange_sink_buffer.h" @@ -53,13 +55,10 @@ class ExchangeSinkLocalState final : public PipelineXSinkLocalState<> { public: ExchangeSinkLocalState(DataSinkOperatorXBase* parent, RuntimeState* state) - : Base(parent, state), - current_channel_idx(0), - only_local_exchange(false), - _serializer(this) { + : Base(parent, state), _serializer(this) { _finish_dependency = std::make_shared(parent->operator_id(), parent->node_id(), - parent->get_name() + "_FINISH_DEPENDENCY", true); + parent->get_name() + "_FINISH_DEPENDENCY", false); } std::vector dependencies() const override { @@ -78,27 +77,13 @@ class ExchangeSinkLocalState final : public PipelineXSinkLocalState<> { Status open(RuntimeState* state) override; Status close(RuntimeState* state, Status exec_status) override; Dependency* finishdependency() override { return _finish_dependency.get(); } - Status serialize_block(vectorized::Block* src, PBlock* dest, int num_receivers = 1); void register_channels(pipeline::ExchangeSinkBuffer* buffer); - RuntimeProfile::Counter* brpc_wait_timer() { return _brpc_wait_timer; } RuntimeProfile::Counter* blocks_sent_counter() { return _blocks_sent_counter; } - RuntimeProfile::Counter* rows_sent_counter() { return _rows_sent_counter; } RuntimeProfile::Counter* local_send_timer() { return _local_send_timer; } RuntimeProfile::Counter* local_bytes_send_counter() { return _local_bytes_send_counter; } RuntimeProfile::Counter* local_sent_rows() { return _local_sent_rows; } - RuntimeProfile::Counter* brpc_send_timer() { return _brpc_send_timer; } - RuntimeProfile::Counter* serialize_batch_timer() { return _serialize_batch_timer; } - RuntimeProfile::Counter* split_block_distribute_by_channel_timer() { - return _split_block_distribute_by_channel_timer; - } - RuntimeProfile::Counter* bytes_sent_counter() { return _bytes_sent_counter; } - RuntimeProfile::Counter* split_block_hash_compute_timer() { - return _split_block_hash_compute_timer; - } RuntimeProfile::Counter* merge_block_timer() { return _merge_block_timer; } - RuntimeProfile::Counter* compress_timer() { return _compress_timer; } - RuntimeProfile::Counter* uncompressed_bytes_counter() { return _uncompressed_bytes_counter; } [[nodiscard]] bool transfer_large_data_by_brpc() const; bool is_finished() const override { 
return _reach_limit.load(); } void set_reach_limit() { _reach_limit = true; }; @@ -112,10 +97,11 @@ class ExchangeSinkLocalState final : public PipelineXSinkLocalState<> { return Status::OK(); } Status _send_new_partition_batch(); - std::vector channels; - std::vector> channel_shared_ptrs; - int current_channel_idx; // index of current channel to send to if _random == true - bool only_local_exchange; + std::vector> channels; + int current_channel_idx {0}; // index of current channel to send to if _random == true + bool only_local_exchange {false}; + + void on_channel_finished(InstanceLoId channel_id); // for external table sink hash partition std::unique_ptr> @@ -123,23 +109,19 @@ class ExchangeSinkLocalState final : public PipelineXSinkLocalState<> { private: friend class ExchangeSinkOperatorX; - friend class vectorized::Channel; - friend class vectorized::PipChannel; - friend class vectorized::BlockSerializer; + friend class vectorized::Channel; + friend class vectorized::BlockSerializer; std::unique_ptr _sink_buffer = nullptr; RuntimeProfile::Counter* _serialize_batch_timer = nullptr; RuntimeProfile::Counter* _compress_timer = nullptr; - RuntimeProfile::Counter* _brpc_send_timer = nullptr; - RuntimeProfile::Counter* _brpc_wait_timer = nullptr; RuntimeProfile::Counter* _bytes_sent_counter = nullptr; RuntimeProfile::Counter* _uncompressed_bytes_counter = nullptr; RuntimeProfile::Counter* _local_sent_rows = nullptr; RuntimeProfile::Counter* _local_send_timer = nullptr; RuntimeProfile::Counter* _split_block_hash_compute_timer = nullptr; - RuntimeProfile::Counter* _split_block_distribute_by_channel_timer = nullptr; + RuntimeProfile::Counter* _distribute_rows_into_channels_timer = nullptr; RuntimeProfile::Counter* _blocks_sent_counter = nullptr; - RuntimeProfile::Counter* _rows_sent_counter = nullptr; // Throughput per total time spent in sender RuntimeProfile::Counter* _overall_throughput = nullptr; // Used to counter send bytes under local data exchange @@ -154,7 +136,8 @@ class ExchangeSinkLocalState final : public PipelineXSinkLocalState<> { int _sender_id; std::shared_ptr _broadcast_pb_mem_limiter; - vectorized::BlockSerializer _serializer; + size_t _rpc_channels_num = 0; + vectorized::BlockSerializer _serializer; std::shared_ptr _queue_dependency = nullptr; std::shared_ptr _broadcast_dependency = nullptr; @@ -179,7 +162,7 @@ class ExchangeSinkLocalState final : public PipelineXSinkLocalState<> { */ std::vector> _local_channels_dependency; std::unique_ptr _partitioner; - int _partition_count; + size_t _partition_count; std::shared_ptr _finish_dependency; @@ -202,6 +185,11 @@ class ExchangeSinkLocalState final : public PipelineXSinkLocalState<> { // for external table sink hash partition std::unique_ptr _partition_function = nullptr; std::atomic _reach_limit = false; + int _last_local_channel_idx = -1; + + std::atomic_int _working_channels_count = 0; + std::set _finished_channels; + std::mutex _finished_channels_mutex; }; class ExchangeSinkOperatorX final : public DataSinkOperatorX { @@ -217,9 +205,8 @@ class ExchangeSinkOperatorX final : public DataSinkOperatorX void _handle_eof_channel(RuntimeState* state, ChannelPtrType channel, Status st); - template - Status channel_add_rows(RuntimeState* state, Channels& channels, int num_channels, - const HashValueType* channel_ids, int rows, vectorized::Block* block, - bool eos); + Status channel_add_rows(RuntimeState* state, + std::vector>& channels, + size_t num_channels, const uint32_t* __restrict channel_ids, + size_t rows, vectorized::Block* 
block, bool eos); - template - Status channel_add_rows_with_idx(RuntimeState* state, Channels& channels, int num_channels, + Status channel_add_rows_with_idx(RuntimeState* state, + std::vector>& channels, + size_t num_channels, std::vector>& channel2rows, vectorized::Block* block, bool eos); RuntimeState* _state = nullptr; @@ -252,7 +240,6 @@ class ExchangeSinkOperatorX final : public DataSinkOperatorX _dests; - std::unique_ptr _mem_tracker; // Identifier of the destination plan node. const PlanNodeId _dest_node_id; diff --git a/be/src/pipeline/exec/exchange_source_operator.cpp b/be/src/pipeline/exec/exchange_source_operator.cpp index cf2055ec47b071..eafefa2e4c06bb 100644 --- a/be/src/pipeline/exec/exchange_source_operator.cpp +++ b/be/src/pipeline/exec/exchange_source_operator.cpp @@ -17,6 +17,7 @@ #include "exchange_source_operator.h" +#include #include #include "pipeline/exec/operator.h" @@ -29,7 +30,7 @@ #include "vec/runtime/vdata_stream_recvr.h" namespace doris::pipeline { - +#include "common/compile_check_begin.h" ExchangeLocalState::ExchangeLocalState(RuntimeState* state, OperatorXBase* parent) : Base(state, parent), num_rows_skipped(0), is_ready(false) {} @@ -62,8 +63,8 @@ Status ExchangeLocalState::init(RuntimeState* state, LocalStateInfo& info) { SCOPED_TIMER(_init_timer); auto& p = _parent->cast(); stream_recvr = state->exec_env()->vstream_mgr()->create_recvr( - state, p.input_row_desc(), state->fragment_instance_id(), p.node_id(), p.num_senders(), - profile(), p.is_merging()); + state, this, p.input_row_desc(), state->fragment_instance_id(), p.node_id(), + p.num_senders(), profile(), p.is_merging()); const auto& queues = stream_recvr->sender_queues(); deps.resize(queues.size()); metrics.resize(queues.size()); @@ -77,6 +78,10 @@ Status ExchangeLocalState::init(RuntimeState* state, LocalStateInfo& info) { TUnit ::TIME_NS, timer_name, 1); } + get_data_from_recvr_timer = ADD_TIMER(_runtime_profile, "GetDataFromRecvrTime"); + filter_timer = ADD_TIMER(_runtime_profile, "FilterTime"); + create_merger_timer = ADD_TIMER(_runtime_profile, "CreateMergerTime"); + return Status::OK(); } @@ -104,7 +109,9 @@ ExchangeSourceOperatorX::ExchangeSourceOperatorX(ObjectPool* pool, const TPlanNo std::vector(tnode.nullable_tuples.begin(), tnode.nullable_tuples.begin() + tnode.exchange_node.input_row_tuples.size())), - _offset(tnode.exchange_node.__isset.offset ? tnode.exchange_node.offset : 0) {} + _offset(tnode.exchange_node.__isset.offset ? 
tnode.exchange_node.offset : 0) { + _is_serial_operator = tnode.__isset.is_serial_operator && tnode.is_serial_operator; +} Status ExchangeSourceOperatorX::init(const TPlanNode& tnode, RuntimeState* state) { RETURN_IF_ERROR(OperatorX::init(tnode, state)); @@ -141,15 +148,22 @@ Status ExchangeSourceOperatorX::get_block(RuntimeState* state, vectorized::Block }); SCOPED_TIMER(local_state.exec_time_counter()); if (_is_merging && !local_state.is_ready) { + SCOPED_TIMER(local_state.create_merger_timer); RETURN_IF_ERROR(local_state.stream_recvr->create_merger( local_state.vsort_exec_exprs.lhs_ordering_expr_ctxs(), _is_asc_order, _nulls_first, state->batch_size(), _limit, _offset)); local_state.is_ready = true; return Status::OK(); } - auto status = local_state.stream_recvr->get_next(block, eos); - RETURN_IF_ERROR(doris::vectorized::VExprContext::filter_block(local_state.conjuncts(), block, block->columns())); + { + SCOPED_TIMER(local_state.get_data_from_recvr_timer); + RETURN_IF_ERROR(local_state.stream_recvr->get_next(block, eos)); + } + { + SCOPED_TIMER(local_state.filter_timer); + RETURN_IF_ERROR(doris::vectorized::VExprContext::filter_block(local_state.conjuncts(), + block, block->columns())); + } // In vsortrunmerger, eos may be set to true while the block is not empty, // so eos == true alone does not guarantee the block contains no valid data if (!*eos || block->rows() > 0) { @@ -158,9 +172,10 @@ Status ExchangeSourceOperatorX::get_block(RuntimeState* state, vectorized::Block local_state.num_rows_skipped += block->rows(); block->set_num_rows(0); } else if (local_state.num_rows_skipped < _offset) { - auto offset = _offset - local_state.num_rows_skipped; + int64_t offset = _offset - local_state.num_rows_skipped; local_state.num_rows_skipped = _offset; - block->set_num_rows(block->rows() - offset); + // skip the remaining offset rows at the head of this block + block->skip_num_rows(offset); } } if (local_state.num_rows_returned() + block->rows() < _limit) { @@ -171,10 +186,8 @@ Status ExchangeSourceOperatorX::get_block(RuntimeState* state, vectorized::Block block->set_num_rows(limit); local_state.set_num_rows_returned(_limit); } - COUNTER_SET(local_state.rows_returned_counter(), local_state.num_rows_returned()); - COUNTER_UPDATE(local_state.blocks_returned_counter(), 1); } - return status; + return Status::OK(); } Status ExchangeLocalState::close(RuntimeState* state) { diff --git a/be/src/pipeline/exec/exchange_source_operator.h b/be/src/pipeline/exec/exchange_source_operator.h index 0fe3dcbb590b7d..f938f5007d1643 100644 --- a/be/src/pipeline/exec/exchange_source_operator.h +++ b/be/src/pipeline/exec/exchange_source_operator.h @@ -59,6 +59,9 @@ class ExchangeLocalState final : public PipelineXLocalState<> { std::vector> deps; std::vector metrics; + RuntimeProfile::Counter* get_data_from_recvr_timer = nullptr; + RuntimeProfile::Counter* filter_timer = nullptr; + RuntimeProfile::Counter* create_merger_timer = nullptr; }; class ExchangeSourceOperatorX final : public OperatorX { @@ -81,7 +84,7 @@ class ExchangeSourceOperatorX final : public OperatorX { [[nodiscard]] bool is_merging() const { return _is_merging; } DataDistribution required_data_distribution() const override { - if (OperatorX::ignore_data_distribution()) { + if (OperatorX::is_serial_operator()) { return {ExchangeType::NOOP}; } return _partition_type == TPartitionType::HASH_PARTITIONED diff --git a/be/src/pipeline/exec/file_scan_operator.cpp b/be/src/pipeline/exec/file_scan_operator.cpp index 6fa7401e278451..7afbb29134c079 100644 --- a/be/src/pipeline/exec/file_scan_operator.cpp
+++ b/be/src/pipeline/exec/file_scan_operator.cpp @@ -29,7 +29,7 @@ #include "vec/exec/scan/vfile_scanner.h" namespace doris::pipeline { - +#include "common/compile_check_begin.h" Status FileScanLocalState::_init_scanners(std::list* scanners) { if (_split_source->num_scan_ranges() == 0) { _eos = true; @@ -37,10 +37,10 @@ Status FileScanLocalState::_init_scanners(std::list* s } auto& p = _parent->cast(); - size_t shard_num = std::min( + uint32_t shard_num = std::min( config::doris_scanner_thread_pool_thread_num / state()->query_parallel_instance_num(), _max_scanners); - shard_num = std::max(shard_num, (size_t)1); + shard_num = std::max(shard_num, 1U); _kv_cache.reset(new vectorized::ShardedKVCache(shard_num)); for (int i = 0; i < _max_scanners; ++i) { std::unique_ptr scanner = vectorized::VFileScanner::create_unique( diff --git a/be/src/pipeline/exec/group_commit_block_sink_operator.cpp b/be/src/pipeline/exec/group_commit_block_sink_operator.cpp index 6db49bb7ab1089..9f99d55d3ea989 100644 --- a/be/src/pipeline/exec/group_commit_block_sink_operator.cpp +++ b/be/src/pipeline/exec/group_commit_block_sink_operator.cpp @@ -23,7 +23,7 @@ #include "vec/sink/vtablet_block_convertor.h" namespace doris::pipeline { - +#include "common/compile_check_begin.h" GroupCommitBlockSinkLocalState::~GroupCommitBlockSinkLocalState() { if (_load_block_queue) { _remove_estimated_wal_bytes(); @@ -46,8 +46,6 @@ Status GroupCommitBlockSinkLocalState::open(RuntimeState* state) { _vpartition = std::make_unique(p._schema, p._partition); RETURN_IF_ERROR(_vpartition->init()); _state = state; - // profile must add to state's object pool - SCOPED_CONSUME_MEM_TRACKER(_mem_tracker.get()); _block_convertor = std::make_unique(p._output_tuple_desc); _block_convertor->init_autoinc_info(p._schema->db_id(), p._schema->table_id(), @@ -66,6 +64,7 @@ Status GroupCommitBlockSinkLocalState::open(RuntimeState* state) { } Status GroupCommitBlockSinkLocalState::_initialize_load_queue() { + SCOPED_TIMER(_init_load_queue_timer); auto& p = _parent->cast(); if (_state->exec_env()->wal_mgr()->is_running()) { RETURN_IF_ERROR(_state->exec_env()->group_commit_mgr()->get_first_block_load_queue( @@ -240,6 +239,17 @@ Status GroupCommitBlockSinkLocalState::_add_blocks(RuntimeState* state, return Status::OK(); } +Status GroupCommitBlockSinkLocalState::init(RuntimeState* state, LocalSinkStateInfo& info) { + RETURN_IF_ERROR(Base::init(state, info)); + SCOPED_TIMER(exec_time_counter()); + SCOPED_TIMER(_init_timer); + _init_load_queue_timer = ADD_TIMER(_profile, "InitLoadQueueTime"); + _valid_and_convert_block_timer = ADD_TIMER(_profile, "ValidAndConvertBlockTime"); + _find_partition_timer = ADD_TIMER(_profile, "FindPartitionTime"); + _append_blocks_timer = ADD_TIMER(_profile, "AppendBlocksTime"); + return Status::OK(); +} + Status GroupCommitBlockSinkOperatorX::init(const TDataSink& t_sink) { RETURN_IF_ERROR(Base::init(t_sink)); DCHECK(t_sink.__isset.olap_table_sink); @@ -276,7 +286,6 @@ Status GroupCommitBlockSinkOperatorX::sink(RuntimeState* state, vectorized::Bloc auto& local_state = get_local_state(state); SCOPED_TIMER(local_state.exec_time_counter()); COUNTER_UPDATE(local_state.rows_input_counter(), (int64_t)input_block->rows()); - SCOPED_CONSUME_MEM_TRACKER(local_state._mem_tracker.get()); if (!local_state._load_block_queue) { RETURN_IF_ERROR(local_state._initialize_load_queue()); } @@ -296,7 +305,7 @@ Status GroupCommitBlockSinkOperatorX::sink(RuntimeState* state, vectorized::Bloc int64_t num_selected_rows = state->num_rows_load_total() - 
state->num_rows_load_unselected(); if (num_selected_rows > 0 && - (double)state->num_rows_load_filtered() / num_selected_rows > + (double)state->num_rows_load_filtered() / (double)num_selected_rows > _max_filter_ratio) { return Status::DataQualityError("too many filtered rows"); } @@ -321,10 +330,15 @@ Status GroupCommitBlockSinkOperatorX::sink(RuntimeState* state, vectorized::Bloc std::shared_ptr block; bool has_filtered_rows = false; - RETURN_IF_ERROR(local_state._block_convertor->validate_and_convert_block( - state, input_block, block, local_state._output_vexpr_ctxs, rows, has_filtered_rows)); + { + SCOPED_TIMER(local_state._valid_and_convert_block_timer); + RETURN_IF_ERROR(local_state._block_convertor->validate_and_convert_block( + state, input_block, block, local_state._output_vexpr_ctxs, rows, + has_filtered_rows)); + } local_state._has_filtered_rows = false; if (!local_state._vpartition->is_auto_partition()) { + SCOPED_TIMER(local_state._find_partition_timer); //reuse vars for find_partition local_state._partitions.assign(rows, nullptr); local_state._filter_bitmap.Reset(rows); @@ -354,23 +368,26 @@ Status GroupCommitBlockSinkOperatorX::sink(RuntimeState* state, vectorized::Bloc } } } - - if (local_state._block_convertor->num_filtered_rows() > 0 || local_state._has_filtered_rows) { - auto cloneBlock = block->clone_without_columns(); - auto res_block = vectorized::MutableBlock::build_mutable_block(&cloneBlock); - for (int i = 0; i < rows; ++i) { - if (local_state._block_convertor->filter_map()[i]) { - continue; - } - if (local_state._filter_bitmap.Get(i)) { - continue; + { + SCOPED_TIMER(local_state._append_blocks_timer); + if (local_state._block_convertor->num_filtered_rows() > 0 || + local_state._has_filtered_rows) { + auto cloneBlock = block->clone_without_columns(); + auto res_block = vectorized::MutableBlock::build_mutable_block(&cloneBlock); + for (int i = 0; i < rows; ++i) { + if (local_state._block_convertor->filter_map()[i]) { + continue; + } + if (local_state._filter_bitmap.Get(i)) { + continue; + } + res_block.add_row(block.get(), i); } - res_block.add_row(block.get(), i); + block->swap(res_block.to_block()); } - block->swap(res_block.to_block()); + // add block into block queue + RETURN_IF_ERROR(local_state._add_block(state, block)); } - // add block into block queue - RETURN_IF_ERROR(local_state._add_block(state, block)); return wind_up(); } diff --git a/be/src/pipeline/exec/group_commit_block_sink_operator.h b/be/src/pipeline/exec/group_commit_block_sink_operator.h index 32ca0613652ae4..e469aee8df595c 100644 --- a/be/src/pipeline/exec/group_commit_block_sink_operator.h +++ b/be/src/pipeline/exec/group_commit_block_sink_operator.h @@ -42,8 +42,8 @@ class GroupCommitBlockSinkLocalState final : public PipelineXSinkLocalState dependencies() const override { @@ -79,6 +79,11 @@ class GroupCommitBlockSinkLocalState final : public PipelineXSinkLocalState _finish_dependency; std::shared_ptr _create_plan_dependency = nullptr; std::shared_ptr _put_block_dependency = nullptr; + + RuntimeProfile::Counter* _init_load_queue_timer = nullptr; + RuntimeProfile::Counter* _valid_and_convert_block_timer = nullptr; + RuntimeProfile::Counter* _find_partition_timer = nullptr; + RuntimeProfile::Counter* _append_blocks_timer = nullptr; }; class GroupCommitBlockSinkOperatorX final diff --git a/be/src/pipeline/exec/group_commit_scan_operator.cpp b/be/src/pipeline/exec/group_commit_scan_operator.cpp index 3e6ad62c5dcb7c..141a5e7bf770c5 100644 --- a/be/src/pipeline/exec/group_commit_scan_operator.cpp 
+++ b/be/src/pipeline/exec/group_commit_scan_operator.cpp @@ -20,7 +20,7 @@ #include namespace doris::pipeline { - +#include "common/compile_check_begin.h" GroupCommitOperatorX::GroupCommitOperatorX(ObjectPool* pool, const TPlanNode& tnode, int operator_id, const DescriptorTbl& descs, int parallel_tasks) @@ -31,6 +31,7 @@ GroupCommitOperatorX::GroupCommitOperatorX(ObjectPool* pool, const TPlanNode& tn Status GroupCommitOperatorX::get_block(RuntimeState* state, vectorized::Block* block, bool* eos) { auto& local_state = get_local_state(state); + SCOPED_TIMER(local_state.exec_time_counter()); bool find_node = false; while (!find_node && !*eos) { RETURN_IF_ERROR(local_state.load_block_queue->get_block(state, block, &find_node, eos, diff --git a/be/src/pipeline/exec/hashjoin_build_sink.cpp b/be/src/pipeline/exec/hashjoin_build_sink.cpp index 8f7b176a979a4d..37de9ac93d839f 100644 --- a/be/src/pipeline/exec/hashjoin_build_sink.cpp +++ b/be/src/pipeline/exec/hashjoin_build_sink.cpp @@ -22,11 +22,12 @@ #include "exprs/bloom_filter_func.h" #include "pipeline/exec/hashjoin_probe_operator.h" #include "pipeline/exec/operator.h" +#include "pipeline/pipeline_task.h" #include "vec/data_types/data_type_nullable.h" #include "vec/utils/template_helpers.hpp" namespace doris::pipeline { - +#include "common/compile_check_begin.h" HashJoinBuildSinkLocalState::HashJoinBuildSinkLocalState(DataSinkOperatorXBase* parent, RuntimeState* state) : JoinBuildSinkLocalState(parent, state) { @@ -42,7 +43,7 @@ Status HashJoinBuildSinkLocalState::init(RuntimeState* state, LocalSinkStateInfo _shared_state->join_op_variants = p._join_op_variants; _shared_state->is_null_safe_eq_join = p._is_null_safe_eq_join; - _shared_state->store_null_in_hash_table = p._store_null_in_hash_table; + _shared_state->serialize_null_into_key = p._serialize_null_into_key; _build_expr_ctxs.resize(p._build_expr_ctxs.size()); for (size_t i = 0; i < _build_expr_ctxs.size(); i++) { RETURN_IF_ERROR(p._build_expr_ctxs[i]->clone(state, _build_expr_ctxs[i])); @@ -50,19 +51,19 @@ Status HashJoinBuildSinkLocalState::init(RuntimeState* state, LocalSinkStateInfo _shared_state->build_exprs_size = _build_expr_ctxs.size(); _should_build_hash_table = true; + profile()->add_info_string("BroadcastJoin", std::to_string(p._is_broadcast_join)); if (p._is_broadcast_join) { - profile()->add_info_string("BroadcastJoin", "true"); if (state->enable_share_hash_table_for_broadcast_join()) { _should_build_hash_table = info.task_idx == 0; if (_should_build_hash_table) { - profile()->add_info_string("ShareHashTableEnabled", "true"); p._shared_hashtable_controller->set_builder_and_consumers( state->fragment_instance_id(), p.node_id()); } - } else { - profile()->add_info_string("ShareHashTableEnabled", "false"); } } + profile()->add_info_string("BuildShareHashTable", std::to_string(_should_build_hash_table)); + profile()->add_info_string("ShareHashTableEnabled", + std::to_string(state->enable_share_hash_table_for_broadcast_join())); if (!_should_build_hash_table) { _dependency->block(); _finish_dependency->block(); @@ -71,25 +72,23 @@ Status HashJoinBuildSinkLocalState::init(RuntimeState* state, LocalSinkStateInfo _finish_dependency->shared_from_this()); } + _runtime_filter_init_timer = ADD_TIMER(profile(), "RuntimeFilterInitTime"); _build_blocks_memory_usage = - ADD_CHILD_COUNTER_WITH_LEVEL(profile(), "BuildBlocks", TUnit::BYTES, "MemoryUsage", 1); + ADD_COUNTER_WITH_LEVEL(profile(), "MemoryUsageBuildBlocks", TUnit::BYTES, 1); _hash_table_memory_usage = - 
ADD_CHILD_COUNTER_WITH_LEVEL(profile(), "HashTable", TUnit::BYTES, "MemoryUsage", 1); + ADD_COUNTER_WITH_LEVEL(profile(), "MemoryUsageHashTable", TUnit::BYTES, 1); _build_arena_memory_usage = - profile()->AddHighWaterMarkCounter("BuildKeyArena", TUnit::BYTES, "MemoryUsage", 1); + ADD_COUNTER_WITH_LEVEL(profile(), "MemoryUsageBuildKeyArena", TUnit::BYTES, 1); // Build phase auto* record_profile = _should_build_hash_table ? profile() : faker_runtime_profile(); - _build_table_timer = ADD_TIMER(profile(), "BuildTableTime"); - _build_side_merge_block_timer = ADD_TIMER(profile(), "BuildSideMergeBlockTime"); + _build_table_timer = ADD_TIMER(profile(), "BuildHashTableTime"); + _build_side_merge_block_timer = ADD_TIMER(profile(), "MergeBuildBlockTime"); _build_table_insert_timer = ADD_TIMER(record_profile, "BuildTableInsertTime"); _build_expr_call_timer = ADD_TIMER(record_profile, "BuildExprCallTime"); - _build_side_compute_hash_timer = ADD_TIMER(record_profile, "BuildSideHashComputingTime"); - - _allocate_resource_timer = ADD_TIMER(profile(), "AllocateResourceTime"); // Hash Table Init - _hash_table_init(state); + RETURN_IF_ERROR(_hash_table_init(state)); _runtime_filters.resize(p._runtime_filter_descs.size()); for (size_t i = 0; i < p._runtime_filter_descs.size(); i++) { RETURN_IF_ERROR(state->register_producer_runtime_filter( @@ -111,33 +110,55 @@ Status HashJoinBuildSinkLocalState::open(RuntimeState* state) { } Status HashJoinBuildSinkLocalState::close(RuntimeState* state, Status exec_status) { + if (_closed) { + return Status::OK(); + } auto p = _parent->cast(); Defer defer {[&]() { + if (_should_build_hash_table) { + // The build side hash key column may not need to be output, but we still keep it in the block + // because it is used for comparison with the probe side hash key column + if (p._should_keep_hash_key_column && _build_col_ids.size() == 1) { + p._should_keep_column_flags[_build_col_ids[0]] = true; + } + + if (_shared_state->build_block) { + // release the memory of columns that are unused in the probe stage + _shared_state->build_block->clear_column_mem_not_keep( + p._should_keep_column_flags, bool(p._shared_hashtable_controller)); + } + } + + if (_should_build_hash_table && p._shared_hashtable_controller) { p._shared_hashtable_controller->signal_finish(p.node_id()); } }}; if (!_runtime_filter_slots || _runtime_filters.empty() || state->is_cancelled()) { - return Status::OK(); + return Base::close(state, exec_status); } - auto* block = _shared_state->build_block.get(); - uint64_t hash_table_size = block ? block->rows() : 0; - { - SCOPED_TIMER(_runtime_filter_init_timer); - if (_should_build_hash_table) { - RETURN_IF_ERROR(_runtime_filter_slots->init_filters(state, hash_table_size)); + + if (state->get_task()->wake_up_by_downstream()) { + RETURN_IF_ERROR(_runtime_filter_slots->send_filter_size(state, 0, _finish_dependency)); + RETURN_IF_ERROR(_runtime_filter_slots->ignore_all_filters()); + } else { + auto* block = _shared_state->build_block.get(); + uint64_t hash_table_size = block ?
block->rows() : 0; + { + SCOPED_TIMER(_runtime_filter_init_timer); + if (_should_build_hash_table) { + RETURN_IF_ERROR(_runtime_filter_slots->init_filters(state, hash_table_size)); + } + RETURN_IF_ERROR(_runtime_filter_slots->ignore_filters(state)); + } + if (_should_build_hash_table && hash_table_size > 1) { + SCOPED_TIMER(_runtime_filter_compute_timer); + _runtime_filter_slots->insert(block); } - RETURN_IF_ERROR(_runtime_filter_slots->ignore_filters(state)); - } - if (_should_build_hash_table && hash_table_size > 1) { - SCOPED_TIMER(_runtime_filter_compute_timer); - _runtime_filter_slots->insert(block); - } - SCOPED_TIMER(_publish_runtime_filter_timer); RETURN_IF_ERROR(_runtime_filter_slots->publish(!_should_build_hash_table)); - return Status::OK(); + return Base::close(state, exec_status); } bool HashJoinBuildSinkLocalState::build_unique() const { @@ -204,33 +225,22 @@ Status HashJoinBuildSinkLocalState::_extract_join_column( vectorized::Block& block, vectorized::ColumnUInt8::MutablePtr& null_map, vectorized::ColumnRawPtrs& raw_ptrs, const std::vector& res_col_ids) { auto& shared_state = *_shared_state; - auto& p = _parent->cast(); for (size_t i = 0; i < shared_state.build_exprs_size; ++i) { - if (p._should_convert_to_nullable[i]) { + const auto* column = block.get_by_position(res_col_ids[i]).column.get(); + if (!column->is_nullable() && shared_state.serialize_null_into_key[i]) { _key_columns_holder.emplace_back( vectorized::make_nullable(block.get_by_position(res_col_ids[i]).column)); raw_ptrs[i] = _key_columns_holder.back().get(); - continue; - } - - if (shared_state.is_null_safe_eq_join[i]) { - raw_ptrs[i] = block.get_by_position(res_col_ids[i]).column.get(); + } else if (const auto* nullable = check_and_get_column(*column); + !shared_state.serialize_null_into_key[i] && nullable) { + // update the null map and split the nested column out of ColumnNullable when serialize_null_into_key is false and the column is nullable + const auto& col_nested = nullable->get_nested_column(); + const auto& col_nullmap = nullable->get_null_map_data(); + DCHECK(null_map != nullptr); + vectorized::VectorizedUtils::update_null_map(null_map->get_data(), col_nullmap); + raw_ptrs[i] = &col_nested; } else { - const auto* column = block.get_by_position(res_col_ids[i]).column.get(); - if (const auto* nullable = check_and_get_column(*column)) { - const auto& col_nested = nullable->get_nested_column(); - const auto& col_nullmap = nullable->get_null_map_data(); - - if (shared_state.store_null_in_hash_table[i]) { - raw_ptrs[i] = nullable; - } else { - DCHECK(null_map != nullptr); - vectorized::VectorizedUtils::update_null_map(null_map->get_data(), col_nullmap); - raw_ptrs[i] = &col_nested; - } - } else { - raw_ptrs[i] = column; - } + raw_ptrs[i] = column; } } return Status::OK(); @@ -244,7 +254,6 @@ Status HashJoinBuildSinkLocalState::process_build_block(RuntimeState* state, if (UNLIKELY(rows == 0)) { return Status::OK(); } - COUNTER_UPDATE(_build_rows_counter, rows); block.replace_if_overflow(); vectorized::ColumnRawPtrs raw_ptrs(_build_expr_ctxs.size()); @@ -261,13 +270,9 @@ Status HashJoinBuildSinkLocalState::process_build_block(RuntimeState* state, .data()[0] = 1; } } - // TODO: Now we are not sure whether a column is nullable only by ExecNode's `row_desc` - // so we have to initialize this flag by the first build block.
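The rewritten `_extract_join_column` above funnels all null handling into one external null map: for each nullable key column whose nulls are not serialized into the key, the column's per-row null flags are OR-ed into a single shared map, which is the cumulative effect of the repeated `VectorizedUtils::update_null_map` calls. A minimal standalone sketch of that accumulation, with hypothetical helper names rather than the Doris API:

```cpp
#include <cstddef>
#include <cstdint>
#include <vector>

// 1 marks a null row; one entry per row in the block.
using NullMap = std::vector<uint8_t>;

// Mirrors what update_null_map does for one key column.
void or_into(NullMap& dst, const NullMap& src) {
    for (size_t i = 0; i < dst.size(); ++i) {
        dst[i] |= src[i]; // a row is null for the join key if ANY key column is null there
    }
}

// Accumulate the external null map over all key columns; non-nullable
// columns contribute nothing and pass nullptr.
NullMap build_external_null_map(const std::vector<const NullMap*>& key_null_maps, size_t rows) {
    NullMap result(rows, 0);
    for (const auto* m : key_null_maps) {
        if (m != nullptr) {
            or_into(result, *m);
        }
    }
    return result;
}
```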
- if (!_has_set_need_null_map_for_build) { - _has_set_need_null_map_for_build = true; - _set_build_ignore_flag(block, _build_col_ids); - } - if (p._short_circuit_for_null_in_build_side || _build_side_ignore_null) { + + _set_build_side_has_external_nullmap(block, _build_col_ids); + if (_build_side_has_external_nullmap) { null_map_val = vectorized::ColumnUInt8::create(); null_map_val->get_data().assign(rows, (uint8_t)0); } @@ -277,142 +282,73 @@ Status HashJoinBuildSinkLocalState::process_build_block(RuntimeState* state, st = std::visit( vectorized::Overload { - [&](std::monostate& arg, auto join_op, auto has_null_value, + [&](std::monostate& arg, auto join_op, auto short_circuit_for_null_in_build_side, auto with_other_conjuncts) -> Status { LOG(FATAL) << "FATAL: uninited hash table"; __builtin_unreachable(); return Status::OK(); }, - [&](auto&& arg, auto&& join_op, auto has_null_value, - auto short_circuit_for_null_in_build_side, + [&](auto&& arg, auto&& join_op, auto short_circuit_for_null_in_build_side, auto with_other_conjuncts) -> Status { using HashTableCtxType = std::decay_t; using JoinOpType = std::decay_t; ProcessHashTableBuild hash_table_build_process( rows, raw_ptrs, this, state->batch_size(), state); - auto old_hash_table_size = arg.hash_table->get_byte_size(); - auto old_key_size = arg.serialized_keys_size(true); auto st = hash_table_build_process.template run< - JoinOpType::value, has_null_value, - short_circuit_for_null_in_build_side, with_other_conjuncts>( - arg, - has_null_value || short_circuit_for_null_in_build_side - ? &null_map_val->get_data() - : nullptr, + JoinOpType::value, short_circuit_for_null_in_build_side, + with_other_conjuncts>( + arg, null_map_val ? &null_map_val->get_data() : nullptr, &_shared_state->_has_null_in_build_side); - _mem_tracker->consume(arg.hash_table->get_byte_size() - - old_hash_table_size); - _mem_tracker->consume(arg.serialized_keys_size(true) - old_key_size); + COUNTER_SET(_memory_used_counter, + _build_blocks_memory_usage->value() + + (int64_t)(arg.hash_table->get_byte_size() + + arg.serialized_keys_size(true))); + COUNTER_SET(_peak_memory_usage_counter, _memory_used_counter->value()); return st; }}, - *_shared_state->hash_table_variants, _shared_state->join_op_variants, - vectorized::make_bool_variant(_build_side_ignore_null), + _shared_state->hash_table_variants->method_variant, _shared_state->join_op_variants, vectorized::make_bool_variant(p._short_circuit_for_null_in_build_side), vectorized::make_bool_variant((p._have_other_join_conjunct))); return st; } -void HashJoinBuildSinkLocalState::_set_build_ignore_flag(vectorized::Block& block, - const std::vector& res_col_ids) { +void HashJoinBuildSinkLocalState::_set_build_side_has_external_nullmap( + vectorized::Block& block, const std::vector& res_col_ids) { auto& p = _parent->cast(); + if (p._short_circuit_for_null_in_build_side) { + _build_side_has_external_nullmap = true; + return; + } for (size_t i = 0; i < _build_expr_ctxs.size(); ++i) { - if (!_shared_state->is_null_safe_eq_join[i] && !p._short_circuit_for_null_in_build_side) { - const auto* column = block.get_by_position(res_col_ids[i]).column.get(); - if (check_and_get_column(*column)) { - _build_side_ignore_null |= !_shared_state->store_null_in_hash_table[i]; - } + const auto* column = block.get_by_position(res_col_ids[i]).column.get(); + if (column->is_nullable() && !_shared_state->serialize_null_into_key[i]) { + _build_side_has_external_nullmap = true; + return; } } } -void 
HashJoinBuildSinkLocalState::_hash_table_init(RuntimeState* state) { +Status HashJoinBuildSinkLocalState::_hash_table_init(RuntimeState* state) { auto& p = _parent->cast(); - std::visit( - [&](auto&& join_op_variants, auto have_other_join_conjunct) { - if (_build_expr_ctxs.size() == 1 && !p._store_null_in_hash_table[0]) { - // Single column optimization - switch (_build_expr_ctxs[0]->root()->result_type()) { - case TYPE_BOOLEAN: - case TYPE_TINYINT: - _shared_state->hash_table_variants->emplace(); - break; - case TYPE_SMALLINT: - _shared_state->hash_table_variants->emplace(); - break; - case TYPE_INT: - case TYPE_FLOAT: - case TYPE_DATEV2: - _shared_state->hash_table_variants->emplace(); - break; - case TYPE_BIGINT: - case TYPE_DOUBLE: - case TYPE_DATETIME: - case TYPE_DATE: - case TYPE_DATETIMEV2: - _shared_state->hash_table_variants->emplace(); - break; - case TYPE_LARGEINT: - case TYPE_DECIMALV2: - case TYPE_DECIMAL32: - case TYPE_DECIMAL64: - case TYPE_DECIMAL128I: { - vectorized::DataTypePtr& type_ptr = - _build_expr_ctxs[0]->root()->data_type(); - vectorized::TypeIndex idx = - _build_expr_ctxs[0]->root()->is_nullable() - ? assert_cast( - *type_ptr) - .get_nested_type() - ->get_type_id() - : type_ptr->get_type_id(); - vectorized::WhichDataType which(idx); - if (which.is_decimal32()) { - _shared_state->hash_table_variants->emplace(); - } else if (which.is_decimal64()) { - _shared_state->hash_table_variants->emplace(); - } else { - _shared_state->hash_table_variants->emplace(); - } - break; - } - case TYPE_CHAR: - case TYPE_VARCHAR: - case TYPE_STRING: { - _shared_state->hash_table_variants->emplace(); - break; - } - default: - _shared_state->hash_table_variants - ->emplace(); - } - return; - } - - std::vector data_types; - for (size_t i = 0; i != _build_expr_ctxs.size(); ++i) { - auto& ctx = _build_expr_ctxs[i]; - auto data_type = ctx->root()->data_type(); - - /// For 'null safe equal' join, - /// the build key column maybe be converted to nullable from non-nullable. - if (p._should_convert_to_nullable[i]) { - data_type = vectorized::make_nullable(data_type); - } - data_types.emplace_back(std::move(data_type)); - } - - if (!try_get_hash_map_context_fixed( - *_shared_state->hash_table_variants, data_types)) { - _shared_state->hash_table_variants - ->emplace(); - } - }, - _shared_state->join_op_variants, - vectorized::make_bool_variant(p._have_other_join_conjunct)); - - DCHECK(!std::holds_alternative(*_shared_state->hash_table_variants)); + std::vector data_types; + for (size_t i = 0; i < _build_expr_ctxs.size(); ++i) { + auto& ctx = _build_expr_ctxs[i]; + auto data_type = ctx->root()->data_type(); + + /// For 'null safe equal' join, + /// the build key column may be converted to nullable from non-nullable.
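The comment above describes the key-type collection step of the new `_hash_table_init`, whose loop body continues right below: a key whose nulls are serialized into the key bytes must advertise a nullable type, so the hash method chosen later includes the null flag in the serialized key. A minimal sketch of that step, using a hypothetical `DataType` stand-in rather than Doris's real type objects:

```cpp
#include <cstddef>
#include <string>
#include <vector>

// Hypothetical stand-in for a vectorized data type.
struct DataType {
    std::string name;
    bool nullable = false;
};

// Collect one type per build key; wrap it as nullable when nulls are
// serialized into the key, so the null flag becomes part of the key bytes.
std::vector<DataType> collect_key_types(const std::vector<DataType>& key_types,
                                        const std::vector<bool>& serialize_null_into_key) {
    std::vector<DataType> result;
    result.reserve(key_types.size());
    for (size_t i = 0; i < key_types.size(); ++i) {
        DataType t = key_types[i];
        if (serialize_null_into_key[i]) {
            t.nullable = true;
        }
        result.push_back(t);
    }
    return result;
}
```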
+ if (p._serialize_null_into_key[i]) { + data_type = vectorized::make_nullable(data_type); + } + data_types.emplace_back(std::move(data_type)); + } + if (_build_expr_ctxs.size() == 1) { + p._should_keep_hash_key_column = true; + } + return init_hash_method(_shared_state->hash_table_variants.get(), data_types, + true); } HashJoinBuildSinkOperatorX::HashJoinBuildSinkOperatorX(ObjectPool* pool, int operator_id, @@ -433,9 +369,9 @@ Status HashJoinBuildSinkOperatorX::init(const TPlanNode& tnode, RuntimeState* st RETURN_IF_ERROR(JoinBuildSinkOperatorX::init(tnode, state)); DCHECK(tnode.__isset.hash_join_node); - const bool build_stores_null = _join_op == TJoinOp::RIGHT_OUTER_JOIN || - _join_op == TJoinOp::FULL_OUTER_JOIN || - _join_op == TJoinOp::RIGHT_ANTI_JOIN; + if (tnode.hash_join_node.__isset.hash_output_slot_ids) { + _hash_output_slot_ids = tnode.hash_join_node.hash_output_slot_ids; + } const std::vector& eq_join_conjuncts = tnode.hash_join_node.eq_join_conjuncts; for (const auto& eq_join_conjunct : eq_join_conjuncts) { @@ -470,16 +406,18 @@ Status HashJoinBuildSinkOperatorX::init(const TPlanNode& tnode, RuntimeState* st (eq_join_conjunct.right.nodes[0].is_nullable || eq_join_conjunct.left.nodes[0].is_nullable); - const bool should_convert_to_nullable = is_null_safe_equal && - !eq_join_conjunct.right.nodes[0].is_nullable && - eq_join_conjunct.left.nodes[0].is_nullable; _is_null_safe_eq_join.push_back(is_null_safe_equal); - _should_convert_to_nullable.emplace_back(should_convert_to_nullable); - // if is null aware, build join column and probe join column both need dispose null value - _store_null_in_hash_table.emplace_back( - is_null_safe_equal || - (_build_expr_ctxs.back()->root()->is_nullable() && build_stores_null)); + if (eq_join_conjuncts.size() == 1) { + // the single-column key serialization method must represent null with the null map instead of serializing null into the key + _serialize_null_into_key.emplace_back(false); + } else if (is_null_safe_equal) { + // serialize null into the key to represent null values of multi-column keys + _serialize_null_into_key.emplace_back(true); + } else { + // in the normal case, because null != null, nulls can be represented directly with the null map. + _serialize_null_into_key.emplace_back(false); + } } return Status::OK(); @@ -494,6 +432,17 @@ Status HashJoinBuildSinkOperatorX::open(RuntimeState* state) { _shared_hash_table_context = _shared_hashtable_controller->get_context(node_id()); } } + auto init_keep_column_flags = [&](auto& tuple_descs, auto& output_slot_flags) { + for (const auto& tuple_desc : tuple_descs) { + for (const auto& slot_desc : tuple_desc->slots()) { + output_slot_flags.emplace_back( + _hash_output_slot_ids.empty() || + std::find(_hash_output_slot_ids.begin(), _hash_output_slot_ids.end(), slot_desc->id()) != _hash_output_slot_ids.end()); + } + } + }; + init_keep_column_flags(row_desc().tuple_descriptors(), _should_keep_column_flags); RETURN_IF_ERROR(vectorized::VExpr::prepare(_build_expr_ctxs, state, _child->row_desc())); return vectorized::VExpr::open(_build_expr_ctxs, state); } @@ -504,10 +453,10 @@ Status HashJoinBuildSinkOperatorX::sink(RuntimeState* state, vectorized::Block* SCOPED_TIMER(local_state.exec_time_counter()); COUNTER_UPDATE(local_state.rows_input_counter(), (int64_t)in_block->rows()); + local_state._eos = eos; if (local_state._should_build_hash_table) { // If eos or have already met a null value using short-circuit strategy, we do not need to pull // data from probe side.
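The `_serialize_null_into_key` policy introduced above reduces to a simple rule. A sketch of that rule as a standalone function (not the Doris API), under the assumption that every null-safe-equal key column of a multi-column condition serializes its null flag into the key:

```cpp
#include <cstddef>
#include <vector>

// Policy from HashJoinBuildSinkOperatorX::init():
// - a single-column key always represents null via the external null map;
// - a null-safe-equal key in a multi-column condition serializes null into the key;
// - otherwise null != null, so the external null map is enough.
std::vector<bool> decide_serialize_null_into_key(size_t num_key_columns,
                                                 const std::vector<bool>& is_null_safe_equal) {
    std::vector<bool> result;
    result.reserve(num_key_columns);
    for (size_t i = 0; i < num_key_columns; ++i) {
        result.push_back(num_key_columns > 1 && is_null_safe_equal[i]);
    }
    return result;
}
```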
- local_state._build_side_mem_used += in_block->allocated_bytes(); if (local_state._build_side_mutable_block.empty()) { auto tmp_build_block = vectorized::VectorizedUtils::create_empty_columnswithtypename( @@ -534,12 +483,13 @@ Status HashJoinBuildSinkOperatorX::sink(RuntimeState* state, vectorized::Block* std::to_string(std::numeric_limits::max())); } - local_state._mem_tracker->consume(in_block->bytes()); - COUNTER_UPDATE(local_state._build_blocks_memory_usage, in_block->bytes()); - SCOPED_TIMER(local_state._build_side_merge_block_timer); RETURN_IF_ERROR(local_state._build_side_mutable_block.merge_ignore_overflow( std::move(*in_block))); + int64_t blocks_mem_usage = local_state._build_side_mutable_block.allocated_bytes(); + COUNTER_SET(local_state._memory_used_counter, blocks_mem_usage); + COUNTER_SET(local_state._peak_memory_usage_counter, blocks_mem_usage); + COUNTER_SET(local_state._build_blocks_memory_usage, blocks_mem_usage); } } @@ -555,6 +505,7 @@ Status HashJoinBuildSinkOperatorX::sink(RuntimeState* state, vectorized::Block* local_state.process_build_block(state, (*local_state._shared_state->build_block))); if (_shared_hashtable_controller) { _shared_hash_table_context->status = Status::OK(); + _shared_hash_table_context->complete_build_stage = true; // arena will be shared with other instances. _shared_hash_table_context->arena = local_state._shared_state->arena; _shared_hash_table_context->hash_table_variants = @@ -565,9 +516,9 @@ Status HashJoinBuildSinkOperatorX::sink(RuntimeState* state, vectorized::Block* _shared_hash_table_context->build_indexes_null = local_state._shared_state->build_indexes_null; local_state._runtime_filter_slots->copy_to_shared_context(_shared_hash_table_context); - _shared_hashtable_controller->signal(node_id()); } - } else if (!local_state._should_build_hash_table) { + } else if (!local_state._should_build_hash_table && + _shared_hash_table_context->complete_build_stage) { DCHECK(_shared_hashtable_controller != nullptr); DCHECK(_shared_hash_table_context != nullptr); // the instance which is not build hash table, it's should wait the signal of hash table build finished. 
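The `complete_build_stage` flag added above closes a race for shared-hash-table broadcast joins: a non-building instance must not touch the shared table until the builder has finished. A minimal sketch of that handshake using standard synchronization primitives (a hypothetical class, not Doris's SharedHashTableController):

```cpp
#include <condition_variable>
#include <mutex>

class SharedBuildSignal {
public:
    // Called once by the instance that built the hash table.
    void signal_build_complete() {
        {
            std::lock_guard<std::mutex> lock(_mutex);
            _complete_build_stage = true;
        }
        _cv.notify_all();
    }

    // Called by every non-building instance before reading the shared table.
    void wait_for_build() {
        std::unique_lock<std::mutex> lock(_mutex);
        _cv.wait(lock, [this] { return _complete_build_stage; });
    }

private:
    std::mutex _mutex;
    std::condition_variable _cv;
    bool _complete_build_stage = false;
};
```

In the pipeline engine the waiting side is expressed as a dependency rather than a blocking wait, but the ordering guarantee is the same: consume only after the build stage is marked complete.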
@@ -597,9 +548,10 @@ Status HashJoinBuildSinkOperatorX::sink(RuntimeState* state, vectorized::Block* dst.hash_table = src.hash_table; } }, - *local_state._shared_state->hash_table_variants, - *std::static_pointer_cast( - _shared_hash_table_context->hash_table_variants)); + local_state._shared_state->hash_table_variants->method_variant, + std::static_pointer_cast( + _shared_hash_table_context->hash_table_variants) + ->method_variant); local_state._shared_state->build_block = _shared_hash_table_context->block; local_state._shared_state->build_indexes_null = diff --git a/be/src/pipeline/exec/hashjoin_build_sink.h b/be/src/pipeline/exec/hashjoin_build_sink.h index cf677833fb5b64..45aa1e8c8a262d 100644 --- a/be/src/pipeline/exec/hashjoin_build_sink.h +++ b/be/src/pipeline/exec/hashjoin_build_sink.h @@ -22,7 +22,7 @@ #include "operator.h" namespace doris::pipeline { - +#include "common/compile_check_begin.h" class HashJoinBuildSinkOperatorX; class HashJoinBuildSinkLocalState final @@ -55,8 +55,9 @@ class HashJoinBuildSinkLocalState final Status close(RuntimeState* state, Status exec_status) override; protected: - void _hash_table_init(RuntimeState* state); - void _set_build_ignore_flag(vectorized::Block& block, const std::vector& res_col_ids); + Status _hash_table_init(RuntimeState* state); + void _set_build_side_has_external_nullmap(vectorized::Block& block, + const std::vector& res_col_ids); Status _do_evaluate(vectorized::Block& block, vectorized::VExprContextSPtrs& exprs, RuntimeProfile::Counter& expr_call_timer, std::vector& res_col_ids); std::vector _convert_block_to_null(vectorized::Block& block); @@ -74,14 +75,11 @@ class HashJoinBuildSinkLocalState final std::vector _key_columns_holder; bool _should_build_hash_table = true; - int64_t _build_side_mem_used = 0; - int64_t _build_side_last_mem_used = 0; size_t _build_side_rows = 0; vectorized::MutableBlock _build_side_mutable_block; std::shared_ptr _runtime_filter_slots; - bool _has_set_need_null_map_for_build = false; /* * The comparison result of a null value with any other value is null, @@ -89,21 +87,19 @@ class HashJoinBuildSinkLocalState final * the result of an equality condition involving null should be false, * so null does not need to be added to the hash table. */ - bool _build_side_ignore_null = false; + bool _build_side_has_external_nullmap = false; std::vector _build_col_ids; - std::shared_ptr _finish_dependency; + std::shared_ptr _finish_dependency; RuntimeProfile::Counter* _build_table_timer = nullptr; RuntimeProfile::Counter* _build_expr_call_timer = nullptr; RuntimeProfile::Counter* _build_table_insert_timer = nullptr; - RuntimeProfile::Counter* _build_side_compute_hash_timer = nullptr; RuntimeProfile::Counter* _build_side_merge_block_timer = nullptr; - RuntimeProfile::Counter* _allocate_resource_timer = nullptr; - RuntimeProfile::Counter* _build_blocks_memory_usage = nullptr; RuntimeProfile::Counter* _hash_table_memory_usage = nullptr; - RuntimeProfile::HighWaterMarkCounter* _build_arena_memory_usage = nullptr; + RuntimeProfile::Counter* _build_arena_memory_usage = nullptr; + RuntimeProfile::Counter* _runtime_filter_init_timer = nullptr; }; class HashJoinBuildSinkOperatorX final @@ -132,8 +128,8 @@ class HashJoinBuildSinkOperatorX final if (_join_op == TJoinOp::NULL_AWARE_LEFT_ANTI_JOIN) { return {ExchangeType::NOOP}; } else if (_is_broadcast_join) { - return _child->ignore_data_distribution() ? DataDistribution(ExchangeType::PASS_TO_ONE) - : DataDistribution(ExchangeType::NOOP); + return _child->is_serial_operator() ? 
DataDistribution(ExchangeType::PASS_TO_ONE) : DataDistribution(ExchangeType::NOOP); } return _join_distribution == TJoinDistributionType::BUCKET_SHUFFLE || _join_distribution == TJoinDistributionType::COLOCATE @@ -141,10 +137,7 @@ class HashJoinBuildSinkOperatorX final : DataDistribution(ExchangeType::HASH_SHUFFLE, _partition_exprs); } - bool require_shuffled_data_distribution() const override { - return _join_op != TJoinOp::NULL_AWARE_LEFT_ANTI_JOIN && !_is_broadcast_join; - } - bool is_shuffled_hash_join() const override { + bool is_shuffled_operator() const override { return _join_distribution == TJoinDistributionType::PARTITIONED; } bool require_data_distribution() const override { @@ -159,13 +152,11 @@ class HashJoinBuildSinkOperatorX final // build expr vectorized::VExprContextSPtrs _build_expr_ctxs; // mark the build hash table whether it needs to store null value - std::vector _store_null_in_hash_table; + std::vector _serialize_null_into_key; // mark the join column whether support null eq std::vector _is_null_safe_eq_join; - std::vector _should_convert_to_nullable; - bool _is_broadcast_join = false; std::shared_ptr _shared_hashtable_controller; @@ -173,11 +164,15 @@ class HashJoinBuildSinkOperatorX final const std::vector _partition_exprs; const bool _need_local_merge; + + std::vector _hash_output_slot_ids; + std::vector _should_keep_column_flags; + bool _should_keep_hash_key_column = false; }; template struct ProcessHashTableBuild { - ProcessHashTableBuild(int rows, vectorized::ColumnRawPtrs& build_raw_ptrs, + ProcessHashTableBuild(size_t rows, vectorized::ColumnRawPtrs& build_raw_ptrs, HashJoinBuildSinkLocalState* parent, int batch_size, RuntimeState* state) : _rows(rows), _build_raw_ptrs(build_raw_ptrs), @@ -185,12 +180,12 @@ struct ProcessHashTableBuild { _batch_size(batch_size), _state(state) {} - template + template Status run(HashTableContext& hash_table_ctx, vectorized::ConstNullMapPtr null_map, bool* has_null_key) { - if (short_circuit_for_null || ignore_null) { + if (null_map) { // first row is mocked and is null + // TODO: verify this loop; breaking out early once a null key is found may be better for (uint32_t i = 1; i < _rows; i++) { if ((*null_map)[i]) { *has_null_key = true; @@ -208,8 +203,21 @@ struct ProcessHashTableBuild { hash_table_ctx.init_serialized_keys(_build_raw_ptrs, _rows, null_map ?
null_map->data() : nullptr, true, true, hash_table_ctx.hash_table->get_bucket_size()); - hash_table_ctx.hash_table->template build( - hash_table_ctx.keys, hash_table_ctx.bucket_nums.data(), _rows); + // only 2 cases need to keep null keys in the hash table + bool keep_null_key = false; + if ((JoinOpType == TJoinOp::NULL_AWARE_LEFT_ANTI_JOIN || + JoinOpType == TJoinOp::NULL_AWARE_LEFT_SEMI_JOIN) && + with_other_conjuncts) { + // null aware join with other conjuncts + keep_null_key = true; + } else if (_parent->_shared_state->is_null_safe_eq_join.size() == 1 && + _parent->_shared_state->is_null_safe_eq_join[0]) { + // a single null safe equal key + keep_null_key = true; + } + + hash_table_ctx.hash_table->build(hash_table_ctx.keys, hash_table_ctx.bucket_nums.data(), + _rows, keep_null_key); hash_table_ctx.bucket_nums.resize(_batch_size); hash_table_ctx.bucket_nums.shrink_to_fit(); @@ -221,7 +229,7 @@ struct ProcessHashTableBuild { } private: - const uint32_t _rows; + const size_t _rows; vectorized::ColumnRawPtrs& _build_raw_ptrs; HashJoinBuildSinkLocalState* _parent = nullptr; int _batch_size; @@ -229,3 +237,4 @@ struct ProcessHashTableBuild { }; } // namespace doris::pipeline +#include "common/compile_check_end.h" diff --git a/be/src/pipeline/exec/hashjoin_probe_operator.cpp b/be/src/pipeline/exec/hashjoin_probe_operator.cpp index f91e1eaa2a1b17..426bfcb219dc04 100644 --- a/be/src/pipeline/exec/hashjoin_probe_operator.cpp +++ b/be/src/pipeline/exec/hashjoin_probe_operator.cpp @@ -19,6 +19,7 @@ #include +#include "common/cast_set.h" #include "common/logging.h" #include "pipeline/exec/operator.h" #include "runtime/descriptors.h" @@ -26,7 +27,7 @@ #include "vec/data_types/data_type_nullable.h" namespace doris::pipeline { - +#include "common/compile_check_begin.h" HashJoinProbeLocalState::HashJoinProbeLocalState(RuntimeState* state, OperatorXBase* parent) : JoinProbeLocalState(state, parent), _process_hashtable_ctx_variants(std::make_unique()) {} @@ -54,15 +55,13 @@ Status HashJoinProbeLocalState::init(RuntimeState* state, LocalStateInfo& info) _construct_mutable_join_block(); _probe_column_disguise_null.reserve(_probe_expr_ctxs.size()); _probe_arena_memory_usage = - profile()->AddHighWaterMarkCounter("ProbeKeyArena", TUnit::BYTES, "MemoryUsage", 1); + profile()->AddHighWaterMarkCounter("MemoryUsageProbeKeyArena", TUnit::BYTES, "", 1); // Probe phase - _probe_next_timer = ADD_TIMER(profile(), "ProbeFindNextTime"); _probe_expr_call_timer = ADD_TIMER(profile(), "ProbeExprCallTime"); _search_hashtable_timer = ADD_TIMER(profile(), "ProbeWhenSearchHashTableTime"); _build_side_output_timer = ADD_TIMER(profile(), "ProbeWhenBuildSideOutputTime"); _probe_side_output_timer = ADD_TIMER(profile(), "ProbeWhenProbeSideOutputTime"); - _probe_process_hashtable_timer = ADD_TIMER(profile(), "ProbeWhenProcessHashTableTime"); - _process_other_join_conjunct_timer = ADD_TIMER(profile(), "OtherJoinConjunctTime"); + _non_equal_join_conjuncts_timer = ADD_TIMER(profile(), "NonEqualJoinConjunctEvaluationTime"); _init_probe_side_timer = ADD_TIMER(profile(), "InitProbeSideTime"); return Status::OK(); } @@ -153,11 +152,9 @@ Status HashJoinProbeLocalState::close(RuntimeState* state) { bool HashJoinProbeLocalState::_need_probe_null_map(vectorized::Block& block, const std::vector& res_col_ids) { for (size_t i = 0; i < _probe_expr_ctxs.size(); ++i) { - if (!_shared_state->is_null_safe_eq_join[i]) { - auto column = block.get_by_position(res_col_ids[i]).column.get(); - if (check_and_get_column(*column)) { - return true; - } + const auto*
column = block.get_by_position(res_col_ids[i]).column.get(); + if (column->is_nullable() && !_shared_state->serialize_null_into_key[i]) { + return true; } } return false; @@ -230,7 +227,6 @@ HashJoinProbeOperatorX::HashJoinProbeOperatorX(ObjectPool* pool, const TPlanNode Status HashJoinProbeOperatorX::pull(doris::RuntimeState* state, vectorized::Block* output_block, bool* eos) const { auto& local_state = get_local_state(state); - SCOPED_TIMER(local_state._probe_timer); if (local_state._shared_state->short_circuit_for_probe) { // If we use a short-circuit strategy, should return empty block directly. *eos = true; @@ -289,23 +285,19 @@ Status HashJoinProbeOperatorX::pull(doris::RuntimeState* state, vectorized::Bloc if (local_state._probe_index < local_state._probe_block.rows()) { DCHECK(local_state._has_set_need_null_map_for_probe); std::visit( - [&](auto&& arg, auto&& process_hashtable_ctx, auto need_null_map_for_probe, - auto ignore_null) { + [&](auto&& arg, auto&& process_hashtable_ctx, auto need_judge_null) { using HashTableProbeType = std::decay_t; if constexpr (!std::is_same_v) { using HashTableCtxType = std::decay_t; if constexpr (!std::is_same_v) { - st = process_hashtable_ctx - .template process( - arg, - need_null_map_for_probe - ? &local_state._null_map_column->get_data() - : nullptr, - mutable_join_block, &temp_block, - local_state._probe_block.rows(), _is_mark_join, - _have_other_join_conjunct); - local_state._mem_tracker->set_consumption( - arg.serialized_keys_size(false)); + st = process_hashtable_ctx.template process( + arg, + local_state._null_map_column + ? &local_state._null_map_column->get_data() + : nullptr, + mutable_join_block, &temp_block, + cast_set(local_state._probe_block.rows()), + _is_mark_join, _have_other_join_conjunct); } else { st = Status::InternalError("uninited hash table"); } @@ -313,10 +305,10 @@ Status HashJoinProbeOperatorX::pull(doris::RuntimeState* state, vectorized::Bloc st = Status::InternalError("uninited hash table probe"); } }, - *local_state._shared_state->hash_table_variants, + local_state._shared_state->hash_table_variants->method_variant, *local_state._process_hashtable_ctx_variants, - vectorized::make_bool_variant(local_state._need_null_map_for_probe), - vectorized::make_bool_variant(local_state._shared_state->probe_ignore_null)); + vectorized::make_bool_variant(local_state._need_null_map_for_probe && + local_state._shared_state->probe_ignore_null)); } else if (local_state._probe_eos) { if (_is_right_semi_anti || (_is_outer_join && _join_op != TJoinOp::LEFT_OUTER_JOIN)) { std::visit( @@ -325,7 +317,7 @@ Status HashJoinProbeOperatorX::pull(doris::RuntimeState* state, vectorized::Bloc if constexpr (!std::is_same_v) { using HashTableCtxType = std::decay_t; if constexpr (!std::is_same_v) { - st = process_hashtable_ctx.process_data_in_hashtable( + st = process_hashtable_ctx.finish_probing( arg, mutable_join_block, &temp_block, eos, _is_mark_join); } else { st = Status::InternalError("uninited hash table"); @@ -334,7 +326,7 @@ Status HashJoinProbeOperatorX::pull(doris::RuntimeState* state, vectorized::Bloc st = Status::InternalError("uninited hash table probe"); } }, - *local_state._shared_state->hash_table_variants, + local_state._shared_state->hash_table_variants->method_variant, *local_state._process_hashtable_ctx_variants); } else { *eos = true; @@ -384,34 +376,22 @@ Status HashJoinProbeLocalState::_extract_join_column(vectorized::Block& block, } auto& shared_state = *_shared_state; - auto& p = _parent->cast(); for (size_t i = 0; i < 
shared_state.build_exprs_size; ++i) { - if (p._should_convert_to_nullable[i]) { + const auto* column = block.get_by_position(res_col_ids[i]).column.get(); + if (!column->is_nullable() && shared_state.serialize_null_into_key[i]) { _key_columns_holder.emplace_back( vectorized::make_nullable(block.get_by_position(res_col_ids[i]).column)); _probe_columns[i] = _key_columns_holder.back().get(); - continue; - } - - if (shared_state.is_null_safe_eq_join[i]) { - _probe_columns[i] = block.get_by_position(res_col_ids[i]).column.get(); + } else if (const auto* nullable = check_and_get_column(*column); + nullable && !shared_state.serialize_null_into_key[i]) { + // update the null map and split the nested column out of ColumnNullable when serialize_null_into_key is false and the column is nullable + const auto& col_nested = nullable->get_nested_column(); + const auto& col_nullmap = nullable->get_null_map_data(); + DCHECK(_null_map_column != nullptr); + vectorized::VectorizedUtils::update_null_map(_null_map_column->get_data(), col_nullmap); + _probe_columns[i] = &col_nested; } else { - const auto* column = block.get_by_position(res_col_ids[i]).column.get(); - if (const auto* nullable = check_and_get_column(*column)) { - const auto& col_nested = nullable->get_nested_column(); - const auto& col_nullmap = nullable->get_null_map_data(); - - DCHECK(_null_map_column != nullptr); - vectorized::VectorizedUtils::update_null_map(_null_map_column->get_data(), - col_nullmap); - if (shared_state.store_null_in_hash_table[i]) { - _probe_columns[i] = nullable; - } else { - _probe_columns[i] = &col_nested; - } - } else { - _probe_columns[i] = column; - } + _probe_columns[i] = column; } } return Status::OK(); @@ -501,6 +481,10 @@ Status HashJoinProbeOperatorX::push(RuntimeState* state, vectorized::Block* inpu if (&local_state._probe_block != input_block) { input_block->swap(local_state._probe_block); + COUNTER_SET(local_state._memory_used_counter, + (int64_t)local_state._probe_block.allocated_bytes()); + COUNTER_SET(local_state._peak_memory_usage_counter, + local_state._memory_used_counter->value()); } } return Status::OK(); @@ -528,20 +512,6 @@ Status HashJoinProbeOperatorX::init(const TPlanNode& tnode, RuntimeState* state) null_aware || (_probe_expr_ctxs.back()->root()->is_nullable() && probe_dispose_null); conjuncts_index++; - const bool is_null_safe_equal = eq_join_conjunct.__isset.opcode && - (eq_join_conjunct.opcode == TExprOpcode::EQ_FOR_NULL) && - (eq_join_conjunct.right.nodes[0].is_nullable || - eq_join_conjunct.left.nodes[0].is_nullable); - - /// If it's right anti join, - /// we should convert the probe to nullable if the build side is nullable. - /// And if it is 'null safe equal', - /// we must make sure the build side and the probe side are both nullable or non-nullable. - const bool should_convert_to_nullable = - (is_null_safe_equal || _join_op == TJoinOp::RIGHT_ANTI_JOIN) && - !eq_join_conjunct.left.nodes[0].is_nullable && - eq_join_conjunct.right.nodes[0].is_nullable; - _should_convert_to_nullable.emplace_back(should_convert_to_nullable); } for (size_t i = 0; i < _probe_expr_ctxs.size(); ++i) { _probe_ignore_null |= !probe_not_ignore_null[i]; @@ -643,7 +613,7 @@ Status HashJoinProbeOperatorX::open(RuntimeState* state) { } } - const int right_col_idx = + const size_t right_col_idx = (_is_right_semi_anti && !_have_other_join_conjunct) ?
0 : _left_table_data_types.size(); size_t idx = 0; for (const auto* slot : slots_to_check) { diff --git a/be/src/pipeline/exec/hashjoin_probe_operator.h b/be/src/pipeline/exec/hashjoin_probe_operator.h index d3bca8fa7cd712..1bdb9d13347d09 100644 --- a/be/src/pipeline/exec/hashjoin_probe_operator.h +++ b/be/src/pipeline/exec/hashjoin_probe_operator.h @@ -26,7 +26,7 @@ namespace doris { class RuntimeState; namespace pipeline { - +#include "common/compile_check_begin.h" class HashJoinProbeLocalState; using HashTableCtxVariants = @@ -117,14 +117,12 @@ class HashJoinProbeLocalState final std::make_unique(); RuntimeProfile::Counter* _probe_expr_call_timer = nullptr; - RuntimeProfile::Counter* _probe_next_timer = nullptr; RuntimeProfile::Counter* _probe_side_output_timer = nullptr; - RuntimeProfile::Counter* _probe_process_hashtable_timer = nullptr; RuntimeProfile::HighWaterMarkCounter* _probe_arena_memory_usage = nullptr; RuntimeProfile::Counter* _search_hashtable_timer = nullptr; RuntimeProfile::Counter* _init_probe_side_timer = nullptr; RuntimeProfile::Counter* _build_side_output_timer = nullptr; - RuntimeProfile::Counter* _process_other_join_conjunct_timer = nullptr; + RuntimeProfile::Counter* _non_equal_join_conjuncts_timer = nullptr; }; class HashJoinProbeOperatorX final : public JoinProbeOperatorX { @@ -152,10 +150,7 @@ class HashJoinProbeOperatorX final : public JoinProbeOperatorX _should_convert_to_nullable; - vectorized::DataTypes _right_table_data_types; vectorized::DataTypes _left_table_data_types; std::vector _hash_output_slot_ids; @@ -194,3 +187,4 @@ class HashJoinProbeOperatorX final : public JoinProbeOperatorXnode_id()), std::to_string(_parent->nereids_id()), diff --git a/be/src/pipeline/exec/jdbc_table_sink_operator.cpp b/be/src/pipeline/exec/jdbc_table_sink_operator.cpp index dba9f6259ff38d..29c881d1c28100 100644 --- a/be/src/pipeline/exec/jdbc_table_sink_operator.cpp +++ b/be/src/pipeline/exec/jdbc_table_sink_operator.cpp @@ -25,7 +25,7 @@ #include "vec/exprs/vexpr_context.h" namespace doris::pipeline { - +#include "common/compile_check_begin.h" JdbcTableSinkOperatorX::JdbcTableSinkOperatorX(const RowDescriptor& row_desc, int operator_id, const std::vector& t_output_expr) : DataSinkOperatorX(operator_id, 0), _row_desc(row_desc), _t_output_expr(t_output_expr) {} @@ -47,6 +47,7 @@ Status JdbcTableSinkOperatorX::open(RuntimeState* state) { Status JdbcTableSinkOperatorX::sink(RuntimeState* state, vectorized::Block* block, bool eos) { auto& local_state = get_local_state(state); SCOPED_TIMER(local_state.exec_time_counter()); + COUNTER_UPDATE(local_state.rows_input_counter(), (int64_t)block->rows()); RETURN_IF_ERROR(local_state.sink(state, block, eos)); return Status::OK(); } diff --git a/be/src/pipeline/exec/join/join_op.h b/be/src/pipeline/exec/join/join_op.h index 616753b72de39b..f3bd47a911eb32 100644 --- a/be/src/pipeline/exec/join/join_op.h +++ b/be/src/pipeline/exec/join/join_op.h @@ -20,7 +20,7 @@ #include "vec/common/columns_hashing.h" #include "vec/core/block.h" -namespace doris::pipeline { +namespace doris { /** * Now we have different kinds of RowRef for join operation. 
Overall, RowRef is the base class and * the class inheritance is below: @@ -129,12 +129,10 @@ struct RowRefList : RowRef { RowRefList() = default; RowRefList(size_t row_num_) : RowRef(row_num_) {} - ForwardIterator begin() { return ForwardIterator(this); } + ForwardIterator begin() { return {this}; } /// insert element after current one - void insert(RowRefType&& row_ref, vectorized::Arena& pool) { - next.emplace_back(std::move(row_ref)); - } + void insert(RowRefType&& row_ref, vectorized::Arena& pool) { next.emplace_back(row_ref); } void clear() { next.clear(); } @@ -149,9 +147,7 @@ struct RowRefListWithFlag : RowRef { RowRefListWithFlag() = default; RowRefListWithFlag(size_t row_num_) : RowRef(row_num_) {} - ForwardIterator const begin() { - return ForwardIterator(this); - } + ForwardIterator begin() { return {this}; } /// insert element after current one void insert(RowRefType&& row_ref, vectorized::Arena& pool) { next.emplace_back(row_ref); } @@ -171,9 +167,7 @@ struct RowRefListWithFlags : RowRefWithFlag { RowRefListWithFlags() = default; RowRefListWithFlags(size_t row_num_) : RowRefWithFlag(row_num_) {} - ForwardIterator const begin() { - return ForwardIterator(this); - } + ForwardIterator begin() { return {this}; } /// insert element after current one void insert(RowRefType&& row_ref, vectorized::Arena& pool) { next.emplace_back(row_ref); } @@ -185,4 +179,4 @@ struct RowRefListWithFlags : RowRefWithFlag { std::vector next; }; -} // namespace doris::pipeline +} // namespace doris diff --git a/be/src/pipeline/exec/join/process_hash_table_probe.h b/be/src/pipeline/exec/join/process_hash_table_probe.h index 965d62192b2fed..14e0edd977f57b 100644 --- a/be/src/pipeline/exec/join/process_hash_table_probe.h +++ b/be/src/pipeline/exec/join/process_hash_table_probe.h @@ -55,20 +55,20 @@ struct ProcessHashTableProbe { int last_probe_index, bool all_match_one, bool have_other_join_conjunct); - template + template Status process(HashTableType& hash_table_ctx, ConstNullMapPtr null_map, vectorized::MutableBlock& mutable_block, vectorized::Block* output_block, - size_t probe_rows, bool is_mark_join, bool have_other_join_conjunct); + uint32_t probe_rows, bool is_mark_join, bool have_other_join_conjunct); // Only process the join with no other join conjunct, because of no other join conjunt // the output block struct is same with mutable block. we can do more opt on it and simplify // the logic of probe // TODO: opt the visited here to reduce the size of hash table - template + template Status do_process(HashTableType& hash_table_ctx, ConstNullMapPtr null_map, vectorized::MutableBlock& mutable_block, vectorized::Block* output_block, - size_t probe_rows); + uint32_t probe_rows); // In the presence of other join conjunct, the process of join become more complicated. // each matching join column need to be processed by other join conjunct. 
so the struct of mutable block // and output block may be different @@ -87,13 +87,12 @@ struct ProcessHashTableProbe { // Process full outer join/ right join / right semi/anti join to output the join result // in hash table template - Status process_data_in_hashtable(HashTableType& hash_table_ctx, - vectorized::MutableBlock& mutable_block, - vectorized::Block* output_block, bool* eos, bool is_mark_join); + Status finish_probing(HashTableType& hash_table_ctx, vectorized::MutableBlock& mutable_block, + vectorized::Block* output_block, bool* eos, bool is_mark_join); /// For null aware join with other conjuncts, if the probe key of one row on left side is null, /// we should make this row match with all rows in build side. - size_t _process_probe_null_key(uint32_t probe_idx); + uint32_t _process_probe_null_key(uint32_t probe_idx); pipeline::HashJoinProbeLocalState* _parent = nullptr; const int _batch_size; @@ -132,15 +131,14 @@ struct ProcessHashTableProbe { bool _need_calculate_build_index_has_zero = true; bool* _has_null_in_build_side; - RuntimeProfile::Counter* _rows_returned_counter = nullptr; RuntimeProfile::Counter* _search_hashtable_timer = nullptr; RuntimeProfile::Counter* _init_probe_side_timer = nullptr; RuntimeProfile::Counter* _build_side_output_timer = nullptr; RuntimeProfile::Counter* _probe_side_output_timer = nullptr; - RuntimeProfile::Counter* _probe_process_hashtable_timer = nullptr; + RuntimeProfile::Counter* _finish_probe_phase_timer = nullptr; - int _right_col_idx; - int _right_col_len; + size_t _right_col_idx; + size_t _right_col_len; }; } // namespace pipeline diff --git a/be/src/pipeline/exec/join/process_hash_table_probe_impl.h b/be/src/pipeline/exec/join/process_hash_table_probe_impl.h index 653cc8ab4473dd..05cd3d7d9e0590 100644 --- a/be/src/pipeline/exec/join/process_hash_table_probe_impl.h +++ b/be/src/pipeline/exec/join/process_hash_table_probe_impl.h @@ -19,6 +19,7 @@ #include +#include "common/cast_set.h" #include "common/status.h" #include "pipeline/exec/hashjoin_probe_operator.h" #include "process_hash_table_probe.h" @@ -29,7 +30,7 @@ #include "vec/exprs/vexpr_context.h" namespace doris::pipeline { - +#include "common/compile_check_begin.h" template ProcessHashTableProbe::ProcessHashTableProbe(HashJoinProbeLocalState* parent, int batch_size) @@ -51,12 +52,11 @@ ProcessHashTableProbe::ProcessHashTableProbe(HashJoinProbeLocalState _left_output_slot_flags(parent->left_output_slot_flags()), _right_output_slot_flags(parent->right_output_slot_flags()), _has_null_in_build_side(parent->has_null_in_build_side()), - _rows_returned_counter(parent->_rows_returned_counter), _search_hashtable_timer(parent->_search_hashtable_timer), _init_probe_side_timer(parent->_init_probe_side_timer), _build_side_output_timer(parent->_build_side_output_timer), _probe_side_output_timer(parent->_probe_side_output_timer), - _probe_process_hashtable_timer(parent->_probe_process_hashtable_timer), + _finish_probe_phase_timer(parent->_finish_probe_phase_timer), _right_col_idx((_is_right_semi_anti && !_have_other_join_conjunct) ? 0 : _parent->left_table_data_types().size()), @@ -177,21 +177,23 @@ typename HashTableType::State ProcessHashTableProbe::_init_probe_sid false, hash_table_ctx.hash_table->get_bucket_size()); hash_table_ctx.hash_table->pre_build_idxs(hash_table_ctx.bucket_nums, need_judge_null ? 
null_map : nullptr); - COUNTER_SET(_parent->_probe_arena_memory_usage, - (int64_t)hash_table_ctx.serialized_keys_size(false)); + int64_t arena_memory_usage = hash_table_ctx.serialized_keys_size(false); + COUNTER_SET(_parent->_probe_arena_memory_usage, arena_memory_usage); + COUNTER_UPDATE(_parent->_memory_used_counter, arena_memory_usage); + COUNTER_SET(_parent->_peak_memory_usage_counter, _parent->_memory_used_counter->value()); } return typename HashTableType::State(_parent->_probe_columns); } template -template +template Status ProcessHashTableProbe::do_process(HashTableType& hash_table_ctx, vectorized::ConstNullMapPtr null_map, vectorized::MutableBlock& mutable_block, vectorized::Block* output_block, - size_t probe_rows) { + uint32_t probe_rows) { if (_right_col_len && !_build_block) { return Status::InternalError("build block is nullptr"); } @@ -204,8 +206,8 @@ Status ProcessHashTableProbe::do_process(HashTableType& hash_table_c SCOPED_TIMER(_init_probe_side_timer); _init_probe_side( hash_table_ctx, probe_rows, with_other_conjuncts, - need_null_map_for_probe ? null_map->data() : nullptr, - need_null_map_for_probe && ignore_null && + null_map ? null_map->data() : nullptr, + need_judge_null && (JoinOpType == doris::TJoinOp::LEFT_ANTI_JOIN || JoinOpType == doris::TJoinOp::LEFT_SEMI_JOIN || JoinOpType == doris::TJoinOp::NULL_AWARE_LEFT_ANTI_JOIN || @@ -215,7 +217,7 @@ Status ProcessHashTableProbe::do_process(HashTableType& hash_table_c auto& mcol = mutable_block.mutable_columns(); const bool has_mark_join_conjunct = !_parent->_mark_join_conjuncts.empty(); - int current_offset = 0; + uint32_t current_offset = 0; if constexpr ((JoinOpType == doris::TJoinOp::NULL_AWARE_LEFT_ANTI_JOIN || JoinOpType == doris::TJoinOp::NULL_AWARE_LEFT_SEMI_JOIN) && with_other_conjuncts) { @@ -253,13 +255,12 @@ Status ProcessHashTableProbe::do_process(HashTableType& hash_table_c } } else { SCOPED_TIMER(_search_hashtable_timer); - auto [new_probe_idx, new_build_idx, - new_current_offset] = hash_table_ctx.hash_table->template find_batch < JoinOpType, - with_other_conjuncts, is_mark_join, - need_null_map_for_probe && - ignore_null > (hash_table_ctx.keys, hash_table_ctx.bucket_nums.data(), - probe_index, build_index, probe_rows, _probe_indexs.data(), - _probe_visited, _build_indexs.data(), has_mark_join_conjunct); + auto [new_probe_idx, new_build_idx, new_current_offset] = + hash_table_ctx.hash_table->template find_batch( + hash_table_ctx.keys, hash_table_ctx.bucket_nums.data(), probe_index, + build_index, cast_set(probe_rows), _probe_indexs.data(), + _probe_visited, _build_indexs.data(), has_mark_join_conjunct); probe_index = new_probe_idx; build_index = new_build_idx; current_offset = new_current_offset; @@ -303,12 +304,12 @@ Status ProcessHashTableProbe::do_process(HashTableType& hash_table_c } template -size_t ProcessHashTableProbe::_process_probe_null_key(uint32_t probe_index) { +uint32_t ProcessHashTableProbe::_process_probe_null_key(uint32_t probe_index) { const auto rows = _build_block->rows(); DCHECK_LT(_build_index_for_null_probe_key, rows); DCHECK_LT(0, _build_index_for_null_probe_key); - size_t matched_cnt = 0; + uint32_t matched_cnt = 0; for (; _build_index_for_null_probe_key < rows && matched_cnt < _batch_size; ++matched_cnt) { _probe_indexs[matched_cnt] = probe_index; _build_indexs[matched_cnt] = _build_index_for_null_probe_key++; @@ -501,8 +502,8 @@ Status ProcessHashTableProbe::do_other_join_conjuncts(vectorized::Bl return Status::OK(); } - SCOPED_TIMER(_parent->_process_other_join_conjunct_timer); 
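A note on the hunk above: probe_rows and the probe/build offsets are narrowed from size_t to uint32_t, and the conversion goes through cast_set from the newly included common/cast_set.h. The helper's body is not shown in this diff; as a rough mental model (an assumption, not the actual Doris implementation), it behaves like a static_cast guarded by a range check:

    #include <cstdlib>
    #include <utility>

    // Hypothetical sketch of a checked narrowing cast; the real
    // common/cast_set.h may DCHECK or log instead of aborting.
    template <typename To, typename From>
    To cast_set(From value) {
        // std::in_range<To> (C++20) is true iff `value` is representable in To.
        if (!std::in_range<To>(value)) {
            std::abort(); // silent truncation would corrupt row indexes
        }
        return static_cast<To>(value);
    }

The compile_check_begin.h / compile_check_end.h pair that now brackets these translation units presumably promotes implicit-narrowing warnings to errors, which is why each remaining size_t-to-uint32_t conversion is made explicit.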
- int orig_columns = output_block->columns(); + SCOPED_TIMER(_parent->_non_equal_join_conjuncts_timer); + size_t orig_columns = output_block->columns(); vectorized::IColumn::Filter other_conjunct_filter(row_count, 1); { bool can_be_filter_all = false; @@ -616,10 +617,11 @@ Status ProcessHashTableProbe::do_other_join_conjuncts(vectorized::Bl template template -Status ProcessHashTableProbe::process_data_in_hashtable( - HashTableType& hash_table_ctx, vectorized::MutableBlock& mutable_block, - vectorized::Block* output_block, bool* eos, bool is_mark_join) { - SCOPED_TIMER(_probe_process_hashtable_timer); +Status ProcessHashTableProbe::finish_probing(HashTableType& hash_table_ctx, + vectorized::MutableBlock& mutable_block, + vectorized::Block* output_block, bool* eos, + bool is_mark_join) { + SCOPED_TIMER(_finish_probe_phase_timer); auto& mcol = mutable_block.mutable_columns(); if (is_mark_join) { std::unique_ptr mark_column = @@ -640,9 +642,13 @@ Status ProcessHashTableProbe::process_data_in_hashtable( mcol.size(), _right_col_len, _right_col_idx); } for (size_t j = 0; j < _right_col_len; ++j) { - const auto& column = *_build_block->safe_get_by_position(j).column; - mcol[j + _right_col_idx]->insert_indices_from(column, _build_indexs.data(), - _build_indexs.data() + block_size); + if (_right_output_slot_flags->at(j)) { + const auto& column = *_build_block->safe_get_by_position(j).column; + mcol[j + _right_col_idx]->insert_indices_from(column, _build_indexs.data(), + _build_indexs.data() + block_size); + } else { + mcol[j + _right_col_idx]->resize(block_size); + } } // just resize the left table column in case with other conjunct to make block size is not zero @@ -668,19 +674,19 @@ Status ProcessHashTableProbe::process_data_in_hashtable( } template -template +template Status ProcessHashTableProbe::process(HashTableType& hash_table_ctx, vectorized::ConstNullMapPtr null_map, vectorized::MutableBlock& mutable_block, vectorized::Block* output_block, - size_t probe_rows, bool is_mark_join, + uint32_t probe_rows, bool is_mark_join, bool have_other_join_conjunct) { Status res; std::visit( [&](auto is_mark_join, auto have_other_join_conjunct) { - res = do_process( - hash_table_ctx, null_map, mutable_block, output_block, probe_rows); + res = do_process(hash_table_ctx, null_map, mutable_block, + output_block, probe_rows); }, vectorized::make_bool_variant(is_mark_join), vectorized::make_bool_variant(have_other_join_conjunct)); @@ -696,50 +702,32 @@ struct ExtractType { }; #define INSTANTIATION(JoinOpType, T) \ - template Status \ - ProcessHashTableProbe::process::Type>( \ - ExtractType::Type & hash_table_ctx, vectorized::ConstNullMapPtr null_map, \ - vectorized::MutableBlock & mutable_block, vectorized::Block * output_block, \ - size_t probe_rows, bool is_mark_join, bool have_other_join_conjunct); \ - template Status \ - ProcessHashTableProbe::process::Type>( \ - ExtractType::Type & hash_table_ctx, vectorized::ConstNullMapPtr null_map, \ - vectorized::MutableBlock & mutable_block, vectorized::Block * output_block, \ - size_t probe_rows, bool is_mark_join, bool have_other_join_conjunct); \ - template Status \ - ProcessHashTableProbe::process::Type>( \ + template Status ProcessHashTableProbe::process::Type>( \ ExtractType::Type & hash_table_ctx, vectorized::ConstNullMapPtr null_map, \ vectorized::MutableBlock & mutable_block, vectorized::Block * output_block, \ - size_t probe_rows, bool is_mark_join, bool have_other_join_conjunct); \ - template Status \ - ProcessHashTableProbe::process::Type>( \ + uint32_t 
probe_rows, bool is_mark_join, bool have_other_join_conjunct); \ + template Status ProcessHashTableProbe::process::Type>( \ ExtractType::Type & hash_table_ctx, vectorized::ConstNullMapPtr null_map, \ vectorized::MutableBlock & mutable_block, vectorized::Block * output_block, \ - size_t probe_rows, bool is_mark_join, bool have_other_join_conjunct); \ - \ - template Status \ - ProcessHashTableProbe::process_data_in_hashtable::Type>( \ + uint32_t probe_rows, bool is_mark_join, bool have_other_join_conjunct); \ + template Status ProcessHashTableProbe::finish_probing::Type>( \ ExtractType::Type & hash_table_ctx, vectorized::MutableBlock & mutable_block, \ vectorized::Block * output_block, bool* eos, bool is_mark_join); -#define INSTANTIATION_FOR(JoinOpType) \ - template struct ProcessHashTableProbe; \ - \ - INSTANTIATION(JoinOpType, (vectorized::SerializedHashTableContext)); \ - INSTANTIATION(JoinOpType, (I8HashTableContext)); \ - INSTANTIATION(JoinOpType, (I16HashTableContext)); \ - INSTANTIATION(JoinOpType, (I32HashTableContext)); \ - INSTANTIATION(JoinOpType, (I64HashTableContext)); \ - INSTANTIATION(JoinOpType, (I128HashTableContext)); \ - INSTANTIATION(JoinOpType, (I256HashTableContext)); \ - INSTANTIATION(JoinOpType, (I64FixedKeyHashTableContext)); \ - INSTANTIATION(JoinOpType, (I64FixedKeyHashTableContext)); \ - INSTANTIATION(JoinOpType, (I128FixedKeyHashTableContext)); \ - INSTANTIATION(JoinOpType, (I128FixedKeyHashTableContext)); \ - INSTANTIATION(JoinOpType, (I256FixedKeyHashTableContext)); \ - INSTANTIATION(JoinOpType, (I256FixedKeyHashTableContext)); \ - INSTANTIATION(JoinOpType, (I136FixedKeyHashTableContext)); \ - INSTANTIATION(JoinOpType, (MethodOneString)); \ - INSTANTIATION(JoinOpType, (I136FixedKeyHashTableContext)); - +#define INSTANTIATION_FOR(JoinOpType) \ + template struct ProcessHashTableProbe; \ + \ + INSTANTIATION(JoinOpType, (SerializedHashTableContext)); \ + INSTANTIATION(JoinOpType, (PrimaryTypeHashTableContext)); \ + INSTANTIATION(JoinOpType, (PrimaryTypeHashTableContext)); \ + INSTANTIATION(JoinOpType, (PrimaryTypeHashTableContext)); \ + INSTANTIATION(JoinOpType, (PrimaryTypeHashTableContext)); \ + INSTANTIATION(JoinOpType, (PrimaryTypeHashTableContext)); \ + INSTANTIATION(JoinOpType, (PrimaryTypeHashTableContext)); \ + INSTANTIATION(JoinOpType, (FixedKeyHashTableContext)); \ + INSTANTIATION(JoinOpType, (FixedKeyHashTableContext)); \ + INSTANTIATION(JoinOpType, (FixedKeyHashTableContext)); \ + INSTANTIATION(JoinOpType, (FixedKeyHashTableContext)); \ + INSTANTIATION(JoinOpType, (MethodOneString)); +#include "common/compile_check_end.h" } // namespace doris::pipeline diff --git a/be/src/pipeline/exec/join_build_sink_operator.cpp b/be/src/pipeline/exec/join_build_sink_operator.cpp index 2439dbc8fe1c95..8b3f5cd98ff7c0 100644 --- a/be/src/pipeline/exec/join_build_sink_operator.cpp +++ b/be/src/pipeline/exec/join_build_sink_operator.cpp @@ -23,7 +23,7 @@ #include "pipeline/exec/partitioned_hash_join_sink_operator.h" namespace doris::pipeline { - +#include "common/compile_check_begin.h" template Status JoinBuildSinkLocalState::init(RuntimeState* state, LocalSinkStateInfo& info) { @@ -33,15 +33,11 @@ Status JoinBuildSinkLocalState::init(RuntimeState* stat PipelineXSinkLocalState::profile()->add_info_string("JoinType", to_string(p._join_op)); - _build_rows_counter = ADD_COUNTER(PipelineXSinkLocalState::profile(), - "BuildRows", TUnit::UNIT); _publish_runtime_filter_timer = ADD_TIMER(PipelineXSinkLocalState::profile(), "PublishRuntimeFilterTime"); - 
_runtime_filter_compute_timer = ADD_TIMER(PipelineXSinkLocalState::profile(), - "RuntimeFilterComputeTime"); - _runtime_filter_init_timer = - ADD_TIMER(PipelineXSinkLocalState::profile(), "RuntimeFilterInitTime"); + _runtime_filter_compute_timer = + ADD_TIMER(PipelineXSinkLocalState::profile(), "BuildRuntimeFilterTime"); return Status::OK(); } @@ -82,6 +78,8 @@ JoinBuildSinkOperatorX::JoinBuildSinkOperatorX(ObjectPool* pool, _short_circuit_for_null_in_build_side(_join_op == TJoinOp::NULL_AWARE_LEFT_ANTI_JOIN && !_is_mark_join), _runtime_filter_descs(tnode.runtime_filters) { + DataSinkOperatorX::_is_serial_operator = + tnode.__isset.is_serial_operator && tnode.is_serial_operator; _init_join_op(); if (_is_mark_join) { DCHECK(_join_op == TJoinOp::LEFT_ANTI_JOIN || _join_op == TJoinOp::LEFT_SEMI_JOIN || diff --git a/be/src/pipeline/exec/join_build_sink_operator.h b/be/src/pipeline/exec/join_build_sink_operator.h index 714e0c34190678..9d79a97397ff77 100644 --- a/be/src/pipeline/exec/join_build_sink_operator.h +++ b/be/src/pipeline/exec/join_build_sink_operator.h @@ -39,10 +39,8 @@ class JoinBuildSinkLocalState : public PipelineXSinkLocalState template friend class JoinBuildSinkOperatorX; - RuntimeProfile::Counter* _build_rows_counter = nullptr; RuntimeProfile::Counter* _publish_runtime_filter_timer = nullptr; RuntimeProfile::Counter* _runtime_filter_compute_timer = nullptr; - RuntimeProfile::Counter* _runtime_filter_init_timer = nullptr; std::vector> _runtime_filters; }; diff --git a/be/src/pipeline/exec/join_probe_operator.cpp b/be/src/pipeline/exec/join_probe_operator.cpp index 05c62544d2b7ce..11b5b29c8b556b 100644 --- a/be/src/pipeline/exec/join_probe_operator.cpp +++ b/be/src/pipeline/exec/join_probe_operator.cpp @@ -23,17 +23,16 @@ #include "pipeline/exec/partitioned_hash_join_probe_operator.h" namespace doris::pipeline { - +#include "common/compile_check_begin.h" template Status JoinProbeLocalState::init(RuntimeState* state, LocalStateInfo& info) { RETURN_IF_ERROR(Base::init(state, info)); - _probe_timer = ADD_TIMER(Base::profile(), "ProbeTime"); _join_filter_timer = ADD_TIMER(Base::profile(), "JoinFilterTimer"); _build_output_block_timer = ADD_TIMER(Base::profile(), "BuildOutputBlock"); _probe_rows_counter = ADD_COUNTER_WITH_LEVEL(Base::profile(), "ProbeRows", TUnit::UNIT, 1); - + _finish_probe_phase_timer = ADD_TIMER(Base::profile(), "FinishProbePhaseTime"); return Status::OK(); } @@ -220,6 +219,7 @@ JoinProbeOperatorX::JoinProbeOperatorX(ObjectPool* pool, const T : true) ) { + Base::_is_serial_operator = tnode.__isset.is_serial_operator && tnode.is_serial_operator; if (tnode.__isset.hash_join_node) { _intermediate_row_desc.reset(new RowDescriptor( descs, tnode.hash_join_node.vintermediate_tuple_id_list, diff --git a/be/src/pipeline/exec/join_probe_operator.h b/be/src/pipeline/exec/join_probe_operator.h index 3f68c73d04b161..078806cea4fc5a 100644 --- a/be/src/pipeline/exec/join_probe_operator.h +++ b/be/src/pipeline/exec/join_probe_operator.h @@ -49,10 +49,10 @@ class JoinProbeLocalState : public PipelineXLocalState { size_t _mark_column_id = -1; - RuntimeProfile::Counter* _probe_timer = nullptr; RuntimeProfile::Counter* _probe_rows_counter = nullptr; RuntimeProfile::Counter* _join_filter_timer = nullptr; RuntimeProfile::Counter* _build_output_block_timer = nullptr; + RuntimeProfile::Counter* _finish_probe_phase_timer = nullptr; std::unique_ptr _child_block = nullptr; bool _child_eos = false; diff --git a/be/src/pipeline/exec/memory_scratch_sink_operator.cpp 
b/be/src/pipeline/exec/memory_scratch_sink_operator.cpp index 69e30791c139af..2c69c0e2b2ba9f 100644 --- a/be/src/pipeline/exec/memory_scratch_sink_operator.cpp +++ b/be/src/pipeline/exec/memory_scratch_sink_operator.cpp @@ -28,11 +28,14 @@ #include "vec/exprs/vexpr_context.h" namespace doris::pipeline { - +#include "common/compile_check_begin.h" Status MemoryScratchSinkLocalState::init(RuntimeState* state, LocalSinkStateInfo& info) { RETURN_IF_ERROR(Base::init(state, info)); SCOPED_TIMER(exec_time_counter()); SCOPED_TIMER(_init_timer); + _get_arrow_schema_timer = ADD_TIMER(_profile, "GetArrowSchemaTime"); + _convert_block_to_arrow_batch_timer = ADD_TIMER(_profile, "ConvertBlockToArrowBatchTime"); + _evaluation_timer = ADD_TIMER(_profile, "EvaluationTime"); // create queue state->exec_env()->result_queue_mgr()->create_queue(state->fragment_instance_id(), &_queue); @@ -92,15 +95,24 @@ Status MemoryScratchSinkOperatorX::sink(RuntimeState* state, vectorized::Block* // Exec vectorized expr here to speed up, block.rows() == 0 means expr exec // failed, just return the error status vectorized::Block block; - RETURN_IF_ERROR(vectorized::VExprContext::get_output_block_after_execute_exprs( - local_state._output_vexpr_ctxs, *input_block, &block)); + { + SCOPED_TIMER(local_state._evaluation_timer); + RETURN_IF_ERROR(vectorized::VExprContext::get_output_block_after_execute_exprs( + local_state._output_vexpr_ctxs, *input_block, &block)); + } std::shared_ptr block_arrow_schema; - // After expr executed, use recaculated schema as final schema - RETURN_IF_ERROR(convert_block_arrow_schema(block, &block_arrow_schema)); - RETURN_IF_ERROR(convert_to_arrow_batch(block, block_arrow_schema, arrow::default_memory_pool(), - &result, _timezone_obj)); + { + SCOPED_TIMER(local_state._get_arrow_schema_timer); + // After expr executed, use recalculated schema as final schema + RETURN_IF_ERROR(get_arrow_schema(block, &block_arrow_schema, state->timezone())); + } + { + SCOPED_TIMER(local_state._convert_block_to_arrow_batch_timer); + RETURN_IF_ERROR(convert_to_arrow_batch( + block, block_arrow_schema, arrow::default_memory_pool(), &result, _timezone_obj)); + } local_state._queue->blocking_put(result); - if (local_state._queue->size() < 10) { + if (local_state._queue->size() > config::max_memory_sink_batch_count) { local_state._queue_dependency->block(); } return Status::OK(); diff --git a/be/src/pipeline/exec/memory_scratch_sink_operator.h b/be/src/pipeline/exec/memory_scratch_sink_operator.h index c2cd78c7cd5aee..c74659d15b96f2 100644 --- a/be/src/pipeline/exec/memory_scratch_sink_operator.h +++ b/be/src/pipeline/exec/memory_scratch_sink_operator.h @@ -42,9 +42,12 @@ class MemoryScratchSinkLocalState final : public PipelineXSinkLocalState _queue_dependency = nullptr; + RuntimeProfile::Counter* _get_arrow_schema_timer = nullptr; + RuntimeProfile::Counter* _convert_block_to_arrow_batch_timer = nullptr; + RuntimeProfile::Counter* _evaluation_timer = nullptr; }; class MemoryScratchSinkOperatorX final : public DataSinkOperatorX { @@ -61,7 +64,7 @@ class MemoryScratchSinkOperatorX final : public DataSinkOperatorX& _t_output_expr; - VExprContextSPtrs _output_vexpr_ctxs; + vectorized::VExprContextSPtrs _output_vexpr_ctxs; }; } // namespace doris::pipeline diff --git a/be/src/pipeline/exec/meta_scan_operator.cpp b/be/src/pipeline/exec/meta_scan_operator.cpp index 0ff8a7337bb2aa..1d7e40afedc155 100644 --- a/be/src/pipeline/exec/meta_scan_operator.cpp +++ b/be/src/pipeline/exec/meta_scan_operator.cpp @@ -20,7 +20,7 @@ #include
"vec/exec/scan/vmeta_scanner.h" namespace doris::pipeline { - +#include "common/compile_check_begin.h" Status MetaScanLocalState::_init_scanners(std::list* scanners) { if (Base::_eos) { return Status::OK(); diff --git a/be/src/pipeline/exec/multi_cast_data_stream_sink.cpp b/be/src/pipeline/exec/multi_cast_data_stream_sink.cpp index d0827c2ca184ce..eb72e9601e1acf 100644 --- a/be/src/pipeline/exec/multi_cast_data_stream_sink.cpp +++ b/be/src/pipeline/exec/multi_cast_data_stream_sink.cpp @@ -21,7 +21,7 @@ #include "pipeline/exec/multi_cast_data_streamer.h" namespace doris::pipeline { - +#include "common/compile_check_begin.h" std::string MultiCastDataStreamSinkLocalState::name_suffix() { auto& sinks = static_cast(_parent)->sink_node().sinks; std::string id_name = " (dst id : "; diff --git a/be/src/pipeline/exec/multi_cast_data_stream_sink.h b/be/src/pipeline/exec/multi_cast_data_stream_sink.h index 1a9787789dde02..57b5974064b6a2 100644 --- a/be/src/pipeline/exec/multi_cast_data_stream_sink.h +++ b/be/src/pipeline/exec/multi_cast_data_stream_sink.h @@ -42,15 +42,15 @@ class MultiCastDataStreamSinkOperatorX final using Base = DataSinkOperatorX; public: - MultiCastDataStreamSinkOperatorX(int sink_id, std::vector& sources, - const int cast_sender_count, ObjectPool* pool, + MultiCastDataStreamSinkOperatorX(int sink_id, std::vector& sources, ObjectPool* pool, const TMultiCastDataStreamSink& sink, const RowDescriptor& row_desc) : Base(sink_id, -1, sources), _pool(pool), _row_desc(row_desc), - _cast_sender_count(cast_sender_count), - _sink(sink) {} + _cast_sender_count(sources.size()), + _sink(sink), + _num_dests(sources.size()) {} ~MultiCastDataStreamSinkOperatorX() override = default; Status sink(RuntimeState* state, vectorized::Block* in_block, bool eos) override; @@ -60,14 +60,19 @@ class MultiCastDataStreamSinkOperatorX final std::shared_ptr create_shared_state() const override; const TMultiCastDataStreamSink& sink_node() { return _sink; } + bool count_down_destination() override { + DCHECK_GT(_num_dests, 0); + return _num_dests.fetch_sub(1) == 1; + } private: friend class MultiCastDataStreamSinkLocalState; ObjectPool* _pool; RowDescriptor _row_desc; - const int _cast_sender_count; + const size_t _cast_sender_count; const TMultiCastDataStreamSink& _sink; friend class MultiCastDataStreamSinkLocalState; + std::atomic _num_dests; }; } // namespace doris::pipeline diff --git a/be/src/pipeline/exec/multi_cast_data_stream_source.cpp b/be/src/pipeline/exec/multi_cast_data_stream_source.cpp index 1028bca7ce2ca4..e45e59d17e27b3 100644 --- a/be/src/pipeline/exec/multi_cast_data_stream_source.cpp +++ b/be/src/pipeline/exec/multi_cast_data_stream_source.cpp @@ -24,7 +24,7 @@ #include "vec/core/materialize_block.h" namespace doris::pipeline { - +#include "common/compile_check_begin.h" MultiCastDataStreamSourceLocalState::MultiCastDataStreamSourceLocalState(RuntimeState* state, OperatorXBase* parent) : Base(state, parent), @@ -40,6 +40,9 @@ Status MultiCastDataStreamSourceLocalState::init(RuntimeState* state, LocalState auto& p = _parent->cast(); _shared_state->multi_cast_data_streamer->set_dep_by_sender_idx(p._consumer_id, _dependency); _wait_for_rf_timer = ADD_TIMER(_runtime_profile, "WaitForRuntimeFilter"); + _filter_timer = ADD_TIMER(_runtime_profile, "FilterTime"); + _get_data_timer = ADD_TIMER(_runtime_profile, "GetDataTime"); + _materialize_data_timer = ADD_TIMER(_runtime_profile, "MaterializeDataTime"); // init profile for runtime filter RuntimeFilterConsumer::_init_profile(profile()); 
init_runtime_filter_dependency(_filter_dependencies, p.operator_id(), p.node_id(), @@ -86,20 +89,23 @@ Status MultiCastDataStreamerSourceOperatorX::get_block(RuntimeState* state, if (!local_state._output_expr_contexts.empty()) { output_block = &tmp_block; } - RETURN_IF_ERROR(local_state._shared_state->multi_cast_data_streamer->pull(_consumer_id, - output_block, eos)); - + { + SCOPED_TIMER(local_state._get_data_timer); + RETURN_IF_ERROR(local_state._shared_state->multi_cast_data_streamer->pull( + _consumer_id, output_block, eos)); + } if (!local_state._conjuncts.empty()) { + SCOPED_TIMER(local_state._filter_timer); RETURN_IF_ERROR(vectorized::VExprContext::filter_block(local_state._conjuncts, output_block, output_block->columns())); } if (!local_state._output_expr_contexts.empty() && output_block->rows() > 0) { + SCOPED_TIMER(local_state._materialize_data_timer); RETURN_IF_ERROR(vectorized::VExprContext::get_output_block_after_execute_exprs( local_state._output_expr_contexts, *output_block, block, true)); vectorized::materialize_block_inplace(*block); } - COUNTER_UPDATE(local_state._rows_returned_counter, block->rows()); return Status::OK(); } diff --git a/be/src/pipeline/exec/multi_cast_data_stream_source.h b/be/src/pipeline/exec/multi_cast_data_stream_source.h index 76472f3ce85e83..57410bf8d9568a 100644 --- a/be/src/pipeline/exec/multi_cast_data_stream_source.h +++ b/be/src/pipeline/exec/multi_cast_data_stream_source.h @@ -33,6 +33,7 @@ class Block; } // namespace vectorized namespace pipeline { +#include "common/compile_check_begin.h" class MultiCastDataStreamer; class MultiCastDataStreamerSourceOperatorX; @@ -67,6 +68,9 @@ class MultiCastDataStreamSourceLocalState final : public PipelineXLocalState> _filter_dependencies; RuntimeProfile::Counter* _wait_for_rf_timer = nullptr; + RuntimeProfile::Counter* _filter_timer = nullptr; + RuntimeProfile::Counter* _get_data_timer = nullptr; + RuntimeProfile::Counter* _materialize_data_timer = nullptr; }; class MultiCastDataStreamerSourceOperatorX final @@ -137,3 +141,4 @@ class MultiCastDataStreamerSourceOperatorX final } // namespace pipeline } // namespace doris +#include "common/compile_check_end.h" \ No newline at end of file diff --git a/be/src/pipeline/exec/multi_cast_data_streamer.cpp b/be/src/pipeline/exec/multi_cast_data_streamer.cpp index d44cf3974a6275..25c939695f90ef 100644 --- a/be/src/pipeline/exec/multi_cast_data_streamer.cpp +++ b/be/src/pipeline/exec/multi_cast_data_streamer.cpp @@ -22,17 +22,15 @@ #include "runtime/runtime_state.h" namespace doris::pipeline { - -MultiCastBlock::MultiCastBlock(vectorized::Block* block, int used_count, int un_finish_copy, - size_t mem_size) - : _used_count(used_count), _un_finish_copy(un_finish_copy), _mem_size(mem_size) { +#include "common/compile_check_begin.h" +MultiCastBlock::MultiCastBlock(vectorized::Block* block, int un_finish_copy, size_t mem_size) + : _un_finish_copy(un_finish_copy), _mem_size(mem_size) { _block = vectorized::Block::create_unique(block->get_columns_with_type_and_name()); block->clear(); } Status MultiCastDataStreamer::pull(int sender_idx, doris::vectorized::Block* block, bool* eos) { int* un_finish_copy = nullptr; - int use_count = 0; { std::lock_guard l(_mutex); auto& pos_to_pull = _sender_pos_to_read[sender_idx]; @@ -43,8 +41,6 @@ Status MultiCastDataStreamer::pull(int sender_idx, doris::vectorized::Block* blo _cumulative_mem_size -= pos_to_pull->_mem_size; - pos_to_pull->_used_count--; - use_count = pos_to_pull->_used_count; un_finish_copy = 
&pos_to_pull->_un_finish_copy; pos_to_pull++; @@ -56,12 +52,7 @@ Status MultiCastDataStreamer::pull(int sender_idx, doris::vectorized::Block* blo *eos = _eos and pos_to_pull == end; } - if (use_count == 0) { - // will clear _multi_cast_blocks - _wait_copy_block(block, *un_finish_copy); - } else { - _copy_block(block, *un_finish_copy); - } + _copy_block(block, *un_finish_copy); return Status::OK(); } @@ -71,21 +62,13 @@ void MultiCastDataStreamer::_copy_block(vectorized::Block* block, int& un_finish for (int i = 0; i < block->columns(); ++i) { block->get_by_position(i).column = block->get_by_position(i).column->clone_resized(rows); } - std::unique_lock l(_mutex); un_finish_copy--; if (un_finish_copy == 0) { - l.unlock(); - _cv.notify_one(); + _multi_cast_blocks.pop_front(); } } -void MultiCastDataStreamer::_wait_copy_block(vectorized::Block* block, int& un_finish_copy) { - std::unique_lock l(_mutex); - _cv.wait(l, [&]() { return un_finish_copy == 0; }); - _multi_cast_blocks.pop_front(); -} - Status MultiCastDataStreamer::push(RuntimeState* state, doris::vectorized::Block* block, bool eos) { auto rows = block->rows(); COUNTER_UPDATE(_process_rows, rows); @@ -96,8 +79,7 @@ Status MultiCastDataStreamer::push(RuntimeState* state, doris::vectorized::Block { std::lock_guard l(_mutex); - _multi_cast_blocks.emplace_back(block, _cast_sender_count, _cast_sender_count - 1, - block_mem_size); + _multi_cast_blocks.emplace_back(block, _cast_sender_count, block_mem_size); // last elem auto end = std::prev(_multi_cast_blocks.end()); for (int i = 0; i < _sender_pos_to_read.size(); ++i) { diff --git a/be/src/pipeline/exec/multi_cast_data_streamer.h b/be/src/pipeline/exec/multi_cast_data_streamer.h index 07e64016363f65..51a73cf0c2b053 100644 --- a/be/src/pipeline/exec/multi_cast_data_streamer.h +++ b/be/src/pipeline/exec/multi_cast_data_streamer.h @@ -23,10 +23,11 @@ namespace doris::pipeline { class Dependency; struct MultiCastBlock { - MultiCastBlock(vectorized::Block* block, int used_count, int need_copy, size_t mem_size); + MultiCastBlock(vectorized::Block* block, int need_copy, size_t mem_size); std::unique_ptr _block; - int _used_count; + // Each block is copied during pull. If _un_finish_copy == 0, + // it indicates that this block has been fully used and can be released. 
int _un_finish_copy; size_t _mem_size; }; @@ -69,14 +70,10 @@ class MultiCastDataStreamer { void _block_reading(int sender_idx); void _copy_block(vectorized::Block* block, int& un_finish_copy); - - void _wait_copy_block(vectorized::Block* block, int& un_finish_copy); - const RowDescriptor& _row_desc; RuntimeProfile* _profile = nullptr; std::list _multi_cast_blocks; std::vector::iterator> _sender_pos_to_read; - std::condition_variable _cv; std::mutex _mutex; bool _eos = false; int _cast_sender_count = 0; diff --git a/be/src/pipeline/exec/nested_loop_join_build_operator.cpp b/be/src/pipeline/exec/nested_loop_join_build_operator.cpp index 793a37c7396a61..83b378e792c3fa 100644 --- a/be/src/pipeline/exec/nested_loop_join_build_operator.cpp +++ b/be/src/pipeline/exec/nested_loop_join_build_operator.cpp @@ -23,7 +23,7 @@ #include "pipeline/exec/operator.h" namespace doris::pipeline { - +#include "common/compile_check_begin.h" struct RuntimeFilterBuild { RuntimeFilterBuild(NestedLoopJoinBuildSinkLocalState* parent) : _parent(parent) {} Status operator()(RuntimeState* state) { @@ -109,9 +109,9 @@ Status NestedLoopJoinBuildSinkOperatorX::init(const TPlanNode& tnode, RuntimeSta Status NestedLoopJoinBuildSinkOperatorX::open(RuntimeState* state) { RETURN_IF_ERROR(JoinBuildSinkOperatorX::open(state)); - int num_build_tuples = _child->row_desc().tuple_descriptors().size(); + size_t num_build_tuples = _child->row_desc().tuple_descriptors().size(); - for (int i = 0; i < num_build_tuples; ++i) { + for (size_t i = 0; i < num_build_tuples; ++i) { TupleDescriptor* build_tuple_desc = _child->row_desc().tuple_descriptors()[i]; auto tuple_idx = _row_descriptor.get_tuple_idx(build_tuple_desc->id()); RETURN_IF_INVALID_TUPLE_IDX(build_tuple_desc->id(), tuple_idx); @@ -139,7 +139,6 @@ Status NestedLoopJoinBuildSinkOperatorX::sink(doris::RuntimeState* state, vector } if (eos) { - COUNTER_UPDATE(local_state._build_rows_counter, local_state._build_rows); RuntimeFilterBuild rf_ctx(&local_state); RETURN_IF_ERROR(rf_ctx(state)); diff --git a/be/src/pipeline/exec/nested_loop_join_build_operator.h b/be/src/pipeline/exec/nested_loop_join_build_operator.h index f2ca259754b661..d6e72799f97d92 100644 --- a/be/src/pipeline/exec/nested_loop_join_build_operator.h +++ b/be/src/pipeline/exec/nested_loop_join_build_operator.h @@ -76,8 +76,8 @@ class NestedLoopJoinBuildSinkOperatorX final if (_join_op == TJoinOp::NULL_AWARE_LEFT_ANTI_JOIN) { return {ExchangeType::NOOP}; } - return _child->ignore_data_distribution() ? DataDistribution(ExchangeType::BROADCAST) - : DataDistribution(ExchangeType::NOOP); + return _child->is_serial_operator() ? 
DataDistribution(ExchangeType::BROADCAST) + : DataDistribution(ExchangeType::NOOP); } private: diff --git a/be/src/pipeline/exec/nested_loop_join_probe_operator.cpp b/be/src/pipeline/exec/nested_loop_join_probe_operator.cpp index 9546ed8df56671..afa1a2e59b798c 100644 --- a/be/src/pipeline/exec/nested_loop_join_probe_operator.cpp +++ b/be/src/pipeline/exec/nested_loop_join_probe_operator.cpp @@ -19,6 +19,7 @@ #include +#include "common/cast_set.h" #include "common/exception.h" #include "pipeline/exec/operator.h" #include "vec/columns/column_filter_helper.h" @@ -29,7 +30,7 @@ class RuntimeState; } // namespace doris namespace doris::pipeline { - +#include "common/compile_check_begin.h" NestedLoopJoinProbeLocalState::NestedLoopJoinProbeLocalState(RuntimeState* state, OperatorXBase* parent) : JoinProbeLocalState(state, @@ -42,6 +43,10 @@ Status NestedLoopJoinProbeLocalState::init(RuntimeState* state, LocalStateInfo& SCOPED_TIMER(exec_time_counter()); SCOPED_TIMER(_init_timer); _loop_join_timer = ADD_TIMER(profile(), "LoopGenerateJoin"); + _output_temp_blocks_timer = ADD_TIMER(profile(), "OutputTempBlocksTime"); + _update_visited_flags_timer = ADD_TIMER(profile(), "UpdateVisitedFlagsTime"); + _join_conjuncts_evaluation_timer = ADD_TIMER(profile(), "JoinConjunctsEvaluationTime"); + _filtered_by_join_conjuncts_timer = ADD_TIMER(profile(), "FilteredByJoinConjunctsTime"); return Status::OK(); } @@ -129,6 +134,8 @@ Status NestedLoopJoinProbeLocalState::generate_join_block_data(RuntimeState* sta if (!_matched_rows_done && !_need_more_input_data) { // We should try to join rows if there still are some rows from probe side. + // _probe_offset_stack and _build_offset_stack use u16 for storage + // because on the FE side, it is guaranteed that the batch size will not exceed 65535 (the maximum value for u16). while (_join_block.rows() < state->batch_size()) { while (_current_build_pos == _shared_state->build_blocks.size() || _left_block_pos == _child_block->rows()) { @@ -140,7 +147,8 @@ Status NestedLoopJoinProbeLocalState::generate_join_block_data(RuntimeState* sta _reset_with_next_probe_row(); if (_left_block_pos < _child_block->rows()) { if constexpr (set_probe_side_flag) { - _probe_offset_stack.push(_join_block.rows()); + _probe_offset_stack.push( + cast_set(_join_block.rows())); } } else { if (_shared_state->left_side_eos) { @@ -159,28 +167,31 @@ Status NestedLoopJoinProbeLocalState::generate_join_block_data(RuntimeState* sta const auto& now_process_build_block = _shared_state->build_blocks[_current_build_pos++]; if constexpr (set_build_side_flag) { - _build_offset_stack.push(_join_block.rows()); + _build_offset_stack.push(cast_set(_join_block.rows())); } _process_left_child_block(_join_block, now_process_build_block); } - if constexpr (set_probe_side_flag) { - RETURN_IF_ERROR( - (_do_filtering_and_update_visited_flags( - &_join_block, !p._is_left_semi_anti))); - _update_additional_flags(&_join_block); - // If this join operation is left outer join or full outer join, when - // `_left_side_process_count`, means all rows from build - // side have been joined with _left_side_process_count, we should output current - // probe row with null from build side.
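Most hunks in this file also add fine-grained profile timers (OutputTempBlocksTime, UpdateVisitedFlagsTime, JoinConjunctsEvaluationTime, FilteredByJoinConjunctsTime) and wrap the corresponding phases in SCOPED_TIMER(...). Conceptually, such a scoped timer is an RAII guard that charges the elapsed time to a counter when the scope exits; a minimal sketch under that assumption (the real RuntimeProfile::Counter machinery is richer):

    #include <atomic>
    #include <chrono>
    #include <cstdint>

    class ScopedTimerModel {
    public:
        explicit ScopedTimerModel(std::atomic<int64_t>* counter)
                : _counter(counter), _start(std::chrono::steady_clock::now()) {}

        // On scope exit, add the elapsed nanoseconds to the profile counter.
        ~ScopedTimerModel() {
            auto elapsed = std::chrono::steady_clock::now() - _start;
            _counter->fetch_add(
                    std::chrono::duration_cast<std::chrono::nanoseconds>(elapsed).count());
        }

    private:
        std::atomic<int64_t>* _counter;
        std::chrono::steady_clock::time_point _start;
    };

Nesting guards like this is cheap, which is why the diff can afford one per phase of the probe loop.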
- if (_left_side_process_count) { - _finalize_current_phase( - _join_block, state->batch_size()); + { + SCOPED_TIMER(_finish_probe_phase_timer); + if constexpr (set_probe_side_flag) { + RETURN_IF_ERROR( + (_do_filtering_and_update_visited_flags( + &_join_block, !p._is_left_semi_anti))); + _update_additional_flags(&_join_block); + // If this join operation is left outer join or full outer join, when + // `_left_side_process_count`, means all rows from build + // side have been joined with _left_side_process_count, we should output current + // probe row with null from build side. + if (_left_side_process_count) { + _finalize_current_phase( + _join_block, state->batch_size()); + } + } else if (_left_side_process_count && p._is_mark_join && + _shared_state->build_blocks.empty()) { + _append_left_data_with_null(_join_block); } - } else if (_left_side_process_count && p._is_mark_join && - _shared_state->build_blocks.empty()) { - _append_left_data_with_null(_join_block); } } @@ -202,8 +213,8 @@ Status NestedLoopJoinProbeLocalState::generate_join_block_data(RuntimeState* sta } void NestedLoopJoinProbeLocalState::_resize_fill_tuple_is_null_column(size_t new_size, - int left_flag, - int right_flag) { + uint8_t left_flag, + uint8_t right_flag) { auto& p = _parent->cast(); if (p._is_outer_join) { reinterpret_cast(_tuple_is_null_left_flag_column.get()) @@ -237,7 +248,7 @@ void NestedLoopJoinProbeLocalState::_finalize_current_phase(vectorized::Block& b std::vector selector(num_rows); size_t selector_idx = 0; - for (size_t j = 0; j < num_rows; j++) { + for (uint32_t j = 0; j < num_rows; j++) { if constexpr (IsSemi) { if (cur_visited_flags[j]) { selector[selector_idx++] = j; @@ -373,9 +384,10 @@ void NestedLoopJoinProbeLocalState::_append_left_data_with_null(vectorized::Bloc void NestedLoopJoinProbeLocalState::_process_left_child_block( vectorized::Block& block, const vectorized::Block& now_process_build_block) const { + SCOPED_TIMER(_output_temp_blocks_timer); auto& p = _parent->cast(); auto dst_columns = block.mutate_columns(); - const int max_added_rows = now_process_build_block.rows(); + const size_t max_added_rows = now_process_build_block.rows(); for (size_t i = 0; i < p._num_probe_side_columns; ++i) { const vectorized::ColumnWithTypeAndName& src_column = _child_block->get_by_position(i); if (!src_column.column->is_nullable() && dst_columns[i]->is_nullable()) { @@ -390,6 +402,7 @@ void NestedLoopJoinProbeLocalState::_process_left_child_block( .get_data() .resize_fill(origin_sz + max_added_rows, 0); } else { + // TODO: for cross join, maybe could insert one row, and wrap for a const column dst_columns[i]->insert_many_from(*src_column.column, _left_block_pos, max_added_rows); } } @@ -480,6 +493,7 @@ Status NestedLoopJoinProbeOperatorX::push(doris::RuntimeState* state, vectorized set_build_side_flag, set_probe_side_flag>( state, join_op_variants); }; + SCOPED_TIMER(local_state._loop_join_timer); RETURN_IF_ERROR( std::visit(func, local_state._shared_state->join_op_variants, vectorized::make_bool_variant(_match_all_build || _is_right_semi_anti), diff --git a/be/src/pipeline/exec/nested_loop_join_probe_operator.h b/be/src/pipeline/exec/nested_loop_join_probe_operator.h index f46a99306a5713..c744e6acdc507e 100644 --- a/be/src/pipeline/exec/nested_loop_join_probe_operator.h +++ b/be/src/pipeline/exec/nested_loop_join_probe_operator.h @@ -19,6 +19,9 @@ #include +#include + +#include "common/cast_set.h" #include "common/status.h" #include "operator.h" #include "pipeline/exec/join_probe_operator.h" @@ -28,7 
+31,7 @@ namespace doris { class RuntimeState; namespace pipeline { - +#include "common/compile_check_begin.h" class NestedLoopJoinProbeOperatorX; class NestedLoopJoinProbeLocalState final : public JoinProbeLocalState { @@ -54,51 +57,59 @@ class NestedLoopJoinProbeLocalState final void _update_additional_flags(vectorized::Block* block); template void _finalize_current_phase(vectorized::Block& block, size_t batch_size); - void _resize_fill_tuple_is_null_column(size_t new_size, int left_flag, int right_flag); + void _resize_fill_tuple_is_null_column(size_t new_size, uint8_t left_flag, uint8_t right_flag); void _reset_with_next_probe_row(); void _append_left_data_with_null(vectorized::Block& block) const; void _process_left_child_block(vectorized::Block& block, const vectorized::Block& now_process_build_block) const; template - void _do_filtering_and_update_visited_flags_impl(vectorized::Block* block, int column_to_keep, - int build_block_idx, int processed_blocks_num, - bool materialize, Filter& filter) { - if constexpr (SetBuildSideFlag) { - for (size_t i = 0; i < processed_blocks_num; i++) { - auto& build_side_flag = - assert_cast( - _shared_state->build_side_visited_flags[build_block_idx].get()) - ->get_data(); - auto* __restrict build_side_flag_data = build_side_flag.data(); - auto cur_sz = build_side_flag.size(); - const size_t offset = _build_offset_stack.top(); - _build_offset_stack.pop(); - for (size_t j = 0; j < cur_sz; j++) { - build_side_flag_data[j] |= filter[offset + j]; + void _do_filtering_and_update_visited_flags_impl(vectorized::Block* block, + uint32_t column_to_keep, + size_t build_block_idx, + size_t processed_blocks_num, bool materialize, + Filter& filter) { + { + SCOPED_TIMER(_update_visited_flags_timer); + if constexpr (SetBuildSideFlag) { + for (size_t i = 0; i < processed_blocks_num; i++) { + auto& build_side_flag = + assert_cast( + _shared_state->build_side_visited_flags[build_block_idx].get()) + ->get_data(); + auto* __restrict build_side_flag_data = build_side_flag.data(); + auto cur_sz = build_side_flag.size(); + const size_t offset = _build_offset_stack.top(); + _build_offset_stack.pop(); + for (size_t j = 0; j < cur_sz; j++) { + build_side_flag_data[j] |= filter[offset + j]; + } + build_block_idx = build_block_idx == 0 ? _shared_state->build_blocks.size() - 1 + : build_block_idx - 1; } - build_block_idx = build_block_idx == 0 ? _shared_state->build_blocks.size() - 1 - : build_block_idx - 1; } - } - if constexpr (SetProbeSideFlag) { - int end = filter.size(); - for (int i = _left_block_pos == _child_block->rows() ? _left_block_pos - 1 - : _left_block_pos; - i >= _left_block_start_pos; i--) { - int offset = 0; - if (!_probe_offset_stack.empty()) { - offset = _probe_offset_stack.top(); - _probe_offset_stack.pop(); - } - if (!_cur_probe_row_visited_flags[i]) { - _cur_probe_row_visited_flags[i] = - simd::contain_byte(filter.data() + offset, end - offset, 1) ? 1 - : 0; + if constexpr (SetProbeSideFlag) { + int64_t end = filter.size(); + for (int i = _left_block_pos == _child_block->rows() ? _left_block_pos - 1 + : _left_block_pos; + i >= _left_block_start_pos; i--) { + int64_t offset = 0; + if (!_probe_offset_stack.empty()) { + offset = _probe_offset_stack.top(); + _probe_offset_stack.pop(); + } + if (!_cur_probe_row_visited_flags[i]) { + _cur_probe_row_visited_flags[i] = + simd::contain_byte(filter.data() + offset, end - offset, 1) + ? 
1 + : 0; + } + end = offset; } - end = offset; } } + if (materialize) { + SCOPED_TIMER(_filtered_by_join_conjuncts_timer); vectorized::Block::filter_block_internal(block, filter, column_to_keep); } else { CLEAR_BLOCK @@ -108,7 +119,8 @@ class NestedLoopJoinProbeLocalState final // need exception safety template Status _do_filtering_and_update_visited_flags(vectorized::Block* block, bool materialize) { - auto column_to_keep = block->columns(); + // The number of columns will not exceed the range of u32. + uint32_t column_to_keep = cast_set(block->columns()); // If we need to set visited flags for build side, // 1. Execute conjuncts and get a column with bool type to do filtering. // 2. Use bool column to update build-side visited flags. @@ -119,8 +131,11 @@ class NestedLoopJoinProbeLocalState final if (LIKELY(!_join_conjuncts.empty() && block->rows() > 0)) { vectorized::IColumn::Filter filter(block->rows(), 1); bool can_filter_all = false; - RETURN_IF_ERROR(vectorized::VExprContext::execute_conjuncts( - _join_conjuncts, nullptr, IgnoreNull, block, &filter, &can_filter_all)); + { + SCOPED_TIMER(_join_conjuncts_evaluation_timer); + RETURN_IF_ERROR(vectorized::VExprContext::execute_conjuncts( + _join_conjuncts, nullptr, IgnoreNull, block, &filter, &can_filter_all)); + } if (can_filter_all) { CLEAR_BLOCK @@ -179,6 +194,10 @@ class NestedLoopJoinProbeLocalState final vectorized::VExprContextSPtrs _join_conjuncts; RuntimeProfile::Counter* _loop_join_timer = nullptr; + RuntimeProfile::Counter* _output_temp_blocks_timer = nullptr; + RuntimeProfile::Counter* _update_visited_flags_timer = nullptr; + RuntimeProfile::Counter* _join_conjuncts_evaluation_timer = nullptr; + RuntimeProfile::Counter* _filtered_by_join_conjuncts_timer = nullptr; }; class NestedLoopJoinProbeOperatorX final @@ -197,7 +216,9 @@ class NestedLoopJoinProbeOperatorX final } DataDistribution required_data_distribution() const override { - if (_join_op == TJoinOp::NULL_AWARE_LEFT_ANTI_JOIN) { + if (_join_op == TJoinOp::NULL_AWARE_LEFT_ANTI_JOIN || + _join_op == TJoinOp::RIGHT_OUTER_JOIN || _join_op == TJoinOp::RIGHT_ANTI_JOIN || + _join_op == TJoinOp::RIGHT_SEMI_JOIN || _join_op == TJoinOp::FULL_OUTER_JOIN) { return {ExchangeType::NOOP}; } return {ExchangeType::ADAPTIVE_PASSTHROUGH}; @@ -222,3 +243,4 @@ class NestedLoopJoinProbeOperatorX final } // namespace pipeline } // namespace doris +#include "common/compile_check_end.h" \ No newline at end of file diff --git a/be/src/pipeline/exec/olap_scan_operator.cpp b/be/src/pipeline/exec/olap_scan_operator.cpp index 09e999d4737e12..124f2d1c70ec93 100644 --- a/be/src/pipeline/exec/olap_scan_operator.cpp +++ b/be/src/pipeline/exec/olap_scan_operator.cpp @@ -43,6 +43,9 @@ namespace doris::pipeline { Status OlapScanLocalState::_init_profile() { RETURN_IF_ERROR(ScanLocalState::_init_profile()); + // Rows read from storage. + // Include the rows read from doris page cache. + _scan_rows = ADD_COUNTER(_runtime_profile, "ScanRows", TUnit::UNIT); // 1. 
init segment profile _segment_profile.reset(new RuntimeProfile("SegmentIterator")); _scanner_profile->add_child(_segment_profile.get(), true, nullptr); @@ -58,23 +61,20 @@ Status OlapScanLocalState::_init_profile() { _block_load_counter = ADD_COUNTER(_segment_profile, "BlocksLoad", TUnit::UNIT); _block_fetch_timer = ADD_TIMER(_scanner_profile, "BlockFetchTime"); _delete_bitmap_get_agg_timer = ADD_TIMER(_scanner_profile, "DeleteBitmapGetAggTime"); - _sync_rowset_timer = ADD_TIMER(_scanner_profile, "SyncRowsetTime"); - _raw_rows_counter = ADD_COUNTER(_segment_profile, "RawRowsRead", TUnit::UNIT); - _block_convert_timer = ADD_TIMER(_scanner_profile, "BlockConvertTime"); + if (config::is_cloud_mode()) { + _sync_rowset_timer = ADD_TIMER(_scanner_profile, "SyncRowsetTime"); + } _block_init_timer = ADD_TIMER(_segment_profile, "BlockInitTime"); _block_init_seek_timer = ADD_TIMER(_segment_profile, "BlockInitSeekTime"); _block_init_seek_counter = ADD_COUNTER(_segment_profile, "BlockInitSeekCount", TUnit::UNIT); - _block_conditions_filtered_timer = ADD_TIMER(_segment_profile, "BlockConditionsFilteredTime"); - _block_conditions_filtered_bf_timer = - ADD_TIMER(_segment_profile, "BlockConditionsFilteredBloomFilterTime"); + _segment_generate_row_range_timer = ADD_TIMER(_segment_profile, "GenerateRowRangeTime"); + _segment_generate_row_range_by_bf_timer = + ADD_TIMER(_segment_profile, "GenerateRowRangeByBloomFilterIndexTime"); _collect_iterator_merge_next_timer = ADD_TIMER(_segment_profile, "CollectIteratorMergeTime"); - _collect_iterator_normal_next_timer = ADD_TIMER(_segment_profile, "CollectIteratorNormalTime"); - _block_conditions_filtered_zonemap_timer = - ADD_TIMER(_segment_profile, "BlockConditionsFilteredZonemapTime"); - _block_conditions_filtered_zonemap_rp_timer = - ADD_TIMER(_segment_profile, "BlockConditionsFilteredZonemapRuntimePredicateTime"); - _block_conditions_filtered_dict_timer = - ADD_TIMER(_segment_profile, "BlockConditionsFilteredDictTime"); + _segment_generate_row_range_by_zonemap_timer = + ADD_TIMER(_segment_profile, "GenerateRowRangeByZoneMapIndexTime"); + _segment_generate_row_range_by_dict_timer = + ADD_TIMER(_segment_profile, "GenerateRowRangeByDictTime"); _rows_vec_cond_filtered_counter = ADD_COUNTER(_segment_profile, "RowsVectorPredFiltered", TUnit::UNIT); @@ -87,10 +87,11 @@ Status OlapScanLocalState::_init_profile() { _vec_cond_timer = ADD_TIMER(_segment_profile, "VectorPredEvalTime"); _short_cond_timer = ADD_TIMER(_segment_profile, "ShortPredEvalTime"); _expr_filter_timer = ADD_TIMER(_segment_profile, "ExprFilterEvalTime"); - _first_read_timer = ADD_TIMER(_segment_profile, "FirstReadTime"); - _second_read_timer = ADD_TIMER(_segment_profile, "SecondReadTime"); - _first_read_seek_timer = ADD_TIMER(_segment_profile, "FirstReadSeekTime"); - _first_read_seek_counter = ADD_COUNTER(_segment_profile, "FirstReadSeekCount", TUnit::UNIT); + _predicate_column_read_timer = ADD_TIMER(_segment_profile, "PredicateColumnReadTime"); + _non_predicate_column_read_timer = ADD_TIMER(_segment_profile, "NonPredicateColumnReadTime"); + _predicate_column_read_seek_timer = ADD_TIMER(_segment_profile, "PredicateColumnReadSeekTime"); + _predicate_column_read_seek_counter = + ADD_COUNTER(_segment_profile, "PredicateColumnReadSeekCount", TUnit::UNIT); _lazy_read_timer = ADD_TIMER(_segment_profile, "LazyReadTime"); _lazy_read_seek_timer = ADD_TIMER(_segment_profile, "LazyReadSeekTime"); @@ -100,7 +101,7 @@ Status OlapScanLocalState::_init_profile() { _stats_filtered_counter = 
ADD_COUNTER(_segment_profile, "RowsStatsFiltered", TUnit::UNIT); _stats_rp_filtered_counter = - ADD_COUNTER(_segment_profile, "RowsZonemapRuntimePredicateFiltered", TUnit::UNIT); + ADD_COUNTER(_segment_profile, "RowsZoneMapRuntimePredicateFiltered", TUnit::UNIT); _bf_filtered_counter = ADD_COUNTER(_segment_profile, "RowsBloomFilterFiltered", TUnit::UNIT); _dict_filtered_counter = ADD_COUNTER(_segment_profile, "RowsDictFiltered", TUnit::UNIT); _del_filtered_counter = ADD_COUNTER(_scanner_profile, "RowsDelFiltered", TUnit::UNIT); @@ -131,8 +132,6 @@ Status OlapScanLocalState::_init_profile() { ADD_TIMER(_segment_profile, "InvertedIndexQueryNullBitmapTime"); _inverted_index_query_bitmap_copy_timer = ADD_TIMER(_segment_profile, "InvertedIndexQueryBitmapCopyTime"); - _inverted_index_query_bitmap_op_timer = - ADD_TIMER(_segment_profile, "InvertedIndexQueryBitmapOpTime"); _inverted_index_searcher_open_timer = ADD_TIMER(_segment_profile, "InvertedIndexSearcherOpenTime"); _inverted_index_searcher_search_timer = @@ -144,7 +143,7 @@ Status OlapScanLocalState::_init_profile() { _inverted_index_downgrade_count_counter = ADD_COUNTER(_segment_profile, "InvertedIndexDowngradeCount", TUnit::UNIT); - _output_index_result_column_timer = ADD_TIMER(_segment_profile, "OutputIndexResultColumnTimer"); + _output_index_result_column_timer = ADD_TIMER(_segment_profile, "OutputIndexResultColumnTime"); _filtered_segment_counter = ADD_COUNTER(_segment_profile, "NumSegmentFiltered", TUnit::UNIT); _total_segment_counter = ADD_COUNTER(_segment_profile, "NumSegmentTotal", TUnit::UNIT); _tablet_counter = ADD_COUNTER(_runtime_profile, "TabletNum", TUnit::UNIT); @@ -164,16 +163,13 @@ Status OlapScanLocalState::_process_conjuncts(RuntimeState* state) { } bool OlapScanLocalState::_is_key_column(const std::string& key_name) { - auto& p = _parent->cast(); // all column in dup_keys table or unique_keys with merge on write table olap scan node threat // as key column - if (p._olap_scan_node.keyType == TKeysType::DUP_KEYS || - (p._olap_scan_node.keyType == TKeysType::UNIQUE_KEYS && - p._olap_scan_node.__isset.enable_unique_key_merge_on_write && - p._olap_scan_node.enable_unique_key_merge_on_write)) { + if (_storage_no_merge()) { return true; } + auto& p = _parent->cast(); auto res = std::find(p._olap_scan_node.key_column_name.begin(), p._olap_scan_node.key_column_name.end(), key_name); return res != p._olap_scan_node.key_column_name.end(); @@ -282,8 +278,9 @@ Status OlapScanLocalState::_init_scanners(std::list* s scan_range->version.data() + scan_range->version.size(), version); tablets.emplace_back(std::move(tablet), version); } - int64_t duration_ns = 0; + if (config::is_cloud_mode()) { + int64_t duration_ns = 0; SCOPED_RAW_TIMER(&duration_ns); std::vector> tasks; tasks.reserve(_scan_ranges.size()); @@ -293,8 +290,8 @@ Status OlapScanLocalState::_init_scanners(std::list* s }); } RETURN_IF_ERROR(cloud::bthread_fork_join(tasks, 10)); + _sync_rowset_timer->update(duration_ns); } - _sync_rowset_timer->update(duration_ns); if (enable_parallel_scan && !p._should_run_serial && !has_cpu_limit && p._push_down_agg_type == TPushAggOp::NONE && @@ -335,25 +332,6 @@ Status OlapScanLocalState::_init_scanners(std::list* s int scanners_per_tablet = std::max(1, 64 / (int)_scan_ranges.size()); - auto build_new_scanner = [&](BaseTabletSPtr tablet, int64_t version, - const std::vector& key_ranges) { - COUNTER_UPDATE(_key_range_counter, key_ranges.size()); - auto scanner = vectorized::NewOlapScanner::create_shared( - this, 
vectorized::NewOlapScanner::Params { - state(), - _scanner_profile.get(), - key_ranges, - std::move(tablet), - version, - {}, - p._limit, - p._olap_scan_node.is_preaggregation, - }); - RETURN_IF_ERROR(scanner->prepare(state(), _conjuncts)); - scanners->push_back(std::move(scanner)); - return Status::OK(); - }; - for (auto& scan_range : _scan_ranges) { auto tablet = DORIS_TRY(ExecEnv::get_tablet(scan_range->tablet_id)); int64_t version = 0; @@ -379,7 +357,21 @@ Status OlapScanLocalState::_init_scanners(std::list* s ++j, ++i) { scanner_ranges.push_back((*ranges)[i].get()); } - RETURN_IF_ERROR(build_new_scanner(tablet, version, scanner_ranges)); + + COUNTER_UPDATE(_key_range_counter, scanner_ranges.size()); + auto scanner = vectorized::NewOlapScanner::create_shared( + this, vectorized::NewOlapScanner::Params { + state(), + _scanner_profile.get(), + scanner_ranges, + std::move(tablet), + version, + {}, + p._limit, + p._olap_scan_node.is_preaggregation, + }); + RETURN_IF_ERROR(scanner->prepare(state(), _conjuncts)); + scanners->push_back(std::move(scanner)); } } diff --git a/be/src/pipeline/exec/olap_scan_operator.h b/be/src/pipeline/exec/olap_scan_operator.h index 4465ce5690e703..9e8624b3a0b255 100644 --- a/be/src/pipeline/exec/olap_scan_operator.h +++ b/be/src/pipeline/exec/olap_scan_operator.h @@ -79,7 +79,7 @@ class OlapScanLocalState final : public ScanLocalState { if (!predicate.target_is_slot(_parent->node_id())) { return false; } - return _is_key_column(predicate.get_col_name(_parent->node_id())) || _storage_no_merge(); + return _is_key_column(predicate.get_col_name(_parent->node_id())); } Status _init_scanners(std::list* scanners) override; @@ -97,11 +97,8 @@ class OlapScanLocalState final : public ScanLocalState { std::unique_ptr _segment_profile; - RuntimeProfile::Counter* _num_disks_accessed_counter = nullptr; - RuntimeProfile::Counter* _tablet_counter = nullptr; RuntimeProfile::Counter* _key_range_counter = nullptr; - RuntimeProfile::Counter* _rows_pushed_cond_filtered_counter = nullptr; RuntimeProfile::Counter* _reader_init_timer = nullptr; RuntimeProfile::Counter* _scanner_init_timer = nullptr; RuntimeProfile::Counter* _process_conjunct_timer = nullptr; @@ -110,7 +107,6 @@ class OlapScanLocalState final : public ScanLocalState { RuntimeProfile::Counter* _read_compressed_counter = nullptr; RuntimeProfile::Counter* _decompressor_timer = nullptr; RuntimeProfile::Counter* _read_uncompressed_counter = nullptr; - RuntimeProfile::Counter* _raw_rows_counter = nullptr; RuntimeProfile::Counter* _rows_vec_cond_filtered_counter = nullptr; RuntimeProfile::Counter* _rows_short_circuit_cond_filtered_counter = nullptr; @@ -140,23 +136,19 @@ class OlapScanLocalState final : public ScanLocalState { RuntimeProfile::Counter* _block_init_timer = nullptr; RuntimeProfile::Counter* _block_init_seek_timer = nullptr; RuntimeProfile::Counter* _block_init_seek_counter = nullptr; - RuntimeProfile::Counter* _block_conditions_filtered_timer = nullptr; - RuntimeProfile::Counter* _block_conditions_filtered_bf_timer = nullptr; + RuntimeProfile::Counter* _segment_generate_row_range_timer = nullptr; + RuntimeProfile::Counter* _segment_generate_row_range_by_bf_timer = nullptr; RuntimeProfile::Counter* _collect_iterator_merge_next_timer = nullptr; - RuntimeProfile::Counter* _collect_iterator_normal_next_timer = nullptr; - RuntimeProfile::Counter* _block_conditions_filtered_zonemap_timer = nullptr; - RuntimeProfile::Counter* _block_conditions_filtered_zonemap_rp_timer = nullptr; - RuntimeProfile::Counter* 
_block_conditions_filtered_dict_timer = nullptr; - RuntimeProfile::Counter* _first_read_timer = nullptr; - RuntimeProfile::Counter* _second_read_timer = nullptr; - RuntimeProfile::Counter* _first_read_seek_timer = nullptr; - RuntimeProfile::Counter* _first_read_seek_counter = nullptr; + RuntimeProfile::Counter* _segment_generate_row_range_by_zonemap_timer = nullptr; + RuntimeProfile::Counter* _segment_generate_row_range_by_dict_timer = nullptr; + RuntimeProfile::Counter* _predicate_column_read_timer = nullptr; + RuntimeProfile::Counter* _non_predicate_column_read_timer = nullptr; + RuntimeProfile::Counter* _predicate_column_read_seek_timer = nullptr; + RuntimeProfile::Counter* _predicate_column_read_seek_counter = nullptr; RuntimeProfile::Counter* _lazy_read_timer = nullptr; RuntimeProfile::Counter* _lazy_read_seek_timer = nullptr; RuntimeProfile::Counter* _lazy_read_seek_counter = nullptr; - RuntimeProfile::Counter* _block_convert_timer = nullptr; - // total pages read // used by segment v2 RuntimeProfile::Counter* _total_pages_num_counter = nullptr; @@ -176,7 +168,6 @@ class OlapScanLocalState final : public ScanLocalState { RuntimeProfile::Counter* _inverted_index_query_cache_miss_counter = nullptr; RuntimeProfile::Counter* _inverted_index_query_timer = nullptr; RuntimeProfile::Counter* _inverted_index_query_bitmap_copy_timer = nullptr; - RuntimeProfile::Counter* _inverted_index_query_bitmap_op_timer = nullptr; RuntimeProfile::Counter* _inverted_index_searcher_open_timer = nullptr; RuntimeProfile::Counter* _inverted_index_searcher_search_timer = nullptr; RuntimeProfile::Counter* _inverted_index_searcher_cache_hit_counter = nullptr; diff --git a/be/src/pipeline/exec/operator.cpp b/be/src/pipeline/exec/operator.cpp index 4a93bac67fe477..3b5174d87c0f7f 100644 --- a/be/src/pipeline/exec/operator.cpp +++ b/be/src/pipeline/exec/operator.cpp @@ -74,6 +74,7 @@ #include "pipeline/exec/union_source_operator.h" #include "pipeline/local_exchange/local_exchange_sink_operator.h" #include "pipeline/local_exchange/local_exchange_source_operator.h" +#include "pipeline/pipeline.h" #include "util/debug_util.h" #include "util/runtime_profile.h" #include "util/string_util.h" @@ -116,11 +117,16 @@ std::string PipelineXSinkLocalState::name_suffix() { }() + ")"; } -DataDistribution DataSinkOperatorXBase::required_data_distribution() const { - return _child && _child->ignore_data_distribution() +DataDistribution OperatorBase::required_data_distribution() const { + return _child && _child->is_serial_operator() && !is_source() ? 
DataDistribution(ExchangeType::PASSTHROUGH) : DataDistribution(ExchangeType::NOOP); } + +bool OperatorBase::require_shuffled_data_distribution() const { + return Pipeline::is_hash_exchange(required_data_distribution().distribution_type); +} + const RowDescriptor& OperatorBase::row_desc() const { return _child->row_desc(); } @@ -141,8 +147,9 @@ std::string PipelineXSinkLocalState::debug_string(int indentatio std::string OperatorXBase::debug_string(int indentation_level) const { fmt::memory_buffer debug_string_buffer; - fmt::format_to(debug_string_buffer, "{}{}: id={}, parallel_tasks={}", - std::string(indentation_level * 2, ' '), _op_name, node_id(), _parallel_tasks); + fmt::format_to(debug_string_buffer, "{}{}: id={}, parallel_tasks={}, _is_serial_operator={}", + std::string(indentation_level * 2, ' '), _op_name, node_id(), _parallel_tasks, + _is_serial_operator); return fmt::to_string(debug_string_buffer); } @@ -315,17 +322,28 @@ Status OperatorXBase::get_block_after_projects(RuntimeState* state, vectorized:: } }); + Status status; auto* local_state = state->get_local_state(operator_id()); + Defer defer([&]() { + if (status.ok()) { + if (auto rows = block->rows()) { + COUNTER_UPDATE(local_state->_rows_returned_counter, rows); + COUNTER_UPDATE(local_state->_blocks_returned_counter, 1); + } + } + }); if (_output_row_descriptor) { local_state->clear_origin_block(); - auto status = get_block(state, &local_state->_origin_block, eos); + status = get_block(state, &local_state->_origin_block, eos); if (UNLIKELY(!status.ok())) { return status; } - return do_projections(state, &local_state->_origin_block, block); + status = do_projections(state, &local_state->_origin_block, block); + return status; } - local_state->_peak_memory_usage_counter->set(local_state->_mem_tracker->peak_consumption()); - return get_block(state, block, eos); + status = get_block(state, block, eos); + local_state->_peak_memory_usage_counter->set(local_state->_memory_used_counter->value()); + return status; } void PipelineXLocalStateBase::reached_limit(vectorized::Block* block, bool* eos) { @@ -346,16 +364,14 @@ void PipelineXLocalStateBase::reached_limit(vectorized::Block* block, bool* eos) if (auto rows = block->rows()) { _num_rows_returned += rows; - COUNTER_UPDATE(_blocks_returned_counter, 1); - COUNTER_SET(_rows_returned_counter, _num_rows_returned); } } std::string DataSinkOperatorXBase::debug_string(int indentation_level) const { fmt::memory_buffer debug_string_buffer; - fmt::format_to(debug_string_buffer, "{}{}: id={}", std::string(indentation_level * 2, ' '), - _name, node_id()); + fmt::format_to(debug_string_buffer, "{}{}: id={}, _is_serial_operator={}", + std::string(indentation_level * 2, ' '), _name, node_id(), _is_serial_operator); return fmt::to_string(debug_string_buffer); } @@ -468,10 +484,9 @@ Status PipelineXLocalState::init(RuntimeState* state, LocalState _open_timer = ADD_TIMER_WITH_LEVEL(_runtime_profile, "OpenTime", 1); _close_timer = ADD_TIMER_WITH_LEVEL(_runtime_profile, "CloseTime", 1); _exec_timer = ADD_TIMER_WITH_LEVEL(_runtime_profile, "ExecTime", 1); - _mem_tracker = std::make_unique("PipelineXLocalState:" + _runtime_profile->name()); - _memory_used_counter = ADD_LABEL_COUNTER_WITH_LEVEL(_runtime_profile, "MemoryUsage", 1); - _peak_memory_usage_counter = _runtime_profile->AddHighWaterMarkCounter( - "PeakMemoryUsage", TUnit::BYTES, "MemoryUsage", 1); + _memory_used_counter = ADD_COUNTER_WITH_LEVEL(_runtime_profile, "MemoryUsage", TUnit::BYTES, 1); + _peak_memory_usage_counter = + 
_runtime_profile->AddHighWaterMarkCounter("MemoryUsagePeak", TUnit::BYTES, "", 1); return Status::OK(); } @@ -504,11 +519,8 @@ Status PipelineXLocalState::close(RuntimeState* state) { if constexpr (!std::is_same_v) { COUNTER_SET(_wait_for_dependency_timer, _dependency->watcher_elapse_time()); } - if (_rows_returned_counter != nullptr) { - COUNTER_SET(_rows_returned_counter, _num_rows_returned); - } if (_peak_memory_usage_counter) { - _peak_memory_usage_counter->set(_mem_tracker->peak_consumption()); + _peak_memory_usage_counter->set(_memory_used_counter->value()); } _closed = true; // Some kinds of source operators has a 1-1 relationship with a sink operator (such as AnalyticOperator). @@ -548,10 +560,9 @@ Status PipelineXSinkLocalState::init(RuntimeState* state, LocalSink _close_timer = ADD_TIMER_WITH_LEVEL(_profile, "CloseTime", 1); _exec_timer = ADD_TIMER_WITH_LEVEL(_profile, "ExecTime", 1); info.parent_profile->add_child(_profile, true, nullptr); - _mem_tracker = std::make_unique(_parent->get_name()); - _memory_used_counter = ADD_LABEL_COUNTER_WITH_LEVEL(_profile, "MemoryUsage", 1); + _memory_used_counter = ADD_COUNTER_WITH_LEVEL(_profile, "MemoryUsage", TUnit::BYTES, 1); _peak_memory_usage_counter = - _profile->AddHighWaterMarkCounter("PeakMemoryUsage", TUnit::BYTES, "MemoryUsage", 1); + _profile->AddHighWaterMarkCounter("MemoryUsagePeak", TUnit::BYTES, "", 1); return Status::OK(); } @@ -564,7 +575,7 @@ Status PipelineXSinkLocalState::close(RuntimeState* state, Status e COUNTER_SET(_wait_for_dependency_timer, _dependency->watcher_elapse_time()); } if (_peak_memory_usage_counter) { - _peak_memory_usage_counter->set(_mem_tracker->peak_consumption()); + _peak_memory_usage_counter->set(_memory_used_counter->value()); } _closed = true; return Status::OK(); @@ -655,7 +666,7 @@ Status AsyncWriterSink::close(RuntimeState* state, Status exec_s if (_writer) { Status st = _writer->get_writer_status(); if (exec_status.ok()) { - _writer->force_close(state->is_cancelled() ? Status::Cancelled("Cancelled") + _writer->force_close(state->is_cancelled() ? state->cancel_reason() : Status::Cancelled("force close")); } else { _writer->force_close(exec_status); diff --git a/be/src/pipeline/exec/operator.h b/be/src/pipeline/exec/operator.h index 04d8f0dc736f27..6053b1a2f48e87 100644 --- a/be/src/pipeline/exec/operator.h +++ b/be/src/pipeline/exec/operator.h @@ -101,6 +101,9 @@ class OperatorBase { return Status::OK(); } + // Operators need to be executed serially. (e.g. 
a finalized agg without grouping keys) + [[nodiscard]] virtual bool is_serial_operator() const { return _is_serial_operator; } + [[nodiscard]] bool is_closed() const { return _is_closed; } virtual size_t revocable_mem_size(RuntimeState* state) const { return 0; } @@ -108,17 +111,22 @@ class OperatorBase { virtual Status revoke_memory(RuntimeState* state) { return Status::OK(); } [[nodiscard]] virtual bool require_data_distribution() const { return false; } OperatorPtr child() { return _child; } - [[nodiscard]] bool followed_by_shuffled_join() const { return _followed_by_shuffled_join; } - void set_followed_by_shuffled_join(bool followed_by_shuffled_join) { - _followed_by_shuffled_join = followed_by_shuffled_join; + [[nodiscard]] bool followed_by_shuffled_operator() const { + return _followed_by_shuffled_operator; + } + void set_followed_by_shuffled_operator(bool followed_by_shuffled_operator) { + _followed_by_shuffled_operator = followed_by_shuffled_operator; } - [[nodiscard]] virtual bool require_shuffled_data_distribution() const { return false; } + [[nodiscard]] virtual bool is_shuffled_operator() const { return false; } + [[nodiscard]] virtual DataDistribution required_data_distribution() const; + [[nodiscard]] virtual bool require_shuffled_data_distribution() const; protected: OperatorPtr _child = nullptr; bool _is_closed; - bool _followed_by_shuffled_join = false; + bool _followed_by_shuffled_operator = false; + bool _is_serial_operator = false; }; class PipelineXLocalStateBase { @@ -155,10 +163,11 @@ class PipelineXLocalStateBase { void reached_limit(vectorized::Block* block, bool* eos); RuntimeProfile* profile() { return _runtime_profile.get(); } - MemTracker* mem_tracker() { return _mem_tracker.get(); } - RuntimeProfile::Counter* rows_returned_counter() { return _rows_returned_counter; } - RuntimeProfile::Counter* blocks_returned_counter() { return _blocks_returned_counter; } RuntimeProfile::Counter* exec_time_counter() { return _exec_timer; } + RuntimeProfile::Counter* memory_used_counter() { return _memory_used_counter; } + RuntimeProfile::HighWaterMarkCounter* peak_memory_usage_counter() { + return _peak_memory_usage_counter; + } OperatorXBase* parent() { return _parent; } RuntimeState* state() { return _state; } vectorized::VExprContextSPtrs& conjuncts() { return _conjuncts; } @@ -180,16 +189,14 @@ class PipelineXLocalStateBase { protected: friend class OperatorXBase; + template + friend class ScanOperatorX; ObjectPool* _pool = nullptr; int64_t _num_rows_returned {0}; std::unique_ptr _runtime_profile; - // Record this node memory size. it is expected that artificial guarantees are accurate, - // which will providea reference for operator memory.
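The two memory-profiling hunks above replace the per-operator MemTracker with plain profile counters: `_memory_used_counter` accumulates usage, and the peak is read back as that counter's high-water mark. A minimal sketch of such an accounting scheme (illustrative names, not the real Doris counter API):

#include <atomic>
#include <cstdint>

class HighWaterMarkCounterSketch {
public:
    // Add (or subtract) bytes and maintain the peak via CAS.
    void add(int64_t delta) {
        const int64_t now = _current.fetch_add(delta, std::memory_order_relaxed) + delta;
        int64_t peak = _peak.load(std::memory_order_relaxed);
        while (now > peak &&
               !_peak.compare_exchange_weak(peak, now, std::memory_order_relaxed)) {
            // peak was reloaded by compare_exchange_weak; retry while still above it.
        }
    }
    int64_t current() const { return _current.load(std::memory_order_relaxed); }
    int64_t peak() const { return _peak.load(std::memory_order_relaxed); }

private:
    std::atomic<int64_t> _current {0};
    std::atomic<int64_t> _peak {0};
};

Deriving the peak from the same counter that feeds MemoryUsage keeps the two profile entries consistent by construction, which a separately updated tracker could not guarantee.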
- std::unique_ptr _mem_tracker; - std::shared_ptr _query_statistics = nullptr; RuntimeProfile::Counter* _rows_returned_counter = nullptr; @@ -334,13 +341,16 @@ class PipelineXSinkLocalStateBase { DataSinkOperatorXBase* parent() { return _parent; } RuntimeState* state() { return _state; } RuntimeProfile* profile() { return _profile; } - MemTracker* mem_tracker() { return _mem_tracker.get(); } [[nodiscard]] RuntimeProfile* faker_runtime_profile() const { return _faker_runtime_profile.get(); } RuntimeProfile::Counter* rows_input_counter() { return _rows_input_counter; } RuntimeProfile::Counter* exec_time_counter() { return _exec_timer; } + RuntimeProfile::Counter* memory_used_counter() { return _memory_used_counter; } + RuntimeProfile::HighWaterMarkCounter* peak_memory_usage_counter() { + return _peak_memory_usage_counter; + } virtual std::vector dependencies() const { return {nullptr}; } // override in exchange sink , AsyncWriterSink @@ -352,10 +362,10 @@ class PipelineXSinkLocalStateBase { DataSinkOperatorXBase* _parent = nullptr; RuntimeState* _state = nullptr; RuntimeProfile* _profile = nullptr; - std::unique_ptr _mem_tracker; // Set to true after close() has been called. subclasses should check and set this in // close(). bool _closed = false; + std::atomic _eos = false; //NOTICE: now add a faker profile, because sometimes the profile record is useless //so we want remove some counters and timers, eg: in join node, if it's broadcast_join //and shared hash table, some counter/timer about build hash table is useless, @@ -439,7 +449,7 @@ class DataSinkOperatorXBase : public OperatorBase { Status init(const TDataSink& tsink) override; [[nodiscard]] virtual Status init(ExchangeType type, const int num_buckets, - const bool is_shuffled_hash_join, + const bool use_global_hash_shuffle, const std::map& shuffle_idx_to_instance_idx) { return Status::InternalError("init() is only implemented in local exchange!"); } @@ -474,9 +484,6 @@ class DataSinkOperatorXBase : public OperatorBase { } [[nodiscard]] virtual std::shared_ptr create_shared_state() const = 0; - [[nodiscard]] virtual DataDistribution required_data_distribution() const; - - [[nodiscard]] virtual bool is_shuffled_hash_join() const { return false; } Status close(RuntimeState* state) override { return Status::InternalError("Should not reach here!"); @@ -489,8 +496,6 @@ class DataSinkOperatorXBase : public OperatorBase { [[nodiscard]] bool is_sink() const override { return true; } - [[nodiscard]] bool is_source() const override { return false; } - static Status close(RuntimeState* state, Status exec_status) { auto result = state->get_sink_local_state_result(); if (!result) { @@ -513,6 +518,8 @@ class DataSinkOperatorXBase : public OperatorBase { virtual bool should_dry_run(RuntimeState* state) { return false; } + [[nodiscard]] virtual bool count_down_destination() { return true; } + protected: template requires(std::is_base_of_v) @@ -643,27 +650,13 @@ class OperatorXBase : public OperatorBase { throw doris::Exception(ErrorCode::NOT_IMPLEMENTED_ERROR, _op_name); } [[nodiscard]] std::string get_name() const override { return _op_name; } - [[nodiscard]] virtual DataDistribution required_data_distribution() const { - return _child && _child->ignore_data_distribution() && !is_source() - ? DataDistribution(ExchangeType::PASSTHROUGH) - : DataDistribution(ExchangeType::NOOP); - } - [[nodiscard]] virtual bool ignore_data_distribution() const { - return _child ? 
_child->ignore_data_distribution() : _ignore_data_distribution; - } - [[nodiscard]] bool ignore_data_hash_distribution() const { - return _child ? _child->ignore_data_hash_distribution() : _ignore_data_distribution; - } [[nodiscard]] virtual bool need_more_input_data(RuntimeState* state) const { return true; } - void set_ignore_data_distribution() { _ignore_data_distribution = true; } Status open(RuntimeState* state) override; [[nodiscard]] virtual Status get_block(RuntimeState* state, vectorized::Block* block, bool* eos) = 0; - [[nodiscard]] virtual bool is_shuffled_hash_join() const { return false; } - Status close(RuntimeState* state) override; [[nodiscard]] virtual const RowDescriptor& intermediate_row_desc() const { @@ -728,8 +721,6 @@ class OperatorXBase : public OperatorBase { bool has_output_row_desc() const { return _output_row_descriptor != nullptr; } - [[nodiscard]] bool is_source() const override { return false; } - [[nodiscard]] virtual Status get_block_after_projects(RuntimeState* state, vectorized::Block* block, bool* eos); @@ -739,6 +730,9 @@ class OperatorXBase : public OperatorBase { void set_parallel_tasks(int parallel_tasks) { _parallel_tasks = parallel_tasks; } int parallel_tasks() const { return _parallel_tasks; } + // To keep compatibility with older FE + void set_serial_operator() { _is_serial_operator = true; } + protected: template friend class PipelineXLocalState; @@ -772,7 +766,6 @@ class OperatorXBase : public OperatorBase { uint32_t _debug_point_count = 0; std::string _op_name; - bool _ignore_data_distribution = false; int _parallel_tasks = 0; //_keep_origin is used to avoid copying during projection, @@ -843,9 +836,9 @@ class StatefulOperatorX : public OperatorX { template requires(std::is_base_of_v) -class AsyncWriterSink : public PipelineXSinkLocalState { +class AsyncWriterSink : public PipelineXSinkLocalState { public: - using Base = PipelineXSinkLocalState; + using Base = PipelineXSinkLocalState; AsyncWriterSink(DataSinkOperatorXBase* parent, RuntimeState* state) : Base(parent, state), _async_writer_dependency(nullptr) { _finish_dependency = diff --git a/be/src/pipeline/exec/partition_sort_sink_operator.cpp b/be/src/pipeline/exec/partition_sort_sink_operator.cpp index fbabdbdc8f85fe..48b8fe9cb765a1 100644 --- a/be/src/pipeline/exec/partition_sort_sink_operator.cpp +++ b/be/src/pipeline/exec/partition_sort_sink_operator.cpp @@ -25,79 +25,6 @@ namespace doris::pipeline { -Status PartitionBlocks::append_block_by_selector(const vectorized::Block* input_block, bool eos) { - if (_blocks.empty() || reach_limit()) { - _init_rows = _partition_sort_info->_runtime_state->batch_size(); - _blocks.push_back(vectorized::Block::create_unique( - vectorized::VectorizedUtils::create_empty_block(_partition_sort_info->_row_desc))); - } - auto columns = input_block->get_columns(); - auto mutable_columns = _blocks.back()->mutate_columns(); - DCHECK(columns.size() == mutable_columns.size()); - for (int i = 0; i < mutable_columns.size(); ++i) { - columns[i]->append_data_by_selector(mutable_columns[i], _selector); - } - _blocks.back()->set_columns(std::move(mutable_columns)); - auto selector_rows = _selector.size(); - _init_rows = _init_rows - selector_rows; - _total_rows = _total_rows + selector_rows; - _current_input_rows = _current_input_rows + selector_rows; - _selector.clear(); - // maybe better could change by user PARTITION_SORT_ROWS_THRESHOLD - if (!eos && _partition_sort_info->_partition_inner_limit != -1 && - _current_input_rows >= PARTITION_SORT_ROWS_THRESHOLD && - 
_partition_sort_info->_topn_phase != TPartTopNPhase::TWO_PHASE_GLOBAL) { - create_or_reset_sorter_state(); - RETURN_IF_ERROR(do_partition_topn_sort()); - _current_input_rows = 0; // reset record - _do_partition_topn_count++; - } - return Status::OK(); -} - -void PartitionBlocks::create_or_reset_sorter_state() { - if (_partition_topn_sorter == nullptr) { - _previous_row = std::make_unique(); - _partition_topn_sorter = vectorized::PartitionSorter::create_unique( - *_partition_sort_info->_vsort_exec_exprs, _partition_sort_info->_limit, - _partition_sort_info->_offset, _partition_sort_info->_pool, - _partition_sort_info->_is_asc_order, _partition_sort_info->_nulls_first, - _partition_sort_info->_row_desc, _partition_sort_info->_runtime_state, - _is_first_sorter ? _partition_sort_info->_runtime_profile : nullptr, - _partition_sort_info->_has_global_limit, - _partition_sort_info->_partition_inner_limit, - _partition_sort_info->_top_n_algorithm, _previous_row.get()); - _partition_topn_sorter->init_profile(_partition_sort_info->_runtime_profile); - } else { - _partition_topn_sorter->reset_sorter_state(_partition_sort_info->_runtime_state); - } -} - -Status PartitionBlocks::do_partition_topn_sort() { - for (const auto& block : _blocks) { - RETURN_IF_ERROR(_partition_topn_sorter->append_block(block.get())); - } - _blocks.clear(); - RETURN_IF_ERROR(_partition_topn_sorter->prepare_for_read()); - bool current_eos = false; - size_t current_output_rows = 0; - while (!current_eos) { - // output_block maybe need better way - auto output_block = vectorized::Block::create_unique( - vectorized::VectorizedUtils::create_empty_block(_partition_sort_info->_row_desc)); - RETURN_IF_ERROR(_partition_topn_sorter->get_next(_partition_sort_info->_runtime_state, - output_block.get(), ¤t_eos)); - auto rows = output_block->rows(); - if (rows > 0) { - current_output_rows += rows; - _blocks.emplace_back(std::move(output_block)); - } - } - - _topn_filter_rows += (_current_input_rows - current_output_rows); - return Status::OK(); -} - Status PartitionSortSinkLocalState::init(RuntimeState* state, LocalSinkStateInfo& info) { RETURN_IF_ERROR(PipelineXSinkLocalState::init(state, info)); SCOPED_TIMER(exec_time_counter()); @@ -113,9 +40,9 @@ Status PartitionSortSinkLocalState::init(RuntimeState* state, LocalSinkStateInfo _partition_exprs_num = p._partition_exprs_num; _hash_table_size_counter = ADD_COUNTER(_profile, "HashTableSize", TUnit::UNIT); _serialize_key_arena_memory_usage = - _profile->AddHighWaterMarkCounter("SerializeKeyArena", TUnit::BYTES, "MemoryUsage", 1); + _profile->AddHighWaterMarkCounter("MemoryUsageSerializeKeyArena", TUnit::BYTES, "", 1); _hash_table_memory_usage = - ADD_CHILD_COUNTER_WITH_LEVEL(_profile, "HashTable", TUnit::BYTES, "MemoryUsage", 1); + ADD_COUNTER_WITH_LEVEL(_profile, "MemoryUsageHashTable", TUnit::BYTES, 1); _build_timer = ADD_TIMER(_profile, "HashTableBuildTime"); _selector_block_timer = ADD_TIMER(_profile, "SelectorBlockTime"); _emplace_key_timer = ADD_TIMER(_profile, "EmplaceKeyTime"); @@ -287,8 +214,8 @@ Status PartitionSortSinkOperatorX::_emplace_into_hash_table( SCOPED_TIMER(local_state._emplace_key_timer); int row = num_rows; for (row = row - 1; row >= 0 && !local_state._is_need_passthrough; --row) { - auto& mapped = agg_method.lazy_emplace(state, row, creator, - creator_for_null_key); + auto& mapped = *agg_method.lazy_emplace(state, row, creator, + creator_for_null_key); mapped->add_row_idx(row); local_state._sorted_partition_input_rows++; local_state._is_need_passthrough = @@ -323,12 
+250,11 @@ Status PartitionSortSinkOperatorX::_emplace_into_hash_table( local_state._partitioned_data->method_variant); } -constexpr auto init_partition_hash_method = - init_hash_method; +constexpr auto init_partition_hash_method = init_hash_method; Status PartitionSortSinkLocalState::_init_hash_method() { - RETURN_IF_ERROR( - init_partition_hash_method(_partitioned_data.get(), _partition_expr_ctxs, true)); + RETURN_IF_ERROR(init_partition_hash_method(_partitioned_data.get(), + get_data_types(_partition_expr_ctxs), true)); return Status::OK(); } diff --git a/be/src/pipeline/exec/partition_sort_sink_operator.h b/be/src/pipeline/exec/partition_sort_sink_operator.h index f16df509dca4a0..6926445f18f2f4 100644 --- a/be/src/pipeline/exec/partition_sort_sink_operator.h +++ b/be/src/pipeline/exec/partition_sort_sink_operator.h @@ -17,202 +17,14 @@ #pragma once -#include - #include #include "operator.h" +#include "pipeline/common/partition_sort_utils.h" #include "vec/common/sort/partition_sorter.h" namespace doris::pipeline { -struct PartitionSortInfo { - ~PartitionSortInfo() = default; - - PartitionSortInfo(vectorized::VSortExecExprs* vsort_exec_exprs, int64_t limit, int64_t offset, - ObjectPool* pool, const std::vector& is_asc_order, - const std::vector& nulls_first, const RowDescriptor& row_desc, - RuntimeState* runtime_state, RuntimeProfile* runtime_profile, - bool has_global_limit, int64_t partition_inner_limit, - TopNAlgorithm::type top_n_algorithm, TPartTopNPhase::type topn_phase) - : _vsort_exec_exprs(vsort_exec_exprs), - _limit(limit), - _offset(offset), - _pool(pool), - _is_asc_order(is_asc_order), - _nulls_first(nulls_first), - _row_desc(row_desc), - _runtime_state(runtime_state), - _runtime_profile(runtime_profile), - _has_global_limit(has_global_limit), - _partition_inner_limit(partition_inner_limit), - _top_n_algorithm(top_n_algorithm), - _topn_phase(topn_phase) {} - -public: - vectorized::VSortExecExprs* _vsort_exec_exprs = nullptr; - int64_t _limit = -1; - int64_t _offset = 0; - ObjectPool* _pool = nullptr; - std::vector _is_asc_order; - std::vector _nulls_first; - const RowDescriptor& _row_desc; - RuntimeState* _runtime_state = nullptr; - RuntimeProfile* _runtime_profile = nullptr; - bool _has_global_limit = false; - int64_t _partition_inner_limit = 0; - TopNAlgorithm::type _top_n_algorithm = TopNAlgorithm::ROW_NUMBER; - TPartTopNPhase::type _topn_phase = TPartTopNPhase::TWO_PHASE_GLOBAL; -}; - -static constexpr size_t INITIAL_BUFFERED_BLOCK_BYTES = 64 << 20; - -#ifndef NDEBUG -static constexpr size_t PARTITION_SORT_ROWS_THRESHOLD = 10; -#else -static constexpr size_t PARTITION_SORT_ROWS_THRESHOLD = 20000; -#endif - -struct PartitionBlocks { -public: - PartitionBlocks(std::shared_ptr partition_sort_info, bool is_first_sorter) - : _is_first_sorter(is_first_sorter), _partition_sort_info(partition_sort_info) {} - ~PartitionBlocks() = default; - - void add_row_idx(size_t row) { _selector.push_back(row); } - - Status append_block_by_selector(const vectorized::Block* input_block, bool eos); - - Status do_partition_topn_sort(); - - void create_or_reset_sorter_state(); - - void append_whole_block(vectorized::Block* input_block, const RowDescriptor& row_desc) { - auto empty_block = vectorized::Block::create_unique( - vectorized::VectorizedUtils::create_empty_block(row_desc)); - empty_block->swap(*input_block); - _blocks.emplace_back(std::move(empty_block)); - } - - bool reach_limit() { - return _init_rows <= 0 || _blocks.back()->bytes() > INITIAL_BUFFERED_BLOCK_BYTES; - } - - size_t 
get_total_rows() const { return _total_rows; } - size_t get_topn_filter_rows() const { return _topn_filter_rows; } - size_t get_do_topn_count() const { return _do_partition_topn_count; } - - vectorized::IColumn::Selector _selector; - std::vector> _blocks; - size_t _total_rows = 0; - size_t _current_input_rows = 0; - size_t _topn_filter_rows = 0; - size_t _do_partition_topn_count = 0; - int _init_rows = 4096; - bool _is_first_sorter = false; - - std::unique_ptr _previous_row; - std::unique_ptr _partition_topn_sorter = nullptr; - std::shared_ptr _partition_sort_info = nullptr; -}; - -using PartitionDataPtr = PartitionBlocks*; -using PartitionDataWithStringKey = PHHashMap; -using PartitionDataWithShortStringKey = StringHashMap; -using PartitionDataWithUInt8Key = PHHashMap; -using PartitionDataWithUInt16Key = PHHashMap; -using PartitionDataWithUInt32Key = - PHHashMap>; -using PartitionDataWithUInt64Key = PHHashMap>; -using PartitionDataWithUInt128Key = PHHashMap>; -using PartitionDataWithUInt256Key = PHHashMap>; -using PartitionDataWithUInt136Key = PHHashMap>; - -using PartitionedMethodVariants = std::variant< - std::monostate, vectorized::MethodSerialized, - vectorized::MethodOneNumber, - vectorized::MethodOneNumber, - vectorized::MethodOneNumber, - vectorized::MethodOneNumber, - vectorized::MethodOneNumber, - vectorized::MethodSingleNullableColumn>>, - vectorized::MethodSingleNullableColumn>>, - vectorized::MethodSingleNullableColumn>>, - vectorized::MethodSingleNullableColumn>>, - vectorized::MethodSingleNullableColumn>>, - vectorized::MethodKeysFixed, - vectorized::MethodKeysFixed, - vectorized::MethodKeysFixed, - vectorized::MethodKeysFixed, - vectorized::MethodKeysFixed, - vectorized::MethodKeysFixed, - vectorized::MethodKeysFixed, - vectorized::MethodKeysFixed, - vectorized::MethodStringNoCache, - vectorized::MethodSingleNullableColumn>>>; - -struct PartitionedHashMapVariants - : public vectorized::DataVariants< - PartitionedMethodVariants, vectorized::MethodSingleNullableColumn, - vectorized::MethodOneNumber, MethodKeysFixed, vectorized::DataWithNullKey> { - template - void init(vectorized::HashKeyType type) { - _type = type; - switch (_type) { - case vectorized::HashKeyType::serialized: { - method_variant.emplace>(); - break; - } - case vectorized::HashKeyType::int8_key: { - emplace_single(); - break; - } - case vectorized::HashKeyType::int16_key: { - emplace_single(); - break; - } - case vectorized::HashKeyType::int32_key: { - emplace_single(); - break; - } - case vectorized::HashKeyType::int64_key: { - emplace_single(); - break; - } - case vectorized::HashKeyType::int128_key: { - emplace_single(); - break; - } - case vectorized::HashKeyType::string_key: { - if (nullable) { - method_variant.emplace< - vectorized::MethodSingleNullableColumn>>>(); - } else { - method_variant.emplace< - vectorized::MethodStringNoCache>(); - } - break; - } - default: - throw Exception(ErrorCode::INTERNAL_ERROR, "meet invalid key type, type={}", type); - } - } - void init(vectorized::HashKeyType type, bool is_nullable = false) { - if (is_nullable) { - init(type); - } else { - init(type); - } - } -}; - class PartitionSortSinkOperatorX; class PartitionSortSinkLocalState : public PipelineXSinkLocalState { ENABLE_FACTORY_CREATOR(PartitionSortSinkLocalState); diff --git a/be/src/pipeline/exec/partition_sort_source_operator.cpp b/be/src/pipeline/exec/partition_sort_source_operator.cpp index f2cd8dea0b943c..6d355477ab871c 100644 --- a/be/src/pipeline/exec/partition_sort_source_operator.cpp +++ 
b/be/src/pipeline/exec/partition_sort_source_operator.cpp @@ -58,7 +58,6 @@ Status PartitionSortSourceOperatorX::get_block(RuntimeState* state, vectorized:: } } if (!output_block->empty()) { - COUNTER_UPDATE(local_state.blocks_returned_counter(), 1); local_state._num_rows_returned += output_block->rows(); } return Status::OK(); @@ -80,7 +79,6 @@ Status PartitionSortSourceOperatorX::get_block(RuntimeState* state, vectorized:: local_state._sort_idx >= local_state._shared_state->partition_sorts.size(); } if (!output_block->empty()) { - COUNTER_UPDATE(local_state.blocks_returned_counter(), 1); local_state._num_rows_returned += output_block->rows(); } return Status::OK(); diff --git a/be/src/pipeline/exec/partitioned_aggregation_sink_operator.cpp b/be/src/pipeline/exec/partitioned_aggregation_sink_operator.cpp index 469716b7a22182..ab0a43f4a635cf 100644 --- a/be/src/pipeline/exec/partitioned_aggregation_sink_operator.cpp +++ b/be/src/pipeline/exec/partitioned_aggregation_sink_operator.cpp @@ -81,10 +81,10 @@ Status PartitionedAggSinkLocalState::close(RuntimeState* state, Status exec_stat void PartitionedAggSinkLocalState::_init_counters() { _internal_runtime_profile = std::make_unique("internal_profile"); - _hash_table_memory_usage = ADD_CHILD_COUNTER_WITH_LEVEL(Base::profile(), "HashTable", - TUnit::BYTES, "MemoryUsage", 1); + _hash_table_memory_usage = + ADD_COUNTER_WITH_LEVEL(Base::profile(), "MemoryUsageHashTable", TUnit::BYTES, 1); _serialize_key_arena_memory_usage = Base::profile()->AddHighWaterMarkCounter( - "SerializeKeyArena", TUnit::BYTES, "MemoryUsage", 1); + "MemoryUsageSerializeKeyArena", TUnit::BYTES, "", 1); _build_timer = ADD_TIMER(Base::profile(), "BuildTime"); _serialize_key_timer = ADD_TIMER(Base::profile(), "SerializeKeyTime"); @@ -110,8 +110,8 @@ void PartitionedAggSinkLocalState::_init_counters() { } while (false) void PartitionedAggSinkLocalState::update_profile(RuntimeProfile* child_profile) { - UPDATE_PROFILE(_hash_table_memory_usage, "HashTable"); - UPDATE_PROFILE(_serialize_key_arena_memory_usage, "SerializeKeyArena"); + UPDATE_PROFILE(_hash_table_memory_usage, "MemoryUsageHashTable"); + UPDATE_PROFILE(_serialize_key_arena_memory_usage, "MemoryUsageSerializeKeyArena"); UPDATE_PROFILE(_build_timer, "BuildTime"); UPDATE_PROFILE(_serialize_key_timer, "SerializeKeyTime"); UPDATE_PROFILE(_merge_timer, "MergeTime"); diff --git a/be/src/pipeline/exec/partitioned_aggregation_sink_operator.h b/be/src/pipeline/exec/partitioned_aggregation_sink_operator.h index 259d7580877493..15f6b22387a8e2 100644 --- a/be/src/pipeline/exec/partitioned_aggregation_sink_operator.h +++ b/be/src/pipeline/exec/partitioned_aggregation_sink_operator.h @@ -259,7 +259,6 @@ class PartitionedAggSinkLocalState std::unique_ptr _runtime_state; - bool _eos = false; std::shared_ptr _finish_dependency; // temp structures during spilling @@ -310,9 +309,6 @@ class PartitionedAggSinkOperatorX : public DataSinkOperatorXrequire_data_distribution(); } - bool require_shuffled_data_distribution() const override { - return _agg_sink_operator->require_shuffled_data_distribution(); - } Status set_child(OperatorPtr child) override { RETURN_IF_ERROR(DataSinkOperatorX::set_child(child)); diff --git a/be/src/pipeline/exec/partitioned_aggregation_source_operator.cpp b/be/src/pipeline/exec/partitioned_aggregation_source_operator.cpp index 48df5587198b08..655a6e19725a9b 100644 --- a/be/src/pipeline/exec/partitioned_aggregation_source_operator.cpp +++ b/be/src/pipeline/exec/partitioned_aggregation_source_operator.cpp @@ -118,6 
+118,10 @@ Status PartitionedAggSourceOperatorX::close(RuntimeState* state) { return _agg_source_operator->close(state); } +bool PartitionedAggSourceOperatorX::is_serial_operator() const { + return _agg_source_operator->is_serial_operator(); +} + Status PartitionedAggSourceOperatorX::get_block(RuntimeState* state, vectorized::Block* block, bool* eos) { auto& local_state = get_local_state(state); diff --git a/be/src/pipeline/exec/partitioned_aggregation_source_operator.h b/be/src/pipeline/exec/partitioned_aggregation_source_operator.h index edae99c716a925..7e73241745e029 100644 --- a/be/src/pipeline/exec/partitioned_aggregation_source_operator.h +++ b/be/src/pipeline/exec/partitioned_aggregation_source_operator.h @@ -91,6 +91,8 @@ class PartitionedAggSourceOperatorX : public OperatorX bool is_source() const override { return true; } + bool is_serial_operator() const override; + private: friend class PartitionedAggLocalState; diff --git a/be/src/pipeline/exec/partitioned_hash_join_probe_operator.cpp b/be/src/pipeline/exec/partitioned_hash_join_probe_operator.cpp index 018d63a6deebb1..0e56acc1c574b2 100644 --- a/be/src/pipeline/exec/partitioned_hash_join_probe_operator.cpp +++ b/be/src/pipeline/exec/partitioned_hash_join_probe_operator.cpp @@ -557,8 +557,7 @@ Status PartitionedHashJoinProbeOperatorX::push(RuntimeState* state, vectorized:: } { SCOPED_TIMER(local_state._partition_timer); - RETURN_IF_ERROR(local_state._partitioner->do_partitioning(state, input_block, - local_state._mem_tracker.get())); + RETURN_IF_ERROR(local_state._partitioner->do_partitioning(state, input_block)); } std::vector> partition_indexes(_partition_count); diff --git a/be/src/pipeline/exec/partitioned_hash_join_probe_operator.h b/be/src/pipeline/exec/partitioned_hash_join_probe_operator.h index 8cccc9f8faeba6..f8fc0780b6fc3f 100644 --- a/be/src/pipeline/exec/partitioned_hash_join_probe_operator.h +++ b/be/src/pipeline/exec/partitioned_hash_join_probe_operator.h @@ -165,10 +165,7 @@ class PartitionedHashJoinProbeOperatorX final _distribution_partition_exprs)); } - bool require_shuffled_data_distribution() const override { - return _join_op != TJoinOp::NULL_AWARE_LEFT_ANTI_JOIN; - } - bool is_shuffled_hash_join() const override { + bool is_shuffled_operator() const override { return _join_distribution == TJoinDistributionType::PARTITIONED; } diff --git a/be/src/pipeline/exec/partitioned_hash_join_sink_operator.cpp b/be/src/pipeline/exec/partitioned_hash_join_sink_operator.cpp index a7297be493f804..83a205e59c78fb 100644 --- a/be/src/pipeline/exec/partitioned_hash_join_sink_operator.cpp +++ b/be/src/pipeline/exec/partitioned_hash_join_sink_operator.cpp @@ -80,7 +80,7 @@ size_t PartitionedHashJoinSinkLocalState::revocable_mem_size(RuntimeState* state if (inner_sink_state_) { auto inner_sink_state = assert_cast(inner_sink_state_); - return inner_sink_state->_build_side_mem_used; + return inner_sink_state->_build_blocks_memory_usage->value(); } } return 0; @@ -161,7 +161,7 @@ Status PartitionedHashJoinSinkLocalState::_revoke_unpartitioned_block(RuntimeSta { SCOPED_TIMER(_partition_timer); - (void)_partitioner->do_partitioning(state, &sub_block, _mem_tracker.get()); + (void)_partitioner->do_partitioning(state, &sub_block); } const auto* channel_ids = _partitioner->get_channel_ids().get(); @@ -294,7 +294,7 @@ Status PartitionedHashJoinSinkLocalState::revoke_memory(RuntimeState* state) { return Status::OK(); }(); - if (!status.OK()) { + if (!status.ok()) { std::unique_lock lock(_spill_lock); _dependency->set_ready(); 
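The one-character fix above (`status.OK()` to `status.ok()`) lives inside a recurring idiom in these files: the fallible spill work is wrapped in an immediately invoked lambda so a single `Status` can be checked, and the error path (set the dependency ready, record the failure) runs once. A self-contained sketch of the idiom with a stand-in `StatusSketch` type (not the real doris::Status):

#include <iostream>
#include <string>
#include <utility>

// Stand-in for doris::Status, just enough to show the idiom; not the real API.
class StatusSketch {
public:
    static StatusSketch OK() { return {true, ""}; }
    static StatusSketch InternalError(std::string msg) { return {false, std::move(msg)}; }
    bool ok() const { return _ok; }
    const std::string& msg() const { return _msg; }

private:
    StatusSketch(bool ok, std::string msg) : _ok(ok), _msg(std::move(msg)) {}
    bool _ok;
    std::string _msg;
};

int main() {
    // Fallible work wrapped in an immediately invoked lambda, checked once.
    auto status = [&]() -> StatusSketch {
        // ... spill / flush work would run here ...
        return StatusSketch::OK();
    }();
    if (!status.ok()) { // lower-case ok(), as the patch fixes
        std::cerr << "spill failed: " << status.msg() << '\n';
        return 1;
    }
    return 0;
}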
_spill_status_ok = false; @@ -334,7 +334,7 @@ Status PartitionedHashJoinSinkLocalState::_partition_block(RuntimeState* state, { /// TODO: DO NOT execute build exprs twice(when partition and building hash table) SCOPED_TIMER(_partition_timer); - RETURN_IF_ERROR(_partitioner->do_partitioning(state, in_block, _mem_tracker.get())); + RETURN_IF_ERROR(_partitioner->do_partitioning(state, in_block)); } auto& p = _parent->cast(); diff --git a/be/src/pipeline/exec/partitioned_hash_join_sink_operator.h b/be/src/pipeline/exec/partitioned_hash_join_sink_operator.h index 1376964663f7f3..8e89763b50a9d5 100644 --- a/be/src/pipeline/exec/partitioned_hash_join_sink_operator.h +++ b/be/src/pipeline/exec/partitioned_hash_join_sink_operator.h @@ -115,10 +115,7 @@ class PartitionedHashJoinSinkOperatorX _distribution_partition_exprs); } - bool require_shuffled_data_distribution() const override { - return _join_op != TJoinOp::NULL_AWARE_LEFT_ANTI_JOIN; - } - bool is_shuffled_hash_join() const override { + bool is_shuffled_operator() const override { return _join_distribution == TJoinDistributionType::PARTITIONED; } diff --git a/be/src/pipeline/exec/repeat_operator.cpp b/be/src/pipeline/exec/repeat_operator.cpp index d355d99c2e352f..5c94d43f0d1e05 100644 --- a/be/src/pipeline/exec/repeat_operator.cpp +++ b/be/src/pipeline/exec/repeat_operator.cpp @@ -46,6 +46,16 @@ Status RepeatLocalState::open(RuntimeState* state) { return Status::OK(); } +Status RepeatLocalState::init(RuntimeState* state, LocalStateInfo& info) { + RETURN_IF_ERROR(Base::init(state, info)); + SCOPED_TIMER(exec_time_counter()); + SCOPED_TIMER(_init_timer); + _evaluate_input_timer = ADD_TIMER(profile(), "EvaluateInputDataTime"); + _get_repeat_data_timer = ADD_TIMER(profile(), "GetRepeatDataTime"); + _filter_timer = ADD_TIMER(profile(), "FilterTime"); + return Status::OK(); +} + Status RepeatOperatorX::init(const TPlanNode& tnode, RuntimeState* state) { RETURN_IF_ERROR(OperatorXBase::init(tnode, state)); RETURN_IF_ERROR(vectorized::VExpr::create_expr_trees(tnode.repeat_node.exprs, _expr_ctxs)); @@ -166,23 +176,24 @@ Status RepeatLocalState::add_grouping_id_column(std::size_t rows, std::size_t& c Status RepeatOperatorX::push(RuntimeState* state, vectorized::Block* input_block, bool eos) const { auto& local_state = get_local_state(state); + SCOPED_TIMER(local_state._evaluate_input_timer); local_state._child_eos = eos; - auto& _intermediate_block = local_state._intermediate_block; - auto& _expr_ctxs = local_state._expr_ctxs; - DCHECK(!_intermediate_block || _intermediate_block->rows() == 0); + auto& intermediate_block = local_state._intermediate_block; + auto& expr_ctxs = local_state._expr_ctxs; + DCHECK(!intermediate_block || intermediate_block->rows() == 0); if (input_block->rows() > 0) { - _intermediate_block = vectorized::Block::create_unique(); + intermediate_block = vectorized::Block::create_unique(); - for (auto& expr : _expr_ctxs) { + for (auto& expr : expr_ctxs) { int result_column_id = -1; RETURN_IF_ERROR(expr->execute(input_block, &result_column_id)); DCHECK(result_column_id != -1); input_block->get_by_position(result_column_id).column = input_block->get_by_position(result_column_id) .column->convert_to_full_column_if_const(); - _intermediate_block->insert(input_block->get_by_position(result_column_id)); + intermediate_block->insert(input_block->get_by_position(result_column_id)); } - DCHECK_EQ(_expr_ctxs.size(), _intermediate_block->columns()); + DCHECK_EQ(expr_ctxs.size(), intermediate_block->columns()); } return Status::OK(); @@ -202,36 
+213,41 @@ Status RepeatOperatorX::pull(doris::RuntimeState* state, vectorized::Block* outp } DCHECK(output_block->rows() == 0); - if (_intermediate_block && _intermediate_block->rows() > 0) { - RETURN_IF_ERROR(local_state.get_repeated_block(_intermediate_block.get(), _repeat_id_idx, - output_block)); + { + SCOPED_TIMER(local_state._get_repeat_data_timer); + if (_intermediate_block && _intermediate_block->rows() > 0) { + RETURN_IF_ERROR(local_state.get_repeated_block(_intermediate_block.get(), + _repeat_id_idx, output_block)); - _repeat_id_idx++; + _repeat_id_idx++; - int size = _repeat_id_list.size(); - if (_repeat_id_idx >= size) { - _intermediate_block->clear(); + int size = _repeat_id_list.size(); + if (_repeat_id_idx >= size) { + _intermediate_block->clear(); + _child_block.clear_column_data(_child->row_desc().num_materialized_slots()); + _repeat_id_idx = 0; + } + } else if (local_state._expr_ctxs.empty()) { + auto m_block = vectorized::VectorizedUtils::build_mutable_mem_reuse_block( + output_block, _output_slots); + auto rows = _child_block.rows(); + auto& columns = m_block.mutable_columns(); + + for (int repeat_id_idx = 0; repeat_id_idx < _repeat_id_list.size(); repeat_id_idx++) { + std::size_t cur_col = 0; + RETURN_IF_ERROR( + local_state.add_grouping_id_column(rows, cur_col, columns, repeat_id_idx)); + } _child_block.clear_column_data(_child->row_desc().num_materialized_slots()); - _repeat_id_idx = 0; } - } else if (local_state._expr_ctxs.empty()) { - auto m_block = vectorized::VectorizedUtils::build_mutable_mem_reuse_block(output_block, - _output_slots); - auto rows = _child_block.rows(); - auto& columns = m_block.mutable_columns(); - - for (int repeat_id_idx = 0; repeat_id_idx < _repeat_id_list.size(); repeat_id_idx++) { - std::size_t cur_col = 0; - RETURN_IF_ERROR( - local_state.add_grouping_id_column(rows, cur_col, columns, repeat_id_idx)); - } - _child_block.clear_column_data(_child->row_desc().num_materialized_slots()); } - RETURN_IF_ERROR(vectorized::VExprContext::filter_block(local_state._conjuncts, output_block, - output_block->columns())); + { + SCOPED_TIMER(local_state._filter_timer); + RETURN_IF_ERROR(vectorized::VExprContext::filter_block(local_state._conjuncts, output_block, + output_block->columns())); + } *eos = _child_eos && _child_block.rows() == 0; local_state.reached_limit(output_block, eos); - COUNTER_SET(local_state._rows_returned_counter, local_state._num_rows_returned); return Status::OK(); } diff --git a/be/src/pipeline/exec/repeat_operator.h b/be/src/pipeline/exec/repeat_operator.h index 22398df372ae65..31f88f37231aaa 100644 --- a/be/src/pipeline/exec/repeat_operator.h +++ b/be/src/pipeline/exec/repeat_operator.h @@ -36,6 +36,7 @@ class RepeatLocalState final : public PipelineXLocalState { using Base = PipelineXLocalState; RepeatLocalState(RuntimeState* state, OperatorXBase* parent); + Status init(RuntimeState* state, LocalStateInfo& info) override; Status open(RuntimeState* state) override; Status get_repeated_block(vectorized::Block* child_block, int repeat_id_idx, @@ -53,6 +54,10 @@ class RepeatLocalState final : public PipelineXLocalState { int _repeat_id_idx; std::unique_ptr _intermediate_block; vectorized::VExprContextSPtrs _expr_ctxs; + + RuntimeProfile::Counter* _evaluate_input_timer = nullptr; + RuntimeProfile::Counter* _get_repeat_data_timer = nullptr; + RuntimeProfile::Counter* _filter_timer = nullptr; }; class RepeatOperatorX final : public StatefulOperatorX { diff --git a/be/src/pipeline/exec/result_file_sink_operator.cpp 
b/be/src/pipeline/exec/result_file_sink_operator.cpp index a11c4df6625aa2..bc4e4c88d14ca7 100644 --- a/be/src/pipeline/exec/result_file_sink_operator.cpp +++ b/be/src/pipeline/exec/result_file_sink_operator.cpp @@ -31,9 +31,7 @@ namespace doris::pipeline { ResultFileSinkLocalState::ResultFileSinkLocalState(DataSinkOperatorXBase* parent, RuntimeState* state) - : AsyncWriterSink(parent, state), - _serializer( - std::make_unique>(this)) {} + : AsyncWriterSink(parent, state) {} ResultFileSinkOperatorX::ResultFileSinkOperatorX(int operator_id, const RowDescriptor& row_desc, const std::vector& t_output_expr) @@ -87,12 +85,6 @@ Status ResultFileSinkLocalState::init(RuntimeState* state, LocalSinkStateInfo& i SCOPED_TIMER(_init_timer); _sender_id = info.sender_id; - _brpc_wait_timer = ADD_TIMER(_profile, "BrpcSendTime.Wait"); - _local_send_timer = ADD_TIMER(_profile, "LocalSendTime"); - _brpc_send_timer = ADD_TIMER(_profile, "BrpcSendTime"); - _split_block_distribute_by_channel_timer = - ADD_TIMER(_profile, "SplitBlockDistributeByChannelTime"); - _brpc_send_timer = ADD_TIMER(_profile, "BrpcSendTime"); auto& p = _parent->cast(); CHECK(p._file_opts.get() != nullptr); // create sender @@ -103,6 +95,7 @@ Status ResultFileSinkLocalState::init(RuntimeState* state, LocalSinkStateInfo& i state->fragment_instance_id(), p._buf_size, &_sender, state->execution_timeout(), state->batch_size())); } + _sender->set_dependency(state->fragment_instance_id(), _dependency->shared_from_this()); // create writer _writer.reset(new (std::nothrow) vectorized::VFileResultWriter( @@ -145,14 +138,6 @@ Status ResultFileSinkLocalState::close(RuntimeState* state, Status exec_status) return Base::close(state, exec_status); } -template -void ResultFileSinkLocalState::_handle_eof_channel(RuntimeState* state, ChannelPtrType channel, - Status st) { - channel->set_receiver_eof(st); - // Chanel will not send RPC to the downstream when eof, so close chanel by OK status. 
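The `_sender->set_dependency(...)` line added above wires the result sender to the sink's pipeline dependency, so back-pressure from the client buffer blocks and wakes the pipeline task instead of having it poll. A rough sketch of that hand-off, with hypothetical types standing in for Doris's BufferControlBlock and Dependency:

#include <atomic>

class DependencySketch {
public:
    void block() { _ready.store(false, std::memory_order_release); }
    void set_ready() { _ready.store(true, std::memory_order_release); }
    bool ready() const { return _ready.load(std::memory_order_acquire); }

private:
    std::atomic<bool> _ready {true};
};

class ResultBufferSketch {
public:
    void set_dependency(DependencySketch* dep) { _dep = dep; }
    // Producer side: the buffer filled up, make the pipeline task unschedulable.
    void on_buffer_full() { if (_dep) { _dep->block(); } }
    // Consumer side: the client drained rows, wake the pipeline task again.
    void on_buffer_drained() { if (_dep) { _dep->set_ready(); } }

private:
    DependencySketch* _dep = nullptr;
};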
- static_cast(channel->close(state, Status::OK())); -} - Status ResultFileSinkOperatorX::sink(RuntimeState* state, vectorized::Block* in_block, bool eos) { auto& local_state = get_local_state(state); SCOPED_TIMER(local_state.exec_time_counter()); diff --git a/be/src/pipeline/exec/result_file_sink_operator.h b/be/src/pipeline/exec/result_file_sink_operator.h index e99eb709a9f4e7..e9f2b8eeb9c670 100644 --- a/be/src/pipeline/exec/result_file_sink_operator.h +++ b/be/src/pipeline/exec/result_file_sink_operator.h @@ -21,10 +21,6 @@ #include "vec/sink/writer/vfile_result_writer.h" namespace doris::vectorized { -template -class BlockSerializer; -template -class Channel; class BroadcastPBlockHolder; } // namespace doris::vectorized @@ -44,31 +40,12 @@ class ResultFileSinkLocalState final [[nodiscard]] int sender_id() const { return _sender_id; } - RuntimeProfile::Counter* brpc_wait_timer() { return _brpc_wait_timer; } - RuntimeProfile::Counter* local_send_timer() { return _local_send_timer; } - RuntimeProfile::Counter* brpc_send_timer() { return _brpc_send_timer; } - RuntimeProfile::Counter* merge_block_timer() { return _merge_block_timer; } - RuntimeProfile::Counter* split_block_distribute_by_channel_timer() { - return _split_block_distribute_by_channel_timer; - } - private: friend class ResultFileSinkOperatorX; - template - void _handle_eof_channel(RuntimeState* state, ChannelPtrType channel, Status st); - std::shared_ptr _sender; - std::vector*> _channels; - std::unique_ptr> _serializer; std::shared_ptr _block_holder; - RuntimeProfile::Counter* _brpc_wait_timer = nullptr; - RuntimeProfile::Counter* _local_send_timer = nullptr; - RuntimeProfile::Counter* _brpc_send_timer = nullptr; - RuntimeProfile::Counter* _merge_block_timer = nullptr; - RuntimeProfile::Counter* _split_block_distribute_by_channel_timer = nullptr; - int _sender_id; }; diff --git a/be/src/pipeline/exec/result_sink_operator.cpp b/be/src/pipeline/exec/result_sink_operator.cpp index 0608beaf522290..15612168affd89 100644 --- a/be/src/pipeline/exec/result_sink_operator.cpp +++ b/be/src/pipeline/exec/result_sink_operator.cpp @@ -17,11 +17,12 @@ #include "result_sink_operator.h" +#include +#include + #include -#include #include "common/config.h" -#include "common/object_pool.h" #include "exec/rowid_fetcher.h" #include "pipeline/exec/operator.h" #include "runtime/buffer_control_block.h" @@ -39,13 +40,12 @@ Status ResultSinkLocalState::init(RuntimeState* state, LocalSinkStateInfo& info) RETURN_IF_ERROR(Base::init(state, info)); SCOPED_TIMER(exec_time_counter()); SCOPED_TIMER(_init_timer); + _fetch_row_id_timer = ADD_TIMER(profile(), "FetchRowIdTime"); + _write_data_timer = ADD_TIMER(profile(), "WriteDataTime"); static const std::string timer_name = "WaitForDependencyTime"; _wait_for_dependency_timer = ADD_TIMER_WITH_LEVEL(_profile, timer_name, 1); auto fragment_instance_id = state->fragment_instance_id(); - _blocks_sent_counter = ADD_COUNTER_WITH_LEVEL(_profile, "BlocksProduced", TUnit::UNIT, 1); - _rows_sent_counter = ADD_COUNTER_WITH_LEVEL(_profile, "RowsProduced", TUnit::UNIT, 1); - if (state->query_options().enable_parallel_result_sink) { _sender = _parent->cast()._sender; } else { @@ -81,7 +81,8 @@ Status ResultSinkLocalState::open(RuntimeState* state) { } case TResultSinkType::ARROW_FLIGHT_PROTOCAL: { std::shared_ptr arrow_schema; - RETURN_IF_ERROR(convert_expr_ctxs_arrow_schema(_output_vexpr_ctxs, &arrow_schema)); + RETURN_IF_ERROR(convert_expr_ctxs_arrow_schema(_output_vexpr_ctxs, &arrow_schema, + state->timezone())); if 
(state->query_options().enable_parallel_result_sink) { state->exec_env()->result_mgr()->register_arrow_schema(state->query_id(), arrow_schema); } else { @@ -143,12 +144,15 @@ Status ResultSinkOperatorX::open(RuntimeState* state) { Status ResultSinkOperatorX::sink(RuntimeState* state, vectorized::Block* block, bool eos) { auto& local_state = get_local_state(state); SCOPED_TIMER(local_state.exec_time_counter()); - COUNTER_UPDATE(local_state.rows_sent_counter(), (int64_t)block->rows()); - COUNTER_UPDATE(local_state.blocks_sent_counter(), 1); + COUNTER_UPDATE(local_state.rows_input_counter(), (int64_t)block->rows()); if (_fetch_option.use_two_phase_fetch && block->rows() > 0) { + SCOPED_TIMER(local_state._fetch_row_id_timer); RETURN_IF_ERROR(_second_phase_fetch_data(state, block)); } - RETURN_IF_ERROR(local_state._writer->write(state, *block)); + { + SCOPED_TIMER(local_state._write_data_timer); + RETURN_IF_ERROR(local_state._writer->write(state, *block)); + } if (_fetch_option.use_two_phase_fetch) { // Block structure may be changed by calling _second_phase_fetch_data(). // So we should clear block in case of unmatched columns @@ -188,9 +192,10 @@ Status ResultSinkLocalState::close(RuntimeState* state, Status exec_status) { final_status = st; } - LOG_INFO("Query {} result sink closed with status {} and has written {} rows", - print_id(state->query_id()), final_status.to_string_no_stack(), - _writer->get_written_rows()); + VLOG_NOTICE << fmt::format( + "Query {} result sink closed with status {} and has written {} rows", + print_id(state->query_id()), final_status.to_string_no_stack(), + _writer->get_written_rows()); } // close sender, this is normal path end diff --git a/be/src/pipeline/exec/result_sink_operator.h b/be/src/pipeline/exec/result_sink_operator.h index 3c503096ecb51e..339c167825643b 100644 --- a/be/src/pipeline/exec/result_sink_operator.h +++ b/be/src/pipeline/exec/result_sink_operator.h @@ -128,8 +128,6 @@ class ResultSinkLocalState final : public PipelineXSinkLocalState _sender = nullptr; std::shared_ptr _writer = nullptr; - RuntimeProfile::Counter* _blocks_sent_counter = nullptr; - RuntimeProfile::Counter* _rows_sent_counter = nullptr; + + RuntimeProfile::Counter* _fetch_row_id_timer = nullptr; + RuntimeProfile::Counter* _write_data_timer = nullptr; }; class ResultSinkOperatorX final : public DataSinkOperatorX { diff --git a/be/src/pipeline/exec/scan_operator.cpp b/be/src/pipeline/exec/scan_operator.cpp index b81a64c7dfe5fb..21c3103fe5a708 100644 --- a/be/src/pipeline/exec/scan_operator.cpp +++ b/be/src/pipeline/exec/scan_operator.cpp @@ -73,7 +73,7 @@ Status ScanLocalState::init(RuntimeState* state, LocalStateInfo& info) SCOPED_TIMER(exec_time_counter()); SCOPED_TIMER(_init_timer); auto& p = _parent->cast(); - RETURN_IF_ERROR(RuntimeFilterConsumer::init(state, p.ignore_data_distribution())); + RETURN_IF_ERROR(RuntimeFilterConsumer::init(state, p.is_serial_operator())); // init profile for runtime filter RuntimeFilterConsumer::_init_profile(profile()); init_runtime_filter_dependency(_filter_dependencies, p.operator_id(), p.node_id(), @@ -305,18 +305,15 @@ Status ScanLocalState::_normalize_predicate( RETURN_IF_PUSH_DOWN(_normalize_noneq_binary_predicate( cur_expr, context, slot, value_range, &pdt), status); - if (_is_key_column(slot->col_name())) { + RETURN_IF_PUSH_DOWN( + _normalize_bitmap_filter(cur_expr, context, slot, &pdt), + status); + RETURN_IF_PUSH_DOWN( + _normalize_bloom_filter(cur_expr, context, slot, &pdt), status); + if (state()->enable_function_pushdown()) { 
RETURN_IF_PUSH_DOWN( - _normalize_bitmap_filter(cur_expr, context, slot, &pdt), + _normalize_function_filters(cur_expr, context, slot, &pdt), status); - RETURN_IF_PUSH_DOWN( - _normalize_bloom_filter(cur_expr, context, slot, &pdt), - status); - if (state()->enable_function_pushdown()) { - RETURN_IF_PUSH_DOWN(_normalize_function_filters( - cur_expr, context, slot, &pdt), - status); - } } }, *range); @@ -330,8 +327,7 @@ Status ScanLocalState::_normalize_predicate( return Status::OK(); } - if (pdt == PushDownType::ACCEPTABLE && - (_is_key_column(slot->col_name()) || _storage_no_merge())) { + if (pdt == PushDownType::ACCEPTABLE && (_is_key_column(slot->col_name()))) { output_expr = nullptr; return Status::OK(); } else { @@ -379,7 +375,7 @@ Status ScanLocalState::_normalize_bloom_filter(vectorized::VExpr* expr, vectorized::VExprContext* expr_ctx, SlotDescriptor* slot, PushDownType* pdt) { if (TExprNodeType::BLOOM_PRED == expr->node_type()) { - DCHECK(expr->children().size() == 1); + DCHECK(expr->get_num_children() == 1); PushDownType temp_pdt = _should_push_down_bloom_filter(); if (temp_pdt != PushDownType::UNACCEPTABLE) { _filter_predicates.bloom_filters.emplace_back(slot->col_name(), @@ -395,7 +391,7 @@ Status ScanLocalState::_normalize_bitmap_filter(vectorized::VExpr* expr vectorized::VExprContext* expr_ctx, SlotDescriptor* slot, PushDownType* pdt) { if (TExprNodeType::BITMAP_PRED == expr->node_type()) { - DCHECK(expr->children().size() == 1); + DCHECK(expr->get_num_children() == 1); PushDownType temp_pdt = _should_push_down_bitmap_filter(); if (temp_pdt != PushDownType::UNACCEPTABLE) { _filter_predicates.bitmap_filters.emplace_back(slot->col_name(), @@ -524,7 +520,7 @@ Status ScanLocalState::_eval_const_conjuncts(vectorized::VExpr* vexpr, if (vexpr->is_constant()) { std::shared_ptr const_col_wrapper; RETURN_IF_ERROR(vexpr->get_const_col(expr_ctx, &const_col_wrapper)); - if (const vectorized::ColumnConst* const_column = + if (const auto* const_column = check_and_get_column(const_col_wrapper->column_ptr)) { constant_val = const_cast(const_column->get_data_at(0).data); if (constant_val == nullptr || !*reinterpret_cast(constant_val)) { @@ -532,7 +528,7 @@ Status ScanLocalState::_eval_const_conjuncts(vectorized::VExpr* vexpr, _eos = true; _scan_dependency->set_ready(); } - } else if (const vectorized::ColumnVector* bool_column = + } else if (const auto* bool_column = check_and_get_column>( const_col_wrapper->column_ptr)) { // TODO: If `vexpr->is_constant()` is true, a const column is expected here. 
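For context on `_eval_const_conjuncts` above: when a conjunct folds to a constant NULL or false, no row can match, so the scan marks itself EOS and readies its dependency before any scanner is opened. A compilable toy version of the short-circuit, with the column machinery reduced to std::optional<bool> (illustrative only):

#include <cassert>
#include <optional>

// nullopt models a NULL constant; NULL and false both eliminate every row.
inline bool scan_is_eos_after_const_fold(std::optional<bool> folded) {
    return !folded.has_value() || !*folded;
}

int main() {
    assert(scan_is_eos_after_const_fold(false));        // WHERE 1 = 0
    assert(scan_is_eos_after_const_fold(std::nullopt)); // WHERE NULL
    assert(!scan_is_eos_after_const_fold(true));        // WHERE 1 = 1
    return 0;
}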
@@ -583,24 +579,21 @@ Status ScanLocalState::_normalize_in_and_eq_predicate(vectorized::VExpr if (hybrid_set->size() <= _parent->cast()._max_pushdown_conditions_per_column) { iter = hybrid_set->begin(); - } else if (_is_key_column(slot->col_name()) || _storage_no_merge()) { + } else { _filter_predicates.in_filters.emplace_back(slot->col_name(), expr->get_set_func()); *pdt = PushDownType::ACCEPTABLE; return Status::OK(); - } else { - *pdt = PushDownType::UNACCEPTABLE; - return Status::OK(); } } else { // normal in predicate - vectorized::VInPredicate* pred = static_cast(expr); + auto* pred = static_cast(expr); PushDownType temp_pdt = _should_push_down_in_predicate(pred, expr_ctx, false); if (temp_pdt == PushDownType::UNACCEPTABLE) { return Status::OK(); } // begin to push InPredicate value into ColumnValueRange - vectorized::InState* state = reinterpret_cast( + auto* state = reinterpret_cast( expr_ctx->fn_context(pred->fn_context_index()) ->get_function_state(FunctionContext::FRAGMENT_LOCAL)); @@ -619,7 +612,7 @@ Status ScanLocalState::_normalize_in_and_eq_predicate(vectorized::VExpr iter->next(); continue; } - auto value = const_cast(iter->get_value()); + auto* value = const_cast(iter->get_value()); RETURN_IF_ERROR(_change_value_range( temp_range, value, ColumnValueRange::add_fixed_value_range, "")); iter->next(); @@ -627,7 +620,7 @@ Status ScanLocalState::_normalize_in_and_eq_predicate(vectorized::VExpr range.intersection(temp_range); *pdt = PushDownType::ACCEPTABLE; } else if (TExprNodeType::BINARY_PRED == expr->node_type()) { - DCHECK(expr->children().size() == 2); + DCHECK(expr->get_num_children() == 2); auto eq_checker = [](const std::string& fn_name) { return fn_name == "eq"; }; StringRef value; @@ -776,7 +769,7 @@ Status ScanLocalState::_normalize_not_in_and_not_eq_predicate( iter->next(); } } else if (TExprNodeType::BINARY_PRED == expr->node_type()) { - DCHECK(expr->children().size() == 2); + DCHECK(expr->get_num_children() == 2); auto ne_checker = [](const std::string& fn_name) { return fn_name == "ne"; }; StringRef value; @@ -931,7 +924,7 @@ Status ScanLocalState::_normalize_noneq_binary_predicate( vectorized::VExpr* expr, vectorized::VExprContext* expr_ctx, SlotDescriptor* slot, ColumnValueRange& range, PushDownType* pdt) { if (TExprNodeType::BINARY_PRED == expr->node_type()) { - DCHECK(expr->children().size() == 2); + DCHECK(expr->get_num_children() == 2); auto noneq_checker = [](const std::string& fn_name) { return fn_name != "ne" && fn_name != "eq" && fn_name != "eq_for_null"; @@ -997,16 +990,7 @@ Status ScanLocalState::_start_scanners( auto& p = _parent->cast(); _scanner_ctx = vectorized::ScannerContext::create_shared( state(), this, p._output_tuple_desc, p.output_row_descriptor(), scanners, p.limit(), - state()->scan_queue_mem_limit(), _scan_dependency, - // NOTE: This will logic makes _max_thread_num of ScannerContext to be C(num of cores) * 2 - // For a query with C/2 instance and M scan node, scan task of this query will be C/2 * M * C*2 - // and will be C*C*N at most. - // 1. If data distribution is ignored , we use 1 instance to scan. - // 2. Else if this operator is not file scan operator, we use config::doris_scanner_thread_pool_thread_num scanners to scan. - // 3. Else, file scanner will consume much memory so we use config::doris_scanner_thread_pool_thread_num / query_parallel_instance_num scanners to scan. - p.ignore_data_distribution() || !p.is_file_scan_operator() - ? 
1 : state()->query_parallel_instance_num()); + _scan_dependency, p.is_serial_operator(), p.is_file_scan_operator()); return Status::OK(); } @@ -1064,13 +1048,10 @@ Status ScanLocalState::_init_profile() { ADD_COUNTER(_scanner_profile, "NewlyCreateFreeBlocksNum", TUnit::UNIT); _scale_up_scanners_counter = ADD_COUNTER(_scanner_profile, "NumScaleUpScanners", TUnit::UNIT); // time of transfer thread to wait for block from scan thread - _scanner_wait_batch_timer = ADD_TIMER(_scanner_profile, "ScannerBatchWaitTime"); _scanner_sched_counter = ADD_COUNTER(_scanner_profile, "ScannerSchedCount", TUnit::UNIT); - _scanner_ctx_sched_time = ADD_TIMER(_scanner_profile, "ScannerCtxSchedTime"); _scan_timer = ADD_TIMER(_scanner_profile, "ScannerGetBlockTime"); _scan_cpu_timer = ADD_TIMER(_scanner_profile, "ScannerCpuTime"); - _convert_block_timer = ADD_TIMER(_scanner_profile, "ScannerConvertBlockTime"); _filter_timer = ADD_TIMER(_scanner_profile, "ScannerFilterTime"); // time of scan thread to wait for worker thread of the thread pool @@ -1080,6 +1061,13 @@ Status ScanLocalState::_init_profile() { _peak_running_scanner = _scanner_profile->AddHighWaterMarkCounter("PeakRunningScanner", TUnit::UNIT); + + // Rows read from storage. + // Includes rows served from the Doris page cache. + _scan_rows = ADD_COUNTER(_runtime_profile, "ScanRows", TUnit::UNIT); + // Size of the data read from storage. + // Does not include rows served from the Doris page cache. + _scan_bytes = ADD_COUNTER(_runtime_profile, "ScanBytes", TUnit::BYTES); return Status::OK(); } @@ -1154,6 +1142,8 @@ ScanOperatorX::ScanOperatorX(ObjectPool* pool, const TPlanNode& : OperatorX(pool, tnode, operator_id, descs), _runtime_filter_descs(tnode.runtime_filters), _parallel_tasks(parallel_tasks) { + OperatorX::_is_serial_operator = + tnode.__isset.is_serial_operator && tnode.is_serial_operator; if (tnode.__isset.push_down_count) { _push_down_count = tnode.push_down_count; } @@ -1291,6 +1281,7 @@ Status ScanOperatorX::get_block(RuntimeState* state, vectorized: if (*eos) { // reach limit, stop the scanners.
local_state._scanner_ctx->stop_scanners(state); + local_state._scanner_profile->add_info_string("EOS", "True"); } return Status::OK(); diff --git a/be/src/pipeline/exec/scan_operator.h b/be/src/pipeline/exec/scan_operator.h index 28dbd01280f3c8..5d41c800383bd0 100644 --- a/be/src/pipeline/exec/scan_operator.h +++ b/be/src/pipeline/exec/scan_operator.h @@ -102,8 +102,6 @@ class ScanLocalStateBase : public PipelineXLocalState<>, public RuntimeFilterCon std::shared_ptr _scanner_profile; RuntimeProfile::Counter* _scanner_sched_counter = nullptr; - RuntimeProfile::Counter* _scanner_ctx_sched_time = nullptr; - RuntimeProfile::Counter* _scanner_wait_batch_timer = nullptr; RuntimeProfile::Counter* _scanner_wait_worker_timer = nullptr; // Num of newly created free blocks when running query RuntimeProfile::Counter* _newly_create_free_blocks_num = nullptr; @@ -114,8 +112,6 @@ class ScanLocalStateBase : public PipelineXLocalState<>, public RuntimeFilterCon // time of get block from scanner RuntimeProfile::Counter* _scan_timer = nullptr; RuntimeProfile::Counter* _scan_cpu_timer = nullptr; - // time of convert input block to output block from scanner - RuntimeProfile::Counter* _convert_block_timer = nullptr; // time of filter output block from scanner RuntimeProfile::Counter* _filter_timer = nullptr; RuntimeProfile::Counter* _memory_usage_counter = nullptr; @@ -128,6 +124,9 @@ class ScanLocalStateBase : public PipelineXLocalState<>, public RuntimeFilterCon RuntimeProfile::Counter* _num_scanners = nullptr; RuntimeProfile::Counter* _wait_for_rf_timer = nullptr; + + RuntimeProfile::Counter* _scan_rows = nullptr; + RuntimeProfile::Counter* _scan_bytes = nullptr; }; template @@ -359,7 +358,15 @@ class ScanOperatorX : public OperatorX { Status get_block(RuntimeState* state, vectorized::Block* block, bool* eos) override; Status get_block_after_projects(RuntimeState* state, vectorized::Block* block, bool* eos) override { - return get_block(state, block, eos); + Status status = get_block(state, block, eos); + if (status.ok()) { + if (auto rows = block->rows()) { + auto* local_state = state->get_local_state(operator_id()); + COUNTER_UPDATE(local_state->_rows_returned_counter, rows); + COUNTER_UPDATE(local_state->_blocks_returned_counter, 1); + } + } + return status; } [[nodiscard]] bool is_source() const override { return true; } @@ -372,8 +379,8 @@ class ScanOperatorX : public OperatorX { TPushAggOp::type get_push_down_agg_type() { return _push_down_agg_type; } DataDistribution required_data_distribution() const override { - if (OperatorX::ignore_data_distribution()) { - // `ignore_data_distribution()` returns true means we ignore the distribution. + if (OperatorX::is_serial_operator()) { + // `is_serial_operator()` returns true means we ignore the distribution. 
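// [Editor's note] A standalone sketch (not part of this patch) of the choice the
// override above makes; the enum and function names below are illustrative
// assumptions. A serial operator runs as a single task, so it opts out of
// hash re-distribution entirely:
enum class ExchangeKindSketch { NOOP, BUCKET_HASH_SHUFFLE };
ExchangeKindSketch pick_distribution_sketch(bool is_serial_operator) {
    // NOOP leaves rows where they are; BUCKET_HASH_SHUFFLE rebalances by bucket hash.
    return is_serial_operator ? ExchangeKindSketch::NOOP
                              : ExchangeKindSketch::BUCKET_HASH_SHUFFLE;
}
// The actual override continues below: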
return {ExchangeType::NOOP}; } return {ExchangeType::BUCKET_HASH_SHUFFLE}; diff --git a/be/src/pipeline/exec/schema_scan_operator.cpp b/be/src/pipeline/exec/schema_scan_operator.cpp index fcc1ed2bbb184a..006ecf8ad82e84 100644 --- a/be/src/pipeline/exec/schema_scan_operator.cpp +++ b/be/src/pipeline/exec/schema_scan_operator.cpp @@ -266,6 +266,9 @@ Status SchemaScanOperatorX::get_block(RuntimeState* state, vectorized::Block* bl } while (block->rows() == 0 && !*eos); local_state.reached_limit(block, eos); + if (*eos) { + local_state._finish_dependency->set_always_ready(); + } return Status::OK(); } diff --git a/be/src/pipeline/exec/set_probe_sink_operator.cpp b/be/src/pipeline/exec/set_probe_sink_operator.cpp index 955f956f60d6fe..4c250d5603b499 100644 --- a/be/src/pipeline/exec/set_probe_sink_operator.cpp +++ b/be/src/pipeline/exec/set_probe_sink_operator.cpp @@ -71,12 +71,16 @@ Status SetProbeSinkOperatorX::sink(RuntimeState* state, vectorized auto probe_rows = in_block->rows(); if (probe_rows > 0) { - RETURN_IF_ERROR(_extract_probe_column(local_state, *in_block, local_state._probe_columns, - _cur_child_id)); + { + SCOPED_TIMER(local_state._extract_probe_data_timer); + RETURN_IF_ERROR(_extract_probe_column(local_state, *in_block, + local_state._probe_columns, _cur_child_id)); + } RETURN_IF_ERROR(std::visit( [&](auto&& arg) -> Status { using HashTableCtxType = std::decay_t; if constexpr (!std::is_same_v) { + SCOPED_TIMER(local_state._probe_timer); vectorized::HashTableProbe process_hashtable_ctx(&local_state, probe_rows); return process_hashtable_ctx.mark_data_in_hashtable(arg); @@ -85,7 +89,7 @@ Status SetProbeSinkOperatorX::sink(RuntimeState* state, vectorized __builtin_unreachable(); } }, - *local_state._shared_state->hash_table_variants)); + local_state._shared_state->hash_table_variants->method_variant)); } if (eos) { @@ -99,6 +103,9 @@ Status SetProbeSinkLocalState::init(RuntimeState* state, LocalSink RETURN_IF_ERROR(Base::init(state, info)); SCOPED_TIMER(exec_time_counter()); SCOPED_TIMER(_init_timer); + + _probe_timer = ADD_TIMER(Base::profile(), "ProbeTime"); + _extract_probe_data_timer = ADD_TIMER(Base::profile(), "ExtractProbeDataTime"); Parent& parent = _parent->cast(); _shared_state->probe_finished_children_dependency[parent._cur_child_id] = _dependency; _dependency->block(); @@ -183,7 +190,7 @@ void SetProbeSinkOperatorX::_finalize_probe( valid_element_in_hash_tbl = arg.hash_table->size(); } }, - *hash_table_variants); + hash_table_variants->method_variant); } local_state._probe_columns.resize( local_state._shared_state->child_exprs_lists[_cur_child_id + 1].size()); @@ -203,52 +210,58 @@ void SetProbeSinkOperatorX::_refresh_hash_table( [&](auto&& arg) { using HashTableCtxType = std::decay_t; if constexpr (!std::is_same_v) { - auto tmp_hash_table = - std::make_shared(); - bool is_need_shrink = - arg.hash_table->should_be_shrink(valid_element_in_hash_tbl); - if (is_intersect || is_need_shrink) { - tmp_hash_table->init_buf_size(size_t( - valid_element_in_hash_tbl / arg.hash_table->get_factor() + 1)); - } - arg.init_iterator(); auto& iter = arg.iterator; auto iter_end = arg.hash_table->end(); - std::visit( - [&](auto is_need_shrink_const) { - while (iter != iter_end) { - auto& mapped = iter->get_second(); - auto it = mapped.begin(); - - if constexpr (is_intersect) { //intersected - if (it->visited) { - it->visited = false; - tmp_hash_table->insert(iter->get_value()); - } - ++iter; - } else { //except - if constexpr (is_need_shrink_const) { - if (!it->visited) { - 
tmp_hash_table->insert(iter->get_value()); - } - } - ++iter; - } - } - }, - vectorized::make_bool_variant(is_need_shrink)); - arg.reset(); - if (is_intersect || is_need_shrink) { + constexpr double need_shrink_ratio = 0.25; + bool is_need_shrink = + is_intersect + ? (valid_element_in_hash_tbl < + arg.hash_table + ->size()) // For intersect, shrink whenever the element count has decreased + : (valid_element_in_hash_tbl < + arg.hash_table->size() * + need_shrink_ratio); // For except, shrink only after the element count drops below 'need_shrink_ratio' of the table size + + if (is_need_shrink) { + auto tmp_hash_table = + std::make_shared(); + tmp_hash_table->reserve( + local_state._shared_state->valid_element_in_hash_tbl); + while (iter != iter_end) { + auto& mapped = iter->get_second(); + auto it = mapped.begin(); + + if constexpr (is_intersect) { + if (it->visited) { + it->visited = false; + tmp_hash_table->insert(iter->get_first(), iter->get_second()); + } + } else { + if (!it->visited) { + tmp_hash_table->insert(iter->get_first(), iter->get_second()); + } + } + ++iter; + } arg.hash_table = std::move(tmp_hash_table); + } else if (is_intersect) { + while (iter != iter_end) { + auto& mapped = iter->get_second(); + auto it = mapped.begin(); + it->visited = false; + ++iter; + } } + + arg.reset(); } else { LOG(FATAL) << "FATAL: uninited hash table"; __builtin_unreachable(); } }, - *hash_table_variants); + hash_table_variants->method_variant); } template class SetProbeSinkLocalState; diff --git a/be/src/pipeline/exec/set_probe_sink_operator.h b/be/src/pipeline/exec/set_probe_sink_operator.h index ab53f5358c2a91..368ea812cdfe01 100644 --- a/be/src/pipeline/exec/set_probe_sink_operator.h +++ b/be/src/pipeline/exec/set_probe_sink_operator.h @@ -60,6 +60,9 @@ class SetProbeSinkLocalState final : public PipelineXSinkLocalState @@ -96,8 +99,6 @@ class SetProbeSinkOperatorX final : public DataSinkOperatorX create_shared_state() const override { return nullptr; } private: diff --git a/be/src/pipeline/exec/set_sink_operator.cpp b/be/src/pipeline/exec/set_sink_operator.cpp index 38667293d4854b..539134e53e7fe2 100644 --- a/be/src/pipeline/exec/set_sink_operator.cpp +++ b/be/src/pipeline/exec/set_sink_operator.cpp @@ -24,6 +24,7 @@ #include "vec/core/materialize_block.h" namespace doris::pipeline { +#include "common/compile_check_begin.h" template Status SetSinkOperatorX::sink(RuntimeState* state, vectorized::Block* in_block, @@ -39,8 +40,10 @@ Status SetSinkOperatorX::sink(RuntimeState* state, vectorized::Blo auto& valid_element_in_hash_tbl = local_state._shared_state->valid_element_in_hash_tbl; if (in_block->rows() != 0) { - RETURN_IF_ERROR(local_state._mutable_block.merge(*in_block)); - + { + SCOPED_TIMER(local_state._merge_block_timer); + RETURN_IF_ERROR(local_state._mutable_block.merge(*in_block)); + } if (local_state._mutable_block.rows() > std::numeric_limits::max()) { return Status::NotSupported("set operator do not support build table rows over:" + std::to_string(std::numeric_limits::max())); @@ -48,6 +51,7 @@ Status SetSinkOperatorX::sink(RuntimeState* state, vectorized::Blo } if (eos || local_state._mutable_block.allocated_bytes() >= BUILD_BLOCK_MAX_SIZE) { + SCOPED_TIMER(local_state._build_timer); build_block = local_state._mutable_block.to_block(); RETURN_IF_ERROR(_process_build_block(local_state, build_block, state)); local_state._mutable_block.clear(); @@ -63,7 +67,7 @@ Status SetSinkOperatorX::sink(RuntimeState* state, vectorized::Blo valid_element_in_hash_tbl = arg.hash_table->size(); } }, -
*local_state._shared_state->hash_table_variants); + local_state._shared_state->hash_table_variants->method_variant); } local_state._shared_state->probe_finished_children_dependency[_cur_child_id + 1] ->set_ready(); @@ -87,22 +91,22 @@ Status SetSinkOperatorX::_process_build_block( vectorized::materialize_block_inplace(block); vectorized::ColumnRawPtrs raw_ptrs(_child_exprs.size()); RETURN_IF_ERROR(_extract_build_column(local_state, block, raw_ptrs, rows)); - + auto st = Status::OK(); std::visit( [&](auto&& arg) { using HashTableCtxType = std::decay_t; if constexpr (!std::is_same_v) { vectorized::HashTableBuild hash_table_build_process(&local_state, rows, raw_ptrs, state); - static_cast(hash_table_build_process(arg, local_state._arena)); + st = hash_table_build_process(arg, local_state._arena); } else { LOG(FATAL) << "FATAL: uninited hash table"; __builtin_unreachable(); } }, - *local_state._shared_state->hash_table_variants); + local_state._shared_state->hash_table_variants->method_variant); - return Status::OK(); + return st; } template @@ -119,7 +123,7 @@ Status SetSinkOperatorX::_extract_build_column( rows = is_all_const ? 1 : rows; for (size_t i = 0; i < _child_exprs.size(); ++i) { - int result_col_id = result_locs[i]; + size_t result_col_id = result_locs[i]; if (is_all_const) { block.get_by_position(result_col_id).column = @@ -151,6 +155,7 @@ Status SetSinkLocalState::init(RuntimeState* state, LocalSinkState RETURN_IF_ERROR(PipelineXSinkLocalState::init(state, info)); SCOPED_TIMER(exec_time_counter()); SCOPED_TIMER(_init_timer); + _merge_block_timer = ADD_TIMER(_profile, "MergeBlocksTime"); _build_timer = ADD_TIMER(_profile, "BuildTime"); auto& parent = _parent->cast(); _shared_state->probe_finished_children_dependency[parent._cur_child_id] = _dependency; @@ -181,8 +186,8 @@ Status SetSinkLocalState::open(RuntimeState* state) { auto& parent = _parent->cast(); DCHECK(parent._cur_child_id == 0); - _shared_state->hash_table_variants = std::make_unique(); - _shared_state->hash_table_init(); + _shared_state->hash_table_variants = std::make_unique(); + RETURN_IF_ERROR(_shared_state->hash_table_init()); return Status::OK(); } diff --git a/be/src/pipeline/exec/set_sink_operator.h b/be/src/pipeline/exec/set_sink_operator.h index 1c08eddc141f2e..ba387d97b41360 100644 --- a/be/src/pipeline/exec/set_sink_operator.h +++ b/be/src/pipeline/exec/set_sink_operator.h @@ -23,6 +23,7 @@ #include "operator.h" namespace doris { +#include "common/compile_check_begin.h" namespace vectorized { template @@ -48,14 +49,14 @@ class SetSinkLocalState final : public PipelineXSinkLocalState { private: friend class SetSinkOperatorX; - template - friend struct vectorized::HashTableBuild; - RuntimeProfile::Counter* _build_timer; // time to build hash table vectorized::MutableBlock _mutable_block; // every child has its result expr list vectorized::VExprContextSPtrs _child_exprs; vectorized::Arena _arena; + + RuntimeProfile::Counter* _merge_block_timer = nullptr; + RuntimeProfile::Counter* _build_timer = nullptr; }; template @@ -93,7 +94,6 @@ class SetSinkOperatorX final : public DataSinkOperatorX @@ -106,13 +106,14 @@ class SetSinkOperatorX final : public DataSinkOperatorX _partition_exprs; using OperatorBase::_child; }; +#include "common/compile_check_end.h" } // namespace pipeline } // namespace doris diff --git a/be/src/pipeline/exec/set_source_operator.cpp b/be/src/pipeline/exec/set_source_operator.cpp index 554a58caf142bc..91c98288d8b8e1 100644 --- a/be/src/pipeline/exec/set_source_operator.cpp +++ 
b/be/src/pipeline/exec/set_source_operator.cpp @@ -18,17 +18,20 @@ #include "set_source_operator.h" #include +#include #include "common/status.h" #include "pipeline/exec/operator.h" namespace doris::pipeline { - +#include "common/compile_check_begin.h" template Status SetSourceLocalState::init(RuntimeState* state, LocalStateInfo& info) { RETURN_IF_ERROR(Base::init(state, info)); SCOPED_TIMER(exec_time_counter()); SCOPED_TIMER(_init_timer); + _get_data_timer = ADD_TIMER(_runtime_profile, "GetDataTime"); + _filter_timer = ADD_TIMER(_runtime_profile, "FilterTime"); _shared_state->probe_finished_children_dependency.resize( _parent->cast>()._child_quantity, nullptr); return Status::OK(); @@ -75,21 +78,26 @@ Status SetSourceOperatorX::get_block(RuntimeState* state, vectoriz auto& local_state = get_local_state(state); SCOPED_TIMER(local_state.exec_time_counter()); _create_mutable_cols(local_state, block); - auto st = std::visit( - [&](auto&& arg) -> Status { - using HashTableCtxType = std::decay_t; - if constexpr (!std::is_same_v) { - return _get_data_in_hashtable(local_state, arg, block, - state->batch_size(), eos); - } else { - LOG(FATAL) << "FATAL: uninited hash table"; - __builtin_unreachable(); - } - }, - *local_state._shared_state->hash_table_variants); - RETURN_IF_ERROR(st); - RETURN_IF_ERROR(vectorized::VExprContext::filter_block(local_state._conjuncts, block, - block->columns())); + { + SCOPED_TIMER(local_state._get_data_timer); + RETURN_IF_ERROR(std::visit( + [&](auto&& arg) -> Status { + using HashTableCtxType = std::decay_t; + if constexpr (!std::is_same_v) { + return _get_data_in_hashtable(local_state, arg, block, + state->batch_size(), eos); + } else { + LOG(FATAL) << "FATAL: uninited hash table"; + __builtin_unreachable(); + } + }, + local_state._shared_state->hash_table_variants->method_variant)); + } + { + SCOPED_TIMER(local_state._filter_timer); + RETURN_IF_ERROR(vectorized::VExprContext::filter_block(local_state._conjuncts, block, + block->columns())); + } local_state.reached_limit(block, eos); return Status::OK(); } @@ -115,13 +123,11 @@ template Status SetSourceOperatorX::_get_data_in_hashtable( SetSourceLocalState& local_state, HashTableContext& hash_table_ctx, vectorized::Block* output_block, const int batch_size, bool* eos) { - int left_col_len = local_state._left_table_data_types.size(); + size_t left_col_len = local_state._left_table_data_types.size(); hash_table_ctx.init_iterator(); - auto& iter = hash_table_ctx.iterator; auto block_size = 0; - for (; iter != hash_table_ctx.hash_table->end() && block_size < batch_size; ++iter) { - auto& value = iter->get_second(); + auto add_result = [&local_state, &block_size, this](auto value) { auto it = value.begin(); if constexpr (is_intersect) { if (it->visited) { //intersected: have done probe, so visited values it's the result @@ -132,9 +138,21 @@ Status SetSourceOperatorX::_get_data_in_hashtable( _add_result_columns(local_state, value, block_size); } } + }; + + auto& iter = hash_table_ctx.iterator; + for (; iter != hash_table_ctx.hash_table->end() && block_size < batch_size; ++iter) { + add_result(iter->get_second()); } *eos = iter == hash_table_ctx.hash_table->end(); + if (*eos && hash_table_ctx.hash_table->has_null_key_data()) { + auto value = hash_table_ctx.hash_table->template get_null_key_data(); + if constexpr (std::is_same_v>) { + add_result(value); + } + } + if (!output_block->mem_reuse()) { for (int i = 0; i < left_col_len; ++i) { output_block->insert( diff --git a/be/src/pipeline/exec/set_source_operator.h 
b/be/src/pipeline/exec/set_source_operator.h index 5157a2f9c979fe..976ffde3bf23ea 100644 --- a/be/src/pipeline/exec/set_source_operator.h +++ b/be/src/pipeline/exec/set_source_operator.h @@ -26,7 +26,7 @@ namespace doris { class RuntimeState; namespace pipeline { - +#include "common/compile_check_begin.h" template class SetSourceOperatorX; @@ -46,6 +46,9 @@ class SetSourceLocalState final : public PipelineXLocalState { std::vector _mutable_cols; //record build column type vectorized::DataTypes _left_table_data_types; + + RuntimeProfile::Counter* _get_data_timer = nullptr; + RuntimeProfile::Counter* _filter_timer = nullptr; }; template @@ -82,8 +85,8 @@ class SetSourceOperatorX final : public OperatorX& local_state, RowRefListWithFlags& value, int& block_size); - const int _child_quantity; + const size_t _child_quantity; }; - +#include "common/compile_check_end.h" } // namespace pipeline } // namespace doris diff --git a/be/src/pipeline/exec/sort_sink_operator.cpp b/be/src/pipeline/exec/sort_sink_operator.cpp index b07942b9ab1c05..faec4961af93b7 100644 --- a/be/src/pipeline/exec/sort_sink_operator.cpp +++ b/be/src/pipeline/exec/sort_sink_operator.cpp @@ -31,7 +31,9 @@ Status SortSinkLocalState::init(RuntimeState* state, LocalSinkStateInfo& info) { SCOPED_TIMER(exec_time_counter()); SCOPED_TIMER(_init_timer); _sort_blocks_memory_usage = - ADD_CHILD_COUNTER_WITH_LEVEL(_profile, "SortBlocks", TUnit::BYTES, "MemoryUsage", 1); + ADD_COUNTER_WITH_LEVEL(_profile, "MemoryUsageSortBlocks", TUnit::BYTES, 1); + _append_blocks_timer = ADD_TIMER(profile(), "AppendBlockTime"); + _update_runtime_predicate_timer = ADD_TIMER(profile(), "UpdateRuntimePredicateTime"); return Status::OK(); } @@ -90,7 +92,9 @@ SortSinkOperatorX::SortSinkOperatorX(ObjectPool* pool, int operator_id, const TP : std::vector {}), _algorithm(tnode.sort_node.__isset.algorithm ? 
tnode.sort_node.algorithm : TSortAlgorithm::FULL_SORT), - _reuse_mem(_algorithm != TSortAlgorithm::HEAP_SORT) {} + _reuse_mem(_algorithm != TSortAlgorithm::HEAP_SORT) { + _is_serial_operator = tnode.__isset.is_serial_operator && tnode.is_serial_operator; +} Status SortSinkOperatorX::init(const TPlanNode& tnode, RuntimeState* state) { RETURN_IF_ERROR(DataSinkOperatorX::init(tnode, state)); @@ -117,12 +121,19 @@ Status SortSinkOperatorX::sink(doris::RuntimeState* state, vectorized::Block* in SCOPED_TIMER(local_state.exec_time_counter()); COUNTER_UPDATE(local_state.rows_input_counter(), (int64_t)in_block->rows()); if (in_block->rows() > 0) { - COUNTER_UPDATE(local_state._sort_blocks_memory_usage, (int64_t)in_block->bytes()); - RETURN_IF_ERROR(local_state._shared_state->sorter->append_block(in_block)); - local_state._mem_tracker->set_consumption(local_state._shared_state->sorter->data_size()); + { + SCOPED_TIMER(local_state._append_blocks_timer); + RETURN_IF_ERROR(local_state._shared_state->sorter->append_block(in_block)); + } + int64_t data_size = local_state._shared_state->sorter->data_size(); + COUNTER_SET(local_state._sort_blocks_memory_usage, data_size); + COUNTER_SET(local_state._memory_used_counter, data_size); + COUNTER_SET(local_state._peak_memory_usage_counter, data_size); + RETURN_IF_CANCELLED(state); if (state->get_query_ctx()->has_runtime_predicate(_node_id)) { + SCOPED_TIMER(local_state._update_runtime_predicate_timer); auto& predicate = state->get_query_ctx()->get_runtime_predicate(_node_id); if (predicate.enable()) { vectorized::Field new_top = local_state._shared_state->sorter->get_top_value(); diff --git a/be/src/pipeline/exec/sort_sink_operator.h b/be/src/pipeline/exec/sort_sink_operator.h index 0bd6dd9096482c..6bf87164e71026 100644 --- a/be/src/pipeline/exec/sort_sink_operator.h +++ b/be/src/pipeline/exec/sort_sink_operator.h @@ -46,6 +46,8 @@ class SortSinkLocalState : public PipelineXSinkLocalState { // topn top value vectorized::Field old_top {vectorized::Field::Types::Null}; + RuntimeProfile::Counter* _append_blocks_timer = nullptr; + RuntimeProfile::Counter* _update_runtime_predicate_timer = nullptr; }; class SortSinkOperatorX final : public DataSinkOperatorX { @@ -63,16 +65,16 @@ class SortSinkOperatorX final : public DataSinkOperatorX { Status sink(RuntimeState* state, vectorized::Block* in_block, bool eos) override; DataDistribution required_data_distribution() const override { if (_is_analytic_sort) { - return _is_colocate && _require_bucket_distribution && !_followed_by_shuffled_join + return _is_colocate && _require_bucket_distribution && !_followed_by_shuffled_operator ? 
DataDistribution(ExchangeType::BUCKET_HASH_SHUFFLE, _partition_exprs) : DataDistribution(ExchangeType::HASH_SHUFFLE, _partition_exprs); } else if (_merge_by_exchange) { // The current sort node is used for the ORDER BY return {ExchangeType::PASSTHROUGH}; + } else { + return {ExchangeType::NOOP}; } - return DataSinkOperatorX::required_data_distribution(); } - bool require_shuffled_data_distribution() const override { return _is_analytic_sort; } bool require_data_distribution() const override { return _is_colocate; } size_t get_revocable_mem_size(RuntimeState* state) const; diff --git a/be/src/pipeline/exec/sort_source_operator.cpp b/be/src/pipeline/exec/sort_source_operator.cpp index 02a99e183c852e..7f801b79c0b12b 100644 --- a/be/src/pipeline/exec/sort_source_operator.cpp +++ b/be/src/pipeline/exec/sort_source_operator.cpp @@ -30,7 +30,9 @@ SortSourceOperatorX::SortSourceOperatorX(ObjectPool* pool, const TPlanNode& tnod const DescriptorTbl& descs) : OperatorX(pool, tnode, operator_id, descs), _merge_by_exchange(tnode.sort_node.merge_by_exchange), - _offset(tnode.sort_node.__isset.offset ? tnode.sort_node.offset : 0) {} + _offset(tnode.sort_node.__isset.offset ? tnode.sort_node.offset : 0) { + _is_serial_operator = tnode.__isset.is_serial_operator && tnode.is_serial_operator; +} Status SortSourceOperatorX::init(const TPlanNode& tnode, RuntimeState* state) { RETURN_IF_ERROR(Base::init(tnode, state)); diff --git a/be/src/pipeline/exec/spill_sort_sink_operator.cpp b/be/src/pipeline/exec/spill_sort_sink_operator.cpp index 4bf1ab04efb628..267bcc83aad92c 100644 --- a/be/src/pipeline/exec/spill_sort_sink_operator.cpp +++ b/be/src/pipeline/exec/spill_sort_sink_operator.cpp @@ -51,7 +51,7 @@ void SpillSortSinkLocalState::_init_counters() { _partial_sort_timer = ADD_TIMER(_profile, "PartialSortTime"); _merge_block_timer = ADD_TIMER(_profile, "MergeBlockTime"); _sort_blocks_memory_usage = - ADD_CHILD_COUNTER_WITH_LEVEL(_profile, "SortBlocks", TUnit::BYTES, "MemoryUsage", 1); + ADD_COUNTER_WITH_LEVEL(_profile, "MemoryUsageSortBlocks", TUnit::BYTES, 1); _spill_merge_sort_timer = ADD_CHILD_TIMER_WITH_LEVEL(_profile, "SpillMergeSortTime", "Spill", 1); @@ -70,7 +70,7 @@ void SpillSortSinkLocalState::_init_counters() { void SpillSortSinkLocalState::update_profile(RuntimeProfile* child_profile) { UPDATE_PROFILE(_partial_sort_timer, "PartialSortTime"); UPDATE_PROFILE(_merge_block_timer, "MergeBlockTime"); - UPDATE_PROFILE(_sort_blocks_memory_usage, "SortBlocks"); + UPDATE_PROFILE(_sort_blocks_memory_usage, "MemoryUsageSortBlocks"); } Status SpillSortSinkLocalState::close(RuntimeState* state, Status execsink_status) { @@ -156,8 +156,12 @@ Status SpillSortSinkOperatorX::sink(doris::RuntimeState* state, vectorized::Bloc DBUG_EXECUTE_IF("fault_inject::spill_sort_sink::sink", { return Status::InternalError("fault_inject spill_sort_sink sink failed"); }); RETURN_IF_ERROR(_sort_sink_operator->sink(local_state._runtime_state.get(), in_block, false)); - local_state._mem_tracker->set_consumption( - local_state._shared_state->in_mem_shared_state->sorter->data_size()); + + int64_t data_size = local_state._shared_state->in_mem_shared_state->sorter->data_size(); + COUNTER_SET(local_state._sort_blocks_memory_usage, data_size); + COUNTER_SET(local_state._memory_used_counter, data_size); + COUNTER_SET(local_state._peak_memory_usage_counter, data_size); + if (eos) { if (local_state._shared_state->is_spilled) { if (revocable_mem_size(state) > 0) { diff --git a/be/src/pipeline/exec/spill_sort_sink_operator.h 
b/be/src/pipeline/exec/spill_sort_sink_operator.h index e74b5d2a41401a..2c820d9fa09daf 100644 --- a/be/src/pipeline/exec/spill_sort_sink_operator.h +++ b/be/src/pipeline/exec/spill_sort_sink_operator.h @@ -54,7 +54,6 @@ class SpillSortSinkLocalState : public PipelineXSpillSinkLocalState _finish_dependency; }; diff --git a/be/src/pipeline/exec/streaming_aggregation_operator.cpp b/be/src/pipeline/exec/streaming_aggregation_operator.cpp index dfbe42c637ea56..cf5071d62e4737 100644 --- a/be/src/pipeline/exec/streaming_aggregation_operator.cpp +++ b/be/src/pipeline/exec/streaming_aggregation_operator.cpp @@ -22,6 +22,7 @@ #include #include +#include "common/cast_set.h" #include "common/compiler_util.h" // IWYU pragma: keep #include "pipeline/exec/operator.h" #include "vec/exprs/vectorized_agg_fn.h" @@ -87,31 +88,24 @@ Status StreamingAggLocalState::init(RuntimeState* state, LocalStateInfo& info) { RETURN_IF_ERROR(Base::init(state, info)); SCOPED_TIMER(Base::exec_time_counter()); SCOPED_TIMER(Base::_init_timer); - _hash_table_memory_usage = ADD_CHILD_COUNTER_WITH_LEVEL(Base::profile(), "HashTable", - TUnit::BYTES, "MemoryUsage", 1); + _hash_table_memory_usage = + ADD_COUNTER_WITH_LEVEL(Base::profile(), "MemoryUsageHashTable", TUnit::BYTES, 1); _serialize_key_arena_memory_usage = Base::profile()->AddHighWaterMarkCounter( - "SerializeKeyArena", TUnit::BYTES, "MemoryUsage", 1); + "MemoryUsageSerializeKeyArena", TUnit::BYTES, "", 1); _build_timer = ADD_TIMER(Base::profile(), "BuildTime"); - _build_table_convert_timer = ADD_TIMER(Base::profile(), "BuildConvertToPartitionedTime"); - _serialize_key_timer = ADD_TIMER(Base::profile(), "SerializeKeyTime"); - _exec_timer = ADD_TIMER(Base::profile(), "ExecTime"); _merge_timer = ADD_TIMER(Base::profile(), "MergeTime"); _expr_timer = ADD_TIMER(Base::profile(), "ExprTime"); - _serialize_data_timer = ADD_TIMER(Base::profile(), "SerializeDataTime"); + _insert_values_to_column_timer = ADD_TIMER(Base::profile(), "InsertValuesToColumnTime"); _deserialize_data_timer = ADD_TIMER(Base::profile(), "DeserializeAndMergeTime"); _hash_table_compute_timer = ADD_TIMER(Base::profile(), "HashTableComputeTime"); _hash_table_emplace_timer = ADD_TIMER(Base::profile(), "HashTableEmplaceTime"); _hash_table_input_counter = ADD_COUNTER(Base::profile(), "HashTableInputCount", TUnit::UNIT); - _max_row_size_counter = ADD_COUNTER(Base::profile(), "MaxRowSizeInBytes", TUnit::UNIT); _hash_table_size_counter = ADD_COUNTER(profile(), "HashTableSize", TUnit::UNIT); - _queue_byte_size_counter = ADD_COUNTER(profile(), "MaxSizeInBlockQueue", TUnit::BYTES); - _queue_size_counter = ADD_COUNTER(profile(), "MaxSizeOfBlockQueue", TUnit::UNIT); _streaming_agg_timer = ADD_TIMER(profile(), "StreamingAggTime"); _build_timer = ADD_TIMER(profile(), "BuildTime"); _expr_timer = ADD_TIMER(Base::profile(), "ExprTime"); _get_results_timer = ADD_TIMER(profile(), "GetResultsTime"); - _serialize_result_timer = ADD_TIMER(profile(), "SerializeResultTime"); _hash_table_iterate_timer = ADD_TIMER(profile(), "HashTableIterateTime"); _insert_keys_to_column_timer = ADD_TIMER(profile(), "InsertKeysToColumnTime"); @@ -357,10 +351,10 @@ Status StreamingAggLocalState::_merge_without_key(vectorized::Block* block) { } void StreamingAggLocalState::_update_memusage_without_key() { - auto arena_memory_usage = _agg_arena_pool->size() - _mem_usage_record.used_in_arena; - Base::_mem_tracker->consume(arena_memory_usage); - _serialize_key_arena_memory_usage->add(arena_memory_usage); - _mem_usage_record.used_in_arena = 
_agg_arena_pool->size(); + int64_t arena_memory_usage = _agg_arena_pool->size(); + COUNTER_SET(_memory_used_counter, arena_memory_usage); + COUNTER_SET(_peak_memory_usage_counter, arena_memory_usage); + COUNTER_SET(_serialize_key_arena_memory_usage, arena_memory_usage); } Status StreamingAggLocalState::_execute_with_serialized_key(vectorized::Block* block) { @@ -372,28 +366,25 @@ Status StreamingAggLocalState::_execute_with_serialized_key(vectorized::Block* b } void StreamingAggLocalState::_update_memusage_with_serialized_key() { - std::visit( - vectorized::Overload { - [&](std::monostate& arg) -> void { - throw doris::Exception(ErrorCode::INTERNAL_ERROR, "uninited hash table"); - }, - [&](auto& agg_method) -> void { - auto& data = *agg_method.hash_table; - auto arena_memory_usage = _agg_arena_pool->size() + - _aggregate_data_container->memory_usage() - - _mem_usage_record.used_in_arena; - Base::_mem_tracker->consume(arena_memory_usage); - Base::_mem_tracker->consume(data.get_buffer_size_in_bytes() - - _mem_usage_record.used_in_state); - _serialize_key_arena_memory_usage->add(arena_memory_usage); - COUNTER_UPDATE( - _hash_table_memory_usage, - data.get_buffer_size_in_bytes() - _mem_usage_record.used_in_state); - _mem_usage_record.used_in_state = data.get_buffer_size_in_bytes(); - _mem_usage_record.used_in_arena = - _agg_arena_pool->size() + _aggregate_data_container->memory_usage(); - }}, - _agg_data->method_variant); + std::visit(vectorized::Overload { + [&](std::monostate& arg) -> void { + throw doris::Exception(ErrorCode::INTERNAL_ERROR, "uninited hash table"); + }, + [&](auto& agg_method) -> void { + auto& data = *agg_method.hash_table; + int64_t arena_memory_usage = _agg_arena_pool->size() + + _aggregate_data_container->memory_usage(); + int64_t hash_table_memory_usage = data.get_buffer_size_in_bytes(); + + COUNTER_SET(_memory_used_counter, + arena_memory_usage + hash_table_memory_usage); + COUNTER_SET(_peak_memory_usage_counter, + arena_memory_usage + hash_table_memory_usage); + + COUNTER_SET(_serialize_key_arena_memory_usage, arena_memory_usage); + COUNTER_SET(_hash_table_memory_usage, hash_table_memory_usage); + }}, + _agg_data->method_variant); } template @@ -502,8 +493,8 @@ Status StreamingAggLocalState::_merge_with_serialized_key(vectorized::Block* blo } Status StreamingAggLocalState::_init_hash_method(const vectorized::VExprContextSPtrs& probe_exprs) { - RETURN_IF_ERROR(init_agg_hash_method( - _agg_data.get(), probe_exprs, + RETURN_IF_ERROR(init_hash_method( + _agg_data.get(), get_data_types(probe_exprs), Base::_parent->template cast()._is_first_phase)); return Status::OK(); } @@ -515,7 +506,6 @@ Status StreamingAggLocalState::do_pre_agg(vectorized::Block* input_block, // pre stream agg need use _num_row_return to decide whether to do pre stream agg _cur_num_rows_returned += output_block->rows(); _make_nullable_output_key(output_block); - // COUNTER_SET(_rows_returned_counter, _num_rows_returned); _executor->update_memusage(this); return Status::OK(); } @@ -683,7 +673,7 @@ Status StreamingAggLocalState::_pre_agg_with_serialized_key(doris::vectorized::B } for (int i = 0; i != _aggregate_evaluators.size(); ++i) { - SCOPED_TIMER(_serialize_data_timer); + SCOPED_TIMER(_insert_values_to_column_timer); RETURN_IF_ERROR( _aggregate_evaluators[i]->streaming_agg_serialize_to_column( in_block, value_columns[i], rows, @@ -852,12 +842,12 @@ Status StreamingAggLocalState::_get_with_serialized_key_result(RuntimeState* sta return Status::OK(); } -Status 
StreamingAggLocalState::_serialize_without_key(RuntimeState* state, vectorized::Block* block, - bool* eos) { +Status StreamingAggLocalState::_get_results_without_key(RuntimeState* state, + vectorized::Block* block, bool* eos) { // 1. `child(0)->rows_returned() == 0` mean not data from child // in level two aggregation node should return NULL result // level one aggregation node set `eos = true` return directly - SCOPED_TIMER(_serialize_result_timer); + SCOPED_TIMER(_get_results_timer); if (UNLIKELY(_input_num_rows == 0)) { *eos = true; return Status::OK(); @@ -896,10 +886,10 @@ Status StreamingAggLocalState::_serialize_without_key(RuntimeState* state, vecto return Status::OK(); } -Status StreamingAggLocalState::_serialize_with_serialized_key_result(RuntimeState* state, - vectorized::Block* block, - bool* eos) { - SCOPED_TIMER(_serialize_result_timer); +Status StreamingAggLocalState::_get_results_with_serialized_key(RuntimeState* state, + vectorized::Block* block, + bool* eos) { + SCOPED_TIMER(_get_results_timer); auto& p = _parent->cast(); int key_size = _probe_expr_ctxs.size(); int agg_size = _aggregate_evaluators.size(); @@ -918,7 +908,6 @@ Status StreamingAggLocalState::_serialize_with_serialized_key_result(RuntimeStat } } - SCOPED_TIMER(_get_results_timer); std::visit( vectorized::Overload { [&](std::monostate& arg) -> void { @@ -974,7 +963,7 @@ Status StreamingAggLocalState::_serialize_with_serialized_key_result(RuntimeStat } { - SCOPED_TIMER(_serialize_data_timer); + SCOPED_TIMER(_insert_values_to_column_timer); for (size_t i = 0; i < _aggregate_evaluators.size(); ++i) { value_data_types[i] = _aggregate_evaluators[i]->function()->get_serialized_type(); @@ -1118,8 +1107,8 @@ void StreamingAggLocalState::_emplace_into_hash_table(vectorized::AggregateDataP SCOPED_TIMER(_hash_table_emplace_timer); for (size_t i = 0; i < num_rows; ++i) { - places[i] = agg_method.lazy_emplace(state, i, creator, - creator_for_null_key); + places[i] = *agg_method.lazy_emplace(state, i, creator, + creator_for_null_key); } COUNTER_UPDATE(_hash_table_input_counter, num_rows); @@ -1156,7 +1145,7 @@ Status StreamingAggOperatorX::init(const TPlanNode& tnode, RuntimeState* state) RETURN_IF_ERROR(vectorized::AggFnEvaluator::create( _pool, tnode.agg_node.aggregate_functions[i], tnode.agg_node.__isset.agg_sort_infos ? tnode.agg_node.agg_sort_infos[i] : dummy, - &evaluator)); + tnode.agg_node.grouping_exprs.empty(), &evaluator)); _aggregate_evaluators.push_back(evaluator); } @@ -1229,7 +1218,8 @@ Status StreamingAggOperatorX::open(RuntimeState* state) { // check output type if (_needs_finalize) { RETURN_IF_ERROR(vectorized::AggFnEvaluator::check_agg_fn_output( - _probe_expr_ctxs.size(), _aggregate_evaluators, _agg_fn_output_row_descriptor)); + cast_set(_probe_expr_ctxs.size()), _aggregate_evaluators, + _agg_fn_output_row_descriptor)); } RETURN_IF_ERROR(vectorized::VExpr::open(_probe_expr_ctxs, state)); @@ -1255,7 +1245,6 @@ Status StreamingAggLocalState::close(RuntimeState* state) { std::vector tmp_deserialize_buffer; _deserialize_buffer.swap(tmp_deserialize_buffer); - Base::_mem_tracker->release(_mem_usage_record.used_in_state + _mem_usage_record.used_in_arena); /// _hash_table_size_counter may be null if prepare failed. 
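// [Editor's note] This patch repeatedly swaps delta-based accounting
// (consume()/release() against a remembered previous value) for publishing
// absolute usage via COUNTER_SET. A self-contained sketch of the new style,
// with illustrative names only:
#include <cstdint>
struct GaugeSketch {
    int64_t value = 0;
    void set(int64_t v) { value = v; }  // idempotent: no delta bookkeeping
};
void update_memusage_sketch(GaugeSketch& mem_used, int64_t arena_bytes,
                            int64_t hash_table_bytes) {
    // Publish the current totals; repeated calls cannot drift, unlike
    // consume(now - before), which must keep `before` in sync.
    mem_used.set(arena_bytes + hash_table_bytes);
}
// The check below guards counters that prepare() may never have created: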
if (_hash_table_size_counter) { diff --git a/be/src/pipeline/exec/streaming_aggregation_operator.h b/be/src/pipeline/exec/streaming_aggregation_operator.h index c37fa5cbd881ca..b695880ac2857b 100644 --- a/be/src/pipeline/exec/streaming_aggregation_operator.h +++ b/be/src/pipeline/exec/streaming_aggregation_operator.h @@ -65,11 +65,11 @@ class StreamingAggLocalState final : public PipelineXLocalState void _update_memusage_with_serialized_key(); Status _init_hash_method(const vectorized::VExprContextSPtrs& probe_exprs); Status _get_without_key_result(RuntimeState* state, vectorized::Block* block, bool* eos); - Status _serialize_without_key(RuntimeState* state, vectorized::Block* block, bool* eos); + Status _get_results_without_key(RuntimeState* state, vectorized::Block* block, bool* eos); Status _get_with_serialized_key_result(RuntimeState* state, vectorized::Block* block, bool* eos); - Status _serialize_with_serialized_key_result(RuntimeState* state, vectorized::Block* block, - bool* eos); + Status _get_results_with_serialized_key(RuntimeState* state, vectorized::Block* block, + bool* eos); template Status _merge_with_serialized_key_helper(vectorized::Block* block); @@ -83,25 +83,19 @@ class StreamingAggLocalState final : public PipelineXLocalState Status _create_agg_status(vectorized::AggregateDataPtr data); size_t _get_hash_table_size(); - RuntimeProfile::Counter* _queue_byte_size_counter = nullptr; - RuntimeProfile::Counter* _queue_size_counter = nullptr; RuntimeProfile::Counter* _streaming_agg_timer = nullptr; RuntimeProfile::Counter* _hash_table_compute_timer = nullptr; RuntimeProfile::Counter* _hash_table_emplace_timer = nullptr; RuntimeProfile::Counter* _hash_table_input_counter = nullptr; RuntimeProfile::Counter* _build_timer = nullptr; RuntimeProfile::Counter* _expr_timer = nullptr; - RuntimeProfile::Counter* _build_table_convert_timer = nullptr; - RuntimeProfile::Counter* _serialize_key_timer = nullptr; RuntimeProfile::Counter* _merge_timer = nullptr; - RuntimeProfile::Counter* _serialize_data_timer = nullptr; + RuntimeProfile::Counter* _insert_values_to_column_timer = nullptr; RuntimeProfile::Counter* _deserialize_data_timer = nullptr; - RuntimeProfile::Counter* _max_row_size_counter = nullptr; RuntimeProfile::Counter* _hash_table_memory_usage = nullptr; RuntimeProfile::HighWaterMarkCounter* _serialize_key_arena_memory_usage = nullptr; RuntimeProfile::Counter* _hash_table_size_counter = nullptr; RuntimeProfile::Counter* _get_results_timer = nullptr; - RuntimeProfile::Counter* _serialize_result_timer = nullptr; RuntimeProfile::Counter* _hash_table_iterate_timer = nullptr; RuntimeProfile::Counter* _insert_keys_to_column_timer = nullptr; @@ -136,13 +130,13 @@ class StreamingAggLocalState final : public PipelineXLocalState if constexpr (NeedFinalize) { return local_state->_get_without_key_result(state, block, eos); } else { - return local_state->_serialize_without_key(state, block, eos); + return local_state->_get_results_without_key(state, block, eos); } } else { if constexpr (NeedFinalize) { return local_state->_get_with_serialized_key_result(state, block, eos); } else { - return local_state->_serialize_with_serialized_key_result(state, block, eos); + return local_state->_get_results_with_serialized_key(state, block, eos); } } } @@ -173,12 +167,6 @@ class StreamingAggLocalState final : public PipelineXLocalState }; std::unique_ptr _executor = nullptr; - struct MemoryRecord { - MemoryRecord() : used_in_arena(0), used_in_state(0) {} - int64_t used_in_arena; - int64_t used_in_state; - 
}; - MemoryRecord _mem_usage_record; std::unique_ptr _child_block = nullptr; bool _child_eos = false; std::unique_ptr _pre_aggregated_block = nullptr; diff --git a/be/src/pipeline/exec/table_function_operator.cpp b/be/src/pipeline/exec/table_function_operator.cpp index ff9dfe632faec6..c1621470f435b4 100644 --- a/be/src/pipeline/exec/table_function_operator.cpp +++ b/be/src/pipeline/exec/table_function_operator.cpp @@ -32,6 +32,18 @@ namespace doris::pipeline { TableFunctionLocalState::TableFunctionLocalState(RuntimeState* state, OperatorXBase* parent) : PipelineXLocalState<>(state, parent), _child_block(vectorized::Block::create_unique()) {} +Status TableFunctionLocalState::init(RuntimeState* state, LocalStateInfo& info) { + RETURN_IF_ERROR(PipelineXLocalState<>::init(state, info)); + SCOPED_TIMER(exec_time_counter()); + SCOPED_TIMER(_init_timer); + _init_function_timer = ADD_TIMER(_runtime_profile, "InitTableFunctionTime"); + _process_rows_timer = ADD_TIMER(_runtime_profile, "ProcessRowsTime"); + _copy_data_timer = ADD_TIMER(_runtime_profile, "CopyDataTime"); + _filter_timer = ADD_TIMER(_runtime_profile, "FilterTime"); + _repeat_data_timer = ADD_TIMER(_runtime_profile, "RepeatDataTime"); + return Status::OK(); +} + Status TableFunctionLocalState::open(RuntimeState* state) { SCOPED_TIMER(PipelineXLocalState<>::exec_time_counter()); SCOPED_TIMER(PipelineXLocalState<>::_open_timer); @@ -59,6 +71,7 @@ void TableFunctionLocalState::_copy_output_slots( if (!_current_row_insert_times) { return; } + SCOPED_TIMER(_copy_data_timer); auto& p = _parent->cast(); for (auto index : p._output_slot_indexs) { auto src_column = _child_block->get_by_position(index).column; @@ -197,15 +210,18 @@ Status TableFunctionLocalState::get_expanded_block(RuntimeState* state, columns[index]->insert_many_defaults(row_size - columns[index]->size()); } - // 3. eval conjuncts - RETURN_IF_ERROR(vectorized::VExprContext::filter_block(_conjuncts, output_block, - output_block->columns())); + { + SCOPED_TIMER(_filter_timer); // 3. 
eval conjuncts + RETURN_IF_ERROR(vectorized::VExprContext::filter_block(_conjuncts, output_block, + output_block->columns())); + } *eos = _child_eos && _cur_child_offset == -1; return Status::OK(); } void TableFunctionLocalState::process_next_child_row() { + SCOPED_TIMER(_process_rows_timer); _cur_child_offset++; if (_cur_child_offset >= _child_block->rows()) { @@ -232,9 +248,6 @@ TableFunctionOperatorX::TableFunctionOperatorX(ObjectPool* pool, const TPlanNode Status TableFunctionOperatorX::_prepare_output_slot_ids(const TPlanNode& tnode) { // Prepare output slot ids - if (tnode.table_function_node.outputSlotIds.empty()) { - return Status::InternalError("Output slots of table function node is empty"); - } SlotId max_id = -1; for (auto slot_id : tnode.table_function_node.outputSlotIds) { if (slot_id > max_id) { diff --git a/be/src/pipeline/exec/table_function_operator.h b/be/src/pipeline/exec/table_function_operator.h index 75b1608fad7112..81160acb7f7611 100644 --- a/be/src/pipeline/exec/table_function_operator.h +++ b/be/src/pipeline/exec/table_function_operator.h @@ -37,6 +37,7 @@ class TableFunctionLocalState final : public PipelineXLocalState<> { TableFunctionLocalState(RuntimeState* state, OperatorXBase* parent); ~TableFunctionLocalState() override = default; + Status init(RuntimeState* state, LocalStateInfo& infos) override; Status open(RuntimeState* state) override; Status close(RuntimeState* state) override { for (auto* fn : _fns) { @@ -67,6 +68,12 @@ class TableFunctionLocalState final : public PipelineXLocalState<> { std::unique_ptr _child_block; int _current_row_insert_times = 0; bool _child_eos = false; + + RuntimeProfile::Counter* _init_function_timer = nullptr; + RuntimeProfile::Counter* _process_rows_timer = nullptr; + RuntimeProfile::Counter* _copy_data_timer = nullptr; + RuntimeProfile::Counter* _filter_timer = nullptr; + RuntimeProfile::Counter* _repeat_data_timer = nullptr; }; class TableFunctionOperatorX final : public StatefulOperatorX { @@ -93,6 +100,7 @@ class TableFunctionOperatorX final : public StatefulOperatorXprocess_init(input_block, state)); } local_state.process_next_child_row(); diff --git a/be/src/pipeline/exec/union_sink_operator.cpp b/be/src/pipeline/exec/union_sink_operator.cpp index 288fc131037fab..8467eeb1d5467a 100644 --- a/be/src/pipeline/exec/union_sink_operator.cpp +++ b/be/src/pipeline/exec/union_sink_operator.cpp @@ -32,6 +32,7 @@ Status UnionSinkLocalState::init(RuntimeState* state, LocalSinkStateInfo& info) RETURN_IF_ERROR(Base::init(state, info)); SCOPED_TIMER(exec_time_counter()); SCOPED_TIMER(_init_timer); + _expr_timer = ADD_TIMER(_profile, "ExprTime"); auto& p = _parent->cast(); _shared_state->data_queue.set_sink_dependency(_dependency, p._cur_child_id); return Status::OK(); diff --git a/be/src/pipeline/exec/union_sink_operator.h b/be/src/pipeline/exec/union_sink_operator.h index 13dfb0ba6379cb..aa94ed9a73038f 100644 --- a/be/src/pipeline/exec/union_sink_operator.h +++ b/be/src/pipeline/exec/union_sink_operator.h @@ -55,6 +55,7 @@ class UnionSinkLocalState final : public PipelineXSinkLocalState { @@ -89,6 +90,12 @@ class UnionSinkOperatorX final : public DataSinkOperatorX { } } + bool require_shuffled_data_distribution() const override { + return _followed_by_shuffled_operator; + } + + bool is_shuffled_operator() const override { return _followed_by_shuffled_operator; } + private: int _get_first_materialized_child_idx() const { return _first_materialized_child_idx; } @@ -130,6 +137,7 @@ class UnionSinkOperatorX final : public 
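// [Editor's note] Many hunks in this patch wrap hot paths in SCOPED_TIMER. A
// minimal RAII sketch of the idea, assuming a steady clock; this stand-in
// class is illustrative and is not the Doris implementation:
#include <chrono>
#include <cstdint>
class ScopedTimerSketch {
public:
    explicit ScopedTimerSketch(int64_t* total_ns)
            : _start(std::chrono::steady_clock::now()), _total_ns(total_ns) {}
    ~ScopedTimerSketch() {
        // On scope exit, add the elapsed time to a counter that outlives the scope.
        *_total_ns += std::chrono::duration_cast<std::chrono::nanoseconds>(
                              std::chrono::steady_clock::now() - _start)
                              .count();
    }
private:
    std::chrono::steady_clock::time_point _start;
    int64_t* _total_ns;  // profile counter owned elsewhere
};
// Usage: { ScopedTimerSketch t(&expr_time_ns); /* timed work */ }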
DataSinkOperatorX { Status materialize_block(RuntimeState* state, vectorized::Block* src_block, int child_idx, vectorized::Block* res_block) { auto& local_state = get_local_state(state); + SCOPED_TIMER(local_state._expr_timer); const auto& child_exprs = local_state._child_expr; vectorized::ColumnsWithTypeAndName colunms; for (size_t i = 0; i < child_exprs.size(); ++i) { diff --git a/be/src/pipeline/exec/union_source_operator.h b/be/src/pipeline/exec/union_source_operator.h index bf32e9a25c2454..200e7de8597b91 100644 --- a/be/src/pipeline/exec/union_source_operator.h +++ b/be/src/pipeline/exec/union_source_operator.h @@ -63,7 +63,9 @@ class UnionSourceOperatorX final : public OperatorX { using Base = OperatorX; UnionSourceOperatorX(ObjectPool* pool, const TPlanNode& tnode, int operator_id, const DescriptorTbl& descs) - : Base(pool, tnode, operator_id, descs), _child_size(tnode.num_children) {}; + : Base(pool, tnode, operator_id, descs), _child_size(tnode.num_children) { + _is_serial_operator = tnode.__isset.is_serial_operator && tnode.is_serial_operator; + } ~UnionSourceOperatorX() override = default; Status get_block(RuntimeState* state, vectorized::Block* block, bool* eos) override; @@ -95,6 +97,11 @@ class UnionSourceOperatorX final : public OperatorX { return Status::OK(); } [[nodiscard]] int get_child_count() const { return _child_size; } + bool require_shuffled_data_distribution() const override { + return _followed_by_shuffled_operator; + } + + bool is_shuffled_operator() const override { return _followed_by_shuffled_operator; } private: bool _has_data(RuntimeState* state) const { diff --git a/be/src/pipeline/local_exchange/local_exchange_sink_operator.cpp b/be/src/pipeline/local_exchange/local_exchange_sink_operator.cpp index 19c37f3649bcc7..a939d25654b4cc 100644 --- a/be/src/pipeline/local_exchange/local_exchange_sink_operator.cpp +++ b/be/src/pipeline/local_exchange/local_exchange_sink_operator.cpp @@ -36,16 +36,17 @@ std::vector LocalExchangeSinkLocalState::dependencies() const { } Status LocalExchangeSinkOperatorX::init(ExchangeType type, const int num_buckets, - const bool should_disable_bucket_shuffle, + const bool use_global_hash_shuffle, const std::map& shuffle_idx_to_instance_idx) { _name = "LOCAL_EXCHANGE_SINK_OPERATOR (" + get_exchange_type_name(type) + ")"; _type = type; if (_type == ExchangeType::HASH_SHUFFLE) { + _use_global_shuffle = use_global_hash_shuffle; // For shuffle join, if data distribution has been broken by previous operator, we // should use a HASH_SHUFFLE local exchanger to shuffle data again. To be mentioned, // we should use map shuffle idx to instance idx because all instances will be // distributed to all BEs. Otherwise, we should use shuffle idx directly. 
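// [Editor's note] A small sketch of the remapping described in the comment
// above, using made-up data; the helper below is illustrative, not the
// patch's API:
#include <map>
int target_channel_sketch(int shuffle_idx, bool use_global_shuffle,
                          const std::map<int, int>& shuffle_idx_to_instance_idx) {
    // Global hash shuffle: instances are distributed across all BEs, so the
    // globally computed shuffle index must be translated to the local
    // instance index. Plain hash shuffle: the shuffle index already is the
    // local channel id.
    return use_global_shuffle ? shuffle_idx_to_instance_idx.at(shuffle_idx)
                              : shuffle_idx;
}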
- if (should_disable_bucket_shuffle) { + if (use_global_hash_shuffle) { std::for_each(shuffle_idx_to_instance_idx.begin(), shuffle_idx_to_instance_idx.end(), [&](const auto& item) { DCHECK(item.first != -1); @@ -84,6 +85,11 @@ Status LocalExchangeSinkLocalState::init(RuntimeState* state, LocalSinkStateInfo SCOPED_TIMER(_init_timer); _compute_hash_value_timer = ADD_TIMER(profile(), "ComputeHashValueTime"); _distribute_timer = ADD_TIMER(profile(), "DistributeDataTime"); + if (_parent->cast()._type == ExchangeType::HASH_SHUFFLE) { + _profile->add_info_string( + "UseGlobalShuffle", + std::to_string(_parent->cast()._use_global_shuffle)); + } _channel_id = info.task_idx; return Status::OK(); } @@ -105,29 +111,27 @@ Status LocalExchangeSinkLocalState::open(RuntimeState* state) { } Status LocalExchangeSinkLocalState::close(RuntimeState* state, Status exec_status) { - if (_closed) { + SCOPED_TIMER(Base::exec_time_counter()); + SCOPED_TIMER(Base::_close_timer); + if (Base::_closed) { return Status::OK(); } - RETURN_IF_ERROR(Base::close(state, exec_status)); - if (exec_status.ok()) { - DCHECK(_release_count) << "Do not finish correctly! " << debug_string(0) - << " state: { cancel = " << state->is_cancelled() << ", " - << state->cancel_reason().to_string() << "} query ctx: { cancel = " - << state->get_query_ctx()->is_cancelled() << ", " - << state->get_query_ctx()->exec_status().to_string() << "}"; + if (_shared_state) { + _shared_state->sub_running_sink_operators(); } - return Status::OK(); + return Base::close(state, exec_status); } std::string LocalExchangeSinkLocalState::debug_string(int indentation_level) const { fmt::memory_buffer debug_string_buffer; fmt::format_to(debug_string_buffer, - "{}, _channel_id: {}, _num_partitions: {}, _num_senders: {}, _num_sources: {}, " - "_running_sink_operators: {}, _running_source_operators: {}, _release_count: {}", - Base::debug_string(indentation_level), _channel_id, _exchanger->_num_partitions, - _exchanger->_num_senders, _exchanger->_num_sources, - _exchanger->_running_sink_operators, _exchanger->_running_source_operators, - _release_count); + "{}, _use_global_shuffle: {}, _channel_id: {}, _num_partitions: {}, " + "_num_senders: {}, _num_sources: {}, " + "_running_sink_operators: {}, _running_source_operators: {}", + Base::debug_string(indentation_level), + _parent->cast()._use_global_shuffle, _channel_id, + _exchanger->_num_partitions, _exchanger->_num_senders, _exchanger->_num_sources, + _exchanger->_running_sink_operators, _exchanger->_running_source_operators); return fmt::to_string(debug_string_buffer); } @@ -140,14 +144,8 @@ Status LocalExchangeSinkOperatorX::sink(RuntimeState* state, vectorized::Block* // If all exchange sources ended due to limit reached, current task should also finish if (local_state._exchanger->_running_source_operators == 0) { - local_state._release_count = true; - local_state._shared_state->sub_running_sink_operators(); return Status::EndOfFile("receiver eof"); } - if (eos) { - local_state._shared_state->sub_running_sink_operators(); - local_state._release_count = true; - } return Status::OK(); } diff --git a/be/src/pipeline/local_exchange/local_exchange_sink_operator.h b/be/src/pipeline/local_exchange/local_exchange_sink_operator.h index 7a98840b4b323e..4c4a400c2bde3b 100644 --- a/be/src/pipeline/local_exchange/local_exchange_sink_operator.h +++ b/be/src/pipeline/local_exchange/local_exchange_sink_operator.h @@ -43,9 +43,9 @@ class LocalExchangeSinkLocalState final : public PipelineXSinkLocalState dependencies() const 
override; + Status close(RuntimeState* state, Status exec_status) override; private: friend class LocalExchangeSinkOperatorX; @@ -69,7 +69,6 @@ class LocalExchangeSinkLocalState final : public PipelineXSinkLocalState& shuffle_idx_to_instance_idx) override; Status open(RuntimeState* state) override; @@ -118,6 +117,7 @@ class LocalExchangeSinkOperatorX final : public DataSinkOperatorX _partitioner; const std::map _bucket_seq_to_instance_idx; std::vector> _shuffle_idx_to_instance_idx; + bool _use_global_shuffle = false; }; } // namespace doris::pipeline diff --git a/be/src/pipeline/local_exchange/local_exchange_source_operator.cpp b/be/src/pipeline/local_exchange/local_exchange_source_operator.cpp index 2d20b8f365cd7d..c4832b9958c00d 100644 --- a/be/src/pipeline/local_exchange/local_exchange_source_operator.cpp +++ b/be/src/pipeline/local_exchange/local_exchange_source_operator.cpp @@ -26,7 +26,7 @@ Status LocalExchangeSourceLocalState::init(RuntimeState* state, LocalStateInfo& SCOPED_TIMER(exec_time_counter()); SCOPED_TIMER(_init_timer); _channel_id = info.task_idx; - _shared_state->mem_trackers[_channel_id] = _mem_tracker.get(); + _shared_state->mem_counters[_channel_id] = _memory_used_counter; _exchanger = _shared_state->exchanger.get(); DCHECK(_exchanger != nullptr); _get_block_failed_counter = @@ -105,8 +105,8 @@ std::string LocalExchangeSourceLocalState::debug_string(int indentation_level) c _exchanger->data_queue_debug_string(_channel_id)); size_t i = 0; fmt::format_to(debug_string_buffer, ", MemTrackers: "); - for (auto* mem_tracker : _shared_state->mem_trackers) { - fmt::format_to(debug_string_buffer, "{}: {}, ", i, mem_tracker->consumption()); + for (auto* mem_counter : _shared_state->mem_counters) { + fmt::format_to(debug_string_buffer, "{}: {}, ", i, mem_counter->value()); i++; } return fmt::to_string(debug_string_buffer); diff --git a/be/src/pipeline/local_exchange/local_exchange_source_operator.h b/be/src/pipeline/local_exchange/local_exchange_source_operator.h index c0da5c8120c1e9..3c706d50182538 100644 --- a/be/src/pipeline/local_exchange/local_exchange_source_operator.h +++ b/be/src/pipeline/local_exchange/local_exchange_source_operator.h @@ -81,9 +81,6 @@ class LocalExchangeSourceOperatorX final : public OperatorX void Exchanger::_enqueue_data_and_set_ready(int channel_id, LocalExchangeSinkLocalState& local_state, @@ -118,8 +119,7 @@ Status ShuffleExchanger::sink(RuntimeState* state, vectorized::Block* in_block, } { SCOPED_TIMER(local_state._compute_hash_value_timer); - RETURN_IF_ERROR(local_state._partitioner->do_partitioning(state, in_block, - local_state.mem_tracker())); + RETURN_IF_ERROR(local_state._partitioner->do_partitioning(state, in_block)); } { SCOPED_TIMER(local_state._distribute_timer); @@ -171,11 +171,11 @@ Status ShuffleExchanger::get_block(RuntimeState* state, vectorized::Block* block Status ShuffleExchanger::_split_rows(RuntimeState* state, const uint32_t* __restrict channel_ids, vectorized::Block* block, LocalExchangeSinkLocalState& local_state) { - const auto rows = block->rows(); + const auto rows = cast_set(block->rows()); auto row_idx = std::make_shared>(rows); { local_state._partition_rows_histogram.assign(_num_partitions + 1, 0); - for (size_t i = 0; i < rows; ++i) { + for (int32_t i = 0; i < rows; ++i) { local_state._partition_rows_histogram[channel_ids[i]]++; } for (int32_t i = 1; i <= _num_partitions; ++i) { @@ -213,7 +213,7 @@ Status ShuffleExchanger::_split_rows(RuntimeState* state, const uint32_t* __rest */ const auto& map = 
local_state._parent->cast() ._shuffle_idx_to_instance_idx; - new_block_wrapper->ref(map.size()); + new_block_wrapper->ref(cast_set(map.size())); for (const auto& it : map) { DCHECK(it.second >= 0 && it.second < _num_partitions) << it.first << " : " << it.second << " " << _num_partitions; @@ -226,7 +226,7 @@ Status ShuffleExchanger::_split_rows(RuntimeState* state, const uint32_t* __rest new_block_wrapper->unref(local_state._shared_state, local_state._channel_id); } } - } else if (_num_senders != _num_sources || _ignore_source_data_distribution) { + } else if (_num_senders != _num_sources) { // In this branch, data just should be distributed equally into all instances. new_block_wrapper->ref(_num_partitions); for (size_t i = 0; i < _num_partitions; i++) { @@ -242,7 +242,7 @@ Status ShuffleExchanger::_split_rows(RuntimeState* state, const uint32_t* __rest } else { DCHECK(!bucket_seq_to_instance_idx.empty()); new_block_wrapper->ref(_num_partitions); - for (size_t i = 0; i < _num_partitions; i++) { + for (int i = 0; i < _num_partitions; i++) { uint32_t start = local_state._partition_rows_histogram[i]; uint32_t size = local_state._partition_rows_histogram[i + 1] - start; if (size > 0) { @@ -427,7 +427,7 @@ Status BroadcastExchanger::sink(RuntimeState* state, vectorized::Block* in_block local_state._shared_state->add_total_mem_usage(wrapper->data_block.allocated_bytes(), local_state._channel_id); wrapper->ref(_num_partitions); - for (size_t i = 0; i < _num_partitions; i++) { + for (int i = 0; i < _num_partitions; i++) { _enqueue_data_and_set_ready(i, local_state, {wrapper, {0, wrapper->data_block.rows()}}); } @@ -501,11 +501,11 @@ Status AdaptivePassthroughExchanger::_split_rows(RuntimeState* state, const uint32_t* __restrict channel_ids, vectorized::Block* block, LocalExchangeSinkLocalState& local_state) { - const auto rows = block->rows(); + const auto rows = cast_set(block->rows()); auto row_idx = std::make_shared>(rows); { local_state._partition_rows_histogram.assign(_num_partitions + 1, 0); - for (size_t i = 0; i < rows; ++i) { + for (int32_t i = 0; i < rows; ++i) { local_state._partition_rows_histogram[channel_ids[i]]++; } for (int32_t i = 1; i <= _num_partitions; ++i) { @@ -518,7 +518,7 @@ Status AdaptivePassthroughExchanger::_split_rows(RuntimeState* state, local_state._partition_rows_histogram[channel_ids[i]]--; } } - for (size_t i = 0; i < _num_partitions; i++) { + for (int32_t i = 0; i < _num_partitions; i++) { const size_t start = local_state._partition_rows_histogram[i]; const size_t size = local_state._partition_rows_histogram[i + 1] - start; if (size > 0) { diff --git a/be/src/pipeline/local_exchange/local_exchanger.h b/be/src/pipeline/local_exchange/local_exchanger.h index 01b55816ba8aad..bf052ac3b924ca 100644 --- a/be/src/pipeline/local_exchange/local_exchanger.h +++ b/be/src/pipeline/local_exchange/local_exchanger.h @@ -21,6 +21,7 @@ #include "pipeline/exec/operator.h" namespace doris::pipeline { +#include "common/compile_check_begin.h" class LocalExchangeSourceLocalState; class LocalExchangeSinkLocalState; @@ -217,24 +218,21 @@ class ShuffleExchanger : public Exchanger { protected: ShuffleExchanger(int running_sink_operators, int num_sources, int num_partitions, - bool ignore_source_data_distribution, int free_block_limit) + int free_block_limit) : Exchanger(running_sink_operators, num_sources, num_partitions, - free_block_limit), - _ignore_source_data_distribution(ignore_source_data_distribution) { + free_block_limit) { _data_queue.resize(num_partitions); } Status 
_split_rows(RuntimeState* state, const uint32_t* __restrict channel_ids, vectorized::Block* block, LocalExchangeSinkLocalState& local_state); - - const bool _ignore_source_data_distribution = false; }; class BucketShuffleExchanger final : public ShuffleExchanger { ENABLE_FACTORY_CREATOR(BucketShuffleExchanger); BucketShuffleExchanger(int running_sink_operators, int num_sources, int num_partitions, - bool ignore_source_data_distribution, int free_block_limit) + int free_block_limit) : ShuffleExchanger(running_sink_operators, num_sources, num_partitions, - ignore_source_data_distribution, free_block_limit) {} + free_block_limit) {} ~BucketShuffleExchanger() override = default; ExchangeType get_type() const override { return ExchangeType::BUCKET_HASH_SHUFFLE; } }; @@ -351,5 +349,5 @@ class AdaptivePassthroughExchanger : public Exchanger { std::atomic_bool _is_pass_through = false; std::atomic_int32_t _total_block = 0; }; - +#include "common/compile_check_end.h" } // namespace doris::pipeline diff --git a/be/src/pipeline/pipeline.cpp b/be/src/pipeline/pipeline.cpp index 74e15d7cc93ea1..96da754daa5d98 100644 --- a/be/src/pipeline/pipeline.cpp +++ b/be/src/pipeline/pipeline.cpp @@ -22,6 +22,8 @@ #include #include "pipeline/exec/operator.h" +#include "pipeline/pipeline_fragment_context.h" +#include "pipeline/pipeline_task.h" namespace doris::pipeline { @@ -30,7 +32,48 @@ void Pipeline::_init_profile() { _pipeline_profile = std::make_unique(std::move(s)); } -Status Pipeline::add_operator(OperatorPtr& op) { +bool Pipeline::need_to_local_exchange(const DataDistribution target_data_distribution, + const int idx) const { + // If serial operator exists after `idx`-th operator, we should not improve parallelism. + if (std::any_of(_operators.begin() + idx, _operators.end(), + [&](OperatorPtr op) -> bool { return op->is_serial_operator(); })) { + return false; + } + // If all operators are serial and sink is not serial, we should improve parallelism for sink. + if (std::all_of(_operators.begin(), _operators.end(), + [&](OperatorPtr op) -> bool { return op->is_serial_operator(); })) { + if (!_sink->is_serial_operator()) { + return true; + } + } else if (std::any_of(_operators.begin(), _operators.end(), + [&](OperatorPtr op) -> bool { return op->is_serial_operator(); })) { + // If non-serial operators exist, we should improve parallelism for those. + return true; + } + + if (target_data_distribution.distribution_type != ExchangeType::BUCKET_HASH_SHUFFLE && + target_data_distribution.distribution_type != ExchangeType::HASH_SHUFFLE) { + // Always do local exchange if non-hash-partition exchanger is required. + // For example, `PASSTHROUGH` exchanger is always required to distribute data evenly. + return true; + } else if (_operators.front()->is_serial_operator()) { + DCHECK(std::all_of(_operators.begin(), _operators.end(), + [&](OperatorPtr op) -> bool { return op->is_serial_operator(); }) && + _sink->is_serial_operator()) + << debug_string(); + // All operators and sink are serial in this path. 
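The branch structure built up here collapses the old two-variant check from pipeline.h into one rule set. Restated as a standalone predicate for clarity, with the `std::all_of`/`std::any_of` scans precomputed into flags (names here are hypothetical, not the real API):

```cpp
enum class ExchangeType { HASH_SHUFFLE, BUCKET_HASH_SHUFFLE, PASSTHROUGH, BROADCAST };

static bool is_hash_exchange(ExchangeType t) {
    return t == ExchangeType::HASH_SHUFFLE || t == ExchangeType::BUCKET_HASH_SHUFFLE;
}

// serial_after_idx: some operator at or after `idx` must run single-threaded.
// all_serial / any_serial / sink_serial: results of the all_of/any_of scans above.
static bool needs_local_exchange(ExchangeType current, ExchangeType target,
                                 bool serial_after_idx, bool all_serial, bool any_serial,
                                 bool sink_serial) {
    if (serial_after_idx) {
        return false; // extra parallelism would be wasted on a serial downstream operator
    }
    if (all_serial) {
        return !sink_serial; // only the sink can still benefit from more tasks
    }
    if (any_serial) {
        return true; // mixed pipeline: parallelize the non-serial part
    }
    if (!is_hash_exchange(target)) {
        return true; // e.g. PASSTHROUGH exists precisely to redistribute data evenly
    }
    // Hash distributions are mutually compatible; anything else needs a reshuffle.
    return current != target && !(is_hash_exchange(current) && is_hash_exchange(target));
}
```

The last line is the compatibility rule that survives from the old version: `HASH_SHUFFLE` and `BUCKET_HASH_SHUFFLE` count as interchangeable, so an already hash-distributed pipeline is never reshuffled merely to switch between the two.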
+ return false; + } else { + return _data_distribution.distribution_type != target_data_distribution.distribution_type && + !(is_hash_exchange(_data_distribution.distribution_type) && + is_hash_exchange(target_data_distribution.distribution_type)); + } +} + +Status Pipeline::add_operator(OperatorPtr& op, const int parallelism) { + if (parallelism > 0 && op->is_serial_operator()) { + set_num_tasks(parallelism); + } op->set_parallel_tasks(num_tasks()); _operators.emplace_back(op); if (op->is_source()) { @@ -65,4 +108,14 @@ Status Pipeline::set_sink(DataSinkOperatorPtr& sink) { return Status::OK(); } -} // namespace doris::pipeline \ No newline at end of file +void Pipeline::make_all_runnable() { + if (_sink->count_down_destination()) { + for (auto* task : _tasks) { + if (task) { + task->clear_blocking_state(true); + } + } + } +} + +} // namespace doris::pipeline diff --git a/be/src/pipeline/pipeline.h b/be/src/pipeline/pipeline.h index dfeb53ae006116..b969186b178bf7 100644 --- a/be/src/pipeline/pipeline.h +++ b/be/src/pipeline/pipeline.h @@ -25,12 +25,13 @@ #include #include +#include "common/cast_set.h" #include "common/status.h" #include "pipeline/exec/operator.h" #include "util/runtime_profile.h" namespace doris::pipeline { - +#include "common/compile_check_begin.h" class PipelineFragmentContext; class Pipeline; @@ -43,14 +44,16 @@ class Pipeline : public std::enable_shared_from_this { friend class PipelineFragmentContext; public: - explicit Pipeline(PipelineId pipeline_id, int num_tasks, - std::weak_ptr context) - : _pipeline_id(pipeline_id), _num_tasks(num_tasks) { + explicit Pipeline(PipelineId pipeline_id, int num_tasks, int num_tasks_of_parent) + : _pipeline_id(pipeline_id), + _num_tasks(num_tasks), + _num_tasks_of_parent(num_tasks_of_parent) { _init_profile(); + _tasks.resize(_num_tasks, nullptr); } // Add operators for pipelineX - Status add_operator(OperatorPtr& op); + Status add_operator(OperatorPtr& op, const int parallelism); // prepare operators for pipelineX Status prepare(RuntimeState* state); @@ -70,28 +73,8 @@ class Pipeline : public std::enable_shared_from_this { return idx == ExchangeType::HASH_SHUFFLE || idx == ExchangeType::BUCKET_HASH_SHUFFLE; } - bool need_to_local_exchange(const DataDistribution target_data_distribution) const { - if (target_data_distribution.distribution_type != ExchangeType::BUCKET_HASH_SHUFFLE && - target_data_distribution.distribution_type != ExchangeType::HASH_SHUFFLE) { - return true; - } else if (_operators.front()->ignore_data_hash_distribution()) { - if (_data_distribution.distribution_type == - target_data_distribution.distribution_type && - (_data_distribution.partition_exprs.empty() || - target_data_distribution.partition_exprs.empty())) { - return true; - } - return _data_distribution.distribution_type != - target_data_distribution.distribution_type && - !(is_hash_exchange(_data_distribution.distribution_type) && - is_hash_exchange(target_data_distribution.distribution_type)); - } else { - return _data_distribution.distribution_type != - target_data_distribution.distribution_type && - !(is_hash_exchange(_data_distribution.distribution_type) && - is_hash_exchange(target_data_distribution.distribution_type)); - } - } + bool need_to_local_exchange(const DataDistribution target_data_distribution, + const int idx) const; void init_data_distribution() { set_data_distribution(_operators.front()->required_data_distribution()); } @@ -102,29 +85,52 @@ class Pipeline : public std::enable_shared_from_this { std::vector>& children() { return 
_children; } void set_children(std::shared_ptr child) { _children.push_back(child); } - void set_children(std::vector> children) { _children = children; } + void set_children(std::vector> children) { + _children = std::move(children); + } + + void incr_created_tasks(int i, PipelineTask* task) { + _num_tasks_created++; + _num_tasks_running++; + DCHECK_LT(i, _tasks.size()); + _tasks[i] = task; + } + + void make_all_runnable(); - void incr_created_tasks() { _num_tasks_created++; } void set_num_tasks(int num_tasks) { _num_tasks = num_tasks; + _tasks.resize(_num_tasks, nullptr); for (auto& op : _operators) { op->set_parallel_tasks(_num_tasks); } + +#ifndef NDEBUG + if (num_tasks > 1 && + std::any_of(_operators.begin(), _operators.end(), + [&](OperatorPtr op) -> bool { return op->is_serial_operator(); })) { + DCHECK(false) << debug_string(); + } +#endif } int num_tasks() const { return _num_tasks; } + bool close_task() { return _num_tasks_running.fetch_sub(1) == 1; } - std::string debug_string() { + std::string debug_string() const { fmt::memory_buffer debug_string_buffer; fmt::format_to(debug_string_buffer, "Pipeline [id: {}, _num_tasks: {}, _num_tasks_created: {}]", _pipeline_id, _num_tasks, _num_tasks_created); - for (size_t i = 0; i < _operators.size(); i++) { + for (int i = 0; i < _operators.size(); i++) { fmt::format_to(debug_string_buffer, "\n{}", _operators[i]->debug_string(i)); } - fmt::format_to(debug_string_buffer, "\n{}", _sink->debug_string(_operators.size())); + fmt::format_to(debug_string_buffer, "\n{}", + _sink->debug_string(cast_set(_operators.size()))); return fmt::to_string(debug_string_buffer); } + int num_tasks_of_parent() const { return _num_tasks_of_parent; } + private: void _init_profile(); @@ -158,6 +164,12 @@ class Pipeline : public std::enable_shared_from_this { int _num_tasks = 1; // How many tasks are already created? std::atomic _num_tasks_created = 0; + // How many tasks are already created and not finished? + std::atomic _num_tasks_running = 0; + // Tasks in this pipeline. + std::vector _tasks; + // Parallelism of parent pipeline. + const int _num_tasks_of_parent; }; - +#include "common/compile_check_end.h" } // namespace doris::pipeline diff --git a/be/src/pipeline/pipeline_fragment_context.cpp b/be/src/pipeline/pipeline_fragment_context.cpp index e99d8a17262e2e..d14a0d0c3cd4a7 100644 --- a/be/src/pipeline/pipeline_fragment_context.cpp +++ b/be/src/pipeline/pipeline_fragment_context.cpp @@ -114,8 +114,6 @@ namespace doris::pipeline { #include "common/compile_check_begin.h" -bvar::Adder g_pipeline_tasks_count("doris_pipeline_tasks_count"); - PipelineFragmentContext::PipelineFragmentContext( const TUniqueId& query_id, const int fragment_id, std::shared_ptr query_ctx, ExecEnv* exec_env, const std::function& call_back, @@ -146,6 +144,8 @@ PipelineFragmentContext::~PipelineFragmentContext() { runtime_state.reset(); } } + _dag.clear(); + _pip_id_to_pipeline.clear(); _pipelines.clear(); _sink.reset(); _root_op.reset(); @@ -214,8 +214,8 @@ void PipelineFragmentContext::cancel(const Status reason) { PipelinePtr PipelineFragmentContext::add_pipeline(PipelinePtr parent, int idx) { PipelineId id = _next_pipeline_id++; auto pipeline = std::make_shared( - id, _num_instances, - std::dynamic_pointer_cast(shared_from_this())); + id, parent ? std::min(parent->num_tasks(), _num_instances) : _num_instances, + parent ? 
parent->num_tasks() : _num_instances); if (idx >= 0) { _pipelines.insert(_pipelines.begin() + idx, pipeline); } else { @@ -252,11 +252,6 @@ Status PipelineFragmentContext::prepare(const doris::TPipelineFragmentParams& re auto* fragment_context = this; - LOG_INFO("PipelineFragmentContext::prepare") - .tag("query_id", print_id(_query_id)) - .tag("fragment_id", _fragment_id) - .tag("pthread_id", (uintptr_t)pthread_self()); - if (request.query_options.__isset.is_report_success) { fragment_context->set_is_report_success(request.query_options.is_report_success); } @@ -365,10 +360,11 @@ Status PipelineFragmentContext::_build_pipeline_tasks(const doris::TPipelineFrag _task_runtime_states.resize(_pipelines.size()); for (size_t pip_idx = 0; pip_idx < _pipelines.size(); pip_idx++) { _task_runtime_states[pip_idx].resize(_pipelines[pip_idx]->num_tasks()); + _pip_id_to_pipeline[_pipelines[pip_idx]->id()] = _pipelines[pip_idx].get(); } auto pipeline_id_to_profile = _runtime_state->build_pipeline_profile(_pipelines.size()); - auto pre_and_submit = [&](int64_t i, PipelineFragmentContext* ctx) { + auto pre_and_submit = [&](int i, PipelineFragmentContext* ctx) { const auto& local_params = request.local_params[i]; auto fragment_instance_id = local_params.fragment_instance_id; _fragment_instance_ids[i] = fragment_instance_id; @@ -469,6 +465,7 @@ Status PipelineFragmentContext::_build_pipeline_tasks(const doris::TPipelineFrag task_runtime_state.get(), this, pipeline_id_to_profile[pip_idx].get(), get_local_exchange_state(pipeline), i); + pipeline->incr_created_tasks(i, task.get()); task_runtime_state->set_task(task.get()); pipeline_id_to_task.insert({pipeline->id(), task.get()}); _tasks[i].emplace_back(std::move(task)); @@ -536,7 +533,7 @@ Status PipelineFragmentContext::_build_pipeline_tasks(const doris::TPipelineFrag std::mutex m; std::condition_variable cv; int prepare_done = 0; - for (size_t i = 0; i < target_size; i++) { + for (int i = 0; i < target_size; i++) { RETURN_IF_ERROR(thread_pool->submit_func([&, i]() { SCOPED_ATTACH_TASK(_query_ctx.get()); prepare_status[i] = pre_and_submit(i, this); @@ -550,19 +547,18 @@ Status PipelineFragmentContext::_build_pipeline_tasks(const doris::TPipelineFrag std::unique_lock lock(m); if (prepare_done != target_size) { cv.wait(lock); - for (size_t i = 0; i < target_size; i++) { + for (int i = 0; i < target_size; i++) { if (!prepare_status[i].ok()) { return prepare_status[i]; } } } } else { - for (size_t i = 0; i < target_size; i++) { + for (int i = 0; i < target_size; i++) { RETURN_IF_ERROR(pre_and_submit(i, this)); } } _pipeline_parent_map.clear(); - _dag.clear(); _op_id_to_le_state.clear(); return Status::OK(); @@ -571,10 +567,7 @@ Status PipelineFragmentContext::_build_pipeline_tasks(const doris::TPipelineFrag void PipelineFragmentContext::_init_next_report_time() { auto interval_s = config::pipeline_status_report_interval; if (_is_report_success && interval_s > 0 && _timeout > interval_s) { - std::vector ins_ids; - instance_ids(ins_ids); - VLOG_FILE << "enable period report: instance_id=" - << fmt::format("{}", fmt::join(ins_ids, ", ")); + VLOG_FILE << "enable period report: fragment id=" << _fragment_id; uint64_t report_fragment_offset = (uint64_t)(rand() % interval_s) * NANOS_PER_SEC; // We don't want to wait longer than it takes to run the entire fragment. 
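A few hunks up, `add_pipeline` stops giving every new pipeline `_num_instances` tasks: a child now inherits `min(parent->num_tasks(), _num_instances)` and records the parent's parallelism, which is what lets `_add_local_exchange` bail out early when `num_tasks_of_parent() <= 1`. A minimal sketch of just that rule, using a stub struct rather than the real class:

```cpp
#include <algorithm>

struct PipelineStub {
    int num_tasks;           // parallelism of this pipeline
    int num_tasks_of_parent; // parallelism of the consuming (parent) pipeline
};

PipelineStub make_child_pipeline(const PipelineStub* parent, int num_instances) {
    // A child never starts with more tasks than its parent can consume, so a serial
    // parent keeps its children serial until a local exchange raises parallelism again.
    return {parent ? std::min(parent->num_tasks, num_instances) : num_instances,
            parent ? parent->num_tasks : num_instances};
}
```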
_previous_report_time = @@ -612,11 +605,9 @@ void PipelineFragmentContext::trigger_report_if_necessary() { return; } if (VLOG_FILE_IS_ON) { - std::vector ins_ids; - instance_ids(ins_ids); VLOG_FILE << "Reporting " << "profile for query_id " << print_id(_query_id) - << ", instance ids: " << fmt::format("{}", fmt::join(ins_ids, ", ")); + << ", fragment id: " << _fragment_id; std::stringstream ss; _runtime_state->runtime_profile()->compute_time_in_profile(); @@ -664,7 +655,7 @@ Status PipelineFragmentContext::_create_tree_helper(ObjectPool* pool, const DescriptorTbl& descs, OperatorPtr parent, int* node_idx, OperatorPtr* root, PipelinePtr& cur_pipe, int child_idx, - const bool followed_by_shuffled_join) { + const bool followed_by_shuffled_operator) { // propagate error case if (*node_idx >= tnodes.size()) { return Status::InternalError( @@ -674,11 +665,11 @@ Status PipelineFragmentContext::_create_tree_helper(ObjectPool* pool, const TPlanNode& tnode = tnodes[*node_idx]; int num_children = tnodes[*node_idx].num_children; - bool current_followed_by_shuffled_join = followed_by_shuffled_join; + bool current_followed_by_shuffled_operator = followed_by_shuffled_operator; OperatorPtr op = nullptr; RETURN_IF_ERROR(_create_operator(pool, tnodes[*node_idx], request, descs, op, cur_pipe, parent == nullptr ? -1 : parent->node_id(), child_idx, - followed_by_shuffled_join)); + followed_by_shuffled_operator)); // assert(parent != nullptr || (node_idx == 0 && root_expr != nullptr)); if (parent != nullptr) { @@ -688,7 +679,7 @@ Status PipelineFragmentContext::_create_tree_helper(ObjectPool* pool, *root = op; } /** - * `ExchangeType::HASH_SHUFFLE` should be used if an operator is followed by a shuffled hash join. + * `ExchangeType::HASH_SHUFFLE` should be used if an operator is followed by a shuffled operator (shuffled hash join, union operator followed by co-located operators). * * For plan: * LocalExchange(id=0) -> Aggregation(id=1) -> ShuffledHashJoin(id=2) @@ -701,15 +692,18 @@ Status PipelineFragmentContext::_create_tree_helper(ObjectPool* pool, auto require_shuffled_data_distribution = cur_pipe->operators().empty() ? 
cur_pipe->sink()->require_shuffled_data_distribution() : op->require_shuffled_data_distribution(); - current_followed_by_shuffled_join = - (followed_by_shuffled_join || op->is_shuffled_hash_join()) && + current_followed_by_shuffled_operator = + (followed_by_shuffled_operator || op->is_shuffled_operator()) && require_shuffled_data_distribution; + if (num_children == 0) { + _use_serial_source = op->is_serial_operator(); + } // rely on that tnodes is preorder of the plan for (int i = 0; i < num_children; i++) { ++*node_idx; RETURN_IF_ERROR(_create_tree_helper(pool, tnodes, request, descs, op, node_idx, nullptr, - cur_pipe, i, current_followed_by_shuffled_join)); + cur_pipe, i, current_followed_by_shuffled_operator)); // we are expecting a child, but have used all nodes // this means we have been given a bad tree and must fail @@ -737,8 +731,7 @@ Status PipelineFragmentContext::_add_local_exchange_impl( int idx, ObjectPool* pool, PipelinePtr cur_pipe, PipelinePtr new_pip, DataDistribution data_distribution, bool* do_local_exchange, int num_buckets, const std::map& bucket_seq_to_instance_idx, - const std::map& shuffle_idx_to_instance_idx, - const bool ignore_data_hash_distribution) { + const std::map& shuffle_idx_to_instance_idx) { auto& operators = cur_pipe->operators(); const auto downstream_pipeline_id = cur_pipe->id(); auto local_exchange_id = next_operator_id(); @@ -750,16 +743,15 @@ Status PipelineFragmentContext::_add_local_exchange_impl( * `bucket_seq_to_instance_idx` is empty if no scan operator is contained in this fragment. * So co-located operators(e.g. Agg, Analytic) should use `HASH_SHUFFLE` instead of `BUCKET_HASH_SHUFFLE`. */ - const bool followed_by_shuffled_join = operators.size() > idx - ? operators[idx]->followed_by_shuffled_join() - : cur_pipe->sink()->followed_by_shuffled_join(); - const bool should_disable_bucket_shuffle = + const bool followed_by_shuffled_operator = + operators.size() > idx ? operators[idx]->followed_by_shuffled_operator() + : cur_pipe->sink()->followed_by_shuffled_operator(); + const bool use_global_hash_shuffle = bucket_seq_to_instance_idx.empty() && shuffle_idx_to_instance_idx.find(-1) == shuffle_idx_to_instance_idx.end() && - followed_by_shuffled_join; + followed_by_shuffled_operator && !_use_serial_source; sink.reset(new LocalExchangeSinkOperatorX( - sink_id, local_exchange_id, - should_disable_bucket_shuffle ? _total_instances : _num_instances, + sink_id, local_exchange_id, use_global_hash_shuffle ? _total_instances : _num_instances, data_distribution.partition_exprs, bucket_seq_to_instance_idx)); if (bucket_seq_to_instance_idx.empty() && data_distribution.distribution_type == ExchangeType::BUCKET_HASH_SHUFFLE) { @@ -767,8 +759,7 @@ Status PipelineFragmentContext::_add_local_exchange_impl( } RETURN_IF_ERROR(new_pip->set_sink(sink)); RETURN_IF_ERROR(new_pip->sink()->init(data_distribution.distribution_type, num_buckets, - should_disable_bucket_shuffle, - shuffle_idx_to_instance_idx)); + use_global_hash_shuffle, shuffle_idx_to_instance_idx)); // 2. Create and initialize LocalExchangeSharedState. std::shared_ptr shared_state = @@ -779,31 +770,34 @@ Status PipelineFragmentContext::_add_local_exchange_impl( case ExchangeType::HASH_SHUFFLE: shared_state->exchanger = ShuffleExchanger::create_unique( std::max(cur_pipe->num_tasks(), _num_instances), - should_disable_bucket_shuffle ? _total_instances : _num_instances, + use_global_hash_shuffle ? _total_instances : _num_instances, _runtime_state->query_options().__isset.local_exchange_free_blocks_limit - ? 
_runtime_state->query_options().local_exchange_free_blocks_limit + ? cast_set( + _runtime_state->query_options().local_exchange_free_blocks_limit) : 0); break; case ExchangeType::BUCKET_HASH_SHUFFLE: shared_state->exchanger = BucketShuffleExchanger::create_unique( std::max(cur_pipe->num_tasks(), _num_instances), _num_instances, num_buckets, - ignore_data_hash_distribution, _runtime_state->query_options().__isset.local_exchange_free_blocks_limit - ? _runtime_state->query_options().local_exchange_free_blocks_limit + ? cast_set( + _runtime_state->query_options().local_exchange_free_blocks_limit) : 0); break; case ExchangeType::PASSTHROUGH: shared_state->exchanger = PassthroughExchanger::create_unique( cur_pipe->num_tasks(), _num_instances, _runtime_state->query_options().__isset.local_exchange_free_blocks_limit - ? _runtime_state->query_options().local_exchange_free_blocks_limit + ? cast_set( + _runtime_state->query_options().local_exchange_free_blocks_limit) : 0); break; case ExchangeType::BROADCAST: shared_state->exchanger = BroadcastExchanger::create_unique( cur_pipe->num_tasks(), _num_instances, _runtime_state->query_options().__isset.local_exchange_free_blocks_limit - ? _runtime_state->query_options().local_exchange_free_blocks_limit + ? cast_set( + _runtime_state->query_options().local_exchange_free_blocks_limit) : 0); break; case ExchangeType::PASS_TO_ONE: @@ -812,13 +806,15 @@ Status PipelineFragmentContext::_add_local_exchange_impl( shared_state->exchanger = PassToOneExchanger::create_unique( cur_pipe->num_tasks(), _num_instances, _runtime_state->query_options().__isset.local_exchange_free_blocks_limit - ? _runtime_state->query_options().local_exchange_free_blocks_limit + ? cast_set(_runtime_state->query_options() + .local_exchange_free_blocks_limit) : 0); } else { shared_state->exchanger = BroadcastExchanger::create_unique( cur_pipe->num_tasks(), _num_instances, _runtime_state->query_options().__isset.local_exchange_free_blocks_limit - ? _runtime_state->query_options().local_exchange_free_blocks_limit + ? cast_set(_runtime_state->query_options() + .local_exchange_free_blocks_limit) : 0); } break; @@ -833,7 +829,8 @@ Status PipelineFragmentContext::_add_local_exchange_impl( shared_state->exchanger = LocalMergeSortExchanger::create_unique( sort_source, cur_pipe->num_tasks(), _num_instances, _runtime_state->query_options().__isset.local_exchange_free_blocks_limit - ? _runtime_state->query_options().local_exchange_free_blocks_limit + ? cast_set( + _runtime_state->query_options().local_exchange_free_blocks_limit) : 0); break; } @@ -841,7 +838,8 @@ Status PipelineFragmentContext::_add_local_exchange_impl( shared_state->exchanger = AdaptivePassthroughExchanger::create_unique( cur_pipe->num_tasks(), _num_instances, _runtime_state->query_options().__isset.local_exchange_free_blocks_limit - ? _runtime_state->query_options().local_exchange_free_blocks_limit + ? 
cast_set( + _runtime_state->query_options().local_exchange_free_blocks_limit) : 0); break; default: @@ -917,13 +915,12 @@ Status PipelineFragmentContext::_add_local_exchange( int pip_idx, int idx, int node_id, ObjectPool* pool, PipelinePtr cur_pipe, DataDistribution data_distribution, bool* do_local_exchange, int num_buckets, const std::map& bucket_seq_to_instance_idx, - const std::map& shuffle_idx_to_instance_idx, - const bool ignore_data_distribution) { - if (_num_instances <= 1) { + const std::map& shuffle_idx_to_instance_idx) { + if (_num_instances <= 1 || cur_pipe->num_tasks_of_parent() <= 1) { return Status::OK(); } - if (!cur_pipe->need_to_local_exchange(data_distribution)) { + if (!cur_pipe->need_to_local_exchange(data_distribution, idx)) { return Status::OK(); } *do_local_exchange = true; @@ -933,7 +930,7 @@ Status PipelineFragmentContext::_add_local_exchange( auto new_pip = add_pipeline(cur_pipe, pip_idx + 1); RETURN_IF_ERROR(_add_local_exchange_impl( idx, pool, cur_pipe, new_pip, data_distribution, do_local_exchange, num_buckets, - bucket_seq_to_instance_idx, shuffle_idx_to_instance_idx, ignore_data_distribution)); + bucket_seq_to_instance_idx, shuffle_idx_to_instance_idx)); CHECK(total_op_num + 1 == cur_pipe->operators().size() + new_pip->operators().size()) << "total_op_num: " << total_op_num @@ -947,7 +944,7 @@ Status PipelineFragmentContext::_add_local_exchange( cast_set(new_pip->operators().size()), pool, new_pip, add_pipeline(new_pip, pip_idx + 2), DataDistribution(ExchangeType::PASSTHROUGH), do_local_exchange, num_buckets, bucket_seq_to_instance_idx, - shuffle_idx_to_instance_idx, ignore_data_distribution)); + shuffle_idx_to_instance_idx)); } return Status::OK(); } @@ -973,13 +970,8 @@ Status PipelineFragmentContext::_plan_local_exchange( // scan node. so here use `_num_instance` to replace the `num_buckets` to prevent dividing 0 // still keep colocate plan after local shuffle RETURN_IF_ERROR(_plan_local_exchange( - _pipelines[pip_idx]->operators().front()->ignore_data_hash_distribution() || - num_buckets == 0 - ? _num_instances - : num_buckets, - pip_idx, _pipelines[pip_idx], bucket_seq_to_instance_idx, - shuffle_idx_to_instance_idx, - _pipelines[pip_idx]->operators().front()->ignore_data_hash_distribution())); + _use_serial_source || num_buckets == 0 ? 
_num_instances : num_buckets, pip_idx, + _pipelines[pip_idx], bucket_seq_to_instance_idx, shuffle_idx_to_instance_idx)); } return Status::OK(); } @@ -987,8 +979,7 @@ Status PipelineFragmentContext::_plan_local_exchange( Status PipelineFragmentContext::_plan_local_exchange( int num_buckets, int pip_idx, PipelinePtr pip, const std::map& bucket_seq_to_instance_idx, - const std::map& shuffle_idx_to_instance_idx, - const bool ignore_data_hash_distribution) { + const std::map& shuffle_idx_to_instance_idx) { int idx = 1; bool do_local_exchange = false; do { @@ -1000,8 +991,7 @@ Status PipelineFragmentContext::_plan_local_exchange( RETURN_IF_ERROR(_add_local_exchange( pip_idx, idx, ops[idx]->node_id(), _runtime_state->obj_pool(), pip, ops[idx]->required_data_distribution(), &do_local_exchange, num_buckets, - bucket_seq_to_instance_idx, shuffle_idx_to_instance_idx, - ignore_data_hash_distribution)); + bucket_seq_to_instance_idx, shuffle_idx_to_instance_idx)); } if (do_local_exchange) { // If local exchange is needed for current operator, we will split this pipeline to @@ -1018,8 +1008,7 @@ Status PipelineFragmentContext::_plan_local_exchange( RETURN_IF_ERROR(_add_local_exchange( pip_idx, idx, pip->sink()->node_id(), _runtime_state->obj_pool(), pip, pip->sink()->required_data_distribution(), &do_local_exchange, num_buckets, - bucket_seq_to_instance_idx, shuffle_idx_to_instance_idx, - ignore_data_hash_distribution)); + bucket_seq_to_instance_idx, shuffle_idx_to_instance_idx)); } return Status::OK(); } @@ -1139,8 +1128,7 @@ Status PipelineFragmentContext::_create_data_sink(ObjectPool* pool, const TDataS } _sink.reset(new MultiCastDataStreamSinkOperatorX( - sink_id, sources, cast_set(thrift_sink.multi_cast_stream_sink.sinks.size()), - pool, thrift_sink.multi_cast_stream_sink, row_desc)); + sink_id, sources, pool, thrift_sink.multi_cast_stream_sink, row_desc)); for (int i = 0; i < sender_size; ++i) { auto new_pipeline = add_pipeline(); RowDescriptor* _row_desc = nullptr; @@ -1159,7 +1147,8 @@ Status PipelineFragmentContext::_create_data_sink(ObjectPool* pool, const TDataS // 1. create and set the source operator of multi_cast_data_stream_source for new pipeline source_op.reset(new MultiCastDataStreamerSourceOperatorX( i, pool, thrift_sink.multi_cast_stream_sink.sinks[i], row_desc, source_id)); - RETURN_IF_ERROR(new_pipeline->add_operator(source_op)); + RETURN_IF_ERROR(new_pipeline->add_operator( + source_op, params.__isset.parallel_instances ? params.parallel_instances : 0)); // 2. create and set sink operator of data stream sender for new pipeline DataSinkOperatorPtr sink_op; @@ -1196,23 +1185,22 @@ Status PipelineFragmentContext::_create_operator(ObjectPool* pool, const TPlanNo const DescriptorTbl& descs, OperatorPtr& op, PipelinePtr& cur_pipe, int parent_idx, int child_idx, - const bool followed_by_shuffled_join) { + const bool followed_by_shuffled_operator) { // We directly construct the operator from Thrift because the given array is in the order of preorder traversal. // Therefore, here we need to use a stack-like structure. _pipeline_parent_map.pop(cur_pipe, parent_idx, child_idx); std::stringstream error_msg; bool enable_query_cache = request.fragment.__isset.query_cache_param; + bool fe_with_old_version = false; switch (tnode.node_type) { case TPlanNodeType::OLAP_SCAN_NODE: { op.reset(new OlapScanOperatorX( pool, tnode, next_operator_id(), descs, _num_instances, enable_query_cache ? 
request.fragment.query_cache_param : TQueryCacheParam {})); - RETURN_IF_ERROR(cur_pipe->add_operator(op)); - if (request.__isset.parallel_instances) { - cur_pipe->set_num_tasks(request.parallel_instances); - op->set_ignore_data_distribution(); - } + RETURN_IF_ERROR(cur_pipe->add_operator( + op, request.__isset.parallel_instances ? request.parallel_instances : 0)); + fe_with_old_version = !tnode.__isset.is_serial_operator; break; } case TPlanNodeType::GROUP_COMMIT_SCAN_NODE: { @@ -1221,56 +1209,46 @@ Status PipelineFragmentContext::_create_operator(ObjectPool* pool, const TPlanNo _query_ctx->query_mem_tracker->is_group_commit_load = true; #endif op.reset(new GroupCommitOperatorX(pool, tnode, next_operator_id(), descs, _num_instances)); - RETURN_IF_ERROR(cur_pipe->add_operator(op)); - if (request.__isset.parallel_instances) { - cur_pipe->set_num_tasks(request.parallel_instances); - op->set_ignore_data_distribution(); - } + RETURN_IF_ERROR(cur_pipe->add_operator( + op, request.__isset.parallel_instances ? request.parallel_instances : 0)); + fe_with_old_version = !tnode.__isset.is_serial_operator; break; } case doris::TPlanNodeType::JDBC_SCAN_NODE: { if (config::enable_java_support) { op.reset(new JDBCScanOperatorX(pool, tnode, next_operator_id(), descs, _num_instances)); - RETURN_IF_ERROR(cur_pipe->add_operator(op)); + RETURN_IF_ERROR(cur_pipe->add_operator( + op, request.__isset.parallel_instances ? request.parallel_instances : 0)); } else { return Status::InternalError( "Jdbc scan node is disabled, you can change be config enable_java_support " "to true and restart be."); } - if (request.__isset.parallel_instances) { - cur_pipe->set_num_tasks(request.parallel_instances); - op->set_ignore_data_distribution(); - } + fe_with_old_version = !tnode.__isset.is_serial_operator; break; } case doris::TPlanNodeType::FILE_SCAN_NODE: { op.reset(new FileScanOperatorX(pool, tnode, next_operator_id(), descs, _num_instances)); - RETURN_IF_ERROR(cur_pipe->add_operator(op)); - if (request.__isset.parallel_instances) { - cur_pipe->set_num_tasks(request.parallel_instances); - op->set_ignore_data_distribution(); - } + RETURN_IF_ERROR(cur_pipe->add_operator( + op, request.__isset.parallel_instances ? request.parallel_instances : 0)); + fe_with_old_version = !tnode.__isset.is_serial_operator; break; } case TPlanNodeType::ES_SCAN_NODE: case TPlanNodeType::ES_HTTP_SCAN_NODE: { op.reset(new EsScanOperatorX(pool, tnode, next_operator_id(), descs, _num_instances)); - RETURN_IF_ERROR(cur_pipe->add_operator(op)); - if (request.__isset.parallel_instances) { - cur_pipe->set_num_tasks(request.parallel_instances); - op->set_ignore_data_distribution(); - } + RETURN_IF_ERROR(cur_pipe->add_operator( + op, request.__isset.parallel_instances ? request.parallel_instances : 0)); + fe_with_old_version = !tnode.__isset.is_serial_operator; break; } case TPlanNodeType::EXCHANGE_NODE: { int num_senders = find_with_default(request.per_exch_num_senders, tnode.node_id, 0); DCHECK_GT(num_senders, 0); op.reset(new ExchangeSourceOperatorX(pool, tnode, next_operator_id(), descs, num_senders)); - RETURN_IF_ERROR(cur_pipe->add_operator(op)); - if (request.__isset.parallel_instances) { - op->set_ignore_data_distribution(); - cur_pipe->set_num_tasks(request.parallel_instances); - } + RETURN_IF_ERROR(cur_pipe->add_operator( + op, request.__isset.parallel_instances ? 
request.parallel_instances : 0)); + fe_with_old_version = !tnode.__isset.is_serial_operator; break; } case TPlanNodeType::AGGREGATION_NODE: { @@ -1285,7 +1263,8 @@ Status PipelineFragmentContext::_create_operator(ObjectPool* pool, const TPlanNo auto cache_source_id = next_operator_id(); op.reset(new CacheSourceOperatorX(pool, cache_node_id, cache_source_id, request.fragment.query_cache_param)); - RETURN_IF_ERROR(cur_pipe->add_operator(op)); + RETURN_IF_ERROR(cur_pipe->add_operator( + op, request.__isset.parallel_instances ? request.parallel_instances : 0)); const auto downstream_pipeline_id = cur_pipe->id(); if (_dag.find(downstream_pipeline_id) == _dag.end()) { @@ -1318,18 +1297,20 @@ Status PipelineFragmentContext::_create_operator(ObjectPool* pool, const TPlanNo op.reset(new DistinctStreamingAggOperatorX(pool, next_operator_id(), tnode, descs, _require_bucket_distribution)); - op->set_followed_by_shuffled_join(false); + op->set_followed_by_shuffled_operator(false); _require_bucket_distribution = true; - RETURN_IF_ERROR(new_pipe->add_operator(op)); + RETURN_IF_ERROR(new_pipe->add_operator( + op, request.__isset.parallel_instances ? request.parallel_instances : 0)); RETURN_IF_ERROR(cur_pipe->operators().front()->set_child(op)); cur_pipe = new_pipe; } else { op.reset(new DistinctStreamingAggOperatorX(pool, next_operator_id(), tnode, descs, _require_bucket_distribution)); - op->set_followed_by_shuffled_join(followed_by_shuffled_join); + op->set_followed_by_shuffled_operator(followed_by_shuffled_operator); _require_bucket_distribution = _require_bucket_distribution || op->require_data_distribution(); - RETURN_IF_ERROR(cur_pipe->add_operator(op)); + RETURN_IF_ERROR(cur_pipe->add_operator( + op, request.__isset.parallel_instances ? request.parallel_instances : 0)); } } else if (tnode.agg_node.__isset.use_streaming_preaggregation && tnode.agg_node.use_streaming_preaggregation && @@ -1340,11 +1321,13 @@ Status PipelineFragmentContext::_create_operator(ObjectPool* pool, const TPlanNo op.reset(new StreamingAggOperatorX(pool, next_operator_id(), tnode, descs)); RETURN_IF_ERROR(cur_pipe->operators().front()->set_child(op)); - RETURN_IF_ERROR(new_pipe->add_operator(op)); + RETURN_IF_ERROR(new_pipe->add_operator( + op, request.__isset.parallel_instances ? request.parallel_instances : 0)); cur_pipe = new_pipe; } else { op.reset(new StreamingAggOperatorX(pool, next_operator_id(), tnode, descs)); - RETURN_IF_ERROR(cur_pipe->add_operator(op)); + RETURN_IF_ERROR(cur_pipe->add_operator( + op, request.__isset.parallel_instances ? request.parallel_instances : 0)); } } else { // create new pipeline to add query cache operator @@ -1360,10 +1343,12 @@ Status PipelineFragmentContext::_create_operator(ObjectPool* pool, const TPlanNo } if (enable_query_cache) { RETURN_IF_ERROR(cur_pipe->operators().front()->set_child(op)); - RETURN_IF_ERROR(new_pipe->add_operator(op)); + RETURN_IF_ERROR(new_pipe->add_operator( + op, request.__isset.parallel_instances ? request.parallel_instances : 0)); cur_pipe = new_pipe; } else { - RETURN_IF_ERROR(cur_pipe->add_operator(op)); + RETURN_IF_ERROR(cur_pipe->add_operator( + op, request.__isset.parallel_instances ? 
request.parallel_instances : 0)); } const auto downstream_pipeline_id = cur_pipe->id(); @@ -1381,7 +1366,7 @@ Status PipelineFragmentContext::_create_operator(ObjectPool* pool, const TPlanNo sink.reset(new AggSinkOperatorX(pool, next_sink_operator_id(), tnode, descs, _require_bucket_distribution)); } - sink->set_followed_by_shuffled_join(followed_by_shuffled_join); + sink->set_followed_by_shuffled_operator(followed_by_shuffled_operator); _require_bucket_distribution = _require_bucket_distribution || sink->require_data_distribution(); sink->set_dests_id({op->operator_id()}); @@ -1411,7 +1396,8 @@ Status PipelineFragmentContext::_create_operator(ObjectPool* pool, const TPlanNo pool, tnode_, next_operator_id(), descs, partition_count); probe_operator->set_inner_operators(inner_sink_operator, inner_probe_operator); op = std::move(probe_operator); - RETURN_IF_ERROR(cur_pipe->add_operator(op)); + RETURN_IF_ERROR(cur_pipe->add_operator( + op, request.__isset.parallel_instances ? request.parallel_instances : 0)); const auto downstream_pipeline_id = cur_pipe->id(); if (_dag.find(downstream_pipeline_id) == _dag.end()) { @@ -1431,11 +1417,12 @@ Status PipelineFragmentContext::_create_operator(ObjectPool* pool, const TPlanNo _pipeline_parent_map.push(op->node_id(), cur_pipe); _pipeline_parent_map.push(op->node_id(), build_side_pipe); - sink->set_followed_by_shuffled_join(sink->is_shuffled_hash_join()); - op->set_followed_by_shuffled_join(op->is_shuffled_hash_join()); + sink->set_followed_by_shuffled_operator(sink->is_shuffled_operator()); + op->set_followed_by_shuffled_operator(op->is_shuffled_operator()); } else { op.reset(new HashJoinProbeOperatorX(pool, tnode, next_operator_id(), descs)); - RETURN_IF_ERROR(cur_pipe->add_operator(op)); + RETURN_IF_ERROR(cur_pipe->add_operator( + op, request.__isset.parallel_instances ? request.parallel_instances : 0)); const auto downstream_pipeline_id = cur_pipe->id(); if (_dag.find(downstream_pipeline_id) == _dag.end()) { @@ -1453,8 +1440,8 @@ Status PipelineFragmentContext::_create_operator(ObjectPool* pool, const TPlanNo _pipeline_parent_map.push(op->node_id(), cur_pipe); _pipeline_parent_map.push(op->node_id(), build_side_pipe); - sink->set_followed_by_shuffled_join(sink->is_shuffled_hash_join()); - op->set_followed_by_shuffled_join(op->is_shuffled_hash_join()); + sink->set_followed_by_shuffled_operator(sink->is_shuffled_operator()); + op->set_followed_by_shuffled_operator(op->is_shuffled_operator()); } _require_bucket_distribution = _require_bucket_distribution || op->require_data_distribution(); @@ -1462,7 +1449,8 @@ Status PipelineFragmentContext::_create_operator(ObjectPool* pool, const TPlanNo } case TPlanNodeType::CROSS_JOIN_NODE: { op.reset(new NestedLoopJoinProbeOperatorX(pool, tnode, next_operator_id(), descs)); - RETURN_IF_ERROR(cur_pipe->add_operator(op)); + RETURN_IF_ERROR(cur_pipe->add_operator( + op, request.__isset.parallel_instances ? request.parallel_instances : 0)); const auto downstream_pipeline_id = cur_pipe->id(); if (_dag.find(downstream_pipeline_id) == _dag.end()) { @@ -1484,7 +1472,9 @@ Status PipelineFragmentContext::_create_operator(ObjectPool* pool, const TPlanNo case TPlanNodeType::UNION_NODE: { int child_count = tnode.num_children; op.reset(new UnionSourceOperatorX(pool, tnode, next_operator_id(), descs)); - RETURN_IF_ERROR(cur_pipe->add_operator(op)); + op->set_followed_by_shuffled_operator(_require_bucket_distribution); + RETURN_IF_ERROR(cur_pipe->add_operator( + op, request.__isset.parallel_instances ? 
request.parallel_instances : 0)); const auto downstream_pipeline_id = cur_pipe->id(); if (_dag.find(downstream_pipeline_id) == _dag.end()) { @@ -1495,6 +1485,7 @@ Status PipelineFragmentContext::_create_operator(ObjectPool* pool, const TPlanNo _dag[downstream_pipeline_id].push_back(build_side_pipe->id()); DataSinkOperatorPtr sink; sink.reset(new UnionSinkOperatorX(i, next_sink_operator_id(), pool, tnode, descs)); + sink->set_followed_by_shuffled_operator(_require_bucket_distribution); sink->set_dests_id({op->operator_id()}); RETURN_IF_ERROR(build_side_pipe->set_sink(sink)); RETURN_IF_ERROR(build_side_pipe->sink()->init(tnode, _runtime_state.get())); @@ -1511,7 +1502,8 @@ Status PipelineFragmentContext::_create_operator(ObjectPool* pool, const TPlanNo } else { op.reset(new SortSourceOperatorX(pool, tnode, next_operator_id(), descs)); } - RETURN_IF_ERROR(cur_pipe->add_operator(op)); + RETURN_IF_ERROR(cur_pipe->add_operator( + op, request.__isset.parallel_instances ? request.parallel_instances : 0)); const auto downstream_pipeline_id = cur_pipe->id(); if (_dag.find(downstream_pipeline_id) == _dag.end()) { @@ -1528,7 +1520,7 @@ Status PipelineFragmentContext::_create_operator(ObjectPool* pool, const TPlanNo sink.reset(new SortSinkOperatorX(pool, next_sink_operator_id(), tnode, descs, _require_bucket_distribution)); } - sink->set_followed_by_shuffled_join(followed_by_shuffled_join); + sink->set_followed_by_shuffled_operator(followed_by_shuffled_operator); _require_bucket_distribution = _require_bucket_distribution || sink->require_data_distribution(); sink->set_dests_id({op->operator_id()}); @@ -1538,7 +1530,8 @@ Status PipelineFragmentContext::_create_operator(ObjectPool* pool, const TPlanNo } case doris::TPlanNodeType::PARTITION_SORT_NODE: { op.reset(new PartitionSortSourceOperatorX(pool, tnode, next_operator_id(), descs)); - RETURN_IF_ERROR(cur_pipe->add_operator(op)); + RETURN_IF_ERROR(cur_pipe->add_operator( + op, request.__isset.parallel_instances ? request.parallel_instances : 0)); const auto downstream_pipeline_id = cur_pipe->id(); if (_dag.find(downstream_pipeline_id) == _dag.end()) { @@ -1556,7 +1549,8 @@ Status PipelineFragmentContext::_create_operator(ObjectPool* pool, const TPlanNo } case TPlanNodeType::ANALYTIC_EVAL_NODE: { op.reset(new AnalyticSourceOperatorX(pool, tnode, next_operator_id(), descs)); - RETURN_IF_ERROR(cur_pipe->add_operator(op)); + RETURN_IF_ERROR(cur_pipe->add_operator( + op, request.__isset.parallel_instances ? 
request.parallel_instances : 0)); const auto downstream_pipeline_id = cur_pipe->id(); if (_dag.find(downstream_pipeline_id) == _dag.end()) { @@ -1568,7 +1562,7 @@ Status PipelineFragmentContext::_create_operator(ObjectPool* pool, const TPlanNo DataSinkOperatorPtr sink; sink.reset(new AnalyticSinkOperatorX(pool, next_sink_operator_id(), tnode, descs, _require_bucket_distribution)); - sink->set_followed_by_shuffled_join(followed_by_shuffled_join); + sink->set_followed_by_shuffled_operator(followed_by_shuffled_operator); _require_bucket_distribution = _require_bucket_distribution || sink->require_data_distribution(); sink->set_dests_id({op->operator_id()}); @@ -1578,62 +1572,73 @@ Status PipelineFragmentContext::_create_operator(ObjectPool* pool, const TPlanNo } case TPlanNodeType::INTERSECT_NODE: { RETURN_IF_ERROR(_build_operators_for_set_operation_node( - pool, tnode, descs, op, cur_pipe, parent_idx, child_idx)); + pool, tnode, descs, op, cur_pipe, parent_idx, child_idx, request)); + op->set_followed_by_shuffled_operator(_require_bucket_distribution); break; } case TPlanNodeType::EXCEPT_NODE: { RETURN_IF_ERROR(_build_operators_for_set_operation_node( - pool, tnode, descs, op, cur_pipe, parent_idx, child_idx)); + pool, tnode, descs, op, cur_pipe, parent_idx, child_idx, request)); + op->set_followed_by_shuffled_operator(_require_bucket_distribution); break; } case TPlanNodeType::REPEAT_NODE: { op.reset(new RepeatOperatorX(pool, tnode, next_operator_id(), descs)); - RETURN_IF_ERROR(cur_pipe->add_operator(op)); + RETURN_IF_ERROR(cur_pipe->add_operator( + op, request.__isset.parallel_instances ? request.parallel_instances : 0)); break; } case TPlanNodeType::TABLE_FUNCTION_NODE: { op.reset(new TableFunctionOperatorX(pool, tnode, next_operator_id(), descs)); - RETURN_IF_ERROR(cur_pipe->add_operator(op)); + RETURN_IF_ERROR(cur_pipe->add_operator( + op, request.__isset.parallel_instances ? request.parallel_instances : 0)); break; } case TPlanNodeType::ASSERT_NUM_ROWS_NODE: { op.reset(new AssertNumRowsOperatorX(pool, tnode, next_operator_id(), descs)); - RETURN_IF_ERROR(cur_pipe->add_operator(op)); + RETURN_IF_ERROR(cur_pipe->add_operator( + op, request.__isset.parallel_instances ? request.parallel_instances : 0)); break; } case TPlanNodeType::EMPTY_SET_NODE: { op.reset(new EmptySetSourceOperatorX(pool, tnode, next_operator_id(), descs)); - RETURN_IF_ERROR(cur_pipe->add_operator(op)); + RETURN_IF_ERROR(cur_pipe->add_operator( + op, request.__isset.parallel_instances ? request.parallel_instances : 0)); break; } case TPlanNodeType::DATA_GEN_SCAN_NODE: { op.reset(new DataGenSourceOperatorX(pool, tnode, next_operator_id(), descs)); - RETURN_IF_ERROR(cur_pipe->add_operator(op)); - if (request.__isset.parallel_instances) { - cur_pipe->set_num_tasks(request.parallel_instances); - op->set_ignore_data_distribution(); - } + RETURN_IF_ERROR(cur_pipe->add_operator( + op, request.__isset.parallel_instances ? request.parallel_instances : 0)); + fe_with_old_version = !tnode.__isset.is_serial_operator; break; } case TPlanNodeType::SCHEMA_SCAN_NODE: { op.reset(new SchemaScanOperatorX(pool, tnode, next_operator_id(), descs)); - RETURN_IF_ERROR(cur_pipe->add_operator(op)); + RETURN_IF_ERROR(cur_pipe->add_operator( + op, request.__isset.parallel_instances ? 
request.parallel_instances : 0)); break; } case TPlanNodeType::META_SCAN_NODE: { op.reset(new MetaScanOperatorX(pool, tnode, next_operator_id(), descs)); - RETURN_IF_ERROR(cur_pipe->add_operator(op)); + RETURN_IF_ERROR(cur_pipe->add_operator( + op, request.__isset.parallel_instances ? request.parallel_instances : 0)); break; } case TPlanNodeType::SELECT_NODE: { op.reset(new SelectOperatorX(pool, tnode, next_operator_id(), descs)); - RETURN_IF_ERROR(cur_pipe->add_operator(op)); + RETURN_IF_ERROR(cur_pipe->add_operator( + op, request.__isset.parallel_instances ? request.parallel_instances : 0)); break; } default: return Status::InternalError("Unsupported exec type in pipeline: {}", print_plan_node_type(tnode.node_type)); } + if (request.__isset.parallel_instances && fe_with_old_version) { + cur_pipe->set_num_tasks(request.parallel_instances); + op->set_serial_operator(); + } return Status::OK(); } @@ -1643,9 +1648,11 @@ Status PipelineFragmentContext::_create_operator(ObjectPool* pool, const TPlanNo template Status PipelineFragmentContext::_build_operators_for_set_operation_node( ObjectPool* pool, const TPlanNode& tnode, const DescriptorTbl& descs, OperatorPtr& op, - PipelinePtr& cur_pipe, int parent_idx, int child_idx) { + PipelinePtr& cur_pipe, int parent_idx, int child_idx, + const doris::TPipelineFragmentParams& request) { op.reset(new SetSourceOperatorX(pool, tnode, next_operator_id(), descs)); - RETURN_IF_ERROR(cur_pipe->add_operator(op)); + RETURN_IF_ERROR(cur_pipe->add_operator( + op, request.__isset.parallel_instances ? request.parallel_instances : 0)); const auto downstream_pipeline_id = cur_pipe->id(); if (_dag.find(downstream_pipeline_id) == _dag.end()) { @@ -1752,7 +1759,16 @@ void PipelineFragmentContext::_close_fragment_instance() { std::dynamic_pointer_cast(shared_from_this())); } -void PipelineFragmentContext::close_a_pipeline() { +void PipelineFragmentContext::close_a_pipeline(PipelineId pipeline_id) { + // If all tasks of this pipeline has been closed, upstream tasks is never needed, and we just make those runnable here + DCHECK(_pip_id_to_pipeline.contains(pipeline_id)); + if (_pip_id_to_pipeline[pipeline_id]->close_task()) { + if (_dag.contains(pipeline_id)) { + for (auto dep : _dag[pipeline_id]) { + _pip_id_to_pipeline[dep]->make_all_runnable(); + } + } + } std::lock_guard l(_task_mutex); ++_closed_tasks; if (_closed_tasks == _total_tasks) { diff --git a/be/src/pipeline/pipeline_fragment_context.h b/be/src/pipeline/pipeline_fragment_context.h index dcfcc2016199db..289f5c8236522f 100644 --- a/be/src/pipeline/pipeline_fragment_context.h +++ b/be/src/pipeline/pipeline_fragment_context.h @@ -100,7 +100,7 @@ class PipelineFragmentContext : public TaskExecutionContext { [[nodiscard]] int get_fragment_id() const { return _fragment_id; } - void close_a_pipeline(); + void close_a_pipeline(PipelineId pipeline_id); Status send_report(bool); @@ -115,27 +115,13 @@ class PipelineFragmentContext : public TaskExecutionContext { [[nodiscard]] int next_sink_operator_id() { return _sink_operator_id--; } - void instance_ids(std::vector& ins_ids) const { - ins_ids.resize(_fragment_instance_ids.size()); - for (size_t i = 0; i < _fragment_instance_ids.size(); i++) { - ins_ids[i] = _fragment_instance_ids[i]; - } - } - - void instance_ids(std::vector& ins_ids) const { - ins_ids.resize(_fragment_instance_ids.size()); - for (size_t i = 0; i < _fragment_instance_ids.size(); i++) { - ins_ids[i] = print_id(_fragment_instance_ids[i]); - } - } - void clear_finished_tasks() { for (size_t j = 0; j < 
_tasks.size(); j++) { for (size_t i = 0; i < _tasks[j].size(); i++) { _tasks[j][i]->stop_if_finished(); } } - }; + } private: Status _build_pipelines(ObjectPool* pool, const doris::TPipelineFragmentParams& request, @@ -154,7 +140,8 @@ class PipelineFragmentContext : public TaskExecutionContext { Status _build_operators_for_set_operation_node(ObjectPool* pool, const TPlanNode& tnode, const DescriptorTbl& descs, OperatorPtr& op, PipelinePtr& cur_pipe, int parent_idx, - int child_idx); + int child_idx, + const doris::TPipelineFragmentParams& request); Status _create_data_sink(ObjectPool* pool, const TDataSink& thrift_sink, const std::vector& output_exprs, @@ -166,22 +153,19 @@ class PipelineFragmentContext : public TaskExecutionContext { const std::map& shuffle_idx_to_instance_idx); Status _plan_local_exchange(int num_buckets, int pip_idx, PipelinePtr pip, const std::map& bucket_seq_to_instance_idx, - const std::map& shuffle_idx_to_instance_idx, - const bool ignore_data_distribution); + const std::map& shuffle_idx_to_instance_idx); void _inherit_pipeline_properties(const DataDistribution& data_distribution, PipelinePtr pipe_with_source, PipelinePtr pipe_with_sink); Status _add_local_exchange(int pip_idx, int idx, int node_id, ObjectPool* pool, PipelinePtr cur_pipe, DataDistribution data_distribution, bool* do_local_exchange, int num_buckets, const std::map& bucket_seq_to_instance_idx, - const std::map& shuffle_idx_to_instance_idx, - const bool ignore_data_distribution); + const std::map& shuffle_idx_to_instance_idx); Status _add_local_exchange_impl(int idx, ObjectPool* pool, PipelinePtr cur_pipe, PipelinePtr new_pip, DataDistribution data_distribution, bool* do_local_exchange, int num_buckets, const std::map& bucket_seq_to_instance_idx, - const std::map& shuffle_idx_to_instance_idx, - const bool ignore_data_hash_distribution); + const std::map& shuffle_idx_to_instance_idx); Status _build_pipeline_tasks(const doris::TPipelineFragmentParams& request, ThreadPool* thread_pool); @@ -238,6 +222,7 @@ class PipelineFragmentContext : public TaskExecutionContext { int _num_instances = 1; int _timeout = -1; + bool _use_serial_source = false; OperatorPtr _root_op = nullptr; // this is a [n * m] matrix. n is parallelism of pipeline engine and m is the number of pipelines. 
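The `_pip_id_to_pipeline` map added just below exists for one purpose: `close_a_pipeline` (in the .cpp above) can walk `_dag` when the last task of a pipeline closes and wake every upstream pipeline, so producers blocked on a consumer that has already finished exit with eos instead of hanging. A condensed sketch of that fan-out, with stub types and without the `count_down_destination` gate the real `make_all_runnable` applies on the sink:

```cpp
#include <atomic>
#include <map>
#include <vector>

struct TaskStub {
    void clear_blocking_state(bool wake_up_by_downstream) { /* force all deps ready */ }
};

struct PipelineStub {
    std::atomic<int> num_tasks_running {0};
    std::vector<TaskStub*> tasks;
    bool close_task() { return num_tasks_running.fetch_sub(1) == 1; } // last task out?
    void make_all_runnable() {
        for (auto* task : tasks) {
            if (task) task->clear_blocking_state(/*wake_up_by_downstream=*/true);
        }
    }
};

// `dag` maps a pipeline id to the ids of the pipelines it consumes from (its upstreams).
void close_a_pipeline(std::map<int, PipelineStub*>& pipelines,
                      const std::map<int, std::vector<int>>& dag, int pipeline_id) {
    if (pipelines.at(pipeline_id)->close_task()) {
        if (auto it = dag.find(pipeline_id); it != dag.end()) {
            for (int dep : it->second) {
                pipelines.at(dep)->make_all_runnable();
            }
        }
    }
}
```

This is also why `_dag.clear()` moved from the end of `_build_pipeline_tasks` into the destructor: the DAG must now outlive planning and stay valid for the whole execution.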
@@ -291,6 +276,7 @@ class PipelineFragmentContext : public TaskExecutionContext { std::map, std::shared_ptr>> _op_id_to_le_state; + std::map _pip_id_to_pipeline; // UniqueId -> runtime mgr std::map> _runtime_filter_mgr_map; diff --git a/be/src/pipeline/pipeline_task.cpp b/be/src/pipeline/pipeline_task.cpp index 4f362ac5042e8f..6f9e59c8291966 100644 --- a/be/src/pipeline/pipeline_task.cpp +++ b/be/src/pipeline/pipeline_task.cpp @@ -71,7 +71,6 @@ PipelineTask::PipelineTask( if (shared_state) { _sink_shared_state = shared_state; } - pipeline->incr_created_tasks(); } Status PipelineTask::prepare(const TPipelineInstanceParams& local_params, const TDataSink& tsink, @@ -182,7 +181,7 @@ void PipelineTask::_init_profile() { _sink_timer = ADD_CHILD_TIMER(_task_profile, "SinkTime", exec_time); _close_timer = ADD_CHILD_TIMER(_task_profile, "CloseTime", exec_time); - _wait_worker_timer = ADD_TIMER(_task_profile, "WaitWorkerTime"); + _wait_worker_timer = ADD_TIMER_WITH_LEVEL(_task_profile, "WaitWorkerTime", 1); _schedule_counts = ADD_COUNTER(_task_profile, "NumScheduleTimes", TUnit::UNIT); _yield_counts = ADD_COUNTER(_task_profile, "NumYieldTimes", TUnit::UNIT); @@ -217,10 +216,6 @@ Status PipelineTask::_open() { return Status::OK(); } -void PipelineTask::set_task_queue(TaskQueue* task_queue) { - _task_queue = task_queue; -} - bool PipelineTask::_wait_to_start() { // Before task starting, we should make sure // 1. Execution dependency is ready (which is controlled by FE 2-phase commit) @@ -228,6 +223,9 @@ bool PipelineTask::_wait_to_start() { _blocked_dep = _execution_dep->is_blocked_by(this); if (_blocked_dep != nullptr) { static_cast(_blocked_dep)->start_watcher(); + if (_wake_up_by_downstream) { + _eos = true; + } return true; } @@ -235,6 +233,9 @@ bool PipelineTask::_wait_to_start() { _blocked_dep = op_dep->is_blocked_by(this); if (_blocked_dep != nullptr) { _blocked_dep->start_watcher(); + if (_wake_up_by_downstream) { + _eos = true; + } return true; } } @@ -242,6 +243,12 @@ bool PipelineTask::_wait_to_start() { } bool PipelineTask::_is_blocked() { + Defer defer([this] { + if (_blocked_dep != nullptr) { + _task_profile->add_info_string("TaskState", "Blocked"); + _task_profile->add_info_string("BlockedByDependency", _blocked_dep->name()); + } + }); // `_dry_run = true` means we do not need data from source operator. if (!_dry_run) { for (int i = _read_dependencies.size() - 1; i >= 0; i--) { @@ -250,6 +257,9 @@ bool PipelineTask::_is_blocked() { _blocked_dep = dep->is_blocked_by(this); if (_blocked_dep != nullptr) { _blocked_dep->start_watcher(); + if (_wake_up_by_downstream) { + _eos = true; + } return true; } } @@ -269,6 +279,9 @@ bool PipelineTask::_is_blocked() { _blocked_dep = op_dep->is_blocked_by(this); if (_blocked_dep != nullptr) { _blocked_dep->start_watcher(); + if (_wake_up_by_downstream) { + _eos = true; + } return true; } } @@ -279,7 +292,7 @@ Status PipelineTask::execute(bool* eos) { SCOPED_TIMER(_task_profile->total_time_counter()); SCOPED_TIMER(_exec_timer); SCOPED_ATTACH_TASK(_state); - _eos = _sink->is_finished(_state) || _eos; + _eos = _sink->is_finished(_state) || _eos || _wake_up_by_downstream; *eos = _eos; if (_eos) { // If task is waken up by finish dependency, `_eos` is set to true by last execution, and we should return here. 
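Every dependency check patched above (`_wait_to_start`, `_is_blocked`, and both early-outs in `execute`) repeats one idiom: if the task turns out to be blocked but `_wake_up_by_downstream` has been set in the meantime, it flags `_eos` so its next scheduling round finishes immediately. Reduced to a hypothetical helper:

```cpp
#include <atomic>

// Returns true when the caller should yield back to the scheduler. `eos` is set when the
// downstream pipeline finished first, so the task ends instead of resuming useless work.
bool yield_if_blocked(bool dep_blocked, const std::atomic<bool>& wake_up_by_downstream,
                      bool& eos) {
    if (dep_blocked) {
        if (wake_up_by_downstream.load()) {
            eos = true; // no consumer left; finish on the next execute() call
        }
        return true;
    }
    return false;
}
```

`execute()` additionally folds the flag into its very first `_eos` computation, so even a task that was never blocked stops producing once its consumer is gone.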
@@ -307,15 +320,27 @@ Status PipelineTask::execute(bool* eos) { if (_wait_to_start()) { return Status::OK(); } + if (_wake_up_by_downstream) { + _eos = true; + *eos = true; + return Status::OK(); + } // The status must be runnable if (!_opened && !_fragment_context->is_canceled()) { RETURN_IF_ERROR(_open()); } + _task_profile->add_info_string("TaskState", "Runnable"); + _task_profile->add_info_string("BlockedByDependency", ""); while (!_fragment_context->is_canceled()) { if (_is_blocked()) { return Status::OK(); } + if (_wake_up_by_downstream) { + _eos = true; + *eos = true; + return Status::OK(); + } /// When a task is cancelled, /// its blocking state will be cleared and it will transition to a ready state (though it is not truly ready). @@ -370,6 +395,7 @@ Status PipelineTask::execute(bool* eos) { *eos = status.is() ? true : *eos; if (*eos) { // just return, the scheduler will do finish work _eos = true; + _task_profile->add_info_string("TaskState", "Finished"); return Status::OK(); } } @@ -482,9 +508,10 @@ std::string PipelineTask::debug_string() { auto elapsed = _fragment_context->elapsed_time() / 1000000000.0; fmt::format_to(debug_string_buffer, "PipelineTask[this = {}, id = {}, open = {}, eos = {}, finish = {}, dry run = " - "{}, elapse time " - "= {}s], block dependency = {}, is running = {}\noperators: ", + "{}, elapse time = {}s, _wake_up_by_downstream = {}], block dependency = {}, is " + "running = {}\noperators: ", (void*)this, _index, _opened, _eos, _finalized, _dry_run, elapsed, + _wake_up_by_downstream.load(), cur_blocked_dep && !_finalized ? cur_blocked_dep->debug_string() : "NULL", is_running()); for (size_t i = 0; i < _operators.size(); i++) { diff --git a/be/src/pipeline/pipeline_task.h b/be/src/pipeline/pipeline_task.h index dd2ead4b5dcc91..3b4627f589dc54 100644 --- a/be/src/pipeline/pipeline_task.h +++ b/be/src/pipeline/pipeline_task.h @@ -41,7 +41,7 @@ class PipelineFragmentContext; namespace doris::pipeline { -class TaskQueue; +class MultiCoreTaskQueue; class PriorityTaskQueue; class Dependency; @@ -135,10 +135,11 @@ class PipelineTask { int task_id() const { return _index; }; bool is_finalized() const { return _finalized; } - void clear_blocking_state() { + void clear_blocking_state(bool wake_up_by_downstream = false) { _state->get_query_ctx()->get_execution_dependency()->set_always_ready(); // We use a lock to assure all dependencies are not deconstructed here. 
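// Note: the flag is OR-ed rather than assigned, so a task woken once by its downstream
// stays woken even if clear_blocking_state() runs again for an ordinary cancel; and the
// write happens under _dependency_lock, ordering it against the path that finalizes the
// task and releases these dependency objects.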
std::unique_lock lc(_dependency_lock); + _wake_up_by_downstream = _wake_up_by_downstream || wake_up_by_downstream; if (!_finalized) { _execution_dep->set_always_ready(); for (auto* dep : _filter_dependencies) { @@ -158,8 +159,8 @@ class PipelineTask { } } - void set_task_queue(TaskQueue* task_queue); - TaskQueue* get_task_queue() { return _task_queue; } + void set_task_queue(MultiCoreTaskQueue* task_queue) { _task_queue = task_queue; } + MultiCoreTaskQueue* get_task_queue() { return _task_queue; } static constexpr auto THREAD_TIME_SLICE = 100'000'000ULL; @@ -223,6 +224,8 @@ class PipelineTask { RuntimeState* runtime_state() const { return _state; } + RuntimeProfile* get_task_profile() const { return _task_profile.get(); } + std::string task_name() const { return fmt::format("task{}({})", _index, _pipeline->_name); } void stop_if_finished() { @@ -231,6 +234,10 @@ class PipelineTask { } } + PipelineId pipeline_id() const { return _pipeline->id(); } + + bool wake_up_by_downstream() const { return _wake_up_by_downstream; } + private: friend class RuntimeFilterDependency; bool _is_blocked(); @@ -250,7 +257,7 @@ class PipelineTask { uint32_t _schedule_time = 0; std::unique_ptr _block; PipelineFragmentContext* _fragment_context = nullptr; - TaskQueue* _task_queue = nullptr; + MultiCoreTaskQueue* _task_queue = nullptr; // used for priority queue // it may be visited by different thread but there is no race condition @@ -306,11 +313,12 @@ class PipelineTask { Dependency* _execution_dep = nullptr; - std::atomic _finalized {false}; + std::atomic _finalized = false; std::mutex _dependency_lock; - std::atomic _running {false}; - std::atomic _eos {false}; + std::atomic _running = false; + std::atomic _eos = false; + std::atomic _wake_up_by_downstream = false; }; } // namespace doris::pipeline diff --git a/be/src/pipeline/query_cache/query_cache.h b/be/src/pipeline/query_cache/query_cache.h index a905831b530578..827c516ad75f07 100644 --- a/be/src/pipeline/query_cache/query_cache.h +++ b/be/src/pipeline/query_cache/query_cache.h @@ -37,6 +37,7 @@ #include "runtime/memory/mem_tracker.h" #include "util/slice.h" #include "util/time.h" +#include "vec/core/block.h" namespace doris { diff --git a/be/src/pipeline/task_queue.cpp b/be/src/pipeline/task_queue.cpp index ea9fb09e260c0b..ea812ca9b12dd6 100644 --- a/be/src/pipeline/task_queue.cpp +++ b/be/src/pipeline/task_queue.cpp @@ -27,8 +27,7 @@ #include "runtime/workload_group/workload_group.h" namespace doris::pipeline { - -TaskQueue::~TaskQueue() = default; +#include "common/compile_check_begin.h" PipelineTask* SubTaskQueue::try_take(bool is_steal) { if (_queue.empty()) { @@ -121,7 +120,7 @@ Status PriorityTaskQueue::push(PipelineTask* task) { // update empty queue's runtime, to avoid too high priority if (_sub_queues[level].empty() && - _queue_level_min_vruntime > _sub_queues[level].get_vruntime()) { + double(_queue_level_min_vruntime) > _sub_queues[level].get_vruntime()) { _sub_queues[level].adjust_runtime(_queue_level_min_vruntime); } @@ -133,44 +132,35 @@ Status PriorityTaskQueue::push(PipelineTask* task) { MultiCoreTaskQueue::~MultiCoreTaskQueue() = default; -MultiCoreTaskQueue::MultiCoreTaskQueue(int core_size) : TaskQueue(core_size), _closed(false) { - _prio_task_queue_list = - std::make_shared>>(core_size); - for (int i = 0; i < core_size; i++) { - (*_prio_task_queue_list)[i] = std::make_unique(); - } -} +MultiCoreTaskQueue::MultiCoreTaskQueue(int core_size) + : _prio_task_queues(core_size), _closed(false), _core_size(core_size) {} void 
MultiCoreTaskQueue::close() { if (_closed) { return; } _closed = true; - for (int i = 0; i < _core_size; ++i) { - (*_prio_task_queue_list)[i]->close(); - } - std::atomic_store(&_prio_task_queue_list, - std::shared_ptr>>(nullptr)); + // close all priority task queue + std::ranges::for_each(_prio_task_queues, + [](auto& prio_task_queue) { prio_task_queue.close(); }); } PipelineTask* MultiCoreTaskQueue::take(int core_id) { PipelineTask* task = nullptr; - auto prio_task_queue_list = - std::atomic_load_explicit(&_prio_task_queue_list, std::memory_order_relaxed); while (!_closed) { - DCHECK(prio_task_queue_list->size() > core_id) - << " list size: " << prio_task_queue_list->size() << " core_id: " << core_id + DCHECK(_prio_task_queues.size() > core_id) + << " list size: " << _prio_task_queues.size() << " core_id: " << core_id << " _core_size: " << _core_size << " _next_core: " << _next_core.load(); - task = (*prio_task_queue_list)[core_id]->try_take(false); + task = _prio_task_queues[core_id].try_take(false); if (task) { task->set_core_id(core_id); break; } - task = _steal_take(core_id, *prio_task_queue_list); + task = _steal_take(core_id); if (task) { break; } - task = (*prio_task_queue_list)[core_id]->take(WAIT_CORE_TASK_TIMEOUT_MS /* timeout_ms */); + task = _prio_task_queues[core_id].take(WAIT_CORE_TASK_TIMEOUT_MS /* timeout_ms */); if (task) { task->set_core_id(core_id); break; @@ -182,8 +172,7 @@ PipelineTask* MultiCoreTaskQueue::take(int core_id) { return task; } -PipelineTask* MultiCoreTaskQueue::_steal_take( - int core_id, std::vector>& prio_task_queue_list) { +PipelineTask* MultiCoreTaskQueue::_steal_take(int core_id) { DCHECK(core_id < _core_size); int next_id = core_id; for (int i = 1; i < _core_size; ++i) { @@ -192,7 +181,7 @@ PipelineTask* MultiCoreTaskQueue::_steal_take( next_id = 0; } DCHECK(next_id < _core_size); - auto task = prio_task_queue_list[next_id]->try_take(true); + auto task = _prio_task_queues[next_id].try_take(true); if (task) { task->set_core_id(next_id); return task; @@ -212,17 +201,13 @@ Status MultiCoreTaskQueue::push_back(PipelineTask* task) { Status MultiCoreTaskQueue::push_back(PipelineTask* task, int core_id) { DCHECK(core_id < _core_size); task->put_in_runnable_queue(); - auto prio_task_queue_list = - std::atomic_load_explicit(&_prio_task_queue_list, std::memory_order_relaxed); - return (*prio_task_queue_list)[core_id]->push(task); + return _prio_task_queues[core_id].push(task); } void MultiCoreTaskQueue::update_statistics(PipelineTask* task, int64_t time_spent) { task->inc_runtime_ns(time_spent); - auto prio_task_queue_list = - std::atomic_load_explicit(&_prio_task_queue_list, std::memory_order_relaxed); - (*prio_task_queue_list)[task->get_core_id()]->inc_sub_queue_runtime(task->get_queue_level(), - time_spent); + _prio_task_queues[task->get_core_id()].inc_sub_queue_runtime(task->get_queue_level(), + time_spent); } } // namespace doris::pipeline \ No newline at end of file diff --git a/be/src/pipeline/task_queue.h b/be/src/pipeline/task_queue.h index e48deb517575db..1651eb50cac4ab 100644 --- a/be/src/pipeline/task_queue.h +++ b/be/src/pipeline/task_queue.h @@ -32,30 +32,7 @@ #include "pipeline_task.h" namespace doris::pipeline { - -class TaskQueue { -public: - TaskQueue(int core_size) : _core_size(core_size) {} - virtual ~TaskQueue(); - virtual void close() = 0; - // Get the task by core id. - // TODO: To think the logic is useful? 
- virtual PipelineTask* take(int core_id) = 0; - - // push from scheduler - virtual Status push_back(PipelineTask* task) = 0; - - // push from worker - virtual Status push_back(PipelineTask* task, int core_id) = 0; - - virtual void update_statistics(PipelineTask* task, int64_t time_spent) {} - - int cores() const { return _core_size; } - -protected: - int _core_size; - static constexpr auto WAIT_CORE_TASK_TIMEOUT_MS = 100; -}; +#include "common/compile_check_begin.h" class SubTaskQueue { friend class PriorityTaskQueue; @@ -70,11 +47,13 @@ class SubTaskQueue { // note: // runtime is the time consumed by the actual execution of the task // vruntime(means virtual runtime) = runtime / _level_factor - double get_vruntime() { return _runtime / _level_factor; } + double get_vruntime() { return double(_runtime) / _level_factor; } void inc_runtime(uint64_t delta_time) { _runtime += delta_time; } - void adjust_runtime(uint64_t vruntime) { this->_runtime = uint64_t(vruntime * _level_factor); } + void adjust_runtime(uint64_t vruntime) { + this->_runtime = uint64_t(double(vruntime) * _level_factor); + } bool empty() { return _queue.empty(); } @@ -124,31 +103,35 @@ class PriorityTaskQueue { }; // Need consider NUMA architecture -class MultiCoreTaskQueue : public TaskQueue { +class MultiCoreTaskQueue { public: explicit MultiCoreTaskQueue(int core_size); - ~MultiCoreTaskQueue() override; + ~MultiCoreTaskQueue(); - void close() override; + void close(); // Get the task by core id. - PipelineTask* take(int core_id) override; + PipelineTask* take(int core_id); // TODO combine these methods to `push_back(task, core_id = -1)` - Status push_back(PipelineTask* task) override; + Status push_back(PipelineTask* task); + + Status push_back(PipelineTask* task, int core_id); - Status push_back(PipelineTask* task, int core_id) override; + void update_statistics(PipelineTask* task, int64_t time_spent); - void update_statistics(PipelineTask* task, int64_t time_spent) override; + int cores() const { return _core_size; } private: - PipelineTask* _steal_take( - int core_id, std::vector>& prio_task_queue_list); + PipelineTask* _steal_take(int core_id); - std::shared_ptr>> _prio_task_queue_list; - std::atomic _next_core = 0; + std::vector _prio_task_queues; + std::atomic _next_core = 0; std::atomic _closed; -}; + int _core_size; + static constexpr auto WAIT_CORE_TASK_TIMEOUT_MS = 100; +}; +#include "common/compile_check_end.h" } // namespace doris::pipeline diff --git a/be/src/pipeline/task_scheduler.cpp b/be/src/pipeline/task_scheduler.cpp index 8be30773ee11f1..45898e764175b2 100644 --- a/be/src/pipeline/task_scheduler.cpp +++ b/be/src/pipeline/task_scheduler.cpp @@ -44,14 +44,14 @@ #include "vec/runtime/vdatetime_value.h" namespace doris::pipeline { - +#include "common/compile_check_begin.h" TaskScheduler::~TaskScheduler() { stop(); LOG(INFO) << "Task scheduler " << _name << " shutdown"; } Status TaskScheduler::start() { - int cores = _task_queue->cores(); + int cores = _task_queue.cores(); RETURN_IF_ERROR(ThreadPoolBuilder(_name) .set_min_threads(cores) .set_max_threads(cores) @@ -60,14 +60,14 @@ Status TaskScheduler::start() { .build(&_fix_thread_pool)); LOG_INFO("TaskScheduler set cores").tag("size", cores); _markers.resize(cores, true); - for (size_t i = 0; i < cores; ++i) { + for (int i = 0; i < cores; ++i) { RETURN_IF_ERROR(_fix_thread_pool->submit_func([this, i] { _do_work(i); })); } return Status::OK(); } Status TaskScheduler::schedule_task(PipelineTask* task) { - return _task_queue->push_back(task); + return 
_task_queue.push_back(task); } // after _close_task, task maybe destructed. @@ -94,22 +94,22 @@ void _close_task(PipelineTask* task, Status exec_status) { } task->finalize(); task->set_running(false); - task->fragment_context()->close_a_pipeline(); + task->fragment_context()->close_a_pipeline(task->pipeline_id()); } -void TaskScheduler::_do_work(size_t index) { +void TaskScheduler::_do_work(int index) { while (_markers[index]) { - auto* task = _task_queue->take(index); + auto* task = _task_queue.take(index); if (!task) { continue; } if (task->is_running()) { - static_cast(_task_queue->push_back(task, index)); + static_cast(_task_queue.push_back(task, index)); continue; } task->log_detail_if_need(); task->set_running(true); - task->set_task_queue(_task_queue.get()); + task->set_task_queue(&_task_queue); auto* fragment_ctx = task->fragment_context(); bool canceled = fragment_ctx->is_canceled(); @@ -189,9 +189,7 @@ void TaskScheduler::_do_work(size_t index) { void TaskScheduler::stop() { if (!_shutdown) { - if (_task_queue) { - _task_queue->close(); - } + _task_queue.close(); if (_fix_thread_pool) { for (size_t i = 0; i < _markers.size(); ++i) { _markers[i] = false; diff --git a/be/src/pipeline/task_scheduler.h b/be/src/pipeline/task_scheduler.h index 9a20807ea268e8..bdb5bec1776f58 100644 --- a/be/src/pipeline/task_scheduler.h +++ b/be/src/pipeline/task_scheduler.h @@ -31,24 +31,20 @@ #include "gutil/ref_counted.h" #include "pipeline_task.h" #include "runtime/workload_group/workload_group.h" +#include "task_queue.h" #include "util/thread.h" namespace doris { class ExecEnv; class ThreadPool; - -namespace pipeline { -class TaskQueue; -} // namespace pipeline } // namespace doris namespace doris::pipeline { class TaskScheduler { public: - TaskScheduler(ExecEnv* exec_env, std::shared_ptr task_queue, std::string name, - CgroupCpuCtl* cgroup_cpu_ctl) - : _task_queue(std::move(task_queue)), + TaskScheduler(int core_num, std::string name, CgroupCpuCtl* cgroup_cpu_ctl) + : _task_queue(core_num), _shutdown(false), _name(std::move(name)), _cgroup_cpu_ctl(cgroup_cpu_ctl) {} @@ -65,12 +61,12 @@ class TaskScheduler { private: std::unique_ptr _fix_thread_pool; - std::shared_ptr _task_queue; + MultiCoreTaskQueue _task_queue; std::vector _markers; bool _shutdown; std::string _name; CgroupCpuCtl* _cgroup_cpu_ctl = nullptr; - void _do_work(size_t index); + void _do_work(int index); }; } // namespace doris::pipeline \ No newline at end of file diff --git a/be/src/runtime/buffer_control_block.cpp b/be/src/runtime/buffer_control_block.cpp index 61ea5ef080de5f..6420f533e42fcd 100644 --- a/be/src/runtime/buffer_control_block.cpp +++ b/be/src/runtime/buffer_control_block.cpp @@ -104,7 +104,7 @@ BufferControlBlock::BufferControlBlock(const TUniqueId& id, int buffer_size, int } BufferControlBlock::~BufferControlBlock() { - cancel(); + cancel(Status::Cancelled("Cancelled")); } Status BufferControlBlock::init() { @@ -275,12 +275,12 @@ Status BufferControlBlock::close(const TUniqueId& id, Status exec_status) { return Status::OK(); } -void BufferControlBlock::cancel() { +void BufferControlBlock::cancel(const Status& reason) { std::unique_lock l(_lock); _is_cancelled = true; _arrow_data_arrival.notify_all(); for (auto& ctx : _waiting_rpc) { - ctx->on_failure(Status::Cancelled("Cancelled")); + ctx->on_failure(reason); } _waiting_rpc.clear(); _update_dependency(); diff --git a/be/src/runtime/buffer_control_block.h b/be/src/runtime/buffer_control_block.h index 8b45552b2fadb1..4aff1accbd5476 100644 --- 
a/be/src/runtime/buffer_control_block.h +++ b/be/src/runtime/buffer_control_block.h @@ -85,8 +85,8 @@ class BufferControlBlock { // close buffer block, set _status to exec_status and set _is_close to true; // called because data has been read or error happened. Status close(const TUniqueId& id, Status exec_status); - // this is called by RPC, called from coordinator - void cancel(); + + void cancel(const Status& reason); [[nodiscard]] const TUniqueId& fragment_id() const { return _fragment_id; } diff --git a/be/src/runtime/descriptors.h b/be/src/runtime/descriptors.h index 66d571d3b95541..b807c567543038 100644 --- a/be/src/runtime/descriptors.h +++ b/be/src/runtime/descriptors.h @@ -35,6 +35,7 @@ #include "common/compiler_util.h" // IWYU pragma: keep #include "common/global_types.h" #include "common/status.h" +#include "olap/utils.h" #include "runtime/define_primitive_type.h" #include "runtime/types.h" #include "vec/data_types/data_type.h" @@ -83,6 +84,9 @@ class SlotDescriptor { bool is_auto_increment() const { return _is_auto_increment; } + bool is_skip_bitmap_col() const { return _col_name == SKIP_BITMAP_COL; } + bool is_sequence_col() const { return _col_name == SEQUENCE_COL; } + const std::string& col_default_value() const { return _col_default_value; } PrimitiveType col_type() const { return _col_type; } diff --git a/be/src/runtime/exec_env.cpp b/be/src/runtime/exec_env.cpp index c714db2d5e40fa..872069ee70a1a4 100644 --- a/be/src/runtime/exec_env.cpp +++ b/be/src/runtime/exec_env.cpp @@ -45,6 +45,10 @@ ExecEnv::~ExecEnv() { } #ifdef BE_TEST +void ExecEnv::set_inverted_index_searcher_cache( + segment_v2::InvertedIndexSearcherCache* inverted_index_searcher_cache) { + _inverted_index_searcher_cache = inverted_index_searcher_cache; +} void ExecEnv::set_storage_engine(std::unique_ptr&& engine) { _storage_engine = std::move(engine); } @@ -54,7 +58,10 @@ void ExecEnv::set_write_cooldown_meta_executors() { #endif // BE_TEST Result ExecEnv::get_tablet(int64_t tablet_id) { - return GetInstance()->storage_engine().get_tablet(tablet_id); + auto storage_engine = GetInstance()->_storage_engine.get(); + return storage_engine != nullptr + ? 
storage_engine->get_tablet(tablet_id) + : ResultError(Status::InternalError("failed to get tablet {}", tablet_id)); } const std::string& ExecEnv::token() const { diff --git a/be/src/runtime/exec_env.h b/be/src/runtime/exec_env.h index 61cebad10b9e78..9321b17070edb2 100644 --- a/be/src/runtime/exec_env.h +++ b/be/src/runtime/exec_env.h @@ -101,6 +101,7 @@ class FrontendServiceClient; class FileMetaCache; class GroupCommitMgr; class TabletSchemaCache; +class TabletColumnObjectPool; class UserFunctionCache; class SchemaCache; class StoragePageCache; @@ -109,6 +110,8 @@ class LookupConnectionCache; class RowCache; class DummyLRUCache; class CacheManager; +class ProcessProfile; +class HeapProfiler; class WalManager; class DNSCache; @@ -270,8 +273,14 @@ class ExecEnv { } void set_storage_engine(std::unique_ptr&& engine); + void set_inverted_index_searcher_cache( + segment_v2::InvertedIndexSearcherCache* inverted_index_searcher_cache); void set_cache_manager(CacheManager* cm) { this->_cache_manager = cm; } + void set_process_profile(ProcessProfile* pp) { this->_process_profile = pp; } void set_tablet_schema_cache(TabletSchemaCache* c) { this->_tablet_schema_cache = c; } + void set_tablet_column_object_pool(TabletColumnObjectPool* c) { + this->_tablet_column_object_pool = c; + } void set_storage_page_cache(StoragePageCache* c) { this->_storage_page_cache = c; } void set_segment_loader(SegmentLoader* sl) { this->_segment_loader = sl; } void set_routine_load_task_executor(RoutineLoadTaskExecutor* r) { @@ -297,12 +306,15 @@ class ExecEnv { std::map get_running_frontends(); TabletSchemaCache* get_tablet_schema_cache() { return _tablet_schema_cache; } + TabletColumnObjectPool* get_tablet_column_object_pool() { return _tablet_column_object_pool; } SchemaCache* schema_cache() { return _schema_cache; } StoragePageCache* get_storage_page_cache() { return _storage_page_cache; } SegmentLoader* segment_loader() { return _segment_loader; } LookupConnectionCache* get_lookup_connection_cache() { return _lookup_connection_cache; } RowCache* get_row_cache() { return _row_cache; } CacheManager* get_cache_manager() { return _cache_manager; } + ProcessProfile* get_process_profile() { return _process_profile; } + HeapProfiler* get_heap_profiler() { return _heap_profiler; } segment_v2::InvertedIndexSearcherCache* get_inverted_index_searcher_cache() { return _inverted_index_searcher_cache; } @@ -434,6 +446,7 @@ class ExecEnv { // these redundancy header could introduce potential bug, at least, more header means slow compile. // So we choose to use raw pointer, please remember to delete these pointer in deconstructor. 
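// The raw-pointer members below are reclaimed one by one in ExecEnv::destroy()
// via SAFE_STOP/SAFE_DELETE. A minimal sketch of the delete-and-null helper this
// pattern relies on (illustrative only; the actual SAFE_DELETE macro in the
// codebase may differ in detail):
#define SAFE_DELETE_SKETCH(ptr) \
    do {                        \
        delete (ptr);           \
        (ptr) = nullptr;        \
    } while (0)
// Nulling the pointer after deletion makes a repeated destroy() pass or a late
// accessor hit a plain nullptr instead of freed memory.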
TabletSchemaCache* _tablet_schema_cache = nullptr; + TabletColumnObjectPool* _tablet_column_object_pool = nullptr; std::unique_ptr _storage_engine; SchemaCache* _schema_cache = nullptr; StoragePageCache* _storage_page_cache = nullptr; @@ -441,6 +454,8 @@ class ExecEnv { LookupConnectionCache* _lookup_connection_cache = nullptr; RowCache* _row_cache = nullptr; CacheManager* _cache_manager = nullptr; + ProcessProfile* _process_profile = nullptr; + HeapProfiler* _heap_profiler = nullptr; segment_v2::InvertedIndexSearcherCache* _inverted_index_searcher_cache = nullptr; segment_v2::InvertedIndexQueryCache* _inverted_index_query_cache = nullptr; QueryCache* _query_cache = nullptr; diff --git a/be/src/runtime/exec_env_init.cpp b/be/src/runtime/exec_env_init.cpp index adb6b7fd101f27..e43524b2d2a00b 100644 --- a/be/src/runtime/exec_env_init.cpp +++ b/be/src/runtime/exec_env_init.cpp @@ -53,6 +53,7 @@ #include "olap/schema_cache.h" #include "olap/segment_loader.h" #include "olap/storage_engine.h" +#include "olap/tablet_column_object_pool.h" #include "olap/tablet_schema_cache.h" #include "olap/wal/wal_manager.h" #include "pipeline/pipeline_tracing.h" @@ -71,9 +72,11 @@ #include "runtime/load_path_mgr.h" #include "runtime/load_stream_mgr.h" #include "runtime/memory/cache_manager.h" +#include "runtime/memory/heap_profiler.h" #include "runtime/memory/mem_tracker.h" #include "runtime/memory/mem_tracker_limiter.h" #include "runtime/memory/thread_mem_tracker_mgr.h" +#include "runtime/process_profile.h" #include "runtime/result_buffer_mgr.h" #include "runtime/result_queue_mgr.h" #include "runtime/routine_load/routine_load_task_executor.h" @@ -337,6 +340,9 @@ Status ExecEnv::_init(const std::vector& store_paths, _tablet_schema_cache = TabletSchemaCache::create_global_schema_cache(config::tablet_schema_cache_capacity); + _tablet_column_object_pool = TabletColumnObjectPool::create_global_column_cache( + config::tablet_schema_cache_capacity); + // Storage engine doris::EngineOptions options; options.store_paths = store_paths; @@ -379,9 +385,8 @@ Status ExecEnv::init_pipeline_task_scheduler() { LOG_INFO("pipeline executors_size set ").tag("size", executors_size); // TODO pipeline workload group combie two blocked schedulers. - auto t_queue = std::make_shared(executors_size); _without_group_task_scheduler = - new pipeline::TaskScheduler(this, t_queue, "PipeNoGSchePool", nullptr); + new pipeline::TaskScheduler(executors_size, "PipeNoGSchePool", nullptr); RETURN_IF_ERROR(_without_group_task_scheduler->start()); _runtime_filter_timer_queue = new doris::pipeline::RuntimeFilterTimerQueue(); @@ -440,8 +445,11 @@ void ExecEnv::init_file_cache_factory(std::vector& cache_paths } for (const auto& status : cache_status) { if (!status.ok()) { - LOG(FATAL) << "failed to init file cache, err: " << status; - exit(-1); + if (!doris::config::ignore_broken_disk) { + LOG(FATAL) << "failed to init file cache, err: " << status; + exit(-1); + } + LOG(WARNING) << "failed to init file cache, err: " << status; } } } @@ -450,6 +458,8 @@ Status ExecEnv::_init_mem_env() { bool is_percent = false; std::stringstream ss; // 1. 
init mem tracker + _process_profile = ProcessProfile::create_global_instance(); + _heap_profiler = HeapProfiler::create_global_instance(); init_mem_tracker(); thread_context()->thread_mem_tracker_mgr->init(); #if defined(USE_MEM_TRACKER) && !defined(__SANITIZE_ADDRESS__) && !defined(ADDRESS_SANITIZER) && \ @@ -672,7 +682,7 @@ void ExecEnv::destroy() { SAFE_STOP(_write_cooldown_meta_executors); // StorageEngine must be destroyed before _page_no_cache_mem_tracker.reset and _cache_manager destroy - // shouldn't use SAFE_STOP. otherwise will lead to twice stop. + SAFE_STOP(_storage_engine); _storage_engine.reset(); SAFE_STOP(_spill_stream_mgr); @@ -772,6 +782,9 @@ void ExecEnv::destroy() { // dns cache is a global instance and needs to be released last SAFE_DELETE(_dns_cache); + SAFE_DELETE(_process_profile); + SAFE_DELETE(_heap_profiler); + _s_tracking_memory = false; LOG(INFO) << "Doris exec environment is destroyed."; diff --git a/be/src/runtime/fragment_mgr.cpp b/be/src/runtime/fragment_mgr.cpp index 3f8e762408cc71..18aacb452a6477 100644 --- a/be/src/runtime/fragment_mgr.cpp +++ b/be/src/runtime/fragment_mgr.cpp @@ -106,7 +106,6 @@ DEFINE_GAUGE_METRIC_PROTOTYPE_2ARG(fragment_instance_count, MetricUnit::NOUNIT); DEFINE_GAUGE_METRIC_PROTOTYPE_2ARG(timeout_canceled_fragment_count, MetricUnit::NOUNIT); DEFINE_GAUGE_METRIC_PROTOTYPE_2ARG(fragment_thread_pool_queue_size, MetricUnit::NOUNIT); bvar::LatencyRecorder g_fragmentmgr_prepare_latency("doris_FragmentMgr", "prepare"); -bvar::Adder g_pipeline_fragment_instances_count("doris_pipeline_fragment_instances_count"); bvar::Adder g_fragment_executing_count("fragment_executing_count"); bvar::Status g_fragment_last_active_time( @@ -300,6 +299,10 @@ Status FragmentMgr::trigger_pipeline_context_report( // including the final status when execution finishes. void FragmentMgr::coordinator_callback(const ReportStatusRequest& req) { DCHECK(req.status.ok() || req.done); // if !status.ok() => done + if (req.coord_addr.hostname == "external") { + // External queries (flink/spark reading tablets) do not need to report to FE. + return; + } Status exec_status = req.status; Status coord_status; FrontendServiceConnection coord(_exec_env->frontend_client_cache(), req.coord_addr, @@ -573,13 +576,19 @@ Status FragmentMgr::exec_plan_fragment(const TPipelineFragmentParams& params, } } +// Stage 2. Prepare finished, then wait for the FE instruction to execute. Status FragmentMgr::start_query_execution(const PExecPlanFragmentStartRequest* request) { - std::lock_guard lock(_lock); TUniqueId query_id; query_id.__set_hi(request->query_id().hi()); query_id.__set_lo(request->query_id().lo()); - if (auto q_ctx = _get_or_erase_query_ctx(query_id)) { + std::shared_ptr q_ctx = nullptr; + { + std::lock_guard lock(_lock); + q_ctx = _get_or_erase_query_ctx(query_id); + } + if (q_ctx) { q_ctx->set_ready_to_execute(Status::OK()); + LOG_INFO("Query {} starts execution", print_id(query_id)); } else { return Status::InternalError( "Failed to get query fragments context. 
Query may be " @@ -594,18 +603,12 @@ void FragmentMgr::remove_pipeline_context( { std::lock_guard lock(_lock); auto query_id = f_context->get_query_id(); - std::vector ins_ids; - f_context->instance_ids(ins_ids); int64 now = duration_cast( std::chrono::system_clock::now().time_since_epoch()) .count(); g_fragment_executing_count << -1; g_fragment_last_active_time.set_value(now); - for (const auto& ins_id : ins_ids) { - LOG_INFO("Removing query {} instance {}", print_id(query_id), print_id(ins_id)); - _pipeline_map.erase(ins_id); - g_pipeline_fragment_instances_count << -1; - } + _pipeline_map.erase({query_id, f_context->get_fragment_id()}); } } @@ -658,6 +661,7 @@ Status FragmentMgr::_get_query_ctx(const Params& params, TUniqueId query_id, boo return Status::OK(); } + // The first fragment of this query has arrived; print logs. LOG(INFO) << "query_id: " << print_id(query_id) << ", coord_addr: " << params.coord << ", total fragment num on current host: " << params.fragment_num_on_host << ", fe process uuid: " << params.query_options.fe_process_uuid @@ -667,7 +671,7 @@ Status FragmentMgr::_get_query_ctx(const Params& params, TUniqueId query_id, boo // This may be the first fragment request of the query. // Create the query fragments context. query_ctx = QueryContext::create_shared(query_id, _exec_env, params.query_options, - params.coord, pipeline, params.is_nereids, + params.coord, params.is_nereids, params.current_connect_fe, query_source); SCOPED_SWITCH_THREAD_MEM_TRACKER_LIMITER(query_ctx->query_mem_tracker); RETURN_IF_ERROR(DescriptorTbl::create(&(query_ctx->obj_pool), params.desc_tbl, @@ -686,7 +690,6 @@ Status FragmentMgr::_get_query_ctx(const Params& params, TUniqueId query_id, boo } _set_scan_concurrency(params, query_ctx.get()); - const bool is_pipeline = std::is_same_v; if (params.__isset.workload_groups && !params.workload_groups.empty()) { uint64_t tg_id = params.workload_groups[0].id; @@ -697,21 +700,14 @@ Status FragmentMgr::_get_query_ctx(const Params& params, TUniqueId query_id, boo RETURN_IF_ERROR(query_ctx->set_workload_group(workload_group_ptr)); _exec_env->runtime_query_statistics_mgr()->set_workload_group_id(print_id(query_id), tg_id); - - LOG(INFO) << "Query/load id: " << print_id(query_ctx->query_id()) - << ", use workload group: " << workload_group_ptr->debug_string() - << ", is pipeline: " << ((int)is_pipeline); } else { - LOG(INFO) << "Query/load id: " << print_id(query_ctx->query_id()) - << " carried group info but can not find group in be"; + LOG(WARNING) << "Query/load id: " << print_id(query_ctx->query_id()) + << " can't find its workload group " << tg_id; } } // There is some logic in query ctx's dtor, we could not check if exists and delete the // temp query ctx now. For example, the query id may be removed from workload group's queryset. 
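// _pipeline_map is now keyed by (query_id, fragment_id) instead of per-instance
// ids, as in the erase above. A self-contained sketch of such a pair-keyed map;
// QueryId and the hash below are illustrative stand-ins, not the real
// TUniqueId/PipelineFragmentContext types:
#include <cstddef>
#include <cstdint>
#include <functional>
#include <memory>
#include <unordered_map>
#include <utility>

struct QueryId {
    int64_t hi;
    int64_t lo;
    bool operator==(const QueryId&) const = default;
};

struct QueryFragmentHash {
    std::size_t operator()(const std::pair<QueryId, int>& key) const {
        std::size_t h = std::hash<int64_t>()(key.first.hi);
        h = h * 31 + std::hash<int64_t>()(key.first.lo);
        return h * 31 + std::hash<int>()(key.second);
    }
};

// One (query_id, fragment_id) pair maps to exactly one context on a given BE.
template <typename Context>
using PipelineMap =
        std::unordered_map<std::pair<QueryId, int>, std::shared_ptr<Context>, QueryFragmentHash>;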
_query_ctx_map.insert(std::make_pair(query_ctx->query_id(), query_ctx)); - LOG(INFO) << "Register query/load memory tracker, query/load id: " - << print_id(query_ctx->query_id()) - << " limit: " << PrettyPrinter::print(query_ctx->mem_limit(), TUnit::BYTES); } return Status::OK(); } @@ -739,11 +735,10 @@ std::string FragmentMgr::dump_pipeline_tasks(int64_t duration) { continue; } auto timeout_second = it.second->timeout_second(); - fmt::format_to(debug_string_buffer, - "No.{} (elapse_second={}s, query_timeout_second={}s, instance_id=" - "{}, is_timeout={}) : {}\n", - i, elapsed, timeout_second, print_id(it.first), - it.second->is_timeout(now), it.second->debug_string()); + fmt::format_to( + debug_string_buffer, + "No.{} (elapse_second={}s, query_timeout_second={}s, is_timeout={}) : {}\n", i, + elapsed, timeout_second, it.second->is_timeout(now), it.second->debug_string()); i++; } } @@ -800,36 +795,33 @@ Status FragmentMgr::exec_plan_fragment(const TPipelineFragmentParams& params, query_ctx->set_merge_controller_handler(handler); } - for (const auto& local_param : params.local_params) { - const TUniqueId& fragment_instance_id = local_param.fragment_instance_id; + { + // (query_id, fragment_id) is executed only on one BE, locks _pipeline_map. std::lock_guard lock(_lock); - auto iter = _pipeline_map.find(fragment_instance_id); - if (iter != _pipeline_map.end()) { - return Status::InternalError( - "exec_plan_fragment input duplicated fragment_instance_id({})", - UniqueId(fragment_instance_id).to_string()); + for (const auto& local_param : params.local_params) { + const TUniqueId& fragment_instance_id = local_param.fragment_instance_id; + auto iter = _pipeline_map.find({params.query_id, params.fragment_id}); + if (iter != _pipeline_map.end()) { + return Status::InternalError( + "exec_plan_fragment query_id({}) input duplicated fragment_id({})", + print_id(params.query_id), params.fragment_id); + } + query_ctx->fragment_instance_ids.push_back(fragment_instance_id); } - query_ctx->fragment_instance_ids.push_back(fragment_instance_id); + + int64 now = duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); + g_fragment_executing_count << 1; + g_fragment_last_active_time.set_value(now); + // TODO: simplify this mapping + _pipeline_map.insert({{params.query_id, params.fragment_id}, context}); } if (!params.__isset.need_wait_execution_trigger || !params.need_wait_execution_trigger) { query_ctx->set_ready_to_execute_only(); } - int64 now = duration_cast( - std::chrono::system_clock::now().time_since_epoch()) - .count(); - { - g_fragment_executing_count << 1; - g_fragment_last_active_time.set_value(now); - std::lock_guard lock(_lock); - std::vector ins_ids; - context->instance_ids(ins_ids); - // TODO: simplify this mapping - for (const auto& ins_id : ins_ids) { - _pipeline_map.insert({ins_id, context}); - } - } query_ctx->set_pipeline_context(params.fragment_id, context); RETURN_IF_ERROR(context->submit()); @@ -873,31 +865,6 @@ void FragmentMgr::cancel_query(const TUniqueId query_id, const Status reason) { << " is cancelled and removed. Reason: " << reason.to_string(); } -void FragmentMgr::cancel_instance(const TUniqueId instance_id, const Status reason) { - std::shared_ptr pipeline_ctx; - { - std::lock_guard state_lock(_lock); - DCHECK(!_pipeline_map.contains(instance_id)) - << " Pipeline tasks should be canceled by query instead of instance! 
Query ID: " - << print_id(_pipeline_map[instance_id]->get_query_id()); - const bool is_pipeline_instance = _pipeline_map.contains(instance_id); - if (is_pipeline_instance) { - auto itr = _pipeline_map.find(instance_id); - if (itr != _pipeline_map.end()) { - pipeline_ctx = itr->second; - } else { - LOG(WARNING) << "Could not find the pipeline instance id:" << print_id(instance_id) - << " to cancel"; - return; - } - } - } - - if (pipeline_ctx != nullptr) { - pipeline_ctx->cancel(reason); - } -} - void FragmentMgr::cancel_worker() { LOG(INFO) << "FragmentMgr cancel worker start working."; @@ -925,11 +892,20 @@ void FragmentMgr::cancel_worker() { running_queries_on_all_fes.clear(); } + std::vector> ctx; { std::lock_guard lock(_lock); + ctx.reserve(_pipeline_map.size()); for (auto& pipeline_itr : _pipeline_map) { - pipeline_itr.second->clear_finished_tasks(); + ctx.push_back(pipeline_itr.second); } + } + for (auto& c : ctx) { + c->clear_finished_tasks(); + } + + { + std::lock_guard lock(_lock); for (auto it = _query_ctx_map.begin(); it != _query_ctx_map.end();) { if (auto q_ctx = it->second.lock()) { if (q_ctx->is_timeout(now)) { @@ -1064,6 +1040,7 @@ void FragmentMgr::debug(std::stringstream& ss) {} */ Status FragmentMgr::exec_external_plan_fragment(const TScanOpenParams& params, const TQueryPlanInfo& t_query_plan_info, + const TUniqueId& query_id, const TUniqueId& fragment_instance_id, std::vector* selected_columns) { // set up desc tbl @@ -1104,8 +1081,9 @@ Status FragmentMgr::exec_external_plan_fragment(const TScanOpenParams& params, // assign the param used for executing of PlanFragment-self TPipelineInstanceParams fragment_exec_params; - exec_fragment_params.query_id = t_query_plan_info.query_id; + exec_fragment_params.query_id = query_id; fragment_exec_params.fragment_instance_id = fragment_instance_id; + exec_fragment_params.coord.hostname = "external"; std::map<::doris::TPlanNodeId, std::vector> per_node_scan_ranges; std::vector scan_ranges; std::vector tablet_ids = params.tablet_ids; @@ -1155,7 +1133,6 @@ Status FragmentMgr::exec_external_plan_fragment(const TScanOpenParams& params, Status FragmentMgr::apply_filterv2(const PPublishFilterRequestV2* request, butil::IOBufAsZeroCopyInputStream* attach_data) { - bool is_pipeline = request->has_is_pipeline() && request->is_pipeline(); int64_t start_apply = MonotonicMillis(); std::shared_ptr pip_context; @@ -1163,27 +1140,22 @@ Status FragmentMgr::apply_filterv2(const PPublishFilterRequestV2* request, RuntimeFilterMgr* runtime_filter_mgr = nullptr; - const auto& fragment_instance_ids = request->fragment_instance_ids(); + const auto& fragment_ids = request->fragment_ids(); { std::unique_lock lock(_lock); - for (UniqueId fragment_instance_id : fragment_instance_ids) { - TUniqueId tfragment_instance_id = fragment_instance_id.to_thrift(); - - if (is_pipeline) { - auto iter = _pipeline_map.find(tfragment_instance_id); - if (iter == _pipeline_map.end()) { - continue; - } - pip_context = iter->second; - - DCHECK(pip_context != nullptr); - runtime_filter_mgr = pip_context->get_query_ctx()->runtime_filter_mgr(); - query_thread_context = {pip_context->get_query_ctx()->query_id(), - pip_context->get_query_ctx()->query_mem_tracker, - pip_context->get_query_ctx()->workload_group()}; - } else { - return Status::InternalError("Non-pipeline is disabled!"); + for (auto fragment_id : fragment_ids) { + auto iter = + _pipeline_map.find({UniqueId(request->query_id()).to_thrift(), fragment_id}); + if (iter == _pipeline_map.end()) { + continue; } + pip_context = 
iter->second; + + DCHECK(pip_context != nullptr); + runtime_filter_mgr = pip_context->get_query_ctx()->runtime_filter_mgr(); + query_thread_context = {pip_context->get_query_ctx()->query_id(), + pip_context->get_query_ctx()->query_mem_tracker, + pip_context->get_query_ctx()->workload_group()}; break; } } diff --git a/be/src/runtime/fragment_mgr.h b/be/src/runtime/fragment_mgr.h index bc066066f7b6a6..20b2fd8cdc2063 100644 --- a/be/src/runtime/fragment_mgr.h +++ b/be/src/runtime/fragment_mgr.h @@ -100,9 +100,6 @@ class FragmentMgr : public RestMonitorIface { Status trigger_pipeline_context_report(const ReportStatusRequest, std::shared_ptr&&); - // Cancel instance (pipeline or nonpipeline). - void cancel_instance(const TUniqueId instance_id, const Status reason); - // Can be used in both version. void cancel_query(const TUniqueId query_id, const Status reason); @@ -115,6 +112,7 @@ class FragmentMgr : public RestMonitorIface { // execute external query, all query info are packed in TScanOpenParams Status exec_external_plan_fragment(const TScanOpenParams& params, const TQueryPlanInfo& t_query_plan_info, + const TUniqueId& query_id, const TUniqueId& fragment_instance_id, std::vector* selected_columns); @@ -169,7 +167,10 @@ class FragmentMgr : public RestMonitorIface { // call _lock, so that there is dead lock. std::mutex _lock; - std::unordered_map> _pipeline_map; + // (QueryID, FragmentID) -> PipelineFragmentContext + std::unordered_map, + std::shared_ptr> + _pipeline_map; // query id -> QueryContext std::unordered_map> _query_ctx_map; diff --git a/be/src/runtime/group_commit_mgr.cpp b/be/src/runtime/group_commit_mgr.cpp index 3250379cf85924..cd54718bc5fb0a 100644 --- a/be/src/runtime/group_commit_mgr.cpp +++ b/be/src/runtime/group_commit_mgr.cpp @@ -499,7 +499,6 @@ Status GroupCommitTable::_finish_group_commit_load(int64_t db_id, int64_t table_ LOG(INFO) << "debug promise set: " << msg; ExecEnv::GetInstance()->group_commit_mgr()->debug_promise.set_value( Status ::InternalError(msg)); - return status; }); } std::shared_ptr load_block_queue; diff --git a/be/src/runtime/load_channel.cpp b/be/src/runtime/load_channel.cpp index f8c11639719303..9369c0c833c53c 100644 --- a/be/src/runtime/load_channel.cpp +++ b/be/src/runtime/load_channel.cpp @@ -64,7 +64,6 @@ LoadChannel::LoadChannel(const UniqueId& load_id, int64_t timeout_s, bool is_hig if (workload_group_ptr) { wg_ptr = workload_group_ptr; wg_ptr->add_mem_tracker_limiter(mem_tracker); - _need_release_memtracker = true; } } } @@ -85,12 +84,6 @@ LoadChannel::~LoadChannel() { rows_str << ", index id: " << entry.first << ", total_received_rows: " << entry.second.first << ", num_rows_filtered: " << entry.second.second; } - if (_need_release_memtracker) { - WorkloadGroupPtr wg_ptr = _query_thread_context.get_workload_group_ptr(); - if (wg_ptr) { - wg_ptr->remove_mem_tracker_limiter(_query_thread_context.get_memory_tracker()); - } - } LOG(INFO) << "load channel removed" << " load_id=" << _load_id << ", is high priority=" << _is_high_priority << ", sender_ip=" << _sender_ip << rows_str.str(); @@ -142,7 +135,7 @@ Status LoadChannel::open(const PTabletWriterOpenRequest& params) { _is_high_priority, _self_profile); } { - std::lock_guard l(_tablets_channels_lock); + std::lock_guard l(_tablets_channels_lock); _tablets_channels.insert({index_id, channel}); } } @@ -244,7 +237,7 @@ Status LoadChannel::_handle_eos(BaseTabletsChannel* channel, if (finished) { std::lock_guard l(_lock); { - std::lock_guard l(_tablets_channels_lock); + std::lock_guard 
l(_tablets_channels_lock); _tablets_channels_rows.insert(std::make_pair( index_id, std::make_pair(channel->total_received_rows(), channel->num_rows_filtered()))); @@ -270,7 +263,7 @@ void LoadChannel::_report_profile(PTabletWriterAddBlockResult* response) { _self_profile->set_timestamp(_last_updated_time); { - std::lock_guard l(_tablets_channels_lock); + std::lock_guard l(_tablets_channels_lock); for (auto& it : _tablets_channels) { it.second->refresh_profile(); } diff --git a/be/src/runtime/load_channel.h b/be/src/runtime/load_channel.h index 6fad8c536ec4fa..36a8f363ba9bac 100644 --- a/be/src/runtime/load_channel.h +++ b/be/src/runtime/load_channel.h @@ -104,7 +104,7 @@ class LoadChannel { std::unordered_map> _tablets_channels; // index id -> (received rows, filtered rows) std::unordered_map> _tablets_channels_rows; - SpinLock _tablets_channels_lock; + std::mutex _tablets_channels_lock; // This is to save finished channels id, to handle the retry request. std::unordered_set _finished_channel_ids; // set to true if at least one tablets channel has been opened @@ -127,7 +127,6 @@ class LoadChannel { int64_t _backend_id; bool _enable_profile; - bool _need_release_memtracker = false; }; inline std::ostream& operator<<(std::ostream& os, LoadChannel& load_channel) { diff --git a/be/src/runtime/load_stream.cpp b/be/src/runtime/load_stream.cpp index 80cd167260c04d..460ad5e9580652 100644 --- a/be/src/runtime/load_stream.cpp +++ b/be/src/runtime/load_stream.cpp @@ -31,11 +31,14 @@ #include #include "bvar/bvar.h" +#include "cloud/config.h" #include "common/signal_handler.h" #include "exec/tablet_info.h" #include "gutil/ref_counted.h" +#include "olap/tablet.h" #include "olap/tablet_fwd.h" #include "olap/tablet_schema.h" +#include "runtime/exec_env.h" #include "runtime/fragment_mgr.h" #include "runtime/load_channel.h" #include "runtime/load_stream_mgr.h" @@ -149,6 +152,14 @@ Status TabletStream::append_data(const PStreamHeader& header, butil::IOBuf* data signal::set_signal_task_id(_load_id); g_load_stream_flush_running_threads << -1; auto st = _load_stream_writer->append_data(new_segid, header.offset(), buf, file_type); + if (!st.ok() && !config::is_cloud_mode()) { + auto res = ExecEnv::get_tablet(_id); + TabletSharedPtr tablet = + res.has_value() ? 
std::dynamic_pointer_cast(res.value()) : nullptr; + if (tablet) { + tablet->report_error(st); + } + } if (eos && st.ok()) { DBUG_EXECUTE_IF("TabletStream.append_data.unknown_file_type", { file_type = static_cast(-1); }); @@ -266,30 +277,43 @@ Status TabletStream::add_segment(const PStreamHeader& header, butil::IOBuf* data return _status; } -Status TabletStream::close() { - if (!_status.ok()) { - return _status; - } - - SCOPED_TIMER(_close_wait_timer); +Status TabletStream::_run_in_heavy_work_pool(std::function fn) { bthread::Mutex mu; std::unique_lock lock(mu); bthread::ConditionVariable cv; - auto wait_func = [this, &mu, &cv] { + auto st = Status::OK(); + auto func = [this, &mu, &cv, &st, &fn] { signal::set_signal_task_id(_load_id); - for (auto& token : _flush_tokens) { - token->wait(); - } + st = fn(); std::lock_guard lock(mu); cv.notify_one(); }; - bool ret = _load_stream_mgr->heavy_work_pool()->try_offer(wait_func); - if (ret) { - cv.wait(lock); - } else { - _status = Status::Error( + bool ret = _load_stream_mgr->heavy_work_pool()->try_offer(func); + if (!ret) { + return Status::Error( "there is not enough thread resource for close load"); - return _status; + } + cv.wait(lock); + return st; +} + +void TabletStream::pre_close() { + if (!_status.ok()) { + return; + } + + SCOPED_TIMER(_close_wait_timer); + _status = _run_in_heavy_work_pool([this]() { + for (auto& token : _flush_tokens) { + token->wait(); + } + return Status::OK(); + }); + // It is necessary to check the status after the wait, + // because create_rowset could fail during add_segment when loading to a MOW table; + // in this case, we should skip close to avoid submit_calc_delete_bitmap_task, which could cause a coredump. + if (!_status.ok()) { + return; } DBUG_EXECUTE_IF("TabletStream.close.segment_num_mismatch", { _num_segments++; }); @@ -297,32 +321,19 @@ Status TabletStream::close() { _status = Status::Corruption( "segment num mismatch in tablet {}, expected: {}, actual: {}, load_id: {}", _id, _num_segments, _next_segid.load(), print_id(_load_id)); - return _status; + return; } - // it is necessary to check status after wait_func, - // for create_rowset could fail during add_segment when loading to MOW table, - // in this case, should skip close to avoid submit_calc_delete_bitmap_task which could cause coredump. 
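// _run_in_heavy_work_pool() above offloads a closure to a shared pool and blocks
// the caller until the closure finishes. A generic sketch of that offload-and-wait
// shape using standard primitives (assumed simplification: the real code uses
// bthread::Mutex/ConditionVariable and LoadStreamMgr's heavy work pool, and
// returns an error instead of blocking when the pool rejects the job):
#include <condition_variable>
#include <functional>
#include <mutex>
#include <thread>

inline void run_and_wait(const std::function<void()>& fn) {
    std::mutex mu;
    std::condition_variable cv;
    bool done = false;
    std::thread([&] {  // stand-in for heavy_work_pool()->try_offer(...)
        fn();
        std::lock_guard<std::mutex> lock(mu);
        done = true;
        cv.notify_one();  // notify while holding the lock so cv outlives this call
    }).detach();
    std::unique_lock<std::mutex> lock(mu);
    cv.wait(lock, [&] { return done; });  // caller resumes only after fn() ran
}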
+ _status = _run_in_heavy_work_pool([this]() { return _load_stream_writer->pre_close(); }); +} + +Status TabletStream::close() { if (!_status.ok()) { return _status; } - auto close_func = [this, &mu, &cv]() { - signal::set_signal_task_id(_load_id); - auto st = _load_stream_writer->close(); - if (!st.ok() && _status.ok()) { - _status = st; - } - std::lock_guard lock(mu); - cv.notify_one(); - }; - ret = _load_stream_mgr->heavy_work_pool()->try_offer(close_func); - if (ret) { - cv.wait(lock); - } else { - _status = Status::Error( - "there is not enough thread resource for close load"); - } + SCOPED_TIMER(_close_wait_timer); + _status = _run_in_heavy_work_pool([this]() { return _load_stream_writer->close(); }); return _status; } @@ -391,6 +402,10 @@ void IndexStream::close(const std::vector& tablets_to_commit, } } + for (auto& [_, tablet_stream] : _tablet_streams_map) { + tablet_stream->pre_close(); + } + for (auto& [_, tablet_stream] : _tablet_streams_map) { auto st = tablet_stream->close(); if (st.ok()) { diff --git a/be/src/runtime/load_stream.h b/be/src/runtime/load_stream.h index 3b649c688355fe..c156eb45c8bddb 100644 --- a/be/src/runtime/load_stream.h +++ b/be/src/runtime/load_stream.h @@ -54,12 +54,15 @@ class TabletStream { Status add_segment(const PStreamHeader& header, butil::IOBuf* data); void add_num_segments(int64_t num_segments) { _num_segments += num_segments; } void disable_num_segments_check() { _check_num_segments = false; } + void pre_close(); Status close(); int64_t id() const { return _id; } friend std::ostream& operator<<(std::ostream& ostr, const TabletStream& tablet_stream); private: + Status _run_in_heavy_work_pool(std::function fn); + int64_t _id; LoadStreamWriterSharedPtr _load_stream_writer; std::vector> _flush_tokens; diff --git a/be/src/runtime/load_stream_writer.cpp b/be/src/runtime/load_stream_writer.cpp index 37243fab14bdb3..377b27e6e45105 100644 --- a/be/src/runtime/load_stream_writer.cpp +++ b/be/src/runtime/load_stream_writer.cpp @@ -201,7 +201,7 @@ Status LoadStreamWriter::add_segment(uint32_t segid, const SegmentStatistics& st } DBUG_EXECUTE_IF("LoadStreamWriter.add_segment.size_not_match", { segment_file_size++; }); - if (segment_file_size + inverted_file_size != stat.data_size) { + if (segment_file_size != stat.data_size) { return Status::Corruption( "add_segment failed, segment stat {} does not match, file size={}, inverted file " "size={}, stat.data_size={}, tablet id={}", @@ -245,8 +245,7 @@ Status LoadStreamWriter::_calc_file_size(uint32_t segid, FileType file_type, siz return Status::OK(); } -Status LoadStreamWriter::close() { - std::lock_guard l(_lock); +Status LoadStreamWriter::_pre_close() { SCOPED_ATTACH_TASK(_query_thread_context); if (!_is_init) { // if this delta writer is not initialized, but close() is called. 
@@ -306,6 +305,15 @@ Status LoadStreamWriter::close() { RETURN_IF_ERROR(_rowset_builder->build_rowset()); RETURN_IF_ERROR(_rowset_builder->submit_calc_delete_bitmap_task()); + _pre_closed = true; + return Status::OK(); +} + +Status LoadStreamWriter::close() { + std::lock_guard l(_lock); + if (!_pre_closed) { + RETURN_IF_ERROR(_pre_close()); + } RETURN_IF_ERROR(_rowset_builder->wait_calc_delete_bitmap()); // FIXME(plat1ko): No `commit_txn` operation in cloud mode, need better abstractions RETURN_IF_ERROR(static_cast(_rowset_builder.get())->commit_txn()); diff --git a/be/src/runtime/load_stream_writer.h b/be/src/runtime/load_stream_writer.h index b22817cb85cb47..8815b0f0e3e70a 100644 --- a/be/src/runtime/load_stream_writer.h +++ b/be/src/runtime/load_stream_writer.h @@ -70,14 +70,23 @@ class LoadStreamWriter { Status add_segment(uint32_t segid, const SegmentStatistics& stat, TabletSchemaSPtr flush_chema); - Status _calc_file_size(uint32_t segid, FileType file_type, size_t* file_size); + Status pre_close() { + std::lock_guard l(_lock); + return _pre_close(); + } // wait for all memtables to be flushed. Status close(); private: + Status _calc_file_size(uint32_t segid, FileType file_type, size_t* file_size); + + // without lock + Status _pre_close(); + bool _is_init = false; bool _is_canceled = false; + bool _pre_closed = false; WriteRequest _req; std::unique_ptr _rowset_builder; std::shared_ptr _rowset_writer; diff --git a/be/src/runtime/memory/cache_policy.h b/be/src/runtime/memory/cache_policy.h index 5241efb9c2924a..e7e1c73e7cbb41 100644 --- a/be/src/runtime/memory/cache_policy.h +++ b/be/src/runtime/memory/cache_policy.h @@ -48,7 +48,8 @@ class CachePolicy { CLOUD_TXN_DELETE_BITMAP_CACHE = 17, NONE = 18, // not be used FOR_UT_CACHE_NUMBER = 19, - QUERY_CACHE = 20 + QUERY_CACHE = 20, + TABLET_COLUMN_OBJECT_POOL = 21, }; static std::string type_string(CacheType type) { @@ -92,7 +93,9 @@ class CachePolicy { case CacheType::FOR_UT_CACHE_NUMBER: return "ForUTCacheNumber"; case CacheType::QUERY_CACHE: - return "QUERY_CACHE"; + return "QueryCache"; + case CacheType::TABLET_COLUMN_OBJECT_POOL: + return "TabletColumnObjectPool"; default: LOG(FATAL) << "not match type of cache policy :" << static_cast(type); } @@ -119,7 +122,8 @@ class CachePolicy { {"CreateTabletRRIdxCache", CacheType::CREATE_TABLET_RR_IDX_CACHE}, {"CloudTabletCache", CacheType::CLOUD_TABLET_CACHE}, {"CloudTxnDeleteBitmapCache", CacheType::CLOUD_TXN_DELETE_BITMAP_CACHE}, - {"ForUTCacheNumber", CacheType::FOR_UT_CACHE_NUMBER}}; + {"ForUTCacheNumber", CacheType::FOR_UT_CACHE_NUMBER}, + {"TabletColumnObjectPool", CacheType::TABLET_COLUMN_OBJECT_POOL}}; static CacheType string_to_type(std::string type) { if (StringToType.contains(type)) { diff --git a/be/src/runtime/memory/global_memory_arbitrator.cpp b/be/src/runtime/memory/global_memory_arbitrator.cpp index 45d7781786f2d7..0458dd72a33a35 100644 --- a/be/src/runtime/memory/global_memory_arbitrator.cpp +++ b/be/src/runtime/memory/global_memory_arbitrator.cpp @@ -19,6 +19,7 @@ #include +#include "runtime/process_profile.h" #include "runtime/thread_context.h" namespace doris { @@ -33,7 +34,7 @@ bvar::PassiveStatus g_sys_mem_avail( "meminfo_sys_mem_avail", [](void*) { return GlobalMemoryArbitrator::sys_mem_available(); }, nullptr); -std::atomic GlobalMemoryArbitrator::_s_process_reserved_memory = 0; +std::atomic GlobalMemoryArbitrator::_process_reserved_memory = 0; std::atomic GlobalMemoryArbitrator::refresh_interval_memory_growth = 0; std::mutex 
GlobalMemoryArbitrator::cache_adjust_capacity_lock; std::condition_variable GlobalMemoryArbitrator::cache_adjust_capacity_cv; @@ -45,9 +46,10 @@ std::atomic GlobalMemoryArbitrator::memtable_memory_refresh_notify {false} bool GlobalMemoryArbitrator::try_reserve_process_memory(int64_t bytes) { if (sys_mem_available() - bytes < MemInfo::sys_mem_available_warning_water_mark()) { + doris::ProcessProfile::instance()->memory_profile()->print_log_process_usage(); return false; } - int64_t old_reserved_mem = _s_process_reserved_memory.load(std::memory_order_relaxed); + int64_t old_reserved_mem = _process_reserved_memory.load(std::memory_order_relaxed); int64_t new_reserved_mem = 0; do { new_reserved_mem = old_reserved_mem + bytes; @@ -55,15 +57,16 @@ bool GlobalMemoryArbitrator::try_reserve_process_memory(int64_t bytes) { refresh_interval_memory_growth.load(std::memory_order_relaxed) + new_reserved_mem >= MemInfo::soft_mem_limit())) { + doris::ProcessProfile::instance()->memory_profile()->print_log_process_usage(); return false; } - } while (!_s_process_reserved_memory.compare_exchange_weak(old_reserved_mem, new_reserved_mem, - std::memory_order_relaxed)); + } while (!_process_reserved_memory.compare_exchange_weak(old_reserved_mem, new_reserved_mem, + std::memory_order_relaxed)); return true; } void GlobalMemoryArbitrator::release_process_reserved_memory(int64_t bytes) { - _s_process_reserved_memory.fetch_sub(bytes, std::memory_order_relaxed); + _process_reserved_memory.fetch_sub(bytes, std::memory_order_relaxed); } int64_t GlobalMemoryArbitrator::sub_thread_reserve_memory(int64_t bytes) { diff --git a/be/src/runtime/memory/global_memory_arbitrator.h b/be/src/runtime/memory/global_memory_arbitrator.h index 1859f45391fca3..075113088fbc5b 100644 --- a/be/src/runtime/memory/global_memory_arbitrator.h +++ b/be/src/runtime/memory/global_memory_arbitrator.h @@ -17,7 +17,7 @@ #pragma once -#include "runtime/memory/mem_tracker_limiter.h" +#include "runtime/process_profile.h" #include "util/mem_info.h" namespace doris { @@ -107,7 +107,7 @@ class GlobalMemoryArbitrator { static void release_process_reserved_memory(int64_t bytes); static inline int64_t process_reserved_memory() { - return _s_process_reserved_memory.load(std::memory_order_relaxed); + return _process_reserved_memory.load(std::memory_order_relaxed); } // `process_memory_usage` includes all reserved memory. if a thread has `reserved_memory`, @@ -122,8 +122,12 @@ class GlobalMemoryArbitrator { if (bytes <= 0) { return false; } - return process_memory_usage() + bytes >= MemInfo::soft_mem_limit() || - sys_mem_available() - bytes < MemInfo::sys_mem_available_warning_water_mark(); + auto rt = process_memory_usage() + bytes >= MemInfo::soft_mem_limit() || + sys_mem_available() - bytes < MemInfo::sys_mem_available_warning_water_mark(); + if (rt) { + doris::ProcessProfile::instance()->memory_profile()->print_log_process_usage(); + } + return rt; } static bool is_exceed_hard_mem_limit(int64_t bytes = 0) { @@ -139,8 +143,12 @@ class GlobalMemoryArbitrator { // tcmalloc/jemalloc allocator cache does not participate in the mem check as part of the process physical memory. // because `new/malloc` will trigger mem hook when using tcmalloc/jemalloc allocator cache, // but it may not actually alloc physical memory, which is not expected in mem hook fail. 
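// These limit predicates share one shape: refuse a reservation of `bytes` when
// projected process usage crosses the (soft or hard) limit, or when system-available
// memory would drop below the corresponding water mark. A minimal sketch with the
// MemInfo calls replaced by plain parameters (names are illustrative):
#include <cstdint>

inline bool would_exceed(int64_t process_usage, int64_t sys_available, int64_t bytes,
                         int64_t limit, int64_t water_mark) {
    return process_usage + bytes >= limit || sys_available - bytes < water_mark;
}
// Example: would_exceed(90, 8, 12, 100, 2) is true because 90 + 12 >= 100, so the
// caller logs the process memory profile and rejects the reservation.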
- return process_memory_usage() + bytes >= MemInfo::mem_limit() || - sys_mem_available() - bytes < MemInfo::sys_mem_available_low_water_mark(); + auto rt = process_memory_usage() + bytes >= MemInfo::mem_limit() || + sys_mem_available() - bytes < MemInfo::sys_mem_available_low_water_mark(); + if (rt) { + doris::ProcessProfile::instance()->memory_profile()->print_log_process_usage(); + } + return rt; } static std::string process_mem_log_str() { @@ -192,7 +200,7 @@ class GlobalMemoryArbitrator { } private: - static std::atomic _s_process_reserved_memory; + static std::atomic _process_reserved_memory; }; } // namespace doris diff --git a/be/src/runtime/memory/heap_profiler.cpp b/be/src/runtime/memory/heap_profiler.cpp new file mode 100644 index 00000000000000..01ed82f76ef6d1 --- /dev/null +++ b/be/src/runtime/memory/heap_profiler.cpp @@ -0,0 +1,130 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "runtime/memory/heap_profiler.h" + +#ifdef USE_JEMALLOC +#include "jemalloc/jemalloc.h" +#endif +#include "agent/utils.h" +#include "common/config.h" +#include "io/fs/local_file_system.h" + +namespace doris { + +void HeapProfiler::set_prof_active(bool prof) { +#ifdef USE_JEMALLOC + std::lock_guard guard(_mutex); + try { + int err = jemallctl("prof.active", nullptr, nullptr, &prof, 1); + err |= jemallctl("prof.thread_active_init", nullptr, nullptr, &prof, 1); + if (err) { + LOG(WARNING) << "jemalloc heap profiling start failed, " << err; + } else { + LOG(WARNING) << "jemalloc heap profiling started"; + } + } catch (...) { + LOG(WARNING) << "jemalloc heap profiling start failed"; + } +#endif +} + +bool HeapProfiler::get_prof_dump(const std::string& profile_file_name) { +#ifdef USE_JEMALLOC + std::lock_guard guard(_mutex); + const char* file_name_ptr = profile_file_name.c_str(); + try { + int err = jemallctl("prof.dump", nullptr, nullptr, &file_name_ptr, sizeof(const char*)); + if (err) { + LOG(WARNING) << "dump heap profile failed, " << err; + return false; + } else { + LOG(INFO) << "dump heap profile to " << profile_file_name; + return true; + } + } catch (...) 
{ + LOG(WARNING) << "dump heap profile failed"; + return false; + } +#else + return false; +#endif +} + +static std::string jeprof_profile_to_dot(const std::string& profile_file_name) { + AgentUtils util; + const static std::string jeprof_path = fmt::format("{}/bin/jeprof", std::getenv("DORIS_HOME")); + const static std::string binary_path = + fmt::format("{}/lib/doris_be", std::getenv("DORIS_HOME")); + // https://doris.apache.org/community/developer-guide/debug-tool/#3-jeprof-parses-heap-profile + std::string jeprof_cmd = + fmt::format("{} --dot {} {}", jeprof_path, binary_path, profile_file_name); + std::string msg; + bool rc = util.exec_cmd(jeprof_cmd, &msg); + if (!rc) { + LOG(WARNING) << "jeprof profile to dot failed: " << msg; + } + return msg; +} + +void HeapProfiler::heap_profiler_start() { + set_prof_active(true); +} + +void HeapProfiler::heap_profiler_stop() { + set_prof_active(false); +} + +bool HeapProfiler::check_heap_profiler() { +#ifdef USE_JEMALLOC + size_t value = 0; + size_t sz = sizeof(value); + jemallctl("prof.active", &value, &sz, nullptr, 0); + return value; +#else + return false; +#endif +} + +std::string HeapProfiler::dump_heap_profile() { + if (!config::jeprofile_dir.empty()) { + auto st = io::global_local_filesystem()->create_directory(config::jeprofile_dir); + if (!st.ok()) { + LOG(WARNING) << "create jeprofile dir failed."; + return ""; + } + } + std::string profile_file_name = + fmt::format("{}/jeheap_dump.{}.{}.{}.heap", config::jeprofile_dir, std::time(nullptr), + getpid(), rand()); + if (get_prof_dump(profile_file_name)) { + return profile_file_name; + } else { + return ""; + } +} + +std::string HeapProfiler::dump_heap_profile_to_dot() { + std::string profile_file_name = dump_heap_profile(); + if (!profile_file_name.empty()) { + return jeprof_profile_to_dot(profile_file_name); + } else { + return ""; + } +} + +} // namespace doris diff --git a/be/src/runtime/memory/heap_profiler.h b/be/src/runtime/memory/heap_profiler.h new file mode 100644 index 00000000000000..7f156351200b3a --- /dev/null +++ b/be/src/runtime/memory/heap_profiler.h @@ -0,0 +1,43 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#include "runtime/exec_env.h" + +namespace doris { + +class HeapProfiler { +public: + static HeapProfiler* create_global_instance() { return new HeapProfiler(); } + static HeapProfiler* instance() { return ExecEnv::GetInstance()->get_heap_profiler(); } + HeapProfiler() = default; + + void heap_profiler_start(); + void heap_profiler_stop(); + bool check_heap_profiler(); + std::string dump_heap_profile(); + std::string dump_heap_profile_to_dot(); + +private: + void set_prof_active(bool prof); + bool get_prof_dump(const std::string& profile_file_name); + + std::mutex _mutex; +}; + +} // namespace doris diff --git a/be/src/runtime/memory/lru_cache_policy.h b/be/src/runtime/memory/lru_cache_policy.h index f65a1e23e1a3af..ea34e2837f1313 100644 --- a/be/src/runtime/memory/lru_cache_policy.h +++ b/be/src/runtime/memory/lru_cache_policy.h @@ -149,7 +149,7 @@ class LRUCachePolicy : public CachePolicy { std::lock_guard l(_lock); COUNTER_SET(_freed_entrys_counter, (int64_t)0); COUNTER_SET(_freed_memory_counter, (int64_t)0); - if (_stale_sweep_time_s <= 0 && _cache == ExecEnv::GetInstance()->get_dummy_lru_cache()) { + if (_stale_sweep_time_s <= 0 || _cache == ExecEnv::GetInstance()->get_dummy_lru_cache()) { return; } if (exceed_prune_limit()) { diff --git a/be/src/runtime/memory/mem_tracker.cpp b/be/src/runtime/memory/mem_tracker.cpp deleted file mode 100644 index 796e6c166e04fe..00000000000000 --- a/be/src/runtime/memory/mem_tracker.cpp +++ /dev/null @@ -1,53 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
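// Usage sketch for the HeapProfiler declared earlier in this patch (hypothetical
// caller; the HTTP actions that would drive it are not part of this hunk). Only
// APIs from the new header are used; dump_heap_profile_to_dot() returns an empty
// string on failure:
#include <iostream>
#include <string>
#include "runtime/memory/heap_profiler.h"

void dump_heap_profile_once() {
    doris::HeapProfiler::instance()->heap_profiler_start();
    // ... run the suspect workload so jemalloc samples its allocations ...
    std::string dot = doris::HeapProfiler::instance()->dump_heap_profile_to_dot();
    doris::HeapProfiler::instance()->heap_profiler_stop();
    if (dot.empty()) {
        std::cerr << "heap dump failed (jemalloc profiling may be disabled)\n";
    } else {
        std::cout << dot << '\n';  // render with graphviz, e.g. `dot -Tsvg`
    }
}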
-// This file is copied from - -#include "runtime/memory/mem_tracker.h" - -#include - -namespace doris { - -constexpr size_t MEM_TRACKERS_GROUP_NUM = 1000; -std::atomic mem_tracker_group_counter(0); -bvar::Adder g_memtracker_cnt("memtracker_cnt"); - -std::vector MemTracker::mem_tracker_pool(MEM_TRACKERS_GROUP_NUM); - -MemTracker::MemTracker(const std::string& label) { - _label = label; - _group_num = mem_tracker_group_counter.fetch_add(1) % MEM_TRACKERS_GROUP_NUM; - { - std::lock_guard l(mem_tracker_pool[_group_num].group_lock); - _trackers_group_it = mem_tracker_pool[_group_num].trackers.insert( - mem_tracker_pool[_group_num].trackers.end(), this); - } - g_memtracker_cnt << 1; -} - -MemTracker::~MemTracker() { - if (_group_num != -1) { - std::lock_guard l(mem_tracker_pool[_group_num].group_lock); - if (_trackers_group_it != mem_tracker_pool[_group_num].trackers.end()) { - mem_tracker_pool[_group_num].trackers.erase(_trackers_group_it); - _trackers_group_it = mem_tracker_pool[_group_num].trackers.end(); - } - g_memtracker_cnt << -1; - } -} - -} // namespace doris \ No newline at end of file diff --git a/be/src/runtime/memory/mem_tracker.h b/be/src/runtime/memory/mem_tracker.h index 82b05fe544afc8..36dfa8e44f1d6d 100644 --- a/be/src/runtime/memory/mem_tracker.h +++ b/be/src/runtime/memory/mem_tracker.h @@ -33,8 +33,8 @@ namespace doris { class MemTracker final { public: MemTracker() = default; - MemTracker(const std::string& label); - ~MemTracker(); + MemTracker(std::string label) : _label(std::move(label)) {}; + ~MemTracker() = default; void consume(int64_t bytes) { _mem_counter.add(bytes); } void consume_no_update_peak(int64_t bytes) { _mem_counter.add_no_update_peak(bytes); } @@ -53,21 +53,6 @@ class MemTracker final { private: MemCounter _mem_counter; std::string _label {"None"}; - - /* - * Save all MemTrackers, used by dump memory info. - */ - struct TrackersGroup { - std::list trackers; - std::mutex group_lock; - }; - // Each group corresponds to several MemCountes and has a lock. - // Multiple groups are used to reduce the impact of locks. - static std::vector mem_tracker_pool; - // Group number in mem_tracker_pool, generated by the timestamp. - int64_t _group_num {-1}; - // Iterator into mem_tracker_pool for this object. Stored to have O(1) remove. 
- std::list::iterator _trackers_group_it; }; } // namespace doris diff --git a/be/src/runtime/memory/mem_tracker_limiter.cpp b/be/src/runtime/memory/mem_tracker_limiter.cpp index 78e66b6a579b79..05ff13f0e7c646 100644 --- a/be/src/runtime/memory/mem_tracker_limiter.cpp +++ b/be/src/runtime/memory/mem_tracker_limiter.cpp @@ -33,27 +33,15 @@ #include "runtime/workload_group/workload_group.h" #include "service/backend_options.h" #include "util/mem_info.h" -#include "util/perf_counters.h" #include "util/runtime_profile.h" namespace doris { static bvar::Adder memory_memtrackerlimiter_cnt("memory_memtrackerlimiter_cnt"); -static bvar::Adder memory_all_trackers_sum_bytes("memory_all_trackers_sum_bytes"); -static bvar::Adder memory_global_trackers_sum_bytes("memory_global_trackers_sum_bytes"); -static bvar::Adder memory_query_trackers_sum_bytes("memory_query_trackers_sum_bytes"); -static bvar::Adder memory_load_trackers_sum_bytes("memory_load_trackers_sum_bytes"); -static bvar::Adder memory_compaction_trackers_sum_bytes( - "memory_compaction_trackers_sum_bytes"); -static bvar::Adder memory_schema_change_trackers_sum_bytes( - "memory_schema_change_trackers_sum_bytes"); -static bvar::Adder memory_other_trackers_sum_bytes("memory_other_trackers_sum_bytes"); std::atomic mem_tracker_limiter_group_counter(0); constexpr auto GC_MAX_SEEK_TRACKER = 1000; -std::atomic MemTrackerLimiter::_enable_print_log_process_usage {true}; - // Reset before each free static std::unique_ptr free_top_memory_task_profile { std::make_unique("-")}; @@ -75,6 +63,7 @@ MemTrackerLimiter::MemTrackerLimiter(Type type, const std::string& label, int64_ _type = type; _label = label; _limit = byte_limit; + _uid = UniqueId::gen_uid(); if (_type == Type::GLOBAL) { _group_num = 0; } else { @@ -216,87 +205,38 @@ std::string MemTrackerLimiter::print_address_sanitizers() { return detail; } -MemTrackerLimiter::Snapshot MemTrackerLimiter::make_snapshot() const { - Snapshot snapshot; - snapshot.type = type_string(_type); - snapshot.label = _label; - snapshot.limit = _limit; - snapshot.cur_consumption = consumption(); - snapshot.peak_consumption = peak_consumption(); - return snapshot; -} - -MemTrackerLimiter::Snapshot MemTrackerLimiter::make_reserved_trackers_snapshot() const { - Snapshot snapshot; - snapshot.type = "reserved_memory"; - snapshot.label = _label; - snapshot.limit = -1; - snapshot.cur_consumption = reserved_consumption(); - snapshot.peak_consumption = reserved_peak_consumption(); - return snapshot; -} - -void MemTrackerLimiter::make_all_reserved_trackers_snapshots(std::vector* snapshots) { - for (auto& i : ExecEnv::GetInstance()->mem_tracker_limiter_pool) { - std::lock_guard l(i.group_lock); - for (auto trackerWptr : i.trackers) { - auto tracker = trackerWptr.lock(); - if (tracker != nullptr && tracker->reserved_consumption() != 0) { - (*snapshots).emplace_back(tracker->make_reserved_trackers_snapshot()); - } - } +RuntimeProfile* MemTrackerLimiter::make_profile(RuntimeProfile* profile) const { + RuntimeProfile* profile_snapshot = profile->create_child( + fmt::format("{}@{}@id={}", _label, type_string(_type), _uid.to_string()), true, false); + RuntimeProfile::Counter* current_usage_counter = + ADD_COUNTER(profile_snapshot, "CurrentUsage", TUnit::BYTES); + RuntimeProfile::Counter* peak_usage_counter = + ADD_COUNTER(profile_snapshot, "PeakUsage", TUnit::BYTES); + COUNTER_SET(current_usage_counter, consumption()); + COUNTER_SET(peak_usage_counter, peak_consumption()); + if (has_limit()) { + RuntimeProfile::Counter* limit_counter = + 
ADD_COUNTER(profile_snapshot, "Limit", TUnit::BYTES); + COUNTER_SET(limit_counter, _limit); + } + if (reserved_peak_consumption() != 0) { + RuntimeProfile::Counter* reserved_counter = + ADD_COUNTER(profile_snapshot, "ReservedMemory", TUnit::BYTES); + RuntimeProfile::Counter* reserved_peak_counter = + ADD_COUNTER(profile_snapshot, "ReservedPeakMemory", TUnit::BYTES); + COUNTER_SET(reserved_counter, reserved_consumption()); + COUNTER_SET(reserved_peak_counter, reserved_peak_consumption()); } + return profile_snapshot; } -void MemTrackerLimiter::refresh_global_counter() { - std::unordered_map type_mem_sum = { - {Type::GLOBAL, 0}, {Type::QUERY, 0}, {Type::LOAD, 0}, - {Type::COMPACTION, 0}, {Type::SCHEMA_CHANGE, 0}, {Type::OTHER, 0}}; - // always ExecEnv::ready(), because Daemon::_stop_background_threads_latch - for (auto& group : ExecEnv::GetInstance()->mem_tracker_limiter_pool) { - std::lock_guard l(group.group_lock); - for (auto trackerWptr : group.trackers) { - auto tracker = trackerWptr.lock(); - if (tracker != nullptr) { - type_mem_sum[tracker->type()] += tracker->consumption(); - } - } - } - int64_t all_trackers_mem_sum = 0; - for (auto it : type_mem_sum) { - MemTrackerLimiter::TypeMemSum[it.first].set(it.second); - - all_trackers_mem_sum += it.second; - switch (it.first) { - case Type::GLOBAL: - memory_global_trackers_sum_bytes - << it.second - memory_global_trackers_sum_bytes.get_value(); - break; - case Type::QUERY: - memory_query_trackers_sum_bytes - << it.second - memory_query_trackers_sum_bytes.get_value(); - break; - case Type::LOAD: - memory_load_trackers_sum_bytes - << it.second - memory_load_trackers_sum_bytes.get_value(); - break; - case Type::COMPACTION: - memory_compaction_trackers_sum_bytes - << it.second - memory_compaction_trackers_sum_bytes.get_value(); - break; - case Type::SCHEMA_CHANGE: - memory_schema_change_trackers_sum_bytes - << it.second - memory_schema_change_trackers_sum_bytes.get_value(); - break; - case Type::OTHER: - memory_other_trackers_sum_bytes - << it.second - memory_other_trackers_sum_bytes.get_value(); - } - } - all_trackers_mem_sum += MemInfo::allocator_cache_mem(); - all_trackers_mem_sum += MemInfo::allocator_metadata_mem(); - memory_all_trackers_sum_bytes << all_trackers_mem_sum - - memory_all_trackers_sum_bytes.get_value(); +std::string MemTrackerLimiter::make_profile_str() const { + std::unique_ptr profile_snapshot = + std::make_unique("MemTrackerSnapshot"); + make_profile(profile_snapshot.get()); + std::stringstream ss; + profile_snapshot->pretty_print(&ss); + return ss.str(); } void MemTrackerLimiter::clean_tracker_limiter_group() { @@ -317,78 +257,15 @@ void MemTrackerLimiter::clean_tracker_limiter_group() { #endif } -void MemTrackerLimiter::make_process_snapshots(std::vector* snapshots) { - MemTrackerLimiter::refresh_global_counter(); - int64_t all_trackers_mem_sum = 0; - Snapshot snapshot; - for (const auto& it : MemTrackerLimiter::TypeMemSum) { - snapshot.type = "overview"; - snapshot.label = type_string(it.first); - snapshot.limit = -1; - snapshot.cur_consumption = it.second.current_value(); - snapshot.peak_consumption = it.second.peak_value(); - (*snapshots).emplace_back(snapshot); - all_trackers_mem_sum += it.second.current_value(); - } - - snapshot.type = "overview"; - snapshot.label = "tc/jemalloc_cache"; - snapshot.limit = -1; - snapshot.cur_consumption = MemInfo::allocator_cache_mem(); - snapshot.peak_consumption = -1; - (*snapshots).emplace_back(snapshot); - all_trackers_mem_sum += MemInfo::allocator_cache_mem(); - - snapshot.type = 
"overview"; - snapshot.label = "tc/jemalloc_metadata"; - snapshot.limit = -1; - snapshot.cur_consumption = MemInfo::allocator_metadata_mem(); - snapshot.peak_consumption = -1; - (*snapshots).emplace_back(snapshot); - all_trackers_mem_sum += MemInfo::allocator_metadata_mem(); - - snapshot.type = "overview"; - snapshot.label = "reserved_memory"; - snapshot.limit = -1; - snapshot.cur_consumption = GlobalMemoryArbitrator::process_reserved_memory(); - snapshot.peak_consumption = -1; - (*snapshots).emplace_back(snapshot); - - snapshot.type = "overview"; - snapshot.label = "sum_of_all_trackers"; // is virtual memory - snapshot.limit = -1; - snapshot.cur_consumption = all_trackers_mem_sum; - snapshot.peak_consumption = -1; - (*snapshots).emplace_back(snapshot); - - snapshot.type = "overview"; -#ifdef ADDRESS_SANITIZER - snapshot.label = "[ASAN]VmRSS(process resident memory)"; // from /proc VmRSS VmHWM -#else - snapshot.label = "VmRSS(process resident memory)"; // from /proc VmRSS VmHWM -#endif - snapshot.limit = -1; - snapshot.cur_consumption = PerfCounters::get_vm_rss(); - snapshot.peak_consumption = PerfCounters::get_vm_hwm(); - (*snapshots).emplace_back(snapshot); - - snapshot.type = "overview"; - snapshot.label = "VmSize(process virtual memory)"; // from /proc VmSize VmPeak - snapshot.limit = -1; - snapshot.cur_consumption = PerfCounters::get_vm_size(); - snapshot.peak_consumption = PerfCounters::get_vm_peak(); - (*snapshots).emplace_back(snapshot); -} - -void MemTrackerLimiter::make_type_snapshots(std::vector* snapshots, - MemTrackerLimiter::Type type) { +void MemTrackerLimiter::make_type_trackers_profile(RuntimeProfile* profile, + MemTrackerLimiter::Type type) { if (type == Type::GLOBAL) { std::lock_guard l( ExecEnv::GetInstance()->mem_tracker_limiter_pool[0].group_lock); for (auto trackerWptr : ExecEnv::GetInstance()->mem_tracker_limiter_pool[0].trackers) { auto tracker = trackerWptr.lock(); if (tracker != nullptr) { - (*snapshots).emplace_back(tracker->make_snapshot()); + tracker->make_profile(profile); } } } else { @@ -398,125 +275,80 @@ void MemTrackerLimiter::make_type_snapshots(std::vector* snapshots, for (auto trackerWptr : ExecEnv::GetInstance()->mem_tracker_limiter_pool[i].trackers) { auto tracker = trackerWptr.lock(); if (tracker != nullptr && tracker->type() == type) { - (*snapshots).emplace_back(tracker->make_snapshot()); + tracker->make_profile(profile); } } } } } -void MemTrackerLimiter::make_top_consumption_snapshots(std::vector* snapshots, - int top_num) { - std::priority_queue max_pq; - // not include global type. +std::string MemTrackerLimiter::make_type_trackers_profile_str(MemTrackerLimiter::Type type) { + std::unique_ptr profile_snapshot = + std::make_unique("TypeMemTrackersSnapshot"); + make_type_trackers_profile(profile_snapshot.get(), type); + std::stringstream ss; + profile_snapshot->pretty_print(&ss); + return ss.str(); +} + +void MemTrackerLimiter::make_top_consumption_tasks_tracker_profile(RuntimeProfile* profile, + int top_num) { + std::unique_ptr tmp_profile_snapshot = + std::make_unique("tmpSnapshot"); + std::priority_queue> max_pq; + // start from 2, not include global type. 
for (unsigned i = 1; i < ExecEnv::GetInstance()->mem_tracker_limiter_pool.size(); ++i) { std::lock_guard l( ExecEnv::GetInstance()->mem_tracker_limiter_pool[i].group_lock); for (auto trackerWptr : ExecEnv::GetInstance()->mem_tracker_limiter_pool[i].trackers) { auto tracker = trackerWptr.lock(); if (tracker != nullptr) { - max_pq.emplace(tracker->make_snapshot()); + auto* profile_snapshot = tracker->make_profile(tmp_profile_snapshot.get()); + max_pq.emplace(tracker->consumption(), profile_snapshot); } } } while (!max_pq.empty() && top_num > 0) { - (*snapshots).emplace_back(max_pq.top()); + RuntimeProfile* profile_snapshot = + profile->create_child(max_pq.top().second->name(), true, false); + profile_snapshot->merge(max_pq.top().second); top_num--; max_pq.pop(); } } -void MemTrackerLimiter::make_all_trackers_snapshots(std::vector* snapshots) { - for (auto& i : ExecEnv::GetInstance()->mem_tracker_limiter_pool) { - std::lock_guard l(i.group_lock); - for (auto trackerWptr : i.trackers) { - auto tracker = trackerWptr.lock(); - if (tracker != nullptr) { - (*snapshots).emplace_back(tracker->make_snapshot()); - } - } - } -} - -void MemTrackerLimiter::make_all_memory_state_snapshots(std::vector* snapshots) { - make_process_snapshots(snapshots); - make_all_trackers_snapshots(snapshots); - make_all_reserved_trackers_snapshots(snapshots); -} - -std::string MemTrackerLimiter::log_usage(Snapshot snapshot) { - return fmt::format("MemTracker Label={}, Type={}, Limit={}({} B), Used={}({} B), Peak={}({} B)", - snapshot.label, snapshot.type, MemCounter::print_bytes(snapshot.limit), - snapshot.limit, MemCounter::print_bytes(snapshot.cur_consumption), - snapshot.cur_consumption, MemCounter::print_bytes(snapshot.peak_consumption), - snapshot.peak_consumption); -} +void MemTrackerLimiter::make_all_tasks_tracker_profile(RuntimeProfile* profile) { + std::unordered_map types_profile; + types_profile[Type::QUERY] = profile->create_child("QueryTasks", true, false); + types_profile[Type::LOAD] = profile->create_child("LoadTasks", true, false); + types_profile[Type::COMPACTION] = profile->create_child("CompactionTasks", true, false); + types_profile[Type::SCHEMA_CHANGE] = profile->create_child("SchemaChangeTasks", true, false); + types_profile[Type::OTHER] = profile->create_child("OtherTasks", true, false); -std::string MemTrackerLimiter::type_log_usage(Snapshot snapshot) { - return fmt::format("Type={}, Used={}({} B), Peak={}({} B)", snapshot.type, - MemCounter::print_bytes(snapshot.cur_consumption), snapshot.cur_consumption, - MemCounter::print_bytes(snapshot.peak_consumption), - snapshot.peak_consumption); -} - -std::string MemTrackerLimiter::type_detail_usage(const std::string& msg, Type type) { - std::string detail = fmt::format("{}, Type:{}, Memory Tracker Summary", msg, type_string(type)); + // start from 2, not include global type. 
for (unsigned i = 1; i < ExecEnv::GetInstance()->mem_tracker_limiter_pool.size(); ++i) { std::lock_guard l( ExecEnv::GetInstance()->mem_tracker_limiter_pool[i].group_lock); for (auto trackerWptr : ExecEnv::GetInstance()->mem_tracker_limiter_pool[i].trackers) { auto tracker = trackerWptr.lock(); - if (tracker != nullptr && tracker->type() == type) { - detail += "\n " + MemTrackerLimiter::log_usage(tracker->make_snapshot()); + if (tracker != nullptr) { + tracker->make_profile(types_profile[tracker->type()]); } } } - return detail; } void MemTrackerLimiter::print_log_usage(const std::string& msg) { if (_enable_print_log_usage) { _enable_print_log_usage = false; std::string detail = msg; - detail += "\nProcess Memory Summary:\n " + GlobalMemoryArbitrator::process_mem_log_str(); - detail += "\nMemory Tracker Summary: " + log_usage(); + detail += "\nProcess Memory Summary: " + GlobalMemoryArbitrator::process_mem_log_str(); + detail += "\n" + make_profile_str(); LOG(WARNING) << detail; } } -std::string MemTrackerLimiter::log_process_usage_str() { - std::string detail; - detail += "\nProcess Memory Summary:\n " + GlobalMemoryArbitrator::process_mem_log_str(); - std::vector snapshots; - MemTrackerLimiter::make_process_snapshots(&snapshots); - MemTrackerLimiter::make_type_snapshots(&snapshots, MemTrackerLimiter::Type::GLOBAL); - MemTrackerLimiter::make_top_consumption_snapshots(&snapshots, 15); - MemTrackerLimiter::make_all_reserved_trackers_snapshots(&snapshots); - - detail += "\nMemory Tracker Summary:"; - for (const auto& snapshot : snapshots) { - if (snapshot.label.empty()) { - detail += "\n " + MemTrackerLimiter::type_log_usage(snapshot); - } else { - detail += "\n " + MemTrackerLimiter::log_usage(snapshot); - } - } - - // Add additional tracker printed when memory exceeds limit. - detail += "\n " + - ExecEnv::GetInstance()->memtable_memory_limiter()->mem_tracker()->log_usage(); - return detail; -} - -void MemTrackerLimiter::print_log_process_usage() { - // The default interval between two prints is 100ms (config::memory_maintenance_sleep_time_ms). - if (MemTrackerLimiter::_enable_print_log_process_usage) { - MemTrackerLimiter::_enable_print_log_process_usage = false; - LOG(WARNING) << log_process_usage_str(); - } -} - std::string MemTrackerLimiter::tracker_limit_exceeded_str() { std::string err_msg = fmt::format( "memory tracker limit exceeded, tracker label:{}, type:{}, limit " diff --git a/be/src/runtime/memory/mem_tracker_limiter.h b/be/src/runtime/memory/mem_tracker_limiter.h index faf354cca4cbf3..445856b1f6af83 100644 --- a/be/src/runtime/memory/mem_tracker_limiter.h +++ b/be/src/runtime/memory/mem_tracker_limiter.h @@ -85,27 +85,47 @@ class MemTrackerLimiter final { OTHER = 5, }; - struct Snapshot { - std::string type; - std::string label; - int64_t limit = 0; - int64_t cur_consumption = 0; - int64_t peak_consumption = 0; - - bool operator<(const Snapshot& rhs) const { return cur_consumption < rhs.cur_consumption; } - }; + static std::string type_string(Type type) { + switch (type) { + case Type::GLOBAL: + return "global"; + case Type::QUERY: + return "query"; + case Type::LOAD: + return "load"; + case Type::COMPACTION: + return "compaction"; + case Type::SCHEMA_CHANGE: + return "schema_change"; + case Type::OTHER: + return "other"; + default: + LOG(FATAL) << "not match type of mem tracker limiter :" << static_cast(type); + } + LOG(FATAL) << "__builtin_unreachable"; + __builtin_unreachable(); + } - // Corresponding to MemTrackerLimiter::Type. 
- // MemCounter contains atomic variables, which are not allowed to be copied or moved. - inline static std::unordered_map TypeMemSum; + static std::string gc_type_string(GCType type) { + switch (type) { + case GCType::PROCESS: + return "process"; + case GCType::WORK_LOAD_GROUP: + return "work load group"; + default: + LOG(FATAL) << "not match gc type:" << static_cast(type); + } + LOG(FATAL) << "__builtin_unreachable"; + __builtin_unreachable(); + } /* * Part 2, Constructors and property methods */ - static std::shared_ptr create_shared( - MemTrackerLimiter::Type type, const std::string& label = std::string(), - int64_t byte_limit = -1); + static std::shared_ptr create_shared(MemTrackerLimiter::Type type, + const std::string& label, + int64_t byte_limit = -1); // byte_limit equal to -1 means no consumption limit, only participate in process memory statistics. MemTrackerLimiter(Type type, const std::string& label, int64_t byte_limit); @@ -119,12 +139,13 @@ class MemTrackerLimiter final { int64_t limit() const { return _limit; } bool limit_exceeded() const { return _limit >= 0 && _limit < consumption(); } Status check_limit(int64_t bytes = 0); + // Log the memory usage when memory limit is exceeded. + std::string tracker_limit_exceeded_str(); bool is_overcommit_tracker() const { return type() == Type::QUERY || type() == Type::LOAD; } bool is_query_cancelled() { return _is_query_cancelled; } void set_is_query_cancelled(bool is_cancelled) { _is_query_cancelled.store(is_cancelled); } - // Iterator into mem_tracker_limiter_pool for this object. Stored to have O(1) remove. - std::list>::iterator wg_tracker_limiter_group_it; + static void clean_tracker_limiter_group(); /* * Part 3, Memory tracking method (use carefully!) @@ -200,36 +221,18 @@ class MemTrackerLimiter final { DCHECK(reserved_consumption() >= 0); } - Snapshot make_reserved_trackers_snapshot() const; - static void make_all_reserved_trackers_snapshots(std::vector* snapshots); - /* - * Part 4, Memory snapshot and log method + * Part 4, Memory profile and log method */ + RuntimeProfile* make_profile(RuntimeProfile* profile) const; + std::string make_profile_str() const; + static void make_type_trackers_profile(RuntimeProfile* profile, MemTrackerLimiter::Type type); + static std::string make_type_trackers_profile_str(MemTrackerLimiter::Type type); + static void make_top_consumption_tasks_tracker_profile(RuntimeProfile* profile, int top_num); + static void make_all_tasks_tracker_profile(RuntimeProfile* profile); - static void refresh_global_counter(); - static void clean_tracker_limiter_group(); - - Snapshot make_snapshot() const; - // Returns a list of all the valid tracker snapshots. 
- static void make_process_snapshots(std::vector* snapshots); - static void make_type_snapshots(std::vector* snapshots, Type type); - static void make_all_trackers_snapshots(std::vector* snapshots); - static void make_all_memory_state_snapshots(std::vector* snapshots); - static void make_top_consumption_snapshots(std::vector* snapshots, int top_num); - - static std::string log_usage(Snapshot snapshot); - std::string log_usage() const { return log_usage(make_snapshot()); } - static std::string type_log_usage(Snapshot snapshot); - static std::string type_detail_usage(const std::string& msg, Type type); void print_log_usage(const std::string& msg); void enable_print_log_usage() { _enable_print_log_usage = true; } - // process memory changes more than 256M, or the GC ends - static void enable_print_log_process_usage() { _enable_print_log_process_usage = true; } - static std::string log_process_usage_str(); - static void print_log_process_usage(); - // Log the memory usage when memory limit is exceeded. - std::string tracker_limit_exceeded_str(); /* * Part 5, Memory GC method @@ -273,44 +276,6 @@ class MemTrackerLimiter final { bool is_group_commit_load {false}; private: - /* - * Part 7, Private method - */ - - static std::string type_string(Type type) { - switch (type) { - case Type::GLOBAL: - return "global"; - case Type::QUERY: - return "query"; - case Type::LOAD: - return "load"; - case Type::COMPACTION: - return "compaction"; - case Type::SCHEMA_CHANGE: - return "schema_change"; - case Type::OTHER: - return "other"; - default: - LOG(FATAL) << "not match type of mem tracker limiter :" << static_cast(type); - } - LOG(FATAL) << "__builtin_unreachable"; - __builtin_unreachable(); - } - - static std::string gc_type_string(GCType type) { - switch (type) { - case GCType::PROCESS: - return "process"; - case GCType::WORK_LOAD_GROUP: - return "work load group"; - default: - LOG(FATAL) << "not match gc type:" << static_cast(type); - } - LOG(FATAL) << "__builtin_unreachable"; - __builtin_unreachable(); - } - // only for Type::QUERY or Type::LOAD. static TUniqueId label_to_queryid(const std::string& label) { if (label.find("#Id=") == std::string::npos) { @@ -335,6 +300,8 @@ class MemTrackerLimiter final { // label used in the make snapshot, not guaranteed unique. std::string _label; + // For generate runtime profile, profile name must be unique. + UniqueId _uid; MemCounter _mem_counter; MemCounter _reserved_counter; @@ -354,7 +321,6 @@ class MemTrackerLimiter final { // Avoid frequent printing. bool _enable_print_log_usage = false; - static std::atomic _enable_print_log_process_usage; std::shared_ptr _query_statistics = nullptr; diff --git a/be/src/runtime/memory/memory_profile.cpp b/be/src/runtime/memory/memory_profile.cpp new file mode 100644 index 00000000000000..8dbdcbdd3af769 --- /dev/null +++ b/be/src/runtime/memory/memory_profile.cpp @@ -0,0 +1,353 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "runtime/memory/memory_profile.h" + +#include "bvar/reducer.h" +#include "runtime/exec_env.h" +#include "runtime/memory/global_memory_arbitrator.h" +#include "runtime/memory/mem_tracker_limiter.h" +#include "util/mem_info.h" +#include "util/runtime_profile.h" + +namespace doris { + +static bvar::Adder memory_all_tracked_sum_bytes("memory_all_tracked_sum_bytes"); +static bvar::Adder memory_global_trackers_sum_bytes("memory_global_trackers_sum_bytes"); +static bvar::Adder memory_query_trackers_sum_bytes("memory_query_trackers_sum_bytes"); +static bvar::Adder memory_load_trackers_sum_bytes("memory_load_trackers_sum_bytes"); +static bvar::Adder memory_compaction_trackers_sum_bytes( + "memory_compaction_trackers_sum_bytes"); +static bvar::Adder memory_schema_change_trackers_sum_bytes( + "memory_schema_change_trackers_sum_bytes"); +static bvar::Adder memory_other_trackers_sum_bytes("memory_other_trackers_sum_bytes"); +static bvar::Adder memory_reserved_memory_bytes("memory_reserved_memory_bytes"); +static bvar::Adder memory_all_tasks_memory_bytes("memory_all_tasks_memory_bytes"); +static bvar::Adder memory_untracked_memory_bytes("memory_untracked_memory_bytes"); + +MemoryProfile::MemoryProfile() { + _memory_overview_profile.set(std::make_unique("MemoryOverviewSnapshot")); + _global_memory_profile.set(std::make_unique("GlobalMemorySnapshot")); + _top_memory_tasks_profile.set(std::make_unique("TopMemoryTasksSnapshot")); + _tasks_memory_profile.set(std::make_unique("TasksMemorySnapshot")); +} + +void MemoryProfile::refresh_memory_overview_profile() { +#ifdef ADDRESS_SANITIZER + std::unique_ptr memory_overview_profile = + std::make_unique("[ASAN]MemoryOverviewSnapshot"); +#else + std::unique_ptr memory_overview_profile = + std::make_unique("MemoryOverviewSnapshot"); +#endif + std::unique_ptr global_memory_profile = + std::make_unique("GlobalMemorySnapshot"); + std::unique_ptr top_memory_tasks_profile = + std::make_unique("TopMemoryTasksSnapshot"); + + // 1. create profile + RuntimeProfile* untracked_memory_profile = + memory_overview_profile->create_child("UntrackedMemory", true, false); + RuntimeProfile* tracked_memory_profile = + memory_overview_profile->create_child("TrackedMemory", true, false); + RuntimeProfile* tasks_memory_overview_profile = + tracked_memory_profile->create_child("TasksMemory", true, false); + RuntimeProfile* tasks_memory_overview_details_profile = + tasks_memory_overview_profile->create_child("Details", true, false); + RuntimeProfile* global_memory_overview_profile = + tracked_memory_profile->create_child("GlobalMemory", true, false); + RuntimeProfile* jemalloc_memory_profile = + tracked_memory_profile->create_child("JemallocMemory", true, false); + RuntimeProfile* jemalloc_memory_details_profile = + jemalloc_memory_profile->create_child("Details", true, false); + + // 2. 
add counter + // 2.1 add process memory counter + RuntimeProfile::Counter* process_physical_memory_current_usage_counter = + ADD_COUNTER(memory_overview_profile, "PhysicalMemory(VmRSS)", TUnit::BYTES); + RuntimeProfile::Counter* process_physical_memory_peak_usage_counter = + memory_overview_profile->AddHighWaterMarkCounter("PhysicalMemoryPeak", TUnit::BYTES); + RuntimeProfile::Counter* process_virtual_memory_current_usage_counter = + ADD_COUNTER(memory_overview_profile, "VirtualMemory(VmSize)", TUnit::BYTES); + RuntimeProfile::Counter* process_virtual_memory_peak_usage_counter = + memory_overview_profile->AddHighWaterMarkCounter("VirtualMemoryPeak", TUnit::BYTES); + + // 2.2 add untracked memory counter + RuntimeProfile::Counter* untracked_memory_current_usage_counter = + ADD_COUNTER(untracked_memory_profile, "CurrentUsage", TUnit::BYTES); + RuntimeProfile::Counter* untracked_memory_peak_usage_counter = + untracked_memory_profile->AddHighWaterMarkCounter("PeakUsage", TUnit::BYTES); + + // 2.3 add tracked memory counter + RuntimeProfile::Counter* tracked_memory_current_usage_counter = + ADD_COUNTER(tracked_memory_profile, "CurrentUsage", TUnit::BYTES); + RuntimeProfile::Counter* tracked_memory_peak_usage_counter = + tracked_memory_profile->AddHighWaterMarkCounter("PeakUsage", TUnit::BYTES); + + // 2.4 add jemalloc memory counter + RuntimeProfile::Counter* jemalloc_memory_current_usage_counter = + ADD_COUNTER(jemalloc_memory_profile, "CurrentUsage", TUnit::BYTES); + RuntimeProfile::Counter* jemalloc_memory_peak_usage_counter = + jemalloc_memory_profile->AddHighWaterMarkCounter("PeakUsage", TUnit::BYTES); + RuntimeProfile::Counter* jemalloc_cache_current_usage_counter = + ADD_COUNTER(jemalloc_memory_details_profile, "Cache", TUnit::BYTES); + RuntimeProfile::Counter* jemalloc_cache_peak_usage_counter = + jemalloc_memory_details_profile->AddHighWaterMarkCounter("CachePeak", TUnit::BYTES); + RuntimeProfile::Counter* jemalloc_metadata_current_usage_counter = + ADD_COUNTER(jemalloc_memory_details_profile, "Metadata", TUnit::BYTES); + RuntimeProfile::Counter* jemalloc_metadata_peak_usage_counter = + jemalloc_memory_details_profile->AddHighWaterMarkCounter("MetadataPeak", TUnit::BYTES); + + // 2.5 add global memory counter + RuntimeProfile::Counter* global_current_usage_counter = + ADD_COUNTER(global_memory_overview_profile, "CurrentUsage", TUnit::BYTES); + RuntimeProfile::Counter* global_peak_usage_counter = + global_memory_overview_profile->AddHighWaterMarkCounter("PeakUsage", TUnit::BYTES); + + // 2.6 add tasks memory counter + RuntimeProfile::Counter* tasks_memory_current_usage_counter = + ADD_COUNTER_WITH_LEVEL(tasks_memory_overview_profile, "CurrentUsage", TUnit::BYTES, 1); + // Reserved memory is the sum of all task reserved memory, is duplicated with all task memory counter. 
+ RuntimeProfile::Counter* reserved_memory_current_usage_counter = ADD_CHILD_COUNTER_WITH_LEVEL( + tasks_memory_overview_profile, "ReservedMemory", TUnit::BYTES, "CurrentUsage", 1); + RuntimeProfile::Counter* reserved_memory_peak_usage_counter = + tasks_memory_overview_profile->AddHighWaterMarkCounter("ReservedMemoryPeak", + TUnit::BYTES, "CurrentUsage", 1); + RuntimeProfile::Counter* tasks_memory_peak_usage_counter = + tasks_memory_overview_profile->AddHighWaterMarkCounter("PeakUsage", TUnit::BYTES); + RuntimeProfile::Counter* query_current_usage_counter = + ADD_COUNTER_WITH_LEVEL(tasks_memory_overview_details_profile, "Query", TUnit::BYTES, 1); + RuntimeProfile::Counter* query_peak_usage_counter = + tasks_memory_overview_details_profile->AddHighWaterMarkCounter( + "QueryPeak", TUnit::BYTES, "Query", 1); + RuntimeProfile::Counter* load_current_usage_counter = + ADD_COUNTER_WITH_LEVEL(tasks_memory_overview_details_profile, "Load", TUnit::BYTES, 1); + RuntimeProfile::Counter* load_peak_usage_counter = + tasks_memory_overview_details_profile->AddHighWaterMarkCounter("LoadPeak", TUnit::BYTES, + "Load", 1); + RuntimeProfile::Counter* load_all_memtables_current_usage_counter = + ADD_CHILD_COUNTER_WITH_LEVEL(tasks_memory_overview_details_profile, + "AllMemTablesMemory", TUnit::BYTES, "Load", 1); + RuntimeProfile::Counter* load_all_memtables_peak_usage_counter = + ADD_CHILD_COUNTER_WITH_LEVEL(tasks_memory_overview_details_profile, + "AllMemTablesMemoryPeak", TUnit::BYTES, "Load", 1); + RuntimeProfile::Counter* compaction_current_usage_counter = ADD_COUNTER_WITH_LEVEL( + tasks_memory_overview_details_profile, "Compaction", TUnit::BYTES, 1); + RuntimeProfile::Counter* compaction_peak_usage_counter = + tasks_memory_overview_details_profile->AddHighWaterMarkCounter( + "CompactionPeak", TUnit::BYTES, "Compaction", 1); + RuntimeProfile::Counter* schema_change_current_usage_counter = ADD_COUNTER_WITH_LEVEL( + tasks_memory_overview_details_profile, "SchemaChange", TUnit::BYTES, 1); + RuntimeProfile::Counter* schema_change_peak_usage_counter = + tasks_memory_overview_details_profile->AddHighWaterMarkCounter( + "SchemaChangePeak", TUnit::BYTES, "SchemaChange", 1); + RuntimeProfile::Counter* other_current_usage_counter = + ADD_COUNTER_WITH_LEVEL(tasks_memory_overview_details_profile, "Other", TUnit::BYTES, 1); + RuntimeProfile::Counter* other_peak_usage_counter = + tasks_memory_overview_details_profile->AddHighWaterMarkCounter( + "OtherPeak", TUnit::BYTES, "Other", 1); + // 3. 
refresh counter + // 3.1 refresh process memory counter + COUNTER_SET(process_physical_memory_current_usage_counter, + PerfCounters::get_vm_rss()); // from /proc VmRSS VmHWM + COUNTER_SET(process_physical_memory_peak_usage_counter, PerfCounters::get_vm_hwm()); + COUNTER_SET(process_virtual_memory_current_usage_counter, + PerfCounters::get_vm_size()); // from /proc VmSize VmPeak + COUNTER_SET(process_virtual_memory_peak_usage_counter, PerfCounters::get_vm_peak()); + + // 3.2 refresh tracked memory counter + std::unordered_map type_mem_sum = { + {MemTrackerLimiter::Type::GLOBAL, 0}, {MemTrackerLimiter::Type::QUERY, 0}, + {MemTrackerLimiter::Type::LOAD, 0}, {MemTrackerLimiter::Type::COMPACTION, 0}, + {MemTrackerLimiter::Type::SCHEMA_CHANGE, 0}, {MemTrackerLimiter::Type::OTHER, 0}}; + // always ExecEnv::ready(), because Daemon::_stop_background_threads_latch + for (auto& group : ExecEnv::GetInstance()->mem_tracker_limiter_pool) { + std::lock_guard l(group.group_lock); + for (auto trackerWptr : group.trackers) { + auto tracker = trackerWptr.lock(); + if (tracker != nullptr) { + type_mem_sum[tracker->type()] += tracker->consumption(); + } + } + } + + int64_t all_tracked_mem_sum = 0; + int64_t tasks_trackers_mem_sum = 0; + for (auto it : type_mem_sum) { + all_tracked_mem_sum += it.second; + switch (it.first) { + case MemTrackerLimiter::Type::GLOBAL: + COUNTER_SET(global_current_usage_counter, it.second); + COUNTER_SET(global_peak_usage_counter, it.second); + memory_global_trackers_sum_bytes + << it.second - memory_global_trackers_sum_bytes.get_value(); + break; + case MemTrackerLimiter::Type::QUERY: + COUNTER_SET(query_current_usage_counter, it.second); + COUNTER_SET(query_peak_usage_counter, it.second); + tasks_trackers_mem_sum += it.second; + memory_query_trackers_sum_bytes + << it.second - memory_query_trackers_sum_bytes.get_value(); + break; + case MemTrackerLimiter::Type::LOAD: + COUNTER_SET(load_current_usage_counter, it.second); + COUNTER_SET(load_peak_usage_counter, it.second); + tasks_trackers_mem_sum += it.second; + memory_load_trackers_sum_bytes + << it.second - memory_load_trackers_sum_bytes.get_value(); + break; + case MemTrackerLimiter::Type::COMPACTION: + COUNTER_SET(compaction_current_usage_counter, it.second); + COUNTER_SET(compaction_peak_usage_counter, it.second); + tasks_trackers_mem_sum += it.second; + memory_compaction_trackers_sum_bytes + << it.second - memory_compaction_trackers_sum_bytes.get_value(); + break; + case MemTrackerLimiter::Type::SCHEMA_CHANGE: + COUNTER_SET(schema_change_current_usage_counter, it.second); + COUNTER_SET(schema_change_peak_usage_counter, it.second); + tasks_trackers_mem_sum += it.second; + memory_schema_change_trackers_sum_bytes + << it.second - memory_schema_change_trackers_sum_bytes.get_value(); + break; + case MemTrackerLimiter::Type::OTHER: + COUNTER_SET(other_current_usage_counter, it.second); + COUNTER_SET(other_peak_usage_counter, it.second); + tasks_trackers_mem_sum += it.second; + memory_other_trackers_sum_bytes + << it.second - memory_other_trackers_sum_bytes.get_value(); + } + } + + MemTrackerLimiter::make_type_trackers_profile(global_memory_profile.get(), + MemTrackerLimiter::Type::GLOBAL); + + MemTrackerLimiter::make_top_consumption_tasks_tracker_profile(top_memory_tasks_profile.get(), + 15); + + COUNTER_SET(tasks_memory_current_usage_counter, tasks_trackers_mem_sum); + COUNTER_SET(tasks_memory_peak_usage_counter, tasks_trackers_mem_sum); + memory_all_tasks_memory_bytes << tasks_trackers_mem_sum - + 
memory_all_tasks_memory_bytes.get_value(); + + COUNTER_SET(reserved_memory_current_usage_counter, + GlobalMemoryArbitrator::process_reserved_memory()); + COUNTER_SET(reserved_memory_peak_usage_counter, + GlobalMemoryArbitrator::process_reserved_memory()); + memory_reserved_memory_bytes << GlobalMemoryArbitrator::process_reserved_memory() - + memory_reserved_memory_bytes.get_value(); + + all_tracked_mem_sum += MemInfo::allocator_cache_mem(); + COUNTER_SET(jemalloc_cache_current_usage_counter, + static_cast(MemInfo::allocator_cache_mem())); + COUNTER_SET(jemalloc_cache_peak_usage_counter, + static_cast(MemInfo::allocator_cache_mem())); + all_tracked_mem_sum += MemInfo::allocator_metadata_mem(); + COUNTER_SET(jemalloc_metadata_current_usage_counter, + static_cast(MemInfo::allocator_metadata_mem())); + COUNTER_SET(jemalloc_metadata_peak_usage_counter, + static_cast(MemInfo::allocator_metadata_mem())); + COUNTER_SET(jemalloc_memory_current_usage_counter, + jemalloc_cache_current_usage_counter->value() + + jemalloc_metadata_current_usage_counter->value()); + COUNTER_SET(jemalloc_memory_peak_usage_counter, + jemalloc_cache_current_usage_counter->value() + + jemalloc_metadata_current_usage_counter->value()); + + COUNTER_SET(tracked_memory_current_usage_counter, all_tracked_mem_sum); + COUNTER_SET(tracked_memory_peak_usage_counter, all_tracked_mem_sum); + memory_all_tracked_sum_bytes << all_tracked_mem_sum - memory_all_tracked_sum_bytes.get_value(); + + // 3.3 refresh untracked memory counter + int64_t untracked_memory = + process_physical_memory_current_usage_counter->value() - all_tracked_mem_sum; + COUNTER_SET(untracked_memory_current_usage_counter, untracked_memory); + COUNTER_SET(untracked_memory_peak_usage_counter, untracked_memory); + memory_untracked_memory_bytes << untracked_memory - memory_untracked_memory_bytes.get_value(); + + // 3.4 refresh additional tracker printed when memory exceeds limit. + COUNTER_SET(load_all_memtables_current_usage_counter, + ExecEnv::GetInstance()->memtable_memory_limiter()->mem_tracker()->consumption()); + COUNTER_SET( + load_all_memtables_peak_usage_counter, + ExecEnv::GetInstance()->memtable_memory_limiter()->mem_tracker()->peak_consumption()); + + // 4. 
reset profile + _memory_overview_profile.set(std::move(memory_overview_profile)); + _global_memory_profile.set(std::move(global_memory_profile)); + _top_memory_tasks_profile.set(std::move(top_memory_tasks_profile)); +} + +void MemoryProfile::refresh_tasks_memory_profile() { + std::unique_ptr tasks_memory_profile = + std::make_unique("AllTasksMemorySnapshot"); + MemTrackerLimiter::make_all_tasks_tracker_profile(tasks_memory_profile.get()); + _tasks_memory_profile.set(std::move(tasks_memory_profile)); +} + +void MemoryProfile::make_memory_profile(RuntimeProfile* profile) const { + RuntimeProfile* memory_profile_snapshot = profile->create_child("MemoryProfile", true, false); + + auto memory_overview_version_ptr = _memory_overview_profile.get(); + RuntimeProfile* memory_overview_profile = + memory_profile_snapshot->create_child(memory_overview_version_ptr->name(), true, false); + memory_overview_profile->merge(const_cast(memory_overview_version_ptr.get())); + + auto global_memory_version_ptr = _global_memory_profile.get(); + RuntimeProfile* global_memory_profile = + memory_profile_snapshot->create_child(global_memory_version_ptr->name(), true, false); + global_memory_profile->merge(const_cast(global_memory_version_ptr.get())); + + auto top_memory_tasks_version_ptr = _top_memory_tasks_profile.get(); + RuntimeProfile* top_memory_tasks_profile = memory_profile_snapshot->create_child( + top_memory_tasks_version_ptr->name(), true, false); + top_memory_tasks_profile->merge( + const_cast(top_memory_tasks_version_ptr.get())); + + auto tasks_memory_version_ptr = _tasks_memory_profile.get(); + RuntimeProfile* tasks_memory_profile = + memory_profile_snapshot->create_child(tasks_memory_version_ptr->name(), true, false); + tasks_memory_profile->merge(const_cast(tasks_memory_version_ptr.get())); +} + +int64_t MemoryProfile::query_current_usage() { + return memory_query_trackers_sum_bytes.get_value(); +} +int64_t MemoryProfile::load_current_usage() { + return memory_load_trackers_sum_bytes.get_value(); +} +int64_t MemoryProfile::compaction_current_usage() { + return memory_compaction_trackers_sum_bytes.get_value(); +} +int64_t MemoryProfile::schema_change_current_usage() { + return memory_schema_change_trackers_sum_bytes.get_value(); +} +int64_t MemoryProfile::other_current_usage() { + return memory_other_trackers_sum_bytes.get_value(); +} + +void MemoryProfile::print_log_process_usage() { + if (_enable_print_log_process_usage) { + _enable_print_log_process_usage = false; + LOG(WARNING) << "Process Memory Summary: " + GlobalMemoryArbitrator::process_mem_log_str(); + LOG(WARNING) << "\n" << print_memory_overview_profile(); + LOG(WARNING) << "\n" << print_global_memory_profile(); + LOG(WARNING) << "\n" << print_top_memory_tasks_profile(); + } +} + +} // namespace doris diff --git a/be/src/runtime/memory/memory_profile.h b/be/src/runtime/memory/memory_profile.h new file mode 100644 index 00000000000000..9f1bab0c02a802 --- /dev/null +++ b/be/src/runtime/memory/memory_profile.h @@ -0,0 +1,82 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include + +#include "util/runtime_profile.h" + +namespace doris { + +class MemoryProfile { +public: + MemoryProfile(); + + void refresh_memory_overview_profile(); + void refresh_tasks_memory_profile(); + + void make_memory_profile(RuntimeProfile* profile) const; + + std::string print_memory_overview_profile() const { + std::stringstream ss; + auto version_ptr = _memory_overview_profile.get(); + version_ptr->pretty_print(&ss); + return ss.str(); + } + + std::string print_global_memory_profile() const { + std::stringstream ss; + auto version_ptr = _global_memory_profile.get(); + version_ptr->pretty_print(&ss); + return ss.str(); + } + + std::string print_top_memory_tasks_profile() const { + std::stringstream ss; + auto version_ptr = _top_memory_tasks_profile.get(); + version_ptr->pretty_print(&ss); + return ss.str(); + } + + std::string print_tasks_memory_profile() const { + std::stringstream ss; + auto version_ptr = _tasks_memory_profile.get(); + version_ptr->pretty_print(&ss); + return ss.str(); + } + + static int64_t query_current_usage(); + static int64_t load_current_usage(); + static int64_t compaction_current_usage(); + static int64_t schema_change_current_usage(); + static int64_t other_current_usage(); + + // process memory changes more than 256M, or the GC ends + void enable_print_log_process_usage() { _enable_print_log_process_usage = true; } + void print_log_process_usage(); + +private: + MultiVersion _memory_overview_profile; + MultiVersion _global_memory_profile; + MultiVersion _top_memory_tasks_profile; + MultiVersion _tasks_memory_profile; + + std::atomic _enable_print_log_process_usage {true}; +}; + +} // namespace doris diff --git a/be/src/runtime/memory/memory_reclamation.cpp b/be/src/runtime/memory/memory_reclamation.cpp index 17f5a41f462b50..2d6098f7438759 100644 --- a/be/src/runtime/memory/memory_reclamation.cpp +++ b/be/src/runtime/memory/memory_reclamation.cpp @@ -17,7 +17,8 @@ #include "runtime/memory/memory_reclamation.h" -#include "runtime/memory/cache_manager.h" +#include "runtime/exec_env.h" +#include "runtime/memory/mem_tracker_limiter.h" #include "runtime/workload_group/workload_group.h" #include "runtime/workload_group/workload_group_manager.h" #include "util/mem_info.h" @@ -55,9 +56,15 @@ bool MemoryReclamation::process_minor_gc(std::string mem_info) { } if (config::enable_query_memory_overcommit) { - VLOG_NOTICE << MemTrackerLimiter::type_detail_usage( - "[MemoryGC] before free top memory overcommit query in minor GC", - MemTrackerLimiter::Type::QUERY); + if (config::crash_in_memory_tracker_inaccurate) { + LOG(INFO) << fmt::format( + "[MemoryGC] before free top memory overcommit query in minor GC, Type:{}, " + "Memory " + "Tracker Summary: {}", + MemTrackerLimiter::type_string(MemTrackerLimiter::Type::QUERY), + MemTrackerLimiter::make_type_trackers_profile_str( + MemTrackerLimiter::Type::QUERY)); + } RuntimeProfile* toq_profile = profile->create_child("FreeTopOvercommitMemoryQuery", true, true); freed_mem += MemTrackerLimiter::free_top_overcommit_query( @@ -98,8 +105,14 @@ bool 
MemoryReclamation::process_full_gc(std::string mem_info) { } } - VLOG_NOTICE << MemTrackerLimiter::type_detail_usage( - "[MemoryGC] before free top memory query in full GC", MemTrackerLimiter::Type::QUERY); + if (config::crash_in_memory_tracker_inaccurate) { + LOG(INFO) << fmt::format( + "[MemoryGC] before free top memory query in full GC, Type:{}, Memory Tracker " + "Summary: " + "{}", + MemTrackerLimiter::type_string(MemTrackerLimiter::Type::QUERY), + MemTrackerLimiter::make_type_trackers_profile_str(MemTrackerLimiter::Type::QUERY)); + } RuntimeProfile* tmq_profile = profile->create_child("FreeTopMemoryQuery", true, true); freed_mem += MemTrackerLimiter::free_top_memory_query( MemInfo::process_full_gc_size() - freed_mem, mem_info, tmq_profile); @@ -108,9 +121,14 @@ bool MemoryReclamation::process_full_gc(std::string mem_info) { } if (config::enable_query_memory_overcommit) { - VLOG_NOTICE << MemTrackerLimiter::type_detail_usage( - "[MemoryGC] before free top memory overcommit load in full GC", - MemTrackerLimiter::Type::LOAD); + if (config::crash_in_memory_tracker_inaccurate) { + LOG(INFO) << fmt::format( + "[MemoryGC] before free top memory overcommit load in full GC, Type:{}, Memory " + "Tracker Summary: {}", + MemTrackerLimiter::type_string(MemTrackerLimiter::Type::LOAD), + MemTrackerLimiter::make_type_trackers_profile_str( + MemTrackerLimiter::Type::LOAD)); + } RuntimeProfile* tol_profile = profile->create_child("FreeTopMemoryOvercommitLoad", true, true); freed_mem += MemTrackerLimiter::free_top_overcommit_load( @@ -120,8 +138,14 @@ bool MemoryReclamation::process_full_gc(std::string mem_info) { } } - VLOG_NOTICE << MemTrackerLimiter::type_detail_usage( - "[MemoryGC] before free top memory load in full GC", MemTrackerLimiter::Type::LOAD); + if (config::crash_in_memory_tracker_inaccurate) { + LOG(INFO) << fmt::format( + "[MemoryGC] before free top memory load in full GC, Type:{}, Memory Tracker " + "Summary: " + "{}", + MemTrackerLimiter::type_string(MemTrackerLimiter::Type::LOAD), + MemTrackerLimiter::make_type_trackers_profile_str(MemTrackerLimiter::Type::LOAD)); + } RuntimeProfile* tml_profile = profile->create_child("FreeTopMemoryLoad", true, true); freed_mem += MemTrackerLimiter::free_top_memory_load( MemInfo::process_full_gc_size() - freed_mem, mem_info, tml_profile); diff --git a/be/src/runtime/memory/thread_mem_tracker_mgr.h b/be/src/runtime/memory/thread_mem_tracker_mgr.h index fd14750d8b8ebc..db3b32a6298820 100644 --- a/be/src/runtime/memory/thread_mem_tracker_mgr.h +++ b/be/src/runtime/memory/thread_mem_tracker_mgr.h @@ -111,7 +111,7 @@ class ThreadMemTrackerMgr { return fmt::format( "ThreadMemTrackerMgr debug, _untracked_mem:{}, " "_limiter_tracker:<{}>, _consumer_tracker_stack:<{}>", - std::to_string(_untracked_mem), _limiter_tracker->log_usage(), + std::to_string(_untracked_mem), _limiter_tracker->make_profile_str(), fmt::to_string(consumer_tracker_buf)); } diff --git a/be/src/runtime/process_profile.cpp b/be/src/runtime/process_profile.cpp new file mode 100644 index 00000000000000..d91aedbeac2025 --- /dev/null +++ b/be/src/runtime/process_profile.cpp @@ -0,0 +1,44 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "runtime/process_profile.h" + +#include + +#include "runtime/memory/memory_profile.h" + +namespace doris { + +ProcessProfile::ProcessProfile() { + _memory_profile = std::make_unique(); +} + +void ProcessProfile::refresh_profile() { + // 1. refresh profile + _memory_profile->refresh_memory_overview_profile(); + _memory_profile->refresh_tasks_memory_profile(); + // TODO refresh other profile + + // 2. make profile + std::unique_ptr process_profile = + std::make_unique("ProcessProfile"); + _memory_profile->make_memory_profile(process_profile.get()); + _process_profile.set(std::move(process_profile)); + // TODO make other profile +} + +} // namespace doris diff --git a/be/src/runtime/process_profile.h b/be/src/runtime/process_profile.h new file mode 100644 index 00000000000000..24b128ab5528e2 --- /dev/null +++ b/be/src/runtime/process_profile.h @@ -0,0 +1,62 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
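+
+// Usage sketch (illustrative; refresh_profile() is expected to be called periodically,
+// e.g. by a background daemon thread, while the print methods can be called at any
+// time and read the last snapshot published through MultiVersion):
+//
+//   ProcessProfile::instance()->refresh_profile();
+//   LOG(INFO) << ProcessProfile::instance()->print_process_profile_no_root();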
+ +#pragma once + +#include + +#include "runtime/exec_env.h" +#include "runtime/memory/memory_profile.h" +#include "util/runtime_profile.h" + +namespace doris { + +class ProcessProfile { +public: + static ProcessProfile* create_global_instance() { return new ProcessProfile(); } + static ProcessProfile* instance() { return ExecEnv::GetInstance()->get_process_profile(); } + ProcessProfile(); + + void refresh_profile(); + + std::string print_process_profile() const { + auto version_ptr = _process_profile.get(); + std::stringstream ss; + version_ptr->pretty_print(&ss); + return ss.str(); + } + + std::string print_process_profile_no_root() const { + std::stringstream ss; + std::vector profiles; + auto version_ptr = _process_profile.get(); + auto* process_profile = const_cast(version_ptr.get()); + process_profile->get_children(&profiles); + for (auto* profile : profiles) { + profile->pretty_print(&ss); + } + return ss.str(); + } + + MemoryProfile* memory_profile() { return _memory_profile.get(); } + +private: + MultiVersion _process_profile; + std::unique_ptr _memory_profile; +}; + +} // namespace doris diff --git a/be/src/runtime/query_context.cpp b/be/src/runtime/query_context.cpp index 8931854897e168..811fa6002b5cf5 100644 --- a/be/src/runtime/query_context.cpp +++ b/be/src/runtime/query_context.cpp @@ -26,11 +26,9 @@ #include #include #include -#include #include #include "common/logging.h" -#include "olap/olap_common.h" #include "pipeline/dependency.h" #include "pipeline/pipeline_fragment_context.h" #include "runtime/exec_env.h" @@ -74,12 +72,11 @@ const std::string toString(QuerySource queryType) { QueryContext::QueryContext(TUniqueId query_id, ExecEnv* exec_env, const TQueryOptions& query_options, TNetworkAddress coord_addr, - bool is_pipeline, bool is_nereids, TNetworkAddress current_connect_fe, + bool is_nereids, TNetworkAddress current_connect_fe, QuerySource query_source) : _timeout_second(-1), _query_id(query_id), _exec_env(exec_env), - _is_pipeline(is_pipeline), _is_nereids(is_nereids), _query_options(query_options), _query_source(query_source) { @@ -150,7 +147,7 @@ QueryContext::~QueryContext() { std::string mem_tracker_msg; if (query_mem_tracker->peak_consumption() != 0) { mem_tracker_msg = fmt::format( - ", deregister query/load memory tracker, queryId={}, Limit={}, CurrUsed={}, " + "deregister query/load memory tracker, queryId={}, Limit={}, CurrUsed={}, " "PeakUsed={}", print_id(_query_id), MemCounter::print_bytes(query_mem_tracker->limit()), MemCounter::print_bytes(query_mem_tracker->consumption()), @@ -159,8 +156,6 @@ QueryContext::~QueryContext() { uint64_t group_id = 0; if (_workload_group) { group_id = _workload_group->id(); // before remove - _workload_group->remove_mem_tracker_limiter(query_mem_tracker); - _workload_group->remove_query(_query_id); } _exec_env->runtime_query_statistics_mgr()->set_query_finished(print_id(_query_id)); @@ -182,8 +177,7 @@ QueryContext::~QueryContext() { } } - //TODO: check if pipeline and tracing both enabled - if (_is_pipeline && ExecEnv::GetInstance()->pipeline_tracer_context()->enabled()) [[unlikely]] { + if (ExecEnv::GetInstance()->pipeline_tracer_context()->enabled()) [[unlikely]] { try { ExecEnv::GetInstance()->pipeline_tracer_context()->end_query(_query_id, group_id); } catch (std::exception& e) { @@ -200,7 +194,8 @@ QueryContext::~QueryContext() { _exec_env->spill_stream_mgr()->async_cleanup_query(_query_id); DorisMetrics::instance()->query_ctx_cnt->increment(-1); - LOG_INFO("Query {} deconstructed, {}", print_id(this->_query_id), 
 mem_tracker_msg);
+    // This is the only message that marks a query's end; any other end-of-query info should be appended to it if needed.
+    LOG_INFO("Query {} deconstructed, mem_tracker: {}", print_id(this->_query_id), mem_tracker_msg);
 }
 
 void QueryContext::set_ready_to_execute(Status reason) {
diff --git a/be/src/runtime/query_context.h b/be/src/runtime/query_context.h
index d1d78573923a7e..ef753ee62259b4 100644
--- a/be/src/runtime/query_context.h
+++ b/be/src/runtime/query_context.h
@@ -79,8 +79,8 @@ class QueryContext {
 public:
     QueryContext(TUniqueId query_id, ExecEnv* exec_env, const TQueryOptions& query_options,
-                 TNetworkAddress coord_addr, bool is_pipeline, bool is_nereids,
-                 TNetworkAddress current_connect_fe, QuerySource query_type);
+                 TNetworkAddress coord_addr, bool is_nereids, TNetworkAddress current_connect_fe,
+                 QuerySource query_type);
 
     ~QueryContext();
 
@@ -193,8 +193,6 @@ class QueryContext {
 
     ThreadPool* get_memtable_flush_pool();
 
-    std::vector get_fragment_instance_ids() const { return fragment_instance_ids; }
-
     int64_t mem_limit() const { return _bytes_limit; }
 
     void set_merge_controller_handler(
@@ -248,7 +246,6 @@ class QueryContext {
     ExecEnv* _exec_env = nullptr;
     MonotonicStopWatch _query_watcher;
     int64_t _bytes_limit = 0;
-    bool _is_pipeline = false;
     bool _is_nereids = false;
     std::atomic _running_big_mem_op_num = 0;
 
@@ -257,7 +254,7 @@ class QueryContext {
     // And will be shared by all instances of this query.
     // So that we can control the max thread that a query can be used to execute.
     // If this token is not set, the scanner will be executed in "_scan_thread_pool" in exec env.
-    std::unique_ptr _thread_token;
+    std::unique_ptr _thread_token {nullptr};
 
     void _init_query_mem_tracker();
 
diff --git a/be/src/runtime/record_batch_queue.cpp b/be/src/runtime/record_batch_queue.cpp
index 83982688880948..25db550db3a7f1 100644
--- a/be/src/runtime/record_batch_queue.cpp
+++ b/be/src/runtime/record_batch_queue.cpp
@@ -23,10 +23,16 @@ namespace doris {
 
 bool RecordBatchQueue::blocking_get(std::shared_ptr* result) {
-    auto res = _queue.blocking_get(result);
-    if (_dep && size() <= 10) {
+    if (_dep && size() <= config::max_memory_sink_batch_count) {
         _dep->set_ready();
     }
+    // Set the sink task dependency ready before each get from the queue:
+    // if the sink task puts into the queue faster than the fetch task gets from it,
+    // the queue size will stay at max_memory_sink_batch_count.
+    // Be sure to set the sink dependency ready before getting from the queue; otherwise,
+    // if the queue is emptied after the sink task puts into it and before the dependency
+    // blocks, the get will be stuck and the sink dependency will never be set ready.
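+    // Two-sided sketch of the handshake (illustrative; C = config::max_memory_sink_batch_count):
+    //   sink task:  blocking_put(batch); once size() reaches C, the sink's queue dependency blocks it
+    //   fetch task: _dep->set_ready(); then _queue.blocking_get(result);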
+    auto res = _queue.blocking_get(result); return res; } diff --git a/be/src/runtime/result_buffer_mgr.cpp b/be/src/runtime/result_buffer_mgr.cpp index ccbf0c3ff6729e..86ef1efe63f573 100644 --- a/be/src/runtime/result_buffer_mgr.cpp +++ b/be/src/runtime/result_buffer_mgr.cpp @@ -32,6 +32,7 @@ #include "arrow/record_batch.h" #include "arrow/type_fwd.h" +#include "common/status.h" #include "runtime/buffer_control_block.h" #include "util/doris_metrics.h" #include "util/metrics.h" @@ -144,13 +145,13 @@ Status ResultBufferMgr::fetch_arrow_data(const TUniqueId& finst_id, return Status::OK(); } -void ResultBufferMgr::cancel(const TUniqueId& query_id) { +void ResultBufferMgr::cancel(const TUniqueId& query_id, const Status& reason) { { std::unique_lock wlock(_buffer_map_lock); auto iter = _buffer_map.find(query_id); if (_buffer_map.end() != iter) { - iter->second->cancel(); + iter->second->cancel(reason); _buffer_map.erase(iter); } } @@ -200,7 +201,7 @@ void ResultBufferMgr::cancel_thread() { // cancel query for (const auto& id : query_to_cancel) { - cancel(id); + cancel(id, Status::TimedOut("Query timeout")); } } while (!_stop_background_threads_latch.wait_for(std::chrono::seconds(1))); diff --git a/be/src/runtime/result_buffer_mgr.h b/be/src/runtime/result_buffer_mgr.h index 8bac69c23ac522..7534cd5c791f12 100644 --- a/be/src/runtime/result_buffer_mgr.h +++ b/be/src/runtime/result_buffer_mgr.h @@ -71,7 +71,7 @@ class ResultBufferMgr { std::shared_ptr find_arrow_schema(const TUniqueId& query_id); // cancel - void cancel(const TUniqueId& fragment_id); + void cancel(const TUniqueId& query_id, const Status& reason); // cancel one query at a future time. void cancel_at_time(time_t cancel_time, const TUniqueId& query_id); diff --git a/be/src/runtime/result_queue_mgr.cpp b/be/src/runtime/result_queue_mgr.cpp index 8090a3e6ee4787..8a6e5b1093542d 100644 --- a/be/src/runtime/result_queue_mgr.cpp +++ b/be/src/runtime/result_queue_mgr.cpp @@ -82,8 +82,10 @@ void ResultQueueMgr::create_queue(const TUniqueId& fragment_instance_id, if (iter != _fragment_queue_map.end()) { *queue = iter->second; } else { - // the blocking queue size = 20 (default), in this way, one queue have 20 * 1024 rows at most - BlockQueueSharedPtr tmp(new RecordBatchQueue(config::max_memory_sink_batch_count)); + // max_elements will not take effect: once the queue size reaches max_memory_sink_batch_count, + // MemoryScratchSink blocks on the queue dependency, so one queue holds 20 * 1024 rows at most. + // The MemoryScratchSink queue dependency is used instead of BlockingQueue to achieve blocking.
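// Illustrative sketch of the ordering rule described above, with stand-in
// types (Dependency, BoundedQueue); these are not the actual Doris classes.
// The consumer wakes the producer *before* blocking on the queue, so a
// producer that filled the queue and then blocked on its dependency can
// never deadlock against a consumer that drained the queue first.
#include <condition_variable>
#include <deque>
#include <mutex>

struct Dependency {
    std::mutex m;
    std::condition_variable cv;
    bool ready = false;
    void set_ready() {
        std::lock_guard<std::mutex> l(m);
        ready = true;
        cv.notify_all();
    }
    void block() {
        std::unique_lock<std::mutex> l(m);
        cv.wait(l, [this] { return ready; });
        ready = false; // consume the signal
    }
};

template <typename T>
struct BoundedQueue {
    Dependency* sink_dep = nullptr;
    std::mutex m;
    std::condition_variable cv;
    std::deque<T> q;

    void put(T v) {
        {
            std::lock_guard<std::mutex> l(m);
            q.push_back(std::move(v));
        }
        cv.notify_one();
    }

    bool blocking_get(T* out) {
        // Step 1: let the sink make progress first ...
        if (sink_dep != nullptr) {
            sink_dep->set_ready();
        }
        // Step 2: ... only then block waiting for an element. Reversing the
        // two steps can leave both sides blocked forever.
        std::unique_lock<std::mutex> l(m);
        cv.wait(l, [this] { return !q.empty(); });
        *out = std::move(q.front());
        q.pop_front();
        return true;
    }
};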
+ BlockQueueSharedPtr tmp(new RecordBatchQueue(config::max_memory_sink_batch_count * 2)); _fragment_queue_map.insert(std::make_pair(fragment_instance_id, tmp)); *queue = tmp; } diff --git a/be/src/runtime/routine_load/data_consumer.cpp b/be/src/runtime/routine_load/data_consumer.cpp index 92840721581671..b6272a92056575 100644 --- a/be/src/runtime/routine_load/data_consumer.cpp +++ b/be/src/runtime/routine_load/data_consumer.cpp @@ -261,12 +261,13 @@ Status KafkaDataConsumer::group_consume(BlockingQueue* queue, } [[fallthrough]]; case RdKafka::ERR__PARTITION_EOF: { - LOG(INFO) << "consumer meet partition eof: " << _id - << " partition offset: " << msg->offset(); + VLOG_NOTICE << "consumer meet partition eof: " << _id + << " partition offset: " << msg->offset(); _consuming_partition_ids.erase(msg->partition()); if (!queue->blocking_put(msg.get())) { done = true; } else if (_consuming_partition_ids.size() <= 0) { + LOG(INFO) << "all partitions meet eof: " << _id; msg.release(); done = true; } else { diff --git a/be/src/runtime/routine_load/routine_load_task_executor.cpp b/be/src/runtime/routine_load/routine_load_task_executor.cpp index 2c69b8a58704bf..84f0d283cac26b 100644 --- a/be/src/runtime/routine_load/routine_load_task_executor.cpp +++ b/be/src/runtime/routine_load/routine_load_task_executor.cpp @@ -42,6 +42,7 @@ #include "io/fs/multi_table_pipe.h" #include "io/fs/stream_load_pipe.h" #include "runtime/exec_env.h" +#include "runtime/memory/memory_profile.h" #include "runtime/message_body_sink.h" #include "runtime/routine_load/data_consumer.h" #include "runtime/routine_load/data_consumer_group.h" @@ -314,8 +315,7 @@ Status RoutineLoadTaskExecutor::submit_task(const TRoutineLoadTask& task) { bool RoutineLoadTaskExecutor::_reach_memory_limit() { bool is_exceed_soft_mem_limit = GlobalMemoryArbitrator::is_exceed_soft_mem_limit(); - auto current_load_mem_value = - MemTrackerLimiter::TypeMemSum[MemTrackerLimiter::Type::LOAD].current_value(); + auto current_load_mem_value = MemoryProfile::load_current_usage(); if (is_exceed_soft_mem_limit || current_load_mem_value > _load_mem_limit) { LOG(INFO) << "is_exceed_soft_mem_limit: " << is_exceed_soft_mem_limit << " current_load_mem_value: " << current_load_mem_value diff --git a/be/src/runtime/runtime_filter_mgr.cpp b/be/src/runtime/runtime_filter_mgr.cpp index 01fcf851321fc1..1a238787207b17 100644 --- a/be/src/runtime/runtime_filter_mgr.cpp +++ b/be/src/runtime/runtime_filter_mgr.cpp @@ -29,6 +29,7 @@ #include #include +#include "common/config.h" #include "common/logging.h" #include "common/status.h" #include "exprs/bloom_filter_func.h" @@ -129,6 +130,7 @@ Status RuntimeFilterMgr::register_local_merge_producer_filter( RETURN_IF_ERROR(IRuntimeFilter::create(_state, &desc, &options, RuntimeFilterRole::PRODUCER, -1, &merge_filter, build_bf_exactly, true)); + merge_filter->set_ignored(); iter->second.filters.emplace_back(merge_filter); } iter->second.merge_time++; @@ -150,7 +152,6 @@ Status RuntimeFilterMgr::get_local_merge_producer_filters( } *local_merge_filters = &iter->second; DCHECK(!iter->second.filters.empty()); - DCHECK_GT(iter->second.merge_time, 0); return Status::OK(); } @@ -228,13 +229,13 @@ Status RuntimeFilterMergeControllerEntity::_init_with_desc( // so we need to copy to cnt_val cnt_val->producer_size = producer_size; cnt_val->runtime_filter_desc = *runtime_filter_desc; - cnt_val->target_info = *target_info; cnt_val->pool.reset(new ObjectPool()); cnt_val->filter = cnt_val->pool->add(new IRuntimeFilter(_state, runtime_filter_desc)); auto 
filter_id = runtime_filter_desc->filter_id; RETURN_IF_ERROR(cnt_val->filter->init_with_desc(&cnt_val->runtime_filter_desc, query_options, -1, false)); + cnt_val->filter->set_ignored(); _filter_map.emplace(filter_id, cnt_val); return Status::OK(); } @@ -253,6 +254,7 @@ Status RuntimeFilterMergeControllerEntity::_init_with_desc( cnt_val->filter = cnt_val->pool->add(new IRuntimeFilter(_state, runtime_filter_desc)); auto filter_id = runtime_filter_desc->filter_id; RETURN_IF_ERROR(cnt_val->filter->init_with_desc(&cnt_val->runtime_filter_desc, query_options)); + cnt_val->filter->set_ignored(); std::unique_lock guard(_filter_map_mutex); _filter_map.emplace(filter_id, cnt_val); @@ -342,7 +344,10 @@ Status RuntimeFilterMergeControllerEntity::send_filter_size(const PSendFilterSiz auto* pquery_id = closure->request_->mutable_query_id(); pquery_id->set_hi(_state->query_id.hi()); pquery_id->set_lo(_state->query_id.lo()); - closure->cntl_->set_timeout_ms(std::min(3600, _state->execution_timeout) * 1000); + closure->cntl_->set_timeout_ms(get_execution_rpc_timeout_ms(_state->execution_timeout)); + if (config::execution_ignore_eovercrowded) { + closure->cntl_->ignore_eovercrowded(); + } closure->request_->set_filter_id(filter_id); closure->request_->set_filter_size(cnt_val->global_size); @@ -447,19 +452,29 @@ Status RuntimeFilterMergeControllerEntity::merge(const PMergeFilterRequest* requ DummyBrpcCallback::create_shared()); closure->request_->set_filter_id(request->filter_id()); - closure->request_->set_is_pipeline(request->has_is_pipeline() && - request->is_pipeline()); closure->request_->set_merge_time(merge_time); *closure->request_->mutable_query_id() = request->query_id(); if (has_attachment) { closure->cntl_->request_attachment().append(request_attachment); } - closure->cntl_->set_timeout_ms(std::min(3600, _state->execution_timeout) * 1000); + + closure->cntl_->set_timeout_ms(get_execution_rpc_timeout_ms(_state->execution_timeout)); + if (config::execution_ignore_eovercrowded) { + closure->cntl_->ignore_eovercrowded(); + } + // set fragment-id - for (auto& target_fragment_instance_id : target.target_fragment_instance_ids) { - PUniqueId* cur_id = closure->request_->add_fragment_instance_ids(); - cur_id->set_hi(target_fragment_instance_id.hi); - cur_id->set_lo(target_fragment_instance_id.lo); + if (target.__isset.target_fragment_ids) { + for (auto& target_fragment_id : target.target_fragment_ids) { + closure->request_->add_fragment_ids(target_fragment_id); + } + } else { + // FE not upgraded yet. 
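// A self-contained sketch of the rolling-upgrade dispatch used in this hunk:
// when a new optional Thrift field (target_fragment_ids) is added, the BE
// keeps consuming the legacy per-instance ids until every FE is upgraded,
// using the generated __isset flag to decide. All *Like types below are
// made-up stand-ins, not the real Thrift/protobuf classes.
#include <cstdint>
#include <vector>

struct TUniqueIdLike {
    int64_t hi;
    int64_t lo;
};

struct PRequestLike {
    std::vector<int32_t> fragment_ids;                       // new wire format
    std::vector<TUniqueIdLike> fragment_instance_ids;        // legacy wire format
};

struct TTargetLike {
    struct {
        bool target_fragment_ids = false; // set by the Thrift deserializer
    } __isset;
    std::vector<int32_t> target_fragment_ids;                 // optional, new FE only
    std::vector<TUniqueIdLike> target_fragment_instance_ids;  // always present
};

void fill_request(const TTargetLike& target, PRequestLike* req) {
    if (target.__isset.target_fragment_ids) {
        for (int32_t id : target.target_fragment_ids) {
            req->fragment_ids.push_back(id);
        }
    } else {
        // FE not upgraded yet: fall back to the legacy instance ids.
        req->fragment_instance_ids = target.target_fragment_instance_ids;
    }
}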
+ for (auto& target_fragment_instance_id : target.target_fragment_instance_ids) { + PUniqueId* cur_id = closure->request_->add_fragment_instance_ids(); + cur_id->set_hi(target_fragment_instance_id.hi); + cur_id->set_lo(target_fragment_instance_id.lo); + } } std::shared_ptr stub( diff --git a/be/src/runtime/runtime_filter_mgr.h b/be/src/runtime/runtime_filter_mgr.h index d89a3b9f1b1768..b0aea7568cff65 100644 --- a/be/src/runtime/runtime_filter_mgr.h +++ b/be/src/runtime/runtime_filter_mgr.h @@ -168,7 +168,6 @@ class RuntimeFilterMergeControllerEntity { int producer_size; uint64_t global_size; TRuntimeFilterDesc runtime_filter_desc; - std::vector target_info; std::vector targetv2_info; IRuntimeFilter* filter = nullptr; std::unordered_set arrive_id; diff --git a/be/src/runtime/runtime_state.cpp b/be/src/runtime/runtime_state.cpp index d4e3cba36cda97..e3f9d075c8ffc2 100644 --- a/be/src/runtime/runtime_state.cpp +++ b/be/src/runtime/runtime_state.cpp @@ -69,7 +69,6 @@ RuntimeState::RuntimeState(const TPlanFragmentExecParams& fragment_exec_params, _num_print_error_rows(0), _num_bytes_load_total(0), _num_finished_scan_range(0), - _normal_row_number(0), _error_row_number(0), _query_ctx(ctx) { Status status = @@ -110,7 +109,6 @@ RuntimeState::RuntimeState(const TUniqueId& instance_id, const TUniqueId& query_ _num_print_error_rows(0), _num_bytes_load_total(0), _num_finished_scan_range(0), - _normal_row_number(0), _error_row_number(0), _query_ctx(ctx) { [[maybe_unused]] auto status = init(instance_id, query_options, query_globals, exec_env); @@ -143,7 +141,6 @@ RuntimeState::RuntimeState(pipeline::PipelineFragmentContext*, const TUniqueId& _num_print_error_rows(0), _num_bytes_load_total(0), _num_finished_scan_range(0), - _normal_row_number(0), _error_row_number(0), _query_ctx(ctx) { [[maybe_unused]] auto status = init(instance_id, query_options, query_globals, exec_env); @@ -174,7 +171,6 @@ RuntimeState::RuntimeState(const TUniqueId& query_id, int32_t fragment_id, _num_print_error_rows(0), _num_bytes_load_total(0), _num_finished_scan_range(0), - _normal_row_number(0), _error_row_number(0), _query_ctx(ctx) { // TODO: do we really need instance id? diff --git a/be/src/runtime/runtime_state.h b/be/src/runtime/runtime_state.h index e7f2c18b09404a..abc823bc25b291 100644 --- a/be/src/runtime/runtime_state.h +++ b/be/src/runtime/runtime_state.h @@ -38,6 +38,7 @@ #include "agent/be_exec_version_manager.h" #include "cctz/time_zone.h" #include "common/compiler_util.h" // IWYU pragma: keep +#include "common/config.h" #include "common/factory_creator.h" #include "common/status.h" #include "gutil/integral_types.h" @@ -51,6 +52,10 @@ namespace doris { class IRuntimeFilter; +inline int32_t get_execution_rpc_timeout_ms(int32_t execution_timeout_sec) { + return std::min(config::execution_max_rpc_timeout_sec, execution_timeout_sec) * 1000; +} + namespace pipeline { class PipelineXLocalStateBase; class PipelineXSinkLocalStateBase; @@ -113,12 +118,6 @@ class RuntimeState { return _query_options.__isset.scan_queue_mem_limit ? 
_query_options.scan_queue_mem_limit : _query_options.mem_limit / 20; } - int64_t query_mem_limit() const { - if (_query_options.__isset.mem_limit && (_query_options.mem_limit > 0)) { - return _query_options.mem_limit; - } - return 0; - } ObjectPool* obj_pool() const { return _obj_pool.get(); } @@ -174,7 +173,7 @@ class RuntimeState { _query_options.check_overflow_for_decimal; } - bool enable_decima256() const { + bool enable_decimal256() const { return _query_options.__isset.enable_decimal256 && _query_options.enable_decimal256; } @@ -460,11 +459,6 @@ class RuntimeState { return _query_options.__isset.enable_profile && _query_options.enable_profile; } - bool enable_scan_node_run_serial() const { - return _query_options.__isset.enable_scan_node_run_serial && - _query_options.enable_scan_node_run_serial; - } - bool enable_share_hash_table_for_broadcast_join() const { return _query_options.__isset.enable_share_hash_table_for_broadcast_join && _query_options.enable_share_hash_table_for_broadcast_join; @@ -610,10 +604,6 @@ class RuntimeState { int task_num() const { return _task_num; } - vectorized::ColumnInt64* partial_update_auto_inc_column() { - return _partial_update_auto_inc_column; - }; - private: Status create_error_log_file(); @@ -698,7 +688,6 @@ class RuntimeState { size_t _content_length = 0; // mini load - int64_t _normal_row_number; int64_t _error_row_number; std::string _error_log_file_path; std::unique_ptr _error_log_file; // error file path, absolute path @@ -729,8 +718,6 @@ class RuntimeState { // prohibit copies RuntimeState(const RuntimeState&); - vectorized::ColumnInt64* _partial_update_auto_inc_column; - // save error log to s3 std::shared_ptr _s3_error_fs; // error file path on s3, ${bucket}/${prefix}/error_log/${label}_${fragment_instance_id} diff --git a/be/src/runtime/stream_load/stream_load_context.h b/be/src/runtime/stream_load/stream_load_context.h index 9d1601372f877d..93f76fad4e613c 100644 --- a/be/src/runtime/stream_load/stream_load_context.h +++ b/be/src/runtime/stream_load/stream_load_context.h @@ -164,9 +164,10 @@ class StreamLoadContext { // the following members control the max progress of a consuming // process. if any of them reach, the consuming will finish. 
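// Sketch of how limits like these typically end a consuming loop: the batch
// finishes as soon as *any* threshold is hit. The struct and helper are
// illustrative, not the actual routine-load consumer code.
#include <chrono>
#include <cstdint>

struct BatchLimits {
    int64_t max_interval_s = 60;
    int64_t max_batch_rows = 20000000;
    int64_t max_batch_size = 1024L * 1024 * 1024; // 1GB
};

bool batch_finished(const BatchLimits& limits,
                    std::chrono::steady_clock::time_point batch_start, int64_t rows,
                    int64_t bytes) {
    auto elapsed = std::chrono::duration_cast<std::chrono::seconds>(
                           std::chrono::steady_clock::now() - batch_start)
                           .count();
    return elapsed >= limits.max_interval_s || rows >= limits.max_batch_rows ||
           bytes >= limits.max_batch_size;
}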
-    int64_t max_interval_s = 5; - int64_t max_batch_rows = 100000; - int64_t max_batch_size = 100 * 1024 * 1024; // 100MB + // same as values set in fe/fe-core/src/main/java/org/apache/doris/load/routineload/RoutineLoadJob.java + int64_t max_interval_s = 60; + int64_t max_batch_rows = 20000000; + int64_t max_batch_size = 1024 * 1024 * 1024; // 1GB // for parse json-data std::string data_format = ""; diff --git a/be/src/runtime/stream_load/stream_load_executor.cpp b/be/src/runtime/stream_load/stream_load_executor.cpp index 2fc93d356879cb..ec83141893af8f 100644 --- a/be/src/runtime/stream_load/stream_load_executor.cpp +++ b/be/src/runtime/stream_load/stream_load_executor.cpp @@ -99,6 +99,7 @@ Status StreamLoadExecutor::execute_plan_fragment(std::shared_ptrbrief(); + ctx->number_loaded_rows = 0; // cancel body_sink, make sender known it if (ctx->body_sink != nullptr) { ctx->body_sink->cancel(status->to_string()); diff --git a/be/src/runtime/workload_group/workload_group.cpp b/be/src/runtime/workload_group/workload_group.cpp index 6f3b51f09fd1f2..c6a3c07adda1dd 100644 --- a/be/src/runtime/workload_group/workload_group.cpp +++ b/be/src/runtime/workload_group/workload_group.cpp @@ -144,21 +144,32 @@ void WorkloadGroup::check_and_update(const WorkloadGroupInfo& tg_info) { } } +// The MemTrackerLimiter is not removed when the query context is released, so it must be removed here. int64_t WorkloadGroup::make_memory_tracker_snapshots( std::list>* tracker_snapshots) { int64_t used_memory = 0; for (auto& mem_tracker_group : _mem_tracker_limiter_pool) { std::lock_guard l(mem_tracker_group.group_lock); - for (const auto& trackerWptr : mem_tracker_group.trackers) { - auto tracker = trackerWptr.lock(); - CHECK(tracker != nullptr); - if (tracker_snapshots != nullptr) { - tracker_snapshots->insert(tracker_snapshots->end(), tracker); + for (auto trackerWptr = mem_tracker_group.trackers.begin(); + trackerWptr != mem_tracker_group.trackers.end();) { + auto tracker = trackerWptr->lock(); + if (tracker == nullptr) { + trackerWptr = mem_tracker_group.trackers.erase(trackerWptr); + } else { + if (tracker_snapshots != nullptr) { + tracker_snapshots->insert(tracker_snapshots->end(), tracker); + } + used_memory += tracker->consumption(); + ++trackerWptr; } - used_memory += tracker->consumption(); } } - refresh_memory(used_memory); + // refresh total memory used. + _total_mem_used = used_memory; + // reserve memory is recorded in the query mem tracker + // and _total_mem_used already contains all the current reserve memory. + // so after refreshing _total_mem_used, reset _wg_refresh_interval_memory_growth. + _wg_refresh_interval_memory_growth.store(0.0); _mem_used_status->set_value(used_memory); return used_memory; } @@ -167,35 +178,38 @@ int64_t WorkloadGroup::memory_used() { return make_memory_tracker_snapshots(nullptr); } -void WorkloadGroup::refresh_memory(int64_t used_memory) { - // refresh total memory used. - _total_mem_used = used_memory; - // reserve memory is recorded in the query mem tracker - // and _total_mem_used already contains all the current reserve memory. - // so after refreshing _total_mem_used, reset _wg_refresh_interval_memory_growth. - _wg_refresh_interval_memory_growth.store(0.0); -} +void WorkloadGroup::do_sweep() { + // Clear the memtracker limiters that were registered during query or load.
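// The erase-while-iterating idiom that do_sweep() applies just below, shown
// in isolation: lock() each weak_ptr, drop expired entries via the iterator
// returned by erase(), and advance only when the entry survives.
#include <list>
#include <memory>

template <typename T>
void sweep_expired(std::list<std::weak_ptr<T>>& trackers) {
    for (auto it = trackers.begin(); it != trackers.end();) {
        if (it->lock() == nullptr) {
            it = trackers.erase(it); // erase() returns the next valid iterator
        } else {
            ++it;
        }
    }
}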
+ for (auto& mem_tracker_group : _mem_tracker_limiter_pool) { + std::lock_guard l(mem_tracker_group.group_lock); + for (auto trackerWptr = mem_tracker_group.trackers.begin(); + trackerWptr != mem_tracker_group.trackers.end();) { + auto tracker = trackerWptr->lock(); + if (tracker == nullptr) { + trackerWptr = mem_tracker_group.trackers.erase(trackerWptr); + } else { + ++trackerWptr; + } + } + } -void WorkloadGroup::add_mem_tracker_limiter(std::shared_ptr mem_tracker_ptr) { + // Clear query context that is registered during query context ctor std::unique_lock wlock(_mutex); - auto group_num = mem_tracker_ptr->group_num(); - std::lock_guard l(_mem_tracker_limiter_pool[group_num].group_lock); - mem_tracker_ptr->wg_tracker_limiter_group_it = - _mem_tracker_limiter_pool[group_num].trackers.insert( - _mem_tracker_limiter_pool[group_num].trackers.end(), mem_tracker_ptr); + for (auto iter = _query_ctxs.begin(); iter != _query_ctxs.end();) { + if (iter->second.lock() == nullptr) { + iter = _query_ctxs.erase(iter); + } else { + iter++; + } + } } -void WorkloadGroup::remove_mem_tracker_limiter(std::shared_ptr mem_tracker_ptr) { +void WorkloadGroup::add_mem_tracker_limiter(std::shared_ptr mem_tracker_ptr) { std::unique_lock wlock(_mutex); auto group_num = mem_tracker_ptr->group_num(); std::lock_guard l(_mem_tracker_limiter_pool[group_num].group_lock); - if (mem_tracker_ptr->wg_tracker_limiter_group_it != - _mem_tracker_limiter_pool[group_num].trackers.end()) { - _mem_tracker_limiter_pool[group_num].trackers.erase( - mem_tracker_ptr->wg_tracker_limiter_group_it); - mem_tracker_ptr->wg_tracker_limiter_group_it = - _mem_tracker_limiter_pool[group_num].trackers.end(); - } + _mem_tracker_limiter_pool[group_num].trackers.insert( + _mem_tracker_limiter_pool[group_num].trackers.end(), mem_tracker_ptr); } int64_t WorkloadGroup::gc_memory(int64_t need_free_mem, RuntimeProfile* profile, bool is_minor_gc) { @@ -230,14 +244,16 @@ int64_t WorkloadGroup::gc_memory(int64_t need_free_mem, RuntimeProfile* profile, auto cancel_top_overcommit_str = [cancel_str](int64_t mem_consumption, const std::string& label) { return fmt::format( - "{} cancel top memory overcommit tracker <{}> consumption {}. details:{}, Execute " + "{} cancel top memory overcommit tracker <{}> consumption {}. details:{}, " + "Execute " "again after enough memory, details see be.INFO.", cancel_str, label, MemCounter::print_bytes(mem_consumption), GlobalMemoryArbitrator::process_limit_exceeded_errmsg_str()); }; auto cancel_top_usage_str = [cancel_str](int64_t mem_consumption, const std::string& label) { return fmt::format( - "{} cancel top memory used tracker <{}> consumption {}. details:{}, Execute again " + "{} cancel top memory used tracker <{}> consumption {}. 
details:{}, Execute " + "again " "after enough memory, details see be.INFO.", cancel_str, label, MemCounter::print_bytes(mem_consumption), GlobalMemoryArbitrator::process_soft_limit_exceeded_errmsg_str()); @@ -249,7 +265,8 @@ int64_t WorkloadGroup::gc_memory(int64_t need_free_mem, RuntimeProfile* profile, _id, _name, _memory_limit, used_memory, need_free_mem); Defer defer {[&]() { LOG(INFO) << fmt::format( - "[MemoryGC] work load group finished gc, id:{} name:{}, memory limit: {}, used: " + "[MemoryGC] work load group finished gc, id:{} name:{}, memory limit: {}, " + "used: " "{}, need_free_mem: {}, freed memory: {}.", _id, _name, _memory_limit, used_memory, need_free_mem, freed_mem); }}; @@ -451,10 +468,9 @@ void WorkloadGroup::upsert_task_scheduler(WorkloadGroupInfo* tg_info, ExecEnv* e if (executors_size <= 0) { executors_size = CpuInfo::num_cores(); } - auto task_queue = std::make_shared(executors_size); std::unique_ptr pipeline_task_scheduler = - std::make_unique(exec_env, std::move(task_queue), - "Pipe_" + tg_name, cg_cpu_ctl_ptr); + std::make_unique(executors_size, "Pipe_" + tg_name, + cg_cpu_ctl_ptr); Status ret = pipeline_task_scheduler->start(); if (ret.ok()) { _task_sched = std::move(pipeline_task_scheduler); @@ -542,7 +558,8 @@ void WorkloadGroup::upsert_task_scheduler(WorkloadGroupInfo* tg_info, ExecEnv* e _cgroup_cpu_ctl->update_cpu_soft_limit( CgroupCpuCtl::cpu_soft_limit_default_value()); } else { - LOG(INFO) << "[upsert wg thread pool] enable cpu hard limit but value is illegal: " + LOG(INFO) << "[upsert wg thread pool] enable cpu hard limit but value is " + "illegal: " << cpu_hard_limit << ", gid=" << tg_id; } } else { diff --git a/be/src/runtime/workload_group/workload_group.h b/be/src/runtime/workload_group/workload_group.h index 2fbb4dd303059c..2ba84ce982b304 100644 --- a/be/src/runtime/workload_group/workload_group.h +++ b/be/src/runtime/workload_group/workload_group.h @@ -89,7 +89,8 @@ class WorkloadGroup : public std::enable_shared_from_this { std::list>* tracker_snapshots); // call make_memory_tracker_snapshots, so also refresh total memory used. 
int64_t memory_used(); - void refresh_memory(int64_t used_memory); + + void do_sweep(); int spill_threshold_low_water_mark() const { return _spill_low_watermark.load(std::memory_order_relaxed); @@ -132,8 +133,6 @@ class WorkloadGroup : public std::enable_shared_from_this { void add_mem_tracker_limiter(std::shared_ptr mem_tracker_ptr); - void remove_mem_tracker_limiter(std::shared_ptr mem_tracker_ptr); - // when mem_limit <=0 , it's an invalid value, then current group not participating in memory GC // because mem_limit is not a required property bool is_mem_limit_valid() { @@ -154,11 +153,6 @@ class WorkloadGroup : public std::enable_shared_from_this { return Status::OK(); } - void remove_query(TUniqueId query_id) { - std::unique_lock wlock(_mutex); - _query_ctxs.erase(query_id); - } - void shutdown() { std::unique_lock wlock(_mutex); _is_shutdown = true; @@ -169,11 +163,6 @@ class WorkloadGroup : public std::enable_shared_from_this { return _is_shutdown && _query_ctxs.empty(); } - int query_num() { - std::shared_lock r_lock(_mutex); - return _query_ctxs.size(); - } - int64_t gc_memory(int64_t need_free_mem, RuntimeProfile* profile, bool is_minor_gc); void upsert_task_scheduler(WorkloadGroupInfo* tg_info, ExecEnv* exec_env); @@ -209,6 +198,17 @@ class WorkloadGroup : public std::enable_shared_from_this { } int64_t get_remote_scan_bytes_per_second(); + CgroupCpuCtl* get_cgroup_cpu_ctl_ptr() { + std::shared_lock rlock(_task_sched_lock); + return _cgroup_cpu_ctl.get(); + } + + ThreadPool* get_memtable_flush_pool_ptr() { + // no lock here because this is called by memtable flush, + // to avoid lock competition with the workload thread pool's update + return _memtable_flush_pool.get(); + } + private: mutable std::shared_mutex _mutex; // lock _name, _version, _cpu_share, _memory_limit const uint64_t _id; diff --git a/be/src/runtime/workload_group/workload_group_manager.cpp b/be/src/runtime/workload_group/workload_group_manager.cpp index 65a8e3685c80ed..003f07f1db0c4a 100644 --- a/be/src/runtime/workload_group/workload_group_manager.cpp +++ b/be/src/runtime/workload_group/workload_group_manager.cpp @@ -136,6 +136,13 @@ void WorkloadGroupMgr::delete_workload_group_by_ids(std::set used_wg_i << ", before wg size=" << old_wg_size << ", after wg size=" << new_wg_size; } +void WorkloadGroupMgr::do_sweep() { + std::shared_lock r_lock(_group_mutex); + for (auto& [wg_id, wg] : _workload_groups) { + wg->do_sweep(); + } +} + struct WorkloadGroupMemInfo { int64_t total_mem_used = 0; std::list> tracker_snapshots = diff --git a/be/src/runtime/workload_group/workload_group_manager.h b/be/src/runtime/workload_group/workload_group_manager.h index d8547c3383e219..f76e98d26063ba 100644 --- a/be/src/runtime/workload_group/workload_group_manager.h +++ b/be/src/runtime/workload_group/workload_group_manager.h @@ -50,6 +50,8 @@ class WorkloadGroupMgr { WorkloadGroupPtr get_task_group_by_id(uint64_t tg_id); + void do_sweep(); + void stop(); std::atomic _enable_cpu_hard_limit = false; diff --git a/be/src/service/backend_service.cpp b/be/src/service/backend_service.cpp index aa29661da02208..86d47add0dadad 100644 --- a/be/src/service/backend_service.cpp +++ b/be/src/service/backend_service.cpp @@ -353,11 +353,8 @@ void _ingest_binlog(StorageEngine& engine, IngestBinlogArg* arg) { std::vector segment_index_file_names; auto tablet_schema = rowset_meta->tablet_schema(); if (tablet_schema->get_inverted_index_storage_format() == InvertedIndexStorageFormatPB::V1) { - for (const auto& index : tablet_schema->indexes()) { - if 
(index.index_type() != IndexType::INVERTED) { - continue; - } - auto index_id = index.index_id(); + for (const auto& index : tablet_schema->inverted_indexes()) { + auto index_id = index->index_id(); for (int64_t segment_index = 0; segment_index < num_segments; ++segment_index) { auto get_segment_index_file_size_url = fmt::format( "{}?method={}&tablet_id={}&rowset_id={}&segment_index={}&segment_index_id={" @@ -379,7 +376,7 @@ void _ingest_binlog(StorageEngine& engine, IngestBinlogArg* arg) { rowset_meta->rowset_id().to_string(), segment_index); segment_index_file_names.push_back(InvertedIndexDescriptor::get_index_file_path_v1( InvertedIndexDescriptor::get_index_file_path_prefix(segment_path), index_id, - index.get_index_suffix())); + index->get_index_suffix())); status = HttpClient::execute_with_retry(max_retry, 1, get_segment_index_file_size_cb); @@ -657,13 +654,6 @@ Status BaseBackendService::start_plan_fragment_execution( QuerySource::INTERNAL_FRONTEND); } -void BaseBackendService::cancel_plan_fragment(TCancelPlanFragmentResult& return_val, - const TCancelPlanFragmentParams& params) { - LOG(INFO) << "cancel_plan_fragment(): instance_id=" << print_id(params.fragment_instance_id); - _exec_env->fragment_mgr()->cancel_instance( - params.fragment_instance_id, Status::InternalError("cancel message received from FE")); -} - void BaseBackendService::transmit_data(TTransmitDataResult& return_val, const TTransmitDataParams& params) { VLOG_ROW << "transmit_data(): instance_id=" << params.dest_fragment_instance_id @@ -809,6 +799,11 @@ void BaseBackendService::submit_routine_load_task(TStatus& t_status, void BaseBackendService::open_scanner(TScanOpenResult& result_, const TScanOpenParams& params) { TStatus t_status; TUniqueId fragment_instance_id = generate_uuid(); + // A query_id is randomly generated to replace t_query_plan_info.query_id. + // external query does not need to report anything to FE, so the query_id can be changed. + // Otherwise, multiple independent concurrent open tablet scanners have the same query_id. + // when one of the scanners ends, the other scanners will be canceled through FragmentMgr.cancel(query_id). 
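// Why a fresh query id per scanner matters, in miniature: if independent
// scanners share one query id, cancelling that id tears all of them down.
// FragmentRegistry and the id generator are toy stand-ins for FragmentMgr
// and generate_uuid(), not the real implementations.
#include <cstdint>
#include <map>
#include <random>
#include <tuple>

struct UniqueId {
    uint64_t hi = 0;
    uint64_t lo = 0;
    bool operator<(const UniqueId& o) const { return std::tie(hi, lo) < std::tie(o.hi, o.lo); }
};

UniqueId generate_unique_id() {
    static thread_local std::mt19937_64 rng {std::random_device {}()};
    return {rng(), rng()};
}

struct FragmentRegistry {
    std::multimap<UniqueId, uint64_t> fragments_by_query; // query id -> fragment id

    // Cancels *every* fragment registered under this query id, which is why
    // unrelated scanners must not share one.
    void cancel(const UniqueId& query_id) { fragments_by_query.erase(query_id); }
};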
+ TUniqueId query_id = generate_uuid(); std::shared_ptr p_context; static_cast(_exec_env->external_scan_context_mgr()->create_scan_context(&p_context)); p_context->fragment_instance_id = fragment_instance_id; @@ -845,13 +840,18 @@ void BaseBackendService::open_scanner(TScanOpenResult& result_, const TScanOpenP << " deserialize error, should not be modified after returned Doris FE processed"; exec_st = Status::InvalidArgument(msg.str()); } - p_context->query_id = t_query_plan_info.query_id; + p_context->query_id = query_id; } std::vector selected_columns; if (exec_st.ok()) { // start the scan procedure + LOG(INFO) << fmt::format( + "exec external scanner, old_query_id = {}, new_query_id = {}, fragment_instance_id " + "= {}", + print_id(t_query_plan_info.query_id), print_id(query_id), + print_id(fragment_instance_id)); exec_st = _exec_env->fragment_mgr()->exec_external_plan_fragment( - params, t_query_plan_info, fragment_instance_id, &selected_columns); + params, t_query_plan_info, query_id, fragment_instance_id, &selected_columns); } exec_st.to_thrift(&t_status); //return status diff --git a/be/src/service/backend_service.h b/be/src/service/backend_service.h index 4d01107ba8a832..1d4219e21917b8 100644 --- a/be/src/service/backend_service.h +++ b/be/src/service/backend_service.h @@ -90,7 +90,7 @@ class BaseBackendService : public BackendServiceIf { const TExecPlanFragmentParams& params) override; void cancel_plan_fragment(TCancelPlanFragmentResult& return_val, - const TCancelPlanFragmentParams& params) override; + const TCancelPlanFragmentParams& params) override {}; void transmit_data(TTransmitDataResult& return_val, const TTransmitDataParams& params) override; diff --git a/be/src/service/http_service.cpp b/be/src/service/http_service.cpp index 9330867ded65a1..e7b920796a1b98 100644 --- a/be/src/service/http_service.cpp +++ b/be/src/service/http_service.cpp @@ -203,7 +203,20 @@ Status HttpService::start() { static_cast(PprofActions::setup(_env, _ev_http_server.get(), _pool)); // register jeprof actions - static_cast(JeprofileActions::setup(_env, _ev_http_server.get(), _pool)); + SetJeHeapProfileActiveActions* set_jeheap_profile_active_action = + _pool.add(new SetJeHeapProfileActiveActions(_env)); + _ev_http_server->register_handler(HttpMethod::GET, "/jeheap/active/{prof_value}", + set_jeheap_profile_active_action); + + DumpJeHeapProfileToDotActions* dump_jeheap_profile_to_dot_action = + _pool.add(new DumpJeHeapProfileToDotActions(_env)); + _ev_http_server->register_handler(HttpMethod::GET, "/jeheap/dump", + dump_jeheap_profile_to_dot_action); + + DumpJeHeapProfileActions* dump_jeheap_profile_action = + _pool.add(new DumpJeHeapProfileActions(_env)); + _ev_http_server->register_handler(HttpMethod::GET, "/jeheap/dump_only", + dump_jeheap_profile_action); // register metrics { @@ -374,7 +387,7 @@ void HttpService::register_local_handler(StorageEngine& engine) { _ev_http_server->register_handler(HttpMethod::POST, "/api/pad_rowset", pad_rowset_action); ReportAction* report_tablet_action = _pool.add(new ReportAction( - _env, TPrivilegeHier::GLOBAL, TPrivilegeType::ADMIN, "REPORT_OLAP_TABLE")); + _env, TPrivilegeHier::GLOBAL, TPrivilegeType::ADMIN, "REPORT_OLAP_TABLET")); _ev_http_server->register_handler(HttpMethod::GET, "/api/report/tablet", report_tablet_action); ReportAction* report_disk_action = _pool.add(new ReportAction( diff --git a/be/src/service/internal_service.cpp b/be/src/service/internal_service.cpp index 06dad46e90c15f..89b43ec5223501 100644 --- a/be/src/service/internal_service.cpp 
+++ b/be/src/service/internal_service.cpp @@ -886,13 +886,10 @@ void PInternalService::fetch_arrow_flight_schema(google::protobuf::RpcController Status PInternalService::_tablet_fetch_data(const PTabletKeyLookupRequest* request, PTabletKeyLookupResponse* response) { - PointQueryExecutor lookup_util; - RETURN_IF_ERROR(lookup_util.init(request, response)); - RETURN_IF_ERROR(lookup_util.lookup_up()); - if (VLOG_DEBUG_IS_ON) { - VLOG_DEBUG << lookup_util.print_profile(); - } - LOG_EVERY_N(INFO, 500) << lookup_util.print_profile(); + PointQueryExecutor executor; + RETURN_IF_ERROR(executor.init(request, response)); + RETURN_IF_ERROR(executor.lookup_up()); + executor.print_profile(); return Status::OK(); } @@ -1159,7 +1156,10 @@ void PInternalService::fetch_remote_tablet_schema(google::protobuf::RpcControlle LOG(WARNING) << "tablet does not exist, tablet id is " << tablet_id; continue; } - tablet_schemas.push_back(res.value()->merged_tablet_schema()); + auto schema = res.value()->merged_tablet_schema(); + if (schema != nullptr) { + tablet_schemas.push_back(schema); + } } if (!tablet_schemas.empty()) { // merge all @@ -1669,17 +1669,13 @@ void PInternalService::reset_rpc_channel(google::protobuf::RpcController* contro void PInternalService::hand_shake(google::protobuf::RpcController* controller, const PHandShakeRequest* request, PHandShakeResponse* response, google::protobuf::Closure* done) { - bool ret = _light_work_pool.try_offer([request, response, done]() { - brpc::ClosureGuard closure_guard(done); - if (request->has_hello()) { - response->set_hello(request->hello()); - } - response->mutable_status()->set_status_code(0); - }); - if (!ret) { - offer_failed(response, done, _light_work_pool); - return; + // The light pool may be full. Handshake is only used to check the brpc connection state, + // so it should not be affected by the thread pool logic. + brpc::ClosureGuard closure_guard(done); + if (request->has_hello()) { + response->set_hello(request->hello()); } + response->mutable_status()->set_status_code(0); } constexpr char HttpProtocol[] = "http://"; @@ -1970,7 +1966,7 @@ void PInternalServiceImpl::_response_pull_slave_rowset(const std::string& remote void PInternalServiceImpl::response_slave_tablet_pull_rowset( google::protobuf::RpcController* controller, const PTabletWriteSlaveDoneRequest* request, PTabletWriteSlaveDoneResult* response, google::protobuf::Closure* done) { - bool ret = _heavy_work_pool.try_offer([txn_mgr = _engine.txn_manager(), request, response, + bool ret = _light_work_pool.try_offer([txn_mgr = _engine.txn_manager(), request, response, done]() { brpc::ClosureGuard closure_guard(done); VLOG_CRITICAL << "receive the result of slave replica pull rowset from slave replica. " diff --git a/be/src/service/internal_service.h b/be/src/service/internal_service.h index 04c0a86ef046eb..b3ab1c5a6474c0 100644 --- a/be/src/service/internal_service.h +++ b/be/src/service/internal_service.h @@ -50,9 +50,11 @@ void offer_failed(T* response, google::protobuf::Closure* done, const FifoThread template void offer_failed(T* response, google::protobuf::Closure* done, const FifoThreadPool& pool) { brpc::ClosureGuard closure_guard(done); - response->mutable_status()->set_status_code(TStatusCode::CANCELLED); - response->mutable_status()->add_error_msgs("fail to offer request to the work pool, pool=" + - pool.get_info()); + // Use Status to generate the protobuf message, because it encodes the Backend info + // into the error message, so we can tell which backend's pool is full.
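// What routing the error through Status buys, with toy types: the Status
// layer can stamp the backend identity into the message once, so every
// "pool is full" error says which backend produced it. StatusLike and
// PStatusLike are illustrative stand-ins, not the real Doris classes.
#include <cstdint>
#include <string>

struct PStatusLike {
    int32_t status_code = 0;
    std::string error_msg;
};

struct StatusLike {
    int32_t code = 0;
    std::string msg;

    static std::string backend_tag() { return "BE 10.0.0.1:9060"; } // stand-in value

    static StatusLike internal_error(const std::string& m) {
        // The identity is appended centrally, once, instead of at every call site.
        return {1, m + " (" + backend_tag() + ")"};
    }

    void to_protobuf(PStatusLike* p) const {
        p->status_code = code;
        p->error_msg = msg;
    }
};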
+ Status st = Status::Error( + "fail to offer request to the work pool, pool={}", pool.get_info()); + st.to_protobuf(response->mutable_status()); LOG(WARNING) << "cancelled due to fail to offer request to the work pool, pool=" << pool.get_info(); } diff --git a/be/src/service/point_query_executor.cpp b/be/src/service/point_query_executor.cpp index 9719a672b8dff4..74dab466340330 100644 --- a/be/src/service/point_query_executor.cpp +++ b/be/src/service/point_query_executor.cpp @@ -22,6 +22,7 @@ #include #include #include +#include #include #include @@ -39,6 +40,7 @@ #include "olap/olap_tuple.h" #include "olap/row_cursor.h" #include "olap/rowset/beta_rowset.h" +#include "olap/rowset/rowset_fwd.h" #include "olap/storage_engine.h" #include "olap/tablet_manager.h" #include "olap/tablet_schema.h" @@ -313,34 +315,48 @@ Status PointQueryExecutor::lookup_up() { return Status::OK(); } -std::string PointQueryExecutor::print_profile() { +void PointQueryExecutor::print_profile() { auto init_us = _profile_metrics.init_ns.value() / 1000; auto init_key_us = _profile_metrics.init_key_ns.value() / 1000; auto lookup_key_us = _profile_metrics.lookup_key_ns.value() / 1000; auto lookup_data_us = _profile_metrics.lookup_data_ns.value() / 1000; auto output_data_us = _profile_metrics.output_data_ns.value() / 1000; + auto load_segments_key_us = _profile_metrics.load_segment_key_stage_ns.value() / 1000; + auto load_segments_data_us = _profile_metrics.load_segment_data_stage_ns.value() / 1000; auto total_us = init_us + lookup_key_us + lookup_data_us + output_data_us; auto read_stats = _profile_metrics.read_stats; - return fmt::format( - "" + const std::string stats_str = fmt::format( "[lookup profile:{}us] init:{}us, init_key:{}us," - "" - "" - "lookup_key:{}us, lookup_data:{}us, output_data:{}us, hit_lookup_cache:{}" - "" - "" + " lookup_key:{}us, load_segments_key:{}us, lookup_data:{}us, load_segments_data:{}us," + " output_data:{}us, " + "hit_lookup_cache:{}" ", is_binary_row:{}, output_columns:{}, total_keys:{}, row_cache_hits:{}" ", hit_cached_pages:{}, total_pages_read:{}, compressed_bytes_read:{}, " "io_latency:{}ns, " "uncompressed_bytes_read:{}, result_data_bytes:{}, row_hits:{}" - ", rs_column_uid:{}" - "", - total_us, init_us, init_key_us, lookup_key_us, lookup_data_us, output_data_us, - _profile_metrics.hit_lookup_cache, _binary_row_format, _reusable->output_exprs().size(), - _row_read_ctxs.size(), _profile_metrics.row_cache_hits, read_stats.cached_pages_num, + ", rs_column_uid:{}, bytes_read_from_local:{}, bytes_read_from_remote:{}, " + "local_io_timer:{}, remote_io_timer:{}, local_write_timer:{}", + total_us, init_us, init_key_us, lookup_key_us, load_segments_key_us, lookup_data_us, + load_segments_data_us, output_data_us, _profile_metrics.hit_lookup_cache, + _binary_row_format, _reusable->output_exprs().size(), _row_read_ctxs.size(), + _profile_metrics.row_cache_hits, read_stats.cached_pages_num, read_stats.total_pages_num, read_stats.compressed_bytes_read, read_stats.io_ns, read_stats.uncompressed_bytes_read, _profile_metrics.result_data_bytes, _row_hits, - _reusable->rs_column_uid()); + _reusable->rs_column_uid(), + _profile_metrics.read_stats.file_cache_stats.bytes_read_from_local, + _profile_metrics.read_stats.file_cache_stats.bytes_read_from_remote, + _profile_metrics.read_stats.file_cache_stats.local_io_timer, + _profile_metrics.read_stats.file_cache_stats.remote_io_timer, + _profile_metrics.read_stats.file_cache_stats.write_cache_io_timer); + + constexpr static int kSlowThreholdUs = 50 * 1000; // 
50ms + if (total_us > kSlowThreholdUs) { + LOG(WARNING) << "slow query, " << stats_str; + } else if (VLOG_DEBUG_IS_ON) { + VLOG_DEBUG << stats_str; + } else { + LOG_EVERY_N(INFO, 1000) << stats_str; + } } Status PointQueryExecutor::_init_keys(const PTabletKeyLookupRequest* request) { @@ -380,6 +396,17 @@ Status PointQueryExecutor::_lookup_row_key() { specified_rowsets = _tablet->get_rowset_by_ids(nullptr); } std::vector> segment_caches(specified_rowsets.size()); + // init segment_cache + { + SCOPED_TIMER(&_profile_metrics.load_segment_key_stage_ns); + for (size_t i = 0; i < specified_rowsets.size(); i++) { + auto& rs = specified_rowsets[i]; + segment_caches[i] = std::make_unique(); + RETURN_IF_ERROR(SegmentLoader::instance()->load_segments( + std::static_pointer_cast(rs), segment_caches[i].get(), true, true, + &_profile_metrics.read_stats)); + } + } for (size_t i = 0; i < _row_read_ctxs.size(); ++i) { RowLocation location; if (!config::disable_storage_row_cache) { @@ -396,7 +423,8 @@ Status PointQueryExecutor::_lookup_row_key() { auto rowset_ptr = std::make_unique(); st = (_tablet->lookup_row_key(_row_read_ctxs[i]._primary_key, nullptr, false, specified_rowsets, &location, INT32_MAX /*rethink?*/, - segment_caches, rowset_ptr.get(), false)); + segment_caches, rowset_ptr.get(), false, nullptr, + &_profile_metrics.read_stats)); if (st.is()) { continue; } @@ -459,7 +487,11 @@ Status PointQueryExecutor::_lookup_row_data() { BetaRowsetSharedPtr rowset = std::static_pointer_cast(_tablet->get_rowset(row_loc.rowset_id)); SegmentCacheHandle segment_cache; - RETURN_IF_ERROR(SegmentLoader::instance()->load_segments(rowset, &segment_cache, true)); + { + SCOPED_TIMER(&_profile_metrics.load_segment_data_stage_ns); + RETURN_IF_ERROR( + SegmentLoader::instance()->load_segments(rowset, &segment_cache, true)); + } // find segment auto it = std::find_if(segment_cache.get_segments().cbegin(), segment_cache.get_segments().cend(), diff --git a/be/src/service/point_query_executor.h b/be/src/service/point_query_executor.h index b22dc5bfd1d73f..89f4ecff9b137a 100644 --- a/be/src/service/point_query_executor.h +++ b/be/src/service/point_query_executor.h @@ -276,12 +276,16 @@ struct Metrics { init_key_ns(TUnit::TIME_NS), lookup_key_ns(TUnit::TIME_NS), lookup_data_ns(TUnit::TIME_NS), - output_data_ns(TUnit::TIME_NS) {} + output_data_ns(TUnit::TIME_NS), + load_segment_key_stage_ns(TUnit::TIME_NS), + load_segment_data_stage_ns(TUnit::TIME_NS) {} RuntimeProfile::Counter init_ns; RuntimeProfile::Counter init_key_ns; RuntimeProfile::Counter lookup_key_ns; RuntimeProfile::Counter lookup_data_ns; RuntimeProfile::Counter output_data_ns; + RuntimeProfile::Counter load_segment_key_stage_ns; + RuntimeProfile::Counter load_segment_data_stage_ns; OlapReaderStatistics read_stats; size_t row_cache_hits = 0; bool hit_lookup_cache = false; @@ -297,7 +301,9 @@ class PointQueryExecutor { Status lookup_up(); - std::string print_profile(); + void print_profile(); + + const OlapReaderStatistics& read_stats() const { return _read_stats; } private: Status _init_keys(const PTabletKeyLookupRequest* request); diff --git a/be/src/udf/udf.h b/be/src/udf/udf.h index 39af2ad1c25c13..d717c18ccec64a 100644 --- a/be/src/udf/udf.h +++ b/be/src/udf/udf.h @@ -26,6 +26,7 @@ #include #include "runtime/types.h" +#include "util/runtime_profile.h" #include "vec/common/arena.h" namespace doris { @@ -88,6 +89,12 @@ class FunctionContext { _jsonb_string_as_string = jsonb_string_as_string; } + void set_udf_execute_timer(RuntimeProfile::Counter* udf_execute_timer) 
{ + _udf_execute_timer = udf_execute_timer; + } + + RuntimeProfile::Counter* get_udf_execute_timer() { return _udf_execute_timer; } + // Cast flag, when enable string_as_jsonb_string, string casting to jsonb will not parse string // instead just insert a string literal bool string_as_jsonb_string() const { return _string_as_jsonb_string; } @@ -176,6 +183,8 @@ class FunctionContext { std::vector> _constant_cols; + //udf execute timer + RuntimeProfile::Counter* _udf_execute_timer = nullptr; bool _check_overflow_for_decimal = false; bool _string_as_jsonb_string = false; diff --git a/be/src/util/arrow/row_batch.cpp b/be/src/util/arrow/row_batch.cpp index 728057667e8505..0cbb6bcd0c8916 100644 --- a/be/src/util/arrow/row_batch.cpp +++ b/be/src/util/arrow/row_batch.cpp @@ -46,7 +46,8 @@ namespace doris { using strings::Substitute; -Status convert_to_arrow_type(const TypeDescriptor& type, std::shared_ptr* result) { +Status convert_to_arrow_type(const TypeDescriptor& type, std::shared_ptr* result, + const std::string& timezone) { switch (type.type) { case TYPE_NULL: *result = arrow::null(); @@ -96,11 +97,11 @@ Status convert_to_arrow_type(const TypeDescriptor& type, std::shared_ptr 3) { - *result = std::make_shared(arrow::TimeUnit::MICRO); + *result = std::make_shared(arrow::TimeUnit::MICRO, timezone); } else if (type.scale > 0) { - *result = std::make_shared(arrow::TimeUnit::MILLI); + *result = std::make_shared(arrow::TimeUnit::MILLI, timezone); } else { - *result = std::make_shared(arrow::TimeUnit::SECOND); + *result = std::make_shared(arrow::TimeUnit::SECOND, timezone); } break; case TYPE_DECIMALV2: @@ -120,7 +121,7 @@ Status convert_to_arrow_type(const TypeDescriptor& type, std::shared_ptr item_type; - static_cast(convert_to_arrow_type(type.children[0], &item_type)); + static_cast(convert_to_arrow_type(type.children[0], &item_type, timezone)); *result = std::make_shared(item_type); break; } @@ -128,8 +129,8 @@ Status convert_to_arrow_type(const TypeDescriptor& type, std::shared_ptr key_type; std::shared_ptr val_type; - static_cast(convert_to_arrow_type(type.children[0], &key_type)); - static_cast(convert_to_arrow_type(type.children[1], &val_type)); + static_cast(convert_to_arrow_type(type.children[0], &key_type, timezone)); + static_cast(convert_to_arrow_type(type.children[1], &val_type, timezone)); *result = std::make_shared(key_type, val_type); break; } @@ -138,7 +139,7 @@ Status convert_to_arrow_type(const TypeDescriptor& type, std::shared_ptr> fields; for (size_t i = 0; i < type.children.size(); i++) { std::shared_ptr field_type; - static_cast(convert_to_arrow_type(type.children[i], &field_type)); + static_cast(convert_to_arrow_type(type.children[i], &field_type, timezone)); fields.push_back(std::make_shared(type.field_names[i], field_type, type.contains_nulls[i])); } @@ -156,20 +157,13 @@ Status convert_to_arrow_type(const TypeDescriptor& type, std::shared_ptr* field) { - std::shared_ptr type; - RETURN_IF_ERROR(convert_to_arrow_type(desc->type(), &type)); - *field = arrow::field(desc->col_name(), type, desc->is_nullable()); - return Status::OK(); -} - -Status convert_block_arrow_schema(const vectorized::Block& block, - std::shared_ptr* result) { +Status get_arrow_schema(const vectorized::Block& block, std::shared_ptr* result, + const std::string& timezone) { std::vector> fields; for (const auto& type_and_name : block) { std::shared_ptr arrow_type; RETURN_IF_ERROR(convert_to_arrow_type(type_and_name.type->get_type_as_type_descriptor(), - &arrow_type)); + &arrow_type, timezone)); 
fields.push_back(std::make_shared(type_and_name.name, arrow_type, type_and_name.type->is_nullable())); } @@ -177,27 +171,14 @@ Status convert_block_arrow_schema(const vectorized::Block& block, return Status::OK(); } -Status convert_to_arrow_schema(const RowDescriptor& row_desc, - std::shared_ptr* result) { - std::vector> fields; - for (auto tuple_desc : row_desc.tuple_descriptors()) { - for (auto desc : tuple_desc->slots()) { - std::shared_ptr field; - RETURN_IF_ERROR(convert_to_arrow_field(desc, &field)); - fields.push_back(field); - } - } - *result = arrow::schema(std::move(fields)); - return Status::OK(); -} - Status convert_expr_ctxs_arrow_schema(const vectorized::VExprContextSPtrs& output_vexpr_ctxs, - std::shared_ptr* result) { + std::shared_ptr* result, + const std::string& timezone) { std::vector> fields; for (int i = 0; i < output_vexpr_ctxs.size(); i++) { std::shared_ptr arrow_type; auto root_expr = output_vexpr_ctxs.at(i)->root(); - RETURN_IF_ERROR(convert_to_arrow_type(root_expr->type(), &arrow_type)); + RETURN_IF_ERROR(convert_to_arrow_type(root_expr->type(), &arrow_type, timezone)); auto field_name = root_expr->is_slot_ref() && !root_expr->expr_label().empty() ? root_expr->expr_label() : fmt::format("{}_{}", root_expr->data_type()->get_name(), i); diff --git a/be/src/util/arrow/row_batch.h b/be/src/util/arrow/row_batch.h index ddffc3324d3451..3993003baf6e95 100644 --- a/be/src/util/arrow/row_batch.h +++ b/be/src/util/arrow/row_batch.h @@ -41,17 +41,16 @@ namespace doris { class RowDescriptor; -Status convert_to_arrow_type(const TypeDescriptor& type, std::shared_ptr* result); +Status convert_to_arrow_type(const TypeDescriptor& type, std::shared_ptr* result, + const std::string& timezone); // Convert Doris RowDescriptor to Arrow Schema. 
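// What threading the timezone through convert_to_arrow_type() achieves, using
// the public Arrow C++ factory (arrow::timestamp accepts a timezone string):
// a DATETIME column now maps to a zone-aware timestamp rather than a
// zone-less one. A sketch only; the scale cutoffs mirror the hunk above.
#include <memory>
#include <string>

#include <arrow/type.h>

std::shared_ptr<arrow::DataType> datetime_to_arrow_type(int scale, const std::string& timezone) {
    if (scale > 3) {
        return arrow::timestamp(arrow::TimeUnit::MICRO, timezone);
    } else if (scale > 0) {
        return arrow::timestamp(arrow::TimeUnit::MILLI, timezone);
    }
    return arrow::timestamp(arrow::TimeUnit::SECOND, timezone);
}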
-Status convert_to_arrow_schema(const RowDescriptor& row_desc, - std::shared_ptr* result); - -Status convert_block_arrow_schema(const vectorized::Block& block, - std::shared_ptr* result); +Status get_arrow_schema(const vectorized::Block& block, std::shared_ptr* result, + const std::string& timezone); Status convert_expr_ctxs_arrow_schema(const vectorized::VExprContextSPtrs& output_vexpr_ctxs, - std::shared_ptr* result); + std::shared_ptr* result, + const std::string& timezone); Status serialize_record_batch(const arrow::RecordBatch& record_batch, std::string* result); diff --git a/be/src/util/bit_util.h b/be/src/util/bit_util.h index 44b391f44dae34..504b0b27428190 100644 --- a/be/src/util/bit_util.h +++ b/be/src/util/bit_util.h @@ -20,6 +20,9 @@ #pragma once +#include + +#include "vec/core/wide_integer.h" #ifndef __APPLE__ #include #endif @@ -209,7 +212,11 @@ class BitUtil { template static T big_endian_to_host(T value) { - if constexpr (std::is_same_v) { + if constexpr (std::is_same_v) { + return BigEndian::ToHost256(value); + } else if constexpr (std::is_same_v) { + return BigEndian::ToHost256(value); + } else if constexpr (std::is_same_v) { return BigEndian::ToHost128(value); } else if constexpr (std::is_same_v) { return BigEndian::ToHost128(value); diff --git a/be/src/util/block_compression.cpp b/be/src/util/block_compression.cpp index ae672068119a53..d1788b0948a6f2 100644 --- a/be/src/util/block_compression.cpp +++ b/be/src/util/block_compression.cpp @@ -28,24 +28,24 @@ defined(__i386) || defined(_M_IX86) #include #endif +#include #include #include -#include #include #include #include #include #include -#include #include #include #include #include #include +#include #include #include -#include +#include #include #include "common/config.h" @@ -53,9 +53,7 @@ #include "exec/decompressor.h" #include "gutil/endian.h" #include "gutil/strings/substitute.h" -#include "orc/OrcFile.hh" #include "runtime/thread_context.h" -#include "util/bit_util.h" #include "util/defer_op.h" #include "util/faststring.h" @@ -74,8 +72,6 @@ uint64_t lzoDecompress(const char* inputAddress, const char* inputLimit, char* o namespace doris { -using strings::Substitute; - // exception safe Status BlockCompressionCodec::compress(const std::vector& inputs, size_t uncompressed_size, faststring* output) { @@ -1492,6 +1488,31 @@ class LzoBlockCompression final : public BlockCompressionCodec { } }; +class BrotliBlockCompression final : public BlockCompressionCodec { +public: + static BrotliBlockCompression* instance() { + static BrotliBlockCompression s_instance; + return &s_instance; + } + + Status compress(const Slice& input, faststring* output) override { + return Status::InvalidArgument("Brotli compression is not implemented."); + } + + size_t max_compressed_len(size_t len) override { return 0; }; + + Status decompress(const Slice& input, Slice* output) override { + // The size of the output buffer is always equal to the uncompressed length.
+ BrotliDecoderResult result = BrotliDecoderDecompress( + input.get_size(), reinterpret_cast(input.get_data()), &output->size, + reinterpret_cast(output->data)); + if (result != BROTLI_DECODER_RESULT_SUCCESS) { + return Status::InternalError("Brotli decompression failed, result={}", result); + } + return Status::OK(); + } +}; + Status get_block_compression_codec(segment_v2::CompressionTypePB type, BlockCompressionCodec** codec) { switch (type) { @@ -1582,6 +1603,9 @@ Status get_block_compression_codec(tparquet::CompressionCodec::type parquet_code case tparquet::CompressionCodec::LZO: *codec = LzoBlockCompression::instance(); break; + case tparquet::CompressionCodec::BROTLI: + *codec = BrotliBlockCompression::instance(); + break; default: return Status::InternalError("unknown compression type({})", parquet_codec); } diff --git a/be/src/util/byte_stream_split.cpp b/be/src/util/byte_stream_split.cpp new file mode 100644 index 00000000000000..0e0fc9257e113a --- /dev/null +++ b/be/src/util/byte_stream_split.cpp @@ -0,0 +1,119 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "byte_stream_split.h" + +#include + +#include +#include +#include + +#include "gutil/port.h" + +namespace doris { + +inline void do_merge_streams(const uint8_t** src_streams, int width, int64_t nvalues, + uint8_t* dest) { + // Value empirically chosen to provide the best performance on the author's machine + constexpr int kBlockSize = 128; + + while (nvalues >= kBlockSize) { + for (int stream = 0; stream < width; ++stream) { + // Take kBlockSize bytes from the given stream and spread them + // to their logical places in destination. 
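// Worked example of the layout being undone here, for width = 4 (e.g. float):
// two encoded values v0 = {a0,a1,a2,a3} and v1 = {b0,b1,b2,b3} arrive as four
// streams of nvalues bytes each,
//     stream 0: a0 b0    stream 1: a1 b1
//     stream 2: a2 b2    stream 3: a3 b3
// and decoding interleaves them back into a0 a1 a2 a3 b0 b1 b2 b3, i.e.
//     dest[stream + i * width] = src_streams[stream][i].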
+ const uint8_t* src = src_streams[stream]; + for (int i = 0; i < kBlockSize; i += 8) { + uint64_t v; + std::memcpy(&v, src + i, sizeof(v)); +#ifdef IS_LITTLE_ENDIAN + dest[stream + i * width] = static_cast(v); + dest[stream + (i + 1) * width] = static_cast(v >> 8); + dest[stream + (i + 2) * width] = static_cast(v >> 16); + dest[stream + (i + 3) * width] = static_cast(v >> 24); + dest[stream + (i + 4) * width] = static_cast(v >> 32); + dest[stream + (i + 5) * width] = static_cast(v >> 40); + dest[stream + (i + 6) * width] = static_cast(v >> 48); + dest[stream + (i + 7) * width] = static_cast(v >> 56); +#elif defined IS_BIG_ENDIAN + dest[stream + i * width] = static_cast(v >> 56); + dest[stream + (i + 1) * width] = static_cast(v >> 48); + dest[stream + (i + 2) * width] = static_cast(v >> 40); + dest[stream + (i + 3) * width] = static_cast(v >> 32); + dest[stream + (i + 4) * width] = static_cast(v >> 24); + dest[stream + (i + 5) * width] = static_cast(v >> 16); + dest[stream + (i + 6) * width] = static_cast(v >> 8); + dest[stream + (i + 7) * width] = static_cast(v); +#endif + } + src_streams[stream] += kBlockSize; + } + dest += width * kBlockSize; + nvalues -= kBlockSize; + } + + // Epilog + for (int stream = 0; stream < width; ++stream) { + const uint8_t* src = src_streams[stream]; + for (int64_t i = 0; i < nvalues; ++i) { + dest[stream + i * width] = src[i]; + } + } +} + +template +void byte_stream_split_decode_scalar(const uint8_t* src, int width, int64_t offset, + int64_t num_values, int64_t stride, uint8_t* dest) { + DCHECK(width == kNumStreams); + std::array src_streams; + for (int stream = 0; stream < kNumStreams; ++stream) { + src_streams[stream] = &src[stream * stride + offset]; + } + do_merge_streams(src_streams.data(), kNumStreams, num_values, dest); +} + +inline void byte_stream_split_decode_scalar_dynamic(const uint8_t* src, int width, int64_t offset, + int64_t num_values, int64_t stride, + uint8_t* dest) { + std::vector src_streams; + src_streams.resize(width); + for (int stream = 0; stream < width; ++stream) { + src_streams[stream] = &src[stream * stride + offset]; + } + do_merge_streams(src_streams.data(), width, num_values, dest); +} + +// TODO: optimize using simd: https://github.com/apache/arrow/pull/38529 +void byte_stream_split_decode(const uint8_t* src, int width, int64_t offset, int64_t num_values, + int64_t stride, uint8_t* dest) { + switch (width) { + case 1: + memcpy(dest, src + offset * width, num_values); + return; + case 2: + return byte_stream_split_decode_scalar<2>(src, width, offset, num_values, stride, dest); + case 4: + return byte_stream_split_decode_scalar<4>(src, width, offset, num_values, stride, dest); + case 8: + return byte_stream_split_decode_scalar<8>(src, width, offset, num_values, stride, dest); + case 16: + return byte_stream_split_decode_scalar<16>(src, width, offset, num_values, stride, dest); + } + return byte_stream_split_decode_scalar_dynamic(src, width, offset, num_values, stride, dest); +} + +} // namespace doris diff --git a/be/src/util/byte_stream_split.h b/be/src/util/byte_stream_split.h new file mode 100644 index 00000000000000..4b016e2e692e61 --- /dev/null +++ b/be/src/util/byte_stream_split.h @@ -0,0 +1,37 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include + +namespace doris { + +/** + * @brief Decode byte-stream-split encoded data back into interleaved values. + * + * @param src The data encoded in byte-stream-split format. + * @param width The byte width of the type. + * @param offset The offset into the encoded data. + * @param num_values The number of values to decode. + * @param stride The length of each stream. + * @param dest The buffer to store the decoded data. + */ +void byte_stream_split_decode(const uint8_t* src, int width, int64_t offset, int64_t num_values, + int64_t stride, uint8_t* dest); + +} // namespace doris diff --git a/be/src/util/doris_metrics.cpp b/be/src/util/doris_metrics.cpp index 3fe6b92c923f92..e9d4f31e5ca137 100644 --- a/be/src/util/doris_metrics.cpp +++ b/be/src/util/doris_metrics.cpp @@ -91,13 +91,13 @@ DEFINE_COUNTER_METRIC_PROTOTYPE_5ARG(base_compaction_deltas_total, MetricUnit::R DEFINE_COUNTER_METRIC_PROTOTYPE_5ARG(cumulative_compaction_deltas_total, MetricUnit::ROWSETS, "", compaction_deltas_total, Labels({{"type", "cumulative"}})); DEFINE_COUNTER_METRIC_PROTOTYPE_5ARG(full_compaction_deltas_total, MetricUnit::ROWSETS, "", - compaction_deltas_total, Labels({{"type", "base"}})); + compaction_deltas_total, Labels({{"type", "full"}})); DEFINE_COUNTER_METRIC_PROTOTYPE_5ARG(base_compaction_bytes_total, MetricUnit::BYTES, "", compaction_bytes_total, Labels({{"type", "base"}})); DEFINE_COUNTER_METRIC_PROTOTYPE_5ARG(cumulative_compaction_bytes_total, MetricUnit::BYTES, "", compaction_bytes_total, Labels({{"type", "cumulative"}})); DEFINE_COUNTER_METRIC_PROTOTYPE_5ARG(full_compaction_bytes_total, MetricUnit::BYTES, "", - compaction_bytes_total, Labels({{"type", "base"}})); + compaction_bytes_total, Labels({{"type", "full"}})); DEFINE_COUNTER_METRIC_PROTOTYPE_5ARG(segment_read_total, MetricUnit::OPERATIONS, "(segment_v2) total number of segments read", segment_read, diff --git a/be/src/util/hash_util.hpp b/be/src/util/hash_util.hpp index dc70b1c9f9c40b..e9ac72c5ccdcb4 100644 --- a/be/src/util/hash_util.hpp +++ b/be/src/util/hash_util.hpp @@ -134,7 +134,7 @@ class HashUtil { static const uint32_t MURMUR3_32_SEED = 104729; // modify from https://github.com/aappleby/smhasher/blob/master/src/MurmurHash3.cpp - static uint32_t murmur_hash3_32(const void* key, int32_t len, uint32_t seed) { + static uint32_t murmur_hash3_32(const void* key, int64_t len, uint32_t seed) { uint32_t out = 0; murmur_hash3_x86_32(key, len, seed, &out); return out; @@ -227,7 +227,7 @@ class HashUtil { // Our hash function is MurmurHash2, 64 bit version. // It was modified in order to provide the same result in // big and little endian archs (endian neutral).
@@ -227,7 +227,7 @@ class HashUtil {
     // Our hash function is MurmurHash2, 64 bit version.
     // It was modified in order to provide the same result in
     // big and little endian archs (endian neutral).
-    static uint64_t murmur_hash64A(const void* key, int32_t len, unsigned int seed) {
+    static uint64_t murmur_hash64A(const void* key, int64_t len, unsigned int seed) {
         const uint64_t m = MURMUR_PRIME;
         const int r = 47;
         uint64_t h = seed ^ (len * m);
diff --git a/be/src/util/jni-util.cpp b/be/src/util/jni-util.cpp
index 02d20ed9a4fe80..6ad0790ef0859e 100644
--- a/be/src/util/jni-util.cpp
+++ b/be/src/util/jni-util.cpp
@@ -317,6 +317,7 @@ Status JniUtil::GetJniExceptionMsg(JNIEnv* env, bool log_stack, const string& pr
 }
 
 jobject JniUtil::convert_to_java_map(JNIEnv* env, const std::map<std::string, std::string>& map) {
+    //TODO: ADD EXCEPTION CHECK.
     jclass hashmap_class = env->FindClass("java/util/HashMap");
     jmethodID hashmap_constructor = env->GetMethodID(hashmap_class, "<init>", "(I)V");
     jobject hashmap_object = env->NewObject(hashmap_class, hashmap_constructor, map.size());
@@ -399,16 +400,26 @@ std::map<std::string, std::string> JniUtil::convert_to_cpp_map(JNIEnv* env, jobj
 Status JniUtil::GetGlobalClassRef(JNIEnv* env, const char* class_str, jclass* class_ref) {
     *class_ref = NULL;
-    jclass local_cl = env->FindClass(class_str);
-    RETURN_ERROR_IF_EXC(env);
+    JNI_CALL_METHOD_CHECK_EXCEPTION_DELETE_REF(jclass, local_cl, env, FindClass(class_str));
     RETURN_IF_ERROR(LocalToGlobalRef(env, local_cl, reinterpret_cast<jobject*>(class_ref)));
-    env->DeleteLocalRef(local_cl);
-    RETURN_ERROR_IF_EXC(env);
     return Status::OK();
 }
 
 Status JniUtil::LocalToGlobalRef(JNIEnv* env, jobject local_ref, jobject* global_ref) {
     *global_ref = env->NewGlobalRef(local_ref);
+    // NewGlobalRef returns a global reference to the given obj.
+    //
+    // It may return NULL if:
+    //   - obj refers to null
+    //   - the system has run out of memory
+    //   - obj was a weak global reference and has already been garbage collected
+    if (*global_ref == NULL) {
+        return Status::InternalError(
+                "LocalToGlobalRef failed: the global ref is NULL; the system may have run out of "
+                "memory.");
+    }
+
+    // NewGlobalRef does not throw exceptions, so checking for NULL should be sufficient.
     RETURN_ERROR_IF_EXC(env);
     return Status::OK();
 }
diff --git a/be/src/util/jni-util.h b/be/src/util/jni-util.h
index 666a5e526dfbda..df332951afebb8 100644
--- a/be/src/util/jni-util.h
+++ b/be/src/util/jni-util.h
@@ -28,6 +28,7 @@
 
 #include "common/status.h"
 #include "jni_md.h"
+#include "util/defer_op.h"
 #include "util/thrift_util.h"
 
 #ifdef USE_HADOOP_HDFS
@@ -38,12 +39,25 @@ extern "C" JNIEnv* getJNIEnv(void);
 namespace doris {
 class JniUtil;
 
-#define RETURN_ERROR_IF_EXC(env) \
-    do { \
-        jthrowable exc = (env)->ExceptionOccurred(); \
-        if (exc != nullptr) return JniUtil::GetJniExceptionMsg(env); \
+#define RETURN_ERROR_IF_EXC(env)                     \
+    do {                                             \
+        if (env->ExceptionCheck()) [[unlikely]]      \
+            return JniUtil::GetJniExceptionMsg(env); \
     } while (false)
 
+#define JNI_CALL_METHOD_CHECK_EXCEPTION_DELETE_REF(type, result, env, func) \
+    type result = env->func;                                                \
+    DEFER(env->DeleteLocalRef(result));                                     \
+    RETURN_ERROR_IF_EXC(env)
+
+#define JNI_CALL_METHOD_CHECK_EXCEPTION(type, result, env, func) \
+    type result = env->func;                                     \
+    RETURN_ERROR_IF_EXC(env)
+
+// To reduce the risk of unhandled JNI exceptions, refer to
+// https://docs.oracle.com/javase/8/docs/technotes/guides/jni/spec/functions.html
+// to confirm whether a given JNI method can throw an exception.
+
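A sketch of how the two macros compose inside a Status-returning function. `get_list_size` is hypothetical, and the snippet assumes a valid `JNIEnv*` plus the Doris `Status`/`DEFER` machinery referenced above:

```cpp
Status get_list_size(JNIEnv* env, jobject list, jint* out) {
    // Declares `list_class`, checks for a pending exception, and schedules
    // DeleteLocalRef at scope exit via DEFER.
    JNI_CALL_METHOD_CHECK_EXCEPTION_DELETE_REF(jclass, list_class, env, GetObjectClass(list));
    // Method IDs are not references, so no DeleteLocalRef is needed here.
    JNI_CALL_METHOD_CHECK_EXCEPTION(jmethodID, size_method, env,
                                    GetMethodID(list_class, "size", "()I"));
    JNI_CALL_METHOD_CHECK_EXCEPTION(jint, size, env, CallIntMethod(list, size_method));
    *out = size;
    return Status::OK();
}
```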
 class JniUtil {
 public:
     static Status Init() WARN_UNUSED_RESULT;
@@ -65,6 +79,10 @@ class JniUtil {
         return Status::OK();
     }
 
+    // A jclass returned by FindClass is generally a local reference.
+    // Method IDs and field IDs, by contrast, remain valid as long as the class is loaded.
+    // To use a jclass across multiple threads or across multiple calls into the JNI code,
+    // create a global reference to it with GetGlobalClassRef().
     static Status GetGlobalClassRef(JNIEnv* env, const char* class_str, jclass* class_ref)
             WARN_UNUSED_RESULT;
diff --git a/be/src/util/jsonb_document.h b/be/src/util/jsonb_document.h
index 2a9cf8a8191caa..016da3142cd24c 100644
--- a/be/src/util/jsonb_document.h
+++ b/be/src/util/jsonb_document.h
@@ -177,7 +177,7 @@ class JsonbDocument {
     static JsonbDocument* makeDocument(char* pb, uint32_t size, const JsonbValue* rval);
 
     // create an JsonbDocument object from JSONB packed bytes
-    static JsonbDocument* createDocument(const char* pb, uint32_t size);
+    static JsonbDocument* createDocument(const char* pb, size_t size);
 
     // create an JsonbValue from JSONB packed bytes
     static JsonbValue* createValue(const char* pb, uint32_t size);
@@ -1138,7 +1138,7 @@ inline JsonbDocument* JsonbDocument::makeDocument(char* pb, uint32_t size, const
     return doc;
 }
 
-inline JsonbDocument* JsonbDocument::createDocument(const char* pb, uint32_t size) {
+inline JsonbDocument* JsonbDocument::createDocument(const char* pb, size_t size) {
     if (!pb || size < sizeof(JsonbHeader) + sizeof(JsonbValue)) {
         return nullptr;
     }
diff --git a/be/src/util/jvm_metrics.cpp b/be/src/util/jvm_metrics.cpp
index fc30d1073acdc6..4cb71f5e827878 100644
--- a/be/src/util/jvm_metrics.cpp
+++ b/be/src/util/jvm_metrics.cpp
@@ -22,7 +22,9 @@
 #include
 
 #include "common/config.h"
+#include "util/defer_op.h"
 #include "util/metrics.h"
+
 namespace doris {
 
 #define DEFINE_JVM_SIZE_BYTES_METRIC(name, type) \
@@ -90,9 +92,13 @@ JvmMetrics::JvmMetrics(MetricRegistry* registry, JNIEnv* env) {
             break;
         }
         try {
-            _jvm_stats.init(env);
+            Status st = _jvm_stats.init(env);
+            if (!st) {
+                LOG(WARNING) << "JVM stats init failed: " << st.to_string();
+                break;
+            }
         } catch (...) {
-            LOG(WARNING) << "JVM STATS INIT FAIL";
+            LOG(WARNING) << "JVM stats init threw an exception.";
             break;
         }
         if (!_jvm_stats.init_complete()) {
@@ -133,21 +139,22 @@ JvmMetrics::JvmMetrics(MetricRegistry* registry, JNIEnv* env) {
 
 void JvmMetrics::update() {
     static long fail_count = 0;
-    bool have_exception = false;
     try {
-        _jvm_stats.refresh(this);
+        Status st = _jvm_stats.refresh(this);
+        if (!st) {
+            fail_count++;
+            LOG(WARNING) << "JVM stats update failed: " << st.to_string();
+        } else {
+            fail_count = 0;
+        }
     } catch (...) {
-        have_exception = true;
-        LOG(WARNING) << "JVM MONITOR UPDATE FAIL!";
+        LOG(WARNING) << "JVM stats update threw an exception.";
         fail_count++;
     }
 
    // When 30 consecutive failures occur, turn off JVM information collection.
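The cutoff described by the comment above, distilled into a standalone sketch (the threshold and names are illustrative):

```cpp
#include <iostream>

// Consecutive-failure breaker: any success resets the streak; once the streak
// reaches the threshold, the caller should disable the probe, as update() does
// by deregistering its hook after 30 consecutive failures.
class FailureCutoff {
public:
    explicit FailureCutoff(long threshold) : _threshold(threshold) {}
    bool record(bool ok) {
        _fail_count = ok ? 0 : _fail_count + 1;
        return _fail_count < _threshold; // false => time to shut the collector off
    }

private:
    long _threshold;
    long _fail_count = 0;
};

int main() {
    FailureCutoff cutoff(3);
    std::cout << cutoff.record(false) << cutoff.record(false) << cutoff.record(true)
              << cutoff.record(false) << cutoff.record(false) << cutoff.record(false) << "\n";
    // prints 111110: the breaker only trips after three failures in a row
}
```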
- if (!have_exception) { - fail_count = 0; - } if (fail_count >= 30) { - LOG(WARNING) << "JVM MONITOR CLOSE!"; + LOG(WARNING) << "Jvm Stats CLOSE!"; _jvm_stats.set_complete(false); _server_entity->deregister_hook(_s_hook_name); @@ -182,193 +189,257 @@ void JvmMetrics::update() { } } -void JvmStats::init(JNIEnv* ENV) { - env = ENV; - _managementFactoryClass = env->FindClass("java/lang/management/ManagementFactory"); - if (_managementFactoryClass == nullptr) { - LOG(WARNING) - << "Class java/lang/management/ManagementFactory Not Find.JVM monitoring fails."; - return; - } +Status JvmStats::init(JNIEnv* env) { + RETURN_IF_ERROR(JniUtil::GetGlobalClassRef(env, "java/lang/management/ManagementFactory", + &_managementFactoryClass)); - _getMemoryMXBeanMethod = env->GetStaticMethodID(_managementFactoryClass, "getMemoryMXBean", - "()Ljava/lang/management/MemoryMXBean;"); + JNI_CALL_METHOD_CHECK_EXCEPTION(, _getMemoryMXBeanMethod, env, + GetStaticMethodID(_managementFactoryClass, "getMemoryMXBean", + "()Ljava/lang/management/MemoryMXBean;")); - _memoryUsageClass = env->FindClass("java/lang/management/MemoryUsage"); - if (_memoryUsageClass == nullptr) { - LOG(WARNING) << "Class java/lang/management/MemoryUsage Not Find.JVM monitoring fails."; - return; - } - _getMemoryUsageUsedMethod = env->GetMethodID(_memoryUsageClass, "getUsed", "()J"); - _getMemoryUsageCommittedMethod = env->GetMethodID(_memoryUsageClass, "getCommitted", "()J"); - _getMemoryUsageMaxMethod = env->GetMethodID(_memoryUsageClass, "getMax", "()J"); + RETURN_IF_ERROR(JniUtil::GetGlobalClassRef(env, "java/lang/management/MemoryUsage", + &_memoryUsageClass)); - _memoryMXBeanClass = env->FindClass("java/lang/management/MemoryMXBean"); - if (_memoryMXBeanClass == nullptr) { - LOG(WARNING) << "Class java/lang/management/MemoryMXBean Not Find.JVM monitoring fails."; - return; - } - _getHeapMemoryUsageMethod = env->GetMethodID(_memoryMXBeanClass, "getHeapMemoryUsage", - "()Ljava/lang/management/MemoryUsage;"); - _getNonHeapMemoryUsageMethod = env->GetMethodID(_memoryMXBeanClass, "getNonHeapMemoryUsage", - "()Ljava/lang/management/MemoryUsage;"); + JNI_CALL_METHOD_CHECK_EXCEPTION(, _getMemoryUsageUsedMethod, env, + GetMethodID(_memoryUsageClass, "getUsed", "()J")); - _getMemoryPoolMXBeansMethod = env->GetStaticMethodID( - _managementFactoryClass, "getMemoryPoolMXBeans", "()Ljava/util/List;"); + JNI_CALL_METHOD_CHECK_EXCEPTION(, _getMemoryUsageCommittedMethod, env, + GetMethodID(_memoryUsageClass, "getCommitted", "()J")); - _listClass = env->FindClass("java/util/List"); - if (_listClass == nullptr) { - LOG(WARNING) << "Class java/util/List Not Find.JVM monitoring fails."; - return; - } - _getListSizeMethod = env->GetMethodID(_listClass, "size", "()I"); - _getListUseIndexMethod = env->GetMethodID(_listClass, "get", "(I)Ljava/lang/Object;"); + JNI_CALL_METHOD_CHECK_EXCEPTION(, _getMemoryUsageMaxMethod, env, + GetMethodID(_memoryUsageClass, "getMax", "()J")); - _memoryPoolMXBeanClass = env->FindClass("java/lang/management/MemoryPoolMXBean"); - if (_memoryPoolMXBeanClass == nullptr) { - LOG(WARNING) - << "Class java/lang/management/MemoryPoolMXBean Not Find.JVM monitoring fails."; - return; - } - _getMemoryPoolMXBeanUsageMethod = env->GetMethodID(_memoryPoolMXBeanClass, "getUsage", - "()Ljava/lang/management/MemoryUsage;"); - _getMemoryPollMXBeanPeakMethod = env->GetMethodID(_memoryPoolMXBeanClass, "getPeakUsage", - "()Ljava/lang/management/MemoryUsage;"); - _getMemoryPollMXBeanNameMethod = - env->GetMethodID(_memoryPoolMXBeanClass, "getName", 
"()Ljava/lang/String;"); - - _getThreadMXBeanMethod = env->GetStaticMethodID(_managementFactoryClass, "getThreadMXBean", - "()Ljava/lang/management/ThreadMXBean;"); - - _getGarbageCollectorMXBeansMethod = env->GetStaticMethodID( - _managementFactoryClass, "getGarbageCollectorMXBeans", "()Ljava/util/List;"); - - _garbageCollectorMXBeanClass = env->FindClass("java/lang/management/GarbageCollectorMXBean"); - if (_garbageCollectorMXBeanClass == nullptr) { - LOG(WARNING) << "Class java/lang/management/GarbageCollectorMXBean Not Find.JVM monitoring " - "fails."; - return; - } - _getGCNameMethod = - env->GetMethodID(_garbageCollectorMXBeanClass, "getName", "()Ljava/lang/String;"); - _getGCCollectionCountMethod = - env->GetMethodID(_garbageCollectorMXBeanClass, "getCollectionCount", "()J"); - _getGCCollectionTimeMethod = - env->GetMethodID(_garbageCollectorMXBeanClass, "getCollectionTime", "()J"); - - _threadMXBeanClass = env->FindClass("java/lang/management/ThreadMXBean"); - if (_threadMXBeanClass == nullptr) { - LOG(WARNING) << "Class java/lang/management/ThreadMXBean Not Find.JVM monitoring fails."; - return; - } - _getAllThreadIdsMethod = env->GetMethodID(_threadMXBeanClass, "getAllThreadIds", "()[J"); - _getThreadInfoMethod = env->GetMethodID(_threadMXBeanClass, "getThreadInfo", - "([JI)[Ljava/lang/management/ThreadInfo;"); - _getPeakThreadCountMethod = env->GetMethodID(_threadMXBeanClass, "getPeakThreadCount", "()I"); - - _threadInfoClass = env->FindClass("java/lang/management/ThreadInfo"); - if (_threadInfoClass == nullptr) { - LOG(WARNING) << "Class java/lang/management/ThreadInfo Not Find.JVM monitoring fails."; - return; - } + RETURN_IF_ERROR(JniUtil::GetGlobalClassRef(env, "java/lang/management/MemoryMXBean", + &_memoryMXBeanClass)); - _getThreadStateMethod = - env->GetMethodID(_threadInfoClass, "getThreadState", "()Ljava/lang/Thread$State;"); + JNI_CALL_METHOD_CHECK_EXCEPTION(, _getHeapMemoryUsageMethod, env, + GetMethodID(_memoryMXBeanClass, "getHeapMemoryUsage", + "()Ljava/lang/management/MemoryUsage;")); + JNI_CALL_METHOD_CHECK_EXCEPTION(, _getNonHeapMemoryUsageMethod, env, + GetMethodID(_memoryMXBeanClass, "getNonHeapMemoryUsage", + "()Ljava/lang/management/MemoryUsage;")); - _threadStateClass = env->FindClass("java/lang/Thread$State"); - if (_threadStateClass == nullptr) { - LOG(WARNING) << "Class java/lang/Thread$State Not Find.JVM monitoring fails."; - return; - } + JNI_CALL_METHOD_CHECK_EXCEPTION( + , _getMemoryPoolMXBeansMethod, env, + GetStaticMethodID(_managementFactoryClass, "getMemoryPoolMXBeans", + "()Ljava/util/List;")); - jfieldID newThreadFieldID = - env->GetStaticFieldID(_threadStateClass, "NEW", "Ljava/lang/Thread$State;"); - jfieldID runnableThreadFieldID = - env->GetStaticFieldID(_threadStateClass, "RUNNABLE", "Ljava/lang/Thread$State;"); - jfieldID blockedThreadFieldID = - env->GetStaticFieldID(_threadStateClass, "BLOCKED", "Ljava/lang/Thread$State;"); - jfieldID waitingThreadFieldID = - env->GetStaticFieldID(_threadStateClass, "WAITING", "Ljava/lang/Thread$State;"); - jfieldID timedWaitingThreadFieldID = - env->GetStaticFieldID(_threadStateClass, "TIMED_WAITING", "Ljava/lang/Thread$State;"); - jfieldID terminatedThreadFieldID = - env->GetStaticFieldID(_threadStateClass, "TERMINATED", "Ljava/lang/Thread$State;"); - - _newThreadStateObj = env->GetStaticObjectField(_threadStateClass, newThreadFieldID); - _runnableThreadStateObj = env->GetStaticObjectField(_threadStateClass, runnableThreadFieldID); - _blockedThreadStateObj = 
env->GetStaticObjectField(_threadStateClass, blockedThreadFieldID); - _waitingThreadStateObj = env->GetStaticObjectField(_threadStateClass, waitingThreadFieldID); - _timedWaitingThreadStateObj = - env->GetStaticObjectField(_threadStateClass, timedWaitingThreadFieldID); - _terminatedThreadStateObj = - env->GetStaticObjectField(_threadStateClass, terminatedThreadFieldID); + RETURN_IF_ERROR(JniUtil::GetGlobalClassRef(env, "java/util/List", &_listClass)); - LOG(INFO) << "Start JVM monitoring."; + JNI_CALL_METHOD_CHECK_EXCEPTION(, _getListSizeMethod, env, + GetMethodID(_listClass, "size", "()I")); + + JNI_CALL_METHOD_CHECK_EXCEPTION(, _getListUseIndexMethod, env, + GetMethodID(_listClass, "get", "(I)Ljava/lang/Object;")); + + RETURN_IF_ERROR(JniUtil::GetGlobalClassRef(env, "java/lang/management/MemoryPoolMXBean", + &_memoryPoolMXBeanClass)); + + JNI_CALL_METHOD_CHECK_EXCEPTION(, _getMemoryPoolMXBeanUsageMethod, env, + GetMethodID(_memoryPoolMXBeanClass, "getUsage", + "()Ljava/lang/management/MemoryUsage;")); + + JNI_CALL_METHOD_CHECK_EXCEPTION(, _getMemoryPollMXBeanPeakMethod, env, + GetMethodID(_memoryPoolMXBeanClass, "getPeakUsage", + "()Ljava/lang/management/MemoryUsage;")); + JNI_CALL_METHOD_CHECK_EXCEPTION( + , _getMemoryPollMXBeanNameMethod, env, + GetMethodID(_memoryPoolMXBeanClass, "getName", "()Ljava/lang/String;")); + + JNI_CALL_METHOD_CHECK_EXCEPTION(, _getThreadMXBeanMethod, env, + GetStaticMethodID(_managementFactoryClass, "getThreadMXBean", + "()Ljava/lang/management/ThreadMXBean;")); + + JNI_CALL_METHOD_CHECK_EXCEPTION( + , _getGarbageCollectorMXBeansMethod, env, + GetStaticMethodID(_managementFactoryClass, "getGarbageCollectorMXBeans", + "()Ljava/util/List;")); + + RETURN_IF_ERROR(JniUtil::GetGlobalClassRef(env, "java/lang/management/GarbageCollectorMXBean", + &_garbageCollectorMXBeanClass)); + + JNI_CALL_METHOD_CHECK_EXCEPTION( + , _getGCNameMethod, env, + GetMethodID(_garbageCollectorMXBeanClass, "getName", "()Ljava/lang/String;")); + + JNI_CALL_METHOD_CHECK_EXCEPTION( + , _getGCCollectionCountMethod, env, + GetMethodID(_garbageCollectorMXBeanClass, "getCollectionCount", "()J")); + + JNI_CALL_METHOD_CHECK_EXCEPTION( + , _getGCCollectionTimeMethod, env, + GetMethodID(_garbageCollectorMXBeanClass, "getCollectionTime", "()J")); + + RETURN_IF_ERROR(JniUtil::GetGlobalClassRef(env, "java/lang/management/ThreadMXBean", + &_threadMXBeanClass)); + + JNI_CALL_METHOD_CHECK_EXCEPTION(, + + _getAllThreadIdsMethod, env, + GetMethodID(_threadMXBeanClass, "getAllThreadIds", "()[J")); + + JNI_CALL_METHOD_CHECK_EXCEPTION(, + + _getThreadInfoMethod, env, + GetMethodID(_threadMXBeanClass, "getThreadInfo", + "([JI)[Ljava/lang/management/ThreadInfo;")); + + JNI_CALL_METHOD_CHECK_EXCEPTION(, + + _getPeakThreadCountMethod, env, + GetMethodID(_threadMXBeanClass, "getPeakThreadCount", "()I")); + + RETURN_IF_ERROR( + JniUtil::GetGlobalClassRef(env, "java/lang/management/ThreadInfo", &_threadInfoClass)); + + JNI_CALL_METHOD_CHECK_EXCEPTION( + , + + _getThreadStateMethod, env, + GetMethodID(_threadInfoClass, "getThreadState", "()Ljava/lang/Thread$State;")); + + RETURN_IF_ERROR(JniUtil::GetGlobalClassRef(env, "java/lang/Thread$State", &_threadStateClass)); + + JNI_CALL_METHOD_CHECK_EXCEPTION( + jfieldID, newThreadFieldID, env, + GetStaticFieldID(_threadStateClass, "NEW", "Ljava/lang/Thread$State;")); + + JNI_CALL_METHOD_CHECK_EXCEPTION( + jfieldID, runnableThreadFieldID, env, + GetStaticFieldID(_threadStateClass, "RUNNABLE", "Ljava/lang/Thread$State;")); + + JNI_CALL_METHOD_CHECK_EXCEPTION( + jfieldID, 
blockedThreadFieldID, env, + GetStaticFieldID(_threadStateClass, "BLOCKED", "Ljava/lang/Thread$State;")); + JNI_CALL_METHOD_CHECK_EXCEPTION( + jfieldID, waitingThreadFieldID, env, + GetStaticFieldID(_threadStateClass, "WAITING", "Ljava/lang/Thread$State;")); + + JNI_CALL_METHOD_CHECK_EXCEPTION( + jfieldID, timedWaitingThreadFieldID, env, + GetStaticFieldID(_threadStateClass, "TIMED_WAITING", "Ljava/lang/Thread$State;")); + JNI_CALL_METHOD_CHECK_EXCEPTION( + jfieldID, terminatedThreadFieldID, env, + GetStaticFieldID(_threadStateClass, "TERMINATED", "Ljava/lang/Thread$State;")); + + JNI_CALL_METHOD_CHECK_EXCEPTION(jobject, newThreadStateObj, env, + GetStaticObjectField(_threadStateClass, newThreadFieldID)); + RETURN_IF_ERROR(JniUtil::LocalToGlobalRef(env, newThreadStateObj, &_newThreadStateObj)); + + JNI_CALL_METHOD_CHECK_EXCEPTION(jobject, runnableThreadStateObj, env, + GetStaticObjectField(_threadStateClass, runnableThreadFieldID)); + RETURN_IF_ERROR( + JniUtil::LocalToGlobalRef(env, runnableThreadStateObj, &_runnableThreadStateObj)); + + JNI_CALL_METHOD_CHECK_EXCEPTION(jobject, blockedThreadStateObj, env, + GetStaticObjectField(_threadStateClass, blockedThreadFieldID)); + RETURN_IF_ERROR(JniUtil::LocalToGlobalRef(env, blockedThreadStateObj, &_blockedThreadStateObj)); + + JNI_CALL_METHOD_CHECK_EXCEPTION(jobject, waitingThreadStateObj, env, + GetStaticObjectField(_threadStateClass, waitingThreadFieldID)); + RETURN_IF_ERROR(JniUtil::LocalToGlobalRef(env, waitingThreadStateObj, &_waitingThreadStateObj)); + + JNI_CALL_METHOD_CHECK_EXCEPTION( + jobject, timedWaitingThreadStateObj, env, + GetStaticObjectField(_threadStateClass, timedWaitingThreadFieldID)); + RETURN_IF_ERROR(JniUtil::LocalToGlobalRef(env, timedWaitingThreadStateObj, + &_timedWaitingThreadStateObj)); + + JNI_CALL_METHOD_CHECK_EXCEPTION( + jobject, terminatedThreadStateObj, env, + GetStaticObjectField(_threadStateClass, terminatedThreadFieldID)); + RETURN_IF_ERROR( + JniUtil::LocalToGlobalRef(env, terminatedThreadStateObj, &_terminatedThreadStateObj)); _init_complete = true; - return; + + LOG(INFO) << "Start JVM monitoring."; + return Status::OK(); } -void JvmStats::refresh(JvmMetrics* jvm_metrics) { +Status JvmStats::refresh(JvmMetrics* jvm_metrics) const { if (!_init_complete) { - return; + return Status::InternalError("Jvm Stats not init complete."); } - Status st = JniUtil::GetJNIEnv(&env); - if (!st.ok()) { - LOG(WARNING) << "JVM STATS GET JNI ENV FAIL"; - return; - } + JNIEnv* env = nullptr; + RETURN_IF_ERROR(JniUtil::GetJNIEnv(&env)); + + JNI_CALL_METHOD_CHECK_EXCEPTION_DELETE_REF( + jobject, memoryMXBeanObj, env, + CallStaticObjectMethod(_managementFactoryClass, _getMemoryMXBeanMethod)); + + JNI_CALL_METHOD_CHECK_EXCEPTION_DELETE_REF( + jobject, heapMemoryUsageObj, env, + CallObjectMethod(memoryMXBeanObj, _getHeapMemoryUsageMethod)); - jobject memoryMXBeanObj = - env->CallStaticObjectMethod(_managementFactoryClass, _getMemoryMXBeanMethod); + JNI_CALL_METHOD_CHECK_EXCEPTION(jlong, heapMemoryUsed, env, + CallLongMethod(heapMemoryUsageObj, _getMemoryUsageUsedMethod)); - jobject heapMemoryUsageObj = env->CallObjectMethod(memoryMXBeanObj, _getHeapMemoryUsageMethod); + JNI_CALL_METHOD_CHECK_EXCEPTION( + jlong, heapMemoryCommitted, env, + CallLongMethod(heapMemoryUsageObj, _getMemoryUsageCommittedMethod)); - jlong heapMemoryUsed = env->CallLongMethod(heapMemoryUsageObj, _getMemoryUsageUsedMethod); - jlong heapMemoryCommitted = - env->CallLongMethod(heapMemoryUsageObj, _getMemoryUsageCommittedMethod); - jlong heapMemoryMax = 
env->CallLongMethod(heapMemoryUsageObj, _getMemoryUsageMaxMethod); + JNI_CALL_METHOD_CHECK_EXCEPTION(jlong, heapMemoryMax, env, + CallLongMethod(heapMemoryUsageObj, _getMemoryUsageMaxMethod)); jvm_metrics->jvm_heap_size_bytes_used->set_value(heapMemoryUsed < 0 ? 0 : heapMemoryUsed); jvm_metrics->jvm_heap_size_bytes_committed->set_value( heapMemoryCommitted < 0 ? 0 : heapMemoryCommitted); jvm_metrics->jvm_heap_size_bytes_max->set_value(heapMemoryMax < 0 ? 0 : heapMemoryMax); - jobject nonHeapMemoryUsageObj = - env->CallObjectMethod(memoryMXBeanObj, _getNonHeapMemoryUsageMethod); + JNI_CALL_METHOD_CHECK_EXCEPTION_DELETE_REF( + jobject, nonHeapMemoryUsageObj, env, + CallObjectMethod(memoryMXBeanObj, _getNonHeapMemoryUsageMethod)); - jlong nonHeapMemoryCommitted = - env->CallLongMethod(nonHeapMemoryUsageObj, _getMemoryUsageCommittedMethod); - jlong nonHeapMemoryUsed = env->CallLongMethod(nonHeapMemoryUsageObj, _getMemoryUsageUsedMethod); + JNI_CALL_METHOD_CHECK_EXCEPTION( + jlong, nonHeapMemoryCommitted, env, + CallLongMethod(nonHeapMemoryUsageObj, _getMemoryUsageCommittedMethod)); + + JNI_CALL_METHOD_CHECK_EXCEPTION( + jlong, nonHeapMemoryUsed, env, + CallLongMethod(nonHeapMemoryUsageObj, _getMemoryUsageUsedMethod)); jvm_metrics->jvm_non_heap_size_bytes_committed->set_value( nonHeapMemoryCommitted < 0 ? 0 : nonHeapMemoryCommitted); jvm_metrics->jvm_non_heap_size_bytes_used->set_value(nonHeapMemoryUsed < 0 ? 0 : nonHeapMemoryUsed); - jobject memoryPoolMXBeansList = - env->CallStaticObjectMethod(_managementFactoryClass, _getMemoryPoolMXBeansMethod); + JNI_CALL_METHOD_CHECK_EXCEPTION_DELETE_REF( + jobject, memoryPoolMXBeansList, env, + CallStaticObjectMethod(_managementFactoryClass, _getMemoryPoolMXBeansMethod)); - jint size = env->CallIntMethod(memoryPoolMXBeansList, _getListSizeMethod); + JNI_CALL_METHOD_CHECK_EXCEPTION(jint, size, env, + CallIntMethod(memoryPoolMXBeansList, _getListSizeMethod)); for (int i = 0; i < size; ++i) { - jobject memoryPoolMXBean = - env->CallObjectMethod(memoryPoolMXBeansList, _getListUseIndexMethod, i); - jobject usageObject = - env->CallObjectMethod(memoryPoolMXBean, _getMemoryPoolMXBeanUsageMethod); + JNI_CALL_METHOD_CHECK_EXCEPTION_DELETE_REF( + jobject, memoryPoolMXBean, env, + CallObjectMethod(memoryPoolMXBeansList, _getListUseIndexMethod, i)); + + JNI_CALL_METHOD_CHECK_EXCEPTION_DELETE_REF( + jobject, usageObject, env, + CallObjectMethod(memoryPoolMXBean, _getMemoryPoolMXBeanUsageMethod)); + + JNI_CALL_METHOD_CHECK_EXCEPTION(jlong, used, env, + CallLongMethod(usageObject, _getMemoryUsageUsedMethod)); + + JNI_CALL_METHOD_CHECK_EXCEPTION(jlong, max, env, + CallLongMethod(usageObject, _getMemoryUsageMaxMethod)); - jlong used = env->CallLongMethod(usageObject, _getMemoryUsageUsedMethod); - jlong max = env->CallLongMethod(usageObject, _getMemoryUsageMaxMethod); + JNI_CALL_METHOD_CHECK_EXCEPTION_DELETE_REF( + jobject, peakUsageObject, env, + CallObjectMethod(memoryPoolMXBean, _getMemoryPollMXBeanPeakMethod)); - jobject peakUsageObject = - env->CallObjectMethod(memoryPoolMXBean, _getMemoryPollMXBeanPeakMethod); + JNI_CALL_METHOD_CHECK_EXCEPTION(jlong, peakUsed, env, + CallLongMethod(peakUsageObject, _getMemoryUsageUsedMethod)); - jlong peakUsed = env->CallLongMethod(peakUsageObject, _getMemoryUsageUsedMethod); + JNI_CALL_METHOD_CHECK_EXCEPTION_DELETE_REF( + jobject, name, env, + CallObjectMethod(memoryPoolMXBean, _getMemoryPollMXBeanNameMethod)); - jstring name = - (jstring)env->CallObjectMethod(memoryPoolMXBean, _getMemoryPollMXBeanNameMethod); - const char* nameStr 
= env->GetStringUTFChars(name, nullptr);
+        const char* nameStr = env->GetStringUTFChars(
+                (jstring)name, nullptr); // GetStringUTFChars does not throw exceptions
         if (nameStr != nullptr) {
             auto it = _memoryPoolName.find(nameStr);
             if (it == _memoryPoolName.end()) {
@@ -385,36 +456,46 @@ void JvmStats::refresh(JvmMetrics* jvm_metrics) {
                 jvm_metrics->jvm_old_size_bytes_max->set_value(max < 0 ? 0 : max);
             }
 
-            env->ReleaseStringUTFChars(name, nameStr);
+            env->ReleaseStringUTFChars((jstring)name,
+                                       nameStr); // ReleaseStringUTFChars does not throw exceptions
         }
-        env->DeleteLocalRef(memoryPoolMXBean);
-        env->DeleteLocalRef(usageObject);
-        env->DeleteLocalRef(peakUsageObject);
     }
+    JNI_CALL_METHOD_CHECK_EXCEPTION_DELETE_REF(
+            jobject, threadMXBean, env,
+            CallStaticObjectMethod(_managementFactoryClass, _getThreadMXBeanMethod));
 
-    jobject threadMXBean =
-            env->CallStaticObjectMethod(_managementFactoryClass, _getThreadMXBeanMethod);
+    JNI_CALL_METHOD_CHECK_EXCEPTION_DELETE_REF(
+            jobject, threadIdsObject, env, CallObjectMethod(threadMXBean, _getAllThreadIdsMethod));
 
-    jlongArray threadIds = (jlongArray)env->CallObjectMethod(threadMXBean, _getAllThreadIdsMethod);
-    jint threadCount = env->GetArrayLength(threadIds);
+    auto threadIds = (jlongArray)threadIdsObject;
 
+    JNI_CALL_METHOD_CHECK_EXCEPTION(jint, threadCount, env, GetArrayLength(threadIds));
+
+    JNI_CALL_METHOD_CHECK_EXCEPTION_DELETE_REF(
+            jobject, threadInfos, env,
+            CallObjectMethod(threadMXBean, _getThreadInfoMethod, (jlongArray)threadIds, 0));
 
-    jobjectArray threadInfos =
-            (jobjectArray)env->CallObjectMethod(threadMXBean, _getThreadInfoMethod, threadIds, 0);
 
     int threadsNew = 0, threadsRunnable = 0, threadsBlocked = 0, threadsWaiting = 0,
         threadsTimedWaiting = 0, threadsTerminated = 0;
 
-    jint peakThreadCount = env->CallIntMethod(threadMXBean, _getPeakThreadCountMethod);
+
+    JNI_CALL_METHOD_CHECK_EXCEPTION(jint, peakThreadCount, env,
+                                    CallIntMethod(threadMXBean, _getPeakThreadCountMethod));
 
     jvm_metrics->jvm_thread_peak_count->set_value(peakThreadCount < 0 ? 0 : peakThreadCount);
     jvm_metrics->jvm_thread_count->set_value(threadCount < 0 ? 0 : threadCount);
 
     for (int i = 0; i < threadCount; i++) {
-        jobject threadInfo = env->GetObjectArrayElement(threadInfos, i);
+        JNI_CALL_METHOD_CHECK_EXCEPTION(jobject, threadInfo, env,
                                        GetObjectArrayElement((jobjectArray)threadInfos, i));
+
         if (threadInfo == nullptr) {
             continue;
         }
 
-        jobject threadState = env->CallObjectMethod(threadInfo, _getThreadStateMethod);
+        JNI_CALL_METHOD_CHECK_EXCEPTION_DELETE_REF(
+                jobject, threadState, env, CallObjectMethod(threadInfo, _getThreadStateMethod));
+
+        // IsSameObject does not throw exceptions
         if (env->IsSameObject(threadState, _newThreadStateObj)) {
             threadsNew++;
         } else if (env->IsSameObject(threadState, _runnableThreadStateObj)) {
@@ -428,8 +509,6 @@ void JvmStats::refresh(JvmMetrics* jvm_metrics) {
         } else if (env->IsSameObject(threadState, _terminatedThreadStateObj)) {
             threadsTerminated++;
         }
-        env->DeleteLocalRef(threadInfo);
-        env->DeleteLocalRef(threadState);
     }
 
     jvm_metrics->jvm_thread_new_count->set_value(threadsNew < 0 ? 0 : threadsNew);
@@ -441,18 +520,27 @@ void JvmStats::refresh(JvmMetrics* jvm_metrics) {
     jvm_metrics->jvm_thread_terminated_count->set_value(threadsTerminated < 0 ?
0 : threadsTerminated); - jobject gcMXBeansList = - env->CallStaticObjectMethod(_managementFactoryClass, _getGarbageCollectorMXBeansMethod); + JNI_CALL_METHOD_CHECK_EXCEPTION_DELETE_REF( + jobject, gcMXBeansList, env, + CallStaticObjectMethod(_managementFactoryClass, _getGarbageCollectorMXBeansMethod)); - jint numCollectors = env->CallIntMethod(gcMXBeansList, _getListSizeMethod); + JNI_CALL_METHOD_CHECK_EXCEPTION(jint, numCollectors, env, + CallIntMethod(gcMXBeansList, _getListSizeMethod)); for (int i = 0; i < numCollectors; i++) { - jobject gcMXBean = env->CallObjectMethod(gcMXBeansList, _getListUseIndexMethod, i); + JNI_CALL_METHOD_CHECK_EXCEPTION_DELETE_REF( + jobject, gcMXBean, env, CallObjectMethod(gcMXBeansList, _getListUseIndexMethod, i)); + + JNI_CALL_METHOD_CHECK_EXCEPTION_DELETE_REF(jobject, gcName, env, + CallObjectMethod(gcMXBean, _getGCNameMethod)); - jstring gcName = (jstring)env->CallObjectMethod(gcMXBean, _getGCNameMethod); - jlong gcCollectionCount = env->CallLongMethod(gcMXBean, _getGCCollectionCountMethod); - jlong gcCollectionTime = env->CallLongMethod(gcMXBean, _getGCCollectionTimeMethod); - const char* gcNameStr = env->GetStringUTFChars(gcName, NULL); + JNI_CALL_METHOD_CHECK_EXCEPTION(jlong, gcCollectionCount, env, + CallLongMethod(gcMXBean, _getGCCollectionCountMethod)); + + JNI_CALL_METHOD_CHECK_EXCEPTION(jlong, gcCollectionTime, env, + CallLongMethod(gcMXBean, _getGCCollectionTimeMethod)); + + const char* gcNameStr = env->GetStringUTFChars((jstring)gcName, NULL); if (gcNameStr != nullptr) { if (strcmp(gcNameStr, "G1 Young Generation") == 0) { jvm_metrics->jvm_gc_g1_young_generation_count->set_value(gcCollectionCount); @@ -463,31 +551,40 @@ void JvmStats::refresh(JvmMetrics* jvm_metrics) { jvm_metrics->jvm_gc_g1_old_generation_time_ms->set_value(gcCollectionTime); } - env->ReleaseStringUTFChars(gcName, gcNameStr); + env->ReleaseStringUTFChars((jstring)gcName, gcNameStr); } - env->DeleteLocalRef(gcMXBean); } - env->DeleteLocalRef(memoryMXBeanObj); - env->DeleteLocalRef(heapMemoryUsageObj); - env->DeleteLocalRef(nonHeapMemoryUsageObj); - env->DeleteLocalRef(memoryPoolMXBeansList); - env->DeleteLocalRef(threadMXBean); - env->DeleteLocalRef(gcMXBeansList); + + return Status::OK(); } JvmStats::~JvmStats() { if (!_init_complete) { return; } try { - env->DeleteLocalRef(_newThreadStateObj); - env->DeleteLocalRef(_runnableThreadStateObj); - env->DeleteLocalRef(_blockedThreadStateObj); - env->DeleteLocalRef(_waitingThreadStateObj); - env->DeleteLocalRef(_timedWaitingThreadStateObj); - env->DeleteLocalRef(_terminatedThreadStateObj); + JNIEnv* env = nullptr; + Status st = JniUtil::GetJNIEnv(&env); + if (!st.ok()) { + return; + } + env->DeleteGlobalRef(_managementFactoryClass); + env->DeleteGlobalRef(_memoryUsageClass); + env->DeleteGlobalRef(_memoryMXBeanClass); + env->DeleteGlobalRef(_listClass); + env->DeleteGlobalRef(_memoryPoolMXBeanClass); + env->DeleteGlobalRef(_threadMXBeanClass); + env->DeleteGlobalRef(_threadInfoClass); + env->DeleteGlobalRef(_threadStateClass); + env->DeleteGlobalRef(_garbageCollectorMXBeanClass); + + env->DeleteGlobalRef(_newThreadStateObj); + env->DeleteGlobalRef(_runnableThreadStateObj); + env->DeleteGlobalRef(_blockedThreadStateObj); + env->DeleteGlobalRef(_waitingThreadStateObj); + env->DeleteGlobalRef(_timedWaitingThreadStateObj); + env->DeleteGlobalRef(_terminatedThreadStateObj); } catch (...) { - // When be is killed, DeleteLocalRef may fail. // In order to exit more gracefully, we catch the exception here. 
    }
}
diff --git a/be/src/util/jvm_metrics.h b/be/src/util/jvm_metrics.h
index 459a3cbf938f79..78346c022b0aba 100644
--- a/be/src/util/jvm_metrics.h
+++ b/be/src/util/jvm_metrics.h
@@ -27,7 +27,6 @@ class JvmMetrics;
 
 class JvmStats {
 private:
-    JNIEnv* env = nullptr;
     jclass _managementFactoryClass = nullptr;
     jmethodID _getMemoryMXBeanMethod = nullptr;
     jclass _memoryUsageClass = nullptr;
@@ -96,11 +95,10 @@ class JvmStats {
     bool _init_complete = false;
 
 public:
-    // JvmStats(JNIEnv* ENV);
-    void init(JNIEnv* ENV);
+    Status init(JNIEnv* env);
     bool init_complete() const { return _init_complete; }
     void set_complete(bool val) { _init_complete = val; }
-    void refresh(JvmMetrics* jvm_metrics);
+    Status refresh(JvmMetrics* jvm_metrics) const;
     ~JvmStats();
 };
diff --git a/be/src/util/mem_info.cpp b/be/src/util/mem_info.cpp
index b1bcfdcc56b430..36579452db3f85 100644
--- a/be/src/util/mem_info.cpp
+++ b/be/src/util/mem_info.cpp
@@ -74,9 +74,9 @@
 std::atomic<int64_t> MemInfo::_s_je_dirty_pages_mem = std::numeric_limits<int64_t>::min();
 std::atomic<int64_t> MemInfo::_s_je_dirty_pages_mem_limit = std::numeric_limits<int64_t>::max();
 std::atomic<int64_t> MemInfo::_s_virtual_memory_used = 0;
-int64_t MemInfo::_s_cgroup_mem_limit = std::numeric_limits<int64_t>::max();
-int64_t MemInfo::_s_cgroup_mem_usage = std::numeric_limits<int64_t>::min();
-bool MemInfo::_s_cgroup_mem_refresh_state = false;
+std::atomic<int64_t> MemInfo::_s_cgroup_mem_limit = std::numeric_limits<int64_t>::max();
+std::atomic<int64_t> MemInfo::_s_cgroup_mem_usage = std::numeric_limits<int64_t>::min();
+std::atomic<bool> MemInfo::_s_cgroup_mem_refresh_state = false;
 int64_t MemInfo::_s_cgroup_mem_refresh_wait_times = 0;
 
 static std::unordered_map<std::string, int64_t> _mem_info_bytes;
@@ -94,7 +94,7 @@ void MemInfo::refresh_allocator_mem() {
 #elif defined(USE_JEMALLOC)
     // jemalloc mallctl refer to : https://jemalloc.net/jemalloc.3.html
     // https://www.bookstack.cn/read/aliyun-rds-core/4a0cdf677f62feb3.md
-    // Check the Doris BE web page `http://ip:webserver_port/memz` to get the Jemalloc Profile.
+    // Check the Doris BE web page `http://ip:webserver_port/memory` to get the Jemalloc Profile.
 
     // 'epoch' is a special mallctl -- it updates the statistics. Without it, all
     // the following calls will return stale values. It increments and returns
@@ -191,7 +191,8 @@ void MemInfo::refresh_proc_meminfo() {
     // refresh cgroup memory
     if (config::enable_use_cgroup_memory_info) {
         if (_s_cgroup_mem_refresh_wait_times >= 0) {
-            auto status = CGroupMemoryCtl::find_cgroup_mem_limit(&_s_cgroup_mem_limit);
+            int64_t cgroup_mem_limit;
+            auto status = CGroupMemoryCtl::find_cgroup_mem_limit(&cgroup_mem_limit);
             if (!status.ok()) {
                 _s_cgroup_mem_limit = std::numeric_limits<int64_t>::max();
                 // find cgroup limit failed, wait 300s, 1000 * 100ms.
@@ -200,6 +201,7 @@
                        "mem limit: " << _s_cgroup_mem_limit;
             } else {
+                _s_cgroup_mem_limit = cgroup_mem_limit;
                 // wait 10s, 100 * 100ms, avoid too frequently.
                 _s_cgroup_mem_refresh_wait_times = -100;
             }
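The pattern the hunk above adopts, in isolation: probe into a local, then publish through the atomic, so a reader on another thread never observes a half-updated limit. `find_limit` here is a hypothetical stand-in for `CGroupMemoryCtl::find_cgroup_mem_limit`:

```cpp
#include <atomic>
#include <cstdint>
#include <iostream>
#include <limits>

std::atomic<int64_t> g_mem_limit{std::numeric_limits<int64_t>::max()};

bool find_limit(int64_t* out) { // stand-in probe; pretend it read a cgroup file
    *out = 8LL * 1024 * 1024 * 1024;
    return true;
}

void refresh_limit() {
    int64_t probed;
    if (find_limit(&probed)) {
        g_mem_limit.store(probed, std::memory_order_relaxed); // publish only on success
    } else {
        g_mem_limit.store(std::numeric_limits<int64_t>::max(), std::memory_order_relaxed);
    }
}

int main() {
    refresh_limit();
    std::cout << g_mem_limit.load(std::memory_order_relaxed) << "\n"; // 8589934592
}
```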
@@ -208,11 +210,13 @@ void MemInfo::refresh_proc_meminfo() {
         }
 
         if (_s_cgroup_mem_limit != std::numeric_limits<int64_t>::max()) {
-            auto status = CGroupMemoryCtl::find_cgroup_mem_usage(&_s_cgroup_mem_usage);
+            int64_t cgroup_mem_usage;
+            auto status = CGroupMemoryCtl::find_cgroup_mem_usage(&cgroup_mem_usage);
             if (!status.ok()) {
                 _s_cgroup_mem_usage = std::numeric_limits<int64_t>::min();
                 _s_cgroup_mem_refresh_state = false;
             } else {
+                _s_cgroup_mem_usage = cgroup_mem_usage;
                 _s_cgroup_mem_refresh_state = true;
             }
         } else {
@@ -231,7 +235,8 @@ void MemInfo::refresh_proc_meminfo() {
         if (physical_mem < 0) {
             physical_mem = _s_cgroup_mem_limit;
         } else {
-            physical_mem = std::min(physical_mem, _s_cgroup_mem_limit);
+            physical_mem =
+                    std::min(physical_mem, _s_cgroup_mem_limit.load(std::memory_order_relaxed));
         }
     }
diff --git a/be/src/util/mem_info.h b/be/src/util/mem_info.h
index 60ce26016b1b32..39ae9eb0b79cfb 100644
--- a/be/src/util/mem_info.h
+++ b/be/src/util/mem_info.h
@@ -219,6 +219,18 @@ class MemInfo {
         return PrettyPrinter::print(_s_soft_mem_limit.load(std::memory_order_relaxed),
                                     TUnit::BYTES);
     }
+    static inline int64_t cgroup_mem_limit() {
+        DCHECK(_s_initialized);
+        return _s_cgroup_mem_limit.load(std::memory_order_relaxed);
+    }
+    static inline int64_t cgroup_mem_usage() {
+        DCHECK(_s_initialized);
+        return _s_cgroup_mem_usage.load(std::memory_order_relaxed);
+    }
+    static inline int64_t cgroup_mem_refresh_state() {
+        DCHECK(_s_initialized);
+        return _s_cgroup_mem_refresh_state.load(std::memory_order_relaxed);
+    }
 
     static std::string debug_string();
 
@@ -236,9 +248,9 @@ class MemInfo {
     static std::atomic<int64_t> _s_je_dirty_pages_mem_limit;
     static std::atomic<int64_t> _s_virtual_memory_used;
 
-    static int64_t _s_cgroup_mem_limit;
-    static int64_t _s_cgroup_mem_usage;
-    static bool _s_cgroup_mem_refresh_state;
+    static std::atomic<int64_t> _s_cgroup_mem_limit;
+    static std::atomic<int64_t> _s_cgroup_mem_usage;
+    static std::atomic<bool> _s_cgroup_mem_refresh_state;
     static int64_t _s_cgroup_mem_refresh_wait_times;
 
     static std::atomic<int64_t> _s_sys_mem_available;
diff --git a/be/src/util/murmur_hash3.cpp b/be/src/util/murmur_hash3.cpp
index 96568d6978e225..edd1c44f338473 100644
--- a/be/src/util/murmur_hash3.cpp
+++ b/be/src/util/murmur_hash3.cpp
@@ -85,7 +85,7 @@ FORCE_INLINE uint64_t fmix64(uint64_t k) {
 
 //-----------------------------------------------------------------------------
 
-void murmur_hash3_x86_32(const void* key, int len, uint32_t seed, void* out) {
+void murmur_hash3_x86_32(const void* key, int64_t len, uint32_t seed, void* out) {
     const uint8_t* data = (const uint8_t*)key;
     const int nblocks = len / 4;
@@ -435,7 +435,7 @@ void murmur_hash3_x64_128(const void* key, const int len, const uint32_t seed, v
     ((uint64_t*)out)[1] = h2;
 }
 
-void murmur_hash3_x64_64(const void* key, const int len, const uint64_t seed, void* out) {
+void murmur_hash3_x64_64(const void* key, const int64_t len, const uint64_t seed, void* out) {
     const uint8_t* data = (const uint8_t*)key;
     const int nblocks = len / 8;
     uint64_t h1 = seed;
diff --git a/be/src/util/murmur_hash3.h b/be/src/util/murmur_hash3.h
index c8e8964bf6a20e..249966460221a3 100644
--- a/be/src/util/murmur_hash3.h
+++ b/be/src/util/murmur_hash3.h
@@ -25,12 +25,12 @@ typedef unsigned __int64 uint64_t;
 
 //-----------------------------------------------------------------------------
 
-void murmur_hash3_x86_32(const void* key, int len, uint32_t seed, void* out);
+void murmur_hash3_x86_32(const void* key, int64_t len, uint32_t seed, void* out);
 
 void murmur_hash3_x86_128(const void* key, int len, uint32_t seed, void* out);
 void murmur_hash3_x64_128(const void* key, int len, uint32_t seed, void* out);
 
-void murmur_hash3_x64_64(const void* key, int len, uint64_t seed, void* out);
+void murmur_hash3_x64_64(const void* key, int64_t len, uint64_t seed, void* out);
 
 //-----------------------------------------------------------------------------
diff --git a/be/src/util/mysql_row_buffer.cpp b/be/src/util/mysql_row_buffer.cpp
index 4823920508a940..3e20a2d9de72fe 100644
--- a/be/src/util/mysql_row_buffer.cpp
+++ b/be/src/util/mysql_row_buffer.cpp
@@ -87,9 +87,9 @@ MysqlRowBuffer<is_binary_format>::MysqlRowBuffer()
           _len_pos(0) {}
 
 template <bool is_binary_format>
-void MysqlRowBuffer<is_binary_format>::start_binary_row(uint32_t num_cols) {
+void MysqlRowBuffer<is_binary_format>::start_binary_row(uint64_t num_cols) {
     assert(is_binary_format);
-    int bit_fields = (num_cols + 9) / 8;
+    auto bit_fields = (num_cols + 9) / 8;
     reserve(bit_fields + 1);
     memset(_pos, 0, 1 + bit_fields);
     _pos += bit_fields + 1;
diff --git a/be/src/util/mysql_row_buffer.h b/be/src/util/mysql_row_buffer.h
index b740efa7764ed1..50b17c91c170ca 100644
--- a/be/src/util/mysql_row_buffer.h
+++ b/be/src/util/mysql_row_buffer.h
@@ -62,7 +62,7 @@ class MysqlRowBuffer {
 
     // Prepare for binary row buffer
     // init bitmap
-    void start_binary_row(uint32_t num_cols);
+    void start_binary_row(uint64_t num_cols);
 
     // TODO(zhaochun): add signed/unsigned support
     int push_tinyint(int8_t data);
diff --git a/be/src/util/runtime_profile.cpp b/be/src/util/runtime_profile.cpp
index a9e197fba9baf6..e87301880d2479 100644
--- a/be/src/util/runtime_profile.cpp
+++ b/be/src/util/runtime_profile.cpp
@@ -274,7 +274,7 @@ void RuntimeProfile::compute_time_in_profile(int64_t total) {
 
 RuntimeProfile* RuntimeProfile::create_child(const std::string& name, bool indent, bool prepend) {
     std::lock_guard<std::mutex> l(_children_lock);
-    DCHECK(_child_map.find(name) == _child_map.end());
+    DCHECK(_child_map.find(name) == _child_map.end()) << ", name: " << name;
     RuntimeProfile* child = _pool->add(new RuntimeProfile(name));
     if (this->is_set_metadata()) {
         child->set_metadata(this->metadata());
@@ -285,8 +285,8 @@ RuntimeProfile* RuntimeProfile::create_child(const std::string& name, bool inden
     if (_children.empty()) {
         add_child_unlock(child, indent, nullptr);
     } else {
-        ChildVector::iterator pos = prepend ? _children.begin() : _children.end();
-        add_child_unlock(child, indent, (*pos).first);
+        auto* pos = prepend ? _children.begin()->first : nullptr;
+        add_child_unlock(child, indent, pos);
     }
     return child;
 }
diff --git a/be/src/util/runtime_profile.h b/be/src/util/runtime_profile.h
index b77157d1f5b3de..955d77b72aa51c 100644
--- a/be/src/util/runtime_profile.h
+++ b/be/src/util/runtime_profile.h
@@ -51,8 +51,8 @@ class TRuntimeProfileTree;
 #define MACRO_CONCAT(x, y) CONCAT_IMPL(x, y)
 
 #define ADD_LABEL_COUNTER(profile, name) (profile)->add_counter(name, TUnit::NONE)
-#define ADD_LABEL_COUNTER_WITH_LEVEL(profile, name, type) \
-    (profile)->add_counter_with_level(name, TUnit::NONE, type)
+#define ADD_LABEL_COUNTER_WITH_LEVEL(profile, name, level) \
+    (profile)->add_counter_with_level(name, TUnit::NONE, level)
 #define ADD_COUNTER(profile, name, type) (profile)->add_counter(name, type)
 #define ADD_COUNTER_WITH_LEVEL(profile, name, type, level) \
     (profile)->add_counter_with_level(name, type, level)
diff --git a/be/src/util/s3_util.cpp b/be/src/util/s3_util.cpp
index b2f4cdc3ce7885..18058469ee4566 100644
--- a/be/src/util/s3_util.cpp
+++ b/be/src/util/s3_util.cpp
@@ -401,15 +401,15 @@ S3Conf S3Conf::get_s3_conf(const cloud::ObjectStoreInfoPB& info) {
     S3Conf ret {
             .bucket = info.bucket(),
             .prefix = info.prefix(),
-            .client_conf {
-                    .endpoint = info.endpoint(),
-                    .region = info.region(),
-                    .ak = info.ak(),
-                    .sk = info.sk(),
-                    .token {},
-                    .bucket = info.bucket(),
-                    .provider = io::ObjStorageType::AWS,
-            },
+            .client_conf {.endpoint = info.endpoint(),
+                          .region = info.region(),
+                          .ak = info.ak(),
+                          .sk = info.sk(),
+                          .token {},
+                          .bucket = info.bucket(),
+                          .provider = io::ObjStorageType::AWS,
+                          .use_virtual_addressing =
+                                  info.has_use_path_style() ? !info.use_path_style() : true},
             .sse_enabled = info.sse_enabled(),
     };
diff --git a/be/src/util/simd/bits.h b/be/src/util/simd/bits.h
index 7e2e7c8202569d..5953c651dc6f78 100644
--- a/be/src/util/simd/bits.h
+++ b/be/src/util/simd/bits.h
@@ -19,6 +19,7 @@
 #include
 #include
+#include <type_traits>
 #include
 
 #if defined(__ARM_NEON) && defined(__aarch64__)
@@ -27,8 +28,7 @@
 
 #include "util/sse_util.hpp"
 
-namespace doris {
-namespace simd {
+namespace doris::simd {
 
 consteval auto bits_mask_length() {
 #if defined(__ARM_NEON) && defined(__aarch64__)
@@ -70,7 +70,7 @@ inline uint64_t bytes16_mask_to_bits64_mask(const uint8_t* data) {
 inline uint32_t bytes32_mask_to_bits32_mask(const uint8_t* data) {
 #ifdef __AVX2__
     auto zero32 = _mm256_setzero_si256();
-    uint32_t mask = static_cast<uint32_t>(_mm256_movemask_epi8(
+    auto mask = static_cast<uint32_t>(_mm256_movemask_epi8(
             _mm256_cmpgt_epi8(_mm256_loadu_si256(reinterpret_cast<const __m256i*>(data)), zero32)));
 #elif defined(__SSE2__)
     auto zero16 = _mm_setzero_si128();
@@ -125,8 +125,10 @@ void iterate_through_bits_mask(Func func, decltype(bytes_mask_to_bits_mask(nullp
 #endif
 }
 
-inline size_t count_zero_num(const int8_t* __restrict data, size_t size) {
-    size_t num = 0;
+template <typename T>
+    requires std::is_unsigned_v<T>
+inline T count_zero_num(const int8_t* __restrict data, T size) {
+    T num = 0;
     const int8_t* end = data + size;
 #if defined(__SSE2__) && defined(__POPCNT__)
     const __m128i zero16 = _mm_setzero_si128();
@@ -138,13 +140,13 @@ inline size_t count_zero_num(const int8_t* __restrict data, size_t size) {
                         _mm_loadu_si128(reinterpret_cast<const __m128i*>(data)), zero16))) |
                 (static_cast<uint64_t>(_mm_movemask_epi8(_mm_cmpeq_epi8(
                          _mm_loadu_si128(reinterpret_cast<const __m128i*>(data + 16)), zero16)))
-                 << 16u) |
+                 << 16U) |
                 (static_cast<uint64_t>(_mm_movemask_epi8(_mm_cmpeq_epi8(
                          _mm_loadu_si128(reinterpret_cast<const __m128i*>(data + 32)), zero16)))
-                 << 32u) |
+                 << 32U) |
                 (static_cast<uint64_t>(_mm_movemask_epi8(_mm_cmpeq_epi8(
                          _mm_loadu_si128(reinterpret_cast<const __m128i*>(data + 48)), zero16)))
-                 << 48u));
+                 << 48U));
     }
 #endif
     for (; data < end; ++data) {
@@ -153,9 +155,10 @@ inline size_t count_zero_num(const int8_t* __restrict data, size_t size) {
     return num;
 }
 
-inline size_t count_zero_num(const int8_t* __restrict data, const uint8_t* __restrict null_map,
-                             size_t size) {
-    size_t num = 0;
+template <typename T>
+    requires std::is_unsigned_v<T>
+inline T count_zero_num(const int8_t* __restrict data, const uint8_t* __restrict null_map, T size) {
+    T num = 0;
     const int8_t* end = data + size;
 #if defined(__SSE2__) && defined(__POPCNT__)
     const __m128i zero16 = _mm_setzero_si128();
@@ -172,19 +175,19 @@ inline size_t count_zero_num(const int8_t* __restrict data, const uint8_t* __res
                          _mm_loadu_si128(reinterpret_cast<const __m128i*>(data + 16)),
                          zero16),
                  _mm_loadu_si128(reinterpret_cast<const __m128i*>(null_map + 16)))))
-                 << 16u) |
+                 << 16U) |
                 (static_cast<uint64_t>(_mm_movemask_epi8(_mm_or_si128(
                          _mm_cmpeq_epi8(
                                  _mm_loadu_si128(reinterpret_cast<const __m128i*>(data + 32)),
                                  zero16),
                          _mm_loadu_si128(reinterpret_cast<const __m128i*>(null_map + 32)))))
-                 << 32u) |
+                 << 32U) |
                 (static_cast<uint64_t>(_mm_movemask_epi8(_mm_or_si128(
                          _mm_cmpeq_epi8(
                                  _mm_loadu_si128(reinterpret_cast<const __m128i*>(data + 48)),
                                  zero16),
                          _mm_loadu_si128(reinterpret_cast<const __m128i*>(null_map + 48)))))
-                 << 48u));
+                 << 48U));
     }
 #endif
     for (; data < end; ++data, ++null_map) {
@@ -235,5 +238,4 @@ inline size_t find_zero(const std::vector<uint8_t>& vec, size_t start) {
     return find_byte(vec, start, 0);
 }
 
-} // namespace simd
-} // namespace doris
+} // namespace doris::simd
diff --git a/be/src/util/slice.h b/be/src/util/slice.h
index b38b1147894f9e..fd6bcf0adfb510 100644
--- a/be/src/util/slice.h
+++ b/be/src/util/slice.h
@@ -344,6 +344,10 @@ class OwnedSlice : private Allocator<false, false, false, DefaultMemoryAllocator
 
+    OwnedSlice(size_t length)
+            : _slice(reinterpret_cast<char*>(Allocator::alloc(length)), length),
+              _capacity(length) {}
+
     OwnedSlice(OwnedSlice&& src) : _slice(src._slice), _capacity(src._capacity) {
         src._slice.data = nullptr;
         src._slice.size = 0;
@@ -369,6 +373,8 @@ class OwnedSlice : private Allocator
diff --git a/be/src/util/string_parser.h b/be/src/util/string_parser.h
--- a/be/src/util/string_parser.h
+++ b/be/src/util/string_parser.h
     template <typename T>
-    static inline T string_to_int(const char* __restrict s, int len, ParseResult* result) {
+    static inline T string_to_int(const char* __restrict s, size_t len, ParseResult* result) {
         T ans = string_to_int_internal(s, len, result);
         if (LIKELY(*result == PARSE_SUCCESS)) {
             return ans;
@@ -128,7 +128,7 @@ class StringParser {
 
     // Convert a string s representing a number in given base into a decimal number.
     template <typename T>
-    static inline T string_to_int(const char* __restrict s, int len, int base,
+    static inline T string_to_int(const char* __restrict s, int64_t len, int base,
                                   ParseResult* result) {
         T ans = string_to_int_internal(s, len, base, result);
         if (LIKELY(*result == PARSE_SUCCESS)) {
@@ -140,7 +140,7 @@ class StringParser {
     }
 
     template <typename T>
-    static inline T string_to_float(const char* __restrict s, int len, ParseResult* result) {
+    static inline T string_to_float(const char* __restrict s, size_t len, ParseResult* result) {
         return string_to_float_internal(s, len, result);
     }
 
@@ -207,7 +207,7 @@ class StringParser {
     // Convert a string s representing a number in given base into a decimal number.
     // Return PARSE_FAILURE on leading whitespace. Trailing whitespace is allowed.
     template <typename T>
-    static inline T string_to_int_internal(const char* __restrict s, int len, int base,
+    static inline T string_to_int_internal(const char* __restrict s, int64_t len, int base,
                                            ParseResult* result);
 
     // Converts an ascii string to an integer of type T assuming it cannot overflow
@@ -385,7 +385,7 @@ T StringParser::string_to_unsigned_int_internal(const char* __restrict s, int le
 }
 
 template <typename T>
-T StringParser::string_to_int_internal(const char* __restrict s, int len, int base,
+T StringParser::string_to_int_internal(const char* __restrict s, int64_t len, int base,
                                        ParseResult* result) {
     typedef typename std::make_unsigned<T>::type UnsignedT;
     UnsignedT val = 0;
diff --git a/be/src/util/timezone_utils.cpp b/be/src/util/timezone_utils.cpp
index 5aef6f8702b8dc..6bb71ac46471c9 100644
--- a/be/src/util/timezone_utils.cpp
+++ b/be/src/util/timezone_utils.cpp
@@ -30,6 +30,7 @@
 #include
+#include
 #include
 #include
 #include
@@ -58,6 +59,9 @@ static const char* tzdir = "/usr/share/zoneinfo"; // default value, may change b
 void TimezoneUtils::clear_timezone_caches() {
     lower_zone_cache_->clear();
 }
 
+int TimezoneUtils::cache_size() {
+    return lower_zone_cache_->size();
+}
+
 static bool parse_save_name_tz(const std::string& tz_name) {
     cctz::time_zone tz;
@@ -106,24 +110,54 @@ void TimezoneUtils::load_timezones_to_cache() {
     }
     lower_zone_cache_->erase("lmt"); // local mean time for every timezone
-    LOG(INFO) << "Read " << lower_zone_cache_->size() << " timezones.";
+
+    load_offsets_to_cache();
+    LOG(INFO) << "Preloaded " << lower_zone_cache_->size() << " timezones.";
+}
+
+static std::string to_hour_string(int arg) {
+    if (arg < 0 && arg > -10) { // -9 to -1
+        return std::string {"-0"} + std::to_string(std::abs(arg));
+    } else if (arg >= 0 && arg < 10) { // 0 to 9
+        return std::string {"0"} + std::to_string(arg);
+    }
+    return std::to_string(arg);
+}
+
+void TimezoneUtils::load_offsets_to_cache() {
+    for (int hour = -12; hour <= +14; hour++) {
+        for (int minute = 0; minute <= 30; minute += 30) {
+            std::string offset_str = (hour >= 0 ? "+" : "") + to_hour_string(hour) + ':' +
+                                     (minute == 0 ? "00" : "30");
+            cctz::time_zone result;
+            parse_tz_offset_string(offset_str, result);
+            lower_zone_cache_->emplace(offset_str, result);
+        }
+    }
+    // -00 for hour is also valid
+    std::string offset_str = "-00:00";
+    cctz::time_zone result;
+    parse_tz_offset_string(offset_str, result);
+    lower_zone_cache_->emplace(offset_str, result);
+    offset_str = "-00:30";
+    parse_tz_offset_string(offset_str, result);
+    lower_zone_cache_->emplace(offset_str, result);
+}
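What the loop above actually inserts: 54 fixed-offset keys from -12:00 to +14:00 at whole- and half-hour marks, plus the two negative-zero spellings handled after the loop. A standalone enumeration reusing the same to_hour_string logic:

```cpp
#include <iostream>
#include <string>

static std::string to_hour_string(int arg) {
    if (arg < 0 && arg > -10) return std::string{"-0"} + std::to_string(-arg);
    if (arg >= 0 && arg < 10) return std::string{"0"} + std::to_string(arg);
    return std::to_string(arg);
}

int main() {
    for (int hour = -12; hour <= 14; ++hour) {
        for (int minute = 0; minute <= 30; minute += 30) {
            std::cout << (hour >= 0 ? "+" : "") + to_hour_string(hour) + ':' +
                                 (minute == 0 ? "00" : "30")
                      << " "; // -12:00 -12:30 ... +00:00 +00:30 ... +14:00 +14:30
        }
    }
    std::cout << "-00:00 -00:30\n"; // the two extra negative-zero forms
}
```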
"00" : "30"); + cctz::time_zone result; + parse_tz_offset_string(offset_str, result); + lower_zone_cache_->emplace(offset_str, result); + } + } + // -00 for hour is also valid + std::string offset_str = "-00:00"; + cctz::time_zone result; + parse_tz_offset_string(offset_str, result); + lower_zone_cache_->emplace(offset_str, result); + offset_str = "-00:30"; + parse_tz_offset_string(offset_str, result); + lower_zone_cache_->emplace(offset_str, result); } bool TimezoneUtils::find_cctz_time_zone(const std::string& timezone, cctz::time_zone& ctz) { - if (auto it = lower_zone_cache_->find(to_lower_copy(timezone)); - it != lower_zone_cache_->end()) { + if (auto it = lower_zone_cache_->find(to_lower_copy(timezone)); it != lower_zone_cache_->end()) + [[likely]] { ctz = it->second; return true; } - // offset format or just illegal - return parse_tz_offset_string(timezone, ctz); + return false; } bool TimezoneUtils::parse_tz_offset_string(const std::string& timezone, cctz::time_zone& ctz) { // like +08:00, which not in timezone_names_map_ re2::StringPiece value; - if (time_zone_offset_format_reg.Match(timezone, 0, timezone.size(), RE2::UNANCHORED, &value, - 1)) { + if (time_zone_offset_format_reg.Match(timezone, 0, timezone.size(), RE2::UNANCHORED, &value, 1)) + [[likely]] { bool positive = value[0] != '-'; //Regular expression guarantees hour and minute must be int @@ -139,8 +173,6 @@ bool TimezoneUtils::parse_tz_offset_string(const std::string& timezone, cctz::ti int offset = hour * 60 * 60 + minute * 60; offset *= positive ? 1 : -1; ctz = cctz::fixed_time_zone(cctz::seconds(offset)); - // try to push the result time offset of "+08:00" need lock. now it's harmful for performance. - // maybe we can use rcu of hazard-pointer to opt it. return true; } return false; diff --git a/be/src/util/timezone_utils.h b/be/src/util/timezone_utils.h index c8bce44b5aba89..3cdb17fc6fdfe5 100644 --- a/be/src/util/timezone_utils.h +++ b/be/src/util/timezone_utils.h @@ -41,6 +41,9 @@ class TimezoneUtils { private: // for ut only static void clear_timezone_caches(); + static int cache_size(); + + static void load_offsets_to_cache(); static bool parse_tz_offset_string(const std::string& timezone, cctz::time_zone& ctz); }; diff --git a/be/src/vec/aggregate_functions/aggregate_function.h b/be/src/vec/aggregate_functions/aggregate_function.h index 05f1bd2a602c68..e9148716f99f35 100644 --- a/be/src/vec/aggregate_functions/aggregate_function.h +++ b/be/src/vec/aggregate_functions/aggregate_function.h @@ -20,6 +20,8 @@ #pragma once +#include "common/exception.h" +#include "common/status.h" #include "util/defer_op.h" #include "vec/columns/column_complex.h" #include "vec/columns/column_string.h" @@ -30,6 +32,7 @@ #include "vec/core/column_numbers.h" #include "vec/core/field.h" #include "vec/core/types.h" +#include "vec/data_types/data_type_nullable.h" #include "vec/data_types/data_type_string.h" namespace doris::vectorized { @@ -38,6 +41,11 @@ class Arena; class IColumn; class IDataType; +struct AggregateFunctionAttr { + bool enable_decimal256 {false}; + std::vector> column_infos; +}; + template class AggregateFunctionBitmapCount; template @@ -111,21 +119,21 @@ class IAggregateFunction { * Additional parameter arena should be used instead of standard memory allocator if the addition requires memory allocation. 
*/ virtual void add(AggregateDataPtr __restrict place, const IColumn** columns, ssize_t row_num, - Arena* arena) const = 0; + Arena*) const = 0; virtual void add_many(AggregateDataPtr __restrict place, const IColumn** columns, - std::vector& rows, Arena* arena) const {} + std::vector& rows, Arena*) const {} /// Merges state (on which place points to) with other state of current aggregation function. virtual void merge(AggregateDataPtr __restrict place, ConstAggregateDataPtr rhs, - Arena* arena) const = 0; + Arena*) const = 0; virtual void merge_vec(const AggregateDataPtr* places, size_t offset, ConstAggregateDataPtr rhs, - Arena* arena, const size_t num_rows) const = 0; + Arena*, const size_t num_rows) const = 0; // same as merge_vec, but only call "merge" function when place is not nullptr virtual void merge_vec_selected(const AggregateDataPtr* places, size_t offset, - ConstAggregateDataPtr rhs, Arena* arena, + ConstAggregateDataPtr rhs, Arena*, const size_t num_rows) const = 0; /// Serializes state (to transmit it over the network, for example). @@ -142,21 +150,21 @@ class IAggregateFunction { /// Deserializes state. This function is called only for empty (just created) states. virtual void deserialize(AggregateDataPtr __restrict place, BufferReadable& buf, - Arena* arena) const = 0; + Arena*) const = 0; - virtual void deserialize_vec(AggregateDataPtr places, const ColumnString* column, Arena* arena, + virtual void deserialize_vec(AggregateDataPtr places, const ColumnString* column, Arena*, size_t num_rows) const = 0; virtual void deserialize_and_merge_vec(const AggregateDataPtr* places, size_t offset, - AggregateDataPtr rhs, const IColumn* column, - Arena* arena, const size_t num_rows) const = 0; + AggregateDataPtr rhs, const IColumn* column, Arena*, + const size_t num_rows) const = 0; virtual void deserialize_and_merge_vec_selected(const AggregateDataPtr* places, size_t offset, AggregateDataPtr rhs, const IColumn* column, - Arena* arena, const size_t num_rows) const = 0; + Arena*, const size_t num_rows) const = 0; - virtual void deserialize_from_column(AggregateDataPtr places, const IColumn& column, - Arena* arena, size_t num_rows) const = 0; + virtual void deserialize_from_column(AggregateDataPtr places, const IColumn& column, Arena*, + size_t num_rows) const = 0; /// Deserializes state and merge it with current aggregation function. virtual void deserialize_and_merge(AggregateDataPtr __restrict place, @@ -165,10 +173,10 @@ class IAggregateFunction { virtual void deserialize_and_merge_from_column_range(AggregateDataPtr __restrict place, const IColumn& column, size_t begin, - size_t end, Arena* arena) const = 0; + size_t end, Arena*) const = 0; virtual void deserialize_and_merge_from_column(AggregateDataPtr __restrict place, - const IColumn& column, Arena* arena) const = 0; + const IColumn& column, Arena*) const = 0; /// Inserts results into a column. virtual void insert_result_into(ConstAggregateDataPtr __restrict place, IColumn& to) const = 0; @@ -181,33 +189,32 @@ class IAggregateFunction { * and do a single call to "add_batch" for devirtualization and inlining. 
     */
     virtual void add_batch(size_t batch_size, AggregateDataPtr* places, size_t place_offset,
-                           const IColumn** columns, Arena* arena, bool agg_many = false) const = 0;
+                           const IColumn** columns, Arena*, bool agg_many = false) const = 0;
 
     // same as add_batch, but only call "add" function when place is not nullptr
     virtual void add_batch_selected(size_t batch_size, AggregateDataPtr* places,
-                                    size_t place_offset, const IColumn** columns,
-                                    Arena* arena) const = 0;
+                                    size_t place_offset, const IColumn** columns, Arena*) const = 0;
 
     /** The same for single place.
     */
     virtual void add_batch_single_place(size_t batch_size, AggregateDataPtr place,
-                                        const IColumn** columns, Arena* arena) const = 0;
+                                        const IColumn** columns, Arena*) const = 0;
 
     // only used at agg reader
     virtual void add_batch_range(size_t batch_begin, size_t batch_end, AggregateDataPtr place,
-                                 const IColumn** columns, Arena* arena, bool has_null = false) = 0;
+                                 const IColumn** columns, Arena*, bool has_null = false) = 0;
 
     // only used at window function
     virtual void add_range_single_place(int64_t partition_start, int64_t partition_end,
                                         int64_t frame_start, int64_t frame_end,
                                         AggregateDataPtr place, const IColumn** columns,
-                                        Arena* arena) const = 0;
+                                        Arena*) const = 0;
 
     virtual void streaming_agg_serialize(const IColumn** columns, BufferWritable& buf,
-                                         const size_t num_rows, Arena* arena) const = 0;
+                                         const size_t num_rows, Arena*) const = 0;
 
     virtual void streaming_agg_serialize_to_column(const IColumn** columns, MutableColumnPtr& dst,
-                                                   const size_t num_rows, Arena* arena) const = 0;
+                                                   const size_t num_rows, Arena*) const = 0;
 
     const DataTypes& get_argument_types() const { return argument_types; }
@@ -219,6 +226,10 @@ class IAggregateFunction {
 
     virtual AggregateFunctionPtr transmit_to_stable() { return nullptr; }
 
+    /// Verify function signature
+    virtual Status verify_result_type(const bool without_key, const DataTypes& argument_types,
+                                      const DataTypePtr result_type) const = 0;
+
 protected:
     DataTypes argument_types;
     int version {};
@@ -491,6 +502,43 @@ class IAggregateFunctionHelper : public IAggregateFunction {
                 arena);
         assert_cast<const Derived*>(this)->merge(place, rhs, arena);
     }
+
+    Status verify_result_type(const bool without_key, const DataTypes& argument_types_with_nullable,
+                              const DataTypePtr result_type_with_nullable) const override {
+        DataTypePtr function_result_type = assert_cast<const Derived*>(this)->get_return_type();
+
+        if (function_result_type->equals(*result_type_with_nullable)) {
+            return Status::OK();
+        }
+
+        if (!remove_nullable(function_result_type)
+                     ->equals(*remove_nullable(result_type_with_nullable))) {
+            return Status::InternalError(
+                    "Result type of {} does not match: planner expects {}, but got {}, with group "
+                    "by: "
+                    "{}",
+                    get_name(), result_type_with_nullable->get_name(),
+                    function_result_type->get_name(), !without_key);
+        }
+
+        if (without_key == true) {
+            if (result_type_with_nullable->is_nullable()) {
+                // This branch is dedicated to NullableAggregateFunction.
+                // When executed without a group-by key, the result type from the planner is AlwaysNullable,
+                // since the planner cannot know whether there will be any invalid input at runtime; if there
+                // is, the result should be Null, so the result type must be nullable.
+                // The backend wraps a ColumnNullable in this situation. For example: AggLocalState::_get_without_key_result
                return Status::OK();
+            }
+        }
+
+        // Executed with a group-by key, so the result type must exactly match the return type from the planner.
+        return Status::InternalError(
+                "Result type of {} does not match: planner expects {}, but got {}, with group by: "
+                "{}",
+                get_name(), result_type_with_nullable->get_name(), function_result_type->get_name(),
+                !without_key);
+    }
 };
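The nullability rule encoded above, as a toy model (`Type` and `accept` are illustrative, not Doris types): base types must match, an exact nullability match always passes, and only in the no-group-by case may a nullable planner type wrap a non-nullable function result:

```cpp
#include <iostream>
#include <string>

struct Type {
    std::string base;
    bool nullable;
};

bool accept(const Type& fn, const Type& planner, bool without_key) {
    if (fn.base != planner.base) return false;        // base types must match
    if (fn.nullable == planner.nullable) return true; // exact match always ok
    // Without a group-by key the planner plans AlwaysNullable, so a
    // non-nullable function result wrapped in ColumnNullable is accepted.
    return without_key && planner.nullable;
}

int main() {
    Type fn{"BIGINT", false}, planner{"BIGINT", true};
    std::cout << accept(fn, planner, /*without_key=*/true) << "\n";  // 1: accepted
    std::cout << accept(fn, planner, /*without_key=*/false) << "\n"; // 0: InternalError
}
```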
+ return Status::InternalError( + "Result type of {} is not matched, planner expect {}, but get {}, with group by: " + "{}", + get_name(), result_type_with_nullable->get_name(), function_result_type->get_name(), + !without_key); + } }; /// Implements several methods for manipulation with data. T - type of structure with data for aggregation. diff --git a/be/src/vec/aggregate_functions/aggregate_function_approx_count_distinct.cpp b/be/src/vec/aggregate_functions/aggregate_function_approx_count_distinct.cpp index 10616be4258477..18662bf66cf38c 100644 --- a/be/src/vec/aggregate_functions/aggregate_function_approx_count_distinct.cpp +++ b/be/src/vec/aggregate_functions/aggregate_function_approx_count_distinct.cpp @@ -31,7 +31,8 @@ namespace doris::vectorized { AggregateFunctionPtr create_aggregate_function_approx_count_distinct( - const std::string& name, const DataTypes& argument_types, const bool result_is_nullable) { + const std::string& name, const DataTypes& argument_types, const bool result_is_nullable, + const AggregateFunctionAttr& attr) { WhichDataType which(remove_nullable(argument_types[0])); #define DISPATCH(TYPE, COLUMN_TYPE) \ diff --git a/be/src/exec/schema_scanner/schema_statistics_scanner.h b/be/src/vec/aggregate_functions/aggregate_function_approx_top.h similarity index 70% rename from be/src/exec/schema_scanner/schema_statistics_scanner.h rename to be/src/vec/aggregate_functions/aggregate_function_approx_top.h index 1a756512abf18a..7885321bba3e11 100644 --- a/be/src/exec/schema_scanner/schema_statistics_scanner.h +++ b/be/src/vec/aggregate_functions/aggregate_function_approx_top.h @@ -17,19 +17,13 @@ #pragma once -#include +#include "vec/core/types.h" -#include "exec/schema_scanner.h" - -namespace doris { -class SchemaStatisticsScanner : public SchemaScanner { - ENABLE_FACTORY_CREATOR(SchemaStatisticsScanner); +namespace doris::vectorized { +class AggregateFunctionApproxTop { public: - SchemaStatisticsScanner(); - ~SchemaStatisticsScanner() override; - -private: - static std::vector _s_cols_statistics; + static inline constexpr UInt64 TOP_K_MAX_SIZE = 0xFFFFFF; }; -} // namespace doris + +} // namespace doris::vectorized \ No newline at end of file diff --git a/be/src/vec/aggregate_functions/aggregate_function_approx_top_k.cpp b/be/src/vec/aggregate_functions/aggregate_function_approx_top_k.cpp new file mode 100644 index 00000000000000..d6298881a90630 --- /dev/null +++ b/be/src/vec/aggregate_functions/aggregate_function_approx_top_k.cpp @@ -0,0 +1,85 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
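A note on the verify_result_type helper introduced above: it accepts the planner-declared result type in exactly two situations (an exact match, or the no-group-by case where the planner conservatively declares the result AlwaysNullable and the backend wraps a ColumnNullable) and rejects everything else. A minimal standalone sketch of that decision order, using a hypothetical Type stand-in rather than Doris' real DataTypePtr:

#include <iostream>
#include <string>

// Hypothetical stand-in: a type is just a base-type name plus a nullability flag.
struct Type {
    std::string name;
    bool nullable;
    bool operator==(const Type& o) const { return name == o.name && nullable == o.nullable; }
};

// Mirrors the decision order of verify_result_type (sketch only):
// 1. exact match                       -> accepted
// 2. different base type               -> rejected
// 3. no GROUP BY key, planner nullable -> accepted (BE wraps a ColumnNullable)
// 4. grouped with nullability mismatch -> rejected
bool verify(bool without_key, const Type& planner, const Type& function) {
    if (function == planner) return true;
    if (function.name != planner.name) return false;
    if (without_key && planner.nullable) return true;
    return false;
}

int main() {
    std::cout << verify(true, {"INT", true}, {"INT", false})   // 1: tolerated
              << verify(false, {"INT", true}, {"INT", false})  // 0: rejected
              << '\n';
}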
+ +#include "vec/aggregate_functions/aggregate_function_approx_top_k.h" + +#include "common/exception.h" +#include "vec/aggregate_functions/aggregate_function_simple_factory.h" +#include "vec/aggregate_functions/helpers.h" +#include "vec/data_types/data_type.h" + +namespace doris::vectorized { + +int32_t is_valid_const_columns(const std::vector& is_const_columns) { + int32_t true_count = 0; + bool found_false_after_true = false; + for (int32_t i = is_const_columns.size() - 1; i >= 0; --i) { + if (is_const_columns[i]) { + true_count++; + if (found_false_after_true) { + return false; + } + } else { + if (true_count > 2) { + return false; + } + found_false_after_true = true; + } + } + if (true_count > 2) { + throw Exception(ErrorCode::INVALID_ARGUMENT, "Invalid is_const_columns configuration"); + } + return true_count; +} + +AggregateFunctionPtr create_aggregate_function_approx_top_k(const std::string& name, + const DataTypes& argument_types, + const bool result_is_nullable, + const AggregateFunctionAttr& attr) { + if (argument_types.empty()) { + return nullptr; + } + + std::vector is_const_columns; + std::vector column_names; + for (const auto& [name, is_const] : attr.column_infos) { + is_const_columns.push_back(is_const); + if (!is_const) { + column_names.push_back(name); + } + } + + int32_t true_count = is_valid_const_columns(is_const_columns); + if (true_count == 0) { + return creator_without_type::create>( + argument_types, result_is_nullable, column_names); + } else if (true_count == 1) { + return creator_without_type::create>( + argument_types, result_is_nullable, column_names); + } else if (true_count == 2) { + return creator_without_type::create>( + argument_types, result_is_nullable, column_names); + } else { + return nullptr; + } +} + +void register_aggregate_function_approx_top_k(AggregateFunctionSimpleFactory& factory) { + factory.register_function_both("approx_top_k", create_aggregate_function_approx_top_k); +} + +} // namespace doris::vectorized \ No newline at end of file diff --git a/be/src/vec/aggregate_functions/aggregate_function_approx_top_k.h b/be/src/vec/aggregate_functions/aggregate_function_approx_top_k.h new file mode 100644 index 00000000000000..7253ae8a96e200 --- /dev/null +++ b/be/src/vec/aggregate_functions/aggregate_function_approx_top_k.h @@ -0,0 +1,290 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#include +#include +#include +#include + +#include +#include + +#include "vec/aggregate_functions/aggregate_function.h" +#include "vec/aggregate_functions/aggregate_function_approx_top.h" +#include "vec/columns/column.h" +#include "vec/columns/column_array.h" +#include "vec/columns/column_string.h" +#include "vec/columns/column_struct.h" +#include "vec/columns/column_vector.h" +#include "vec/columns/columns_number.h" +#include "vec/common/assert_cast.h" +#include "vec/common/space_saving.h" +#include "vec/common/string_ref.h" +#include "vec/core/types.h" +#include "vec/data_types/data_type_array.h" +#include "vec/data_types/data_type_ipv4.h" +#include "vec/data_types/data_type_nullable.h" +#include "vec/data_types/data_type_struct.h" +#include "vec/io/io_helper.h" + +namespace doris::vectorized { + +inline constexpr UInt64 TOP_K_MAX_SIZE = 0xFFFFFF; + +struct AggregateFunctionTopKGenericData { + using Set = SpaceSaving; + + Set value; +}; + +template +class AggregateFunctionApproxTopK final + : public IAggregateFunctionDataHelper>, + AggregateFunctionApproxTop { +private: + using State = AggregateFunctionTopKGenericData; + +public: + AggregateFunctionApproxTopK(std::vector column_names, + const DataTypes& argument_types_) + : IAggregateFunctionDataHelper>(argument_types_), + _column_names(std::move(column_names)) {} + + String get_name() const override { return "approx_top_k"; } + + DataTypePtr get_return_type() const override { return std::make_shared(); } + + // Serializes the aggregate function's state (including the SpaceSaving structure and threshold) into a buffer. + void serialize(ConstAggregateDataPtr __restrict place, BufferWritable& buf) const override { + this->data(place).value.write(buf); + + write_var_uint(_column_names.size(), buf); + for (const auto& column_name : _column_names) { + write_string_binary(column_name, buf); + } + write_var_uint(_threshold, buf); + write_var_uint(_reserved, buf); + } + + // Deserializes the aggregate function's state from a buffer (including the SpaceSaving structure and threshold). 
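    // The wire layout matches serialize() above: the SpaceSaving set first,
    // then a var-uint count of column names followed by the names themselves,
    // then _threshold and _reserved as var-uints, read back in the same order.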
+ void deserialize(AggregateDataPtr __restrict place, BufferReadable& buf, + Arena* arena) const override { + auto readStringBinaryInto = [](Arena& arena, BufferReadable& buf) { + size_t size = 0; + read_var_uint(size, buf); + + if (UNLIKELY(size > DEFAULT_MAX_STRING_SIZE)) { + throw Exception(ErrorCode::INTERNAL_ERROR, "Too large string size."); + } + + char* data = arena.alloc(size); + buf.read(data, size); + + return StringRef(data, size); + }; + + auto& set = this->data(place).value; + set.clear(); + + size_t size = 0; + read_var_uint(size, buf); + if (UNLIKELY(size > TOP_K_MAX_SIZE)) { + throw Exception(ErrorCode::INTERNAL_ERROR, + "Too large size ({}) for aggregate function '{}' state (maximum is {})", + size, get_name(), TOP_K_MAX_SIZE); + } + + set.resize(size); + for (size_t i = 0; i < size; ++i) { + auto ref = readStringBinaryInto(*arena, buf); + uint64_t count = 0; + uint64_t error = 0; + read_var_uint(count, buf); + read_var_uint(error, buf); + set.insert(ref, count, error); + arena->rollback(ref.size); + } + + set.read_alpha_map(buf); + + uint64_t column_size = 0; + read_var_uint(column_size, buf); + _column_names.clear(); + for (uint64_t i = 0; i < column_size; i++) { + std::string column_name; + read_string_binary(column_name, buf); + _column_names.emplace_back(std::move(column_name)); + } + read_var_uint(_threshold, buf); + read_var_uint(_reserved, buf); + } + + // Adds a new row of data to the aggregate function (inserts a new value into the SpaceSaving structure). + void add(AggregateDataPtr __restrict place, const IColumn** columns, ssize_t row_num, + Arena* arena) const override { + if (!_init_flag) { + lazy_init(columns, row_num); + } + + auto& set = this->data(place).value; + if (set.capacity() != _reserved) { + set.resize(_reserved); + } + + auto all_serialize_value_into_arena = + [](size_t i, size_t keys_size, const IColumn** columns, Arena* arena) -> StringRef { + const char* begin = nullptr; + + size_t sum_size = 0; + for (size_t j = 0; j < keys_size; ++j) { + sum_size += columns[j]->serialize_value_into_arena(i, *arena, begin).size; + } + + return {begin, sum_size}; + }; + + StringRef str_serialized = + all_serialize_value_into_arena(row_num, _column_names.size(), columns, arena); + set.insert(str_serialized); + arena->rollback(str_serialized.size); + } + + void add_many(AggregateDataPtr __restrict place, const IColumn** columns, + std::vector& rows, Arena* arena) const override { + for (auto row : rows) { + add(place, columns, row, arena); + } + } + + void reset(AggregateDataPtr __restrict place) const override { + this->data(place).value.clear(); + } + + // Merges the state of another aggregate function into the current one (merges two SpaceSaving sets). 
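    // An empty rhs is a no-op; otherwise the destination set is resized to
    // _reserved first, so both sides merge under the same SpaceSaving capacity.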
+ void merge(AggregateDataPtr __restrict place, ConstAggregateDataPtr rhs, + Arena*) const override { + auto& rhs_set = this->data(rhs).value; + if (!rhs_set.size()) { + return; + } + + auto& set = this->data(place).value; + if (set.capacity() != _reserved) { + set.resize(_reserved); + } + set.merge(rhs_set); + } + + void insert_result_into(ConstAggregateDataPtr __restrict place, IColumn& to) const override { + auto& data_to = assert_cast(to); + + const typename State::Set& set = this->data(place).value; + auto result_vec = set.top_k(_threshold); + + rapidjson::StringBuffer buffer; + rapidjson::PrettyWriter writer(buffer); + writer.StartArray(); + for (auto& result : result_vec) { + auto argument_types = this->get_argument_types(); + MutableColumns argument_columns(_column_names.size()); + for (size_t i = 0; i < _column_names.size(); ++i) { + argument_columns[i] = argument_types[i]->create_column(); + } + rapidjson::StringBuffer sub_buffer; + rapidjson::Writer sub_writer(sub_buffer); + sub_writer.StartObject(); + const char* begin = result.key.data; + for (size_t i = 0; i < _column_names.size(); i++) { + begin = argument_columns[i]->deserialize_and_insert_from_arena(begin); + std::string row_str = argument_types[i]->to_string(*argument_columns[i], 0); + sub_writer.Key(_column_names[i].data(), _column_names[i].size()); + sub_writer.String(row_str.data(), row_str.size()); + } + sub_writer.Key("count"); + sub_writer.String(std::to_string(result.count).c_str()); + sub_writer.EndObject(); + writer.RawValue(sub_buffer.GetString(), sub_buffer.GetSize(), rapidjson::kObjectType); + } + writer.EndArray(); + std::string res = buffer.GetString(); + data_to.insert_data(res.data(), res.size()); + } + +private: + void lazy_init(const IColumn** columns, ssize_t row_num) const { + auto get_param = [](size_t idx, const DataTypes& data_types, + const IColumn** columns) -> uint64_t { + const auto& data_type = data_types.at(idx); + const IColumn* column = columns[idx]; + + const auto* type = data_type.get(); + if (type->is_nullable()) { + type = assert_cast(type) + ->get_nested_type() + .get(); + } + int64_t value = 0; + WhichDataType which(type); + if (which.idx == TypeIndex::Int8) { + value = assert_cast(column) + ->get_element(0); + } else if (which.idx == TypeIndex::Int16) { + value = assert_cast(column) + ->get_element(0); + } else if (which.idx == TypeIndex::Int32) { + value = assert_cast(column) + ->get_element(0); + } + if (value <= 0) { + throw Exception(ErrorCode::INVALID_ARGUMENT, + "The parameter cannot be less than or equal to 0."); + } + return value; + }; + + const auto& data_types = this->get_argument_types(); + if (ArgsSize == 1) { + _threshold = + std::min(get_param(_column_names.size(), data_types, columns), (uint64_t)1000); + } else if (ArgsSize == 2) { + _threshold = + std::min(get_param(_column_names.size(), data_types, columns), (uint64_t)1000); + _reserved = std::min( + std::max(get_param(_column_names.size() + 1, data_types, columns), _threshold), + (uint64_t)1000); + } + + if (_threshold == 0 || _reserved == 0 || _threshold > 1000 || _reserved > 1000) { + throw Exception(ErrorCode::INTERNAL_ERROR, + "approx_top_k param error, _threshold: {}, _reserved: {}", _threshold, + _reserved); + } + + _init_flag = true; + } + + mutable std::vector _column_names; + mutable bool _init_flag = false; + mutable uint64_t _threshold = 10; + mutable uint64_t _reserved = 300; +}; + +} // namespace doris::vectorized \ No newline at end of file diff --git 
a/be/src/vec/aggregate_functions/aggregate_function_avg.cpp b/be/src/vec/aggregate_functions/aggregate_function_avg.cpp index 0f3d0fd3bdad6b..6a6711f90f983e 100644 --- a/be/src/vec/aggregate_functions/aggregate_function_avg.cpp +++ b/be/src/vec/aggregate_functions/aggregate_function_avg.cpp @@ -45,8 +45,17 @@ template using AggregateFuncAvgDecimal256 = typename AvgDecimal256::Function; void register_aggregate_function_avg(AggregateFunctionSimpleFactory& factory) { - factory.register_function_both("avg", creator_with_type::creator); - factory.register_function_both("avg_decimal256", - creator_with_type::creator); + AggregateFunctionCreator creator = [&](const std::string& name, const DataTypes& types, + const bool result_is_nullable, + const AggregateFunctionAttr& attr) { + if (attr.enable_decimal256) { + return creator_with_type::creator(name, types, + result_is_nullable, attr); + } else { + return creator_with_type::creator(name, types, result_is_nullable, + attr); + } + }; + factory.register_function_both("avg", creator); } } // namespace doris::vectorized diff --git a/be/src/vec/aggregate_functions/aggregate_function_avg.h b/be/src/vec/aggregate_functions/aggregate_function_avg.h index 8a18a88839b4db..62fbb8078ea949 100644 --- a/be/src/vec/aggregate_functions/aggregate_function_avg.h +++ b/be/src/vec/aggregate_functions/aggregate_function_avg.h @@ -184,7 +184,7 @@ class AggregateFunctionAvg final column.get_data().push_back(this->data(place).template result()); } - void deserialize_from_column(AggregateDataPtr places, const IColumn& column, Arena* arena, + void deserialize_from_column(AggregateDataPtr places, const IColumn& column, Arena*, size_t num_rows) const override { auto& col = assert_cast(column); DCHECK(col.size() >= num_rows) << "source column's size should greater than num_rows"; @@ -205,7 +205,7 @@ class AggregateFunctionAvg final } void streaming_agg_serialize_to_column(const IColumn** columns, MutableColumnPtr& dst, - const size_t num_rows, Arena* arena) const override { + const size_t num_rows, Arena*) const override { auto* src_data = assert_cast(*columns[0]).get_data().data(); auto& dst_col = assert_cast(*dst); dst_col.set_item_size(sizeof(Data)); @@ -219,7 +219,7 @@ class AggregateFunctionAvg final } void deserialize_and_merge_from_column(AggregateDataPtr __restrict place, const IColumn& column, - Arena* arena) const override { + Arena*) const override { auto& col = assert_cast(column); const size_t num_rows = column.size(); DCHECK(col.size() >= num_rows) << "source column's size should greater than num_rows"; @@ -233,7 +233,7 @@ class AggregateFunctionAvg final void deserialize_and_merge_from_column_range(AggregateDataPtr __restrict place, const IColumn& column, size_t begin, size_t end, - Arena* arena) const override { + Arena*) const override { DCHECK(end <= column.size() && begin <= end) << ", begin:" << begin << ", end:" << end << ", column.size():" << column.size(); auto& col = assert_cast(column); @@ -245,19 +245,19 @@ class AggregateFunctionAvg final } void deserialize_and_merge_vec(const AggregateDataPtr* places, size_t offset, - AggregateDataPtr rhs, const IColumn* column, Arena* arena, + AggregateDataPtr rhs, const IColumn* column, Arena*, const size_t num_rows) const override { - this->deserialize_from_column(rhs, *column, arena, num_rows); + this->deserialize_from_column(rhs, *column, nullptr, num_rows); DEFER({ this->destroy_vec(rhs, num_rows); }); - this->merge_vec(places, offset, rhs, arena, num_rows); + this->merge_vec(places, offset, rhs, nullptr, 
num_rows); } void deserialize_and_merge_vec_selected(const AggregateDataPtr* places, size_t offset, - AggregateDataPtr rhs, const IColumn* column, - Arena* arena, const size_t num_rows) const override { - this->deserialize_from_column(rhs, *column, arena, num_rows); + AggregateDataPtr rhs, const IColumn* column, Arena*, + const size_t num_rows) const override { + this->deserialize_from_column(rhs, *column, nullptr, num_rows); DEFER({ this->destroy_vec(rhs, num_rows); }); - this->merge_vec_selected(places, offset, rhs, arena, num_rows); + this->merge_vec_selected(places, offset, rhs, nullptr, num_rows); } void serialize_without_key_to_column(ConstAggregateDataPtr __restrict place, diff --git a/be/src/vec/aggregate_functions/aggregate_function_bitmap.cpp b/be/src/vec/aggregate_functions/aggregate_function_bitmap.cpp index 0676fd5bc27090..e9c86d4b9556da 100644 --- a/be/src/vec/aggregate_functions/aggregate_function_bitmap.cpp +++ b/be/src/vec/aggregate_functions/aggregate_function_bitmap.cpp @@ -40,9 +40,9 @@ AggregateFunctionPtr create_with_int_data_type(const DataTypes& argument_type) { return nullptr; } -AggregateFunctionPtr create_aggregate_function_bitmap_union_count(const std::string& name, - const DataTypes& argument_types, - const bool result_is_nullable) { +AggregateFunctionPtr create_aggregate_function_bitmap_union_count( + const std::string& name, const DataTypes& argument_types, const bool result_is_nullable, + const AggregateFunctionAttr& attr) { const bool arg_is_nullable = argument_types[0]->is_nullable(); if (arg_is_nullable) { return std::make_shared>(argument_types); @@ -53,7 +53,8 @@ AggregateFunctionPtr create_aggregate_function_bitmap_union_count(const std::str AggregateFunctionPtr create_aggregate_function_bitmap_union_int(const std::string& name, const DataTypes& argument_types, - const bool result_is_nullable) { + const bool result_is_nullable, + const AggregateFunctionAttr& attr) { const bool arg_is_nullable = argument_types[0]->is_nullable(); if (arg_is_nullable) { return AggregateFunctionPtr( diff --git a/be/src/vec/aggregate_functions/aggregate_function_bitmap.h b/be/src/vec/aggregate_functions/aggregate_function_bitmap.h index 6c504b91bf4abd..b0619a63e1ffe8 100644 --- a/be/src/vec/aggregate_functions/aggregate_function_bitmap.h +++ b/be/src/vec/aggregate_functions/aggregate_function_bitmap.h @@ -159,7 +159,7 @@ class AggregateFunctionBitmapSerializationHelper : IAggregateFunctionDataHelper(argument_types_) {} void streaming_agg_serialize_to_column(const IColumn** columns, MutableColumnPtr& dst, - const size_t num_rows, Arena* arena) const override { + const size_t num_rows, Arena*) const override { if (version >= BITMAP_SERDE) { auto& col = assert_cast(*dst); char place[sizeof(Data)]; @@ -171,11 +171,11 @@ class AggregateFunctionBitmapSerializationHelper assert_cast(this)->destroy(place); }); assert_cast(this)->add(place, columns, - i, arena); + i, nullptr); data[i] = std::move(this->data(place).value); } } else { - BaseHelper::streaming_agg_serialize_to_column(columns, dst, num_rows, arena); + BaseHelper::streaming_agg_serialize_to_column(columns, dst, num_rows, nullptr); } } @@ -194,7 +194,7 @@ class AggregateFunctionBitmapSerializationHelper } void deserialize_and_merge_from_column(AggregateDataPtr __restrict place, const IColumn& column, - Arena* arena) const override { + Arena*) const override { if (version >= BITMAP_SERDE) { auto& col = assert_cast(column); const size_t num_rows = column.size(); @@ -204,13 +204,13 @@ class 
AggregateFunctionBitmapSerializationHelper this->data(place).merge(data[i]); } } else { - BaseHelper::deserialize_and_merge_from_column(place, column, arena); + BaseHelper::deserialize_and_merge_from_column(place, column, nullptr); } } void deserialize_and_merge_from_column_range(AggregateDataPtr __restrict place, const IColumn& column, size_t begin, size_t end, - Arena* arena) const override { + Arena*) const override { DCHECK(end <= column.size() && begin <= end) << ", begin:" << begin << ", end:" << end << ", column.size():" << column.size(); if (version >= BITMAP_SERDE) { @@ -220,12 +220,12 @@ class AggregateFunctionBitmapSerializationHelper this->data(place).merge(data[i]); } } else { - BaseHelper::deserialize_and_merge_from_column_range(place, column, begin, end, arena); + BaseHelper::deserialize_and_merge_from_column_range(place, column, begin, end, nullptr); } } void deserialize_and_merge_vec(const AggregateDataPtr* places, size_t offset, - AggregateDataPtr rhs, const IColumn* column, Arena* arena, + AggregateDataPtr rhs, const IColumn* column, Arena*, const size_t num_rows) const override { if (version >= BITMAP_SERDE) { const auto& col = assert_cast(*column); @@ -234,13 +234,13 @@ class AggregateFunctionBitmapSerializationHelper this->data(places[i] + offset).merge(data[i]); } } else { - BaseHelper::deserialize_and_merge_vec(places, offset, rhs, column, arena, num_rows); + BaseHelper::deserialize_and_merge_vec(places, offset, rhs, column, nullptr, num_rows); } } void deserialize_and_merge_vec_selected(const AggregateDataPtr* places, size_t offset, - AggregateDataPtr rhs, const IColumn* column, - Arena* arena, const size_t num_rows) const override { + AggregateDataPtr rhs, const IColumn* column, Arena*, + const size_t num_rows) const override { if (version >= BITMAP_SERDE) { const auto& col = assert_cast(*column); const auto* data = col.get_data().data(); @@ -250,7 +250,7 @@ class AggregateFunctionBitmapSerializationHelper } } } else { - BaseHelper::deserialize_and_merge_vec_selected(places, offset, rhs, column, arena, + BaseHelper::deserialize_and_merge_vec_selected(places, offset, rhs, column, nullptr, num_rows); } } diff --git a/be/src/vec/aggregate_functions/aggregate_function_bitmap_agg.cpp b/be/src/vec/aggregate_functions/aggregate_function_bitmap_agg.cpp index b8ae4c6530d575..0b95ddfd46f0d5 100644 --- a/be/src/vec/aggregate_functions/aggregate_function_bitmap_agg.cpp +++ b/be/src/vec/aggregate_functions/aggregate_function_bitmap_agg.cpp @@ -41,7 +41,8 @@ AggregateFunctionPtr create_with_int_data_type(const DataTypes& argument_types) AggregateFunctionPtr create_aggregate_function_bitmap_agg(const std::string& name, const DataTypes& argument_types, - const bool result_is_nullable) { + const bool result_is_nullable, + const AggregateFunctionAttr& attr) { const bool arg_is_nullable = argument_types[0]->is_nullable(); if (arg_is_nullable) { return AggregateFunctionPtr(create_with_int_data_type(argument_types)); diff --git a/be/src/vec/aggregate_functions/aggregate_function_bitmap_agg.h b/be/src/vec/aggregate_functions/aggregate_function_bitmap_agg.h index 19352e022fa7a2..5747faf1b8e8c1 100644 --- a/be/src/vec/aggregate_functions/aggregate_function_bitmap_agg.h +++ b/be/src/vec/aggregate_functions/aggregate_function_bitmap_agg.h @@ -72,7 +72,7 @@ class AggregateFunctionBitmapAgg final DataTypePtr get_return_type() const override { return std::make_shared(); } void add(AggregateDataPtr __restrict place, const IColumn** columns, ssize_t row_num, - Arena* arena) const override { + 
Arena*) const override { DCHECK_LT(row_num, columns[0]->size()); if constexpr (arg_nullable) { auto& nullable_col = @@ -90,7 +90,7 @@ class AggregateFunctionBitmapAgg final } void add_batch_single_place(size_t batch_size, AggregateDataPtr place, const IColumn** columns, - Arena* arena) const override { + Arena*) const override { if constexpr (arg_nullable) { auto& nullable_column = assert_cast(*columns[0]); const auto& column = @@ -111,7 +111,7 @@ class AggregateFunctionBitmapAgg final void reset(AggregateDataPtr place) const override { this->data(place).reset(); } void merge(AggregateDataPtr __restrict place, ConstAggregateDataPtr rhs, - Arena* arena) const override { + Arena*) const override { this->data(place).merge(this->data(rhs)); } @@ -130,7 +130,7 @@ class AggregateFunctionBitmapAgg final } void streaming_agg_serialize_to_column(const IColumn** columns, MutableColumnPtr& dst, - const size_t num_rows, Arena* arena) const override { + const size_t num_rows, Arena*) const override { auto& col = assert_cast(*dst); char place[sizeof(Data)]; col.resize(num_rows); @@ -138,12 +138,12 @@ class AggregateFunctionBitmapAgg final for (size_t i = 0; i != num_rows; ++i) { this->create(place); DEFER({ this->destroy(place); }); - this->add(place, columns, i, arena); + this->add(place, columns, i, nullptr); data[i] = std::move(this->data(place).value); } } - void deserialize_from_column(AggregateDataPtr places, const IColumn& column, Arena* arena, + void deserialize_from_column(AggregateDataPtr places, const IColumn& column, Arena*, size_t num_rows) const override { auto& col = assert_cast(column); DCHECK(col.size() >= num_rows) << "source column's size should greater than num_rows"; @@ -165,7 +165,7 @@ class AggregateFunctionBitmapAgg final } void deserialize_and_merge_from_column(AggregateDataPtr __restrict place, const IColumn& column, - Arena* arena) const override { + Arena*) const override { auto& col = assert_cast(column); const size_t num_rows = column.size(); auto* data = col.get_data().data(); @@ -177,7 +177,7 @@ class AggregateFunctionBitmapAgg final void deserialize_and_merge_from_column_range(AggregateDataPtr __restrict place, const IColumn& column, size_t begin, size_t end, - Arena* arena) const override { + Arena*) const override { DCHECK(end <= column.size() && begin <= end) << ", begin:" << begin << ", end:" << end << ", column.size():" << column.size(); auto& col = assert_cast(column); @@ -188,7 +188,7 @@ class AggregateFunctionBitmapAgg final } void deserialize_and_merge_vec(const AggregateDataPtr* places, size_t offset, - AggregateDataPtr rhs, const IColumn* column, Arena* arena, + AggregateDataPtr rhs, const IColumn* column, Arena*, const size_t num_rows) const override { const auto& col = assert_cast(*column); const auto* data = col.get_data().data(); @@ -198,8 +198,8 @@ class AggregateFunctionBitmapAgg final } void deserialize_and_merge_vec_selected(const AggregateDataPtr* places, size_t offset, - AggregateDataPtr rhs, const IColumn* column, - Arena* arena, const size_t num_rows) const override { + AggregateDataPtr rhs, const IColumn* column, Arena*, + const size_t num_rows) const override { const auto& col = assert_cast(*column); const auto* data = col.get_data().data(); for (size_t i = 0; i != num_rows; ++i) { diff --git a/be/src/vec/aggregate_functions/aggregate_function_collect.cpp b/be/src/vec/aggregate_functions/aggregate_function_collect.cpp index 4fcf09b59b33c6..d726b7c6355318 100644 --- a/be/src/vec/aggregate_functions/aggregate_function_collect.cpp +++ 
b/be/src/vec/aggregate_functions/aggregate_function_collect.cpp @@ -96,7 +96,8 @@ AggregateFunctionPtr create_aggregate_function_collect_impl(const std::string& n AggregateFunctionPtr create_aggregate_function_collect(const std::string& name, const DataTypes& argument_types, - const bool result_is_nullable) { + const bool result_is_nullable, + const AggregateFunctionAttr& attr) { if (argument_types.size() == 1) { if (name == "array_agg") { return create_aggregate_function_collect_impl( diff --git a/be/src/vec/aggregate_functions/aggregate_function_collect.h b/be/src/vec/aggregate_functions/aggregate_function_collect.h index 68de426ea1fdcf..02490be56a0bf1 100644 --- a/be/src/vec/aggregate_functions/aggregate_function_collect.h +++ b/be/src/vec/aggregate_functions/aggregate_function_collect.h @@ -35,7 +35,6 @@ #include "vec/columns/column_string.h" #include "vec/columns/columns_number.h" #include "vec/common/assert_cast.h" -#include "vec/common/hash_table/hash_set.h" #include "vec/common/pod_array_fwd.h" #include "vec/common/string_buffer.hpp" #include "vec/common/string_ref.h" @@ -62,7 +61,7 @@ struct AggregateFunctionCollectSetData { using ColVecType = ColumnVectorOrDecimal; using ElementNativeType = typename NativeType::Type; using SelfType = AggregateFunctionCollectSetData; - using Set = HashSetWithStackMemory, 4>; + using Set = phmap::flat_hash_set; Set data_set; Int64 max_size = -1; @@ -83,20 +82,29 @@ struct AggregateFunctionCollectSetData { if (size() >= max_size) { return; } - data_set.insert(rhs_elem.get_value()); + data_set.insert(rhs_elem); } } else { - data_set.merge(rhs.data_set); + data_set.merge(Set(rhs.data_set)); } } void write(BufferWritable& buf) const { - data_set.write(buf); + write_var_uint(data_set.size(), buf); + for (const auto& value : data_set) { + write_binary(value, buf); + } write_var_int(max_size, buf); } void read(BufferReadable& buf) { - data_set.read(buf); + size_t new_size = 0; + read_var_uint(new_size, buf); + ElementNativeType x; + for (size_t i = 0; i < new_size; ++i) { + read_binary(x, buf); + data_set.insert(x); + } read_var_int(max_size, buf); } @@ -104,7 +112,7 @@ struct AggregateFunctionCollectSetData { auto& vec = assert_cast(to).get_data(); vec.reserve(size()); for (const auto& item : data_set) { - vec.push_back(item.key); + vec.push_back(item); } } @@ -116,23 +124,19 @@ struct AggregateFunctionCollectSetData { using ElementType = StringRef; using ColVecType = ColumnString; using SelfType = AggregateFunctionCollectSetData; - using Set = HashSetWithStackMemory, 4>; + using Set = phmap::flat_hash_set; Set data_set; Int64 max_size = -1; size_t size() const { return data_set.size(); } void add(const IColumn& column, size_t row_num, Arena* arena) { - Set::LookupResult it; - bool inserted; auto key = column.get_data_at(row_num); key.data = arena->insert(key.data, key.size); - data_set.emplace(key, it, inserted); + data_set.insert(key); } void merge(const SelfType& rhs, Arena* arena) { - bool inserted; - Set::LookupResult it; if (max_size == -1) { max_size = rhs.max_size; } @@ -145,16 +149,16 @@ struct AggregateFunctionCollectSetData { } } assert(arena != nullptr); - StringRef key = rhs_elem.get_value(); + StringRef key = rhs_elem; key.data = arena->insert(key.data, key.size); - data_set.emplace(key, it, inserted); + data_set.insert(key); } } void write(BufferWritable& buf) const { write_var_uint(size(), buf); for (const auto& elem : data_set) { - write_string_binary(elem.get_value(), buf); + write_string_binary(elem, buf); } write_var_int(max_size, 
buf); } @@ -174,7 +178,7 @@ struct AggregateFunctionCollectSetData { auto& vec = assert_cast(to); vec.reserve(size()); for (const auto& item : data_set) { - vec.insert_data(item.key.data, item.key.size); + vec.insert_data(item.data, item.size); } } diff --git a/be/src/vec/aggregate_functions/aggregate_function_corr.cpp b/be/src/vec/aggregate_functions/aggregate_function_corr.cpp index a454afb45f22e0..cdaab6e086f4a5 100644 --- a/be/src/vec/aggregate_functions/aggregate_function_corr.cpp +++ b/be/src/vec/aggregate_functions/aggregate_function_corr.cpp @@ -89,7 +89,8 @@ struct CorrMoment { AggregateFunctionPtr create_aggregate_corr_function(const std::string& name, const DataTypes& argument_types, - const bool result_is_nullable) { + const bool result_is_nullable, + const AggregateFunctionAttr& attr) { assert_binary(name, argument_types); return create_with_two_basic_numeric_types(argument_types[0], argument_types[1], argument_types, result_is_nullable); diff --git a/be/src/vec/aggregate_functions/aggregate_function_count.cpp b/be/src/vec/aggregate_functions/aggregate_function_count.cpp index 8c54714b046da1..5cfe5af41982f6 100644 --- a/be/src/vec/aggregate_functions/aggregate_function_count.cpp +++ b/be/src/vec/aggregate_functions/aggregate_function_count.cpp @@ -29,15 +29,16 @@ namespace doris::vectorized { AggregateFunctionPtr create_aggregate_function_count(const std::string& name, const DataTypes& argument_types, - const bool result_is_nullable) { + const bool result_is_nullable, + const AggregateFunctionAttr& attr) { assert_arity_at_most<1>(name, argument_types); return std::make_shared(argument_types); } -AggregateFunctionPtr create_aggregate_function_count_not_null_unary(const std::string& name, - const DataTypes& argument_types, - const bool result_is_nullable) { +AggregateFunctionPtr create_aggregate_function_count_not_null_unary( + const std::string& name, const DataTypes& argument_types, const bool result_is_nullable, + const AggregateFunctionAttr& attr) { assert_arity_at_most<1>(name, argument_types); return std::make_shared(argument_types); diff --git a/be/src/vec/aggregate_functions/aggregate_function_count.h b/be/src/vec/aggregate_functions/aggregate_function_count.h index 62aa869771c0a5..7b54d074683b04 100644 --- a/be/src/vec/aggregate_functions/aggregate_function_count.h +++ b/be/src/vec/aggregate_functions/aggregate_function_count.h @@ -91,7 +91,7 @@ class AggregateFunctionCount final assert_cast(to).get_data().push_back(data(place).count); } - void deserialize_from_column(AggregateDataPtr places, const IColumn& column, Arena* arena, + void deserialize_from_column(AggregateDataPtr places, const IColumn& column, Arena*, size_t num_rows) const override { auto data = assert_cast(column).get_data().data(); memcpy(places, data, sizeof(Data) * num_rows); @@ -111,7 +111,7 @@ class AggregateFunctionCount final } void streaming_agg_serialize_to_column(const IColumn** columns, MutableColumnPtr& dst, - const size_t num_rows, Arena* arena) const override { + const size_t num_rows, Arena*) const override { auto& dst_col = assert_cast(*dst); DCHECK(dst_col.item_size() == sizeof(Data)) << "size is not equal: " << dst_col.item_size() << " " << sizeof(Data); @@ -124,7 +124,7 @@ class AggregateFunctionCount final } void deserialize_and_merge_from_column(AggregateDataPtr __restrict place, const IColumn& column, - Arena* arena) const override { + Arena*) const override { auto& col = assert_cast(column); const size_t num_rows = column.size(); auto* data = reinterpret_cast(col.get_data().data()); 
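AggregateFunctionCount's batched (de)serialization above exploits the fact that its state is a trivially copyable struct: a whole batch of states round-trips through a fixed-length column with a single memcpy instead of a per-row encode/decode. A self-contained sketch of the same idea, with a hypothetical CountData state:

#include <cstdint>
#include <cstring>
#include <vector>

struct CountData {
    int64_t count = 0; // trivially copyable, so raw byte copies are safe
};

// Serialize a batch of states into one contiguous byte buffer.
std::vector<char> serialize_states(const CountData* states, size_t n) {
    std::vector<char> buf(sizeof(CountData) * n);
    std::memcpy(buf.data(), states, buf.size());
    return buf;
}

// Restore a batch of states from the buffer produced above.
void deserialize_states(CountData* states, const std::vector<char>& buf, size_t n) {
    std::memcpy(states, buf.data(), sizeof(CountData) * n);
}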
@@ -135,7 +135,7 @@ class AggregateFunctionCount final void deserialize_and_merge_from_column_range(AggregateDataPtr __restrict place, const IColumn& column, size_t begin, size_t end, - Arena* arena) const override { + Arena*) const override { DCHECK(end <= column.size() && begin <= end) << ", begin:" << begin << ", end:" << end << ", column.size():" << column.size(); auto& col = assert_cast(column); @@ -146,19 +146,19 @@ class AggregateFunctionCount final } void deserialize_and_merge_vec(const AggregateDataPtr* places, size_t offset, - AggregateDataPtr rhs, const IColumn* column, Arena* arena, + AggregateDataPtr rhs, const IColumn* column, Arena*, const size_t num_rows) const override { - this->deserialize_from_column(rhs, *column, arena, num_rows); + this->deserialize_from_column(rhs, *column, nullptr, num_rows); DEFER({ this->destroy_vec(rhs, num_rows); }); - this->merge_vec(places, offset, rhs, arena, num_rows); + this->merge_vec(places, offset, rhs, nullptr, num_rows); } void deserialize_and_merge_vec_selected(const AggregateDataPtr* places, size_t offset, - AggregateDataPtr rhs, const IColumn* column, - Arena* arena, const size_t num_rows) const override { - this->deserialize_from_column(rhs, *column, arena, num_rows); + AggregateDataPtr rhs, const IColumn* column, Arena*, + const size_t num_rows) const override { + this->deserialize_from_column(rhs, *column, nullptr, num_rows); DEFER({ this->destroy_vec(rhs, num_rows); }); - this->merge_vec_selected(places, offset, rhs, arena, num_rows); + this->merge_vec_selected(places, offset, rhs, nullptr, num_rows); } void serialize_without_key_to_column(ConstAggregateDataPtr __restrict place, @@ -229,7 +229,7 @@ class AggregateFunctionCountNotNullUnary final } } - void deserialize_from_column(AggregateDataPtr places, const IColumn& column, Arena* arena, + void deserialize_from_column(AggregateDataPtr places, const IColumn& column, Arena*, size_t num_rows) const override { auto data = assert_cast(column).get_data().data(); memcpy(places, data, sizeof(Data) * num_rows); @@ -249,7 +249,7 @@ class AggregateFunctionCountNotNullUnary final } void streaming_agg_serialize_to_column(const IColumn** columns, MutableColumnPtr& dst, - const size_t num_rows, Arena* arena) const override { + const size_t num_rows, Arena*) const override { auto& col = assert_cast(*dst); DCHECK(col.item_size() == sizeof(Data)) << "size is not equal: " << col.item_size() << " " << sizeof(Data); @@ -263,7 +263,7 @@ class AggregateFunctionCountNotNullUnary final } void deserialize_and_merge_from_column(AggregateDataPtr __restrict place, const IColumn& column, - Arena* arena) const override { + Arena*) const override { auto& col = assert_cast(column); const size_t num_rows = column.size(); auto* data = reinterpret_cast(col.get_data().data()); @@ -274,7 +274,7 @@ class AggregateFunctionCountNotNullUnary final void deserialize_and_merge_from_column_range(AggregateDataPtr __restrict place, const IColumn& column, size_t begin, size_t end, - Arena* arena) const override { + Arena*) const override { DCHECK(end <= column.size() && begin <= end) << ", begin:" << begin << ", end:" << end << ", column.size():" << column.size(); auto& col = assert_cast(column); @@ -286,19 +286,19 @@ class AggregateFunctionCountNotNullUnary final } void deserialize_and_merge_vec(const AggregateDataPtr* places, size_t offset, - AggregateDataPtr rhs, const IColumn* column, Arena* arena, + AggregateDataPtr rhs, const IColumn* column, Arena*, const size_t num_rows) const override { - 
this->deserialize_from_column(rhs, *column, arena, num_rows); + this->deserialize_from_column(rhs, *column, nullptr, num_rows); DEFER({ this->destroy_vec(rhs, num_rows); }); - this->merge_vec(places, offset, rhs, arena, num_rows); + this->merge_vec(places, offset, rhs, nullptr, num_rows); } void deserialize_and_merge_vec_selected(const AggregateDataPtr* places, size_t offset, - AggregateDataPtr rhs, const IColumn* column, - Arena* arena, const size_t num_rows) const override { - this->deserialize_from_column(rhs, *column, arena, num_rows); + AggregateDataPtr rhs, const IColumn* column, Arena*, + const size_t num_rows) const override { + this->deserialize_from_column(rhs, *column, nullptr, num_rows); DEFER({ this->destroy_vec(rhs, num_rows); }); - this->merge_vec_selected(places, offset, rhs, arena, num_rows); + this->merge_vec_selected(places, offset, rhs, nullptr, num_rows); } void serialize_without_key_to_column(ConstAggregateDataPtr __restrict place, diff --git a/be/src/vec/aggregate_functions/aggregate_function_count_by_enum.cpp b/be/src/vec/aggregate_functions/aggregate_function_count_by_enum.cpp index 1a0bf2518202f3..093b31d57db554 100644 --- a/be/src/vec/aggregate_functions/aggregate_function_count_by_enum.cpp +++ b/be/src/vec/aggregate_functions/aggregate_function_count_by_enum.cpp @@ -29,7 +29,8 @@ namespace doris::vectorized { AggregateFunctionPtr create_aggregate_function_count_by_enum(const std::string& name, const DataTypes& argument_types, - const bool result_is_nullable) { + const bool result_is_nullable, + const AggregateFunctionAttr& attr) { if (argument_types.size() < 1) { LOG(WARNING) << fmt::format("Illegal number {} of argument for aggregate function {}", argument_types.size(), name); diff --git a/be/src/vec/aggregate_functions/aggregate_function_count_by_enum.h b/be/src/vec/aggregate_functions/aggregate_function_count_by_enum.h index 5d4a3dde3550a1..1f5093de68263e 100644 --- a/be/src/vec/aggregate_functions/aggregate_function_count_by_enum.h +++ b/be/src/vec/aggregate_functions/aggregate_function_count_by_enum.h @@ -197,7 +197,7 @@ class AggregateFunctionCountByEnum final DataTypePtr get_return_type() const override { return std::make_shared(); } void add(AggregateDataPtr __restrict place, const IColumn** columns, ssize_t row_num, - Arena* arena) const override { + Arena*) const override { for (int i = 0; i < arg_count; i++) { const auto* nullable_column = check_and_get_column(columns[i]); if (nullable_column == nullptr) { @@ -217,7 +217,7 @@ class AggregateFunctionCountByEnum final void reset(AggregateDataPtr place) const override { this->data(place).reset(); } void merge(AggregateDataPtr __restrict place, ConstAggregateDataPtr rhs, - Arena* arena) const override { + Arena*) const override { this->data(place).merge(this->data(rhs)); } diff --git a/be/src/vec/aggregate_functions/aggregate_function_covar.cpp b/be/src/vec/aggregate_functions/aggregate_function_covar.cpp index 76a2881dd78280..71d09f61de4302 100644 --- a/be/src/vec/aggregate_functions/aggregate_function_covar.cpp +++ b/be/src/vec/aggregate_functions/aggregate_function_covar.cpp @@ -51,25 +51,18 @@ AggregateFunctionPtr create_function_single_value(const String& name, return nullptr; } -template -AggregateFunctionPtr create_aggregate_function_covariance_samp_old(const std::string& name, - const DataTypes& argument_types, - const bool result_is_nullable) { - return create_function_single_value(name, argument_types, result_is_nullable, - NULLABLE); -} - AggregateFunctionPtr 
create_aggregate_function_covariance_samp(const std::string& name, const DataTypes& argument_types, - const bool result_is_nullable) { + const bool result_is_nullable, + const AggregateFunctionAttr& attr) { return create_function_single_value( name, argument_types, result_is_nullable, NOTNULLABLE); } AggregateFunctionPtr create_aggregate_function_covariance_pop(const std::string& name, const DataTypes& argument_types, - const bool result_is_nullable) { + const bool result_is_nullable, + const AggregateFunctionAttr& attr) { return create_function_single_value( name, argument_types, result_is_nullable, NOTNULLABLE); } @@ -80,12 +73,7 @@ void register_aggregate_function_covar_pop(AggregateFunctionSimpleFactory& facto } void register_aggregate_function_covar_samp_old(AggregateFunctionSimpleFactory& factory) { - factory.register_alternative_function( - "covar_samp", create_aggregate_function_covariance_samp_old, false, - AGG_FUNCTION_NULLABLE); - factory.register_alternative_function("covar_samp", - create_aggregate_function_covariance_samp_old, - true, AGG_FUNCTION_NULLABLE); + BeExecVersionManager::registe_restrict_function_compatibility("covar_samp"); } void register_aggregate_function_covar_samp(AggregateFunctionSimpleFactory& factory) { diff --git a/be/src/vec/aggregate_functions/aggregate_function_covar.h b/be/src/vec/aggregate_functions/aggregate_function_covar.h index 78a3eae5bcb4e9..e6ebec70285d72 100644 --- a/be/src/vec/aggregate_functions/aggregate_function_covar.h +++ b/be/src/vec/aggregate_functions/aggregate_function_covar.h @@ -137,32 +137,6 @@ struct PopData : Data { static DataTypePtr get_return_type() { return std::make_shared>(); } }; -template -struct SampData_OLDER : Data { - void insert_result_into(IColumn& to) const { - if (to.is_nullable()) { - ColumnNullable& nullable_column = assert_cast(to); - if (this->count == 1 || this->count == 0) { - nullable_column.insert_default(); - } else { - auto& col = assert_cast(nullable_column.get_nested_column()); - col.get_data().push_back(this->get_samp_result()); - nullable_column.get_null_map_data().push_back(0); - } - } else { - auto& col = assert_cast(to); - if (this->count == 1 || this->count == 0) { - col.insert_default(); - } else { - col.get_data().push_back(this->get_samp_result()); - } - } - } - static DataTypePtr get_return_type() { - return make_nullable(std::make_shared>()); - } -}; - template struct SampData : Data { void insert_result_into(IColumn& to) const { @@ -258,14 +232,6 @@ class AggregateFunctionSampCovariance } }; -template -class AggregateFunctionSamp_OLDER final - : public AggregateFunctionSampCovariance { -public: - AggregateFunctionSamp_OLDER(const DataTypes& argument_types_) - : AggregateFunctionSampCovariance(argument_types_) {} -}; - template class AggregateFunctionSamp final : public AggregateFunctionSampCovariance { diff --git a/be/src/vec/aggregate_functions/aggregate_function_distinct.cpp b/be/src/vec/aggregate_functions/aggregate_function_distinct.cpp index 9bb2954207babb..fce58b38688b28 100644 --- a/be/src/vec/aggregate_functions/aggregate_function_distinct.cpp +++ b/be/src/vec/aggregate_functions/aggregate_function_distinct.cpp @@ -83,7 +83,8 @@ const std::string DISTINCT_FUNCTION_PREFIX = "multi_distinct_"; void register_aggregate_function_combinator_distinct(AggregateFunctionSimpleFactory& factory) { AggregateFunctionCreator creator = [&](const std::string& name, const DataTypes& types, - const bool result_is_nullable) { + const bool result_is_nullable, + const AggregateFunctionAttr& attr) { // 
1. we should get not nullable types; DataTypes nested_types(types.size()); std::transform(types.begin(), types.end(), nested_types.begin(), @@ -92,7 +93,7 @@ void register_aggregate_function_combinator_distinct(AggregateFunctionSimpleFact auto transform_arguments = function_combinator->transform_arguments(nested_types); auto nested_function_name = name.substr(DISTINCT_FUNCTION_PREFIX.size()); auto nested_function = factory.get(nested_function_name, transform_arguments, false, - BeExecVersionManager::get_newest_version()); + BeExecVersionManager::get_newest_version(), attr); return function_combinator->transform_aggregate_function(nested_function, types, result_is_nullable); }; diff --git a/be/src/vec/aggregate_functions/aggregate_function_distinct.h b/be/src/vec/aggregate_functions/aggregate_function_distinct.h index ec6936a128c869..3cce558312be24 100644 --- a/be/src/vec/aggregate_functions/aggregate_function_distinct.h +++ b/be/src/vec/aggregate_functions/aggregate_function_distinct.h @@ -35,7 +35,6 @@ #include "vec/aggregate_functions/aggregate_function.h" #include "vec/columns/column.h" #include "vec/common/assert_cast.h" -#include "vec/common/hash_table/hash_set.h" #include "vec/common/string_ref.h" #include "vec/core/types.h" #include "vec/data_types/data_type.h" @@ -59,8 +58,8 @@ namespace doris::vectorized { template struct AggregateFunctionDistinctSingleNumericData { /// When creating, the hash table must be small. - using Container = std::conditional_t, - HashSetWithStackMemory, 4>>; + using Container = + std::conditional_t, phmap::flat_hash_set>; using Self = AggregateFunctionDistinctSingleNumericData; Container data; @@ -78,21 +77,30 @@ struct AggregateFunctionDistinctSingleNumericData { void merge(const Self& rhs, Arena*) { DCHECK(!stable); if constexpr (!stable) { - data.merge(rhs.data); + data.merge(Container(rhs.data)); } } void serialize(BufferWritable& buf) const { DCHECK(!stable); if constexpr (!stable) { - data.write(buf); + write_var_uint(data.size(), buf); + for (const auto& value : data) { + write_binary(value, buf); + } } } void deserialize(BufferReadable& buf, Arena*) { DCHECK(!stable); if constexpr (!stable) { - data.read(buf); + size_t new_size = 0; + read_var_uint(new_size, buf); + T x; + for (size_t i = 0; i < new_size; ++i) { + read_binary(x, buf); + data.insert(x); + } } } @@ -108,7 +116,7 @@ struct AggregateFunctionDistinctSingleNumericData { } } else { for (const auto& elem : data) { - argument_columns[0]->insert(elem.get_value()); + argument_columns[0]->insert(elem); } } @@ -120,19 +128,17 @@ template struct AggregateFunctionDistinctGenericData { /// When creating, the hash table must be small. 
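    /// Note: the container here is now phmap::flat_hash_set (it was
    /// HashSetWithStackMemory before this change), so the serialize and
    /// deserialize paths below write an explicit var-uint size plus the
    /// elements rather than delegating to the set's own read/write methods.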
using Container = std::conditional_t, - HashSetWithStackMemory>; + phmap::flat_hash_set>; using Self = AggregateFunctionDistinctGenericData; Container data; void merge(const Self& rhs, Arena* arena) { DCHECK(!stable); if constexpr (!stable) { - typename Container::LookupResult it; - bool inserted; for (const auto& elem : rhs.data) { - StringRef key = elem.get_value(); + StringRef key = elem; key.data = arena->insert(key.data, key.size); - data.emplace(key, it, inserted); + data.emplace(key); } } } @@ -142,7 +148,7 @@ struct AggregateFunctionDistinctGenericData { if constexpr (!stable) { write_var_uint(data.size(), buf); for (const auto& elem : data) { - write_string_binary(elem.get_value(), buf); + write_string_binary(elem, buf); } } } @@ -174,9 +180,7 @@ struct AggregateFunctionDistinctSingleGenericData if constexpr (stable) { data.emplace(key, data.size()); } else { - typename Base::Container::LookupResult it; - bool inserted; - data.emplace(key, it, inserted); + data.insert(key); } } @@ -193,7 +197,7 @@ struct AggregateFunctionDistinctSingleGenericData } } else { for (const auto& elem : data) { - argument_columns[0]->insert_data(elem.get_value().data, elem.get_value().size); + argument_columns[0]->insert_data(elem.data, elem.size); } } @@ -218,9 +222,7 @@ struct AggregateFunctionDistinctMultipleGenericData if constexpr (stable) { data.emplace(key, data.size()); } else { - typename Base::Container::LookupResult it; - bool inserted; - data.emplace(key, it, inserted); + data.emplace(key); } } @@ -243,7 +245,7 @@ struct AggregateFunctionDistinctMultipleGenericData } } else { for (const auto& elem : data) { - const char* begin = elem.get_value().data; + const char* begin = elem.data; for (auto& column : argument_columns) { begin = column->deserialize_and_insert_from_arena(begin); } diff --git a/be/src/vec/aggregate_functions/aggregate_function_foreach.cpp b/be/src/vec/aggregate_functions/aggregate_function_foreach.cpp index ab6d0142f6a8c0..c1cbcc89996caf 100644 --- a/be/src/vec/aggregate_functions/aggregate_function_foreach.cpp +++ b/be/src/vec/aggregate_functions/aggregate_function_foreach.cpp @@ -34,8 +34,9 @@ namespace doris::vectorized { void register_aggregate_function_combinator_foreach(AggregateFunctionSimpleFactory& factory) { - AggregateFunctionCreator creator = [&](const std::string& name, const DataTypes& types, - const bool result_is_nullable) -> AggregateFunctionPtr { + AggregateFunctionCreator creator = + [&](const std::string& name, const DataTypes& types, const bool result_is_nullable, + const AggregateFunctionAttr& attr) -> AggregateFunctionPtr { const std::string& suffix = AggregateFunctionForEach::AGG_FOREACH_SUFFIX; DataTypes transform_arguments; for (const auto& t : types) { @@ -46,7 +47,7 @@ void register_aggregate_function_combinator_foreach(AggregateFunctionSimpleFacto auto nested_function_name = name.substr(0, name.size() - suffix.size()); auto nested_function = factory.get(nested_function_name, transform_arguments, result_is_nullable, - BeExecVersionManager::get_newest_version(), false); + BeExecVersionManager::get_newest_version(), attr); if (!nested_function) { throw Exception( ErrorCode::INTERNAL_ERROR, diff --git a/be/src/vec/aggregate_functions/aggregate_function_group_array_intersect.cpp b/be/src/vec/aggregate_functions/aggregate_function_group_array_intersect.cpp index b3b9a8b9af47c6..24faf58b2e1ff9 100644 --- a/be/src/vec/aggregate_functions/aggregate_function_group_array_intersect.cpp +++ 
b/be/src/vec/aggregate_functions/aggregate_function_group_array_intersect.cpp @@ -70,7 +70,8 @@ inline AggregateFunctionPtr create_aggregate_function_group_array_intersect_impl } AggregateFunctionPtr create_aggregate_function_group_array_intersect( - const std::string& name, const DataTypes& argument_types, const bool result_is_nullable) { + const std::string& name, const DataTypes& argument_types, const bool result_is_nullable, + const AggregateFunctionAttr& attr) { assert_unary(name, argument_types); const DataTypePtr& argument_type = remove_nullable(argument_types[0]); diff --git a/be/src/vec/aggregate_functions/aggregate_function_group_array_intersect.h b/be/src/vec/aggregate_functions/aggregate_function_group_array_intersect.h index 94b34caff78645..fd6076686acb65 100644 --- a/be/src/vec/aggregate_functions/aggregate_function_group_array_intersect.h +++ b/be/src/vec/aggregate_functions/aggregate_function_group_array_intersect.h @@ -132,7 +132,8 @@ struct AggregateFunctionGroupArrayIntersectData { const T* src_data = is_null_element ? nullptr : &(nested_column_data->get_element(offset + i)); - if (set->find(src_data) || (set->contain_null() && src_data == nullptr)) { + if ((!is_null_element && set->find(src_data)) || + (set->contain_null() && is_null_element)) { new_set->insert(src_data); } } @@ -424,7 +425,8 @@ class AggregateFunctionGroupArrayIntersectGeneric for (size_t i = 0; i < arr_size; ++i) { StringRef src = process_element(i); - if (set->find(src.data, src.size) || (set->contain_null() && src.data == nullptr)) { + if ((set->find(src.data, src.size) && src.data != nullptr) || + (set->contain_null() && src.data == nullptr)) { new_set->insert((void*)src.data, src.size); } } diff --git a/be/src/vec/aggregate_functions/aggregate_function_group_concat.cpp b/be/src/vec/aggregate_functions/aggregate_function_group_concat.cpp index 9661b9c89d5700..286795ea2ba70c 100644 --- a/be/src/vec/aggregate_functions/aggregate_function_group_concat.cpp +++ b/be/src/vec/aggregate_functions/aggregate_function_group_concat.cpp @@ -28,7 +28,8 @@ const std::string AggregateFunctionGroupConcatImplStr::separator = ","; AggregateFunctionPtr create_aggregate_function_group_concat(const std::string& name, const DataTypes& argument_types, - const bool result_is_nullable) { + const bool result_is_nullable, + const AggregateFunctionAttr& attr) { if (argument_types.size() == 1) { return creator_without_type::create< AggregateFunctionGroupConcat>( diff --git a/be/src/vec/aggregate_functions/aggregate_function_group_concat.h b/be/src/vec/aggregate_functions/aggregate_function_group_concat.h index a62ffb8da619f9..a0cac9ab78016d 100644 --- a/be/src/vec/aggregate_functions/aggregate_function_group_concat.h +++ b/be/src/vec/aggregate_functions/aggregate_function_group_concat.h @@ -43,20 +43,27 @@ class IColumn; namespace doris::vectorized { struct AggregateFunctionGroupConcatData { - std::string data; + ColumnString::Chars data; std::string separator; bool inited = false; void add(StringRef ref, StringRef sep) { + auto delta_size = ref.size; if (!inited) { - inited = true; separator.assign(sep.data, sep.data + sep.size); } else { - data += separator; + delta_size += separator.size(); } + auto offset = data.size(); + data.resize(data.size() + delta_size); - data.resize(data.length() + ref.size); - memcpy(data.data() + data.length() - ref.size, ref.data, ref.size); + if (!inited) { + inited = true; + } else { + memcpy(data.data() + offset, separator.data(), separator.size()); + offset += separator.size(); + } + 
memcpy(data.data() + offset, ref.data, ref.size); } void merge(const AggregateFunctionGroupConcatData& rhs) { @@ -67,17 +74,23 @@ struct AggregateFunctionGroupConcatData { if (!inited) { inited = true; separator = rhs.separator; - data = rhs.data; + data.assign(rhs.data); } else { - data += separator; - data += rhs.data; + auto offset = data.size(); + + auto delta_size = separator.size() + rhs.data.size(); + data.resize(data.size() + delta_size); + + memcpy(data.data() + offset, separator.data(), separator.size()); + offset += separator.size(); + memcpy(data.data() + offset, rhs.data.data(), rhs.data.size()); } } - const std::string& get() const { return data; } + StringRef get() const { return StringRef {data.data(), data.size()}; } void write(BufferWritable& buf) const { - write_binary(data, buf); + write_binary(StringRef {data.data(), data.size()}, buf); write_binary(separator, buf); write_binary(inited, buf); } @@ -89,7 +102,7 @@ struct AggregateFunctionGroupConcatData { } void reset() { - data = ""; + data.clear(); separator = ""; inited = false; } @@ -150,8 +163,8 @@ class AggregateFunctionGroupConcat final } void insert_result_into(ConstAggregateDataPtr __restrict place, IColumn& to) const override { - const std::string& result = this->data(place).get(); - assert_cast(to).insert_data(result.c_str(), result.length()); + const auto result = this->data(place).get(); + assert_cast(to).insert_data(result.data, result.size); } }; diff --git a/be/src/vec/aggregate_functions/aggregate_function_histogram.cpp b/be/src/vec/aggregate_functions/aggregate_function_histogram.cpp index 5b06af28399d71..fb2fa9c2513ec0 100644 --- a/be/src/vec/aggregate_functions/aggregate_function_histogram.cpp +++ b/be/src/vec/aggregate_functions/aggregate_function_histogram.cpp @@ -47,7 +47,8 @@ AggregateFunctionPtr create_agg_function_histogram(const DataTypes& argument_typ AggregateFunctionPtr create_aggregate_function_histogram(const std::string& name, const DataTypes& argument_types, - const bool result_is_nullable) { + const bool result_is_nullable, + const AggregateFunctionAttr& attr) { WhichDataType type(remove_nullable(argument_types[0])); #define DISPATCH(TYPE) \ diff --git a/be/src/vec/aggregate_functions/aggregate_function_histogram.h b/be/src/vec/aggregate_functions/aggregate_function_histogram.h index 25fc6957321586..1d2c5725ed370f 100644 --- a/be/src/vec/aggregate_functions/aggregate_function_histogram.h +++ b/be/src/vec/aggregate_functions/aggregate_function_histogram.h @@ -192,7 +192,7 @@ class AggregateFunctionHistogram final DataTypePtr get_return_type() const override { return std::make_shared(); } void add(AggregateDataPtr __restrict place, const IColumn** columns, ssize_t row_num, - Arena* arena) const override { + Arena*) const override { if constexpr (has_input_param) { Int32 input_max_num_buckets = assert_cast(columns[1])->get_element(row_num); @@ -220,7 +220,7 @@ class AggregateFunctionHistogram final void reset(AggregateDataPtr place) const override { this->data(place).reset(); } void merge(AggregateDataPtr __restrict place, ConstAggregateDataPtr rhs, - Arena* arena) const override { + Arena*) const override { this->data(place).merge(this->data(rhs)); } diff --git a/be/src/vec/aggregate_functions/aggregate_function_hll_union_agg.h b/be/src/vec/aggregate_functions/aggregate_function_hll_union_agg.h index 1cf6dc7f2a29a9..44835194eb4b88 100644 --- a/be/src/vec/aggregate_functions/aggregate_function_hll_union_agg.h +++ b/be/src/vec/aggregate_functions/aggregate_function_hll_union_agg.h @@ 
@@ -122,7 +122,7 @@ class AggregateFunctionHLLUnion
     }
 
     void add(AggregateDataPtr __restrict place, const IColumn** columns, ssize_t row_num,
-             Arena* arena) const override {
+             Arena*) const override {
         this->data(place).add(columns[0], row_num);
     }
 
diff --git a/be/src/vec/aggregate_functions/aggregate_function_java_udaf.h b/be/src/vec/aggregate_functions/aggregate_function_java_udaf.h
index d314cba7a656a9..d16da1a34e66e3 100644
--- a/be/src/vec/aggregate_functions/aggregate_function_java_udaf.h
+++ b/be/src/vec/aggregate_functions/aggregate_function_java_udaf.h
@@ -148,6 +148,7 @@ struct AggregateJavaUdafData {
         jbyteArray arr = env->NewByteArray(len);
         env->SetByteArrayRegion(arr, 0, len, reinterpret_cast<jbyte*>(serialize_data.data()));
         env->CallNonvirtualVoidMethod(executor_obj, executor_cl, executor_merge_id, place, arr);
+        RETURN_IF_ERROR(JniUtil::GetJniExceptionMsg(env));
         jbyte* pBytes = env->GetByteArrayElements(arr, nullptr);
         env->ReleaseByteArrayElements(arr, pBytes, JNI_ABORT);
         env->DeleteLocalRef(arr);
@@ -332,7 +333,7 @@ class AggregateJavaUdaf final
     }
 
     void add_batch(size_t batch_size, AggregateDataPtr* places, size_t place_offset,
-                   const IColumn** columns, Arena* /*arena*/, bool /*agg_many*/) const override {
+                   const IColumn** columns, Arena*, bool /*agg_many*/) const override {
         int64_t places_address = reinterpret_cast<int64_t>(places);
         Status st = this->data(_exec_place)
                             .add(places_address, false, columns, 0, batch_size, argument_types,
@@ -343,7 +344,7 @@ class AggregateJavaUdaf final
     }
 
     void add_batch_single_place(size_t batch_size, AggregateDataPtr place, const IColumn** columns,
-                                Arena* /*arena*/) const override {
+                                Arena*) const override {
         int64_t places_address = reinterpret_cast<int64_t>(place);
         Status st = this->data(_exec_place)
                             .add(places_address, true, columns, 0, batch_size, argument_types, 0);
@@ -354,7 +355,7 @@ class AggregateJavaUdaf final
 
     void add_range_single_place(int64_t partition_start, int64_t partition_end, int64_t frame_start,
                                 int64_t frame_end, AggregateDataPtr place, const IColumn** columns,
-                                Arena* arena) const override {
+                                Arena*) const override {
         frame_start = std::max(frame_start, partition_start);
         frame_end = std::min(frame_end, partition_end);
         int64_t places_address = reinterpret_cast<int64_t>(place);
diff --git a/be/src/vec/aggregate_functions/aggregate_function_kurtosis.cpp b/be/src/vec/aggregate_functions/aggregate_function_kurtosis.cpp
index 00ad1893eafcf6..a763721f3f4061 100644
--- a/be/src/vec/aggregate_functions/aggregate_function_kurtosis.cpp
+++ b/be/src/vec/aggregate_functions/aggregate_function_kurtosis.cpp
@@ -45,7 +45,8 @@ AggregateFunctionPtr type_dispatch_for_aggregate_function_kurt(const DataTypes&
 
 AggregateFunctionPtr create_aggregate_function_kurt(const std::string& name,
                                                     const DataTypes& argument_types,
-                                                    const bool result_is_nullable) {
+                                                    const bool result_is_nullable,
+                                                    const AggregateFunctionAttr& attr) {
     if (argument_types.size() != 1) {
         LOG(WARNING) << "aggregate function " << name << " requires exactly 1 argument";
         return nullptr;
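// NOTE (illustrative sketch, not part of the patch): the java_udaf.h hunk above adds
// RETURN_IF_ERROR(JniUtil::GetJniExceptionMsg(env)) right after the merge call, so a
// pending Java exception is surfaced instead of silently dropped. A simplified
// stand-in for that probe using only standard JNI calls (the translation into a
// Doris Status is assumed and omitted here):
#include <jni.h>
#include <string>

// hypothetical helper: report and clear a pending Java exception, if any
bool check_jni_exception(JNIEnv* env, std::string* msg) {
    if (env->ExceptionCheck() == JNI_FALSE) {
        return false; // no pending exception, caller may continue with JNI calls
    }
    env->ExceptionDescribe(); // log the stack trace to stderr
    env->ExceptionClear();    // clear it so subsequent JNI calls remain legal
    *msg = "Java UDAF call threw an exception";
    return true;
}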
diff --git a/be/src/vec/aggregate_functions/aggregate_function_linear_histogram.cpp b/be/src/vec/aggregate_functions/aggregate_function_linear_histogram.cpp
new file mode 100644
index 00000000000000..683cf1a18f78ba
--- /dev/null
+++ b/be/src/vec/aggregate_functions/aggregate_function_linear_histogram.cpp
@@ -0,0 +1,65 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "vec/aggregate_functions/aggregate_function_linear_histogram.h"
+
+#include "vec/aggregate_functions/helpers.h"
+
+namespace doris::vectorized {
+
+const std::string AggregateFunctionLinearHistogramConsts::NAME = "linear_histogram";
+
+template <typename T>
+AggregateFunctionPtr create_agg_function_linear_histogram(const DataTypes& argument_types,
+                                                          const bool result_is_nullable) {
+    bool has_offset = (argument_types.size() == 3);
+
+    if (has_offset) {
+        return creator_without_type::create<
+                AggregateFunctionLinearHistogram<T, AggregateFunctionLinearHistogramData<T>, true>>(
+                argument_types, result_is_nullable);
+    } else {
+        return creator_without_type::create<AggregateFunctionLinearHistogram<
+                T, AggregateFunctionLinearHistogramData<T>, false>>(argument_types,
+                                                                    result_is_nullable);
+    }
+}
+
+AggregateFunctionPtr create_aggregate_function_linear_histogram(const std::string& name,
+                                                                const DataTypes& argument_types,
+                                                                const bool result_is_nullable,
+                                                                const AggregateFunctionAttr& attr) {
+    WhichDataType type(remove_nullable(argument_types[0]));
+
+#define DISPATCH(TYPE)                                                                         \
+    if (type.idx == TypeIndex::TYPE)                                                           \
+        return create_agg_function_linear_histogram<TYPE>(argument_types, result_is_nullable);
+    FOR_NUMERIC_TYPES(DISPATCH)
+    FOR_DECIMAL_TYPES(DISPATCH)
+#undef DISPATCH
+
+    LOG(WARNING) << fmt::format("unsupported input type {} for aggregate function {}",
+                                argument_types[0]->get_name(), name);
+    return nullptr;
+}
+
+void register_aggregate_function_linear_histogram(AggregateFunctionSimpleFactory& factory) {
+    factory.register_function_both(AggregateFunctionLinearHistogramConsts::NAME,
+                                   create_aggregate_function_linear_histogram);
+}
+
+} // namespace doris::vectorized
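// NOTE (illustrative sketch, not part of the patch): the DISPATCH macro in the
// creator above stamps out one "if" per supported input type and falls through to a
// warning plus nullptr. A standalone model of the same dispatch shape (TypeIndex and
// create_impl are stand-ins here, not the Doris APIs):
#include <cstdio>

enum class TypeIndex { Int32, Int64, Float64 };

template <typename T>
void* create_impl() { return nullptr; } // stand-in for the real typed creator

inline void* dispatch(TypeIndex idx) {
#define DISPATCH(TYPE, CPP_TYPE)        \
    if (idx == TypeIndex::TYPE) {       \
        return create_impl<CPP_TYPE>(); \
    }
    DISPATCH(Int32, int)
    DISPATCH(Int64, long long)
    DISPATCH(Float64, double)
#undef DISPATCH
    std::puts("unsupported input type"); // mirrors the LOG(WARNING) fallback
    return nullptr;
}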
diff --git a/be/src/vec/aggregate_functions/aggregate_function_linear_histogram.h b/be/src/vec/aggregate_functions/aggregate_function_linear_histogram.h
new file mode 100644
index 00000000000000..173324b9463750
--- /dev/null
+++ b/be/src/vec/aggregate_functions/aggregate_function_linear_histogram.h
@@ -0,0 +1,257 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <rapidjson/document.h>
+#include <rapidjson/stringbuffer.h>
+#include <rapidjson/writer.h>
+
+#include <cmath>
+#include <unordered_map>
+
+#include "vec/aggregate_functions/aggregate_function.h"
+#include "vec/aggregate_functions/aggregate_function_simple_factory.h"
+#include "vec/core/types.h"
+#include "vec/data_types/data_type_decimal.h"
+#include "vec/io/io_helper.h"
+
+// TODO: optimize count=0
+// TODO: support datetime
+// TODO: support foreach
+
+namespace doris::vectorized {
+
+template <typename T>
+struct AggregateFunctionLinearHistogramData {
+    // bucket key limits
+    const static int32_t MIN_BUCKET_KEY = std::numeric_limits<int32_t>::min();
+    const static int32_t MAX_BUCKET_KEY = std::numeric_limits<int32_t>::max();
+
+private:
+    // influxdb use double
+    double interval = 0;
+    double offset;
+    double lower; // not used yet
+    double upper; // not used yet
+    std::unordered_map<int32_t, size_t,
+                       decltype([](int32_t key) { return static_cast<size_t>(key); })>
+            buckets;
+
+public:
+    // reset
+    void reset() {
+        offset = 0;
+        interval = 0;
+        buckets.clear();
+    }
+
+    void set_parameters(double input_interval, double input_offset) {
+        interval = input_interval;
+        offset = input_offset;
+    }
+
+    // add
+    void add(const T& value, UInt32 scale) {
+        double val = 0;
+        if constexpr (IsDecimalNumber<T>) {
+            using NativeType = typename T::NativeType;
+            val = static_cast<double>(value.value) / decimal_scale_multiplier<NativeType>(scale);
+        } else {
+            val = static_cast<double>(value);
+        }
+        double key = std::floor((val - offset) / interval);
+        if (key <= MIN_BUCKET_KEY || key >= MAX_BUCKET_KEY) {
+            throw doris::Exception(ErrorCode::INVALID_ARGUMENT, "{} exceeds the bucket range limit",
+                                   value);
+        }
+        buckets[static_cast<int32_t>(key)]++;
+    }
+
+    // merge
+    void merge(const AggregateFunctionLinearHistogramData& rhs) {
+        if (rhs.interval == 0) {
+            return;
+        }
+
+        interval = rhs.interval;
+        offset = rhs.offset;
+
+        for (const auto& [key, count] : rhs.buckets) {
+            buckets[key] += count;
+        }
+    }
+
+    // write
+    void write(BufferWritable& buf) const {
+        write_binary(offset, buf);
+        write_binary(interval, buf);
+        write_binary(lower, buf);
+        write_binary(upper, buf);
+        write_binary(buckets.size(), buf);
+        for (const auto& [key, count] : buckets) {
+            write_binary(key, buf);
+            write_binary(count, buf);
+        }
+    }
+
+    // read
+    void read(BufferReadable& buf) {
+        read_binary(offset, buf);
+        read_binary(interval, buf);
+        read_binary(lower, buf);
+        read_binary(upper, buf);
+        size_t size;
+        read_binary(size, buf);
+        for (size_t i = 0; i < size; i++) {
+            int32_t key;
+            size_t count;
+            read_binary(key, buf);
+            read_binary(count, buf);
+            buckets[key] = count;
+        }
+    }
+
+    // insert_result_into
+    void insert_result_into(IColumn& to) const {
+        std::vector<std::pair<int32_t, size_t>> bucket_vector(buckets.begin(), buckets.end());
+        std::sort(bucket_vector.begin(), bucket_vector.end(),
+                  [](const auto& lhs, const auto& rhs) { return lhs.first < rhs.first; });
+
+        rapidjson::Document doc;
+        doc.SetObject();
+        rapidjson::Document::AllocatorType& allocator = doc.GetAllocator();
+
+        unsigned num_buckets = bucket_vector.empty() ? 0
+                                                     : bucket_vector.rbegin()->first -
+                                                               bucket_vector.begin()->first + 1;
+        doc.AddMember("num_buckets", num_buckets, allocator);
+
+        rapidjson::Value bucket_arr(rapidjson::kArrayType);
+        bucket_arr.Reserve(num_buckets, allocator);
+
+        if (num_buckets > 0) {
+            int32_t idx = bucket_vector.begin()->first;
+            double left = bucket_vector.begin()->first * interval + offset;
+            size_t count = 0;
+            size_t acc_count = 0;
+
+            for (const auto& [key, count_] : bucket_vector) {
+                for (; idx <= key; ++idx) {
+                    rapidjson::Value bucket_json(rapidjson::kObjectType);
+                    bucket_json.AddMember("lower", left, allocator);
+                    left += interval;
+                    bucket_json.AddMember("upper", left, allocator);
+                    count = (idx == key) ? count_ : 0;
+                    bucket_json.AddMember("count", static_cast<uint64_t>(count), allocator);
+                    acc_count += count;
+                    bucket_json.AddMember("acc_count", static_cast<uint64_t>(acc_count), allocator);
+                    bucket_arr.PushBack(bucket_json, allocator);
+                }
+            }
+        }
+
+        doc.AddMember("buckets", bucket_arr, allocator);
+
+        rapidjson::StringBuffer buffer;
+        rapidjson::Writer<rapidjson::StringBuffer> writer(buffer);
+        doc.Accept(writer);
+
+        auto& column = assert_cast<ColumnString&>(to);
+        column.insert_data(buffer.GetString(), buffer.GetSize());
+    }
+};
+
+class AggregateFunctionLinearHistogramConsts {
+public:
+    const static std::string NAME;
+};
+
+template <typename T, typename Data, bool has_offset>
+class AggregateFunctionLinearHistogram final
+        : public IAggregateFunctionDataHelper<
+                  Data, AggregateFunctionLinearHistogram<T, Data, has_offset>> {
+public:
+    using ColVecType = ColumnVectorOrDecimal<T>;
+
+    AggregateFunctionLinearHistogram(const DataTypes& argument_types_)
+            : IAggregateFunctionDataHelper<
+                      Data, AggregateFunctionLinearHistogram<T, Data, has_offset>>(argument_types_),
+              scale(get_decimal_scale(*argument_types_[0])) {}
+
+    std::string get_name() const override { return AggregateFunctionLinearHistogramConsts::NAME; }
+
+    DataTypePtr get_return_type() const override { return std::make_shared<DataTypeString>(); }
+
+    void add(AggregateDataPtr __restrict place, const IColumn** columns, ssize_t row_num,
+             Arena*) const override {
+        double interval =
+                assert_cast<const ColumnFloat64&, TypeCheckOnRelease::DISABLE>(*columns[1])
+                        .get_data()[row_num];
+        if (interval <= 0) {
+            throw doris::Exception(
+                    ErrorCode::INVALID_ARGUMENT,
+                    "Invalid interval {}, row_num {}, interval should be larger than 0", interval,
+                    row_num);
+        }
+
+        double offset = 0;
+        if constexpr (has_offset) {
+            offset = assert_cast<const ColumnFloat64&, TypeCheckOnRelease::DISABLE>(*columns[2])
+                             .get_data()[row_num];
+            if (offset < 0 || offset >= interval) {
+                throw doris::Exception(
+                        ErrorCode::INVALID_ARGUMENT,
+                        "Invalid offset {}, row_num {}, offset should be in [0, interval)", offset,
+                        row_num);
+            }
+        }
+
+        this->data(place).set_parameters(interval, offset);
+
+        this->data(place).add(
+                assert_cast<const ColVecType&, TypeCheckOnRelease::DISABLE>(*columns[0])
+                        .get_data()[row_num],
+                scale);
+    }
+
+    void reset(AggregateDataPtr place) const override { this->data(place).reset(); }
+
+    void merge(AggregateDataPtr __restrict place, ConstAggregateDataPtr rhs,
+               Arena*) const override {
+        this->data(place).merge(this->data(rhs));
+    }
+
+    void serialize(ConstAggregateDataPtr __restrict place, BufferWritable& buf) const override {
+        this->data(place).write(buf);
+    }
+
+    void deserialize(AggregateDataPtr __restrict place, BufferReadable& buf,
+                     Arena*) const override {
+        this->data(place).read(buf);
+    }
+
+    void insert_result_into(ConstAggregateDataPtr __restrict place, IColumn& to) const override {
+        this->data(place).insert_result_into(to);
+    }
+
+private:
+    UInt32 scale;
+};
+
+} // namespace doris::vectorized
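// NOTE (illustrative sketch, not part of the patch): the heart of linear_histogram
// is the bucket-key formula key = floor((value - offset) / interval), so bucket key
// covers the half-open range [key * interval + offset, (key + 1) * interval + offset).
// A worked check of that arithmetic, independent of the Doris column types:
#include <cassert>
#include <cmath>
#include <cstdint>

inline int32_t bucket_key(double value, double interval, double offset) {
    return static_cast<int32_t>(std::floor((value - offset) / interval));
}

int main() {
    // interval = 5, offset = 0: value 12 falls in bucket [10, 15) -> key 2
    assert(bucket_key(12.0, 5.0, 0.0) == 2);
    // floor rounds toward -inf, so -0.1 lands in bucket [-5, 0) -> key -1
    assert(bucket_key(-0.1, 5.0, 0.0) == -1);
    // offset shifts the grid: with offset 1, value 12 sits in [11, 16) -> key 2
    assert(bucket_key(12.0, 5.0, 1.0) == 2);
    return 0;
}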
diff --git a/be/src/vec/aggregate_functions/aggregate_function_map.cpp b/be/src/vec/aggregate_functions/aggregate_function_map.cpp
index bcf3f2d66dfeaf..f289d885f48f52 100644
--- a/be/src/vec/aggregate_functions/aggregate_function_map.cpp
+++ b/be/src/vec/aggregate_functions/aggregate_function_map.cpp
@@ -32,7 +32,8 @@ AggregateFunctionPtr create_agg_function_map_agg(const DataTypes& argument_types
 
 AggregateFunctionPtr create_aggregate_function_map_agg(const std::string& name,
                                                        const DataTypes& argument_types,
-                                                       const bool result_is_nullable) {
+                                                       const bool result_is_nullable,
+                                                       const AggregateFunctionAttr& attr) {
     WhichDataType type(remove_nullable(argument_types[0]));
 
 #define DISPATCH(TYPE) \
diff --git a/be/src/vec/aggregate_functions/aggregate_function_map.h b/be/src/vec/aggregate_functions/aggregate_function_map.h
index d56cbf21f31136..3ec25cdc706152 100644
--- a/be/src/vec/aggregate_functions/aggregate_function_map.h
+++ b/be/src/vec/aggregate_functions/aggregate_function_map.h
@@ -203,7 +203,7 @@ class AggregateFunctionMapAgg final
     }
 
     void add(AggregateDataPtr __restrict place, const IColumn** columns, ssize_t row_num,
-             Arena* arena) const override {
+             Arena*) const override {
         if (columns[0]->is_nullable()) {
             const auto& nullable_col =
                     assert_cast<const ColumnNullable&>(*columns[0]);
@@ -234,7 +234,7 @@ class AggregateFunctionMapAgg final
     void reset(AggregateDataPtr place) const override { this->data(place).reset(); }
 
     void merge(AggregateDataPtr __restrict place, ConstAggregateDataPtr rhs,
-               Arena* arena) const override {
+               Arena*) const override {
         this->data(place).merge(this->data(rhs));
     }
 
@@ -248,7 +248,7 @@ class AggregateFunctionMapAgg final
     }
 
     void streaming_agg_serialize_to_column(const IColumn** columns, MutableColumnPtr& dst,
-                                           const size_t num_rows, Arena* arena) const override {
+                                           const size_t num_rows, Arena*) const override {
         auto& col = assert_cast<ColumnMap&>(*dst);
         for (size_t i = 0; i != num_rows; ++i) {
             Field key, value;
@@ -263,7 +263,7 @@ class AggregateFunctionMapAgg final
         }
     }
 
-    void deserialize_from_column(AggregateDataPtr places, const IColumn& column, Arena* arena,
+    void deserialize_from_column(AggregateDataPtr places, const IColumn& column, Arena*,
                                  size_t num_rows) const override {
         const auto& col = assert_cast<const ColumnMap&>(column);
         auto* data = &(this->data(places));
@@ -282,7 +282,7 @@ class AggregateFunctionMapAgg final
     }
 
     void deserialize_and_merge_from_column(AggregateDataPtr __restrict place, const IColumn& column,
-                                           Arena* arena) const override {
+                                           Arena*) const override {
         auto& col = assert_cast<const ColumnMap&>(column);
         const size_t num_rows = column.size();
         for (size_t i = 0; i != num_rows; ++i) {
@@ -293,7 +293,7 @@ class AggregateFunctionMapAgg final
     }
 
     void deserialize_and_merge_from_column_range(AggregateDataPtr __restrict place,
                                                  const IColumn& column, size_t begin, size_t end,
-                                                 Arena* arena) const override {
+                                                 Arena*) const override {
         DCHECK(end <= column.size() && begin <= end)
                 << ", begin:" << begin << ", end:" << end << ", column.size():" << column.size();
         const auto& col = assert_cast<const ColumnMap&>(column);
@@ -304,7 +304,7 @@ class AggregateFunctionMapAgg final
     }
 
     void deserialize_and_merge_vec(const AggregateDataPtr* places, size_t offset,
-                                   AggregateDataPtr rhs, const IColumn* column, Arena* arena,
+                                   AggregateDataPtr rhs, const IColumn* column, Arena*,
                                    const size_t num_rows) const override {
         const auto& col = assert_cast<const ColumnMap&>(*column);
         for (size_t i = 0; i != num_rows; ++i) {
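// NOTE (illustrative sketch, not part of the patch): a pattern recurring through
// these hunks is dropping the name from unused "Arena* arena" parameters. An unnamed
// parameter keeps the override signature identical while making it explicit (and
// -Wunused-parameter clean) that the body never touches it:
struct Arena; // opaque; only used as a pointer type here

struct Base {
    virtual void merge(int place, int rhs, Arena* arena) = 0;
    virtual ~Base() = default;
};

struct Derived final : Base {
    // unnamed parameter: same signature, no unused-parameter warning
    void merge(int place, int rhs, Arena*) override { /* arena-free merge */ }
};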
@@ -314,8 +314,8 @@ class AggregateFunctionMapAgg final
     }
 
     void deserialize_and_merge_vec_selected(const AggregateDataPtr* places, size_t offset,
-                                            AggregateDataPtr rhs, const IColumn* column,
-                                            Arena* arena, const size_t num_rows) const override {
+                                            AggregateDataPtr rhs, const IColumn* column, Arena*,
+                                            const size_t num_rows) const override {
         const auto& col = assert_cast<const ColumnMap&>(*column);
         for (size_t i = 0; i != num_rows; ++i) {
             if (places[i]) {
diff --git a/be/src/vec/aggregate_functions/aggregate_function_min_max.cpp b/be/src/vec/aggregate_functions/aggregate_function_min_max.cpp
index 8aa8850a314d84..c1a72fd52bdd76 100644
--- a/be/src/vec/aggregate_functions/aggregate_function_min_max.cpp
+++ b/be/src/vec/aggregate_functions/aggregate_function_min_max.cpp
@@ -30,7 +30,8 @@ namespace doris::vectorized {
 
 template